From c8b4803b28a9680032be626e43c7d7529e10414d Mon Sep 17 00:00:00 2001 From: Steffen Forkmann Date: Fri, 6 Jan 2017 15:14:21 +0100 Subject: [PATCH 1/2] Use Jaro-Winkler instead of levenshtein --- src/canopy/canopy.fs | 6 +-- src/canopy/canopy.fsproj | 2 +- src/canopy/jarowinkler.fs | 96 +++++++++++++++++++++++++++++++++++++++ src/canopy/levenshtein.fs | 32 ------------- 4 files changed, 100 insertions(+), 36 deletions(-) create mode 100644 src/canopy/jarowinkler.fs delete mode 100644 src/canopy/levenshtein.fs diff --git a/src/canopy/canopy.fs b/src/canopy/canopy.fs index d18771f7..512525cf 100644 --- a/src/canopy/canopy.fs +++ b/src/canopy/canopy.fs @@ -8,12 +8,12 @@ open Microsoft.FSharp.Core.Printf open System.IO open System open configuration -open levenshtein open reporters open types open finders open System.Drawing open System.Drawing.Imaging +open EditDistance let mutable (failureMessage : string) = null let mutable wipTest = false @@ -199,8 +199,8 @@ let private suggestOtherSelectors cssSelector = |> Array.append texts |> Seq.distinct |> List.ofSeq |> remove "." |> remove "#" |> Array.ofList - |> Array.Parallel.map (fun u -> levenshtein cssSelector u) - |> Array.sortBy (fun r -> r.distance) + |> Array.Parallel.map (fun u -> editdistance cssSelector u) + |> Array.sortBy (fun r -> - r.similarity) if results.Length >= 5 then results diff --git a/src/canopy/canopy.fsproj b/src/canopy/canopy.fsproj index 34777cc4..19c96abb 100644 --- a/src/canopy/canopy.fsproj +++ b/src/canopy/canopy.fsproj @@ -55,7 +55,7 @@ - + diff --git a/src/canopy/jarowinkler.fs b/src/canopy/jarowinkler.fs new file mode 100644 index 00000000..e71e0c0b --- /dev/null +++ b/src/canopy/jarowinkler.fs @@ -0,0 +1,96 @@ +/// Functions to compute the edit distance between two strings +/// Taken from the Visual F# compiler +module canopy.EditDistance + +/// Given an offset and a radius from that offset, +/// does mChar exist in that part of str? 
+let inline existsInWin (mChar: char) (str: string) offset rad =
+    // Clamp the window [offset - rad, offset + rad] to the valid indices of str.
+    let lo = max 0 (offset - rad)
+    let hi = min (offset + rad) (String.length str - 1)
+    // Walk the clamped window left to right; an empty window (lo > hi) yields false.
+    let rec scan i =
+        i <= hi && (str.[i] = mChar || scan (i + 1))
+    scan lo
+
+/// The jaro distance between s1 and s2.
+/// Yields 0.0 when the score is undefined (an empty input or no common characters).
+let jaro s1 s2 =
+    let len1 = String.length s1
+    let len2 = String.length s2
+
+    // Match radius: half of the shorter string length, rounded up.
+    let matchRadius =
+        let shorter = min len1 len2
+        (shorter + 1) / 2
+
+    // The characters of `source`, in their original order, that also occur
+    // in `target` within matchRadius of their own index.
+    let commonChars (source: string) (target: string) =
+        [ for i in 0 .. source.Length - 1 do
+            if existsInWin source.[i] target i matchRadius then
+                yield source.[i] ]
+
+    // Common characters seen from each side, and their counts as floats.
+    let common1 = commonChars s1 s2
+    let common2 = commonChars s2 s1
+    let m1 = float (List.length common1)
+    let m2 = float (List.length common2)
+
+    // Positions at which the two common sequences disagree, plus one for
+    // every surplus character of the longer sequence; each counts as half
+    // a transposition.
+    let transpositions =
+        let mismatches =
+            Seq.zip common1 common2
+            |> Seq.sumBy (fun (a, b) -> if a <> b then 1.0 else 0.0)
+        (mismatches + abs (m1 - m2)) / 2.0
+
+    let mMax = max m1 m2
+
+    // 1/3 ( m1/|s1| + m2/|s2| + (m - t)/m )
+    let score =
+        (m1 / float len1 + m2 / float len2 + (mMax - transpositions) / mMax) / 3.0
+
+    // A zero length or zero match count makes a term 0/0, i.e. NaN.
+    if System.Double.IsNaN score then 0.0 else score
+
+type result = { selector : string; similarity : float }
+
+/// Calculates the Jaro-Winkler edit distance between two strings.
+/// The edit distance is a metric that allows to measure the amount of similarity between two strings.
+/// The Jaro score is boosted for strings sharing a common prefix: up to four
+/// matching leading characters count, each weighted by the scaling factor 0.1.
+let JaroWinklerDistance s1 s2 =
+    let jaroScore = jaro s1 s2
+    // Last index that exists in both strings
+    let maxLength = (min s1.Length s2.Length) - 1
+    // Accumulate the number of matching initial characters
+    let rec calcL i acc =
+        if i > maxLength || s1.[i] <> s2.[i] then acc
+        else calcL (i + 1) (acc + 1.0)
+    // Only the first four prefix characters contribute to the boost
+    let l = min (calcL 0 0.0) 4.0
+    // Calculate the JW distance
+    let p = 0.1
+    jaroScore + (l * p * (1.0 - jaroScore))
+
+// [snippet:Remove first occurrence from list]
+/// Returns lst without its first element equal to char.
+/// When no element matches, the result has the same elements as lst.
+let remove char lst =
+    // Tail-recursive walk so that long lists cannot overflow the stack;
+    // `seen` holds the already visited prefix in reverse order.
+    let rec loop seen rest =
+        match rest with
+        | [] -> List.rev seen
+        | h :: t when h = char -> List.rev seen @ t
+        | h :: t -> loop (h :: seen) t
+    loop [] lst
+
+/// Pairs the candidate s2 with its Jaro-Winkler similarity to s1.
+let editdistance s1 s2 = { selector = s2; similarity = JaroWinklerDistance s1 s2 }
\ No newline at end of file
diff --git a/src/canopy/levenshtein.fs b/src/canopy/levenshtein.fs deleted file mode 100644 index 80bd28f0..00000000 --- a/src/canopy/levenshtein.fs +++ /dev/null @@ -1,32 +0,0 @@ -module canopy.levenshtein - -type result = { selector : string; distance : int } - -// [snippet:Remove first ocurrence from list] -let rec remove char lst = - match lst with - | h::t when h = char -> t - | h::t -> h::remove char t - | _ -> [] - -//taken from http://fssnip.net/raw/bj -let levenshtein word1 word2 = - let preprocess = fun (str : string) -> str.ToLower().ToCharArray() - let chars1, chars2 = preprocess word1, preprocess word2 - let m, n = chars1.Length, chars2.Length - let table : int[,] = Array2D.zeroCreate (m + 1) (n + 1) - for i in 0..m do - for j in 0..n do - match i, j with - | i, 0 -> table.[i, j] <- i - | 0, j -> table.[i, j] <- j - | _, _ -> - let delete = table.[i-1, j] + 1 - let insert = table.[i, j-1] + 1 - //cost of substitution is 2 - let substitute = - if chars1.[i - 1] = chars2.[j - 1] - then table.[i-1, j-1] //same character - else table.[i-1, j-1] + 2 - table.[i, j] <- List.min [delete; insert; substitute] - { selector = word2; distance = table.[m, n] } From 7cfbf5be77c43778a7e6e01221fe67d3836bdd1c Mon Sep 17 00:00:00 2001 From: Steffen Forkmann Date: Fri, 6 Jan 2017 15:46:02 +0100 Subject: [PATCH 2/2] 
cleanup --- src/canopy/canopy.fs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/canopy/canopy.fs b/src/canopy/canopy.fs index 512525cf..b4a95ab6 100644 --- a/src/canopy/canopy.fs +++ b/src/canopy/canopy.fs @@ -202,15 +202,10 @@ let private suggestOtherSelectors cssSelector = |> Array.Parallel.map (fun u -> editdistance cssSelector u) |> Array.sortBy (fun r -> - r.similarity) - if results.Length >= 5 then - results - |> Seq.take 5 - |> Seq.map (fun r -> r.selector) |> List.ofSeq - |> (fun suggestions -> reporter.suggestSelectors cssSelector suggestions) - else - results - |> Seq.map (fun r -> r.selector) |> List.ofSeq - |> (fun suggestions -> reporter.suggestSelectors cssSelector suggestions) + results + |> fun xs -> if xs.Length >= 5 then Seq.take 5 xs else Array.toSeq xs + |> Seq.map (fun r -> r.selector) |> List.ofSeq + |> (fun suggestions -> reporter.suggestSelectors cssSelector suggestions) (* documented/actions *) let describe text =