diff --git a/src/canopy/canopy.fs b/src/canopy/canopy.fs
index d18771f7..b4a95ab6 100644
--- a/src/canopy/canopy.fs
+++ b/src/canopy/canopy.fs
@@ -8,12 +8,12 @@ open Microsoft.FSharp.Core.Printf
open System.IO
open System
open configuration
-open levenshtein
open reporters
open types
open finders
open System.Drawing
open System.Drawing.Imaging
+open EditDistance
let mutable (failureMessage : string) = null
let mutable wipTest = false
@@ -199,18 +199,13 @@ let private suggestOtherSelectors cssSelector =
|> Array.append texts
|> Seq.distinct |> List.ofSeq
|> remove "." |> remove "#" |> Array.ofList
- |> Array.Parallel.map (fun u -> levenshtein cssSelector u)
- |> Array.sortBy (fun r -> r.distance)
-
- if results.Length >= 5 then
- results
- |> Seq.take 5
- |> Seq.map (fun r -> r.selector) |> List.ofSeq
- |> (fun suggestions -> reporter.suggestSelectors cssSelector suggestions)
- else
- results
- |> Seq.map (fun r -> r.selector) |> List.ofSeq
- |> (fun suggestions -> reporter.suggestSelectors cssSelector suggestions)
+ |> Array.Parallel.map (fun u -> editdistance cssSelector u)
+ |> Array.sortBy (fun r -> - r.similarity)
+
+ results
+ |> fun xs -> if xs.Length >= 5 then Seq.take 5 xs else Array.toSeq xs
+ |> Seq.map (fun r -> r.selector) |> List.ofSeq
+ |> (fun suggestions -> reporter.suggestSelectors cssSelector suggestions)
(* documented/actions *)
let describe text =
diff --git a/src/canopy/canopy.fsproj b/src/canopy/canopy.fsproj
index 34777cc4..19c96abb 100644
--- a/src/canopy/canopy.fsproj
+++ b/src/canopy/canopy.fsproj
@@ -55,7 +55,7 @@
-
+
diff --git a/src/canopy/jarowinkler.fs b/src/canopy/jarowinkler.fs
new file mode 100644
index 00000000..e71e0c0b
--- /dev/null
+++ b/src/canopy/jarowinkler.fs
@@ -0,0 +1,96 @@
+/// Functions to compute the edit distance between two strings
+/// Taken from the Visual F# compiler
+module canopy.EditDistance
+
+/// Reports whether mChar occurs in str at an index no more than rad
+/// positions away from offset (the window is clipped to the ends of str).
+let inline existsInWin (mChar: char) (str: string) offset rad =
+    // Inclusive bounds of the search window, limited to valid indices.
+    let first = max 0 (offset - rad)
+    let last = min (offset + rad) (String.length str - 1)
+    // Scan left to right; passing `last` (or an empty window) means no match.
+    let rec scan i =
+        if i > last then false
+        elif str.[i] = mChar then true
+        else scan (i + 1)
+    scan first
+
+/// The Jaro similarity between s1 and s2. A higher score means the strings
+/// are more alike; 0.0 is returned when either string is empty or no
+/// characters match.
+let jaro s1 s2 =
+    // A character only counts as matched when it occurs in the other
+    // string at most this many positions away from its own index:
+    // half the shorter length, rounded up.
+    let matchRadius =
+        let shorter = min (String.length s1) (String.length s2)
+        // Adding one before halving rounds odd lengths up.
+        (shorter + 1) / 2
+
+    // The characters of `source`, in their original order, that also
+    // occur in `target` within matchRadius of the same index.
+    let commonChars (source: string) (target: string) =
+        [ for i in 0 .. source.Length - 1 do
+            if existsInWin source.[i] target i matchRadius then
+                yield source.[i] ]
+
+    // Matched characters as seen from each side,
+    // and how many there are (as floats for the arithmetic below).
+    let matched1 = commonChars s1 s2
+    let matched2 = commonChars s2 s1
+    let count1 = float (List.length matched1)
+    let count2 = float (List.length matched2)
+
+    // Pair the two match lists position by position (stopping at the
+    // shorter one) and count the pairs whose characters differ.
+    let mismatches =
+        Seq.zip matched1 matched2
+        |> Seq.filter (fun (left, right) -> left <> right)
+        |> Seq.length
+        |> float
+
+    // Each surplus character in the longer match list is counted like a
+    // mismatch, and two mismatches make up one transposition.
+    let transpositions = (mismatches + abs (count1 - count2)) / 2.0
+
+    // String lengths as floats, so the divisions below are not truncated.
+    let length1 = float (String.length s1)
+    let length2 = float (String.length s2)
+    let matchCount = max count1 count2
+
+    // The score is 1/3 ( m1/|s1| + m2/|s2| + (m - t)/m ), where m1 and m2
+    // are the two match counts, m is the larger of them and t is the
+    // number of transpositions.
+    let share1 = count1 / length1
+    let share2 = count2 / length2
+    let ordered = (matchCount - transpositions) / matchCount
+    let score = (share1 + share2 + ordered) / 3.0
+
+    // A zero denominator (an empty string, or no matches at all) turns
+    // the formula into 0/0 = NaN; report that as no similarity.
+    if System.Double.IsNaN score then 0.0 else score
+/// A candidate selector together with its similarity score, as produced by editdistance.
+type result = { selector : string; similarity : float }
+
+/// Calculates the Jaro-Winkler similarity between two strings: the Jaro
+/// score, raised when the strings share a prefix. Higher means more alike.
+let JaroWinklerDistance s1 s2 =
+    let jaroScore = jaro s1 s2
+    // Length of the common prefix, capped at four characters.
+    let prefixLimit = min 4 (min s1.Length s2.Length)
+    let rec sharedPrefix i =
+        if i < prefixLimit && s1.[i] = s2.[i] then sharedPrefix (i + 1) else i
+    let prefixLength = float (sharedPrefix 0)
+    // Every shared prefix character adds a tenth of the gap between the
+    // Jaro score and 1.0.
+    let scaling = 0.1
+    jaroScore + (prefixLength * scaling * (1.0 - jaroScore))
+
+// Drops the first element of `lst` equal to `char`; every other element,
+// including later duplicates, is kept in its original order.
+let rec remove char lst =
+    match lst with
+    | [] -> []
+    | head :: rest -> if head = char then rest else head :: remove char rest
+/// Pairs s2 with its Jaro-Winkler score against s1; s2 is the selector kept in the result.
+let editdistance s1 s2 = { selector = s2; similarity = JaroWinklerDistance s1 s2 }
\ No newline at end of file
diff --git a/src/canopy/levenshtein.fs b/src/canopy/levenshtein.fs
deleted file mode 100644
index 80bd28f0..00000000
--- a/src/canopy/levenshtein.fs
+++ /dev/null
@@ -1,32 +0,0 @@
-module canopy.levenshtein
-
-type result = { selector : string; distance : int }
-
-// [snippet:Remove first ocurrence from list]
-let rec remove char lst =
- match lst with
- | h::t when h = char -> t
- | h::t -> h::remove char t
- | _ -> []
-
-//taken from http://fssnip.net/raw/bj
-let levenshtein word1 word2 =
- let preprocess = fun (str : string) -> str.ToLower().ToCharArray()
- let chars1, chars2 = preprocess word1, preprocess word2
- let m, n = chars1.Length, chars2.Length
- let table : int[,] = Array2D.zeroCreate (m + 1) (n + 1)
- for i in 0..m do
- for j in 0..n do
- match i, j with
- | i, 0 -> table.[i, j] <- i
- | 0, j -> table.[i, j] <- j
- | _, _ ->
- let delete = table.[i-1, j] + 1
- let insert = table.[i, j-1] + 1
- //cost of substitution is 2
- let substitute =
- if chars1.[i - 1] = chars2.[j - 1]
- then table.[i-1, j-1] //same character
- else table.[i-1, j-1] + 2
- table.[i, j] <- List.min [delete; insert; substitute]
- { selector = word2; distance = table.[m, n] }