Skip to content

Commit

Permalink
Merge pull request #323 from forki/jarowinkler
Browse files Browse the repository at this point in the history
Use Jaro-Winkler instead of Levenshtein
  • Loading branch information
lefthandedgoat authored Jan 6, 2017
2 parents cb3e8a6 + 7cfbf5b commit 5cd3dcf
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 46 deletions.
21 changes: 8 additions & 13 deletions src/canopy/canopy.fs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ open Microsoft.FSharp.Core.Printf
open System.IO
open System
open configuration
open levenshtein
open reporters
open types
open finders
open System.Drawing
open System.Drawing.Imaging
open EditDistance

let mutable (failureMessage : string) = null
let mutable wipTest = false
Expand Down Expand Up @@ -199,18 +199,13 @@ let private suggestOtherSelectors cssSelector =
|> Array.append texts
|> Seq.distinct |> List.ofSeq
|> remove "." |> remove "#" |> Array.ofList
|> Array.Parallel.map (fun u -> levenshtein cssSelector u)
|> Array.sortBy (fun r -> r.distance)

if results.Length >= 5 then
results
|> Seq.take 5
|> Seq.map (fun r -> r.selector) |> List.ofSeq
|> (fun suggestions -> reporter.suggestSelectors cssSelector suggestions)
else
results
|> Seq.map (fun r -> r.selector) |> List.ofSeq
|> (fun suggestions -> reporter.suggestSelectors cssSelector suggestions)
|> Array.Parallel.map (fun u -> editdistance cssSelector u)
|> Array.sortBy (fun r -> - r.similarity)

results
|> fun xs -> if xs.Length >= 5 then Seq.take 5 xs else Array.toSeq xs
|> Seq.map (fun r -> r.selector) |> List.ofSeq
|> (fun suggestions -> reporter.suggestSelectors cssSelector suggestions)

(* documented/actions *)
let describe text =
Expand Down
2 changes: 1 addition & 1 deletion src/canopy/canopy.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
<Compile Include="reporters.fs" />
<Compile Include="finders.fs" />
<Compile Include="configuration.fs" />
<Compile Include="levenshtein.fs" />
<Compile Include="jarowinkler.fs" />
<Compile Include="canopy.fs" />
<Compile Include="history.fs" />
<Compile Include="runner.fs" />
Expand Down
96 changes: 96 additions & 0 deletions src/canopy/jarowinkler.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/// Functions to compute the edit distance between two strings
/// Taken from the Visual F# compiler
module canopy.EditDistance

/// Reports whether mChar occurs in str at any index within rad positions
/// of offset. The window is clamped to the valid index range of str, so an
/// empty string or a window lying entirely outside the string yields false.
let inline existsInWin (mChar: char) (str: string) offset rad =
    let first = max 0 (offset - rad)
    let last = min (offset + rad) (String.length str - 1)
    let mutable found = false
    let mutable i = first
    // When last < first the window is empty and the loop never runs.
    while not found && i <= last do
        if str.[i] = mChar then found <- true
        i <- i + 1
    found

/// Computes the Jaro score of s1 and s2 as
/// 1/3 * ( m1/|s1| + m2/|s2| + (m - t)/m ), where m1 and m2 are the numbers
/// of characters of each string found in the other within the match radius,
/// m is the larger of the two and t is the transposition count.
/// Returns 0.0 when the formula is undefined (a zero length or no matches).
let jaro s1 s2 =
    let len1 = String.length s1
    let len2 = String.length s2

    // Search radius: half of the shorter string's length, rounded up.
    let matchRadius =
        let shorter = min len1 len2
        (shorter + 1) / 2

    // Characters of `source`, in order, that also occur in `target`
    // within matchRadius positions of the same index.
    let commonChars (source: string) (target: string) =
        [ for i in 0 .. source.Length - 1 do
            if existsInWin source.[i] target i matchRadius then
                yield source.[i] ]

    let common1 = commonChars s1 s2
    let common2 = commonChars s2 s1
    let m1 = float (List.length common1)
    let m2 = float (List.length common2)

    // Position-wise mismatches between the two common-character lists
    // (compared up to the shorter one); every surplus character of the
    // longer list also counts. The total is halved.
    let transpositions =
        let mismatches =
            Seq.zip common1 common2
            |> Seq.filter (fun (a, b) -> a <> b)
            |> Seq.length
            |> float
        (mismatches + abs (m1 - m2)) / 2.0

    let m = max m1 m2

    let score =
        (m1 / float len1 +
         m2 / float len2 +
         (m - transpositions) / m)
        / 3.0

    // 0/0 arises when either string is empty or nothing matched.
    if System.Double.IsNaN score then 0.0 else score

/// A candidate selector together with its similarity score.
/// Values are built by `editdistance`, which stores the compared string in
/// `selector` and the `JaroWinklerDistance` score in `similarity`.
type result = { selector : string; similarity : float }

/// Calculates the Jaro-Winkler score of two strings: the Jaro score boosted
/// for a shared prefix. With j = jaro s1 s2, l = number of matching leading
/// characters (capped at 4) and p = 0.1, the result is j + l * p * (1 - j).
let JaroWinklerDistance s1 s2 =
    let jaroScore = jaro s1 s2
    // Only positions present in both strings can be part of a common prefix.
    let limit = min s1.Length s2.Length
    let mutable prefix = 0
    while prefix < limit && s1.[prefix] = s2.[prefix] do
        prefix <- prefix + 1
    // The prefix bonus is capped at four characters.
    let l = min (float prefix) 4.0
    // Scaling factor applied per matching prefix character.
    let p = 0.1
    jaroScore + (l * p * (1.0 - jaroScore))

// [snippet:Remove first occurrence from list]
/// Returns lst without the first element equal to char; the order of all
/// other elements is preserved. When no element matches (including the
/// empty list) the list is returned unchanged.
/// Implemented with an accumulator so the traversal is tail-recursive and
/// stack use does not grow with the length of the list.
let remove char lst =
    let rec go skipped rest =
        match rest with
        // Nothing matched: hand back the original list as-is.
        | [] -> lst
        // First match: drop it and re-attach the skipped prefix, which was
        // accumulated in reverse, in front of the untouched tail.
        | h :: t when h = char -> List.fold (fun acc x -> x :: acc) t skipped
        | h :: t -> go (h :: skipped) t
    go [] lst

/// Scores s2 against s1 with JaroWinklerDistance and packages the outcome
/// as a result record whose selector is s2.
let editdistance s1 s2 =
    let score = JaroWinklerDistance s1 s2
    { selector = s2; similarity = score }
32 changes: 0 additions & 32 deletions src/canopy/levenshtein.fs

This file was deleted.

0 comments on commit 5cd3dcf

Please sign in to comment.