Skip to content

Commit

Permalink
Use Jaro-Winkler distance to give better predictions - fixes #1863 (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
Rickasaurus authored and KevinRansom committed Dec 6, 2016
1 parent 01c2555 commit 037bd0d
Show file tree
Hide file tree
Showing 12 changed files with 200 additions and 144 deletions.
2 changes: 1 addition & 1 deletion src/fsharp/ErrorResolutionHints.fs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ let FilterPredictions unknownIdent allPredictions =
allPredictions
|> Seq.toList
|> List.distinct
|> List.sortBy (fun s -> Internal.Utilities.EditDistance.CalcEditDistance(unknownIdent,s))
|> List.sortByDescending (Internal.Utilities.EditDistance.JaroWinklerDistance unknownIdent)
|> take 5

let FormatPredictions predictions =
Expand Down
9 changes: 6 additions & 3 deletions src/fsharp/FSharp.Compiler.Unittests/EditDistance.fs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ module EditDistance =
open Internal.Utilities.EditDistance

/// Checks CalcEditDistance against a known pair: the test case expects
/// a distance of 3 between "CA" and "ABC".
[<Test>]
[<TestCase("CA", "ABC", ExpectedResult = 3)>]
let RestrictedEditDistance (str1 : string, str2 : string) : int =
CalcEditDistance (str1, str2)
/// Checks JaroWinklerDistance against reference pairs. The score is compared
/// as text rounded to three decimal places so that the expected values can be
/// written exactly.
[<TestCase("RICK", "RICK", ExpectedResult = "1.000")>]
[<TestCase("MARTHA", "MARHTA", ExpectedResult = "0.961")>]
[<TestCase("DWAYNE", "DUANE", ExpectedResult = "0.840")>]
[<TestCase("DIXON", "DICKSONX", ExpectedResult = "0.813")>]
let JaroWinklerTest (str1 : string, str2 : string) : string =
    let score : float = JaroWinklerDistance str1 str2
    // Format with the invariant culture: the expected strings use '.' as the
    // decimal separator, which a culture-sensitive format does not guarantee
    // (e.g. "0,961" on a machine whose current culture uses a decimal comma).
    score.ToString("0.000", System.Globalization.CultureInfo.InvariantCulture)
10 changes: 7 additions & 3 deletions src/fsharp/NameResolution.fs
Original file line number Diff line number Diff line change
Expand Up @@ -818,9 +818,13 @@ let AddResults res1 res2 =
| Result x,Result l -> Result (x @ l)
| Exception _,Result l -> Result l
| Result x,Exception _ -> Result x
// This prefers error messages with more predictions
| Exception (UndefinedName(n1,_,_,predictions1) as e1),Exception (UndefinedName(n2,_,_,predictions2) as e2) when n1 = n2 ->
if Set.count predictions1 < Set.count predictions2 then Exception e2 else Exception e1
| Exception (UndefinedName(n1,f,id1,predictions1) as e1),Exception (UndefinedName(n2,_,id2,predictions2) as e2) when n1 = n2 ->
if id1.idText = id2.idText && id1.idRange = id2.idRange then
// If we have error messages for the same symbol, then we can merge predictions.
Exception(UndefinedName(n1,f,id1,Set.union predictions1 predictions2))
else
// This prefers error messages with more predictions
if Set.count predictions1 < Set.count predictions2 then Exception e2 else Exception e1
// This prefers error messages coming from deeper failing long identifier paths
| Exception (UndefinedName(n1,_,_,_) as e1),Exception (UndefinedName(n2,_,_,_) as e2) ->
if n1 < n2 then Exception e2 else Exception e1
Expand Down
122 changes: 80 additions & 42 deletions src/utils/EditDistance.fs
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,84 @@
/// Functions to compute the edit distance between two strings
module internal Internal.Utilities.EditDistance

/// Computes the restricted Damerau-Levenshtein edit distance between a and b,
/// also known as the "optimal string alignment" distance: insertions,
/// deletions, substitutions and swaps of two adjacent characters each cost 1.
/// - read more at https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
/// - Implementation taken from http://www.navision-blog.de/2008/11/01/damerau-levenshtein-distance-in-fsharp-part-ii/
let private calcDamerauLevenshtein (a:string, b:string) =
    let width = b.Length + 1
    // Only three rows of the distance table are kept alive at any time:
    // the row two steps back, the previous row and the row being filled.
    let mutable prevPrevRow : int[] = Array.zeroCreate width
    let mutable prevRow : int[] = Array.init width id
    let mutable currRow : int[] = Array.zeroCreate width

    for row in 1 .. a.Length do
        let ca = a.[row-1]
        currRow.[0] <- row
        for col in 1 .. b.Length do
            let cb = b.[col-1]
            let cost = if ca = cb then 0 else 1
            let substitution = prevRow.[col-1] + cost
            let insertion = currRow.[col-1] + 1
            let deletion = prevRow.[col] + 1
            let best = min substitution (min insertion deletion)
            // The last two characters of both prefixes are the same pair in
            // opposite order, so an adjacent swap is also a candidate.
            let isSwap = row > 1 && col > 1 && ca = b.[col-2] && a.[row-2] = cb
            currRow.[col] <-
                if isSwap then min best (prevPrevRow.[col-2] + cost) else best

        // Rotate the rows; the oldest one is recycled as the next current row.
        let recycled = prevPrevRow
        prevPrevRow <- prevRow
        prevRow <- currRow
        currRow <- recycled

    prevRow.[b.Length]

/// Calculates the edit distance between two strings.
/// The edit distance measures how different two strings are: it is the number of
/// edit operations (insert, delete, substitute) needed to transform one string into the other.
/// The longer of the two strings is passed to calcDamerauLevenshtein as the first argument.
let CalcEditDistance(a:string, b:string) =
if a.Length > b.Length then
calcDamerauLevenshtein(a, b)
/// Returns true when mChar occurs in str at an index between
/// offset - rad and offset + rad (inclusive), clamped to the bounds of str.
let inline existsInWin (mChar: char) (str: string) offset rad =
let startAt = max 0 (offset - rad)
let endAt = min (offset + rad) (String.length str - 1)
// An empty window (startAt lies past endAt) cannot contain a match.
if endAt - startAt < 0 then false
else
calcDamerauLevenshtein(b, a)
// Scan the window from startAt upwards, stopping at the first match or at endAt.
let rec exists index =
if str.[index] = mChar then true
elif index = endAt then false
else exists (index + 1)
exists startAt

/// Computes the Jaro score of s1 and s2 as a float.
/// Returns 0.0 when the score is undefined, i.e. when either string is
/// empty or the two strings have no common characters.
let jaro s1 s2 =
    let len1 = String.length s1
    let len2 = String.length s2

    // Search window: half of the shorter length, rounded up.
    let matchRadius =
        let shorter = min len1 len2
        (shorter + 1) / 2

    // The characters of source that also occur in target within
    // matchRadius of their own index, kept in source order.
    let commonChars (source: string) (target: string) =
        [ for i in 0 .. source.Length - 1 do
            if existsInWin source.[i] target i matchRadius then
                yield source.[i] ]

    let c1 = commonChars s1 s2
    let c2 = commonChars s2 s1
    let c1length = float (List.length c1)
    let c2length = float (List.length c2)

    // Count the positions at which the two common-character lists disagree.
    // Each surplus character of the longer list counts as a further
    // disagreement; two disagreements make one transposition.
    let transpositions =
        let mismatches =
            Seq.zip c1 c2
            |> Seq.sumBy (fun (x, y) -> if x <> y then 1.0 else 0.0)
        (mismatches + abs (c1length - c2length)) / 2.0

    let tLength = max c1length c2length

    // 1/3 ( m1/|s1| + m2/|s2| + (m - t)/m )
    let result =
        (c1length / float len1 + c2length / float len2 + (tLength - transpositions) / tLength) / 3.0

    // A zero denominator above yields NaN; report that as "no similarity".
    if System.Double.IsNaN result then 0.0 else result

/// Calculates the Jaro-Winkler score of two strings: the Jaro score of the
/// pair plus a bonus for a shared prefix of up to four characters.
/// Identical strings score 1.0; larger values mean more similar strings.
let JaroWinklerDistance s1 s2 =
    let baseScore = jaro s1 s2
    // Only the first four characters can contribute to the prefix bonus.
    let prefixCap = min 4 (min s1.Length s2.Length)
    let rec sharedPrefix n =
        if n < prefixCap && s1.[n] = s2.[n] then sharedPrefix (n + 1) else n
    let l = float (sharedPrefix 0)
    // Scaling factor applied per matching prefix character
    let p = 0.1
    baseScore + (l * p * (1.0 - baseScore))
Loading

0 comments on commit 037bd0d

Please sign in to comment.