Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Jaro-Winkler distance to give better predictions - fixes #1863 #1945

Merged
merged 1 commit into from
Dec 6, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/fsharp/ErrorResolutionHints.fs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ let FilterPredictions unknownIdent allPredictions =
allPredictions
|> Seq.toList
|> List.distinct
|> List.sortBy (fun s -> Internal.Utilities.EditDistance.CalcEditDistance(unknownIdent,s))
|> List.sortByDescending (Internal.Utilities.EditDistance.JaroWinklerDistance unknownIdent)
|> take 5

let FormatPredictions predictions =
Expand Down
9 changes: 6 additions & 3 deletions src/fsharp/FSharp.Compiler.Unittests/EditDistance.fs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ module EditDistance =
open Internal.Utilities.EditDistance

// Checks CalcEditDistance against a known pair: "CA" -> "ABC" is expected to cost 3 edits.
[<Test>]
[<TestCase("CA", "ABC", ExpectedResult = 3)>]
let RestrictedEditDistance (str1 : string, str2 : string) : int =
    (str1, str2) |> CalcEditDistance
// Each case compares the Jaro-Winkler score of the two inputs, rendered with
// exactly three decimal places, against the expected text.
[<TestCase("RICK", "RICK", ExpectedResult = "1.000")>]
[<TestCase("MARTHA", "MARHTA", ExpectedResult = "0.961")>]
[<TestCase("DWAYNE", "DUANE", ExpectedResult = "0.840")>]
[<TestCase("DIXON", "DICKSONX", ExpectedResult = "0.813")>]
let JaroWinklerTest (str1 : string, str2 : string) : string =
    // Format with the invariant culture so the decimal separator is always '.';
    // with the current culture a comma-decimal locale would produce e.g. "0,961"
    // and every expected value above would mismatch.
    String.Format(System.Globalization.CultureInfo.InvariantCulture, "{0:0.000}", JaroWinklerDistance str1 str2)
10 changes: 7 additions & 3 deletions src/fsharp/NameResolution.fs
Original file line number Diff line number Diff line change
Expand Up @@ -818,9 +818,13 @@ let AddResults res1 res2 =
| Result x,Result l -> Result (x @ l)
| Exception _,Result l -> Result l
| Result x,Exception _ -> Result x
// This prefers error messages with more predictions
| Exception (UndefinedName(n1,_,_,predictions1) as e1),Exception (UndefinedName(n2,_,_,predictions2) as e2) when n1 = n2 ->
if Set.count predictions1 < Set.count predictions2 then Exception e2 else Exception e1
| Exception (UndefinedName(n1,f,id1,predictions1) as e1),Exception (UndefinedName(n2,_,id2,predictions2) as e2) when n1 = n2 ->
if id1.idText = id2.idText && id1.idRange = id2.idRange then
// If we have error messages for the same symbol, then we can merge predictions.
Exception(UndefinedName(n1,f,id1,Set.union predictions1 predictions2))
else
// This prefers error messages with more predictions
if Set.count predictions1 < Set.count predictions2 then Exception e2 else Exception e1
// This prefers error messages coming from deeper failing long identifier paths
| Exception (UndefinedName(n1,_,_,_) as e1),Exception (UndefinedName(n2,_,_,_) as e2) ->
if n1 < n2 then Exception e2 else Exception e1
Expand Down
122 changes: 80 additions & 42 deletions src/utils/EditDistance.fs
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,84 @@
/// Functions to compute the edit distance between two strings
module internal Internal.Utilities.EditDistance

/// Computes the restricted Damerau-Levenshtein edit distance between `a` and `b`,
/// also known as the "optimal string alignment" distance.
/// - read more at https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
/// - Implementation taken from http://www.navision-blog.de/2008/11/01/damerau-levenshtein-distance-in-fsharp-part-ii/
let private calcDamerauLevenshtein (a:string, b:string) =
    let width = b.Length + 1
    // Only three rows of the table are kept: two rows back, one row back, and
    // the row currently being filled. The row for the empty prefix of `a` is 0..|b|.
    let mutable twoBack : int[] = Array.zeroCreate width
    let mutable oneBack : int[] = Array.init width id
    let mutable current : int[] = Array.zeroCreate width

    for row in 1 .. a.Length do
        let ca = a.[row-1]
        current.[0] <- row
        for col in 1 .. b.Length do
            let cb = b.[col-1]
            let cost = if ca = cb then 0 else 1
            // Cheapest of substitution, insertion and deletion.
            let viaEdit =
                min (oneBack.[col-1] + cost) (min (current.[col-1] + 1) (oneBack.[col] + 1))
            // A swap of two adjacent characters is allowed when the last two
            // characters of both prefixes are each other's mirror image.
            let canSwap = row > 1 && col > 1 && ca = b.[col-2] && a.[row-2] = cb
            current.[col] <-
                if canSwap then min viaEdit (twoBack.[col-2] + cost) else viaEdit

        // Rotate the rows; the oldest one is recycled as the next scratch row.
        let recycled = twoBack
        twoBack <- oneBack
        oneBack <- current
        current <- recycled

    oneBack.[b.Length]

/// Calculates the edit distance between two strings.
/// The edit distance is a metric that measures how different two strings are:
/// it counts how many edit operations (insert, delete, substitution) are needed
/// to transform one string into the other.
let CalcEditDistance(a:string, b:string) =
    // The string that is passed first is never the shorter one.
    if a.Length > b.Length then
        calcDamerauLevenshtein(a, b)
    else
        calcDamerauLevenshtein(b, a)
/// Given an offset and a radius from that offset, does mChar exist in that part of str?
/// The window covers the indices offset - rad .. offset + rad (inclusive), clamped to
/// the bounds of str. Returns false when the clamped window is empty, e.g. when
/// str is empty or the window starts past the last character.
let inline existsInWin (mChar: char) (str: string) offset rad =
    let startAt = max 0 (offset - rad)
    let endAt = min (offset + rad) (String.length str - 1)
    if endAt < startAt then false
    else
        // Linear scan over the inclusive range startAt .. endAt.
        let rec exists index =
            if str.[index] = mChar then true
            elif index = endAt then false
            else exists (index + 1)
        exists startAt

/// The jaro distance between s1 and s2, as a float.
/// Yields 0.0 whenever the formula is undefined (the result is NaN), which
/// happens when either string is empty or no characters are matched.
let jaro s1 s2 =
    let len1 = String.length s1
    let len2 = String.length s2

    // Search radius: half of the shorter string's length, rounded up.
    let matchRadius =
        let shorter = min len1 len2
        shorter / 2 + shorter % 2

    // The characters of `source`, in their original order, that also occur in
    // `target` within matchRadius positions of their own index.
    let commonChars (source: string) (target: string) =
        [ for i in 0 .. source.Length - 1 do
            if existsInWin source.[i] target i matchRadius then
                yield source.[i] ]

    let common1 = commonChars s1 s2
    let common2 = commonChars s2 s1
    let m1 = float (List.length common1)
    let m2 = float (List.length common2)

    // Position-wise mismatches between the two common-character sequences;
    // every surplus character of the longer sequence counts as a mismatch too,
    // and two mismatches make up one transposition.
    let transpositions =
        let mismatches =
            Seq.zip common1 common2
            |> Seq.filter (fun (x, y) -> x <> y)
            |> Seq.length
            |> float
        (mismatches + abs (m1 - m2)) / 2.0

    let longest = max m1 m2

    // 1/3 ( m1/|s1| + m2/|s2| + (m - t)/m )
    let score = (m1 / float len1 + m2 / float len2 + (longest - transpositions) / longest) / 3.0

    // A zero denominator above produces NaN; report that as no similarity.
    if System.Double.IsNaN score then 0.0 else score

/// Calculates the Jaro-Winkler edit distance between two strings.
/// The result measures how similar the two strings are: the jaro score is
/// raised for strings that share a common prefix (at most 4 characters count).
let JaroWinklerDistance s1 s2 =
    let baseScore = jaro s1 s2

    // Count the leading characters on which both strings agree.
    let limit = min s1.Length s2.Length
    let mutable prefix = 0
    while prefix < limit && s1.[prefix] = s2.[prefix] do
        prefix <- prefix + 1

    // Only the first four matching characters contribute to the bonus.
    let boost = min (float prefix) 4.0
    // Scaling factor applied per matching prefix character.
    let scaling = 0.1
    baseScore + (boost * scaling * (1.0 - baseScore))
Loading