From 7e618591caa63e69addfb639c645e74df6aa11a5 Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Thu, 3 Oct 2019 12:44:14 -0500 Subject: [PATCH 1/3] cmd/server: return the maximum between soundex and jaro-winkler in search --- cmd/server/search.go | 54 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/cmd/server/search.go b/cmd/server/search.go index 8bc3dd40..cad66eb0 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -9,6 +9,7 @@ import ( "encoding/json" "errors" "fmt" + "math" "net/http" "regexp" "strconv" @@ -69,9 +70,13 @@ var ( // topAddressesAddress is a compare method for TopAddressesFn to extract and rank .Address topAddressesAddress = func(needleAddr string) func(*Address) *item { return func(add *Address) *item { + needle := precompute(needleAddr) return &item{ - value: add, - weight: jaroWinkler(add.address, precompute(needleAddr)), + value: add, + weight: math.Max( + soundex(add.address, needle), + jaroWinkler(add.address, needle), + ), } } } @@ -81,9 +86,13 @@ var ( // search criteria. 
topAddressesCityState = func(needleCityState string) func(*Address) *item { return func(add *Address) *item { + needle := precompute(needleCityState) return &item{ - value: add, - weight: jaroWinkler(add.citystate, precompute(needleCityState)), + value: add, + weight: math.Max( + soundex(add.citystate, needle), + jaroWinkler(add.citystate, needle), + ), } } } @@ -91,9 +100,13 @@ var ( // topAddressesCountry is a compare method for TopAddressesFn to extract and rank .Country topAddressesCountry = func(needleCountry string) func(*Address) *item { return func(add *Address) *item { + needle := precompute(needleCountry) return &item{ - value: add, - weight: jaroWinkler(add.country, precompute(needleCountry)), + value: add, + weight: math.Max( + soundex(add.country, needle), + jaroWinkler(add.country, needle), + ), } } } @@ -180,8 +193,11 @@ func (s *searcher) TopAltNames(limit int, alt string) []Alt { for i := range s.Alts { xs.add(&item{ - value: s.Alts[i], - weight: jaroWinkler(s.Alts[i].name, alt), + value: s.Alts[i], + weight: math.Max( + soundex(s.Alts[i].name, alt), + jaroWinkler(s.Alts[i].name, alt), + ), }) } @@ -252,8 +268,11 @@ func (s *searcher) TopSDNs(limit int, name string) []SDN { for i := range s.SDNs { xs.add(&item{ - value: s.SDNs[i], - weight: jaroWinkler(s.SDNs[i].name, name), + value: s.SDNs[i], + weight: math.Max( + soundex(s.SDNs[i].name, name), + jaroWinkler(s.SDNs[i].name, name), + ), }) } @@ -285,8 +304,11 @@ func (s *searcher) TopDPs(limit int, name string) []DP { for _, dp := range s.DPs { xs.add(&item{ - value: dp, - weight: jaroWinkler(dp.name, name), + value: dp, + weight: math.Max( + soundex(dp.name, name), + jaroWinkler(dp.name, name), + ), }) } @@ -543,6 +565,14 @@ func jaroWinkler(s1, s2 string) float64 { return max } +// soundex will phonetically normalize two strings and then return their jaro-winkler distance +// as a percentage. 
+// +// For more details see https://en.wikipedia.org/wiki/Soundex +func soundex(s1, s2 string) float64 { + return jaroWinkler(smetrics.Soundex(s1), smetrics.Soundex(s2)) +} + // extractIDFromRemark attempts to parse out a National ID or similar governmental ID value // from an SDN's remarks property. // From 194038c912eb4577fd12274863bcacb3600ecdc6 Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Thu, 3 Oct 2019 12:57:21 -0500 Subject: [PATCH 2/3] cmd/server: adjust tests after adding soundex to search routes --- cmd/server/issue115_test.go | 6 +++--- cmd/server/search_handlers_test.go | 12 ++++++------ cmd/server/search_test.go | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cmd/server/issue115_test.go b/cmd/server/issue115_test.go index d0f82cb4..4b3fe9d2 100644 --- a/cmd/server/issue115_test.go +++ b/cmd/server/issue115_test.go @@ -27,12 +27,12 @@ func TestIssue115__TopSDNs(t *testing.T) { // was 89.6% match s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "2680", SDNName: "HABBASH, George", SDNType: "INDIVIDUAL"}}) out := s.TopSDNs(1, "george bush") - eql(t, "issue115: top SDN 2680", out[0].match, 0.896) + eql(t, "issue115: top SDN 2680", out[0].match, 1.0) // was 88.3% match s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "9432", SDNName: "CHIWESHE, George", SDNType: "INDIVIDUAL"}}) out = s.TopSDNs(1, "george bush") - eql(t, "issue115: top SDN 18996", out[0].match, 0.849) + eql(t, "issue115: top SDN 18996", out[0].match, 0.883) // another example s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "0", SDNName: "Bush, George W", SDNType: "INDIVIDUAL"}}) @@ -41,7 +41,7 @@ func TestIssue115__TopSDNs(t *testing.T) { } out = s.TopSDNs(1, "george bush") - eql(t, "issue115: top SDN 0", out[0].match, 0.942) + eql(t, "issue115: top SDN 0", out[0].match, 1.0) out = s.TopSDNs(1, "george w bush") eql(t, "issue115: top SDN 0", out[0].match, 1.0) diff --git a/cmd/server/search_handlers_test.go b/cmd/server/search_handlers_test.go index 
e57988ff..95988c01 100644 --- a/cmd/server/search_handlers_test.go +++ b/cmd/server/search_handlers_test.go @@ -30,7 +30,7 @@ func TestSearch__Address(t *testing.T) { t.Errorf("bogus status code: %d", w.Code) } - if v := w.Body.String(); !strings.Contains(v, `"match":0.9229`) { + if v := w.Body.String(); !strings.Contains(v, `"match":1`) { t.Errorf("%#v", v) } @@ -86,7 +86,7 @@ func TestSearch__AddressMulti(t *testing.T) { t.Errorf("bogus status code: %d", w.Code) } - if v := w.Body.String(); !strings.Contains(v, `"match":0.945`) { + if v := w.Body.String(); !strings.Contains(v, `"match":1`) { t.Errorf("%#v", v) } } @@ -104,7 +104,7 @@ func TestSearch__AddressProvidence(t *testing.T) { t.Errorf("bogus status code: %d", w.Code) } - if v := w.Body.String(); !strings.Contains(v, `"match":0.963`) { + if v := w.Body.String(); !strings.Contains(v, `"match":1`) { t.Errorf("%#v", v) } } @@ -122,7 +122,7 @@ func TestSearch__AddressCity(t *testing.T) { t.Errorf("bogus status code: %d", w.Code) } - if v := w.Body.String(); !strings.Contains(v, `"match":0.963`) { + if v := w.Body.String(); !strings.Contains(v, `"match":1`) { t.Errorf("%#v", v) } } @@ -140,7 +140,7 @@ func TestSearch__AddressState(t *testing.T) { t.Errorf("bogus status code: %d", w.Code) } - if v := w.Body.String(); !strings.Contains(v, `"match":0.963`) { + if v := w.Body.String(); !strings.Contains(v, `"match":1`) { t.Errorf("%#v", v) } } @@ -178,7 +178,7 @@ func TestSearch__NameAndAltName(t *testing.T) { if wrapper.SDNs[0].EntityID != "2681" { t.Errorf("%#v", wrapper.SDNs[0]) } - if wrapper.AltNames[0].EntityID != "4691" { + if wrapper.AltNames[0].EntityID != "559" { t.Errorf("%#v", wrapper.AltNames[0].EntityID) } if wrapper.Addresses[0].EntityID != "173" { diff --git a/cmd/server/search_test.go b/cmd/server/search_test.go index 16abb29c..7420b9b3 100644 --- a/cmd/server/search_test.go +++ b/cmd/server/search_test.go @@ -247,7 +247,7 @@ func TestSearch_liveData(t *testing.T) { func 
TestSearch__topAddressesAddress(t *testing.T) { it := topAddressesAddress("needle")(&Address{address: "needleee"}) - eql(t, "topAddressesAddress", it.weight, 0.950) + eql(t, "topAddressesAddress", it.weight, 1.0) if add, ok := it.value.(*Address); !ok || add.address != "needleee" { t.Errorf("got %#v", add) } @@ -256,7 +256,7 @@ func TestSearch__topAddressesAddress(t *testing.T) { func TestSearch__topAddressesCountry(t *testing.T) { it := topAddressesAddress("needle")(&Address{address: "needleee"}) - eql(t, "topAddressesCountry", it.weight, 0.950) + eql(t, "topAddressesCountry", it.weight, 1.0) if add, ok := it.value.(*Address); !ok || add.address != "needleee" { t.Errorf("got %#v", add) } @@ -268,7 +268,7 @@ func TestSearch__multiAddressCompare(t *testing.T) { topAddressesCountry("other"), )(&Address{address: "needlee", country: "other"}) - eql(t, "multiAddressCompare", it.weight, 0.986) + eql(t, "multiAddressCompare", it.weight, 1.0) if add, ok := it.value.(*Address); !ok || add.address != "needlee" || add.country != "other" { t.Errorf("got %#v", add) } From a3d97c134407c7344d4de07e8e3d1555144709b0 Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Thu, 3 Oct 2019 13:18:59 -0500 Subject: [PATCH 3/3] cmd/server: we should soundex against each word, not the full strings --- cmd/server/search.go | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/cmd/server/search.go b/cmd/server/search.go index cad66eb0..691865f8 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -531,13 +531,29 @@ func extractSearchLimit(r *http.Request) int { // Right now s1 is assumes to have been passed through `chomp(..)` already and so this // func only calls `chomp` for s2. 
func jaroWinkler(s1, s2 string) float64 { + return rankStrings(s1, s2, func(s1, s2 string) float64 { + return smetrics.JaroWinkler(s1, s2, 0.7, 4) + }) +} + +// soundex will phonetically normalize each word of the two strings and then return the jaro-winkler +// distance of those normalized words as a percentage. +// +// For more details see https://en.wikipedia.org/wiki/Soundex +func soundex(s1, s2 string) float64 { + return rankStrings(s1, s2, func(s1, s2 string) float64 { + return jaroWinkler(smetrics.Soundex(s1), smetrics.Soundex(s2)) + }) +} + +func rankStrings(s1, s2 string, f func(s1, s2 string) float64) float64 { maxMatch := func(word string, parts []string) float64 { if len(parts) == 0 { return 0.0 } - max := smetrics.JaroWinkler(word, parts[0], 0.7, 4) + max := f(word, parts[0]) for i := 1; i < len(parts); i++ { - if score := smetrics.JaroWinkler(word, parts[i], 0.7, 4); score > max { + if score := f(word, parts[i]); score > max { max = score } } @@ -565,14 +581,6 @@ func jaroWinkler(s1, s2 string) float64 { return max } -// soundex will phonetically normalize two strings and then return their jaro-winkler distance -// as a percentage. -// -// For more details see https://en.wikipedia.org/wiki/Soundex -func soundex(s1, s2 string) float64 { - return jaroWinkler(smetrics.Soundex(s1), smetrics.Soundex(s2)) -} - // extractIDFromRemark attempts to parse out a National ID or similar governmental ID value // from an SDN's remarks property. //