From 582efd797c2ee948f8c40aebbc83779f084baf35 Mon Sep 17 00:00:00 2001
From: leks
Date: Wed, 28 Dec 2022 18:38:35 +0400
Subject: [PATCH 1/2] softmax

---
 lingua/detector.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/lingua/detector.py b/lingua/detector.py
index 2e2e9158..47090d21 100644
--- a/lingua/detector.py
+++ b/lingua/detector.py
@@ -50,6 +50,10 @@ def _split_text_into_words(text: str) -> List[str]:
     return LETTERS.findall(text.lower())
 
 
+def _softmax(x: np.ndarray) -> np.ndarray:
+    return np.exp(x) / np.sum(np.exp(x))
+
+
 def _load_language_models(
     language: Language,
     ngram_length: int,
@@ -497,19 +501,14 @@ def compute_language_confidence_values(self, text: str) -> List[ConfidenceValue]
             _sort_confidence_values(values)
             return values
 
-        sorted_probabilities = sorted(summed_up_probabilities.values())
-        lowest_probability = sorted_probabilities[0]
-        highest_probability = sorted_probabilities[-1]
-        denominator = highest_probability - lowest_probability
+        lang, prob = zip(*summed_up_probabilities.items())
+        prob = np.round(_softmax(np.array(prob)), 2)
+        summed_up_probabilities = dict(zip(lang, prob))
 
         for language, probability in summed_up_probabilities.items():
-            # Apply min-max normalization
-            normalized_probability = (
-                0.98 * (probability - lowest_probability) / denominator + 0.01
-            )
             for i in range(len(values)):
                 if values[i].language == language:
-                    values[i] = ConfidenceValue(language, normalized_probability)
+                    values[i] = ConfidenceValue(language, probability)
                     break
 
         _sort_confidence_values(values)

From fc43d201ba543c675fde159421984038285a8909 Mon Sep 17 00:00:00 2001
From: "Peter M. Stahl"
Date: Fri, 30 Dec 2022 00:41:52 +0100
Subject: [PATCH 2/2] Improve implementation and fix tests

---
 lingua/__init__.py     | 10 +++++-----
 lingua/detector.py     | 15 ++++++---------
 tests/test_detector.py | 42 +++++++++++++++++++++++++++++++++++-------
 3 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/lingua/__init__.py b/lingua/__init__.py
index 3f5d0af3..57ce48c5 100644
--- a/lingua/__init__.py
+++ b/lingua/__init__.py
@@ -288,7 +288,7 @@
 >>> from lingua import Language, LanguageDetectorBuilder
 >>> languages = [Language.ENGLISH, Language.FRENCH, Language.GERMAN, Language.SPANISH]
 >>> detector = LanguageDetectorBuilder.from_languages(*languages)\
-.with_minimum_relative_distance(0.7)\
+.with_minimum_relative_distance(0.9)\
 .build()
 >>> print(detector.detect_language_of("languages are awesome"))
 None
@@ -315,9 +315,9 @@
 >>> confidence_values = detector.compute_language_confidence_values("languages are awesome")
 >>> for language, value in confidence_values:
 ...     print(f"{language.name}: {value:.2f}")
-ENGLISH: 0.99
-FRENCH: 0.32
-GERMAN: 0.15
+ENGLISH: 0.93
+FRENCH: 0.04
+GERMAN: 0.02
 SPANISH: 0.01
 ```
@@ -345,7 +345,7 @@
 >>> detector = LanguageDetectorBuilder.from_languages(*languages).build()
 >>> confidence_value = detector.compute_language_confidence("languages are awesome", Language.FRENCH)
 >>> print(f"{confidence_value:.2f}")
-0.32
+0.04
 ```
 
diff --git a/lingua/detector.py b/lingua/detector.py
index 47090d21..b333383e 100644
--- a/lingua/detector.py
+++ b/lingua/detector.py
@@ -17,6 +17,7 @@
 
 from collections import Counter
 from dataclasses import dataclass
+from math import exp
 from typing import (
     Counter as TypedCounter,
     Dict,
@@ -50,10 +51,6 @@ def _split_text_into_words(text: str) -> List[str]:
     return LETTERS.findall(text.lower())
 
 
-def _softmax(x: np.ndarray) -> np.ndarray:
-    return np.exp(x) / np.sum(np.exp(x))
-
-
 def _load_language_models(
     language: Language,
     ngram_length: int,
@@ -80,7 +77,7 @@ def _sum_up_probabilities(
         if unigram_counts is not None and language in unigram_counts:
             result /= unigram_counts[language]
         if result != 0:
-            summed_up_probabilities[language] = result
+            summed_up_probabilities[language] = exp(result)
 
     return summed_up_probabilities
 
@@ -501,14 +498,14 @@ def compute_language_confidence_values(self, text: str) -> List[ConfidenceValue]
             _sort_confidence_values(values)
             return values
 
-        lang, prob = zip(*summed_up_probabilities.items())
-        prob = np.round(_softmax(np.array(prob)), 2)
-        summed_up_probabilities = dict(zip(lang, prob))
+        denominator = sum(summed_up_probabilities.values())
 
         for language, probability in summed_up_probabilities.items():
             for i in range(len(values)):
                 if values[i].language == language:
-                    values[i] = ConfidenceValue(language, probability)
+                    # apply softmax function
+                    normalized_probability = probability / denominator
+                    values[i] = ConfidenceValue(language, normalized_probability)
                     break
 
         _sort_confidence_values(values)

diff --git a/tests/test_detector.py b/tests/test_detector.py
index 43976c94..f42ba25b 100644
--- a/tests/test_detector.py
+++ b/tests/test_detector.py
@@ -20,6 +20,7 @@
 
 from lingua.builder import LanguageDetectorBuilder
 from lingua.detector import (
+    ConfidenceValue,
     LanguageDetector,
     _UNIGRAM_MODELS,
     _BIGRAM_MODELS,
@@ -967,9 +968,27 @@ def test_no_language_is_returned(detector_for_english_and_german):
 @pytest.mark.parametrize(
     "text,expected_confidence_values",
     [
-        pytest.param("groß", [(Language.GERMAN, 1.0), (Language.ENGLISH, 0.0)]),
-        pytest.param("Alter", [(Language.GERMAN, 0.99), (Language.ENGLISH, 0.01)]),
-        pytest.param("проарплап", [(Language.ENGLISH, 0.0), (Language.GERMAN, 0.0)]),
+        pytest.param(
+            "groß",
+            [
+                ConfidenceValue(Language.GERMAN, 1.0),
+                ConfidenceValue(Language.ENGLISH, 0.0),
+            ],
+        ),
+        pytest.param(
+            "Alter",
+            [
+                ConfidenceValue(Language.GERMAN, 0.81),
+                ConfidenceValue(Language.ENGLISH, 0.19),
+            ],
+        ),
+        pytest.param(
+            "проарплап",
+            [
+                ConfidenceValue(Language.ENGLISH, 0.0),
+                ConfidenceValue(Language.GERMAN, 0.0),
+            ],
+        ),
     ],
 )
 def test_compute_language_confidence_values(
@@ -978,14 +997,23 @@ def test_compute_language_confidence_values(
     confidence_values = (
         detector_for_english_and_german.compute_language_confidence_values(text)
     )
-    assert confidence_values == expected_confidence_values
+    assert len(confidence_values) == 2
+
+    first, second = confidence_values
+    expected_first, expected_second = expected_confidence_values
+
+    assert first.language == expected_first.language
+    assert round(first.value, 2) == expected_first.value
+
+    assert second.language == expected_second.language
+    assert round(second.value, 2) == expected_second.value
 
 
 @pytest.mark.parametrize(
     "text,expected_confidence_for_german,expected_confidence_for_english",
     [
         pytest.param("groß", 1.0, 0.0),
-        pytest.param("Alter", 0.99, 0.01),
+        pytest.param("Alter", 0.81, 0.19),
         pytest.param("проарплап", 0.0, 0.0),
     ],
 )
@@ -998,14 +1026,14 @@ def test_compute_language_confidence(
     detector_for_english_and_german,
     text,
     expected_confidence_for_german,
     expected_confidence_for_english,
 ):
     confidence_for_german = detector_for_english_and_german.compute_language_confidence(
         text, Language.GERMAN
     )
-    assert confidence_for_german == expected_confidence_for_german
+    assert round(confidence_for_german, 2) == expected_confidence_for_german
 
     confidence_for_english = (
         detector_for_english_and_german.compute_language_confidence(
             text, Language.ENGLISH
         )
     )
-    assert confidence_for_english == expected_confidence_for_english
+    assert round(confidence_for_english, 2) == expected_confidence_for_english
 
     confidence_for_french = detector_for_english_and_german.compute_language_confidence(
         text, Language.FRENCH
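
Taken together, the two patches swap lingua's min-max scaling of confidence values, which always pinned the most likely language at 0.99 and the least likely at 0.01 no matter how close the raw scores were, for a softmax over each language's summed log-probability: `_sum_up_probabilities` exponentiates the summed value per language, and `compute_language_confidence_values` divides by the total so the values sum to 1. The sketch below restates that normalization as a standalone function. It is illustrative only: the function name `softmax_confidences` and the sample inputs are invented, and the max-subtraction step is the standard numerically stable softmax trick rather than something the patches themselves apply.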
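```python
from math import exp
from typing import Dict


def softmax_confidences(summed_log_probs: Dict[str, float]) -> Dict[str, float]:
    """Turn summed log-probabilities into confidence values that sum to 1.

    Mirrors the two-step form the patches converge on: exp() per language
    (as in _sum_up_probabilities), then division by the total
    (as in compute_language_confidence_values).
    """
    # Subtracting the maximum before exp() is a standard stability trick,
    # not part of the patches: it prevents underflow for very negative
    # log-probabilities and cancels out in the division below.
    max_log_prob = max(summed_log_probs.values())
    exponentiated = {
        language: exp(log_prob - max_log_prob)
        for language, log_prob in summed_log_probs.items()
    }
    denominator = sum(exponentiated.values())
    return {language: p / denominator for language, p in exponentiated.items()}


# Hypothetical summed log-probabilities, not values produced by lingua's models:
print(softmax_confidences({"ENGLISH": -20.1, "FRENCH": -23.2, "GERMAN": -24.0}))
# {'ENGLISH': 0.938..., 'FRENCH': 0.042..., 'GERMAN': 0.019...}
```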
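Softmax preserves the ranking of the raw scores and guarantees that the confidence values sum to 1, which is why the doctest outputs change from 0.99/0.32/0.15/0.01 (which sum to well over 1) to 0.93/0.04/0.02/0.01. One possible caveat of the merged implementation: `exp(result)` in `_sum_up_probabilities` is applied to the raw summed log-probability, which could underflow to 0.0 for very negative sums; the max-subtraction shown in the sketch above is a common guard against that.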