feat: extracting valid literals from documentation (#78)

Closes #46. ### Summary of Changes Previously, valid values for enum annotations were determined only from the type string. To achieve a higher recognition rate for enum annotations, the extraction of valid values was extended to the description string. ### Testing Instructions 1. Run `pytest` for `test_extract_valid_literals.py`. 2. Check the results of `pytest`. --------- Co-authored-by: Lars Reimann <[email protected]> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: megalinter-bot <[email protected]>
Safe-DS · Apr 3, 2023 · 2c41af2 · 2c41af2
1 parent 9d61efa
commit 2c41af2
Show file tree

Hide file tree

Showing 2 changed files with 255 additions and 0 deletions.
diff --git a/src/library_analyzer/processing/api/_extract_valid_values.py b/src/library_analyzer/processing/api/_extract_valid_values.py
@@ -0,0 +1,203 @@
+import re
+from dataclasses import dataclass, field
+
+
+@dataclass
+class Configuration:
+    _function_list: list = field(default_factory=list)
+
+    def get_function_list(self) -> list:
+        return self._function_list
+
+
+@dataclass
+class DescriptionStringConfiguration(Configuration):
+    if_listings: bool = True
+    indented_listings: bool = True
+    when_set_to: bool = True
+
+    def __post_init__(self) -> None:
+        if self.if_listings:
+            self._function_list.append(_extract_from_description_if_listing)
+        if self.indented_listings:
+            self._function_list.append(_extract_from_description_indented_listing)
+        if self.when_set_to:
+            self._function_list.append(_extract_from_description_when_set_to)
+
+
+@dataclass
+class TypeStringConfiguration(Configuration):
+    curly_enum: bool = True
+    and_or_enum: bool = True
+
+    def __post_init__(self) -> None:
+        if self.curly_enum:
+            self._function_list.append(_extract_from_type_curly_enum)
+        if self.and_or_enum:
+            self._function_list.append(_extract_from_type_listing)
+
+
+def extract_valid_literals(param_description: str, param_type: str) -> set[str]:
+    """
+    Extract all valid literals from the type and description string.
+
+    Parameters
+    ----------
+    param_description: str
+        Description string of the parameter to be examined.
+
+    param_type: str
+        Type string of the parameter to be examined.
+
+
+    Returns
+    -------
+    set[str]
+        A set of valid, extracted values of the parameter to be examined.
+    """
+    description_config: DescriptionStringConfiguration = DescriptionStringConfiguration()
+    type_config: TypeStringConfiguration = TypeStringConfiguration()
+    none_and_bool = {"False", "None", "True"}
+
+    def _execute_pattern(string: str, config: Configuration) -> set[str]:
+        # Function to execute all pattern functions from config
+        result = set()
+        for pattern_function in config.get_function_list():
+            result.update(pattern_function(string))
+        return result
+
+    matches = _execute_pattern(param_type, type_config)
+
+    description_matches = _execute_pattern(param_description, description_config)
+
+    # Check if there are matching values in the description that are not True, False or None
+    # when 'str' occurs in the type string. If this is not the case, unlistable_str is returned as a 'valid' value.
+    if description_matches:
+        matches.update(description_matches)
+        if "str" in matches:
+            if not description_matches.difference(none_and_bool):
+                matches.add("unlistable_str")
+            matches.remove("str")
+
+    return matches
+
+
+def _extract_from_type_curly_enum(type_string: str) -> set[str]:
+    """
+    Extract all valid values of the parameter type string to be examined that were enclosed in curly braces.
+
+    Parameters
+    ----------
+    type_string: str
+        Type string of the parameter to be examined.
+
+    Returns
+    -------
+    set[str]
+        A set of valid values from the parameter description to be examined.
+    """
+    matches = re.findall(r"\{(.*?)}", type_string)
+    extracted = []
+
+    for match in matches:
+        splitted = re.split(r", ", match)
+        extracted.extend(splitted)
+
+    return set(extracted)
+
+
+def _extract_from_type_listing(type_string: str) -> set[str]:
+    """
+    Extract all valid values from the listing of the parameter type string to be examined.
+
+    Parameters
+    ----------
+    type_string: str
+        Type string of the parameter to be examined.
+
+    Returns
+    -------
+    set[str]
+        A set of valid values from the parameter description to be examined.
+    """
+    # Multiple values seperated by ',', 'and' or 'or' with single# quotes
+    single_and_or_pattern = r"('[^']*'|bool|str)\s*(?:and|or|,)?"
+    # Multiple values seperated by ',', 'and' or'or' with double quotes
+    double_and_or_pattern = r"(\"[^\"]*\"|bool|str)\s*(?:and|or|,)?"
+
+    matches = re.findall(single_and_or_pattern, type_string)
+
+    if not matches:
+        matches = re.findall(double_and_or_pattern, type_string)
+
+    extracted = set(matches)
+
+    if "bool" in extracted:
+        extracted.remove("bool")
+        extracted.add("False")
+        extracted.add("True")
+
+    return extracted
+
+
+def _extract_from_description_if_listing(description: str) -> set[str]:
+    """Extract the 'if listing' pattern.
+
+    Detect all substrings starting with 'if' and satisfying one of the following cases:
+    A value between single or double quotes, False, True, or None.
+
+    Parameters
+    ----------
+    description: str
+        Description string of the parameter to be examined.
+
+    Returns
+    -------
+    set[str]
+        A set of valid values from the parameter description to be examined.
+    """
+    pattern = r"[-\+\*]?\s*If\s*('[^']*'|\"[^\"]*\"|True|False|None)"
+    matches = re.findall(pattern, description)
+    return set(matches)
+
+
+def _extract_from_description_indented_listing(description: str) -> set[str]:
+    """Extract the 'indented listing' pattern.
+
+    Detect all substrings that appear in an indented list and match one of the following cases:
+    A value between single or double quotes, False, True, or None.
+
+    Parameters
+    ----------
+    description: str
+        Description string of the parameter to be examined.
+
+
+    Returns
+    -------
+    set[str]
+        A set of valid values from the parameter description to be examined.
+    """
+    pattern = r"[-\+\*]?\s+(\"[^\"]*\"|'[^']*'|None|True|False):"
+    matches = re.findall(pattern, description)
+    return set(matches)
+
+
+def _extract_from_description_when_set_to(description: str) -> set[str]:
+    """Extract the 'when set to' pattern.
+
+    Detect all substrings starting with 'when set to' and satisfying one of the following cases:
+    A value between single or double quotes, False, True, or None.
+
+    Parameters
+    ----------
+    Description string of the parameter to be examined.
+
+    Returns
+    -------
+    set[str]
+        A set of valid literals from the parameter description to be examined.
+    """
+    pattern = r"When set to (\"[^\"]*\"|'[^']*'|None|True|False)"
+    matches = re.findall(pattern, description, re.IGNORECASE)
+    return set(matches)
diff --git a/tests/library_analyzer/processing/api/test_extract_valid_literals.py b/tests/library_analyzer/processing/api/test_extract_valid_literals.py
@@ -0,0 +1,52 @@
+import pytest
+from library_analyzer.processing.api._extract_valid_values import extract_valid_literals
+
+
+@pytest.mark.parametrize(
+    ("type_", "description", "expected_literals"),
+    [
+        # 'If "<value>"' listing with double-quoted values; 'str' itself is not part of the result.
+        (
+            "str",
+            'If "mean", then replace missing values using the mean along each column\nIf "median", then replace missing values using the median along each column\nIf "most_frequent", then replace missing using the most frequent value along each column\nIf "constant", then replace missing values with fill_value\n',
+            ['"mean"', '"median"', '"most_frequent"', '"constant"'],
+        ),
+        # Same listing with single-quoted values.
+        (
+            "str",
+            "If 'mean', then replace missing values using the mean along each column\nIf 'median', then replace missing values using the median along each column\nIf 'most_frequent', then replace missing using the most frequent value along each column\nIf 'constant', then replace missing values with fill_value\n",
+            ["'median'", "'most_frequent'", "'constant'", "'mean'"],
+        ),
+        # Only None is found in the description, so 'str' is reported as "unlistable_str".
+        (
+            "str, list or tuple of str",
+            'Attribute name(s) given as string or a list/tuple of strings Eg.: ["coef_", "estimator_", ...], "coef_"\n\nIf None, estimator is considered fitted if there exist an attribute that ends with a underscore and does not start with double underscore.',
+            ["None", "unlistable_str"],
+        ),
+        # Indented listing (tab, value, colon) together with a quoted value in the type string.
+        (
+            "bool or 'allow-nan'",
+            "Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter does not influence whether y can have np.inf, np.nan, pd.NA values. The possibilities are:\n\n\tTrue: Force all values of X to be finite.\n\tFalse: accepts np.inf, np.nan, pd.NA in X.\n\t'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot be infinite.\n\n.. versionadded: 0.20 force_all_finite accepts the string 'allow-nan'.\n\n.. versionchanged: 0.23 Accepts pd.NA and converts it into np.nan",
+            ["'allow-nan'", "False", "True"],
+        ),
+        # Curly-brace enumeration in the type string.
+        (
+            '{"random", "best"}',
+            'The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.',
+            ['"best"', '"random"'],
+        ),
+        # 'When set to True' in the description; 'bool' in the type string becomes False and True.
+        (
+            "bool or str",
+            "When set to True, change the display of 'values' and/or 'samples' to be proportions and percentages respectively.",
+            ["False", "True", "unlistable_str"],
+        ),
+        # The remaining cases expect no literals: quoted words in running text are not extracted.
+        (
+            "int, RandomState instance or None",
+            'Controls the randomness of the estimator. The features are always randomly permuted at each split, even if splitter is set to "best". When max_features < n_features, the algorithm will select max_features at random at each split before finding the best split among them. But the best found split may vary across different runs, even if max_features=n_features. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, random_state has to be fixed to an integer. See :term:Glossary <random_state> for details.',
+            [],
+        ),
+        ("float", "Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.", []),
+        (
+            "float",
+            'When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling], i.e. a "synthetic" feature with constant value equals to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.',
+            [],
+        ),
+    ],
+)
+def test_extract_values(type_: str, description: str, expected_literals: list) -> None:
+    """Check that the literals extracted from a type/description pair equal the expected set."""
+    # Note the argument order: extract_valid_literals takes the description first, then the type.
+    assert extract_valid_literals(description, type_) == set(expected_literals)