This repository has been archived by the owner on Nov 23, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: extracting valid literals from documentation (#78)
Closes #46. ### Summary of Changes Currently, valid values for the enum annotations were determined only via the type string. In order to achieve a higher recognition rate of the enum annotations, the recognition of the valid values was extended to the description. ### Testing Instructions 1. Run `pytest` for `test_extract_valid_literals.py`. 2. Check the results of `pytest`. --------- Co-authored-by: Lars Reimann <[email protected]> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: megalinter-bot <[email protected]>
- Loading branch information
1 parent
9d61efa
commit 2c41af2
Showing
2 changed files
with
255 additions
and
0 deletions.
There are no files selected for viewing
203 changes: 203 additions & 0 deletions
203
src/library_analyzer/processing/api/_extract_valid_values.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
import re | ||
from dataclasses import dataclass, field | ||
|
||
|
||
@dataclass
class Configuration:
    """Base holder for the extraction functions that a configuration enables.

    The configurations derived from this class in this module append the
    functions for their enabled patterns to ``_function_list`` in
    ``__post_init__``.
    """

    # ``default_factory`` gives every instance its own list instead of one shared default.
    _function_list: list = field(default_factory=list)

    def get_function_list(self) -> list:
        """Return the list of registered extraction functions (the list itself, not a copy)."""
        return self._function_list
|
||
|
||
@dataclass
class DescriptionStringConfiguration(Configuration):
    """Select which description-string patterns are evaluated.

    Each flag switches one extraction function on or off; all are enabled by default.
    """

    if_listings: bool = True
    indented_listings: bool = True
    when_set_to: bool = True

    def __post_init__(self) -> None:
        """Register the extraction function of every enabled pattern, keeping a fixed order."""
        switches = (
            (self.if_listings, _extract_from_description_if_listing),
            (self.indented_listings, _extract_from_description_indented_listing),
            (self.when_set_to, _extract_from_description_when_set_to),
        )
        self._function_list.extend(function for enabled, function in switches if enabled)
|
||
|
||
@dataclass
class TypeStringConfiguration(Configuration):
    """Select which type-string patterns are evaluated.

    Each flag switches one extraction function on or off; both are enabled by default.
    """

    curly_enum: bool = True
    and_or_enum: bool = True

    def __post_init__(self) -> None:
        """Register the extraction function of every enabled pattern, keeping a fixed order."""
        switches = (
            (self.curly_enum, _extract_from_type_curly_enum),
            (self.and_or_enum, _extract_from_type_listing),
        )
        self._function_list.extend(function for enabled, function in switches if enabled)
|
||
|
||
def extract_valid_literals(param_description: str, param_type: str) -> set[str]:
    """
    Extract all valid literals from the type and description string.

    The type string is run through every function of a default ``TypeStringConfiguration`` and
    the description through every function of a default ``DescriptionStringConfiguration``.
    The two result sets are merged.

    Parameters
    ----------
    param_description: str
        Description string of the parameter to be examined.
    param_type: str
        Type string of the parameter to be examined.

    Returns
    -------
    set[str]
        A set of valid, extracted values of the parameter to be examined.
    """
    bool_and_none_literals = {"False", "None", "True"}

    def _run_all(text: str, config: Configuration) -> set[str]:
        # Union of the results of every pattern function registered in ``config``.
        return set().union(*(extractor(text) for extractor in config.get_function_list()))

    literals = _run_all(param_type, TypeStringConfiguration())
    from_description = _run_all(param_description, DescriptionStringConfiguration())

    # Without description matches the type-string result is returned untouched.
    if not from_description:
        return literals

    literals |= from_description
    if "str" in literals:
        # 'str' is only a placeholder: it is dropped, and 'unlistable_str' takes its place when
        # the description contributed nothing besides True, False or None.
        literals.discard("str")
        if from_description <= bool_and_none_literals:
            literals.add("unlistable_str")

    return literals
|
||
|
||
def _extract_from_type_curly_enum(type_string: str) -> set[str]: | ||
""" | ||
Extract all valid values of the parameter type string to be examined that were enclosed in curly braces. | ||
Parameters | ||
---------- | ||
type_string: str | ||
Type string of the parameter to be examined. | ||
Returns | ||
------- | ||
set[str] | ||
A set of valid values from the parameter description to be examined. | ||
""" | ||
matches = re.findall(r"\{(.*?)}", type_string) | ||
extracted = [] | ||
|
||
for match in matches: | ||
splitted = re.split(r", ", match) | ||
extracted.extend(splitted) | ||
|
||
return set(extracted) | ||
|
||
|
||
def _extract_from_type_listing(type_string: str) -> set[str]: | ||
""" | ||
Extract all valid values from the listing of the parameter type string to be examined. | ||
Parameters | ||
---------- | ||
type_string: str | ||
Type string of the parameter to be examined. | ||
Returns | ||
------- | ||
set[str] | ||
A set of valid values from the parameter description to be examined. | ||
""" | ||
# Multiple values seperated by ',', 'and' or 'or' with single# quotes | ||
single_and_or_pattern = r"('[^']*'|bool|str)\s*(?:and|or|,)?" | ||
# Multiple values seperated by ',', 'and' or'or' with double quotes | ||
double_and_or_pattern = r"(\"[^\"]*\"|bool|str)\s*(?:and|or|,)?" | ||
|
||
matches = re.findall(single_and_or_pattern, type_string) | ||
|
||
if not matches: | ||
matches = re.findall(double_and_or_pattern, type_string) | ||
|
||
extracted = set(matches) | ||
|
||
if "bool" in extracted: | ||
extracted.remove("bool") | ||
extracted.add("False") | ||
extracted.add("True") | ||
|
||
return extracted | ||
|
||
|
||
def _extract_from_description_if_listing(description: str) -> set[str]: | ||
"""Extract the 'if listing' pattern. | ||
Detect all substrings starting with 'if' and satisfying one of the following cases: | ||
A value between single or double quotes, False, True, or None. | ||
Parameters | ||
---------- | ||
description: str | ||
Description string of the parameter to be examined. | ||
Returns | ||
------- | ||
set[str] | ||
A set of valid values from the parameter description to be examined. | ||
""" | ||
pattern = r"[-\+\*]?\s*If\s*('[^']*'|\"[^\"]*\"|True|False|None)" | ||
matches = re.findall(pattern, description) | ||
return set(matches) | ||
|
||
|
||
def _extract_from_description_indented_listing(description: str) -> set[str]: | ||
"""Extract the 'indented listing' pattern. | ||
Detect all substrings that appear in an indented list and match one of the following cases: | ||
A value between single or double quotes, False, True, or None. | ||
Parameters | ||
---------- | ||
description: str | ||
Description string of the parameter to be examined. | ||
Returns | ||
------- | ||
set[str] | ||
A set of valid values from the parameter description to be examined. | ||
""" | ||
pattern = r"[-\+\*]?\s+(\"[^\"]*\"|'[^']*'|None|True|False):" | ||
matches = re.findall(pattern, description) | ||
return set(matches) | ||
|
||
|
||
def _extract_from_description_when_set_to(description: str) -> set[str]: | ||
"""Extract the 'when set to' pattern. | ||
Detect all substrings starting with 'when set to' and satisfying one of the following cases: | ||
A value between single or double quotes, False, True, or None. | ||
Parameters | ||
---------- | ||
Description string of the parameter to be examined. | ||
Returns | ||
------- | ||
set[str] | ||
A set of valid literals from the parameter description to be examined. | ||
""" | ||
pattern = r"When set to (\"[^\"]*\"|'[^']*'|None|True|False)" | ||
matches = re.findall(pattern, description, re.IGNORECASE) | ||
return set(matches) |
52 changes: 52 additions & 0 deletions
52
tests/library_analyzer/processing/api/test_extract_valid_literals.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import pytest | ||
from library_analyzer.processing.api._extract_valid_values import extract_valid_literals | ||
|
||
|
||
@pytest.mark.parametrize(
    ("type_", "description", "expected_literals"),
    [
        # 'If "<value>"' listing with double-quoted values; the type is plain 'str'.
        (
            "str",
            'If "mean", then replace missing values using the mean along each column\nIf "median", then replace missing values using the median along each column\nIf "most_frequent", then replace missing using the most frequent value along each column\nIf "constant", then replace missing values with fill_value\n',
            ['"mean"', '"median"', '"most_frequent"', '"constant"'],
        ),
        # Same listing with single-quoted values.
        (
            "str",
            "If 'mean', then replace missing values using the mean along each column\nIf 'median', then replace missing values using the median along each column\nIf 'most_frequent', then replace missing using the most frequent value along each column\nIf 'constant', then replace missing values with fill_value\n",
            ["'median'", "'most_frequent'", "'constant'", "'mean'"],
        ),
        # 'str' in the type and only 'If None' in the description: 'unlistable_str' is expected.
        (
            "str, list or tuple of str",
            'Attribute name(s) given as string or a list/tuple of strings Eg.: ["coef_", "estimator_", ...], "coef_"\n\nIf None, estimator is considered fitted if there exist an attribute that ends with a underscore and does not start with double underscore.',
            ["None", "unlistable_str"],
        ),
        # 'bool' in the type is expected as False/True; the description holds an indented listing.
        (
            "bool or 'allow-nan'",
            "Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter does not influence whether y can have np.inf, np.nan, pd.NA values. The possibilities are:\n\n\tTrue: Force all values of X to be finite.\n\tFalse: accepts np.inf, np.nan, pd.NA in X.\n\t'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot be infinite.\n\n.. versionadded: 0.20 force_all_finite accepts the string 'allow-nan'.\n\n.. versionchanged: 0.23 Accepts pd.NA and converts it into np.nan",
            ["'allow-nan'", "False", "True"],
        ),
        # Curly-brace enum in the type string.
        (
            '{"random", "best"}',
            'The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.',
            ['"best"', '"random"'],
        ),
        # 'When set to True' in the description combined with 'bool or str' in the type.
        (
            "bool or str",
            "When set to True, change the display of 'values' and/or 'samples' to be proportions and percentages respectively.",
            ["False", "True", "unlistable_str"],
        ),
        # The remaining cases are expected to yield no literals at all.
        (
            "int, RandomState instance or None",
            'Controls the randomness of the estimator. The features are always randomly permuted at each split, even if splitter is set to "best". When max_features < n_features, the algorithm will select max_features at random at each split before finding the best split among them. But the best found split may vary across different runs, even if max_features=n_features. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, random_state has to be fixed to an integer. See :term:Glossary <random_state> for details.',
            [],
        ),
        ("float", "Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.", []),
        (
            "float",
            'When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling], i.e. a "synthetic" feature with constant value equals to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.',
            [],
        ),
    ],
)
def test_extract_values(type_: str, description: str, expected_literals: list) -> None:
    """Check that ``extract_valid_literals`` returns exactly the expected set for each case.

    The expected literals are given as a list and compared as a set, so their order is irrelevant.
    Note that ``extract_valid_literals`` takes the description first and the type second.
    """
    assert extract_valid_literals(description, type_) == set(expected_literals)