Skip to content
This repository has been archived by the owner on Nov 23, 2024. It is now read-only.

Commit

Permalink
feat: extracting valid literals from documentation (#78)
Browse files Browse the repository at this point in the history
Closes #46.

### Summary of Changes

Previously, valid values for enum annotations were determined only
from the type string. To achieve a higher recognition rate for enum
annotations, the extraction of valid values was extended to also
cover the parameter description.


### Testing Instructions

1. Run `pytest` for `test_extract_valid_literals.py`.
2. Check the results of `pytest`.

---------

Co-authored-by: Lars Reimann <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: megalinter-bot <[email protected]>
  • Loading branch information
4 people authored Apr 3, 2023
1 parent 9d61efa commit 2c41af2
Show file tree
Hide file tree
Showing 2 changed files with 255 additions and 0 deletions.
203 changes: 203 additions & 0 deletions src/library_analyzer/processing/api/_extract_valid_values.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import re
from dataclasses import dataclass, field


@dataclass
class Configuration:
    """Base configuration holding the list of pattern functions to run.

    Subclasses fill the list in ``__post_init__`` depending on which
    patterns are enabled.
    """

    # Pattern functions in the order in which they are executed.
    _function_list: list = field(default_factory=list)

    def get_function_list(self) -> list:
        """Return the list of registered pattern functions (the live list, not a copy)."""
        return self._function_list


@dataclass
class DescriptionStringConfiguration(Configuration):
    """Select which description-string patterns are applied.

    Every flag that is true registers its extractor function in the
    inherited function list when the instance is created.
    """

    # Enables _extract_from_description_if_listing.
    if_listings: bool = True
    # Enables _extract_from_description_indented_listing.
    indented_listings: bool = True
    # Enables _extract_from_description_when_set_to.
    when_set_to: bool = True

    def __post_init__(self) -> None:
        """Register the extractor of each enabled pattern, in a fixed order."""
        switches = (
            (self.if_listings, _extract_from_description_if_listing),
            (self.indented_listings, _extract_from_description_indented_listing),
            (self.when_set_to, _extract_from_description_when_set_to),
        )
        for enabled, extractor in switches:
            if enabled:
                self._function_list.append(extractor)


@dataclass
class TypeStringConfiguration(Configuration):
    """Select which type-string patterns are applied.

    Every flag that is true registers its extractor function in the
    inherited function list when the instance is created.
    """

    # Enables _extract_from_type_curly_enum.
    curly_enum: bool = True
    # Enables _extract_from_type_listing.
    and_or_enum: bool = True

    def __post_init__(self) -> None:
        """Register the extractor of each enabled pattern, in a fixed order."""
        switches = (
            (self.curly_enum, _extract_from_type_curly_enum),
            (self.and_or_enum, _extract_from_type_listing),
        )
        for enabled, extractor in switches:
            if enabled:
                self._function_list.append(extractor)


def extract_valid_literals(param_description: str, param_type: str) -> set[str]:
    """
    Collect the valid literals of a parameter from its type and description string.

    All type-string patterns are run on ``param_type`` and all description
    patterns on ``param_description``. If the description yields at least one
    literal, those literals are merged into the result and a bare ``str`` from
    the type string is dropped; it is replaced by ``unlistable_str`` when the
    description only contributed ``True``, ``False`` or ``None``. If the
    description yields nothing, the literals found in the type string are
    returned unchanged.

    Parameters
    ----------
    param_description: str
        Description string of the parameter to be examined.
    param_type: str
        Type string of the parameter to be examined.

    Returns
    -------
    set[str]
        A set of valid, extracted values of the parameter to be examined.
    """
    bool_and_none = {"False", "None", "True"}

    def _collect(text: str, config: Configuration) -> set[str]:
        # Union of everything the pattern functions registered in the config find in the text.
        return set().union(*[extractor(text) for extractor in config.get_function_list()])

    literals = _collect(param_type, TypeStringConfiguration())
    from_description = _collect(param_description, DescriptionStringConfiguration())

    # Nothing found in the description: the type-string literals are the answer.
    if not from_description:
        return literals

    literals |= from_description

    if "str" in literals:
        # The generic 'str' marker is never returned once the description contributed literals.
        literals.remove("str")
        # Only True/False/None were described, so the accepted strings cannot be enumerated.
        if from_description <= bool_and_none:
            literals.add("unlistable_str")

    return literals


def _extract_from_type_curly_enum(type_string: str) -> set[str]:
"""
Extract all valid values of the parameter type string to be examined that were enclosed in curly braces.
Parameters
----------
type_string: str
Type string of the parameter to be examined.
Returns
-------
set[str]
A set of valid values from the parameter description to be examined.
"""
matches = re.findall(r"\{(.*?)}", type_string)
extracted = []

for match in matches:
splitted = re.split(r", ", match)
extracted.extend(splitted)

return set(extracted)


def _extract_from_type_listing(type_string: str) -> set[str]:
"""
Extract all valid values from the listing of the parameter type string to be examined.
Parameters
----------
type_string: str
Type string of the parameter to be examined.
Returns
-------
set[str]
A set of valid values from the parameter description to be examined.
"""
# Multiple values seperated by ',', 'and' or 'or' with single# quotes
single_and_or_pattern = r"('[^']*'|bool|str)\s*(?:and|or|,)?"
# Multiple values seperated by ',', 'and' or'or' with double quotes
double_and_or_pattern = r"(\"[^\"]*\"|bool|str)\s*(?:and|or|,)?"

matches = re.findall(single_and_or_pattern, type_string)

if not matches:
matches = re.findall(double_and_or_pattern, type_string)

extracted = set(matches)

if "bool" in extracted:
extracted.remove("bool")
extracted.add("False")
extracted.add("True")

return extracted


def _extract_from_description_if_listing(description: str) -> set[str]:
"""Extract the 'if listing' pattern.
Detect all substrings starting with 'if' and satisfying one of the following cases:
A value between single or double quotes, False, True, or None.
Parameters
----------
description: str
Description string of the parameter to be examined.
Returns
-------
set[str]
A set of valid values from the parameter description to be examined.
"""
pattern = r"[-\+\*]?\s*If\s*('[^']*'|\"[^\"]*\"|True|False|None)"
matches = re.findall(pattern, description)
return set(matches)


def _extract_from_description_indented_listing(description: str) -> set[str]:
"""Extract the 'indented listing' pattern.
Detect all substrings that appear in an indented list and match one of the following cases:
A value between single or double quotes, False, True, or None.
Parameters
----------
description: str
Description string of the parameter to be examined.
Returns
-------
set[str]
A set of valid values from the parameter description to be examined.
"""
pattern = r"[-\+\*]?\s+(\"[^\"]*\"|'[^']*'|None|True|False):"
matches = re.findall(pattern, description)
return set(matches)


def _extract_from_description_when_set_to(description: str) -> set[str]:
"""Extract the 'when set to' pattern.
Detect all substrings starting with 'when set to' and satisfying one of the following cases:
A value between single or double quotes, False, True, or None.
Parameters
----------
Description string of the parameter to be examined.
Returns
-------
set[str]
A set of valid literals from the parameter description to be examined.
"""
pattern = r"When set to (\"[^\"]*\"|'[^']*'|None|True|False)"
matches = re.findall(pattern, description, re.IGNORECASE)
return set(matches)
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pytest
from library_analyzer.processing.api._extract_valid_values import extract_valid_literals


@pytest.mark.parametrize(
    ("type_", "description", "expected_literals"),
    [
        # 'If' listing with double-quoted values; the bare 'str' type is replaced by them.
        (
            "str",
            'If "mean", then replace missing values using the mean along each column\nIf "median", then replace missing values using the median along each column\nIf "most_frequent", then replace missing using the most frequent value along each column\nIf "constant", then replace missing values with fill_value\n',
            ['"mean"', '"median"', '"most_frequent"', '"constant"'],
        ),
        # Same listing with single-quoted values.
        (
            "str",
            "If 'mean', then replace missing values using the mean along each column\nIf 'median', then replace missing values using the median along each column\nIf 'most_frequent', then replace missing using the most frequent value along each column\nIf 'constant', then replace missing values with fill_value\n",
            ["'median'", "'most_frequent'", "'constant'", "'mean'"],
        ),
        # Only None is found in the description, so 'str' is reported as unlistable_str.
        (
            "str, list or tuple of str",
            'Attribute name(s) given as string or a list/tuple of strings Eg.: ["coef_", "estimator_", ...], "coef_"\n\nIf None, estimator is considered fitted if there exist an attribute that ends with a underscore and does not start with double underscore.',
            ["None", "unlistable_str"],
        ),
        # 'bool' expands to False/True; the indented listing repeats the same values.
        (
            "bool or 'allow-nan'",
            "Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter does not influence whether y can have np.inf, np.nan, pd.NA values. The possibilities are:\n\n\tTrue: Force all values of X to be finite.\n\tFalse: accepts np.inf, np.nan, pd.NA in X.\n\t'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot be infinite.\n\n.. versionadded: 0.20 force_all_finite accepts the string 'allow-nan'.\n\n.. versionchanged: 0.23 Accepts pd.NA and converts it into np.nan",
            ["'allow-nan'", "False", "True"],
        ),
        # Curly-brace enum in the type string; quoted words in running text are not extracted.
        (
            '{"random", "best"}',
            'The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.',
            ['"best"', '"random"'],
        ),
        # 'When set to True' contributes only a boolean, so 'str' becomes unlistable_str.
        (
            "bool or str",
            "When set to True, change the display of 'values' and/or 'samples' to be proportions and percentages respectively.",
            ["False", "True", "unlistable_str"],
        ),
        # Neither the type string nor the description matches any pattern.
        (
            "int, RandomState instance or None",
            'Controls the randomness of the estimator. The features are always randomly permuted at each split, even if splitter is set to "best". When max_features < n_features, the algorithm will select max_features at random at each split before finding the best split among them. But the best found split may vary across different runs, even if max_features=n_features. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, random_state has to be fixed to an integer. See :term:Glossary <random_state> for details.',
            [],
        ),
        # Quoted words that are not part of a listing pattern are ignored.
        ("float", "Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.", []),
        # 'When ... is True' is not the 'when set to' pattern, so nothing is extracted.
        (
            "float",
            'When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling], i.e. a "synthetic" feature with constant value equals to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.',
            [],
        ),
    ],
)
def test_extract_values(type_: str, description: str, expected_literals: list) -> None:
    """Check that the extracted literals equal the expected ones, ignoring order."""
    # Note the argument order: extract_valid_literals takes the description first, then the type.
    assert extract_valid_literals(description, type_) == set(expected_literals)

0 comments on commit 2c41af2

Please sign in to comment.