Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Efficient filter-rewriting for st.text(...).filter(str.isidentifier) #3725

Merged
merged 2 commits on Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
RELEASE_TYPE: patch

This patch by Reagan Lee makes ``st.text(...).filter(str.isidentifier)``
return an efficient custom strategy (:issue:`3480`).
22 changes: 17 additions & 5 deletions hypothesis-python/src/hypothesis/internal/intervalsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,21 @@ def difference(self, other):
return IntervalSet(map(tuple, result))

def intersection(self, other):
    """Set intersection for lists of intervals.

    Both ``self.intervals`` and ``other.intervals`` are sorted lists of
    disjoint ``(start, end)`` pairs, so we can merge them in a single
    linear pass rather than via the previous difference-of-differences
    trick (``self.difference(other - self).difference(self - other)``).
    """
    assert isinstance(other, type(self)), other
    intervals = []
    i = j = 0
    while i < len(self.intervals) and j < len(other.intervals):
        u, v = self.intervals[i]
        U, V = other.intervals[j]
        if u > V:
            # other's interval ends before self's begins: advance other
            j += 1
        elif U > v:
            # self's interval ends before other's begins: advance self
            i += 1
        else:
            # Overlap: record it, then advance whichever interval ends
            # first, since the later-ending one may also overlap the next.
            intervals.append((max(u, U), min(v, V)))
            if v < V:
                i += 1
            else:
                j += 1
    return IntervalSet(intervals)
78 changes: 78 additions & 0 deletions hypothesis-python/src/hypothesis/strategies/_internal/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@
# obtain one at https://mozilla.org/MPL/2.0/.

import copy
import re
import warnings
from functools import lru_cache

from hypothesis.errors import HypothesisWarning, InvalidArgument
from hypothesis.internal import charmap
from hypothesis.internal.conjecture.utils import biased_coin, integer_range
from hypothesis.internal.intervalsets import IntervalSet
from hypothesis.strategies._internal.collections import ListStrategy
from hypothesis.strategies._internal.lazy import unwrap_strategies
from hypothesis.strategies._internal.strategies import SearchStrategy


Expand Down Expand Up @@ -166,6 +169,28 @@ def filter(self, condition):
HypothesisWarning,
stacklevel=2,
)
elems = unwrap_strategies(self.element_strategy)
if (
condition is str.isidentifier
and self.max_size >= 1
and isinstance(elems, OneCharStringStrategy)
):
from hypothesis.strategies import builds, nothing

id_start, id_continue = _identifier_characters()
if not (elems.intervals & id_start):
return nothing()
return builds(
"{}{}".format,
OneCharStringStrategy(elems.intervals & id_start),
TextStrategy(
OneCharStringStrategy(elems.intervals & id_continue),
min_size=max(0, self.min_size - 1),
max_size=self.max_size - 1,
),
# Filter to ensure that NFKC normalization keeps working in future
).filter(str.isidentifier)

# We use ListStrategy filter logic for the conditions that *only* imply
# the string is nonempty. Here, we increment the min_size but still apply
# the filter for conditions that imply nonempty *and specific contents*.
Expand All @@ -178,6 +203,59 @@ def filter(self, condition):
return super().filter(condition)


# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
# Python updates its Unicode version between minor releases, but fortunately
# these properties do not change between the Unicode versions in question.
# Parsed by the regex in _identifier_characters(); the "#"-prefixed lines
# inside the string are PropList file comments, not Python comments.
_PROPLIST = """
# ================================================

1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK

# Total code points: 6

# ================================================

00B7 ; Other_ID_Continue # Po MIDDLE DOT
0387 ; Other_ID_Continue # Po GREEK ANO TELEIA
1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE

# Total code points: 12
"""


@lru_cache
def _identifier_characters():
    """See https://docs.python.org/3/reference/lexical_analysis.html#identifiers"""
    # First, pull the Other_ID_Start / Other_ID_Continue code points out of
    # the PropList excerpt above.
    special = {"Other_ID_Start": "", "Other_ID_Continue": ""}
    for row in _PROPLIST.splitlines():
        matched = re.match(r"([0-9A-F.]+) +; (\w+) # ", row)
        if matched is None:
            continue
        codes, prop = matched.groups()
        first = int(codes[:4], base=16)
        last = int(codes[-4:], base=16)
        special[prop] += "".join(map(chr, range(first, last + 1)))

    # Basic start-set: the letter-ish Unicode categories, plus underscore
    # and the special extras collected above.
    id_start = charmap.query(
        include_categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        include_characters="_" + special["Other_ID_Start"],
    )
    id_start -= IntervalSet.from_string(
        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
        # Conveniently they're all in `id_start`, so we only need to do this once.
        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
    )
    # The continue-set is a superset of the start-set, extended with marks,
    # digits, connectors, and its own special extras.
    id_continue = id_start | charmap.query(
        include_categories=("Mn", "Mc", "Nd", "Pc"),
        include_characters=special["Other_ID_Continue"],
    )
    return id_start, id_continue


class FixedSizeBytes(SearchStrategy):
def __init__(self, size):
self.size = size
Expand Down
22 changes: 21 additions & 1 deletion hypothesis-python/tests/cover/test_filter_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from hypothesis.errors import HypothesisWarning, Unsatisfiable
from hypothesis.internal.floats import next_down, next_up
from hypothesis.internal.reflection import get_pretty_function_description
from hypothesis.strategies._internal.core import data
from hypothesis.strategies._internal.lazy import LazyStrategy, unwrap_strategies
from hypothesis.strategies._internal.numbers import FloatStrategy, IntegersStrategy
from hypothesis.strategies._internal.strategies import FilteredStrategy
Expand Down Expand Up @@ -343,14 +344,33 @@ def test_warns_on_suspicious_string_methods(method):
assert fs.min_size == 1


# str.isidentifier is no longer in this list: it now gets its own dedicated
# rewrite in TextStrategy.filter rather than the generic min_size-bump path.
@pytest.mark.parametrize("method", [str.isalnum])
def test_bumps_min_size_and_filters_for_content_str_methods(method):
    # Conditions that imply "nonempty with specific contents" should bump
    # min_size to 1 *and* keep the original predicate as a filter.
    s = unwrap_strategies(st.text())
    fs = s.filter(method)
    assert fs.filtered_strategy.min_size == 1
    assert fs.flat_conditions == (method,)

# NOTE(review): should we deterministically check ascii vs non-ascii
# alphabets, or is covering both via parametrization sufficient?
@pytest.mark.parametrize("al", [None, "cdef123", "cd12¥¦§©"])
@given(data())
def test_isidentifier_filter_properly_rewritten(al, data):
    if al is None:
        example = data.draw(st.text().filter(str.isidentifier))
    else:
        example = data.draw(st.text(alphabet=al).filter(str.isidentifier))
        # The subset check only makes sense when an explicit alphabet was
        # supplied; with al=None, `issubset(None)` would raise TypeError.
        assert set(example).issubset(al)
    assert example.isidentifier()


@pytest.mark.parametrize("al", ["¥¦§©"])
def test_isidentifer_filter_unsatisfiable(al):
    # None of these characters may appear in a Python identifier, so the
    # rewritten strategy is empty and drawing an example must fail.
    strategy = st.text(alphabet=al).filter(str.isidentifier)
    with pytest.raises(Unsatisfiable):
        strategy.example()


@pytest.mark.parametrize(
"op, attr, value, expected",
[
Expand Down
Loading