From 91cf858f67111316441513958c1ca34b794bf9c3 Mon Sep 17 00:00:00 2001 From: Zac Hatfield-Dodds Date: Fri, 1 Sep 2023 17:17:09 -0700 Subject: [PATCH 1/6] Changelog markup --- hypothesis-python/docs/changes.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hypothesis-python/docs/changes.rst b/hypothesis-python/docs/changes.rst index 193530b021..e9ee15ce30 100644 --- a/hypothesis-python/docs/changes.rst +++ b/hypothesis-python/docs/changes.rst @@ -144,7 +144,7 @@ help narrow down any particularly weird bugs in complex environments. ------------------- Fixes some lingering issues with inference of recursive types -in `~hypothesis.strategies.from_type`. Closes :issue:`3525`. +in :func:`~hypothesis.strategies.from_type`. Closes :issue:`3525`. .. _v6.81.0: @@ -335,8 +335,8 @@ is strongly recommended. You can ensure you have the dependencies with ------------------- This patch continues the work started in :pull:`3651` by adding -:pypi:`ruff` linter rules for pyflakes, flake8-comprehensions, and -flake8-implicit-str-concat. +:pypi:`ruff` linter rules for :pypi:`pyflakes`, :pypi:`flake8-comprehensions`, +and :pypi:`flake8-implicit-str-concat`. .. _v6.75.5: @@ -1184,7 +1184,7 @@ is really annoying. See :issue:`2701` for details. 6.48.0 - 2022-06-27 ------------------- -This release raises :class:`~unittest.SkipTest` for which never executed any +This release raises :class:`~unittest.SkipTest` for tests which never executed any examples, for example because the :obj:`~hypothesis.settings.phases` setting excluded the :obj:`~hypothesis.Phase.explicit`, :obj:`~hypothesis.Phase.reuse`, and :obj:`~hypothesis.Phase.generate` phases. This helps to avoid cases where From c0a60d7e47d9b10d071c881feab19aedda3c4a66 Mon Sep 17 00:00:00 2001 From: Zac Hatfield-Dodds Date: Fri, 1 Sep 2023 17:17:09 -0700 Subject: [PATCH 2/6] Fix pretty-printer typo --- hypothesis-python/src/hypothesis/vendor/pretty.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hypothesis-python/src/hypothesis/vendor/pretty.py b/hypothesis-python/src/hypothesis/vendor/pretty.py index f9ffb128ea..5a1989182a 100644 --- a/hypothesis-python/src/hypothesis/vendor/pretty.py +++ b/hypothesis-python/src/hypothesis/vendor/pretty.py @@ -153,7 +153,7 @@ def __init__(self, output=None, *, context=None): ipp = sys.modules["IPython.lib.pretty"] self.singleton_pprinters.update(ipp._singleton_pprinters) self.type_pprinters.update(ipp._type_pprinters) - self.deferred_pprinters.update(ipp._deferred_pprinters) + self.deferred_pprinters.update(ipp._deferred_type_pprinters) # If there's overlap between our pprinters and IPython's, we'll use ours. self.singleton_pprinters.update(_singleton_pprinters) self.type_pprinters.update(_type_pprinters) From 8ca6c3423640405afc1650f9cf5ff63a8ba61f80 Mon Sep 17 00:00:00 2001 From: Zac Hatfield-Dodds Date: Fri, 1 Sep 2023 17:17:09 -0700 Subject: [PATCH 3/6] Move helper fn to compat.py --- hypothesis-python/src/hypothesis/core.py | 10 +--------- hypothesis-python/src/hypothesis/internal/compat.py | 9 +++++++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/hypothesis-python/src/hypothesis/core.py b/hypothesis-python/src/hypothesis/core.py index 113876aae0..bfccf55159 100644 --- a/hypothesis-python/src/hypothesis/core.py +++ b/hypothesis-python/src/hypothesis/core.py @@ -70,6 +70,7 @@ from hypothesis.internal.compat import ( PYPY, BaseExceptionGroup, + add_note, bad_django_TestCase, get_type_hints, int_from_bytes, @@ -1008,15 +1009,6 @@ def run_engine(self): _raise_to_user(errors_to_report, self.settings, report_lines) -def add_note(exc, note): - try: - exc.add_note(note) - except AttributeError: - if not hasattr(exc, "__notes__"): - exc.__notes__ = [] - exc.__notes__.append(note) - - def _raise_to_user(errors_to_report, settings, target_lines, trailer=""): """Helper function for attaching notes and grouping multiple errors.""" failing_prefix = "Falsifying example: " diff --git a/hypothesis-python/src/hypothesis/internal/compat.py b/hypothesis-python/src/hypothesis/internal/compat.py index 29baa7ea79..1f23ce1863 100644 --- a/hypothesis-python/src/hypothesis/internal/compat.py +++ b/hypothesis-python/src/hypothesis/internal/compat.py @@ -43,6 +43,15 @@ WINDOWS = platform.system() == "Windows" +def add_note(exc, note): + try: + exc.add_note(note) + except AttributeError: + if not hasattr(exc, "__notes__"): + exc.__notes__ = [] + exc.__notes__.append(note) + + def escape_unicode_characters(s: str) -> str: return codecs.encode(s, "unicode_escape").decode("ascii") From 49b0b80e77f5437c934f00cd119447fb939737cb Mon Sep 17 00:00:00 2001 From: Zac Hatfield-Dodds Date: Fri, 1 Sep 2023 17:17:09 -0700 Subject: [PATCH 4/6] Fix error messages --- hypothesis-python/src/hypothesis/strategies/_internal/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/core.py b/hypothesis-python/src/hypothesis/strategies/_internal/core.py index 2f39fc5376..f6638959e2 100644 --- a/hypothesis-python/src/hypothesis/strategies/_internal/core.py +++ b/hypothesis-python/src/hypothesis/strategies/_internal/core.py @@ -2035,11 +2035,11 @@ def register_type_strategy( ) elif not (isinstance(strategy, SearchStrategy) or callable(strategy)): raise InvalidArgument( - "strategy=%r must be a SearchStrategy, or a function that takes " + f"{strategy=} must be a SearchStrategy, or a function that takes " "a generic type and returns a specific SearchStrategy" ) elif isinstance(strategy, SearchStrategy) and strategy.is_empty: - raise InvalidArgument("strategy=%r must not be empty") + raise InvalidArgument(f"{strategy=} must not be empty") elif types.has_type_arguments(custom_type): raise InvalidArgument( f"Cannot register generic type {custom_type!r}, because it has type " From 26ffda96325e790e4aee58783e20dcd910d0b3af Mon Sep 17 00:00:00 2001 From: Zac Hatfield-Dodds Date: Fri, 1 Sep 2023 17:17:09 -0700 Subject: [PATCH 5/6] Refactor charmap/IntervalSet logic --- .../src/hypothesis/internal/charmap.py | 144 ++--------------- .../src/hypothesis/internal/intervalsets.py | 147 ++++++++++++++++++ hypothesis-python/tests/cover/test_charmap.py | 25 +-- .../tests/cover/test_intervalset.py | 12 +- 4 files changed, 184 insertions(+), 144 deletions(-) diff --git a/hypothesis-python/src/hypothesis/internal/charmap.py b/hypothesis-python/src/hypothesis/internal/charmap.py index 80e94e2846..fe09de5227 100644 --- a/hypothesis-python/src/hypothesis/internal/charmap.py +++ b/hypothesis-python/src/hypothesis/internal/charmap.py @@ -18,6 +18,7 @@ from hypothesis.configuration import mkdir_p, storage_directory from hypothesis.errors import InvalidArgument +from hypothesis.internal.intervalsets import IntervalSet intervals = Tuple[Tuple[int, int], ...] cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals], intervals] @@ -146,126 +147,6 @@ def as_general_categories(cats, name="cats"): return tuple(c for c in cs if c in out) -def _union_intervals(x, y): - """Merge two sequences of intervals into a single tuple of intervals. - - Any integer bounded by `x` or `y` is also bounded by the result. - - >>> _union_intervals([(3, 10)], [(1, 2), (5, 17)]) - ((1, 17),) - """ - if not x: - return tuple((u, v) for u, v in y) - if not y: - return tuple((u, v) for u, v in x) - intervals = sorted(x + y, reverse=True) - result = [intervals.pop()] - while intervals: - # 1. intervals is in descending order - # 2. pop() takes from the RHS. - # 3. (a, b) was popped 1st, then (u, v) was popped 2nd - # 4. Therefore: a <= u - # 5. We assume that u <= v and a <= b - # 6. So we need to handle 2 cases of overlap, and one disjoint case - # | u--v | u----v | u--v | - # | a----b | a--b | a--b | - u, v = intervals.pop() - a, b = result[-1] - if u <= b + 1: - # Overlap cases - result[-1] = (a, max(v, b)) - else: - # Disjoint case - result.append((u, v)) - return tuple(result) - - -def _subtract_intervals(x, y): - """Set difference for lists of intervals. That is, returns a list of - intervals that bounds all values bounded by x that are not also bounded by - y. x and y are expected to be in sorted order. - - For example _subtract_intervals([(1, 10)], [(2, 3), (9, 15)]) would - return [(1, 1), (4, 8)], removing the values 2, 3, 9 and 10 from the - interval. - """ - if not y: - return tuple(x) - x = list(map(list, x)) - i = 0 - j = 0 - result = [] - while i < len(x) and j < len(y): - # Iterate in parallel over x and y. j stays pointing at the smallest - # interval in the left hand side that could still overlap with some - # element of x at index >= i. - # Similarly, i is not incremented until we know that it does not - # overlap with any element of y at index >= j. - - xl, xr = x[i] - assert xl <= xr - yl, yr = y[j] - assert yl <= yr - - if yr < xl: - # The interval at y[j] is strictly to the left of the interval at - # x[i], so will not overlap with it or any later interval of x. - j += 1 - elif yl > xr: - # The interval at y[j] is strictly to the right of the interval at - # x[i], so all of x[i] goes into the result as no further intervals - # in y will intersect it. - result.append(x[i]) - i += 1 - elif yl <= xl: - if yr >= xr: - # x[i] is contained entirely in y[j], so we just skip over it - # without adding it to the result. - i += 1 - else: - # The beginning of x[i] is contained in y[j], so we update the - # left endpoint of x[i] to remove this, and increment j as we - # now have moved past it. Note that this is not added to the - # result as is, as more intervals from y may intersect it so it - # may need updating further. - x[i][0] = yr + 1 - j += 1 - else: - # yl > xl, so the left hand part of x[i] is not contained in y[j], - # so there are some values we should add to the result. - result.append((xl, yl - 1)) - - if yr + 1 <= xr: - # If y[j] finishes before x[i] does, there may be some values - # in x[i] left that should go in the result (or they may be - # removed by a later interval in y), so we update x[i] to - # reflect that and increment j because it no longer overlaps - # with any remaining element of x. - x[i][0] = yr + 1 - j += 1 - else: - # Every element of x[i] other than the initial part we have - # already added is contained in y[j], so we move to the next - # interval. - i += 1 - # Any remaining intervals in x do not overlap with any of y, as if they did - # we would not have incremented j to the end, so can be added to the result - # as they are. - result.extend(x[i:]) - return tuple(map(tuple, result)) - - -def _intervals(s): - """Return a tuple of intervals, covering the codepoints of characters in - `s`. - - >>> _intervals('abcdef0123456789') - ((48, 57), (97, 102)) - """ - intervals = tuple((ord(c), ord(c)) for c in sorted(s)) - return _union_intervals(intervals, intervals) - - category_index_cache = {(): ()} @@ -306,11 +187,14 @@ def _query_for_key(key): pass assert key if set(key) == set(categories()): - result = ((0, sys.maxunicode),) + result = IntervalSet([(0, sys.maxunicode)]) else: - result = _union_intervals(_query_for_key(key[:-1]), charmap()[key[-1]]) - category_index_cache[key] = result - return result + result = IntervalSet(_query_for_key(key[:-1])).union( + IntervalSet(charmap()[key[-1]]) + ) + assert isinstance(result, IntervalSet) + category_index_cache[key] = result.intervals + return result.intervals limited_category_index_cache: cache_type = {} @@ -344,14 +228,14 @@ def query( if max_codepoint is None: max_codepoint = sys.maxunicode catkey = _category_key(exclude_categories, include_categories) - character_intervals = _intervals(include_characters or "") - exclude_intervals = _intervals(exclude_characters or "") + character_intervals = IntervalSet.from_string(include_characters or "") + exclude_intervals = IntervalSet.from_string(exclude_characters or "") qkey = ( catkey, min_codepoint, max_codepoint, - character_intervals, - exclude_intervals, + character_intervals.intervals, + exclude_intervals.intervals, ) try: return limited_category_index_cache[qkey] @@ -362,8 +246,6 @@ def query( for u, v in base: if v >= min_codepoint and u <= max_codepoint: result.append((max(u, min_codepoint), min(v, max_codepoint))) - result = tuple(result) - result = _union_intervals(result, character_intervals) - result = _subtract_intervals(result, exclude_intervals) + result = (IntervalSet(result) | character_intervals) - exclude_intervals limited_category_index_cache[qkey] = result return result diff --git a/hypothesis-python/src/hypothesis/internal/intervalsets.py b/hypothesis-python/src/hypothesis/internal/intervalsets.py index 5bdd731d2d..33d02dd8a8 100644 --- a/hypothesis-python/src/hypothesis/internal/intervalsets.py +++ b/hypothesis-python/src/hypothesis/internal/intervalsets.py @@ -10,6 +10,16 @@ class IntervalSet: + @classmethod + def from_string(cls, s): + """Return a tuple of intervals, covering the codepoints of characters in `s`. + + >>> IntervalSet.from_string('abcdef0123456789') + ((48, 57), (97, 102)) + """ + x = cls((ord(c), ord(c)) for c in sorted(s)) + return x.union(x) + def __init__(self, intervals): self.intervals = tuple(intervals) self.offsets = [0] @@ -49,6 +59,13 @@ def __getitem__(self, i): assert r <= v return r + def __contains__(self, elem): + if isinstance(elem, str): + elem = ord(elem) + assert isinstance(elem, int) + assert 0 <= elem <= 0x10FFFF + return any(start <= elem <= end for start, end in self.intervals) + def __repr__(self): return f"IntervalSet({self.intervals!r})" @@ -69,3 +86,133 @@ def index_above(self, value): if value <= v: return offset + (value - u) return self.size + + def __or__(self, other): + return self.union(other) + + def __sub__(self, other): + return self.difference(other) + + def __and__(self, other): + return self.intersection(other) + + def union(self, other): + """Merge two sequences of intervals into a single tuple of intervals. + + Any integer bounded by `x` or `y` is also bounded by the result. + + >>> union([(3, 10)], [(1, 2), (5, 17)]) + ((1, 17),) + """ + assert isinstance(other, type(self)) + x = self.intervals + y = other.intervals + if not x: + return IntervalSet((u, v) for u, v in y) + if not y: + return IntervalSet((u, v) for u, v in x) + intervals = sorted(x + y, reverse=True) + result = [intervals.pop()] + while intervals: + # 1. intervals is in descending order + # 2. pop() takes from the RHS. + # 3. (a, b) was popped 1st, then (u, v) was popped 2nd + # 4. Therefore: a <= u + # 5. We assume that u <= v and a <= b + # 6. So we need to handle 2 cases of overlap, and one disjoint case + # | u--v | u----v | u--v | + # | a----b | a--b | a--b | + u, v = intervals.pop() + a, b = result[-1] + if u <= b + 1: + # Overlap cases + result[-1] = (a, max(v, b)) + else: + # Disjoint case + result.append((u, v)) + return IntervalSet(result) + + def difference(self, other): + """Set difference for lists of intervals. That is, returns a list of + intervals that bounds all values bounded by x that are not also bounded by + y. x and y are expected to be in sorted order. + + For example difference([(1, 10)], [(2, 3), (9, 15)]) would + return [(1, 1), (4, 8)], removing the values 2, 3, 9 and 10 from the + interval. + """ + assert isinstance(other, type(self)) + x = self.intervals + y = other.intervals + if not y: + return IntervalSet(x) + x = list(map(list, x)) + i = 0 + j = 0 + result = [] + while i < len(x) and j < len(y): + # Iterate in parallel over x and y. j stays pointing at the smallest + # interval in the left hand side that could still overlap with some + # element of x at index >= i. + # Similarly, i is not incremented until we know that it does not + # overlap with any element of y at index >= j. + + xl, xr = x[i] + assert xl <= xr + yl, yr = y[j] + assert yl <= yr + + if yr < xl: + # The interval at y[j] is strictly to the left of the interval at + # x[i], so will not overlap with it or any later interval of x. + j += 1 + elif yl > xr: + # The interval at y[j] is strictly to the right of the interval at + # x[i], so all of x[i] goes into the result as no further intervals + # in y will intersect it. + result.append(x[i]) + i += 1 + elif yl <= xl: + if yr >= xr: + # x[i] is contained entirely in y[j], so we just skip over it + # without adding it to the result. + i += 1 + else: + # The beginning of x[i] is contained in y[j], so we update the + # left endpoint of x[i] to remove this, and increment j as we + # now have moved past it. Note that this is not added to the + # result as is, as more intervals from y may intersect it so it + # may need updating further. + x[i][0] = yr + 1 + j += 1 + else: + # yl > xl, so the left hand part of x[i] is not contained in y[j], + # so there are some values we should add to the result. + result.append((xl, yl - 1)) + + if yr + 1 <= xr: + # If y[j] finishes before x[i] does, there may be some values + # in x[i] left that should go in the result (or they may be + # removed by a later interval in y), so we update x[i] to + # reflect that and increment j because it no longer overlaps + # with any remaining element of x. + x[i][0] = yr + 1 + j += 1 + else: + # Every element of x[i] other than the initial part we have + # already added is contained in y[j], so we move to the next + # interval. + i += 1 + # Any remaining intervals in x do not overlap with any of y, as if they did + # we would not have incremented j to the end, so can be added to the result + # as they are. + result.extend(x[i:]) + return IntervalSet(map(tuple, result)) + + def intersection(self, other): + """Set intersection for lists of intervals. + + Conveniently, this is trivial to define in terms of difference. + """ + assert isinstance(other, type(self)), other + return self.difference(other - self).difference(self - other) diff --git a/hypothesis-python/tests/cover/test_charmap.py b/hypothesis-python/tests/cover/test_charmap.py index fe45fefb5e..48ebd97c84 100644 --- a/hypothesis-python/tests/cover/test_charmap.py +++ b/hypothesis-python/tests/cover/test_charmap.py @@ -16,6 +16,7 @@ from hypothesis import assume, given, strategies as st from hypothesis.internal import charmap as cm +from hypothesis.internal.intervalsets import IntervalSet def test_charmap_contains_all_unicode(): @@ -47,7 +48,7 @@ def assert_valid_range_list(ls): st.sets(st.sampled_from(cm.categories())) | st.none(), ) def test_query_matches_categories(exclude, include): - values = cm.query(exclude, include) + values = cm.query(exclude, include).intervals assert_valid_range_list(values) for u, v in values: for i in (u, v, (u + v) // 2): @@ -65,7 +66,7 @@ def test_query_matches_categories(exclude, include): ) def test_query_matches_categories_codepoints(exclude, include, m1, m2): m1, m2 = sorted((m1, m2)) - values = cm.query(exclude, include, min_codepoint=m1, max_codepoint=m2) + values = cm.query(exclude, include, min_codepoint=m1, max_codepoint=m2).intervals assert_valid_range_list(values) for u, v in values: assert m1 <= u @@ -76,7 +77,7 @@ def test_query_matches_categories_codepoints(exclude, include, m1, m2): def test_exclude_only_excludes_from_that_category(cat, i): c = chr(i) assume(unicodedata.category(c) != cat) - intervals = cm.query(exclude_categories=(cat,)) + intervals = cm.query(exclude_categories=(cat,)).intervals assert any(a <= i <= b for a, b in intervals) @@ -115,30 +116,34 @@ def test_uses_cached_charmap(): assert statinfo.st_mtime == mtime +def _union_intervals(x, y): + return IntervalSet(x).union(IntervalSet(y)).intervals + + def test_union_empty(): - assert cm._union_intervals([], []) == () - assert cm._union_intervals([], [[1, 2]]) == ((1, 2),) - assert cm._union_intervals([[1, 2]], []) == ((1, 2),) + assert _union_intervals([], []) == () + assert _union_intervals([], [[1, 2]]) == ((1, 2),) + assert _union_intervals([[1, 2]], []) == ((1, 2),) def test_union_handles_totally_overlapped_gap(): # < xx > Imagine the intervals x and y as bit strings. # | The bit at position n is set if n falls inside that interval. # = In this model _union_intervals() performs bit-wise or. - assert cm._union_intervals([[2, 3]], [[1, 2], [4, 5]]) == ((1, 5),) + assert _union_intervals([[2, 3]], [[1, 2], [4, 5]]) == ((1, 5),) def test_union_handles_partially_overlapped_gap(): # < x > Imagine the intervals x and y as bit strings. # | The bit at position n is set if n falls inside that interval. # = In this model _union_intervals() performs bit-wise or. - assert cm._union_intervals([[3, 3]], [[1, 2], [5, 5]]) == ((1, 3), (5, 5)) + assert _union_intervals([[3, 3]], [[1, 2], [5, 5]]) == ((1, 3), (5, 5)) def test_successive_union(): x = [] for v in cm.charmap().values(): - x = cm._union_intervals(x, v) + x = _union_intervals(x, v) assert x == ((0, sys.maxunicode),) @@ -175,7 +180,7 @@ def test_regenerate_broken_charmap_file(): def test_exclude_characters_are_included_in_key(): - assert cm.query() != cm.query(exclude_characters="0") + assert cm.query().intervals != cm.query(exclude_characters="0").intervals def test_error_writing_charmap_file_is_suppressed(monkeypatch): diff --git a/hypothesis-python/tests/cover/test_intervalset.py b/hypothesis-python/tests/cover/test_intervalset.py index 9b2f2d3485..8714378522 100644 --- a/hypothesis-python/tests/cover/test_intervalset.py +++ b/hypothesis-python/tests/cover/test_intervalset.py @@ -11,7 +11,6 @@ import pytest from hypothesis import HealthCheck, assume, example, given, settings, strategies as st -from hypothesis.internal.charmap import _subtract_intervals from hypothesis.internal.intervalsets import IntervalSet @@ -58,7 +57,7 @@ def test_intervals_match_indexes(intervals): @example(intervals=IntervalSet(((1, 1),)), v=0) @example(intervals=IntervalSet(()), v=0) -@given(Intervals, st.integers()) +@given(Intervals, st.integers(0, 0x10FFFF)) def test_error_for_index_of_not_present_value(intervals, v): assume(v not in intervals) with pytest.raises(ValueError): @@ -98,8 +97,15 @@ def test_subtraction_of_intervals(x, y): xs = intervals_to_set(x) ys = intervals_to_set(y) assume(not xs.isdisjoint(ys)) - z = _subtract_intervals(x, y) + z = IntervalSet(x).difference(IntervalSet(y)).intervals assert z == tuple(sorted(z)) for a, b in z: assert a <= b assert intervals_to_set(z) == intervals_to_set(x) - intervals_to_set(y) + + +@given(Intervals, Intervals) +def test_interval_intersection(x, y): + print(f"{set(x)=} {set(y)=} {set(x)-(set(y)-set(x))=}") + assert set(x & y) == set(x) & set(y) + assert set(x.intersection(y)) == set(x).intersection(y) From 0cd8ca9bd3190c6247bb7f67be36e5652a164026 Mon Sep 17 00:00:00 2001 From: Zac Hatfield-Dodds Date: Fri, 1 Sep 2023 17:17:09 -0700 Subject: [PATCH 6/6] from_regex(..., alphabet=characters()) --- hypothesis-python/RELEASE.rst | 13 ++ .../hypothesis/strategies/_internal/core.py | 55 ++++++- .../hypothesis/strategies/_internal/regex.py | 135 +++++++++++------- .../strategies/_internal/strings.py | 26 ++-- .../tests/cover/test_direct_strategies.py | 18 +++ hypothesis-python/tests/cover/test_regex.py | 25 +++- hypothesis-python/tests/cover/test_text.py | 2 +- hypothesis-python/tests/nocover/test_regex.py | 2 +- 8 files changed, 203 insertions(+), 73 deletions(-) create mode 100644 hypothesis-python/RELEASE.rst diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst new file mode 100644 index 0000000000..c75d34d13d --- /dev/null +++ b/hypothesis-python/RELEASE.rst @@ -0,0 +1,13 @@ +RELEASE_TYPE: minor + +The :func:`~hypothesis.strategies.from_regex` strategy now takes an optional +``alphabet=characters(codec="utf-8")`` argument for unicode strings, like +:func:`~hypothesis.strategies.text`. + +This offers more and more-consistent control over the generated strings, +removing previously-hard-coded limitations. With ``fullmatch=False`` and +``alphabet=characters()``, surrogate characters are now possible in leading +and trailing text as well as the body of the match. Negated character classes +such as ``[^A-Z]`` or ``\S`` had a hard-coded exclusion of control characters +and surrogate characters; now they permit anything in ``alphabet=`` consistent +with the class, and control characters are permitted by default. diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/core.py b/hypothesis-python/src/hypothesis/strategies/_internal/core.py index f6638959e2..e36fc64ae6 100644 --- a/hypothesis-python/src/hypothesis/strategies/_internal/core.py +++ b/hypothesis-python/src/hypothesis/strategies/_internal/core.py @@ -661,7 +661,7 @@ def characters( # caching that, and taking the intersection of their intervals. raise InvalidArgument(f"{codec=} must be one of 'ascii', 'utf-8', or None") - return OneCharStringStrategy( + return OneCharStringStrategy.from_characters_args( whitelist_categories=whitelist_categories, blacklist_categories=blacklist_categories, blacklist_characters=blacklist_characters, @@ -742,10 +742,32 @@ def text( return TextStrategy(char_strategy, min_size=min_size, max_size=max_size) +@overload +def from_regex( + regex: Union[bytes, Pattern[bytes]], + *, + fullmatch: bool = False, +) -> SearchStrategy[bytes]: # pragma: no cover + ... + + +@overload +def from_regex( + regex: Union[str, Pattern[str]], + *, + fullmatch: bool = False, + alphabet: Union[str, SearchStrategy[str]] = characters(codec="utf-8"), +) -> SearchStrategy[str]: # pragma: no cover + ... + + @cacheable @defines_strategy() def from_regex( - regex: Union[AnyStr, Pattern[AnyStr]], *, fullmatch: bool = False + regex: Union[AnyStr, Pattern[AnyStr]], + *, + fullmatch: bool = False, + alphabet: Union[str, SearchStrategy[str], None] = None, ) -> SearchStrategy[AnyStr]: r"""Generates strings that contain a match for the given regex (i.e. ones for which :func:`python:re.search` will return a non-None result). @@ -771,15 +793,42 @@ def from_regex( Alternatively, passing ``fullmatch=True`` will ensure that the whole string is a match, as if you had used the ``\A`` and ``\Z`` markers. + The ``alphabet=`` argument constrains the characters in the generated + string, as for :func:`text`, and is only supported for unicode strings. + Examples from this strategy shrink towards shorter strings and lower character values, with exact behaviour that may depend on the pattern. """ + check_type((str, bytes, re.Pattern), regex, "regex") check_type(bool, fullmatch, "fullmatch") + pattern = regex.pattern if isinstance(regex, re.Pattern) else regex + if alphabet is not None: + check_type((str, SearchStrategy), alphabet, "alphabet") + if not isinstance(pattern, str): + raise InvalidArgument("alphabet= is not supported for bytestrings") + + if isinstance(alphabet, str): + alphabet = characters( + whitelist_categories=(), whitelist_characters=alphabet + ) + char_strategy = unwrap_strategies(alphabet) + if isinstance(char_strategy, SampledFromStrategy): + alphabet = characters( + whitelist_categories=(), + whitelist_characters=alphabet.elements, # type: ignore + ) + elif not isinstance(char_strategy, OneCharStringStrategy): + raise InvalidArgument( + f"{alphabet=} must be a sampled_from() or characters() strategy" + ) + elif isinstance(pattern, str): + alphabet = characters(codec="utf-8") + # TODO: We would like to move this to the top level, but pending some major # refactoring it's hard to do without creating circular imports. from hypothesis.strategies._internal.regex import regex_strategy - return regex_strategy(regex, fullmatch) + return regex_strategy(regex, fullmatch, alphabet=alphabet) @cacheable diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/regex.py b/hypothesis-python/src/hypothesis/strategies/_internal/regex.py index 978153581e..df3ab324bb 100644 --- a/hypothesis-python/src/hypothesis/strategies/_internal/regex.py +++ b/hypothesis-python/src/hypothesis/strategies/_internal/regex.py @@ -11,6 +11,11 @@ import operator import re +from hypothesis.errors import InvalidArgument +from hypothesis.internal import charmap +from hypothesis.strategies._internal.lazy import unwrap_strategies +from hypothesis.strategies._internal.strings import OneCharStringStrategy + try: # pragma: no cover import re._constants as sre import re._parser as sre_parse @@ -26,7 +31,7 @@ from hypothesis import reject, strategies as st from hypothesis.internal.charmap import as_general_categories, categories -from hypothesis.internal.compat import int_to_byte +from hypothesis.internal.compat import add_note, int_to_byte UNICODE_CATEGORIES = set(categories()) @@ -90,6 +95,14 @@ def clear_cache_after_draw(draw, base_strategy): return result +def chars_not_in_alphabet(alphabet, string): + # Given a string, return a tuple of the characters which are not in alphabet + if alphabet is None: + return () + intset = unwrap_strategies(alphabet).intervals + return tuple(c for c in string if c not in intset) + + class Context: __slots__ = ["flags"] @@ -107,42 +120,38 @@ class CharactersBuilder: :param flags: Regex flags. They affect how and which characters are matched """ - def __init__(self, negate=False, flags=0): + def __init__(self, negate=False, flags=0, *, alphabet): self._categories = set() self._whitelist_chars = set() self._blacklist_chars = set() self._negate = negate self._ignorecase = flags & re.IGNORECASE - self._unicode = not bool(flags & re.ASCII) self.code_to_char = chr + self._alphabet = unwrap_strategies(alphabet) + if flags & re.ASCII: + self._alphabet = OneCharStringStrategy( + self._alphabet.intervals & charmap.query(max_codepoint=127) + ) @property def strategy(self): """Returns resulting strategy that generates configured char set.""" - max_codepoint = None if self._unicode else 127 - # Due to the .swapcase() issue described below (and in issue #2657), - # self._whitelist_chars may contain strings of len > 1. We therefore - # have some extra logic to filter them out of st.characters() args, - # but still generate them if allowed to. - if self._negate: - black_chars = self._blacklist_chars - self._whitelist_chars - return st.characters( - blacklist_categories=self._categories | {"Cc", "Cs"}, - blacklist_characters={c for c in self._whitelist_chars if len(c) == 1}, - whitelist_characters=black_chars, - max_codepoint=max_codepoint, - ) + # Start by getting the set of all characters allowed by the pattern white_chars = self._whitelist_chars - self._blacklist_chars multi_chars = {c for c in white_chars if len(c) > 1} - char_strategy = st.characters( - whitelist_categories=self._categories, - blacklist_characters=self._blacklist_chars, - whitelist_characters=white_chars - multi_chars, - max_codepoint=max_codepoint, + intervals = charmap.query( + include_categories=self._categories, + exclude_characters=self._blacklist_chars, + include_characters=white_chars - multi_chars, + ) + # Then take the complement if this is from a negated character class + if self._negate: + intervals = charmap.query() - intervals + multi_chars.clear() + # and finally return the intersection with our alphabet + return OneCharStringStrategy(intervals & self._alphabet.intervals) | ( + st.sampled_from(sorted(multi_chars)) if multi_chars else st.nothing() ) - if multi_chars: - char_strategy |= st.sampled_from(sorted(multi_chars)) - return char_strategy def add_category(self, category): """Update unicode state to match sre_parse object ``category``.""" @@ -152,14 +161,10 @@ def add_category(self, category): self._categories |= UNICODE_CATEGORIES - UNICODE_DIGIT_CATEGORIES elif category == sre.CATEGORY_SPACE: self._categories |= UNICODE_SPACE_CATEGORIES - self._whitelist_chars |= ( - UNICODE_SPACE_CHARS if self._unicode else SPACE_CHARS - ) + self._whitelist_chars |= UNICODE_SPACE_CHARS elif category == sre.CATEGORY_NOT_SPACE: self._categories |= UNICODE_CATEGORIES - UNICODE_SPACE_CATEGORIES - self._blacklist_chars |= ( - UNICODE_SPACE_CHARS if self._unicode else SPACE_CHARS - ) + self._blacklist_chars |= UNICODE_SPACE_CHARS elif category == sre.CATEGORY_WORD: self._categories |= UNICODE_WORD_CATEGORIES self._whitelist_chars.add("_") @@ -169,9 +174,11 @@ def add_category(self, category): else: raise NotImplementedError(f"Unknown character category: {category}") - def add_char(self, char): + def add_char(self, char, *, check=True): """Add given char to the whitelist.""" c = self.code_to_char(char) + if check and chars_not_in_alphabet(self._alphabet, c): + raise InvalidArgument(f"Literal {c!r} is not in the specified alphabet") self._whitelist_chars.add(c) if ( self._ignorecase @@ -186,6 +193,7 @@ def __init__(self, negate=False, flags=0): self._whitelist_chars = set() self._blacklist_chars = set() self._negate = negate + self._alphabet = None self._ignorecase = flags & re.IGNORECASE self.code_to_char = int_to_byte @@ -216,15 +224,25 @@ def maybe_pad(draw, regex, strategy, left_pad_strategy, right_pad_strategy): return result -def base_regex_strategy(regex, parsed=None): +def base_regex_strategy(regex, parsed=None, alphabet=None): if parsed is None: parsed = sre_parse.parse(regex.pattern, flags=regex.flags) - return clear_cache_after_draw( - _strategy(parsed, Context(flags=regex.flags), isinstance(regex.pattern, str)) - ) + try: + s = _strategy( + parsed, + context=Context(flags=regex.flags), + is_unicode=isinstance(regex.pattern, str), + alphabet=alphabet, + ) + except Exception as err: + add_note(err, f"{alphabet=} {regex=}") + raise + return clear_cache_after_draw(s) -def regex_strategy(regex, fullmatch, *, _temp_jsonschema_hack_no_end_newline=False): +def regex_strategy( + regex, fullmatch, *, alphabet, _temp_jsonschema_hack_no_end_newline=False +): if not hasattr(regex, "pattern"): regex = re.compile(regex) @@ -235,16 +253,16 @@ def regex_strategy(regex, fullmatch, *, _temp_jsonschema_hack_no_end_newline=Fal if fullmatch: if not parsed: return st.just("" if is_unicode else b"") - return base_regex_strategy(regex, parsed).filter(regex.fullmatch) + return base_regex_strategy(regex, parsed, alphabet).filter(regex.fullmatch) if not parsed: if is_unicode: - return st.text() + return st.text(alphabet=alphabet) else: return st.binary() if is_unicode: - base_padding_strategy = st.text() + base_padding_strategy = st.text(alphabet=alphabet) empty = st.just("") newline = st.just("\n") else: @@ -283,12 +301,12 @@ def regex_strategy(regex, fullmatch, *, _temp_jsonschema_hack_no_end_newline=Fal else: left_pad = empty - base = base_regex_strategy(regex, parsed).filter(regex.search) + base = base_regex_strategy(regex, parsed, alphabet).filter(regex.search) return maybe_pad(regex, base, left_pad, right_pad) -def _strategy(codes, context, is_unicode): +def _strategy(codes, context, is_unicode, *, alphabet): """Convert SRE regex parse tree to strategy that generates strings matching that regex represented by that parse tree. @@ -317,7 +335,7 @@ def _strategy(codes, context, is_unicode): """ def recurse(codes): - return _strategy(codes, context, is_unicode) + return _strategy(codes, context, is_unicode, alphabet=alphabet) if is_unicode: empty = "" @@ -341,8 +359,13 @@ def recurse(codes): j += 1 if i + 1 < j: - chars = (to_char(charcode) for _, charcode in codes[i:j]) - strategies.append(st.just(empty.join(chars))) + chars = empty.join(to_char(charcode) for _, charcode in codes[i:j]) + if invalid := chars_not_in_alphabet(alphabet, chars): + raise InvalidArgument( + f"Literal {chars!r} contains characters {invalid!r} " + f"which are not in the specified alphabet" + ) + strategies.append(st.just(chars)) i = j continue @@ -363,10 +386,13 @@ def recurse(codes): if code == sre.LITERAL: # Regex 'a' (single char) c = to_char(value) + if chars_not_in_alphabet(alphabet, c): + raise InvalidArgument(f"Literal {c!r} is not in the specified alphabet") if ( context.flags & re.IGNORECASE and c != c.swapcase() and re.match(re.escape(c), c.swapcase(), re.IGNORECASE) is not None + and not chars_not_in_alphabet(alphabet, c.swapcase()) ): # We do the explicit check for swapped-case matching because # eg 'ß'.upper() == 'SS' and ignorecase doesn't match it. @@ -399,7 +425,10 @@ def recurse(codes): stack.extend(set(char.swapcase()) - blacklist) if is_unicode: - return st.characters(blacklist_characters=blacklist) + return OneCharStringStrategy( + unwrap_strategies(alphabet).intervals + & charmap.query(exclude_characters=blacklist) + ) else: return binary_char.filter(lambda c: c not in blacklist) @@ -407,7 +436,7 @@ def recurse(codes): # Regex '[abc0-9]' (set of characters) negate = value[0][0] == sre.NEGATE if is_unicode: - builder = CharactersBuilder(negate, context.flags) + builder = CharactersBuilder(negate, context.flags, alphabet=alphabet) else: builder = BytesBuilder(negate, context.flags) @@ -423,7 +452,7 @@ def recurse(codes): # Regex '[a-z]' (char range) low, high = charset_value for char_code in range(low, high + 1): - builder.add_char(char_code) + builder.add_char(char_code, check=char_code in (low, high)) elif charset_code == sre.CATEGORY: # Regex '[\w]' (char category) builder.add_category(charset_value) @@ -436,9 +465,13 @@ def recurse(codes): elif code == sre.ANY: # Regex '.' (any char) if is_unicode: + assert alphabet is not None if context.flags & re.DOTALL: - return st.characters() - return st.characters(blacklist_characters="\n") + return alphabet + return OneCharStringStrategy( + unwrap_strategies(alphabet).intervals + & charmap.query(exclude_characters="\n") + ) else: if context.flags & re.DOTALL: return binary_char @@ -455,7 +488,7 @@ def recurse(codes): old_flags = context.flags context.flags = (context.flags | value[1]) & ~value[2] - strat = _strategy(value[-1], context, is_unicode) + strat = _strategy(value[-1], context, is_unicode, alphabet=alphabet) context.flags = old_flags @@ -501,7 +534,7 @@ def recurse(codes): recurse(value[2]) if value[2] else st.just(empty), ) elif code == ATOMIC_GROUP: # pragma: no cover # new in Python 3.11 - return _strategy(value, context, is_unicode) + return _strategy(value, context, is_unicode, alphabet=alphabet) else: # Currently there are no known code points other than handled here. diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/strings.py b/hypothesis-python/src/hypothesis/strategies/_internal/strings.py index f1eb143ed6..8507f6ddbf 100644 --- a/hypothesis-python/src/hypothesis/strategies/_internal/strings.py +++ b/hypothesis-python/src/hypothesis/strategies/_internal/strings.py @@ -22,8 +22,18 @@ class OneCharStringStrategy(SearchStrategy): """A strategy which generates single character strings of text type.""" - def __init__( - self, + def __init__(self, intervals, force_repr=None): + assert isinstance(intervals, IntervalSet) + self.intervals = intervals + self._force_repr = force_repr + self.zero_point = self.intervals.index_above(ord("0")) + self.Z_point = min( + self.intervals.index_above(ord("Z")), len(self.intervals) - 1 + ) + + @classmethod + def from_characters_args( + cls, whitelist_categories=None, blacklist_categories=None, blacklist_characters=None, @@ -41,7 +51,7 @@ def __init__( include_characters=whitelist_characters, exclude_characters=blacklist_characters, ) - self._arg_repr = ", ".join( + _arg_repr = ", ".join( f"{k}={v!r}" for k, v in [ ("whitelist_categories", whitelist_categories), @@ -56,16 +66,12 @@ def __init__( if not intervals: raise InvalidArgument( "No characters are allowed to be generated by this " - f"combination of arguments: {self._arg_repr}" + f"combination of arguments: {_arg_repr}" ) - self.intervals = IntervalSet(intervals) - self.zero_point = self.intervals.index_above(ord("0")) - self.Z_point = min( - self.intervals.index_above(ord("Z")), len(self.intervals) - 1 - ) + return cls(intervals, force_repr=f"characters({_arg_repr})") def __repr__(self): - return f"characters({self._arg_repr})" + return self._force_repr or f"OneCharStringStrategy({self.intervals!r})" def do_draw(self, data): if len(self.intervals) > 256: diff --git a/hypothesis-python/tests/cover/test_direct_strategies.py b/hypothesis-python/tests/cover/test_direct_strategies.py index cf8a0ea0f6..b584a250be 100644 --- a/hypothesis-python/tests/cover/test_direct_strategies.py +++ b/hypothesis-python/tests/cover/test_direct_strategies.py @@ -128,6 +128,18 @@ def fn_ktest(*fnkwargs): (ds.text, {"alphabet": ds.sampled_from([123, 456])}), (ds.text, {"alphabet": ds.builds(lambda: "abc")}), (ds.text, {"alphabet": ds.builds(lambda: 123)}), + (ds.from_regex, {"regex": 123}), + (ds.from_regex, {"regex": b"abc", "alphabet": "abc"}), + (ds.from_regex, {"regex": b"abc", "alphabet": b"def"}), + (ds.from_regex, {"regex": "abc", "alphabet": "def"}), + (ds.from_regex, {"regex": "[abc]", "alphabet": "def"}), + (ds.from_regex, {"regex": "[a-d]", "alphabet": "def"}), + (ds.from_regex, {"regex": "[f-z]", "alphabet": "def"}), + (ds.from_regex, {"regex": "[ab]x[de]", "alphabet": "abcdef"}), + (ds.from_regex, {"regex": "...", "alphabet": ds.builds(lambda: "a")}), + (ds.from_regex, {"regex": "abc", "alphabet": ds.sampled_from("def")}), + (ds.from_regex, {"regex": "abc", "alphabet": ds.characters(min_codepoint=128)}), + (ds.from_regex, {"regex": "abc", "alphabet": 123}), (ds.binary, {"min_size": 10, "max_size": 9}), (ds.floats, {"min_value": math.nan}), (ds.floats, {"min_value": "0"}), @@ -272,6 +284,12 @@ def test_validates_keyword_arguments(fn, kwargs): (ds.text, {"alphabet": ds.builds(lambda: "a")}), (ds.characters, {"whitelist_categories": ["N"]}), (ds.characters, {"blacklist_categories": []}), + (ds.from_regex, {"regex": "abc", "alphabet": "abc"}), + (ds.from_regex, {"regex": "abc", "alphabet": "abcdef"}), + (ds.from_regex, {"regex": "[abc]", "alphabet": "abcdef"}), + (ds.from_regex, {"regex": "[a-f]", "alphabet": "abef"}), + (ds.from_regex, {"regex": "abc", "alphabet": ds.sampled_from("abc")}), + (ds.from_regex, {"regex": "abc", "alphabet": ds.characters(codec="ascii")}), (ds.ip_addresses, {}), (ds.ip_addresses, {"v": 4}), (ds.ip_addresses, {"v": 6}), diff --git a/hypothesis-python/tests/cover/test_regex.py b/hypothesis-python/tests/cover/test_regex.py index 373f8edc34..f11f302687 100644 --- a/hypothesis-python/tests/cover/test_regex.py +++ b/hypothesis-python/tests/cover/test_regex.py @@ -134,12 +134,16 @@ def pred(s): "[^\\S]", # categories ], ) -@pytest.mark.parametrize("encode", [False, True]) +@pytest.mark.parametrize("encode", [None, False, True]) def test_can_generate(pattern, encode): + alphabet = st.characters(max_codepoint=1000) if encode is None else None if encode: pattern = pattern.encode("ascii") with local_settings(settings(suppress_health_check=[HealthCheck.data_too_large])): - assert_all_examples(st.from_regex(pattern), re.compile(pattern).search) + assert_all_examples( + st.from_regex(pattern, alphabet=alphabet), + re.compile(pattern).search, + ) @pytest.mark.parametrize( @@ -268,8 +272,8 @@ def test_groupref_not_shared_between_regex(): @given(st.data()) def test_group_ref_is_not_shared_between_identical_regex(data): pattern = re.compile("^(.+)\\1\\Z", re.UNICODE) - x = data.draw(base_regex_strategy(pattern)) - y = data.draw(base_regex_strategy(pattern)) + x = data.draw(base_regex_strategy(pattern, alphabet=st.characters())) + y = data.draw(base_regex_strategy(pattern, alphabet=st.characters())) assume(x != y) assert pattern.match(x).end() == len(x) assert pattern.match(y).end() == len(y) @@ -277,9 +281,11 @@ def test_group_ref_is_not_shared_between_identical_regex(data): @given(st.data()) def test_does_not_leak_groups(data): - a = data.draw(base_regex_strategy(re.compile("^(a)\\Z"))) + a = data.draw(base_regex_strategy(re.compile("^(a)\\Z"), alphabet=st.characters())) assert a == "a" - b = data.draw(base_regex_strategy(re.compile("^(?(1)a|b)(.)\\Z"))) + b = data.draw( + base_regex_strategy(re.compile("^(?(1)a|b)(.)\\Z"), alphabet=st.characters()) + ) assert b[0] == "b" @@ -469,6 +475,11 @@ def test_internals_can_disable_newline_from_dollar_for_jsonschema(): pattern = "^abc$" find_any(st.from_regex(pattern), lambda s: s == "abc\n") assert_all_examples( - regex_strategy(pattern, False, _temp_jsonschema_hack_no_end_newline=True), + regex_strategy( + pattern, + False, + alphabet=st.characters(), + _temp_jsonschema_hack_no_end_newline=True, + ), lambda s: s == "abc", ) diff --git a/hypothesis-python/tests/cover/test_text.py b/hypothesis-python/tests/cover/test_text.py index 964db142f3..c1cb7f03b2 100644 --- a/hypothesis-python/tests/cover/test_text.py +++ b/hypothesis-python/tests/cover/test_text.py @@ -12,7 +12,7 @@ def test_rewriting_integers_covers_right_range(): - strategy = OneCharStringStrategy() + strategy = OneCharStringStrategy.from_characters_args() rewritten = [strategy.rewrite_integer(i) for i in range(256)] assert sorted(rewritten) == sorted(range(256)) diff --git a/hypothesis-python/tests/nocover/test_regex.py b/hypothesis-python/tests/nocover/test_regex.py index 11b1d4a31f..c16c4a360a 100644 --- a/hypothesis-python/tests/nocover/test_regex.py +++ b/hypothesis-python/tests/nocover/test_regex.py @@ -63,7 +63,7 @@ def conservative_regex(draw): @given(st.data()) def test_conservative_regex_are_correct_by_construction(data): pattern = re.compile(data.draw(CONSERVATIVE_REGEX), flags=data.draw(FLAGS)) - result = data.draw(base_regex_strategy(pattern)) + result = data.draw(base_regex_strategy(pattern, alphabet=st.characters())) # We'll skip "capital I with dot above" due to awful casefolding behaviour # and "latin small letter dotless i" for the same reason. assume({"ı", "İ"}.isdisjoint(pattern.pattern + result))