Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add alphabet= argument to st.from_regex() #3730

Merged
merged 6 commits into from
Sep 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
RELEASE_TYPE: minor

The :func:`~hypothesis.strategies.from_regex` strategy now takes an optional
``alphabet=characters(codec="utf-8")`` argument for unicode strings, like
:func:`~hypothesis.strategies.text`.

This offers more and more-consistent control over the generated strings,
removing previously-hard-coded limitations. With ``fullmatch=False`` and
``alphabet=characters()``, surrogate characters are now possible in leading
and trailing text as well as the body of the match. Negated character classes
such as ``[^A-Z]`` or ``\S`` had a hard-coded exclusion of control characters
and surrogate characters; now they permit anything in ``alphabet=`` consistent
with the class, and control characters are permitted by default.
8 changes: 4 additions & 4 deletions hypothesis-python/docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ help narrow down any particularly weird bugs in complex environments.
-------------------

Fixes some lingering issues with inference of recursive types
in `~hypothesis.strategies.from_type`. Closes :issue:`3525`.
in :func:`~hypothesis.strategies.from_type`. Closes :issue:`3525`.

.. _v6.81.0:

Expand Down Expand Up @@ -335,8 +335,8 @@ is strongly recommended. You can ensure you have the dependencies with
-------------------

This patch continues the work started in :pull:`3651` by adding
:pypi:`ruff` linter rules for pyflakes, flake8-comprehensions, and
flake8-implicit-str-concat.
:pypi:`ruff` linter rules for :pypi:`pyflakes`, :pypi:`flake8-comprehensions`,
and :pypi:`flake8-implicit-str-concat`.

.. _v6.75.5:

Expand Down Expand Up @@ -1184,7 +1184,7 @@ is really annoying. See :issue:`2701` for details.
6.48.0 - 2022-06-27
-------------------

This release raises :class:`~unittest.SkipTest` for which never executed any
This release raises :class:`~unittest.SkipTest` for tests which never executed any
examples, for example because the :obj:`~hypothesis.settings.phases` setting
excluded the :obj:`~hypothesis.Phase.explicit`, :obj:`~hypothesis.Phase.reuse`,
and :obj:`~hypothesis.Phase.generate` phases. This helps to avoid cases where
Expand Down
10 changes: 1 addition & 9 deletions hypothesis-python/src/hypothesis/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
from hypothesis.internal.compat import (
PYPY,
BaseExceptionGroup,
add_note,
bad_django_TestCase,
get_type_hints,
int_from_bytes,
Expand Down Expand Up @@ -1008,15 +1009,6 @@ def run_engine(self):
_raise_to_user(errors_to_report, self.settings, report_lines)


def add_note(exc, note):
    """Attach ``note`` to the exception ``exc``.

    ``exc.add_note(note)`` is tried first (the PEP 678 method).  If that
    raises ``AttributeError``, the note is appended to the ``exc.__notes__``
    list instead, creating the list when the attribute does not exist yet.
    """
    try:
        exc.add_note(note)
    except AttributeError:
        # Manual fallback: maintain the ``__notes__`` list ourselves.
        try:
            notes = exc.__notes__
        except AttributeError:
            exc.__notes__ = []
            notes = exc.__notes__
        notes.append(note)


def _raise_to_user(errors_to_report, settings, target_lines, trailer=""):
"""Helper function for attaching notes and grouping multiple errors."""
failing_prefix = "Falsifying example: "
Expand Down
144 changes: 13 additions & 131 deletions hypothesis-python/src/hypothesis/internal/charmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from hypothesis.configuration import mkdir_p, storage_directory
from hypothesis.errors import InvalidArgument
from hypothesis.internal.intervalsets import IntervalSet

intervals = Tuple[Tuple[int, int], ...]
cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals], intervals]
Expand Down Expand Up @@ -146,126 +147,6 @@ def as_general_categories(cats, name="cats"):
return tuple(c for c in cs if c in out)


def _union_intervals(x, y):
    """Merge two sequences of intervals into a single tuple of intervals.

    Any integer bounded by `x` or `y` is also bounded by the result.
    Intervals are ``(low, high)`` pairs with inclusive bounds; overlapping
    and directly adjacent intervals are coalesced.  ``x`` and ``y`` may each
    be a list or a tuple (in any combination), and their intervals may be
    any two-element sequences: the result is always a tuple of tuples.

    If either argument is empty the other one is returned (converted to a
    tuple of tuples) without any merging.

    >>> _union_intervals([(3, 10)], [(1, 2), (5, 17)])
    ((1, 17),)
    """
    if not x:
        return tuple((u, v) for u, v in y)
    if not y:
        return tuple((u, v) for u, v in x)
    # Normalise every interval to a tuple *before* combining and sorting, so
    # that list/tuple mixtures neither fail on concatenation or comparison
    # nor leak through unconverted into the result.
    ordered = sorted([(u, v) for u, v in x] + [(u, v) for u, v in y])
    merged = [ordered[0]]
    for u, v in ordered[1:]:
        # `ordered` is ascending, so with (a, b) the last merged interval we
        # know a <= u.  Assuming u <= v and a <= b, there are two overlap
        # cases and one disjoint case:
        #   |  u--v  |  u----v  |       u--v  |
        #   | a----b | a--b     | a--b        |
        a, b = merged[-1]
        if u <= b + 1:
            # Overlapping or adjacent: extend the previous interval.
            merged[-1] = (a, max(v, b))
        else:
            # Disjoint: start a new interval.
            merged.append((u, v))
    return tuple(merged)


def _subtract_intervals(x, y):
    """Set difference for sequences of intervals.

    Returns a tuple of ``(low, high)`` tuples (inclusive bounds) that bounds
    all values bounded by ``x`` that are not also bounded by ``y``.  ``x``
    and ``y`` are expected to be in sorted order.

    The example below removes the values 2, 3, 9 and 10 from the interval:

    >>> _subtract_intervals([(1, 10)], [(2, 3), (9, 15)])
    ((1, 1), (4, 8))
    """
    if not y:
        # Nothing to subtract.  Still convert each interval to a tuple so the
        # return type matches the general path below.
        return tuple(map(tuple, x))
    # Work on mutable copies: the left endpoint of the current interval is
    # advanced in place as pieces of it are removed.
    remaining = [list(interval) for interval in x]
    i = 0
    j = 0
    result = []
    while i < len(remaining) and j < len(y):
        # Iterate in parallel over x and y.  j stays pointing at the smallest
        # interval of y that could still overlap with some element of x at
        # index >= i.  Similarly, i is not incremented until we know that
        # x[i] does not overlap with any element of y at index >= j.
        xl, xr = remaining[i]
        assert xl <= xr
        yl, yr = y[j]
        assert yl <= yr

        if yr < xl:
            # y[j] is strictly to the left of x[i], so it will not overlap
            # with it or with any later interval of x.
            j += 1
        elif yl > xr:
            # y[j] is strictly to the right of x[i], so all of x[i] goes into
            # the result as no further interval of y will intersect it.
            result.append(remaining[i])
            i += 1
        elif yl <= xl:
            if yr >= xr:
                # x[i] is contained entirely in y[j]: skip it.
                i += 1
            else:
                # The beginning of x[i] is covered by y[j].  Trim it off and
                # move past y[j]; the trimmed interval is not emitted yet
                # because later intervals of y may cut into it further.
                remaining[i][0] = yr + 1
                j += 1
        else:
            # yl > xl, so the left-hand part of x[i] survives.
            result.append((xl, yl - 1))

            if yr + 1 <= xr:
                # y[j] ends before x[i] does: keep the tail of x[i] around
                # (a later interval of y may still remove part of it) and
                # advance j, which can no longer overlap anything remaining.
                remaining[i][0] = yr + 1
                j += 1
            else:
                # Everything in x[i] beyond the part already emitted is
                # covered by y[j], so move to the next interval of x.
                i += 1
    # Any intervals of x left over do not overlap with any of y (otherwise j
    # would not have reached the end), so they go into the result unchanged.
    result.extend(remaining[i:])
    return tuple(map(tuple, result))


def _intervals(s):
    """Return a tuple of intervals, covering the codepoints of characters in
    `s`.

    Each character becomes a one-codepoint interval; the collection is then
    passed to `_union_intervals` (as both arguments) to produce the result.

    >>> _intervals('abcdef0123456789')
    ((48, 57), (97, 102))
    """
    singletons = tuple((cp, cp) for cp in map(ord, sorted(s)))
    return _union_intervals(singletons, singletons)


category_index_cache = {(): ()}


Expand Down Expand Up @@ -306,11 +187,14 @@ def _query_for_key(key):
pass
assert key
if set(key) == set(categories()):
result = ((0, sys.maxunicode),)
result = IntervalSet([(0, sys.maxunicode)])
else:
result = _union_intervals(_query_for_key(key[:-1]), charmap()[key[-1]])
category_index_cache[key] = result
return result
result = IntervalSet(_query_for_key(key[:-1])).union(
IntervalSet(charmap()[key[-1]])
)
assert isinstance(result, IntervalSet)
category_index_cache[key] = result.intervals
return result.intervals


limited_category_index_cache: cache_type = {}
Expand Down Expand Up @@ -344,14 +228,14 @@ def query(
if max_codepoint is None:
max_codepoint = sys.maxunicode
catkey = _category_key(exclude_categories, include_categories)
character_intervals = _intervals(include_characters or "")
exclude_intervals = _intervals(exclude_characters or "")
character_intervals = IntervalSet.from_string(include_characters or "")
exclude_intervals = IntervalSet.from_string(exclude_characters or "")
qkey = (
catkey,
min_codepoint,
max_codepoint,
character_intervals,
exclude_intervals,
character_intervals.intervals,
exclude_intervals.intervals,
)
try:
return limited_category_index_cache[qkey]
Expand All @@ -362,8 +246,6 @@ def query(
for u, v in base:
if v >= min_codepoint and u <= max_codepoint:
result.append((max(u, min_codepoint), min(v, max_codepoint)))
result = tuple(result)
result = _union_intervals(result, character_intervals)
result = _subtract_intervals(result, exclude_intervals)
result = (IntervalSet(result) | character_intervals) - exclude_intervals
limited_category_index_cache[qkey] = result
return result
9 changes: 9 additions & 0 deletions hypothesis-python/src/hypothesis/internal/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@
WINDOWS = platform.system() == "Windows"


def add_note(exc, note):
    """Record ``note`` on the exception ``exc``.

    Delegates to ``exc.add_note(note)``.  When that call raises
    ``AttributeError``, the note is stored on the ``exc.__notes__`` list
    directly, and the list is created first if ``exc`` has no such attribute.
    """
    try:
        exc.add_note(note)
    except AttributeError:
        if hasattr(exc, "__notes__"):
            # A notes list is already present: just extend it.
            exc.__notes__.append(note)
        else:
            # First note on this object: create the list, then add to it.
            exc.__notes__ = []
            exc.__notes__.append(note)


def escape_unicode_characters(s: str) -> str:
    """Return ``s`` encoded with the ``unicode_escape`` codec, as ASCII text."""
    escaped_bytes = codecs.encode(s, "unicode_escape")
    return escaped_bytes.decode("ascii")

Expand Down
Loading
Loading