Skip to content

Commit

Permalink
Identifier grammar is complicated
Browse files Browse the repository at this point in the history
  • Loading branch information
Zac-HD committed Sep 2, 2023
1 parent cf0e650 commit a2ca289
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 66 deletions.
4 changes: 0 additions & 4 deletions hypothesis-python/src/hypothesis/internal/intervalsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

from hypothesis.errors import InvalidArgument


class IntervalSet:
@classmethod
Expand Down Expand Up @@ -229,6 +227,4 @@ def intersection(self, other):
i += 1
else:
j += 1
if not intervals:
raise InvalidArgument
return IntervalSet(intervals)
126 changes: 76 additions & 50 deletions hypothesis-python/src/hypothesis/strategies/_internal/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,18 @@
# obtain one at https://mozilla.org/MPL/2.0/.

import copy
import sys
import warnings
from functools import lru_cache
import re
import unicodedata
import warnings


from hypothesis.control import assume
from hypothesis.errors import HypothesisWarning, InvalidArgument
from hypothesis.internal import charmap
from hypothesis.internal.conjecture.utils import biased_coin, integer_range
from hypothesis.internal.intervalsets import IntervalSet
from hypothesis.strategies._internal.collections import ListStrategy
from hypothesis.strategies._internal.strategies import SearchStrategy
from hypothesis.strategies._internal.lazy import unwrap_strategies
from hypothesis.strategies._internal.utils import utf8_encodable
from hypothesis.strategies._internal.strategies import SearchStrategy


class OneCharStringStrategy(SearchStrategy):
Expand Down Expand Up @@ -92,18 +90,6 @@ def do_draw(self, data):

return chr(self.intervals[i])

def intersection(self, other):
assert isinstance(other, OneCharStringStrategy)
try:
self.intervals = self.intervals.intersection(other.intervals)
except InvalidArgument:
raise HypothesisWarning(
"No characters are allowed to be generated by this "
f"intersection of arguments: {self._arg_repr} and {other._arg_repr}"
)
# TODO: update _repr__?
return self

def rewrite_integer(self, i):
# We would like it so that, where possible, shrinking replaces
# characters with simple ascii characters, so we rejig this
Expand Down Expand Up @@ -184,40 +170,27 @@ def filter(self, condition):
HypothesisWarning,
stacklevel=2,
)
if condition is str.isidentifier and isinstance(
unwrap_strategies(self.element_strategy), OneCharStringStrategy
elems = unwrap_strategies(self.element_strategy)
if (
condition is str.isidentifier
and self.max_size >= 1
and isinstance(elems, OneCharStringStrategy)
):
from hypothesis.strategies._internal.core import characters, composite

@composite
def make_identifier(draw, element_strategy):
# TODO: Include Other_ID_Start_chars and Other_ID_Continue_chars
id_start_categories = ("Lu", "Ll", "Lt", "Lm", "Lo", "Nl")
id_continue_categories = id_start_categories + ("Mn", "Mc", "Nd", "Pc")

is_start_strategy = unwrap_strategies(
characters(
whitelist_categories=id_start_categories,
whitelist_characters=("_",),
)
).intersection(unwrap_strategies(self.element_strategy))

is_continue_strategy = unwrap_strategies(
characters(
whitelist_categories=(id_continue_categories),
whitelist_characters=("_",),
)
).intersection(unwrap_strategies(self.element_strategy))

start_var = draw(is_start_strategy)
continue_var = draw(is_continue_strategy)

example = start_var + continue_var
assume(example.isidentifier())

return example # is_continue: May want to make _ more likely to not waste as much time
from hypothesis.strategies import builds, nothing

return make_identifier(self.element_strategy)
id_start, id_continue = _identifier_characters()
if not (elems.intervals & id_start):
return nothing()
return builds(
"{}{}".format,
OneCharStringStrategy(elems.intervals & id_start),
TextStrategy(
OneCharStringStrategy(elems.intervals & id_continue),
min_size=max(0, self.min_size - 1),
max_size=self.max_size - 1,
),
# Filter to ensure that NFKC normalization keeps working in future
).filter(str.isidentifier)

# We use ListStrategy filter logic for the conditions that *only* imply
# the string is nonempty. Here, we increment the min_size but still apply
Expand All @@ -231,6 +204,59 @@ def make_identifier(draw, element_strategy):
return super().filter(condition)


# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
# Python updates its Unicode version between minor releases, but fortunately
# these properties do not change between the Unicode versions in question.
#
# This table is parsed at runtime by `_identifier_characters()`.  Each data
# line has the form `<code point or first..last> ; <property> # <comment>`;
# lines beginning with `#` do not match the parser's regex and are skipped.
# The only property names used here are `Other_ID_Start` and
# `Other_ID_Continue`, which are the keys the parser accumulates into.
_PROPLIST = """
# ================================================
1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
# Total code points: 6
# ================================================
00B7 ; Other_ID_Continue # Po MIDDLE DOT
0387 ; Other_ID_Continue # Po GREEK ANO TELEIA
1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE
# Total code points: 12
"""


@lru_cache
def _identifier_characters():
    """Return the ``(id_start, id_continue)`` character sets for identifiers.

    See https://docs.python.org/3/reference/lexical_analysis.html#identifiers

    ``id_start`` is the set of characters usable as the first character of an
    identifier and ``id_continue`` the set usable in any later position; the
    latter is built as ``id_start`` combined with the additional continue-only
    categories and characters.  Both values come from ``charmap.query``
    (presumably ``IntervalSet`` instances -- confirm against ``charmap``).

    The result is memoised with ``lru_cache``, so ``_PROPLIST`` is parsed and
    the character queries are run only on the first call.
    """
    # Start by computing the set of special characters
    chars = {"Other_ID_Start": "", "Other_ID_Continue": ""}
    for line in _PROPLIST.splitlines():
        if m := re.match(r"([0-9A-F.]+) +; (\w+) # ", line):
            codes, prop = m.groups()
            # An entry is either a single code point ("2118") or an inclusive
            # range ("1885..1886").  Split on the separator instead of slicing
            # a fixed four characters, so that code points above U+FFFF (five
            # or six hex digits) would also be parsed correctly.
            first, _, last = codes.partition("..")
            span = range(int(first, base=16), int(last or first, base=16) + 1)
            chars[prop] += "".join(chr(x) for x in span)

    # Then get the basic set by Unicode category and known extras
    id_start = charmap.query(
        include_categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        include_characters="_" + chars["Other_ID_Start"],
    )
    id_start -= IntervalSet.from_string(
        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
        # Conveniently they're all in `id_start`, so we only need to do this once.
        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
    )
    # Every valid start character is also a valid continue character, so the
    # continue set only needs the extra categories/characters added on top.
    id_continue = id_start | charmap.query(
        include_categories=("Mn", "Mc", "Nd", "Pc"),
        include_characters=chars["Other_ID_Continue"],
    )
    return id_start, id_continue


class FixedSizeBytes(SearchStrategy):
def __init__(self, size):
self.size = size
Expand Down
18 changes: 6 additions & 12 deletions hypothesis-python/tests/cover/test_filter_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,23 +358,17 @@ def test_bumps_min_size_and_filters_for_content_str_methods(method):
def test_isidentifier_filter_properly_rewritten(al, data):
if al == None:
example = data.draw(st.text().filter(str.isidentifier))
assert example.isidentifier()
else:
example = data.draw(st.text(alphabet=al).filter(str.isidentifier))
print(example)
assert set(example).issubset(set(al))
assert set(example).issubset(al)
assert example.isidentifier()


@pytest.mark.parametrize("al", ["¥¦§©"])
@given(data())
def test_isidentifer_filter_unsatisfiable(al, data):
s = unwrap_strategies(st.text(alphabet=al))

with pytest.warns(
HypothesisWarning, match="No characters are allowed to be generated by this"
):
fs = s.filter(str.isidentifier)
example = data.draw(fs)
def test_isidentifer_filter_unsatisfiable(al):
fs = st.text(alphabet=al).filter(str.isidentifier)
with pytest.raises(Unsatisfiable):
fs.example()


@pytest.mark.parametrize(
Expand Down

0 comments on commit a2ca289

Please sign in to comment.