Identifier grammar is complicated

HypothesisWorks · Sep 2, 2023 · a2ca289 · a2ca289
1 parent cf0e650
commit a2ca289
Show file tree

Hide file tree

Showing 3 changed files with 82 additions and 66 deletions.
diff --git a/hypothesis-python/src/hypothesis/internal/intervalsets.py b/hypothesis-python/src/hypothesis/internal/intervalsets.py
@@ -8,8 +8,6 @@
 # v. 2.0. If a copy of the MPL was not distributed with this file, You can
 # obtain one at https://mozilla.org/MPL/2.0/.
 
-from hypothesis.errors import InvalidArgument
-
 
 class IntervalSet:
     @classmethod
@@ -229,6 +227,4 @@ def intersection(self, other):
                     i += 1
                 else:
                     j += 1
-        if not intervals:
-            raise InvalidArgument
         return IntervalSet(intervals)
diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/strings.py b/hypothesis-python/src/hypothesis/strategies/_internal/strings.py
@@ -9,20 +9,18 @@
 # obtain one at https://mozilla.org/MPL/2.0/.
 
 import copy
-import sys
-import warnings
 from functools import lru_cache
+import re
+import unicodedata
+import warnings
 
-
-from hypothesis.control import assume
 from hypothesis.errors import HypothesisWarning, InvalidArgument
 from hypothesis.internal import charmap
 from hypothesis.internal.conjecture.utils import biased_coin, integer_range
 from hypothesis.internal.intervalsets import IntervalSet
 from hypothesis.strategies._internal.collections import ListStrategy
-from hypothesis.strategies._internal.strategies import SearchStrategy
 from hypothesis.strategies._internal.lazy import unwrap_strategies
-from hypothesis.strategies._internal.utils import utf8_encodable
+from hypothesis.strategies._internal.strategies import SearchStrategy
 
 
 class OneCharStringStrategy(SearchStrategy):
@@ -92,18 +90,6 @@ def do_draw(self, data):
 
         return chr(self.intervals[i])
 
-    def intersection(self, other):
-        assert isinstance(other, OneCharStringStrategy)
-        try:
-            self.intervals = self.intervals.intersection(other.intervals)
-        except InvalidArgument:
-            raise HypothesisWarning(
-                "No characters are allowed to be generated by this "
-                f"intersection of arguments: {self._arg_repr} and {other._arg_repr}"
-            )
-        # TODO: update _repr__?
-        return self
-
     def rewrite_integer(self, i):
         # We would like it so that, where possible, shrinking replaces
         # characters with simple ascii characters, so we rejig this
@@ -184,40 +170,27 @@ def filter(self, condition):
                 HypothesisWarning,
                 stacklevel=2,
             )
-        if condition is str.isidentifier and isinstance(
-            unwrap_strategies(self.element_strategy), OneCharStringStrategy
+        elems = unwrap_strategies(self.element_strategy)
+        if (
+            condition is str.isidentifier
+            and self.max_size >= 1
+            and isinstance(elems, OneCharStringStrategy)
         ):
-            from hypothesis.strategies._internal.core import characters, composite
-
-            @composite
-            def make_identifier(draw, element_strategy):
-                # TODO: Include Other_ID_Start_chars and Other_ID_Continue_chars
-                id_start_categories = ("Lu", "Ll", "Lt", "Lm", "Lo", "Nl")
-                id_continue_categories = id_start_categories + ("Mn", "Mc", "Nd", "Pc")
-
-                is_start_strategy = unwrap_strategies(
-                    characters(
-                        whitelist_categories=id_start_categories,
-                        whitelist_characters=("_",),
-                    )
-                ).intersection(unwrap_strategies(self.element_strategy))
-
-                is_continue_strategy = unwrap_strategies(
-                    characters(
-                        whitelist_categories=(id_continue_categories),
-                        whitelist_characters=("_",),
-                    )
-                ).intersection(unwrap_strategies(self.element_strategy))
-
-                start_var = draw(is_start_strategy)
-                continue_var = draw(is_continue_strategy)
-
-                example = start_var + continue_var
-                assume(example.isidentifier())
-
-                return example  # is_continue: May want to make _ more likely to not waste as much time
+            from hypothesis.strategies import builds, nothing
 
-            return make_identifier(self.element_strategy)
+            id_start, id_continue = _identifier_characters()
+            if not (elems.intervals & id_start):
+                return nothing()
+            return builds(
+                "{}{}".format,
+                OneCharStringStrategy(elems.intervals & id_start),
+                TextStrategy(
+                    OneCharStringStrategy(elems.intervals & id_continue),
+                    min_size=max(0, self.min_size - 1),
+                    max_size=self.max_size - 1,
+                ),
+                # Filter to ensure that NFKC normalization keeps working in future
+            ).filter(str.isidentifier)
 
         # We use ListStrategy filter logic for the conditions that *only* imply
         # the string is nonempty.  Here, we increment the min_size but still apply
@@ -231,6 +204,59 @@ def make_identifier(draw, element_strategy):
         return super().filter(condition)
 
 
+# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
+# Python updates its Unicode version between minor releases, but fortunately
+# these properties do not change between the Unicode versions in question.
+_PROPLIST = """
+# ================================================
+
+1885..1886    ; Other_ID_Start # Mn   [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
+2118          ; Other_ID_Start # Sm       SCRIPT CAPITAL P
+212E          ; Other_ID_Start # So       ESTIMATED SYMBOL
+309B..309C    ; Other_ID_Start # Sk   [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+
+# Total code points: 6
+
+# ================================================
+
+00B7          ; Other_ID_Continue # Po       MIDDLE DOT
+0387          ; Other_ID_Continue # Po       GREEK ANO TELEIA
+1369..1371    ; Other_ID_Continue # No   [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
+19DA          ; Other_ID_Continue # No       NEW TAI LUE THAM DIGIT ONE
+
+# Total code points: 12
+"""
+
+
+@lru_cache
+def _identifier_characters():
+    """See https://docs.python.org/3/reference/lexical_analysis.html#identifiers"""
+    # Start by computing the set of special characters
+    chars = {"Other_ID_Start": "", "Other_ID_Continue": ""}
+    for line in _PROPLIST.splitlines():
+        if m := re.match(r"([0-9A-F.]+) +; (\w+) # ", line):
+            codes, prop = m.groups()
+            span = range(int(codes[:4], base=16), int(codes[-4:], base=16) + 1)
+            chars[prop] += "".join(chr(x) for x in span)
+
+    # Then get the basic set by Unicode category and known extras
+    id_start = charmap.query(
+        include_categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
+        include_characters="_" + chars["Other_ID_Start"],
+    )
+    id_start -= IntervalSet.from_string(
+        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
+        # Conveniently they're all in `id_start`, so we only need to do this once.
+        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
+        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
+    )
+    id_continue = id_start | charmap.query(
+        include_categories=("Mn", "Mc", "Nd", "Pc"),
+        include_characters=chars["Other_ID_Continue"],
+    )
+    return id_start, id_continue
+
+
 class FixedSizeBytes(SearchStrategy):
     def __init__(self, size):
         self.size = size

diff --git a/hypothesis-python/tests/cover/test_filter_rewriting.py b/hypothesis-python/tests/cover/test_filter_rewriting.py
@@ -358,23 +358,17 @@ def test_bumps_min_size_and_filters_for_content_str_methods(method):
 def test_isidentifier_filter_properly_rewritten(al, data):
     if al == None:
         example = data.draw(st.text().filter(str.isidentifier))
-        assert example.isidentifier()
     else:
         example = data.draw(st.text(alphabet=al).filter(str.isidentifier))
-        print(example)
-        assert set(example).issubset(set(al))
+        assert set(example).issubset(al)
+    assert example.isidentifier()
 
 
 @pytest.mark.parametrize("al", ["¥¦§©"])
-@given(data())
-def test_isidentifer_filter_unsatisfiable(al, data):
-    s = unwrap_strategies(st.text(alphabet=al))
-
-    with pytest.warns(
-        HypothesisWarning, match="No characters are allowed to be generated by this"
-    ):
-        fs = s.filter(str.isidentifier)
-        example = data.draw(fs)
+def test_isidentifer_filter_unsatisfiable(al):
+    fs = st.text(alphabet=al).filter(str.isidentifier)
+    with pytest.raises(Unsatisfiable):
+        fs.example()
 
 
 @pytest.mark.parametrize(