Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Efficient filter-rewriting for st.text(...).filter(str.isidentifier) #3725

Merged
merged 2 commits on Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
RELEASE_TYPE: patch

This patch by Reagan Lee makes ``st.text(...).filter(str.isidentifier)``
return an efficient custom strategy (:issue:`3480`).
22 changes: 17 additions & 5 deletions hypothesis-python/src/hypothesis/internal/intervalsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,21 @@ def difference(self, other):
return IntervalSet(map(tuple, result))

def intersection(self, other):
    """Set intersection for lists of intervals.

    Both ``self.intervals`` and ``other.intervals`` are sorted lists of
    disjoint ``(start, end)`` pairs, so we can merge them in a single
    linear pass rather than via the previous difference-of-differences
    trick (``self.difference(other - self).difference(self - other)``).
    """
    assert isinstance(other, type(self)), other
    intervals = []
    i = j = 0
    while i < len(self.intervals) and j < len(other.intervals):
        u, v = self.intervals[i]
        U, V = other.intervals[j]
        if u > V:
            # other's interval ends before self's begins: advance other
            j += 1
        elif U > v:
            # self's interval ends before other's begins: advance self
            i += 1
        else:
            # Overlap: record it, then advance whichever interval ends
            # first, since the later-ending one may also overlap the next.
            intervals.append((max(u, U), min(v, V)))
            if v < V:
                i += 1
            else:
                j += 1
    return IntervalSet(intervals)
78 changes: 78 additions & 0 deletions hypothesis-python/src/hypothesis/strategies/_internal/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@
# obtain one at https://mozilla.org/MPL/2.0/.

import copy
import re
import warnings
from functools import lru_cache

from hypothesis.errors import HypothesisWarning, InvalidArgument
from hypothesis.internal import charmap
from hypothesis.internal.conjecture.utils import biased_coin, integer_range
from hypothesis.internal.intervalsets import IntervalSet
from hypothesis.strategies._internal.collections import ListStrategy
from hypothesis.strategies._internal.lazy import unwrap_strategies
from hypothesis.strategies._internal.strategies import SearchStrategy


Expand Down Expand Up @@ -166,6 +169,28 @@ def filter(self, condition):
HypothesisWarning,
stacklevel=2,
)
elems = unwrap_strategies(self.element_strategy)
if (
condition is str.isidentifier
and self.max_size >= 1
and isinstance(elems, OneCharStringStrategy)
):
from hypothesis.strategies import builds, nothing

id_start, id_continue = _identifier_characters()
if not (elems.intervals & id_start):
return nothing()
return builds(
"{}{}".format,
OneCharStringStrategy(elems.intervals & id_start),
TextStrategy(
OneCharStringStrategy(elems.intervals & id_continue),
min_size=max(0, self.min_size - 1),
max_size=self.max_size - 1,
),
# Filter to ensure that NFKC normalization keeps working in future
).filter(str.isidentifier)

# We use ListStrategy filter logic for the conditions that *only* imply
# the string is nonempty. Here, we increment the min_size but still apply
# the filter for conditions that imply nonempty *and specific contents*.
Expand All @@ -178,6 +203,59 @@ def filter(self, condition):
return super().filter(condition)


# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
# Python updates its Unicode version between minor releases, but fortunately
# these properties do not change between the Unicode versions in question.
# Parsed by the regex in _identifier_characters(); the "#"-prefixed lines
# inside the string are PropList file comments, not Python comments.
_PROPLIST = """
# ================================================

1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK

# Total code points: 6

# ================================================

00B7 ; Other_ID_Continue # Po MIDDLE DOT
0387 ; Other_ID_Continue # Po GREEK ANO TELEIA
1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE

# Total code points: 12
"""


@lru_cache
def _identifier_characters():
    """See https://docs.python.org/3/reference/lexical_analysis.html#identifiers"""
    # First, pull the Other_ID_Start / Other_ID_Continue code points out of
    # the PropList excerpt above.
    special = {"Other_ID_Start": "", "Other_ID_Continue": ""}
    for row in _PROPLIST.splitlines():
        matched = re.match(r"([0-9A-F.]+) +; (\w+) # ", row)
        if matched is None:
            continue
        codes, prop = matched.groups()
        first = int(codes[:4], base=16)
        last = int(codes[-4:], base=16)
        special[prop] += "".join(map(chr, range(first, last + 1)))

    # Basic start-set: the letter-ish Unicode categories, plus underscore
    # and the special extras collected above.
    id_start = charmap.query(
        include_categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        include_characters="_" + special["Other_ID_Start"],
    )
    id_start -= IntervalSet.from_string(
        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
        # Conveniently they're all in `id_start`, so we only need to do this once.
        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
    )
    # The continue-set is a superset of the start-set, extended with marks,
    # digits, connectors, and its own special extras.
    id_continue = id_start | charmap.query(
        include_categories=("Mn", "Mc", "Nd", "Pc"),
        include_characters=special["Other_ID_Continue"],
    )
    return id_start, id_continue


class FixedSizeBytes(SearchStrategy):
def __init__(self, size):
self.size = size
Expand Down
22 changes: 21 additions & 1 deletion hypothesis-python/tests/cover/test_filter_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from hypothesis.errors import HypothesisWarning, Unsatisfiable
from hypothesis.internal.floats import next_down, next_up
from hypothesis.internal.reflection import get_pretty_function_description
from hypothesis.strategies._internal.core import data
from hypothesis.strategies._internal.lazy import LazyStrategy, unwrap_strategies
from hypothesis.strategies._internal.numbers import FloatStrategy, IntegersStrategy
from hypothesis.strategies._internal.strategies import FilteredStrategy
Expand Down Expand Up @@ -343,14 +344,33 @@ def test_warns_on_suspicious_string_methods(method):
assert fs.min_size == 1


# str.isidentifier is no longer in this list: it now gets its own dedicated
# rewrite in TextStrategy.filter rather than the generic min_size-bump path.
@pytest.mark.parametrize("method", [str.isalnum])
def test_bumps_min_size_and_filters_for_content_str_methods(method):
    # Conditions that imply "nonempty with specific contents" should bump
    # min_size to 1 *and* keep the original predicate as a filter.
    s = unwrap_strategies(st.text())
    fs = s.filter(method)
    assert fs.filtered_strategy.min_size == 1
    assert fs.flat_conditions == (method,)

# NOTE(review): should we deterministically check ascii vs non-ascii
# alphabets, or is covering both via parametrization sufficient?
@pytest.mark.parametrize("al", [None, "cdef123", "cd12¥¦§©"])
@given(data())
def test_isidentifier_filter_properly_rewritten(al, data):
    if al is None:
        example = data.draw(st.text().filter(str.isidentifier))
    else:
        example = data.draw(st.text(alphabet=al).filter(str.isidentifier))
        # The subset check only makes sense when an explicit alphabet was
        # supplied; with al=None, `issubset(None)` would raise TypeError.
        assert set(example).issubset(al)
    assert example.isidentifier()


@pytest.mark.parametrize("al", ["¥¦§©"])
def test_isidentifer_filter_unsatisfiable(al):
    # None of these characters may appear in a Python identifier, so the
    # rewritten strategy is empty and drawing an example must fail.
    strategy = st.text(alphabet=al).filter(str.isidentifier)
    with pytest.raises(Unsatisfiable):
        strategy.example()


@pytest.mark.parametrize(
"op, attr, value, expected",
[
Expand Down
Loading