ICU-22984 Clean up old monkeys

unicode-org · Dec 4, 2024 · e59065c · e59065c
1 parent 757f27c
commit e59065c
Show file tree

Hide file tree

Showing 2 changed files with 80 additions and 17 deletions.
diff --git a/docs/userguide/dev/rules_update.md b/docs/userguide/dev/rules_update.md
@@ -212,6 +212,16 @@ The rule updates are done first for ICU4C, and then ported (code changes) or mov
 
     Updating the test with new or revised rules requires changing the test source code, in `icu4c/source/test/intltest/rbbitst.cpp`. Look for the classes RBBICharMonkey, RBBIWordMonkey, RBBISentMonkey and RBBILineMonkey. The body of each class tracks the corresponding UAX-14 or UAX-29 specifications in defining the character classes and break rules.
 
+    The rules, as well as the partition of the code space used to generate the random sample strings,
+    are defined by regular expressions and Unicode sets generated by GenerateBreakTest in the
+    Unicode tools, which runs as part of MakeUnicodeFiles.
+    Copy the relevant lines from `Generated/UCD/17.0.0/extra/*BreakTest.cpp.txt` into `rbbitst.cpp`.
+    When developing changes to the line breaking algorithms that require changes to property assignments,
+    the generated rules and partition may need to be adjusted for testing.
+    However, the updated rules should only be merged into ICU once the property changes have actually been
+    made in the UCD and imported into ICU, at which point the unmodified generated partition and rules can
+    be used in `rbbitst.cpp`.
+
     After making changes, as a final check, let the test run for an extended period of time, on the order of several hours.
     Run it from a terminal, and just interrupt it (Ctrl-C) when it's gone long enough.
 

diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
@@ -1602,8 +1602,8 @@ class SegmentationRule {
         NO_BREAK = u'×',
     };
     struct BreakContext {
-        BreakContext(std::size_t index) : indexInRemapped(index) {}
-        std::optional<std::size_t> indexInRemapped;
+        BreakContext(int32_t index) : indexInRemapped(index) {}
+        std::optional<int32_t> indexInRemapped;
         const SegmentationRule *appliedRule = nullptr;
     };
 
@@ -1639,15 +1639,37 @@ class RemapRule : public SegmentationRule {
         auto const start = std::chrono::steady_clock::now();
         UErrorCode status = U_ZERO_ERROR;
         UnicodeString result;
-        std::size_t i = 0;
-        std::ptrdiff_t offset = 0;
+        int32_t i = 0;
+        int32_t offset = 0;
+        // We find all matches of the `pattern_` and replace them according to
+        // the `replacement_`, producing the new remapped string `result`.
+        // For every position i in the original string,
+        // `resolved[i].indexInRemapped` is nullopt if i lies within a replaced
+        // match, and is set to the new index in `result` otherwise, by adding
+        // the accumulated difference `offset` between match lengths and
+        // replacement lengths.
+        // Consider a 4-codepoint, 6 code unit string s = ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩, where
+        // ␠ stands for U+0020 and U+12000 𒀀 and U+1D172 ◌𝅲 each require two code
+        // units, and apply the following two rules:
+        //   1. (?<X>\P{lb=SP}) \p{lb=CM}* → ${X}
+        //   2. \p{lb=CM} → A
+        // The string remapped and the indexInRemapped values change as follows:
+        //   indexInRemapped                   remapped string  rule      final
+        //   (aligned on the initial string)                    applied   offset
+        //      𒀀     ◌́     ␠     ◌𝅲
+        //   0  1  2     3     4  5  6         ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩   (none)
+        //   0  -  -     2     3  4  5         ⟨ 𒀀,    ␠, ◌𝅲 ⟩   1              -1
+        //   0  -  -     2     3  -  4         ⟨ 𒀀,    ␠, A ⟩   2              -1
+        //
+        // Note that the last indexInRemapped is always equal to the length of
+        // the remapped string.
         std::unique_ptr<RegexMatcher> matcher(pattern_->matcher(remapped, status));
         while (matcher->find()) {
             for (;; ++i) {
                 if (!resolved[i].indexInRemapped.has_value()) {
                     continue;
                 }
-                if (*resolved[i].indexInRemapped > static_cast<std::size_t>(matcher->start64(status))) {
+                if (*resolved[i].indexInRemapped > matcher->start(status)) {
                     break;
                 }
                 *resolved[i].indexInRemapped += offset;
@@ -1656,22 +1678,37 @@ class RemapRule : public SegmentationRule {
                 if (!resolved[i].indexInRemapped.has_value()) {
                     continue;
                 }
-                if (*resolved[i].indexInRemapped == static_cast<std::size_t>(matcher->end64(status))) {
+                // Note that
+                // `*resolved[i].indexInRemapped > matcher->end(status)` should
+                // never happen with ordinary rules, but could in principle
+                // happen with rules that remap to code point sequences, e.g.,
+                //   1. BC → TYZ
+                //   2. AT → X
+                // applied to ⟨ A, B, C ⟩:
+                //   indexInRemapped  remapped       rule
+                //    A B C
+                //   0 1 2 3          ⟨ A, B, C ⟩     (none)
+                //   0 1 - 4          ⟨ A, T, Y, Z ⟩  1
+                //   0 - - 3          ⟨ X, Y, Z ⟩     2
+                // Where for the application of rule 2, the match ends at
+                // position 2 in remapped, which does not correspond to a
+                // position in the original string.
+                if (*resolved[i].indexInRemapped >= matcher->end(status)) {
                     break;
                 }
                 if (resolved[i].appliedRule != nullptr &&
-                    resolved[i].appliedRule->resolution() == BREAK) {
+                        resolved[i].appliedRule->resolution() == BREAK) {
                     printf("Replacement rule at remapped indices %d sqq. spans a break",
                            matcher->start(status));
                     std::terminate();
                 }
                 resolved[i].appliedRule = this;
-                resolved[i].indexInRemapped = std::nullopt;
+                resolved[i].indexInRemapped.reset();
             }
             matcher->appendReplacement(result, replacement_, status);
             offset = result.length() - *resolved[i].indexInRemapped;
         }
-        for (; i < resolved.size(); ++i) {
+        for (; i < static_cast<int32_t>(resolved.size()); ++i) {
             if (!resolved[i].indexInRemapped.has_value()) {
                 continue;
             }
@@ -1691,7 +1728,10 @@ class RemapRule : public SegmentationRule {
             std::terminate();
         }
         remapped = result;
-        U_ASSERT(U_SUCCESS(status));
+        if (U_FAILURE(status)) {
+            puts(("Failed to apply rule " + name()).c_str());
+            std::terminate();
+        }
         timeSpent_ += std::chrono::steady_clock::now() - start;
     }
 
@@ -1715,7 +1755,10 @@ class RegexRule : public SegmentationRule {
         endsWithBefore_.reset(RegexPattern::compile(
             ".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
         after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
-        U_ASSERT(U_SUCCESS(status));
+        if (U_FAILURE(status)) {
+            puts(("Failed to compile regular expressions for rule " + this->name()).c_str());
+            std::terminate();
+        }
     }
 
     virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
@@ -1764,7 +1807,13 @@ class RegexRule : public SegmentationRule {
                     auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) {
                         return r.indexInRemapped == afterSearch->start(status);
                     });
-                    U_ASSERT(it != resolved.end());
+                    if (it == resolved.end()) {
+                        puts(("Rule " + name() +
+                              " found a break at a position which does not correspond to an index in "
+                              "the original string")
+                                 .c_str());
+                        std::terminate();
+                    }
                     U_ASSERT(U_SUCCESS(status));
                     if (it->appliedRule == nullptr &&
                         std::unique_ptr<RegexMatcher>(endsWithBefore_->matcher(remapped, status))
@@ -1785,6 +1834,10 @@ class RegexRule : public SegmentationRule {
                 U_ASSERT(U_SUCCESS(status));
             }
         }
+        if (U_FAILURE(status)) {
+            puts(("Failed to apply rule " + name()).c_str());
+            std::terminate();
+        }
         timeSpent_ += std::chrono::steady_clock::now() - start;
     }
 
@@ -3035,9 +3088,9 @@ RBBILineMonkey::RBBILineMonkey() :
     const UnicodeSet lbSA(uR"(\p{lb=SA})", status);
     for (auto it = partition.begin(); it != partition.end();) {
         if (lbSA.containsAll(it->second)) {
-          it = partition.erase(it);
+            it = partition.erase(it);
         } else {
-          ++it;
+            ++it;
         }
     }
 
@@ -3079,9 +3132,9 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
 
 int32_t RBBILineMonkey::next(int32_t startPos) {
     for (std::size_t i = startPos + 1; i < resolved.size(); ++i) {
-        if (resolved[i].appliedRule != nullptr && resolved[i].appliedRule->resolution() ==
-            SegmentationRule::BREAK) {
-          return i;
+        if (resolved[i].appliedRule != nullptr &&
+                resolved[i].appliedRule->resolution() == SegmentationRule::BREAK) {
+            return i;
         }
     }
     return -1;