diff --git a/docs/userguide/dev/rules_update.md b/docs/userguide/dev/rules_update.md index 57d3fd96d45b..b71da364e716 100644 --- a/docs/userguide/dev/rules_update.md +++ b/docs/userguide/dev/rules_update.md @@ -212,6 +212,16 @@ The rule updates are done first for ICU4C, and then ported (code changes) or mov Updating the test with new or revised rules requires changing the test source code, in `icu4c/source/test/intltest/rbbitst.cpp`. Look for the classes RBBICharMonkey, RBBIWordMonkey, RBBISentMonkey and RBBILineMonkey. The body of each class tracks the corresponding UAX-14 or UAX-29 specifications in defining the character classes and break rules. + The rules, as well as the partition of the code space used to generate the random sample strings, + are defined by regular expressions and Unicode sets generated by GenerateBreakTest in the + Unicode tools, which runs as part of MakeUnicodeFiles. + Copy the relevant lines from `Generated/UCD/17.0.0/extra/*BreakTest.cpp.txt` into `rbbitst.cpp`. + When developing changes to the line breaking algorithms that require changes to property assignments, + the generated rules and partition may need to be adjusted for testing. + However, the updated rules should only be merged into ICU once the property changes have actually been + made in the UCD and imported into ICU, at which point the unmodified generated partition and rules can + be used in `rbbitst.cpp`. + After making changes, as a final check, let the test run for an extended period of time, on the order of several hours. Run it from a terminal, and just interrupt it (Ctrl-C) when it's gone long enough. diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index f21c53a79713..c043a0a5d838 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1602,8 +1602,8 @@ class SegmentationRule { NO_BREAK = u'×', }; struct BreakContext { - BreakContext(std::size_t index) : indexInRemapped(index) {} - std::optional indexInRemapped; + BreakContext(int32_t index) : indexInRemapped(index) {} + std::optional indexInRemapped; const SegmentationRule *appliedRule = nullptr; }; @@ -1639,15 +1639,37 @@ class RemapRule : public SegmentationRule { auto const start = std::chrono::steady_clock::now(); UErrorCode status = U_ZERO_ERROR; UnicodeString result; - std::size_t i = 0; - std::ptrdiff_t offset = 0; + int32_t i = 0; + int32_t offset = 0; + // We find all matches of the `pattern_` and replace them according to + // the `replacement_`, producing the new remapped string `result`. + // For every position i in the original string, + // `resolved[i].indexInRemapped` is nullopt if i lies within a replaced + // match, and is set to the new index in `result` otherwise, by adding + // the accumulated difference `offset` between match lengths and + // replacement lengths. + // Consider a 4-codepoint, 6 code unit string s = ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩, where + // ␠ stands for U+0020 and U+12000 𒀀 and U+1D172 ◌𝅲 each require two code + // units, and apply the following two rules: + // 1. (?\P{lb=SP}) \p{lb=CM}* → ${X} + // 2. \p{lb=CM} → A + // The string remapped and the indexInRemapped values change as follows: + // indexInRemapped remapped string rule final + // (aligned on the initial string) applied offset + // 𒀀 ◌́ ␠ ◌𝅲 + // 0 1 2 3 4 5 6 ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩ (none) + // 0 - - 2 3 4 5 ⟨ 𒀀, ␠, ◌𝅲 ⟩ 1 -1 + // 0 - - 2 3 - 4 ⟨ 𒀀, ␠, A ⟩ 2 -1 + // + // Note that the last indexInRemapped is always equal to the length of + // the remapped string. std::unique_ptr matcher(pattern_->matcher(remapped, status)); while (matcher->find()) { for (;; ++i) { if (!resolved[i].indexInRemapped.has_value()) { continue; } - if (*resolved[i].indexInRemapped > static_cast(matcher->start64(status))) { + if (*resolved[i].indexInRemapped > matcher->start(status)) { break; } *resolved[i].indexInRemapped += offset; @@ -1656,22 +1678,37 @@ class RemapRule : public SegmentationRule { if (!resolved[i].indexInRemapped.has_value()) { continue; } - if (*resolved[i].indexInRemapped == static_cast(matcher->end64(status))) { + // Note that + // `*resolved[i].indexInRemapped > matcher->end(status)` should + // never happen with ordinary rules, but could in principle + // happen with rules that remap to code point sequences, e.g., + // 1. BC → TYZ + // 2. AT → X + // applied to ⟨ A, B, C ⟩: + // indexInRemapped remapped rule + // A B C + // 0 1 2 3 ⟨ A, B, C ⟩ (none) + // 0 1 - 4 ⟨ A, T, Y, Z ⟩ 1 + // 0 - - 3 ⟨ X, Y, Z ⟩ 2 + // Where for the application of rule 2, the match ends at + // position 2 in remapped, which does not correspond to a + // position in the original string. + if (*resolved[i].indexInRemapped >= matcher->end(status)) { break; } if (resolved[i].appliedRule != nullptr && - resolved[i].appliedRule->resolution() == BREAK) { + resolved[i].appliedRule->resolution() == BREAK) { printf("Replacement rule at remapped indices %d sqq. spans a break", matcher->start(status)); std::terminate(); } resolved[i].appliedRule = this; - resolved[i].indexInRemapped = std::nullopt; + resolved[i].indexInRemapped.reset(); } matcher->appendReplacement(result, replacement_, status); offset = result.length() - *resolved[i].indexInRemapped; } - for (; i < resolved.size(); ++i) { + for (; i < static_cast(resolved.size()); ++i) { if (!resolved[i].indexInRemapped.has_value()) { continue; } @@ -1691,7 +1728,10 @@ class RemapRule : public SegmentationRule { std::terminate(); } remapped = result; - U_ASSERT(U_SUCCESS(status)); + if (U_FAILURE(status)) { + puts(("Failed to apply rule " + name()).c_str()); + std::terminate(); + } timeSpent_ += std::chrono::steady_clock::now() - start; } @@ -1715,7 +1755,10 @@ class RegexRule : public SegmentationRule { endsWithBefore_.reset(RegexPattern::compile( ".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status)); after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status)); - U_ASSERT(U_SUCCESS(status)); + if (U_FAILURE(status)) { + puts(("Failed to compile regular expressions for rule " + this->name()).c_str()); + std::terminate(); + } } virtual void apply(UnicodeString &remapped, std::vector &resolved) const override { @@ -1764,7 +1807,13 @@ class RegexRule : public SegmentationRule { auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) { return r.indexInRemapped == afterSearch->start(status); }); - U_ASSERT(it != resolved.end()); + if (it == resolved.end()) { + puts(("Rule " + name() + + " found a break at a position which does not correspond to an index in " + "the original string") + .c_str()); + std::terminate(); + } U_ASSERT(U_SUCCESS(status)); if (it->appliedRule == nullptr && std::unique_ptr(endsWithBefore_->matcher(remapped, status)) @@ -1785,6 +1834,10 @@ class RegexRule : public SegmentationRule { U_ASSERT(U_SUCCESS(status)); } } + if (U_FAILURE(status)) { + puts(("Failed to apply rule " + name()).c_str()); + std::terminate(); + } timeSpent_ += std::chrono::steady_clock::now() - start; } @@ -3035,9 +3088,9 @@ RBBILineMonkey::RBBILineMonkey() : const UnicodeSet lbSA(uR"(\p{lb=SA})", status); for (auto it = partition.begin(); it != partition.end();) { if (lbSA.containsAll(it->second)) { - it = partition.erase(it); + it = partition.erase(it); } else { - ++it; + ++it; } } @@ -3079,9 +3132,9 @@ void RBBILineMonkey::setText(const UnicodeString &s) { int32_t RBBILineMonkey::next(int32_t startPos) { for (std::size_t i = startPos + 1; i < resolved.size(); ++i) { - if (resolved[i].appliedRule != nullptr && resolved[i].appliedRule->resolution() == - SegmentationRule::BREAK) { - return i; + if (resolved[i].appliedRule != nullptr && + resolved[i].appliedRule->resolution() == SegmentationRule::BREAK) { + return i; } } return -1;