Skip to content

Commit

Permalink
ICU-22984 Clean up old monkeys
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Dec 4, 2024
1 parent 757f27c commit e59065c
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 17 deletions.
10 changes: 10 additions & 0 deletions docs/userguide/dev/rules_update.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,16 @@ The rule updates are done first for ICU4C, and then ported (code changes) or mov

Updating the test with new or revised rules requires changing the test source code, in `icu4c/source/test/intltest/rbbitst.cpp`. Look for the classes RBBICharMonkey, RBBIWordMonkey, RBBISentMonkey and RBBILineMonkey. The body of each class tracks the corresponding UAX-14 or UAX-29 specifications in defining the character classes and break rules.

The rules, as well as the partition of the code space used to generate the random sample strings,
are defined by regular expressions and Unicode sets generated by GenerateBreakTest in the
Unicode tools, which runs as part of MakeUnicodeFiles.
Copy the relevant lines from `Generated/UCD/17.0.0/extra/*BreakTest.cpp.txt` into `rbbitst.cpp`.
When developing changes to the line breaking algorithms that require changes to property assignments,
the generated rules and partition may need to be adjusted for testing.
However, the updated rules should only be merged into ICU once the property changes have actually been
made in the UCD and imported into ICU, at which point the unmodified generated partition and rules can
be used in `rbbitst.cpp`.

After making changes, as a final check, let the test run for an extended period of time, on the order of several hours.
Run it from a terminal, and just interrupt it (Ctrl-C) when it's gone long enough.

Expand Down
87 changes: 70 additions & 17 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1602,8 +1602,8 @@ class SegmentationRule {
NO_BREAK = u'×',
};
struct BreakContext {
BreakContext(std::size_t index) : indexInRemapped(index) {}
std::optional<std::size_t> indexInRemapped;
BreakContext(int32_t index) : indexInRemapped(index) {}
std::optional<int32_t> indexInRemapped;
const SegmentationRule *appliedRule = nullptr;
};

Expand Down Expand Up @@ -1639,15 +1639,37 @@ class RemapRule : public SegmentationRule {
auto const start = std::chrono::steady_clock::now();
UErrorCode status = U_ZERO_ERROR;
UnicodeString result;
std::size_t i = 0;
std::ptrdiff_t offset = 0;
int32_t i = 0;
int32_t offset = 0;
// We find all matches of the `pattern_` and replace them according to
// the `replacement_`, producing the new remapped string `result`.
// For every position i in the original string,
// `resolved[i].indexInRemapped` is nullopt if i lies within a replaced
// match, and is set to the new index in `result` otherwise, by adding
// the accumulated difference `offset` between match lengths and
// replacement lengths.
// Consider a 4-codepoint, 6 code unit string s = ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩, where
// ␠ stands for U+0020 and U+12000 𒀀 and U+1D172 ◌𝅲 each require two code
// units, and apply the following two rules:
// 1. (?<X>\P{lb=SP}) \p{lb=CM}* → ${X}
// 2. \p{lb=CM} → A
// The string remapped and the indexInRemapped values change as follows:
// indexInRemapped remapped string rule final
// (aligned on the initial string) applied offset
// 𒀀 ◌́ ␠ ◌𝅲
// 0 1 2 3 4 5 6 ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩ (none)
// 0 - - 2 3 4 5 ⟨ 𒀀, ␠, ◌𝅲 ⟩ 1 -1
// 0 - - 2 3 - 4 ⟨ 𒀀, ␠, A ⟩ 2 -1
//
// Note that the last indexInRemapped is always equal to the length of
// the remapped string.
std::unique_ptr<RegexMatcher> matcher(pattern_->matcher(remapped, status));
while (matcher->find()) {
for (;; ++i) {
if (!resolved[i].indexInRemapped.has_value()) {
continue;
}
if (*resolved[i].indexInRemapped > static_cast<std::size_t>(matcher->start64(status))) {
if (*resolved[i].indexInRemapped > matcher->start(status)) {
break;
}
*resolved[i].indexInRemapped += offset;
Expand All @@ -1656,22 +1678,37 @@ class RemapRule : public SegmentationRule {
if (!resolved[i].indexInRemapped.has_value()) {
continue;
}
if (*resolved[i].indexInRemapped == static_cast<std::size_t>(matcher->end64(status))) {
// Note that
// `*resolved[i].indexInRemapped > matcher->end(status)` should
// never happen with ordinary rules, but could in principle
// happen with rules that remap to code point sequences, e.g.,
// 1. BC → TYZ
// 2. AT → X
// applied to ⟨ A, B, C ⟩:
// indexInRemapped remapped rule
// A B C
// 0 1 2 3 ⟨ A, B, C ⟩ (none)
// 0 1 - 4 ⟨ A, T, Y, Z ⟩ 1
// 0 - - 3 ⟨ X, Y, Z ⟩ 2
// Where for the application of rule 2, the match ends at
// position 2 in remapped, which does not correspond to a
// position in the original string.
if (*resolved[i].indexInRemapped >= matcher->end(status)) {
break;
}
if (resolved[i].appliedRule != nullptr &&
resolved[i].appliedRule->resolution() == BREAK) {
resolved[i].appliedRule->resolution() == BREAK) {
printf("Replacement rule at remapped indices %d sqq. spans a break",
matcher->start(status));
std::terminate();
}
resolved[i].appliedRule = this;
resolved[i].indexInRemapped = std::nullopt;
resolved[i].indexInRemapped.reset();
}
matcher->appendReplacement(result, replacement_, status);
offset = result.length() - *resolved[i].indexInRemapped;
}
for (; i < resolved.size(); ++i) {
for (; i < static_cast<int32_t>(resolved.size()); ++i) {
if (!resolved[i].indexInRemapped.has_value()) {
continue;
}
Expand All @@ -1691,7 +1728,10 @@ class RemapRule : public SegmentationRule {
std::terminate();
}
remapped = result;
U_ASSERT(U_SUCCESS(status));
if (U_FAILURE(status)) {
puts(("Failed to apply rule " + name()).c_str());
std::terminate();
}
timeSpent_ += std::chrono::steady_clock::now() - start;
}

Expand All @@ -1715,7 +1755,10 @@ class RegexRule : public SegmentationRule {
endsWithBefore_.reset(RegexPattern::compile(
".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
U_ASSERT(U_SUCCESS(status));
if (U_FAILURE(status)) {
puts(("Failed to compile regular expressions for rule " + this->name()).c_str());
std::terminate();
}
}

virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
Expand Down Expand Up @@ -1764,7 +1807,13 @@ class RegexRule : public SegmentationRule {
auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) {
return r.indexInRemapped == afterSearch->start(status);
});
U_ASSERT(it != resolved.end());
if (it == resolved.end()) {
puts(("Rule " + name() +
" found a break at a position which does not correspond to an index in "
"the original string")
.c_str());
std::terminate();
}
U_ASSERT(U_SUCCESS(status));
if (it->appliedRule == nullptr &&
std::unique_ptr<RegexMatcher>(endsWithBefore_->matcher(remapped, status))
Expand All @@ -1785,6 +1834,10 @@ class RegexRule : public SegmentationRule {
U_ASSERT(U_SUCCESS(status));
}
}
if (U_FAILURE(status)) {
puts(("Failed to apply rule " + name()).c_str());
std::terminate();
}
timeSpent_ += std::chrono::steady_clock::now() - start;
}

Expand Down Expand Up @@ -3035,9 +3088,9 @@ RBBILineMonkey::RBBILineMonkey() :
const UnicodeSet lbSA(uR"(\p{lb=SA})", status);
for (auto it = partition.begin(); it != partition.end();) {
if (lbSA.containsAll(it->second)) {
it = partition.erase(it);
it = partition.erase(it);
} else {
++it;
++it;
}
}

Expand Down Expand Up @@ -3079,9 +3132,9 @@ void RBBILineMonkey::setText(const UnicodeString &s) {

int32_t RBBILineMonkey::next(int32_t startPos) {
for (std::size_t i = startPos + 1; i < resolved.size(); ++i) {
if (resolved[i].appliedRule != nullptr && resolved[i].appliedRule->resolution() ==
SegmentationRule::BREAK) {
return i;
if (resolved[i].appliedRule != nullptr &&
resolved[i].appliedRule->resolution() == SegmentationRule::BREAK) {
return i;
}
}
return -1;
Expand Down

0 comments on commit e59065c

Please sign in to comment.