Skip to content

Commit

Permalink
(?!.) ftw
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Dec 12, 2024
1 parent d428c8d commit 938ef97
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 20 deletions.
16 changes: 6 additions & 10 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2906,20 +2906,16 @@ RBBILineMonkey::RBBILineMonkey() :

std::list<std::pair<std::string, UnicodeSet>> partition;

// TODO(egg): The following two are workarounds for what seem to be ICU bugs;
// with UREGEX_DOTALL (but not UREGEX_MULTILINE):
// 1. /.*\u000A/ does not match CR LF;
// 2. /$/ matches ( BK | CR | LF | NL ) eot.
// TODO(egg): The following is a workaround for what seems to be an ICU bug:
// with UREGEX_DOTALL (but not UREGEX_MULTILINE), /.*\u000A/ does not match
// CR LF.
rules.push_back(std::make_unique<RegexRule>(uR"(CR LF ÷)", uR"(\u000D\u000A)", u'÷', uR"()"));
rules.push_back(std::make_unique<RegexRule>(
uR"([^ BK CR LF NL ] × [ BK CR LF NL ] eot)",
uR"([^ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ])",
u'×',
uR"([ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ] $)"));

rules.push_back(std::make_unique<RegexRule>(uR"(sot ÷ contra LB2)", uR"(^)", u'÷', uR"()"));
// This one could be part of the rules.
rules.push_back(std::make_unique<RegexRule>(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"($)"));
// Note that /$/ also matches just before a final ( BK | CR | LF | NL ) at eot,
// not only at eot itself, so we use (?!.) instead.
// The generated rules use the same (?!.).
rules.push_back(std::make_unique<RegexRule>(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"((?!.))"));

// --- NOLI ME TANGERE ---
// Generated by GenerateBreakTest.java in the Unicode tools.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -701,18 +701,11 @@ class NamedSet {
List<NamedSet> partition = new ArrayList<>();
rules = new ArrayList<>();

// /$/ matches ( BK | CR | LF | NL ) eot, so in this case we need to apply LB6 before
// LB3 gets incorrectly applied.
rules.add(new RegexRule(
"[^ BK CR LF NL ] × [ BK CR LF NL ] eot",
"[^ \\p{lb=BK} \\p{lb=CR} \\p{lb=LF} \\p{lb=NL} ]",
Resolution.NO_BREAK,
"[ \\p{lb=BK} \\p{lb=CR} \\p{lb=LF} \\p{lb=NL} ] $"));

rules.add(new RegexRule("sot ÷ contra LB2", "^", Resolution.BREAK, ""));
// This one could be part of the rules.
rules.add(new RegexRule("LB3 ÷ eot", "", Resolution.BREAK, "$"));

// Note that /$/ also matches just before a final ( BK | CR | LF | NL ) at eot,
// not only at eot itself, so we use (?!.) instead.
// The generated rules use the same (?!.).
rules.add(new RegexRule("LB3 ÷ eot", "", Resolution.BREAK, "(?!.)"));

// --- NOLI ME TANGERE ---
// Generated by GenerateBreakTest.java in the Unicode tools.
Expand Down

0 comments on commit 938ef97

Please sign in to comment.