diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt index f3b16ded6790..12840aec7f39 100644 --- a/icu4c/source/data/brkitr/rules/char.txt +++ b/icu4c/source/data/brkitr/rules/char.txt @@ -24,13 +24,9 @@ $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; $Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; -# -# From cldr/common/properties/segments/ -# and issue CLDR-10994 -# -$Virama = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Virama}]; -$LinkingConsonant = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Consonant}]; -$ExtCccZwj = [[\p{gcb=Extend}-\p{ccc=0}] \p{gcb=ZWJ}]; +$InCBConsonant = [\p{InCB=Consonant}]; +$InCBExtend = [\p{InCB=Extend}]; +$InCBLinker = [\p{InCB=Linker}]; # Korean Syllable Definitions # @@ -64,8 +60,8 @@ $L ($L | $V | $LV | $LVT); # GB 9b $Prepend [^$Control $CR $LF]; -# GB 9.3, from CLDR-10994 -$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant; +# GB 9c +$InCBConsonant [ $InCBExtend $InCBLinker ]* $InCBLinker [ $InCBExtend $InCBLinker ]* $InCBConsonant; # GB 11 Do not break within emoji modifier sequences or emoji zwj sequences. 
$Extended_Pict $Extend* $ZWJ $Extended_Pict; diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 0d1623083b6b..aadadcecdabb 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1655,9 +1655,9 @@ class RBBICharMonkey: public RBBIMonkeyKind { UnicodeSet *fLVTSet; UnicodeSet *fHangulSet; UnicodeSet *fExtendedPictSet; - UnicodeSet *fViramaSet; - UnicodeSet *fLinkingConsonantSet; - UnicodeSet *fExtCccZwjSet; + UnicodeSet *fInCBLinkerSet; + UnicodeSet *fInCBConsonantSet; + UnicodeSet *fInCBExtendSet; UnicodeSet *fAnySet; const UnicodeString *fText; @@ -1690,11 +1690,9 @@ RBBICharMonkey::RBBICharMonkey() { fHangulSet->addAll(*fLVTSet); fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status); - fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" - "\\p{Indic_Syllabic_Category=Virama}]", status); - fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" - "\\p{Indic_Syllabic_Category=Consonant}]", status); - fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status); + fInCBLinkerSet = new UnicodeSet(u"[\\p{InCB=Linker}]", status); + fInCBConsonantSet = new UnicodeSet(u"[\\p{InCB=Consonant}]", status); + fInCBExtendSet = new UnicodeSet(u"[\\p{InCB=Extend}]", status); fAnySet = new UnicodeSet(0, 0x10ffff); // Create sets of characters, and add the names of the above character sets. 
@@ -1713,9 +1711,9 @@ RBBICharMonkey::RBBICharMonkey() { sets.emplace_back(*fHangulSet); classNames.emplace_back("Hangul"); sets.emplace_back(*fZWJSet); classNames.emplace_back("ZWJ"); sets.emplace_back(*fExtendedPictSet); classNames.emplace_back("ExtendedPict"); - sets.emplace_back(*fViramaSet); classNames.emplace_back("Virama"); - sets.emplace_back(*fLinkingConsonantSet); classNames.emplace_back("LinkingConsonant"); - sets.emplace_back(*fExtCccZwjSet); classNames.emplace_back("ExtCcccZwj"); + sets.emplace_back(*fInCBLinkerSet); classNames.emplace_back("InCB=Linker"); + sets.emplace_back(*fInCBConsonantSet); classNames.emplace_back("InCB=Consonant"); + sets.emplace_back(*fInCBExtendSet); classNames.emplace_back("InCB=Extend"); sets.emplace_back(*fAnySet); classNames.emplace_back("Any"); if (U_FAILURE(status)) { @@ -1838,19 +1836,20 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { continue; } - // Note: Viramas are also included in the ExtCccZwj class. - if (fLinkingConsonantSet->contains(c2)) { + if (fInCBConsonantSet->contains(c2)) { int pi = p1; bool sawVirama = false; - while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) { - if (fViramaSet->contains(fText->char32At(pi))) { + while (pi > 0 && (fInCBExtendSet->contains(fText->char32At(pi)) || + fInCBLinkerSet->contains(fText->char32At(pi)))) { + if (fInCBLinkerSet->contains(fText->char32At(pi))) { sawVirama = true; } pi = fText->moveIndex32(pi, -1); } - if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) { - setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* x LinkingConsonant"); - continue; + if (sawVirama && fInCBConsonantSet->contains(fText->char32At(pi))) { + setAppliedRule( + p2, R"(GB9c \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant})"); + continue; } } @@ -1903,9 +1902,9 @@ RBBICharMonkey::~RBBICharMonkey() { delete fAnySet; delete fZWJSet; delete fExtendedPictSet; - delete 
fViramaSet; - delete fLinkingConsonantSet; - delete fExtCccZwjSet; + delete fInCBLinkerSet; + delete fInCBConsonantSet; + delete fInCBExtendSet; } //------------------------------------------------------------------------------------------ diff --git a/icu4c/source/test/testdata/break_rules/grapheme.txt b/icu4c/source/test/testdata/break_rules/grapheme.txt index d5776f33c206..0a811057a579 100644 --- a/icu4c/source/test/testdata/break_rules/grapheme.txt +++ b/icu4c/source/test/testdata/break_rules/grapheme.txt @@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}]; LF = [\p{Grapheme_Cluster_Break = LF}]; Control = [[\p{Grapheme_Cluster_Break = Control}]]; -Extend = [[\p{Grapheme_Cluster_Break = Extend}]]; +Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]]; ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}]; Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; @@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}]; Extended_Pict = [:ExtPict:]; # Indic Sequences -Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]]; - -LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]]; - -ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ]; +InCBLinker = [\p{InCB=Linker}]; +InCBConsonant = [\p{InCB=Consonant}]; +InCBExtend = [\p{InCB=Extend}]; GB3: CR LF; GB4: (Control | CR | LF) ÷; @@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT); GB7: (LV | V) (V | T); GB8: (LVT | T) T; -GB11: Extended_Pict Extend* ZWJ Extended_Pict; -GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant; -GB9: . (Extend | ZWJ); +GB11: Extended_Pict Extend_* ZWJ Extended_Pict; +GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant; +GB9: . (Extend_ | ZWJ); GB9a: . 
SpacingMark; GB9b: Prepend .; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 3eb591576ef5..1c7fe9975699 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -169,18 +169,9 @@ # #•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• • -# -# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras -# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant -# Sample Chars: LinkingConsonant: \u0915 -# Virama: \u094d [also Extend] -# ExtCccZWJ: \u0308 -# Extend but not ExtCCCZWJ \u093A - -•\u0915\u094d\u0915• -•\u0915\u0308\u0308\u094d\u0308\u0308\u0915• -•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041• -•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915• +# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31. +# This test would have caught ICU-22956. +•સૻ્સૻ• # # From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/char.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/char.brk index 16a9aceee89a..fd22a1c22e5b 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/char.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/char.brk differ diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index 2dabd5f2e890..56a4801bea29 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -145,9 +145,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind { UnicodeSet fHangulSet; UnicodeSet fZWJSet; UnicodeSet fExtendedPictSet; - UnicodeSet fViramaSet; - UnicodeSet fLinkingConsonantSet; 
- UnicodeSet fExtCccZwjSet; + UnicodeSet fInCBLinkerSet; + UnicodeSet fInCBConsonantSet; + UnicodeSet fInCBExtendSet; UnicodeSet fAnySet; @@ -176,11 +176,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind { fHangulSet.addAll(fLVTSet); fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]"); - fViramaSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" - + "\\p{Indic_Syllabic_Category=Virama}]"); - fLinkingConsonantSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" - + "\\p{Indic_Syllabic_Category=Consonant}]"); - fExtCccZwjSet = new UnicodeSet("[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]"); + fInCBLinkerSet = new UnicodeSet("[\\p{InCB=Linker}]"); + fInCBConsonantSet = new UnicodeSet("[\\p{InCB=Consonant}]"); + fInCBExtendSet = new UnicodeSet("[\\p{InCB=Extend}]"); fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]"); @@ -196,9 +194,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind { fSets.add(fAnySet); fClassNames.add("Any"); fSets.add(fZWJSet); fClassNames.add("ZWJ"); fSets.add(fExtendedPictSet); fClassNames.add("ExtendedPict"); - fSets.add(fViramaSet); fClassNames.add("Virama"); - fSets.add(fLinkingConsonantSet); fClassNames.add("LinkingConsonant"); - fSets.add(fExtCccZwjSet); fClassNames.add("ExtCccZwj"); + fSets.add(fInCBLinkerSet); fClassNames.add("InCB=Linker"); + fSets.add(fInCBConsonantSet); fClassNames.add("InCB=Consonant"); + fSets.add(fInCBExtendSet); fClassNames.add("InCB=Extend"); } @@ -315,17 +313,18 @@ int next(int prevPos) { } // Note: Viramas are also included in the ExtCccZwj class. 
- if (fLinkingConsonantSet.contains(c2)) { + if (fInCBConsonantSet.contains(c2)) { int pi = p1; boolean sawVirama = false; - while (pi > 0 && fExtCccZwjSet.contains(fText.codePointAt(pi))) { - if (fViramaSet.contains(fText.codePointAt(pi))) { + while (pi > 0 && (fInCBExtendSet.contains(fText.codePointAt(pi)) || + fInCBLinkerSet.contains(fText.codePointAt(pi)))) { + if (fInCBLinkerSet.contains(fText.codePointAt(pi))) { sawVirama = true; } pi = fText.offsetByCodePoints(pi, -1); } - if (sawVirama && fLinkingConsonantSet.contains(fText.codePointAt(pi))) { - setAppliedRule(p2, "GB 9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant"); + if (sawVirama && fInCBConsonantSet.contains(fText.codePointAt(pi))) { + setAppliedRule(p2, "GB9c \\p{InCB=Consonant} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* \\p{InCB=Linker} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* × \\p{InCB=Consonant}"); continue; } } diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt index d5776f33c206..0a811057a579 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt @@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}]; LF = [\p{Grapheme_Cluster_Break = LF}]; Control = [[\p{Grapheme_Cluster_Break = Control}]]; -Extend = [[\p{Grapheme_Cluster_Break = Extend}]]; +Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]]; ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}]; Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; @@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}]; Extended_Pict = [:ExtPict:]; # Indic Sequences -Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]]; - -LinkingConsonant = 
[[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]]; - -ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ]; +InCBLinker = [\p{InCB=Linker}]; +InCBConsonant = [\p{InCB=Consonant}]; +InCBExtend = [\p{InCB=Extend}]; GB3: CR LF; GB4: (Control | CR | LF) ÷; @@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT); GB7: (LV | V) (V | T); GB8: (LVT | T) T; -GB11: Extended_Pict Extend* ZWJ Extended_Pict; -GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant; -GB9: . (Extend | ZWJ); +GB11: Extended_Pict Extend_* ZWJ Extended_Pict; +GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant; +GB9: . (Extend_ | ZWJ); GB9a: . SpacingMark; GB9b: Prepend .; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 3eb591576ef5..1c7fe9975699 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -169,18 +169,9 @@ # #•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• • -# -# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras -# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant -# Sample Chars: LinkingConsonant: \u0915 -# Virama: \u094d [also Extend] -# ExtCccZWJ: \u0308 -# Extend but not ExtCCCZWJ \u093A - -•\u0915\u094d\u0915• -•\u0915\u0308\u0308\u094d\u0308\u0308\u0915• -•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041• -•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915• +# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31. +# This test would have caught ICU-22956. +•સૻ્સૻ• # # From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt