From 0b9eb9ca711dc81d51af0b96e7f9f1004c0fecb6 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 29 Oct 2024 14:52:08 +0100 Subject: [PATCH] ICU-22956 Use InCB for grapheme cluster segmentation --- icu4c/source/data/brkitr/rules/char.txt | 14 +++--- icu4c/source/test/intltest/rbbitst.cpp | 41 +++++++++--------- .../test/testdata/break_rules/grapheme.txt | 16 +++---- icu4c/source/test/testdata/rbbitst.txt | 15 ++----- .../ibm/icu/impl/data/icudata/brkitr/char.brk | Bin 13984 -> 13360 bytes .../ibm/icu/dev/test/rbbi/RBBITestMonkey.java | 31 +++++++------ .../dev/test/rbbi/break_rules/grapheme.txt | 16 +++---- .../com/ibm/icu/dev/test/rbbi/rbbitst.txt | 15 ++----- 8 files changed, 60 insertions(+), 88 deletions(-) diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt index f3b16ded6790..12840aec7f39 100644 --- a/icu4c/source/data/brkitr/rules/char.txt +++ b/icu4c/source/data/brkitr/rules/char.txt @@ -24,13 +24,9 @@ $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; $Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; -# -# From cldr/common/properties/segments/ -# and issue CLDR-10994 -# -$Virama = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Virama}]; -$LinkingConsonant = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Consonant}]; -$ExtCccZwj = [[\p{gcb=Extend}-\p{ccc=0}] \p{gcb=ZWJ}]; +$InCBConsonant = [\p{InCB=Consonant}]; +$InCBExtend = [\p{InCB=Extend}]; +$InCBLinker = [\p{InCB=Linker}]; # Korean Syllable Definitions # @@ -64,8 +60,8 @@ $L ($L | $V | $LV | $LVT); # GB 9b $Prepend [^$Control $CR $LF]; -# GB 9.3, from CLDR-10994 -$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant; +# GB 9c +$InCBConsonant [ $InCBExtend $InCBLinker ]* $InCBLinker [ $InCBExtend $InCBLinker ]* $InCBConsonant; # GB 11 Do not break within emoji modifier 
sequences or emoji zwj sequences. $Extended_Pict $Extend* $ZWJ $Extended_Pict; diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 0d1623083b6b..aadadcecdabb 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1655,9 +1655,9 @@ class RBBICharMonkey: public RBBIMonkeyKind { UnicodeSet *fLVTSet; UnicodeSet *fHangulSet; UnicodeSet *fExtendedPictSet; - UnicodeSet *fViramaSet; - UnicodeSet *fLinkingConsonantSet; - UnicodeSet *fExtCccZwjSet; + UnicodeSet *fInCBLinkerSet; + UnicodeSet *fInCBConsonantSet; + UnicodeSet *fInCBExtendSet; UnicodeSet *fAnySet; const UnicodeString *fText; @@ -1690,11 +1690,9 @@ RBBICharMonkey::RBBICharMonkey() { fHangulSet->addAll(*fLVTSet); fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status); - fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" - "\\p{Indic_Syllabic_Category=Virama}]", status); - fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" - "\\p{Indic_Syllabic_Category=Consonant}]", status); - fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status); + fInCBLinkerSet = new UnicodeSet(u"[\\p{InCB=Linker}]", status); + fInCBConsonantSet = new UnicodeSet(u"[\\p{InCB=Consonant}]", status); + fInCBExtendSet = new UnicodeSet(u"[\\p{InCB=Extend}]", status); fAnySet = new UnicodeSet(0, 0x10ffff); // Create sets of characters, and add the names of the above character sets. 
@@ -1713,9 +1711,9 @@ RBBICharMonkey::RBBICharMonkey() { sets.emplace_back(*fHangulSet); classNames.emplace_back("Hangul"); sets.emplace_back(*fZWJSet); classNames.emplace_back("ZWJ"); sets.emplace_back(*fExtendedPictSet); classNames.emplace_back("ExtendedPict"); - sets.emplace_back(*fViramaSet); classNames.emplace_back("Virama"); - sets.emplace_back(*fLinkingConsonantSet); classNames.emplace_back("LinkingConsonant"); - sets.emplace_back(*fExtCccZwjSet); classNames.emplace_back("ExtCcccZwj"); + sets.emplace_back(*fInCBLinkerSet); classNames.emplace_back("InCB=Linker"); + sets.emplace_back(*fInCBConsonantSet); classNames.emplace_back("InCB=Consonant"); + sets.emplace_back(*fInCBExtendSet); classNames.emplace_back("InCB=Extend"); sets.emplace_back(*fAnySet); classNames.emplace_back("Any"); if (U_FAILURE(status)) { @@ -1838,19 +1836,20 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { continue; } - // Note: Viramas are also included in the ExtCccZwj class. - if (fLinkingConsonantSet->contains(c2)) { + if (fInCBConsonantSet->contains(c2)) { int pi = p1; bool sawVirama = false; - while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) { - if (fViramaSet->contains(fText->char32At(pi))) { + while (pi > 0 && (fInCBExtendSet->contains(fText->char32At(pi)) || + fInCBLinkerSet->contains(fText->char32At(pi)))) { + if (fInCBLinkerSet->contains(fText->char32At(pi))) { sawVirama = true; } pi = fText->moveIndex32(pi, -1); } - if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) { - setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* x LinkingConsonant"); - continue; + if (sawVirama && fInCBConsonantSet->contains(fText->char32At(pi))) { + setAppliedRule( + p2, R"(GB9c \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant})"); + continue; } } @@ -1903,9 +1902,9 @@ RBBICharMonkey::~RBBICharMonkey() { delete fAnySet; delete fZWJSet; delete fExtendedPictSet; - delete 
fViramaSet; - delete fLinkingConsonantSet; - delete fExtCccZwjSet; + delete fInCBLinkerSet; + delete fInCBConsonantSet; + delete fInCBExtendSet; } //------------------------------------------------------------------------------------------ diff --git a/icu4c/source/test/testdata/break_rules/grapheme.txt b/icu4c/source/test/testdata/break_rules/grapheme.txt index d5776f33c206..0a811057a579 100644 --- a/icu4c/source/test/testdata/break_rules/grapheme.txt +++ b/icu4c/source/test/testdata/break_rules/grapheme.txt @@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}]; LF = [\p{Grapheme_Cluster_Break = LF}]; Control = [[\p{Grapheme_Cluster_Break = Control}]]; -Extend = [[\p{Grapheme_Cluster_Break = Extend}]]; +Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]]; ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}]; Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; @@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}]; Extended_Pict = [:ExtPict:]; # Indic Sequences -Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]]; - -LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]]; - -ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ]; +InCBLinker = [\p{InCB=Linker}]; +InCBConsonant = [\p{InCB=Consonant}]; +InCBExtend = [\p{InCB=Extend}]; GB3: CR LF; GB4: (Control | CR | LF) ÷; @@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT); GB7: (LV | V) (V | T); GB8: (LVT | T) T; -GB11: Extended_Pict Extend* ZWJ Extended_Pict; -GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant; -GB9: . (Extend | ZWJ); +GB11: Extended_Pict Extend_* ZWJ Extended_Pict; +GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant; +GB9: . (Extend_ | ZWJ); GB9a: . 
SpacingMark; GB9b: Prepend .; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 3eb591576ef5..1c7fe9975699 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -169,18 +169,9 @@ # #•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• • -# -# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras -# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant -# Sample Chars: LinkingConsonant: \u0915 -# Virama: \u094d [also Extend] -# ExtCccZWJ: \u0308 -# Extend but not ExtCCCZWJ \u093A - -•\u0915\u094d\u0915• -•\u0915\u0308\u0308\u094d\u0308\u0308\u0915• -•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041• -•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915• +# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31. +# This test would have caught ICU-22956. +•સૻ્સૻ• # # From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/char.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/char.brk index 16a9aceee89adb8570249f9b5159e6098ff54240..fd22a1c22e5b128c1a3ce0e03d7a8809a238b166 100644 GIT binary patch literal 13360 zcmeHNdyG_989#I9zGh~3W_H=#d9-OSaiu_TsRAvsE(JEC=++R}E?OSzWOrZ(mYHRC zk!}j9O|&u5Xf-x%Lo9?pnyS%)8nhDKsL?+p21DW>1yg(^jY&-tNW_Xr{J!(J_uO+I zI|~y3;k(>>zsK);o-=pvOd!u+dWdupk{}w{)I8vrgb+PJh%?xjnr+thP0Tn;2bVb? zdiaBnIFr=7-L z=kRoOoii~rGqrBz$|FaP^c`l-eU0Y6m181J-^9$pNe8l@If*q8vg#$cbi=t7ssldp zH#pZ4l5_}Jktbv&@cJ&eKMWWf&L{#bDZ+`dl5iSu#*n5Zbwf9GOSe;LT~DPmFpf~- zdQCM-c%9$@h2llh8=%EHj5b*xo!0_>P=Aasc~#wGd_vxzqn?-ECQP}{0~Q9GgiKABB^BsrNpmAs-qpzqPYrT@{m&v?>! 
z*?7||nLErE%s*IN)^_U~)5m+jj3l-zc9C_Zlwp(PZKA7E`2%u z@67u|PU2+7JU+-}zMlC_=0@Avwu8iJdkxNu4AY)%|8U}d?b-IH+RwIM&-P|Njr#2K zJi?qBDf`>(e~6R&KyD9QU-GQEUxEDBd>7N?x8XfMk$)xsR{mPY3OM(6oa*?UhtGq* zd~{l!k9uvQ=6FR-(*LLz={Mvicy$Rf#!2f>&oqx|X;rIr#3FC0*1|&p%_FEOY*zIs z8)w6`sxG~sX$m8%1!ZG{tNo$Ny+LSY=}A`uNT^~jm48g$qtfS+f1>y23h*{SkDGuw zMQ+$Ml6D`r|DfO8X z-YBJsQ!1gpv_vbE)=+L^>60olPPWdLc1rf2mG+lrB^hIXsq`Y#lundh_uEv-x=@-! zUFkfzsquG21(PjZCjU}&vAw(4Go`<{e7FZ+Q?`5N!MYJj*IGrA)8#~2gCk8~AF@L$ z6L?Qn4`V&KU)9CPb7YQOg_G|37@RLpxvRXm+y~zf#yyYZc8^~WhZW3~SK*ah(@N0Q zkI2tkkC3`pUSA%P@TVo-t2(kj_cP}`Av`aYn-6m)G&hUlt00jHN9M1^AGs6o9q17A z&7Cao$~DWi+zIAWo-IFH!esz4DaBKN^xL(zE^yVK`bxdZJsWyMF9cw)~ zzZS8%liY7j?i6x)c%I~5@uK8TQvPlHrWb#Bk;A!o@A<7Ay(9|<-Lw3PvV6$4ATJ|t zgV)#SPr$DPaOj8iFX+ehbNZY5dq$73+88#P##flle>J}CmxGe=6QMKC8&{0$Lb{#1 z+2i*zmze$L4yH4Y;N5)D{GK@n_qVwAmi2(O-ujF+Ykk%FzV*7_f8mmK!FpFdk}m1V zq~QuKW8PhI^F%Oib_=*9Ui@U=A^D@>uWXc5b!+m1YJpVrUu&U*s1zknFC{mVo;s3~ z@E)t8ZB@Z!?vUc+!m&{{hz3YdDQmpkjFW9G^HPwOecZK%+68%>(9k=E=dE&%Nj{J) zAgTpQr6>r)Q<~33;M99cRrqB1?O35qD`cl_Go2QZ(m-wLw3vbjKv!Ogj-qR~!AB)w zl<&gB%y4grY(fb@CX|6^n;VA?H`|hta(IW3R=eeTN%+_(6H=s z&w}@QhnA2q3LiafLGIlPPYaVo!DiT+!SsU7iFA+%OpU$5DYhL*fa0glATVEd8~~hc>~QiwhN*nH8eJK zK&Ou-AcZ6h=`k`UH{fg;358#r&)>6qk)mf2ZGkv2XExE>X@ghIZmcD-CXxyF?!4r+ z)B=am_yi$1E>XCXumD^IUuRa(76g-q5>$AW@LmX@rLG z%{e6OeD)9*P`MibmY;qw=?4{870LDSI9H`I08h36^SenY{02#yHz;JLrQ;R-pat3s zZfz-!%+lR1E24OOcr#)ja4vWWgq40egOBN`xJ)miu@7HVu}TcrN!CLISq$>D4Y9aE zbC?EOhe7fQgdtQN8@tHDx>7MvzW+()4j7Pp1aJebKX0e4;(JZMc+XrtjeUN ziCc)kR13R$RWHmB`;t6h&++c?Cb2xeJO(sV1owe3!h@n|Sg$TWhhRY-M>VP_A%zFl z?D3Ht#k2yHBj9KI&WL^Pe0hC>6x}~D$q>n6aOv5@9yhe&sIu`nf}qAAkwMKWGvbxcgtmV_(4_4mt$G#cJS>2&SX6^e2mQYq336ju+Z< zbGd~U>pDi2rt0a2!m&^T?k^P)DdXN^vutT9FhBYa{xk+db}ovhyJ3?nP~W=Iw%|8x zbi!RC6Y6A=ZOF^ER(ScZKf}|Al*MrFB6^{D7B3a|fVJ>t1c2uRJia#p;ng(ZJGU1A zC6N&TECGkp;IO>X?}*B3Qy`1Yqu?Wj`msnnI97v2K$Yx-0+8WI7sY+CKDv|fM*`bU zN#z1m%0Thb2BaNgP_{Yc8K56ps6TLsa4Q8)ih=_JD=Uv-utM}k0u3#gHiQ*2Q!1VE z#SnF(6H}wG_v2~xr0bA>JeaWc7HnnoC}2*%!~}MT#^Ti 
zCE|}#I3}+T7`|LHpCN5UfJ9!}B?yorizg7=*tltMexXWU5Afh2J_t0DsFY5{jV5CdTm zhHCW#Rn{T3G8uh`;%XT}#7G=s1QEk=h+&!~Hpmc-GBgq!2h13W^#yx#Yi$hXu?}S9 z*6r?Jym)M)Qmey#ve7uOaiUro|9GW2PB$m+UD%q??$X|&qrD?Xdxz*bysRG-z>X|i zidrrYBiB3JzjKe+tIU7*Ql9^4FKnY_LA$)j?*0fwQC4*)U1Zi6W!dWNUM>mo7)#`G@A8n3LWpW*h;G3_`uqCv`SKl3{{pyG~;7!-a`t+yoVeP(W#nA;cvu)Cd}k5d`Bd>&4z=z3Z&k z1dD_sTA?BV0#p?WRHgkvDJm_fPl_llMXFRlRjbmZsYD+?s)`B(MT%$zsOOw}@64U2 z*GVJwNA6~3&OPTl&wKC8ok^nHcl4vmdPPYns1ZdY5|?W-HLN^`dQG5gX+WrPIH_I$OqG-6NGD%HkP)gHD1xUgh5 zZ?cz{s@v`Pc6)LAmMsSk92i+<$|KFz+?I-PGcw;^sN3NEna3~&MLFMu(*`)MgyN&yE0>5}=;ehswJ4Om0QgITg~S_;G|+vk2E{xH0vMDa1s z9yd;?18_p(?*^`c@!*kgD0T{eJ%4O7Q)5e?l$H5FP?)l#~y>4pyDqMK$W3M}}s z{-eBcKAVHwQnX(ki-TAX9@WrJ*sLVJ6_k-*7(~;jM0_-;P`;C`+Q${ud}8pR^0HzZ z2;QN5Ua)G5-jn>Lmeh7?cWA#z zDXEK6x22v>eV~u&x9ZR8|2DQ62aF#X$ISE0w)v#_x>c|?Syk&E>(z93`m*$N`qA`J z#m*En+cVc??hrnK^P)=!sm$A5XLMcL^hF5RP5|FSDLLr{|nh4Fi!Sgc+Hu)3v;*S9>~3td!SG6 z+u64u;Xz;i^?j@Fu-or@Q?b{buGopoT^^&>oePqdm%Hu}Z~s*9R0U2|;Qyrp>wdWI zU29oHvmWj{>-VpJYWWci;u9EvmlYwWA9=V9jq zF}(rXUcCONU@~0dBZ^(SK;c^6^wSz>fu5a_H@UPQ&=qA zrr%t+J5k~qUoAXQ_^}4N8j`+P_?^mkOw|8d;ToKUi`|}|D(otL*3a|yi{;{eF9$r_Uc3*^Uq^oNJL5aSs04h){nR6+TAUnq zk$g-Og?OA-TB7;y<%Rdu!=*2k?tRSS#uOxVN1P8CBZzzLuKb}qx!|#Ihu=++k z&MPfFU3$I*>!tJum!E9A^jav2V7T;mx{m3$=qu`<(r0vTp>(YD;b519e_EoQQH>rK zXU%;-OwY$#4fBi-@A!uq;+YwP`EVq>$;AG$7r{GFj_GE13~tKy4W5_1i0KSoGWeO? 
z{k@N7bAx-bzmK;QE*+c;=LfhiWe*2<6gP{^FZ1S_JL2}UM-ZFc!83X-yA!GGYj9lX zTFLGp`c8V&i>;4qIh=BB9w+sZ(xddSIm^!2mw&P>NXy9Mp|$h0tKlmF{Iq55i`paF zbJ`!Z_fq|-b5oP4R_bmh^PW@R_V$Cl)K3I2^>XUZsbivdJXgKntECU?qxvwskwZ~xSn*`k(7jYNk7{3 zGgzN1;8#m*ST-VqXn+KnGKX1k&$6sz+HzaRI()2I{9G#{R>C+TVx}zdNMDHPprz9U zaxJf-;{bg`+Sm7Xp*ER-;jT`al`-{9M$|R~tvi#U)PW}q=z92aapV^0_?r;QSDknC zE4wvE&hgE-VYjbd@^;#Uf$CyvsQY z+RYtO!iKr?(o+g@?o!a++#Nv|8#AchT1zjvhuG1KzxKq!0SeZhM$QRh)nZA=Ajv|9 zDF+^ zs*fRHlbj0^Avp?B)YXF8LW3J5N6-cV9j&#wmRx1L?;ejWCz`Mz;90|-p~sOfgo6Q5 z!Y16!DRJ(N`0RvpeAUKc6LkxVG@geJsQ7Rng}>|sG8``?0QUpXgv6sEV10jBGECBt z$(|Ezy&bmf2{t+A0PRlXfJiWK@C6-qWzRLA#8YRV47;nzLsUqMO#r+-(VC_;j-v`e zxI-2jF0%{@Nmn7KV*xo)0YRG43SPtEJvrzTu!K?R6q>k% zbQo)GXE*A#>0w@y2Fy909bP1o+n39LVnT2oFe5YwO=jJ=ycB{(b1CBZyAo1pV9YKZ zNfD+KplpG!Vwtr@nyodDUr)zl&U1_KNIV^HX+5Hd4z7CXCKZ9=?A4>d=PKRJq8_k# z$E5qWe9seJk7JiAw)k{Emi7U{q;r*TZH#tANJPc!0r975=-A^elsD;>6Vzk3is*UO z2bwPis4#o(P%XXA0#MHAFg+FyMnn%pY188vSncWSkHAs%3!?h;%!C}-1c6J~ObJ(} zph?tGRUk?F15efpcC|mQJfNU^Ghh*#niP3YCr4KYcxT^N9kFRla-$w7c7rbJ=m(h_AsCcu36CniP$Nce-FMC4jyjQHIT>*gk6!M~b!OaQ@(iXj8V1i~++$!gUL z61)u%fDB6~rX|lx;P9lxdVs4~VVvcgJZv_JtSAzSL%{eIX9P0<6eG~RKq2kjWx4xA zl<{rjdS4TK+6}rI6aZ@BzwySFz|+Q!`g-It2LF$uTwj`LHtH**L*u*0_FTVs z>!(`f#rf(&b!NQ2yaa%movmtl-`M!>L(`)}6T9Lt6T1*I-fXm6&HC7$IBN`ZXd3XB z+|sT#W@F(DLg?$RxhyUO5MvCxt8=wxqg z%EVNx9T4*rVY0xgy0TV*%5I1JklH>yx^ZJ=zFcd-wcc#*yLi4@p8a&WHA~A1R~;-W z7@JK)69FLbu zfR*4BaYyb*&lDGc? 
DI4MN* diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index 2dabd5f2e890..56a4801bea29 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -145,9 +145,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind { UnicodeSet fHangulSet; UnicodeSet fZWJSet; UnicodeSet fExtendedPictSet; - UnicodeSet fViramaSet; - UnicodeSet fLinkingConsonantSet; - UnicodeSet fExtCccZwjSet; + UnicodeSet fInCBLinkerSet; + UnicodeSet fInCBConsonantSet; + UnicodeSet fInCBExtendSet; UnicodeSet fAnySet; @@ -176,11 +176,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind { fHangulSet.addAll(fLVTSet); fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]"); - fViramaSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" - + "\\p{Indic_Syllabic_Category=Virama}]"); - fLinkingConsonantSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" - + "\\p{Indic_Syllabic_Category=Consonant}]"); - fExtCccZwjSet = new UnicodeSet("[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]"); + fInCBLinkerSet = new UnicodeSet("[\\p{InCB=Linker}]"); + fInCBConsonantSet = new UnicodeSet("[\\p{InCB=Consonant}]"); + fInCBExtendSet = new UnicodeSet("[\\p{InCB=Extend}]"); fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]"); @@ -196,9 +194,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind { fSets.add(fAnySet); fClassNames.add("Any"); fSets.add(fZWJSet); fClassNames.add("ZWJ"); fSets.add(fExtendedPictSet); fClassNames.add("ExtendedPict"); - fSets.add(fViramaSet); fClassNames.add("Virama"); - fSets.add(fLinkingConsonantSet); fClassNames.add("LinkingConsonant"); - fSets.add(fExtCccZwjSet); fClassNames.add("ExtCccZwj"); + fSets.add(fInCBLinkerSet); fClassNames.add("InCB=Linker"); + 
fSets.add(fInCBConsonantSet); fClassNames.add("InCB=Consonant"); + fSets.add(fInCBExtendSet); fClassNames.add("InCB=Extend"); } @@ -315,17 +313,18 @@ int next(int prevPos) { } // Note: Viramas are also included in the ExtCccZwj class. - if (fLinkingConsonantSet.contains(c2)) { + if (fInCBConsonantSet.contains(c2)) { int pi = p1; boolean sawVirama = false; - while (pi > 0 && fExtCccZwjSet.contains(fText.codePointAt(pi))) { - if (fViramaSet.contains(fText.codePointAt(pi))) { + while (pi > 0 && (fInCBExtendSet.contains(fText.codePointAt(pi)) || + fInCBLinkerSet.contains(fText.codePointAt(pi)))) { + if (fInCBLinkerSet.contains(fText.codePointAt(pi))) { sawVirama = true; } pi = fText.offsetByCodePoints(pi, -1); } - if (sawVirama && fLinkingConsonantSet.contains(fText.codePointAt(pi))) { - setAppliedRule(p2, "GB 9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant"); + if (sawVirama && fInCBConsonantSet.contains(fText.codePointAt(pi))) { + setAppliedRule(p2, "GB9c \\p{InCB=Consonant} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* \\p{InCB=Linker} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* × \\p{InCB=Consonant}"); continue; } } diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt index d5776f33c206..0a811057a579 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt @@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}]; LF = [\p{Grapheme_Cluster_Break = LF}]; Control = [[\p{Grapheme_Cluster_Break = Control}]]; -Extend = [[\p{Grapheme_Cluster_Break = Extend}]]; +Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]]; ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}]; Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; @@ -38,11 +38,9 @@ LVT = 
[\p{Grapheme_Cluster_Break = LVT}]; Extended_Pict = [:ExtPict:]; # Indic Sequences -Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]]; - -LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]]; - -ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ]; +InCBLinker = [\p{InCB=Linker}]; +InCBConsonant = [\p{InCB=Consonant}]; +InCBExtend = [\p{InCB=Extend}]; GB3: CR LF; GB4: (Control | CR | LF) ÷; @@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT); GB7: (LV | V) (V | T); GB8: (LVT | T) T; -GB11: Extended_Pict Extend* ZWJ Extended_Pict; -GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant; -GB9: . (Extend | ZWJ); +GB11: Extended_Pict Extend_* ZWJ Extended_Pict; +GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant; +GB9: . (Extend_ | ZWJ); GB9a: . SpacingMark; GB9b: Prepend .; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 3eb591576ef5..1c7fe9975699 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -169,18 +169,9 @@ # #•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• • -# -# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras -# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant -# Sample Chars: LinkingConsonant: \u0915 -# Virama: \u094d [also Extend] -# ExtCccZWJ: \u0308 -# Extend but not ExtCCCZWJ \u093A - -•\u0915\u094d\u0915• -•\u0915\u0308\u0308\u094d\u0308\u0308\u0915• -•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041• -•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915• +# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31. +# This test would have caught ICU-22956. 
+•સૻ્સૻ• # # From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt