Support Khmer

- Support nonspacing coeng signs
- Assign width 2 to KHMER INDEPENDENT VOWEL QAA and width 3 to KHMER SIGN BEYYAL (https://unicode.org/charts/nameslist/n_1780.html)
unicode-rs · Jun 6, 2024 · 84455bb · 84455bb
1 parent 060cbbb
commit 84455bb
Show file tree

Hide file tree

Showing 5 changed files with 113 additions and 21 deletions.
diff --git a/README.md b/README.md
@@ -25,7 +25,7 @@ fn main() {
 ```
 
 **NOTE:** The computed width values may not match the actual rendered column
-width. For example, Brahmic scripts like Devanagari have complex rendering rules
+width. For example, many Brahmic scripts like Devanagari have complex rendering rules
 which this crate does not currently handle (and will never fully handle, because
 the exact rendering depends on the font):
 

diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -184,6 +184,9 @@ class WidthState(enum.IntEnum):
     WIDE = 0x1_0002
     "Two columns wide."
 
+    THREE = 0x1_0003
+    "Three columns wide."
+
     # \r\n
     LINE_FEED = 0b0000_0000_0000_0001
     "\\n (CRLF has width 1)"
@@ -324,6 +327,11 @@ class WidthState(enum.IntEnum):
     ZWJ_OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1100_0000_0110
     "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)"
 
+    # Khmer coeng signs
+
+    KHMER_COENG_ELIGIBLE_LETTER = 0b0011_1100_0000_0111
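+    "\\u1780..=\\u1782 | \\u1784..=\\u1787 | \\u1789..=\\u178C | \\u178E..=\\u1793 | \\u1795..=\\u1798 | \\u179B..=\\u179D | \\u17A0 | \\u17A2 | \\u17A7 | \\u17AB | \\u17AC | \\u17AF"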
+
     def table_width(self) -> CharWidthInTable:
         "The width of a character as stored in the lookup tables."
         match self:
@@ -336,6 +344,10 @@ def table_width(self) -> CharWidthInTable:
             case _:
                 return CharWidthInTable.SPECIAL
 
+    def is_carried(self) -> bool:
+        "Whether this corresponds to a non-default `WidthInfo`."
+        return int(self) <= 0xFFFF  # plain-width members such as WIDE (0x1_0002) and THREE (0x1_0003) sit above 0xFFFF, so they are not carried
+
     def width_alone(self) -> int:
         "The width of a character with this type when it appears alone."
         match self:
@@ -352,6 +364,8 @@ def width_alone(self) -> int:
                 | WidthState.EMOJI_PRESENTATION
             ):
                 return 2
+            case WidthState.THREE:
+                return 3
             case _:
                 return 1
 
@@ -591,6 +605,18 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
         ([0x0A], WidthState.LINE_FEED),
         ([0x05DC], WidthState.HEBREW_LETTER_LAMED),
         (alef_joining, WidthState.JOINING_GROUP_ALEF),
+        (range(0x1780, 0x1783), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (range(0x1784, 0x1788), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (range(0x1789, 0x178D), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (range(0x178E, 0x1794), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (range(0x1795, 0x1799), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (range(0x179B, 0x179E), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (
+            [0x17A0, 0x17A2, 0x17A7, 0x17AB, 0x17AC, 0x17AF],
+            WidthState.KHMER_COENG_ELIGIBLE_LETTER,
+        ),
+        ([0x17A4], WidthState.WIDE),
+        ([0x17D8], WidthState.THREE),
         ([0x1A10], WidthState.BUGINESE_LETTER_YA),
         (range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT),
         ([0x2D6F], WidthState.TIFINAGH_CONSONANT),
@@ -1189,7 +1215,11 @@ def lookup_fns(
         s += f"            '\\u{{{lo:X}}}'"
         if hi != lo:
             s += f"..='\\u{{{hi:X}}}'"
-        s += f" => ({width.width_alone()}, WidthInfo::{str(width.name)}),\n"
+        if width.is_carried():
+            width_info = width.name
+        else:
+            width_info = "DEFAULT"
+        s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n"
 
     s += f"""            _ => (2, WidthInfo::EMOJI_PRESENTATION),
         }}
@@ -1323,6 +1353,11 @@ def lookup_fns(
                     return (0, WidthInfo::DEFAULT);
                 }
 
+                // Khmer coeng signs
+                (WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\\u{17D2}') => {
+                    return (-1, WidthInfo::DEFAULT);
+                }
+
                 // Buginese <a, -i> ZWJ ya ligature
                 (WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\\u{1A17}') => {
                     return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
@@ -1519,7 +1554,7 @@ def emit_module(
         )
 
         for variant in WidthState:
-            if variant.table_width() == CharWidthInTable.SPECIAL:
+            if variant.is_carried():
                 if variant.is_cjk_only():
                     module.write('    #[cfg(feature = "cjk")]\n')
                 module.write(
@@ -1913,7 +1948,7 @@ def emit_module(
         test_width_variants = []
         test_width_variants_cjk = []
         for variant in WidthState:
-            if variant.table_width() == CharWidthInTable.SPECIAL:
+            if variant.is_carried():
                 if not variant.is_cjk_only():
                     test_width_variants.append(variant)
                 if not variant.is_non_cjk_only():
@@ -1991,10 +2026,7 @@ def emit_module(
         )
 
         for variant in WidthState:
-            if (
-                variant.table_width() == CharWidthInTable.SPECIAL
-                and not variant.is_cjk_only()
-            ):
+            if variant.is_carried() and not variant.is_cjk_only():
                 module.write(f"        WidthInfo::{variant.name},\n")
 
         module.write(
@@ -2006,10 +2038,7 @@ def emit_module(
         )
 
         for variant in WidthState:
-            if (
-                variant.table_width() == CharWidthInTable.SPECIAL
-                and not variant.is_non_cjk_only()
-            ):
+            if variant.is_carried() and not variant.is_non_cjk_only():
                 module.write(f"        WidthInfo::{variant.name},\n")
 
         module.write(

diff --git a/src/lib.rs b/src/lib.rs
@@ -72,6 +72,9 @@
 //!        with [`Joining_Group`]`=Alef`, has total width 1. For example: `لا`‎, `لآ`‎, `ڸا`‎, `لٟٞأ`
 //!      - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` (<a, -i> ya, `ᨕᨗ‍ᨐ`) has total width 1.
 //!      - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1.
+//!      - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
+//!        `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
+//!        have width 0.
 //!      - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
 //!        followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
 //!      - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
@@ -85,8 +88,10 @@
 //!      - Is a [default-ignorable][`Default_Ignorable_Code_Point`] [combining mark][combining marks].
 //! 2. In all other cases, the width of the string equals the sum of its character widths:
 //!    1. [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER] has width 1 (outside of the ligatures described previously).
-//!    2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
-//!    3. The following have width 0:
+//!    2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) and
+//!       [`'\u{17A4}'` KHMER INDEPENDENT VOWEL QAA](https://util.unicode.org/UnicodeJsps/character.jsp?a=17A4) have width 2.
+//!    3. [`'\u{17D8}'` KHMER SIGN BEYYAL](https://util.unicode.org/UnicodeJsps/character.jsp?a=17D8) has width 3.
+//!    4. The following have width 0:
 //!       - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
 //!         with the [`Default_Ignorable_Code_Point`] property.
 //!       - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -109,15 +114,15 @@
 //!         - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
 //!         - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
 //!       - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
-//!    4. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
+//!    5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
 //!       with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
-//!    5. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
+//!    6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
 //!       - Has an [`East_Asian_Width`] of [`Ambiguous`], or
 //!         has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
 //!         is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
 //!       - Does not have a [`General_Category`] of `Modifier_Symbol`, and
 //!       - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
-//!    6. All other characters have width 1.
+//!    7. All other characters have width 1.
 //!
 //! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
 //! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F
@@ -150,6 +155,7 @@
 //! [Arabic]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G7480
 //! [Buginese]: https://www.unicode.org/versions/Unicode15.0.0/ch17.pdf#G26743
 //! [Hebrew]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G6528
+//! [Khmer]: https://www.unicode.org/versions/Unicode15.0.0/ch16.pdf#G64642
 //! [Lisu]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G44587
 //! [Old Turkic]: https://www.unicode.org/versions/Unicode15.0.0/ch14.pdf#G41975
 //! [Tifinagh]: http://www.unicode.org/versions/Unicode15.0.0/ch19.pdf#G43184

diff --git a/src/tables.rs b/src/tables.rs
@@ -58,6 +58,7 @@ impl WidthInfo {
     const LISU_TONE_LETTER_MYA_NA_JEU: Self = Self(0b0011110000000101);
     const OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b0011100000000110);
     const ZWJ_OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b0011110000000110);
+    const KHMER_COENG_ELIGIBLE_LETTER: Self = Self(0b0011110000000111);
 
     /// Whether this width mode is ligature_transparent
     /// (has 5th MSB set.)
@@ -159,6 +160,8 @@ fn lookup_width(c: char) -> (u8, WidthInfo) {
             '\u{A}' => (1, WidthInfo::LINE_FEED),
             '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
             '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
+            '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
+            '\u{17D8}' => (3, WidthInfo::DEFAULT),
             '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
             '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
             '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
@@ -255,6 +258,11 @@ fn width_in_str(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {
                     return (0, WidthInfo::DEFAULT);
                 }
 
+                // Khmer coeng signs
+                (WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\u{17D2}') => {
+                    return (-1, WidthInfo::DEFAULT);
+                }
+
                 // Buginese <a, -i> ZWJ ya ligature
                 (WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\u{1A17}') => {
                     return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
@@ -436,6 +444,8 @@ fn lookup_width_cjk(c: char) -> (u8, WidthInfo) {
             '\u{338}' => (0, WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY),
             '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
             '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
+            '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
+            '\u{17D8}' => (3, WidthInfo::DEFAULT),
             '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
             '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
             '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
@@ -539,6 +549,11 @@ fn width_in_str_cjk(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {
                     return (0, WidthInfo::DEFAULT);
                 }
 
+                // Khmer coeng signs
+                (WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\u{17D2}') => {
+                    return (-1, WidthInfo::DEFAULT);
+                }
+
                 // Buginese <a, -i> ZWJ ya ligature
                 (WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\u{1A17}') => {
                     return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
@@ -1206,8 +1221,8 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
         0x55, 0x55,
     ],
     [
-        0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x10, 0x00,
-        0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x7F, 0xFF, 0xFD, 0xF7, 0xFF, 0xFD, 0xD7, 0x5F, 0x77, 0xD6, 0xD5, 0xD7, 0x55, 0x10, 0x00,
+        0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x57, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
@@ -2575,7 +2590,7 @@ mod tests {
         }
     }
 
-    static NORMALIZATION_TEST_WIDTHS: [WidthInfo; 37] = [
+    static NORMALIZATION_TEST_WIDTHS: [WidthInfo; 38] = [
         WidthInfo::DEFAULT,
         WidthInfo::LINE_FEED,
         WidthInfo::EMOJI_MODIFIER,
@@ -2613,10 +2628,11 @@ mod tests {
         WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU,
         WidthInfo::OLD_TURKIC_LETTER_ORKHON_I,
         WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I,
+        WidthInfo::KHMER_COENG_ELIGIBLE_LETTER,
     ];
 
     #[cfg(feature = "cjk")]
-    static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; 38] = [
+    static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; 39] = [
         WidthInfo::DEFAULT,
         WidthInfo::LINE_FEED,
         WidthInfo::EMOJI_MODIFIER,
@@ -2655,6 +2671,7 @@ mod tests {
         WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU,
         WidthInfo::OLD_TURKIC_LETTER_ORKHON_I,
         WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I,
+        WidthInfo::KHMER_COENG_ELIGIBLE_LETTER,
     ];
 
     #[rustfmt::skip]

diff --git a/tests/tests.rs b/tests/tests.rs
@@ -385,6 +385,46 @@ fn test_old_turkic_ligature() {
     assert_width!("\u{200D}\u{10C32}", 1, 1);
 }
 
+#[test]
+fn test_khmer_coeng() { // U+17D2 followed by a coeng-eligible letter must contribute zero columns
+    assert_width!("ល", 1, 1); // a single letter; 2nd/3rd arguments are presumably the expected non-CJK and CJK widths
+    assert_width!("ង", 1, 1);
+    assert_width!("លង", 2, 2); // two letters with no coeng between them simply add up
+    assert_width!("ល្ង", 1, 1); // base letter followed by a coeng sign: the total stays at 1
+
+    for c in '\0'..=char::MAX { // exhaustive sweep over every `char`
+        if matches!(
+            c,
+            '\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' // the letters this test treats as coeng-eligible
+            | '\u{1789}'..='\u{178C}'  | '\u{178E}'..='\u{1793}'
+            | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}'
+            | '\u{17A0}' | '\u{17A2}'  | '\u{17A7}'
+            | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'
+        ) {
+            assert_width!(format!("\u{17D2}{c}"), 0, 0); // U+17D2 + eligible letter: zero columns in total
+            assert_width!(format!("\u{17D2}\u{200D}\u{200D}{c}"), 0, 0); // still zero with U+200D characters in between
+        } else {
+            assert_width!( // any other character keeps its own width when preceded by U+17D2
+                format!("\u{17D2}{c}"),
+                c.width().unwrap_or(1), // characters whose `width()` is `None` are expected to count as 1 inside a string
+                c.width_cjk().unwrap_or(1)
+            );
+        }
+    }
+}
+
+#[test]
+fn test_khmer_qaa() { // KHMER INDEPENDENT VOWEL QAA is assigned width 2
+    assert_width!("\u{17A4}", 2, 2); // QAA on its own
+    assert_width!("\u{17A2}\u{17A6}", 2, 2); // a two-character sequence that also totals 2 — NOTE(review): confirm U+17A6 is the intended second character against the Khmer names list
+}
+
+#[test]
+fn test_khmer_sign_beyyal() { // KHMER SIGN BEYYAL is assigned width 3
+    assert_width!("\u{17D8}", 3, 3); // BEYYAL on its own
+    assert_width!("\u{17D4}\u{179B}\u{17D4}", 3, 3); // three-character sequence totalling 3 — presumably the spelled-out form of BEYYAL; confirm against the Khmer names list
+}
+
 #[test]
 fn test_emoji_modifier() {
     assert_width!("\u{1F46A}", 2, 2);