Skip to content

Commit

Permalink
Support Khmer
Browse files Browse the repository at this point in the history
- Support nonspacing coeng signs
- Assign width 2 to KHMER INDEPENDENT VOWEL QAA
  and 3 to KHMER SIGN BEYYAL
  (https://unicode.org/charts/nameslist/n_1780.html)
  • Loading branch information
Jules-Bertholet committed Jun 6, 2024
1 parent 060cbbb commit 84455bb
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 21 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ fn main() {
```

**NOTE:** The computed width values may not match the actual rendered column
width. For example, many Brahmic scripts like Devanagari have complex rendering rules
which this crate does not currently handle (and will never fully handle, because
the exact rendering depends on the font):

Expand Down
51 changes: 40 additions & 11 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ class WidthState(enum.IntEnum):
WIDE = 0x1_0002
"Two columns wide."

THREE = 0x1_0003
"Three columns wide."

# \r\n
LINE_FEED = 0b0000_0000_0000_0001
"\\n (CRLF has width 1)"
Expand Down Expand Up @@ -324,6 +327,11 @@ class WidthState(enum.IntEnum):
ZWJ_OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1100_0000_0110
"\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)"

# Khmer coeng signs

KHMER_COENG_ELIGIBLE_LETTER = 0b0011_1100_0000_0111
"\\u1780..=\\u17A2 | \\u17A7 | \\u17AB | \\u17AC | \\u17AF"

def table_width(self) -> CharWidthInTable:
"The width of a character as stored in the lookup tables."
match self:
Expand All @@ -336,6 +344,10 @@ def table_width(self) -> CharWidthInTable:
case _:
return CharWidthInTable.SPECIAL

def is_carried(self) -> bool:
    """Report whether this state corresponds to a non-default `WidthInfo`.

    A state is carried exactly when its integer value is at most 0xFFFF.
    Larger values, such as WIDE (0x1_0002) and THREE (0x1_0003), are not
    carried.
    """
    largest_carried_value = 0xFFFF
    return not (int(self) > largest_carried_value)

def width_alone(self) -> int:
"The width of a character with this type when it appears alone."
match self:
Expand All @@ -352,6 +364,8 @@ def width_alone(self) -> int:
| WidthState.EMOJI_PRESENTATION
):
return 2
case WidthState.THREE:
return 3
case _:
return 1

Expand Down Expand Up @@ -591,6 +605,18 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
([0x0A], WidthState.LINE_FEED),
([0x05DC], WidthState.HEBREW_LETTER_LAMED),
(alef_joining, WidthState.JOINING_GROUP_ALEF),
(range(0x1780, 0x1783), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
(range(0x1784, 0x1788), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
(range(0x1789, 0x178D), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
(range(0x178E, 0x1794), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
(range(0x1795, 0x1799), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
(range(0x179B, 0x179E), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
(
[0x17A0, 0x17A2, 0x17A7, 0x17AB, 0x17AC, 0x17AF],
WidthState.KHMER_COENG_ELIGIBLE_LETTER,
),
([0x17A4], WidthState.WIDE),
([0x17D8], WidthState.THREE),
([0x1A10], WidthState.BUGINESE_LETTER_YA),
(range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT),
([0x2D6F], WidthState.TIFINAGH_CONSONANT),
Expand Down Expand Up @@ -1189,7 +1215,11 @@ def lookup_fns(
s += f" '\\u{{{lo:X}}}'"
if hi != lo:
s += f"..='\\u{{{hi:X}}}'"
s += f" => ({width.width_alone()}, WidthInfo::{str(width.name)}),\n"
if width.is_carried():
width_info = width.name
else:
width_info = "DEFAULT"
s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n"

s += f""" _ => (2, WidthInfo::EMOJI_PRESENTATION),
}}
Expand Down Expand Up @@ -1323,6 +1353,11 @@ def lookup_fns(
return (0, WidthInfo::DEFAULT);
}
// Khmer coeng signs
(WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\\u{17D2}') => {
return (-1, WidthInfo::DEFAULT);
}
// Buginese <a, -i> ZWJ ya ligature
(WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\\u{1A17}') => {
return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
Expand Down Expand Up @@ -1519,7 +1554,7 @@ def emit_module(
)

for variant in WidthState:
if variant.table_width() == CharWidthInTable.SPECIAL:
if variant.is_carried():
if variant.is_cjk_only():
module.write(' #[cfg(feature = "cjk")]\n')
module.write(
Expand Down Expand Up @@ -1913,7 +1948,7 @@ def emit_module(
test_width_variants = []
test_width_variants_cjk = []
for variant in WidthState:
if variant.table_width() == CharWidthInTable.SPECIAL:
if variant.is_carried():
if not variant.is_cjk_only():
test_width_variants.append(variant)
if not variant.is_non_cjk_only():
Expand Down Expand Up @@ -1991,10 +2026,7 @@ def emit_module(
)

for variant in WidthState:
if (
variant.table_width() == CharWidthInTable.SPECIAL
and not variant.is_cjk_only()
):
if variant.is_carried() and not variant.is_cjk_only():
module.write(f" WidthInfo::{variant.name},\n")

module.write(
Expand All @@ -2006,10 +2038,7 @@ def emit_module(
)

for variant in WidthState:
if (
variant.table_width() == CharWidthInTable.SPECIAL
and not variant.is_non_cjk_only()
):
if variant.is_carried() and not variant.is_non_cjk_only():
module.write(f" WidthInfo::{variant.name},\n")

module.write(
Expand Down
16 changes: 11 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@
//! with [`Joining_Group`]`=Alef`, has total width 1. For example: `لا`‎, `لآ`‎, `ڸا`‎, `لٟٞأ`
//! - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` (<a, -i> ya, `ᨕᨗ‍ᨐ`) has total width 1.
//! - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1.
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
//! have width 0.
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
//! - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
Expand All @@ -85,8 +88,10 @@
//! - Is a [default-ignorable][`Default_Ignorable_Code_Point`] [combining mark][combining marks].
//! 2. In all other cases, the width of the string equals the sum of its character widths:
//! 1. [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER] has width 1 (outside of the ligatures described previously).
//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) and
//! [`'\u{17A4}'` KHMER INDEPENDENT VOWEL QAA](https://util.unicode.org/UnicodeJsps/character.jsp?a=17A4) have width 2.
//! 3. [`'\u{17D8}'` KHMER SIGN BEYYAL](https://util.unicode.org/UnicodeJsps/character.jsp?a=17D8) has width 3.
//! 4. The following have width 0:
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
//! with the [`Default_Ignorable_Code_Point`] property.
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
Expand All @@ -109,15 +114,15 @@
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
//! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
//! 7. All other characters have width 1.
//!
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
//! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F
Expand Down Expand Up @@ -150,6 +155,7 @@
//! [Arabic]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G7480
//! [Buginese]: https://www.unicode.org/versions/Unicode15.0.0/ch17.pdf#G26743
//! [Hebrew]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G6528
//! [Khmer]: https://www.unicode.org/versions/Unicode15.0.0/ch16.pdf#G64642
//! [Lisu]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G44587
//! [Old Turkic]: https://www.unicode.org/versions/Unicode15.0.0/ch14.pdf#G41975
//! [Tifinagh]: http://www.unicode.org/versions/Unicode15.0.0/ch19.pdf#G43184
Expand Down
25 changes: 21 additions & 4 deletions src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ impl WidthInfo {
const LISU_TONE_LETTER_MYA_NA_JEU: Self = Self(0b0011110000000101);
const OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b0011100000000110);
const ZWJ_OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b0011110000000110);
const KHMER_COENG_ELIGIBLE_LETTER: Self = Self(0b0011110000000111);

/// Whether this width mode is ligature_transparent
/// (has 5th MSB set.)
Expand Down Expand Up @@ -159,6 +160,8 @@ fn lookup_width(c: char) -> (u8, WidthInfo) {
'\u{A}' => (1, WidthInfo::LINE_FEED),
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
'\u{17D8}' => (3, WidthInfo::DEFAULT),
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
Expand Down Expand Up @@ -255,6 +258,11 @@ fn width_in_str(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {
return (0, WidthInfo::DEFAULT);
}

// Khmer coeng signs
(WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\u{17D2}') => {
return (-1, WidthInfo::DEFAULT);
}

// Buginese <a, -i> ZWJ ya ligature
(WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\u{1A17}') => {
return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
Expand Down Expand Up @@ -436,6 +444,8 @@ fn lookup_width_cjk(c: char) -> (u8, WidthInfo) {
'\u{338}' => (0, WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY),
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
'\u{17D8}' => (3, WidthInfo::DEFAULT),
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
Expand Down Expand Up @@ -539,6 +549,11 @@ fn width_in_str_cjk(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {
return (0, WidthInfo::DEFAULT);
}

// Khmer coeng signs
(WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\u{17D2}') => {
return (-1, WidthInfo::DEFAULT);
}

// Buginese <a, -i> ZWJ ya ligature
(WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\u{1A17}') => {
return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
Expand Down Expand Up @@ -1206,8 +1221,8 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
0x55, 0x55,
],
[
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x10, 0x00,
0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x7F, 0xFF, 0xFD, 0xF7, 0xFF, 0xFD, 0xD7, 0x5F, 0x77, 0xD6, 0xD5, 0xD7, 0x55, 0x10, 0x00,
0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x57, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
[
Expand Down Expand Up @@ -2575,7 +2590,7 @@ mod tests {
}
}

static NORMALIZATION_TEST_WIDTHS: [WidthInfo; 37] = [
static NORMALIZATION_TEST_WIDTHS: [WidthInfo; 38] = [
WidthInfo::DEFAULT,
WidthInfo::LINE_FEED,
WidthInfo::EMOJI_MODIFIER,
Expand Down Expand Up @@ -2613,10 +2628,11 @@ mod tests {
WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU,
WidthInfo::OLD_TURKIC_LETTER_ORKHON_I,
WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I,
WidthInfo::KHMER_COENG_ELIGIBLE_LETTER,
];

#[cfg(feature = "cjk")]
static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; 38] = [
static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; 39] = [
WidthInfo::DEFAULT,
WidthInfo::LINE_FEED,
WidthInfo::EMOJI_MODIFIER,
Expand Down Expand Up @@ -2655,6 +2671,7 @@ mod tests {
WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU,
WidthInfo::OLD_TURKIC_LETTER_ORKHON_I,
WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I,
WidthInfo::KHMER_COENG_ELIGIBLE_LETTER,
];

#[rustfmt::skip]
Expand Down
40 changes: 40 additions & 0 deletions tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,46 @@ fn test_old_turkic_ligature() {
assert_width!("\u{200D}\u{10C32}", 1, 1);
}

/// Checks the width of Khmer coeng sequences.
///
/// `'\u{17D2}'` followed by an eligible letter must measure zero columns,
/// even with `'\u{200D}'` characters in between. Followed by any other
/// character, the pair must measure the same as that character alone
/// (falling back to 1 when the character reports no width).
#[test]
fn test_khmer_coeng() {
    // Two letters side by side take two columns; joined by U+17D2 they take one.
    assert_width!("ល", 1, 1);
    assert_width!("ង", 1, 1);
    assert_width!("លង", 2, 2);
    assert_width!("ល្ង", 1, 1);

    // Exhaustively walk every `char`.
    for c in '\0'..=char::MAX {
        let coeng_eligible = matches!(
            c,
            '\u{1780}'..='\u{1782}'
                | '\u{1784}'..='\u{1787}'
                | '\u{1789}'..='\u{178C}'
                | '\u{178E}'..='\u{1793}'
                | '\u{1795}'..='\u{1798}'
                | '\u{179B}'..='\u{179D}'
                | '\u{17A0}'
                | '\u{17A2}'
                | '\u{17A7}'
                | '\u{17AB}'..='\u{17AC}'
                | '\u{17AF}'
        );

        if !coeng_eligible {
            // Not a coeng pair: the sequence is as wide as `c` on its own.
            assert_width!(
                format!("\u{17D2}{c}"),
                c.width().unwrap_or(1),
                c.width_cjk().unwrap_or(1)
            );
            continue;
        }

        // Coeng pair, with and without intervening U+200D characters.
        assert_width!(format!("\u{17D2}{c}"), 0, 0);
        assert_width!(format!("\u{17D2}\u{200D}\u{200D}{c}"), 0, 0);
    }
}

/// U+17A4 KHMER INDEPENDENT VOWEL QAA must measure two columns, the same as
/// the two-character spelling recommended in its place.
#[test]
fn test_khmer_qaa() {
    assert_width!("\u{17A4}", 2, 2);
    // Preferred replacement spelling per the Unicode names list:
    // U+17A2 KHMER LETTER QA followed by U+17B6 KHMER VOWEL SIGN AA.
    // (The earlier U+17A6 was INDEPENDENT VOWEL QII, not part of that spelling.)
    assert_width!("\u{17A2}\u{17B6}", 2, 2);
}

/// U+17D8 KHMER SIGN BEYYAL must measure three columns, the same as the
/// three-character sequence U+17D4 U+179B U+17D4.
#[test]
fn test_khmer_sign_beyyal() {
    let beyyal = "\u{17D8}";
    let spelled_out = "\u{17D4}\u{179B}\u{17D4}";

    assert_width!(beyyal, 3, 3);
    assert_width!(spelled_out, 3, 3);
}

#[test]
fn test_emoji_modifier() {
assert_width!("\u{1F46A}", 2, 2);
Expand Down

0 comments on commit 84455bb

Please sign in to comment.