Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #125 #134

Merged
merged 6 commits into from
Jun 8, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,31 @@ on:
branches: [ master ]

env:
CARGO_INCREMENTAL: 0
CARGO_TERM_COLOR: always
RUST_BACKTRACE: 1
RUSTFLAGS: -D warnings
RUSTDOCFLAGS: -D warnings

jobs:
build:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- name: Build
run: cargo build --verbose
- name: Run tests
run: cargo test --verbose
fmt:
- name: Run clippy
run: cargo clippy --all-targets --all --verbose

fmt:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- name: Rustfmt
run: cargo fmt --check
run: cargo fmt --all --check
- name: Verify regenerated files
run: ./scripts/unicode.py && diff tables.rs src/tables.rs
- name: Verify regenerated tests
run: ./scripts/unicode_gen_breaktests.py && rustfmt testdata.rs && diff testdata.rs src/testdata.rs
run: ./scripts/unicode_gen_breaktests.py && diff testdata.rs tests/testdata/mod.rs
5 changes: 2 additions & 3 deletions benches/chars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
//! is how much slower full unicode handling is.

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use unicode_segmentation;

use std::fs;
use unicode_segmentation::UnicodeSegmentation;
Expand All @@ -24,14 +23,14 @@ const FILES: &[&str] = &[

#[inline(always)]
fn grapheme(text: &str) {
for c in UnicodeSegmentation::graphemes(black_box(&*text), true) {
for c in UnicodeSegmentation::graphemes(black_box(text), true) {
black_box(c);
}
}

/// Benchmark body: walk the Unicode scalar values (`char`s) of `text`,
/// feeding each through `black_box`; serves as the cheap baseline the
/// grapheme benchmark is compared against.
///
/// NOTE(review): the diff artifact left both the pre-change (`&*text`)
/// and post-change (`text`) loop headers in the block; only the
/// post-change form is kept.
#[inline(always)]
fn scalar(text: &str) {
    for c in black_box(text).chars() {
        black_box(c);
    }
}
Expand Down
74 changes: 48 additions & 26 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,11 @@ def format_table_content(f, content, indent):
line = " "*indent + chunk
f.write(line)

def load_properties(f, interestingprops):
def load_properties(f, interestingprops: "list[str | tuple[str, str]] | None" = None):
fetch(f)
props = {}
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
re1 = re.compile(r"^\s*([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
re2 = re.compile(r"^\s*([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")

for line in fileinput.input(os.path.basename(f)):
prop = None
Expand All @@ -168,17 +168,21 @@ def load_properties(f, interestingprops):
m = re1.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(1)
d_hi = d_lo
prop = m.group(2)
value = m.group(3)
else:
m = re2.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(2)
prop = m.group(3)
value = m.group(4)
else:
continue
if interestingprops and prop not in interestingprops:
if value is not None:
prop = (prop, value)
if interestingprops is not None and prop not in interestingprops:
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
Expand All @@ -195,7 +199,7 @@ def load_properties(f, interestingprops):
def escape_char(c):
    """Render code point `c` as a Rust character literal, e.g. 0x41 -> '\\u{41}'."""
    return f"'\\u{{{c:x}}}'"

def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
pub_string = "const"
if not is_const:
Expand All @@ -217,7 +221,7 @@ def emit_util_mod(f):
f.write("""
pub mod util {
#[inline]
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
use core::cmp::Ordering::{Equal, Less, Greater};
r.binary_search_by(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
Expand Down Expand Up @@ -252,13 +256,22 @@ def emit_util_mod(f):

""")

def emit_property_module(f, mod, tbl, emit):
f.write("mod %s {\n" % mod)
for cat in sorted(emit):
emit_table(f, "%s_table" % cat, tbl[cat], is_pub=False)
def emit_property_module(f, mod, tbl, emit: "list[str | tuple[str, str]]"):
f.write("pub mod %s {\n" % mod)

cats = []
for cat in emit:
if type(cat) is tuple:
cats.append((f"{cat[0]}_{cat[1]}", cat))
else:
cats.append((cat, cat))
cats.sort(key=lambda x: x[0])

for cat_str, cat in cats:
emit_table(f, "%s_table" % cat_str, tbl[cat], is_pub=False)
f.write(" #[inline]\n")
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" pub fn %s(c: char) -> bool {\n" % cat_str)
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat_str)
f.write(" }\n\n")
f.write("}\n\n")

Expand Down Expand Up @@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }

fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
Expand Down Expand Up @@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):
else:
lookup_type = "u32"

emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&[%s]" % lookup_type,
pfun=lambda x: "%d" % x,
is_pub=False, is_const=True)

emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
emit_table(f, "%s_cat_table" % name, break_table, "&[(char, char, %sCat)]" % Name,
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
is_pub=False, is_const=True)
f.write("}\n")
Expand All @@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):

# download and parse all the data
gencats = load_gencats("UnicodeData.txt")
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])

emit_util_mod(rf)
for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
("derived_property", derived, ["Alphabetic"]):
("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]):
emit_property_module(rf, name, cat, pfuns)

# Emit a hand-rolled `is_incb_linker` instead of a binary-search table: the
# InCB=Linker set is tiny, so a `matches!` over the code points is simpler.
rf.write("""pub fn is_incb_linker(c: char) -> bool {
    matches!(c,""")

for (lo, hi) in derived[("InCB", "Linker")]:
    rf.write(f" | '\\u{{{lo:X}}}'")
    if lo != hi:
        # Bug fix: the range end must be `hi` (not `lo` again), and Rust
        # match patterns require the inclusive `..=` form — the (lo, hi)
        # ranges from DerivedCoreProperties are inclusive.
        rf.write(f"..='\\u{{{hi:X}}}'")

rf.write(")\n}\n\n")

### grapheme cluster module
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])

grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt")
# Control
# Note:
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
Expand All @@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):
grapheme_cats["Control"] = group_cat(list(
set(ungroup_cat(grapheme_cats["Control"]))
- set(ungroup_cat([surrogate_codepoints]))))

grapheme_cats["InCB_Consonant"] = derived[("InCB", "Consonant")]
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
grapheme_cats["Extended_Pictographic"] = emoji_props["Extended_Pictographic"]
grapheme_table = []
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
grapheme_table.sort(key=lambda w: w[0])
last = -1
for chars in grapheme_table:
if chars[0] <= last:
raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
last = chars[1]
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
rf.write("\n")

word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
word_cats = load_properties("auxiliary/WordBreakProperty.txt")
word_table = []
for cat in word_cats:
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
Expand All @@ -425,7 +447,7 @@ def emit_break_module(f, break_table, break_cats, name):
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")

sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt")
sentence_table = []
for cat in sentence_cats:
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
Expand Down
12 changes: 6 additions & 6 deletions scripts/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ def showfun(x):
return outstr

def create_grapheme_data(f):
# rules 9.1 and 9.2 are for extended graphemes only
optsplits = ['9.1','9.2']
# rules 9.1, 9.2, and 9.3 are for extended graphemes only
optsplits = ['9.1', '9.2', '9.3']
d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)

test_same = []
Expand Down Expand Up @@ -169,8 +169,8 @@ def create_grapheme_data(f):
else:
test_diff.append((allchars, extgraphs, c))

stype = "&'static [(&'static str, &'static [&'static str])]"
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
stype = "&[(&str, &[&str])]"
dtype = "&[(&str, &[&str], &[&str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
Expand All @@ -185,7 +185,7 @@ def create_words_data(f):
allchars = [cn for s in c for cn in s]
test.append((allchars, c))

wtype = "&'static [(&'static str, &'static [&'static str])]"
wtype = "&[(&str, &[&str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
Expand All @@ -199,7 +199,7 @@ def create_sentence_data(f):
allchars = [cn for s in c for cn in s]
test.append((allchars, c))

wtype = "&'static [(&'static str, &'static [&'static str])]"
wtype = "&[(&str, &[&str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
Expand Down
Loading