Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #125 #134

Merged
merged 6 commits into from
Jun 8, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,31 @@ on:
branches: [ master ]

env:
CARGO_INCREMENTAL: 0
CARGO_TERM_COLOR: always
RUST_BACKTRACE: 1
RUSTFLAGS: -D warnings
RUSTDOCFLAGS: -D warnings

jobs:
build:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- name: Build
run: cargo build --verbose
- name: Run tests
run: cargo test --verbose
fmt:
- name: Run clippy
run: cargo clippy --all-targets --all --verbose

fmt:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- name: Rustfmt
run: cargo fmt --check
run: cargo fmt --all --check
- name: Verify regenerated files
run: ./scripts/unicode.py && diff tables.rs src/tables.rs
- name: Verify regenerated tests
run: ./scripts/unicode_gen_breaktests.py && rustfmt testdata.rs && diff testdata.rs src/testdata.rs
run: ./scripts/unicode_gen_breaktests.py && rustfmt testdata.rs && diff testdata.rs tests/testdata/mod.rs
5 changes: 2 additions & 3 deletions benches/chars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
//! is how much slower full unicode handling is.

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use unicode_segmentation;

use std::fs;
use unicode_segmentation::UnicodeSegmentation;
Expand All @@ -24,14 +23,14 @@ const FILES: &[&str] = &[

#[inline(always)]
fn grapheme(text: &str) {
for c in UnicodeSegmentation::graphemes(black_box(&*text), true) {
for c in UnicodeSegmentation::graphemes(black_box(text), true) {
black_box(c);
}
}

#[inline(always)]
fn scalar(text: &str) {
for c in black_box(&*text).chars() {
for c in black_box(text).chars() {
black_box(c);
}
}
Expand Down
93 changes: 67 additions & 26 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,11 @@ def format_table_content(f, content, indent):
line = " "*indent + chunk
f.write(line)

def load_properties(f, interestingprops):
def load_properties(f, interestingprops: "list[str | tuple[str, str]] | None" = None):
fetch(f)
props = {}
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
re1 = re.compile(r"^\s*([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
re2 = re.compile(r"^\s*([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")

for line in fileinput.input(os.path.basename(f)):
prop = None
Expand All @@ -168,17 +168,21 @@ def load_properties(f, interestingprops):
m = re1.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(1)
d_hi = d_lo
prop = m.group(2)
value = m.group(3)
else:
m = re2.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(2)
prop = m.group(3)
value = m.group(4)
else:
continue
if interestingprops and prop not in interestingprops:
if value is not None:
prop = (prop, value)
if interestingprops is not None and prop not in interestingprops:
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
Expand All @@ -195,7 +199,7 @@ def load_properties(f, interestingprops):
def escape_char(c):
return "'\\u{%x}'" % c

def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
pub_string = "const"
if not is_const:
Expand All @@ -217,7 +221,7 @@ def emit_util_mod(f):
f.write("""
pub mod util {
#[inline]
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
use core::cmp::Ordering::{Equal, Less, Greater};
r.binary_search_by(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
Expand Down Expand Up @@ -252,13 +256,22 @@ def emit_util_mod(f):

""")

def emit_property_module(f, mod, tbl, emit):
f.write("mod %s {\n" % mod)
for cat in sorted(emit):
emit_table(f, "%s_table" % cat, tbl[cat], is_pub=False)
def emit_property_module(f, mod, tbl, emit: "list[str | tuple[str, str]]"):
f.write("pub mod %s {\n" % mod)

cats = []
for cat in emit:
if type(cat) is tuple:
cats.append((f"{cat[0]}_{cat[1]}", cat))
else:
cats.append((cat, cat))
cats.sort(key=lambda x: x[0])

for cat_str, cat in cats:
emit_table(f, "%s_table" % cat_str, tbl[cat], is_pub=False)
f.write(" #[inline]\n")
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" pub fn %s(c: char) -> bool {\n" % cat_str)
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat_str)
f.write(" }\n\n")
f.write("}\n\n")

Expand Down Expand Up @@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }

fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
Expand Down Expand Up @@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):
else:
lookup_type = "u32"

emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&[%s]" % lookup_type,
pfun=lambda x: "%d" % x,
is_pub=False, is_const=True)

emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
emit_table(f, "%s_cat_table" % name, break_table, "&[(char, char, %sCat)]" % Name,
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
is_pub=False, is_const=True)
f.write("}\n")
Expand All @@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):

# download and parse all the data
gencats = load_gencats("UnicodeData.txt")
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])

emit_util_mod(rf)
for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
("derived_property", derived, ["Alphabetic"]):
("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]):
emit_property_module(rf, name, cat, pfuns)

rf.write("""pub fn is_incb_linker(c: char) -> bool {
matches!(c,""")

for (lo, hi) in derived[("InCB", "Linker")]:
rf.write(f" | '\\u{{{lo:X}}}'")
if lo != hi:
rf.write(f"..'\\u{{{lo:X}}}'")

rf.write(")\n}\n\n")

### grapheme cluster module
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])

grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt")
# Control
# Note:
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
Expand All @@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):
grapheme_cats["Control"] = group_cat(list(
set(ungroup_cat(grapheme_cats["Control"]))
- set(ungroup_cat([surrogate_codepoints]))))

grapheme_cats["InCB_Consonant"] = derived[("InCB", "Consonant")]
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
grapheme_cats["Extended_Pictographic"] = emoji_props["Extended_Pictographic"]
grapheme_table = []
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
grapheme_table.sort(key=lambda w: w[0])
last = -1
for chars in grapheme_table:
if chars[0] <= last:
raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
last = chars[1]
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
rf.write("\n")

word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
word_cats = load_properties("auxiliary/WordBreakProperty.txt")
word_table = []
for cat in word_cats:
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
Expand All @@ -425,9 +447,28 @@ def emit_break_module(f, break_table, break_cats, name):
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")

sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt")
sentence_table = []
for cat in sentence_cats:
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
sentence_table.sort(key=lambda w: w[0])
emit_break_module(rf, sentence_table, list(sentence_cats.keys()), "sentence")

rf.write("""
Jules-Bertholet marked this conversation as resolved.
Show resolved Hide resolved
#[cfg(test)]
mod tests {
#[test]
fn test_syriac_abbr_mark() {
use crate::tables::word as wd;
let (_, _, cat) = wd::word_category('\\u{70f}');
assert_eq!(cat, wd::WC_ALetter);
}

#[test]
fn test_end_of_ayah_cat() {
use crate::tables::word as wd;
let (_, _, cat) = wd::word_category('\\u{6dd}');
assert_eq!(cat, wd::WC_Numeric);
}
}
""")
12 changes: 6 additions & 6 deletions scripts/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ def showfun(x):
return outstr

def create_grapheme_data(f):
# rules 9.1 and 9.2 are for extended graphemes only
optsplits = ['9.1','9.2']
# rules 9.1, 9.2, and 9.3 are for extended graphemes only
optsplits = ['9.1', '9.2', '9.3']
d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)

test_same = []
Expand Down Expand Up @@ -169,8 +169,8 @@ def create_grapheme_data(f):
else:
test_diff.append((allchars, extgraphs, c))

stype = "&'static [(&'static str, &'static [&'static str])]"
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
stype = "&[(&str, &[&str])]"
dtype = "&[(&str, &[&str], &[&str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
Expand All @@ -185,7 +185,7 @@ def create_words_data(f):
allchars = [cn for s in c for cn in s]
test.append((allchars, c))

wtype = "&'static [(&'static str, &'static [&'static str])]"
wtype = "&[(&str, &[&str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
Expand All @@ -199,7 +199,7 @@ def create_sentence_data(f):
allchars = [cn for s in c for cn in s]
test.append((allchars, c))

wtype = "&'static [(&'static str, &'static [&'static str])]"
wtype = "&[(&str, &[&str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
Expand Down
Loading
Loading