Skip to content

Commit

Permalink
MAINT Fixes for Python scripts (#54)
Browse files Browse the repository at this point in the history
* Fixes to Python scripts

* Update src/testdata.rs
  • Loading branch information
rth authored and Manishearth committed May 15, 2019
1 parent c7a6b6f commit 7cb6dca
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 23 deletions.
18 changes: 9 additions & 9 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.

import fileinput, re, os, sys, operator
import fileinput, re, os, sys

preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
Expand Down Expand Up @@ -59,7 +59,7 @@ def is_surrogate(n):

def fetch(f):
if not os.path.exists(os.path.basename(f)):
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
os.system("curl -O http://www.unicode.org/Public/9.0.0/ucd/%s"
% f)

if not os.path.exists(os.path.basename(f)):
Expand All @@ -80,7 +80,7 @@ def load_gencats(f):
if is_surrogate(cp):
continue
if range_start >= 0:
for i in xrange(range_start, cp):
for i in range(range_start, cp):
udict[i] = data;
range_start = -1;
if data[1].endswith(", First>"):
Expand Down Expand Up @@ -150,8 +150,8 @@ def format_table_content(f, content, indent):
def load_properties(f, interestingprops):
fetch(f)
props = {}
re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

for line in fileinput.input(os.path.basename(f)):
prop = None
Expand Down Expand Up @@ -309,7 +309,7 @@ def emit_break_module(f, break_table, break_cats, name):
# download and parse all the data
fetch("ReadMe.txt")
with open("ReadMe.txt") as readme:
pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
unicode_version = re.search(pattern, readme.read()).groups()
rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
Expand Down Expand Up @@ -342,19 +342,19 @@ def emit_break_module(f, break_table, break_cats, name):
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
grapheme_table.sort(key=lambda w: w[0])
emit_break_module(rf, grapheme_table, grapheme_cats.keys(), "grapheme")
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
rf.write("\n")

word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
word_table = []
for cat in word_cats:
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, word_cats.keys(), "word")
emit_break_module(rf, word_table, list(word_cats.keys()), "word")

sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
for cat in sentence_cats:
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
sentence_table.sort(key=lambda w: w[0])
emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
emit_break_module(rf, sentence_table, list(sentence_cats.keys()), "sentence")
22 changes: 11 additions & 11 deletions scripts/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,23 @@
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the testdata.rs file into git.
from __future__ import print_function

import unicode, re, os, fileinput

def load_test_data(f, optsplit=[]):
outls = []
testRe1 = re.compile("^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

unicode.fetch(f)
data = []
for line in fileinput.input(os.path.basename(f)):
# lines that include a test start with the ÷ character
if len(line) < 2 or line[0:2] != '÷':
if len(line) < 2 or not line.startswith('÷'):
continue

m = testRe1.match(line)
if not m:
print "error: no match on line where test was expected: %s" % line
print("error: no match on line where test was expected: %s" % line)
continue

# process the characters in this test case
Expand All @@ -48,9 +48,9 @@ def load_test_data(f, optsplit=[]):
# make sure that we have break info for each break!
assert len(chars) - 1 == len(info)

outls.append((chars, info))
data.append((chars, info))

return outls
return data

def process_split_info(s, c, o):
outcs = []
Expand All @@ -59,7 +59,7 @@ def process_split_info(s, c, o):

# are we on a × or a ÷?
isX = False
if s[0:2] == '×':
if s.startswith('×'):
isX = True

# find each instance of '(÷|×) [x.y] '
Expand All @@ -81,10 +81,10 @@ def process_split_info(s, c, o):

idx = 1
while idx < len(s):
if s[idx:idx+2] == '×':
if s[idx:].startswith('×'):
isX = True
break
if s[idx:idx+2] == '÷':
if s[idx:].startswith('÷'):
isX = False
break
idx += 1
Expand Down Expand Up @@ -172,7 +172,7 @@ def create_grapheme_data(f):
stype = "&'static [(&'static str, &'static [&'static str])]"
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)

Expand All @@ -187,7 +187,7 @@ def create_words_data(f):

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt\n")
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
Expand Down
6 changes: 3 additions & 3 deletions src/testdata.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
Expand All @@ -12,7 +12,7 @@

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
// official Unicode test data
// http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt
// http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt
pub const TEST_SAME: &'static [(&'static str, &'static [&'static str])] = &[
("\u{20}\u{20}", &["\u{20}", "\u{20}"]), ("\u{20}\u{308}\u{20}", &["\u{20}\u{308}",
"\u{20}"]), ("\u{20}\u{d}", &["\u{20}", "\u{d}"]), ("\u{20}\u{308}\u{d}", &["\u{20}\u{308}",
Expand Down Expand Up @@ -516,7 +516,7 @@
];

// official Unicode test data
// http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
// http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt
pub const TEST_WORD: &'static [(&'static str, &'static [&'static str])] = &[
("\u{1}\u{1}", &["\u{1}", "\u{1}"]), ("\u{1}\u{308}\u{1}", &["\u{1}\u{308}", "\u{1}"]),
("\u{1}\u{d}", &["\u{1}", "\u{d}"]), ("\u{1}\u{308}\u{d}", &["\u{1}\u{308}", "\u{d}"]),
Expand Down

0 comments on commit 7cb6dca

Please sign in to comment.