Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode sentence boundaries #24

Merged
merged 4 commits into from
May 15, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"
keywords = ["text", "unicode", "grapheme", "word", "boundary"]
readme = "README.md"
description = """
This crate provides Grapheme Cluster and Word boundaries
This crate provides Grapheme Cluster, Word and Sentence boundaries
according to Unicode Standard Annex #29 rules.
"""

Expand Down
7 changes: 7 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,3 +351,10 @@ def emit_break_module(f, break_table, break_cats, name):
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, word_cats.keys(), "word")

sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
for cat in sentence_cats:
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
sentence_table.sort(key=lambda w: w[0])
emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
15 changes: 15 additions & 0 deletions scripts/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,23 @@ def create_words_data(f):
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
    """Write the ``TEST_SENTENCE`` table to ``f``.

    The cases come from ``auxiliary/SentenceBreakTest.txt`` via
    ``load_test_data``. Each emitted entry pairs the flattened elements of a
    case with the case's own segment list.
    """
    cases = load_test_data("auxiliary/SentenceBreakTest.txt")

    # The second member of every loaded pair is not needed here.
    test = [([cn for s in segments for cn in s], segments)
            for (segments, _) in cases]

    wtype = "&'static [(&'static str, &'static [&'static str])]"
    f.write(" // official Unicode test data\n")
    f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n")
    unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

if __name__ == "__main__":
    # Regenerate testdata.rs: the shared preamble first, then one table per
    # break kind, in this fixed order.
    with open("testdata.rs", "w") as rf:
        rf.write(unicode.preamble)
        for emit in (create_grapheme_data, create_words_data, create_sentence_data):
            emit(rf)
15 changes: 14 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
//!
//! ```rust
Expand Down Expand Up @@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
pub use sentence::{USentenceBounds};

mod grapheme;
mod tables;
mod word;
mod sentence;

#[cfg(test)]
mod test;
Expand Down Expand Up @@ -174,6 +176,12 @@ pub trait UnicodeSegmentation {
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
tomcumming marked this conversation as resolved.
Show resolved Hide resolved
}

impl UnicodeSegmentation for str {
Expand Down Expand Up @@ -201,4 +209,9 @@ impl UnicodeSegmentation for str {
fn split_word_bound_indices(&self) -> UWordBoundIndices {
word::new_word_bound_indices(self)
}

// Sentence splitting for `str`: simply hands `self` to the constructor in
// the `sentence` module, which builds the iterator.
#[inline]
fn split_sentence_bounds(&self) -> USentenceBounds {
sentence::new_sentence_bounds(self)
}
}
302 changes: 302 additions & 0 deletions src/sentence.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,302 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use core::cmp;

// All of the logic for forward iteration over sentences
mod fwd {
use tables::sentence::SentenceCat;
use core::cmp;

/// One slot of the four-entry history kept by the forward sentence-break
/// state machine.
#[derive(Clone, Copy, PartialEq, Eq)]
enum StatePart {
    /// Start of text; every slot of the initial state holds this.
    Sot,
    /// End of text; pushed once the input has been exhausted.
    Eot,
    /// Any category that has no dedicated variant below.
    Other,
    /// A carriage return (`SC_CR`).
    CR,
    /// A line feed (`SC_LF`).
    LF,
    /// A separator (`SC_Sep`).
    Sep,
    /// An `SC_ATerm` character.
    ATerm,
    /// Either an `SC_Upper` or an `SC_Lower` character.
    UpperLower,
    /// A run of one or more `SC_Close` characters, folded into one slot.
    ClosePlus,
    /// A run of one or more `SC_Sp` characters, folded into one slot.
    SpPlus,
    /// An `SC_STerm` character.
    STerm,
}

/// Rolling window over the four most recent `StatePart`s seen by the
/// forward iterator; index 3 is the newest entry, index 0 the oldest.
#[derive(Clone, PartialEq, Eq)]
struct SentenceBreaksState(pub [StatePart; 4]);

/// State before any character has been read: all four slots are start-of-text.
const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([StatePart::Sot; 4]);

/// Forward iterator over the sentence-break positions of a string, yielded
/// as byte indices into `string`.
pub struct SentenceBreaks<'a> {
/// The text being segmented.
pub string: &'a str,
// Byte offset of the next character that has not been consumed yet.
pos: usize,
// History of the most recently consumed characters.
state: SentenceBreaksState
}

impl SentenceBreaksState {
    /// Returns the state reached after consuming a character of category `cat`.
    ///
    /// A `Close` directly after a `Close` run, or an `Sp` directly after an
    /// `Sp` run, leaves the history untouched so that each run occupies a
    /// single slot. Anything else shifts the window by one and records the
    /// new part in the newest slot.
    fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
        let history = self.0;
        match (history[3], cat) {
            (StatePart::ClosePlus, SentenceCat::SC_Close) |
            (StatePart::SpPlus, SentenceCat::SC_Sp) => SentenceBreaksState(history),
            _ => {
                let newest = match cat {
                    SentenceCat::SC_CR => StatePart::CR,
                    SentenceCat::SC_LF => StatePart::LF,
                    SentenceCat::SC_Sep => StatePart::Sep,
                    SentenceCat::SC_ATerm => StatePart::ATerm,
                    SentenceCat::SC_Upper |
                    SentenceCat::SC_Lower => StatePart::UpperLower,
                    SentenceCat::SC_Close => StatePart::ClosePlus,
                    SentenceCat::SC_Sp => StatePart::SpPlus,
                    SentenceCat::SC_STerm => StatePart::STerm,
                    _ => StatePart::Other
                };
                SentenceBreaksState([history[1], history[2], history[3], newest])
            }
        }
    }

    /// Returns the state reached at the end of the text: the window shifts
    /// by one and `Eot` becomes the newest slot.
    fn end(&self) -> SentenceBreaksState {
        let history = self.0;
        SentenceBreaksState([history[1], history[2], history[3], StatePart::Eot])
    }

    /// True when the newest slot equals `part`.
    fn match1(&self, part: StatePart) -> bool {
        self.0[3] == part
    }

    /// True when the two newest slots equal `part1` followed by `part2`.
    fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
        let history = self.0;
        history[2] == part1 && history[3] == part2
    }
}

fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
tomcumming marked this conversation as resolved.
Show resolved Hide resolved
let aterm_part = {
tomcumming marked this conversation as resolved.
Show resolved Hide resolved
// ATerm Close* Sp*
let &SentenceBreaksState(parts) = state;
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
if parts[idx] == StatePart::ClosePlus { idx -= 1 }
parts[idx]
};

if aterm_part == StatePart::ATerm {
use tables::sentence as se;

for next_char in ahead.chars() {
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
match se::sentence_category(next_char) {
tomcumming marked this conversation as resolved.
Show resolved Hide resolved
se::SC_Lower => return true,
se::SC_OLetter |
se::SC_Upper |
se::SC_Sep | se::SC_CR | se::SC_LF |
se::SC_STerm | se::SC_ATerm => return false,
_ => continue
}
}
}

false
}

/// True when the history ends with `SATerm Close* Sp*`.
fn match_sb8a(state: &SentenceBreaksState) -> bool {
    let history = state.0;

    // Step back over the optional Sp* run, then the optional Close* run.
    let mut slot = 3;
    if history[slot] == StatePart::SpPlus { slot -= 1; }
    if history[slot] == StatePart::ClosePlus { slot -= 1; }

    match history[slot] {
        StatePart::STerm | StatePart::ATerm => true,
        _ => false
    }
}

/// True when the history ends with `SATerm Close*`.
fn match_sb9(state: &SentenceBreaksState) -> bool {
    let history = state.0;

    // An optional Close* run may sit on top of the terminator.
    let slot = match history[3] {
        StatePart::ClosePlus => 2,
        _ => 3
    };

    match history[slot] {
        StatePart::STerm | StatePart::ATerm => true,
        _ => false
    }
}

/// True when the history ends with `SATerm Close* Sp* ParaSep?`.
fn match_sb11(state: &SentenceBreaksState) -> bool {
    let history = state.0;
    let mut slot = 3;

    // Optional trailing paragraph separator.
    match history[slot] {
        StatePart::Sep | StatePart::CR | StatePart::LF => slot -= 1,
        _ => {}
    }
    // Optional Sp* run, then optional Close* run.
    if history[slot] == StatePart::SpPlus { slot -= 1; }
    if history[slot] == StatePart::ClosePlus { slot -= 1; }

    match history[slot] {
        StatePart::STerm | StatePart::ATerm => true,
        _ => false
    }
}

impl<'a> Iterator for SentenceBreaks<'a> {
// Returns the index of the character which follows a break
type Item = usize;

#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
// A sentence could be one character
(cmp::min(slen, 2), Some(slen + 1))
}

#[inline]
fn next(&mut self) -> Option<usize> {
use tables::sentence as se;

for next_char in self.string[self.pos..].chars() {
let position_before = self.pos;
let state_before = self.state.clone();

let next_cat = se::sentence_category(next_char);

self.pos += next_char.len_utf8();
self.state = self.state.next(next_cat);

match next_cat {
// SB1
_ if state_before.match1(StatePart::Sot) =>
return Some(position_before),

tomcumming marked this conversation as resolved.
Show resolved Hide resolved
// SB3
SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
continue,

// SB4
_ if state_before.match1(StatePart::Sep)
|| state_before.match1(StatePart::CR)
|| state_before.match1(StatePart::LF)
=> return Some(position_before),

// SB5
SentenceCat::SC_Extend |
SentenceCat::SC_Format => self.state = state_before,

// SB6
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
continue,

// SB7
SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
continue,

// SB8
_ if match_sb8(&state_before, &self.string[position_before..]) =>
continue,

// SB8a
SentenceCat::SC_SContinue |
SentenceCat::SC_STerm |
SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
continue,

// SB9
SentenceCat::SC_Close |
SentenceCat::SC_Sp |
SentenceCat::SC_Sep |
SentenceCat::SC_CR |
SentenceCat::SC_LF if match_sb9(&state_before) =>
continue,

// SB10
SentenceCat::SC_Sp |
SentenceCat::SC_Sep |
SentenceCat::SC_CR |
SentenceCat::SC_LF if match_sb8a(&state_before) =>
continue,

// SB11
_ if match_sb11(&state_before) =>
return Some(position_before),

// SB998
_ => continue
tomcumming marked this conversation as resolved.
Show resolved Hide resolved
}
}

// SB2
if self.state.match1(StatePart::Sot) {
None
} else if self.state.match1(StatePart::Eot) {
None
} else {
self.state = self.state.end();
Some(self.pos)
}
}
}

/// Creates a break iterator positioned at the start of `source`.
pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
    SentenceBreaks {
        string: source,
        pos: 0,
        state: INITIAL_STATE,
    }
}

}

/// External iterator for a string's
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// Yields the substrings that lie between consecutive break positions.
pub struct USentenceBounds<'a> {
// Source of break positions (byte indices into the string).
iter: fwd::SentenceBreaks<'a>,
// Start of the next substring to yield; `None` until the first break
// position has been pulled from `iter`.
sentence_start: Option<usize>
}

/// Creates a sentence iterator over `source`; no break position has been
/// read yet, so the start of the first sentence is still unknown.
#[inline]
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
    let breaks = fwd::new_sentence_breaks(source);
    USentenceBounds { iter: breaks, sentence_start: None }
}

impl<'a> Iterator for USentenceBounds<'a> {
    type Item = &'a str;

    /// Bounds on the number of sentences: one fewer than the bounds the
    /// underlying break iterator reports, never going below zero.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, upper) = self.iter.size_hint();
        // Clamp to at least 1 before subtracting. The inner lower bound is 0
        // for an empty string, and `0 - 1` on a `usize` would panic in debug
        // builds and wrap around in release builds.
        (cmp::max(lower, 1) - 1, upper.map(|u| cmp::max(u, 1) - 1))
    }

    /// Returns the next sentence, or `None` when no further break position
    /// is available.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        // The very first call has to pull the opening break position first.
        let start_pos = match self.sentence_start {
            Some(pos) => pos,
            None => match self.iter.next() {
                Some(pos) => {
                    self.sentence_start = Some(pos);
                    pos
                }
                None => return None,
            },
        };

        match self.iter.next() {
            Some(break_pos) => {
                // The end of this sentence is the start of the next one.
                self.sentence_start = Some(break_pos);
                Some(&self.iter.string[start_pos..break_pos])
            }
            None => None,
        }
    }
}
Loading