unicode-rs · Manishearth · May 15, 2019 · May 2, 2017 · May 4, 2017 · May 13, 2019
diff --git a/Cargo.toml b/Cargo.toml
@@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"
 keywords = ["text", "unicode", "grapheme", "word", "boundary"]
 readme = "README.md"
 description = """
-This crate provides Grapheme Cluster and Word boundaries
+This crate provides Grapheme Cluster, Word and Sentence boundaries
 according to Unicode Standard Annex #29 rules.
 """
 

diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -351,3 +351,10 @@ def emit_break_module(f, break_table, break_cats, name):
             word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
         word_table.sort(key=lambda w: w[0])
         emit_break_module(rf, word_table, word_cats.keys(), "word")
+
+        sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
+        sentence_table = []
+        for cat in sentence_cats:
+            sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
+        sentence_table.sort(key=lambda w: w[0])
+        emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
diff --git a/scripts/unicode_gen_breaktests.py b/scripts/unicode_gen_breaktests.py
@@ -190,8 +190,23 @@ def create_words_data(f):
     f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
     unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
 
+def create_sentence_data(f):
+    d = load_test_data("auxiliary/SentenceBreakTest.txt")
+
+    test = []
+
+    for (c, i) in d:
+        allchars = [cn for s in c for cn in s]
+        test.append((allchars, c))
+
+    wtype = "&'static [(&'static str, &'static [&'static str])]"
+    f.write("    // official Unicode test data\n")
+    f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n")
+    unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
+
 if __name__ == "__main__":
     with open("testdata.rs", "w") as rf:
         rf.write(unicode.preamble)
         create_grapheme_data(rf)
         create_words_data(rf)
+        create_sentence_data(rf)
diff --git a/src/lib.rs b/src/lib.rs
@@ -8,7 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
+//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
 //! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
 //!
 //! ```rust
@@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
 pub use tables::UNICODE_VERSION;
 pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
+pub use sentence::{USentenceBounds};
 
 mod grapheme;
 mod tables;
 mod word;
+mod sentence;
 
 #[cfg(test)]
 mod test;
@@ -174,6 +176,12 @@ pub trait UnicodeSegmentation {
     /// assert_eq!(&swi1[..], b);
     /// ```
     fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
+
+    /// Returns an iterator over substrings of `self` separated on
+    /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
+    ///
+    /// The concatenation of the substrings returned by this function is just the original string.
+    fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
 }
 
 impl UnicodeSegmentation for str {
@@ -201,4 +209,9 @@ impl UnicodeSegmentation for str {
     fn split_word_bound_indices(&self) -> UWordBoundIndices {
         word::new_word_bound_indices(self)
     }
+
+    #[inline]
+    fn split_sentence_bounds(&self) -> USentenceBounds {
+        sentence::new_sentence_bounds(self)
+    }
 }
diff --git a/src/sentence.rs b/src/sentence.rs
@@ -0,0 +1,302 @@
+// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use core::cmp;
+
+// All of the logic for forward iteration over sentences
+mod fwd {
+    use tables::sentence::SentenceCat;
+    use core::cmp;
+
+    #[derive(Clone, Copy, PartialEq, Eq)]
+    enum StatePart { // one slot of the left-context history kept in `SentenceBreaksState`
+        Sot, // start of text; fills every slot of `INITIAL_STATE`
+        Eot, // end of text; pushed by `SentenceBreaksState::end`
+        Other, // any category without a dedicated variant
+        CR,
+        LF,
+        Sep,
+        ATerm,
+        UpperLower, // `SC_Upper` and `SC_Lower` share this variant
+        ClosePlus, // one or more consecutive `SC_Close`
+        SpPlus, // one or more consecutive `SC_Sp`
+        STerm
+    }
+
+    /// Sliding window over the four most recent `StatePart`s, oldest at index 0 and
+    /// newest at index 3.
+    #[derive(Clone, PartialEq, Eq)]
+    struct SentenceBreaksState(pub [StatePart; 4]);
+
+    /// Window contents before any input has been read: every slot holds `Sot`.
+    /// The repeat expression is allowed here because `StatePart` is `Copy`.
+    const INITIAL_STATE: SentenceBreaksState =
+        SentenceBreaksState([StatePart::Sot; 4]);
+
+    pub struct SentenceBreaks<'a> { // iterator state for scanning `string` for sentence breaks
+        pub string: &'a str, // the text being scanned
+        pos: usize, // byte offset of the next character to read
+        state: SentenceBreaksState // categories of the most recently read characters
+    }
+
+    impl SentenceBreaksState {
+        /// Returns the window that results from consuming a character of category `cat`.
+        ///
+        /// A `Close` directly after `ClosePlus`, or an `Sp` directly after `SpPlus`, merely
+        /// extends the current run and leaves the window untouched; anything else pushes
+        /// a new part in at index 3 and drops the oldest one.
+        fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
+            let window = self.0;
+            let incoming = match cat {
+                SentenceCat::SC_CR => StatePart::CR,
+                SentenceCat::SC_LF => StatePart::LF,
+                SentenceCat::SC_Sep => StatePart::Sep,
+                SentenceCat::SC_ATerm => StatePart::ATerm,
+                SentenceCat::SC_STerm => StatePart::STerm,
+                SentenceCat::SC_Close => StatePart::ClosePlus,
+                SentenceCat::SC_Sp => StatePart::SpPlus,
+                SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
+                _ => StatePart::Other
+            };
+
+            // Only `SC_Close` maps to `ClosePlus` and only `SC_Sp` to `SpPlus`, so comparing
+            // the mapped part with the newest slot detects a continued run.
+            let extends_run = incoming == window[3]
+                && (incoming == StatePart::ClosePlus || incoming == StatePart::SpPlus);
+            if extends_run {
+                SentenceBreaksState(window)
+            } else {
+                SentenceBreaksState([window[1], window[2], window[3], incoming])
+            }
+        }
+
+        /// Returns the window after the end of the text: shifted once, with `Eot` newest.
+        fn end(&self) -> SentenceBreaksState {
+            let window = self.0;
+            SentenceBreaksState([window[1], window[2], window[3], StatePart::Eot])
+        }
+
+        /// True when the newest part equals `part`.
+        fn match1(&self, part: StatePart) -> bool {
+            self.0[3] == part
+        }
+
+        /// True when the two newest parts are `part1` followed by `part2`.
+        fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
+            self.0[2] == part1 && self.0[3] == part2
+        }
+    }
+
+    /// SB8: true when the window ends in `ATerm Close* Sp*` and the text in `ahead`
+    /// reaches a `Lower` character before any character that stops the search.
+    fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
+        use tables::sentence as se;
+
+        // Step back from the newest slot over an optional Sp run, then an optional
+        // Close run; the slot reached must hold the ATerm.
+        let window = state.0;
+        let mut anchor = 3;
+        if window[anchor] == StatePart::SpPlus { anchor -= 1 }
+        if window[anchor] == StatePart::ClosePlus { anchor -= 1 }
+        if window[anchor] != StatePart::ATerm {
+            return false;
+        }
+
+        // ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
+        for upcoming in ahead.chars() {
+            match se::sentence_category(upcoming) {
+                se::SC_Lower => return true,
+                se::SC_OLetter | se::SC_Upper |
+                se::SC_Sep | se::SC_CR | se::SC_LF |
+                se::SC_STerm | se::SC_ATerm => return false,
+                _ => {}
+            }
+        }
+        false
+    }
+
+    /// Left context shared by SB8a and SB10: the window ends in `SATerm Close* Sp*`.
+    fn match_sb8a(state: &SentenceBreaksState) -> bool {
+        let window = state.0;
+        let after_sp = if window[3] == StatePart::SpPlus { 2 } else { 3 };
+        let anchor = if window[after_sp] == StatePart::ClosePlus { after_sp - 1 } else { after_sp };
+        window[anchor] == StatePart::STerm || window[anchor] == StatePart::ATerm
+    }
+
+    /// Left context of SB9: the window ends in `SATerm Close*`.
+    fn match_sb9(state: &SentenceBreaksState) -> bool {
+        let window = state.0;
+        let anchor = if window[3] == StatePart::ClosePlus { window[2] } else { window[3] };
+        match anchor { StatePart::STerm | StatePart::ATerm => true, _ => false }
+    }
+
+    /// Left context of SB11: the window ends in `SATerm Close* Sp* ParaSep?`.
+    fn match_sb11(state: &SentenceBreaksState) -> bool {
+        let window = state.0;
+        // Walk backwards from the newest slot, skipping each optional element at most once;
+        // a whole run of `Close` or of `Sp` occupies a single slot.
+        let mut idx = 3;
+        match window[idx] {
+            StatePart::Sep | StatePart::CR | StatePart::LF => idx -= 1,
+            _ => {}
+        }
+        if window[idx] == StatePart::SpPlus { idx -= 1 }
+        if window[idx] == StatePart::ClosePlus { idx -= 1 }
+
+        match window[idx] { StatePart::STerm | StatePart::ATerm => true, _ => false }
+    }
+
+    impl<'a> Iterator for SentenceBreaks<'a> {
+        // Each item is the byte offset at which a break falls (the start of the character after it)
+        type Item = usize;
+
+        #[inline]
+        fn size_hint(&self) -> (usize, Option<usize>) {
+            let slen = self.string.len();
+            // Both bounds come from the whole string's byte length; `pos` is not taken into account
+            (cmp::min(slen, 2), Some(slen + 1))
+        }
+
+        #[inline]
+        fn next(&mut self) -> Option<usize> {
+            use tables::sentence as se;
+
+            for next_char in self.string[self.pos..].chars() {
+                let position_before = self.pos;
+                let state_before = self.state.clone();
+
+                let next_cat = se::sentence_category(next_char);
+
+                self.pos += next_char.len_utf8();
+                self.state = self.state.next(next_cat);
+
+                match next_cat {
+                    // SB1: sot ÷ Any — break before the very first character
+                    _ if state_before.match1(StatePart::Sot) =>
+                        return Some(position_before),
+
+                    // SB3: CR × LF — never break inside a CR LF pair
+                    SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
+                        continue,
+
+                    // SB4: ParaSep ÷ — always break after Sep, CR or LF
+                    _ if state_before.match1(StatePart::Sep)
+                        || state_before.match1(StatePart::CR)
+                        || state_before.match1(StatePart::LF)
+                    => return Some(position_before),
+
+                    // SB5: X (Extend | Format)* → X — restore the saved state so these leave no trace
+                    SentenceCat::SC_Extend |
+                    SentenceCat::SC_Format => self.state = state_before,
+
+                    // SB6: ATerm × Numeric
+                    SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
+                        continue,
+
+                    // SB7: (Upper | Lower) ATerm × Upper
+                    SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
+                        continue,
+
+                    // SB8: ATerm Close* Sp* × ... Lower — the look-ahead slice starts at the current character
+                    _ if match_sb8(&state_before, &self.string[position_before..]) =>
+                        continue,
+
+                    // SB8a: SATerm Close* Sp* × (SContinue | SATerm)
+                    SentenceCat::SC_SContinue |
+                    SentenceCat::SC_STerm |
+                    SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
+                        continue,
+
+                    // SB9: SATerm Close* × (Close | Sp | ParaSep)
+                    SentenceCat::SC_Close |
+                    SentenceCat::SC_Sp |
+                    SentenceCat::SC_Sep |
+                    SentenceCat::SC_CR |
+                    SentenceCat::SC_LF if match_sb9(&state_before) =>
+                        continue,
+
+                    // SB10: SATerm Close* Sp* × (Sp | ParaSep) — same left context as SB8a
+                    SentenceCat::SC_Sp |
+                    SentenceCat::SC_Sep |
+                    SentenceCat::SC_CR |
+                    SentenceCat::SC_LF if match_sb8a(&state_before) =>
+                        continue,
+
+                    // SB11: SATerm Close* Sp* ParaSep? ÷ — break once no earlier rule kept the text together
+                    _ if match_sb11(&state_before) =>
+                        return Some(position_before),
+
+                    // SB998: Any × Any — no break anywhere else
+                    _ => continue
+                }
+            }
+
+            // SB2: Any ÷ eot — after non-empty text emit one final break at the end, then stay exhausted
+            if self.state.match1(StatePart::Sot) {
+                None
+            } else if self.state.match1(StatePart::Eot) {
+                None
+            } else {
+                self.state = self.state.end();
+                Some(self.pos)
+            }
+        }
+    }
+
+    /// Builds a break iterator over `source`, positioned at offset 0 in `INITIAL_STATE`.
+    pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
+        SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE }
+    }
+}
+
+/// External iterator over the substrings of a string that lie between its
+/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
+pub struct USentenceBounds<'a> {
+    iter: fwd::SentenceBreaks<'a>, // source of break offsets into the string
+    sentence_start: Option<usize> // where the next sentence begins; `None` until the first break is read
+}
+
+/// Creates a `USentenceBounds` over `source`. No sentence start is recorded yet, so the
+/// first call to `next` reads the opening break offset before it yields anything.
+#[inline]
+pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
+    let breaks = fwd::new_sentence_breaks(source);
+    USentenceBounds { iter: breaks, sentence_start: None }
+}
+
+impl<'a> Iterator for USentenceBounds<'a> {
+    type Item = &'a str;
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (lower, upper) = self.iter.size_hint();
+        // One fewer sentence than breaks; clamp before subtracting so a bound of 0 cannot underflow.
+        (cmp::max(lower, 1) - 1, upper.map(|u| cmp::max(u, 1) - 1))
+    }
+
+    #[inline]
+    fn next(&mut self) -> Option<&'a str> {
+        // The first break offset only opens the first sentence; record it and read on.
+        if self.sentence_start.is_none() {
+            match self.iter.next() {
+                Some(first_break) => self.sentence_start = Some(first_break),
+                None => return None
+            }
+        }
+        // Each later break closes the current sentence and opens the next one.
+        match (self.sentence_start, self.iter.next()) {
+            (Some(start_pos), Some(break_pos)) => {
+                self.sentence_start = Some(break_pos);
+                Some(&self.iter.string[start_pos..break_pos])
+            }
+            _ => None
+        }
+    }
+}