Trait unicode_segmentation::UnicodeSegmentation [−][src]
pub trait UnicodeSegmentation {
    fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
    fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>; // Iterator, Item = (usize, &'a str)
    fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>; // Iterator, Item = &'a str
    fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>; // Iterator, Item = &'a str
    fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>; // Iterator, Item = (usize, &'a str)
    fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>; // Iterator, Item = &'a str
    fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>; // Iterator, Item = &'a str
    fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>; // Iterator, Item = (usize, &'a str)
}
Expand description
Methods for segmenting strings according to Unicode Standard Annex #29.
Required methods
fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>
Returns an iterator over the grapheme clusters of `self`.
If `is_extended` is true, the iterator is over the extended grapheme clusters; otherwise, the iterator is over the legacy grapheme clusters. UAX#29 recommends extended grapheme cluster boundaries for general processing.
Examples
let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true) .collect::<Vec<&str>>(); let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"]; assert_eq!(&gr1[..], b); let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>(); let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"]; assert_eq!(&gr2[..], b);
fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>
Notable traits for GraphemeIndices<'a>: impl<'a> Iterator for GraphemeIndices<'a>, with type Item = (usize, &'a str).
Returns an iterator over the grapheme clusters of `self` and their byte offsets. See `graphemes()` for more information.
Examples
let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true) .collect::<Vec<(usize, &str)>>(); let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]; assert_eq!(&gr_inds[..], b);
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>
Notable traits for UnicodeWords<'a>: impl<'a> Iterator for UnicodeWords<'a>, with type Item = &'a str.
Returns an iterator over the words of `self`, separated on UAX#29 word boundaries.
Here, “words” are just those substrings which, after splitting on UAX#29 word boundaries, contain any alphanumeric characters. That is, the substring must contain at least one character with the Alphabetic property, or with General_Category=Number.
Example
let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?"; let uw1 = uws.unicode_words().collect::<Vec<&str>>(); let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"]; assert_eq!(&uw1[..], b);
fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>
Notable traits for UWordBounds<'a>: impl<'a> Iterator for UWordBounds<'a>, with type Item = &'a str.
Returns an iterator over substrings of `self` separated on UAX#29 word boundaries.
The concatenation of the substrings returned by this function is just the original string.
Example
let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>(); let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"]; assert_eq!(&swu1[..], b);
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>
Notable traits for UWordBoundIndices<'a>: impl<'a> Iterator for UWordBoundIndices<'a>, with type Item = (usize, &'a str).
Returns an iterator over substrings of `self`, split on UAX#29 word boundaries, and their offsets. See `split_word_bounds()` for more information.
Example
let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>(); let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"), (14, "°"), (16, "F"), (17, "!")]; assert_eq!(&swi1[..], b);
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>
Notable traits for UnicodeSentences<'a>: impl<'a> Iterator for UnicodeSentences<'a>, with type Item = &'a str.
Returns an iterator over substrings of `self` separated on UAX#29 sentence boundaries.
Here, “sentences” are just those substrings which, after splitting on UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the substring must contain at least one character with the Alphabetic property, or with General_Category=Number.
Example
let uss = "Mr. Fox jumped. [...] The dog was too lazy."; let us1 = uss.unicode_sentences().collect::<Vec<&str>>(); let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."]; assert_eq!(&us1[..], b);
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>
Notable traits for USentenceBounds<'a>: impl<'a> Iterator for USentenceBounds<'a>, with type Item = &'a str.
Returns an iterator over substrings of `self` separated on UAX#29 sentence boundaries.
The concatenation of the substrings returned by this function is just the original string.
Example
let ssbs = "Mr. Fox jumped. [...] The dog was too lazy."; let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>(); let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."]; assert_eq!(&ssb1[..], b);
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>
Notable traits for USentenceBoundIndices<'a>: impl<'a> Iterator for USentenceBoundIndices<'a>, with type Item = (usize, &'a str).
Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries, and their offsets. See `split_sentence_bounds()` for more information.
Example
let ssis = "Mr. Fox jumped. [...] The dog was too lazy."; let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>(); let b: &[_] = &[(0, "Mr. "), (4, "Fox jumped. "), (16, "[...] "), (22, "The dog was too lazy.")]; assert_eq!(&ssi1[..], b);