1use std::cmp::Ordering;
76use std::collections::HashMap;
77use std::fmt;
78use std::io::BufRead;
79
80use cedarwood::Cedar;
81
/// Crate-internal alias for a std `HashMap` whose hasher is `rustc_hash::FxBuildHasher`.
pub(crate) type FxHashMap<K, V> = HashMap<K, V, rustc_hash::FxBuildHasher>;
83
84pub use crate::errors::Error;
85pub use crate::hmm::HmmModel;
86#[cfg(feature = "textrank")]
87pub use crate::keywords::textrank::TextRank;
88#[cfg(feature = "tfidf")]
89pub use crate::keywords::tfidf::TfIdf;
90#[cfg(any(feature = "tfidf", feature = "textrank"))]
91pub use crate::keywords::{DEFAULT_STOP_WORDS, Keyword, KeywordExtract, KeywordExtractConfig};
92
93mod errors;
94mod hmm;
95#[cfg(any(feature = "tfidf", feature = "textrank"))]
96mod keywords;
97mod posseg;
98mod sparse_dag;
99
// Contents of `src/data/dict.txt`, exposed as the string static `DEFAULT_DICT`
// through `include_flate::flate!`. Only present with the `default-dict` feature.
#[cfg(feature = "default-dict")]
include_flate::flate!(static DEFAULT_DICT: str from "src/data/dict.txt");
102
103use sparse_dag::StaticSparseDAG;
104
thread_local! {
    // One `HmmContext` per thread, created with `HmmContext::default()`.
    // `cut_internal` borrows it mutably for each HMM-enabled block it cuts.
    static HMM_CONTEXT: std::cell::RefCell<hmm::HmmContext> = std::cell::RefCell::new(hmm::HmmContext::default());
}
108
/// Returns `true` when `c` lies inside one of the CJK ideograph code-point
/// ranges enumerated in the body.
#[inline]
fn is_cjk(c: char) -> bool {
    let cp = u32::from(c);
    (0x3400..=0x4DBF).contains(&cp)
        || (0x4E00..=0x9FFF).contains(&cp)
        || (0xF900..=0xFAFF).contains(&cp)
        || (0x20000..=0x2A6DF).contains(&cp)
        || (0x2A700..=0x2B73F).contains(&cp)
        || (0x2B740..=0x2B81F).contains(&cp)
        || (0x2B820..=0x2CEAF).contains(&cp)
        || (0x2CEB0..=0x2EBEF).contains(&cp)
        || (0x2F800..=0x2FA1F).contains(&cp)
}
124
125#[inline]
127fn is_han_default(c: char) -> bool {
128 is_cjk(c) || c.is_ascii_alphanumeric() || matches!(c, '+' | '#' | '&' | '.' | '_' | '%' | '-')
129}
130
/// Character class used to split input in cut-all mode: only characters accepted
/// by `is_cjk` belong to a matched block.
#[inline]
fn is_han_cut_all(c: char) -> bool {
    is_cjk(c)
}
136
/// Returns `true` for every character that is neither an ASCII letter/digit nor
/// one of `+`, `#`, or a line feed. `cut_all_toplevel` emits such characters one
/// token each.
#[inline]
fn is_skip_cut_all(c: char) -> bool {
    !(c.is_ascii_alphanumeric() || matches!(c, '+' | '#' | '\n'))
}
142
143#[inline]
144fn char_count(s: &str) -> usize {
145 if s.len() >= 16 {
146 bytecount::num_chars(s.as_bytes())
147 } else {
148 s.as_bytes().iter().filter(|&&b| (b as i8) >= -0x40).count()
149 }
150}
151
/// Iterator that cuts `text` into maximal runs of consecutive characters for
/// which `classify` gives the same answer.
pub(crate) struct SplitByCharacterClass<'t, F> {
    /// The full input being split.
    text: &'t str,
    /// Byte offset of the next run to yield.
    pos: usize,
    /// Predicate deciding whether a character belongs to a "matched" run.
    classify: F,
}

impl<'t, F: Fn(char) -> bool> SplitByCharacterClass<'t, F> {
    /// Starts splitting `text` from its first byte.
    #[inline]
    fn new(text: &'t str, classify: F) -> Self {
        Self { pos: 0, text, classify }
    }
}

impl<'t, F: Fn(char) -> bool> Iterator for SplitByCharacterClass<'t, F> {
    type Item = SplitState<'t>;

    /// Yields the next run, tagged by the class of its first character, or
    /// `None` once the whole text has been consumed.
    fn next(&mut self) -> Option<Self::Item> {
        let text: &'t str = self.text;
        let rest = &text[self.pos..];

        let mut following = rest.char_indices();
        let (_, head) = following.next()?;
        let head_matches = (self.classify)(head);

        // The run ends at the first later character whose class differs from the head's.
        let run_len = following
            .find(|&(_, c)| (self.classify)(c) != head_matches)
            .map_or(rest.len(), |(offset, _)| offset);

        self.pos += run_len;
        let run = &rest[..run_len];
        Some(if head_matches {
            SplitState::Matched(run)
        } else {
            SplitState::Unmatched(run)
        })
    }
}

/// One run produced by [`SplitByCharacterClass`].
#[derive(Debug)]
pub(crate) enum SplitState<'t> {
    /// Run of characters rejected by the classifier.
    Unmatched(&'t str),
    /// Run of characters accepted by the classifier.
    Matched(&'t str),
}

impl<'t> SplitState<'t> {
    /// The text of the run, regardless of its class.
    #[inline]
    fn as_str(&self) -> &'t str {
        let (SplitState::Unmatched(text) | SplitState::Matched(text)) = self;
        *text
    }

    /// `true` when the run was accepted by the classifier.
    #[inline]
    pub fn is_matched(&self) -> bool {
        match self {
            SplitState::Matched(_) => true,
            SplitState::Unmatched(_) => false,
        }
    }
}
228
/// Selects which segmentation [`Jieba::tokenize`] performs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizeMode {
    /// `tokenize` forwards to [`Jieba::cut`].
    Default,
    /// `tokenize` forwards to [`Jieba::cut_for_search`].
    Search,
}
236
/// A segmented word together with its position in the input sentence.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token<'a> {
    /// The word, borrowed from the input sentence.
    pub word: &'a str,
    /// Offset of the word's first character, counted in characters.
    pub start: usize,
    /// `start` plus the number of characters in `word` (exclusive end).
    pub end: usize,
    /// Offset of the word's first byte relative to the start of the sentence.
    pub byte_start: usize,
    /// `byte_start` plus `word.len()` (exclusive end, in bytes).
    pub byte_end: usize,
}
251
/// A segmented word with its part-of-speech tag, as produced by [`Jieba::tag`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Tag<'a> {
    /// The word, borrowed from the input sentence.
    pub word: &'a str,
    /// Tag stored in the dictionary for `word`, or a guessed tag when the word
    /// is not in the dictionary.
    pub tag: &'a str,
    /// Character offset of the word's start (copied from the underlying `Token`).
    pub start: usize,
    /// Character offset just past the word (copied from the underlying `Token`).
    pub end: usize,
    /// Byte offset of the word's start (copied from the underlying `Token`).
    pub byte_start: usize,
    /// Byte offset just past the word (copied from the underlying `Token`).
    pub byte_end: usize,
}
268
/// One dictionary entry: a frequency, its cached natural logarithm, and a tag.
#[derive(Debug, Clone)]
struct Record {
    /// Raw frequency of the word.
    freq: usize,
    /// `ln(freq)`, kept in sync with `freq` by `set_freq`.
    log_freq: f64,
    /// Tag text stored with the word (may be empty).
    tag: Box<str>,
}

impl Record {
    /// Builds a record for `freq` and `tag`, computing `log_freq` from `freq`.
    #[inline(always)]
    fn new(freq: usize, tag: Box<str>) -> Self {
        let mut record = Self {
            freq: 0,
            log_freq: 0.0,
            tag,
        };
        record.set_freq(freq);
        record
    }

    /// Replaces the frequency and refreshes the cached logarithm.
    #[inline]
    fn set_freq(&mut self, freq: usize) {
        self.log_freq = (freq as f64).ln();
        self.freq = freq;
    }
}
292
/// Dictionary-based segmenter state.
#[derive(Clone)]
pub struct Jieba {
    /// Dictionary entries; a word's id in `cedar` is its index in this vector.
    records: Vec<Record>,
    /// `cedarwood::Cedar` index mapping each word to its id.
    cedar: Cedar,
    /// Sum of the frequencies of all records.
    total: usize,
    /// Custom HMM model; when `None`, `hmm::builtin_hmm()` is used instead.
    hmm_model: Option<HmmModel>,
}
301
302impl fmt::Debug for Jieba {
303 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
304 f.debug_struct("Jieba")
305 .field("records_len", &self.records.len())
306 .field("total_freq", &self.total)
307 .finish()
308 }
309}
310
/// `Jieba::default()` is the same as [`Jieba::new`]; only available with the
/// `default-dict` feature.
#[cfg(feature = "default-dict")]
impl Default for Jieba {
    fn default() -> Self {
        Jieba::new()
    }
}
317
318impl Jieba {
    /// Creates an instance with no dictionary entries, a zero total frequency
    /// and no custom HMM model.
    pub fn empty() -> Self {
        Jieba {
            records: Vec::new(),
            cedar: Cedar::new(),
            total: 0,
            hmm_model: None,
        }
    }
328
    /// Creates an instance and loads the bundled default dictionary into it.
    ///
    /// Only available with the `default-dict` feature.
    #[cfg(feature = "default-dict")]
    pub fn new() -> Self {
        let mut instance = Self::empty();
        instance.load_default_dict();
        instance
    }
338
339 pub fn with_dict<R: BufRead>(dict: &mut R) -> Result<Self, Error> {
341 let mut instance = Self::empty();
342 instance.load_dict(dict)?;
343 Ok(instance)
344 }
345
346 #[cfg(feature = "default-dict")]
367 pub fn load_default_dict(&mut self) {
368 use std::io::BufReader;
369
370 let mut default_dict = BufReader::new(DEFAULT_DICT.as_bytes());
371 self.load_dict(&mut default_dict).unwrap();
372 }
373
    /// Installs `model` as the HMM model used by HMM-enabled cuts, replacing
    /// any previously set model.
    pub fn set_hmm_model(&mut self, model: HmmModel) {
        self.hmm_model = Some(model);
    }
396
    /// Removes every dictionary entry: empties the records, replaces the word
    /// index with a fresh one and resets the total frequency to zero.
    ///
    /// The HMM model, if one was set, is left untouched.
    pub fn clear(&mut self) {
        self.records.clear();
        self.cedar = Cedar::new();
        self.total = 0;
    }
423
424 pub fn add_word(&mut self, word: &str, freq: Option<usize>, tag: Option<&str>) -> usize {
430 if word.is_empty() {
431 return 0;
432 }
433 let freq = freq.unwrap_or_else(|| self.suggest_freq(word));
434 let tag = tag.unwrap_or("");
435
436 match self.cedar.exact_match_search(word) {
437 Some((word_id, _, _)) => {
438 let old_freq = self.records[word_id as usize].freq;
439 self.records[word_id as usize].set_freq(freq);
440
441 self.total += freq;
442 self.total -= old_freq;
443 }
444 None => {
445 let word_id = self.records.len() as i32;
446 self.records.push(Record::new(freq, tag.into()));
447
448 self.cedar.update(word, word_id);
449 self.total += freq;
450 }
451 };
452
453 freq
454 }
455
    /// Returns `true` when `word` is present in the dictionary as an exact entry.
    pub fn has_word(&self, word: &str) -> bool {
        self.cedar.exact_match_search(word).is_some()
    }
468
469 pub fn load_dict<R: BufRead>(&mut self, dict: &mut R) -> Result<(), Error> {
493 let mut buf = String::new();
494 self.total = 0;
495
496 let mut line_no = 0;
497 while dict.read_line(&mut buf)? > 0 {
498 {
499 line_no += 1;
500 let mut iter = buf.split_whitespace();
501 if let Some(word) = iter.next() {
502 let freq = iter
503 .next()
504 .map(|x| {
505 x.parse::<usize>().map_err(|e| {
506 Error::InvalidDictEntry(format!(
507 "line {line_no} `{buf}` frequency {x} is not a valid integer: {e}"
508 ))
509 })
510 })
511 .unwrap_or(Ok(0))?;
512 let tag = iter.next().unwrap_or("");
513
514 match self.cedar.exact_match_search(word) {
515 Some((word_id, _, _)) => {
516 self.records[word_id as usize].set_freq(freq);
517 }
518 None => {
519 let word_id = self.records.len() as i32;
520 self.records.push(Record::new(freq, tag.into()));
521 self.cedar.update(word, word_id);
522 }
523 };
524 }
525 }
526 buf.clear();
527 }
528 self.total = self.records.iter().map(|n| n.freq).sum();
529
530 Ok(())
531 }
532
533 fn get_word_freq(&self, word: &str, default: usize) -> usize {
534 match self.cedar.exact_match_search(word) {
535 Some((word_id, _, _)) => self.records[word_id as usize].freq,
536 _ => default,
537 }
538 }
539
540 pub fn suggest_freq(&self, segment: &str) -> usize {
542 let logtotal = (self.total as f64).ln();
543 let logfreq = self.cut(segment, false).iter().fold(0f64, |freq, token| {
544 freq + (self.get_word_freq(token.word, 1) as f64).ln() - logtotal
545 });
546 std::cmp::max((logfreq + logtotal).exp() as usize + 1, self.get_word_freq(segment, 1))
547 }
548
549 #[allow(clippy::ptr_arg)]
550 fn calc(&self, sentence: &str, dag: &StaticSparseDAG, route: &mut Vec<(f64, usize)>) {
551 let str_len = sentence.len();
552
553 if str_len + 1 > route.len() {
554 route.resize(str_len + 1, (0.0, 0));
555 }
556
557 let logtotal = (self.total as f64).ln();
558 let log1 = 0.0f64 - logtotal; let mut prev_byte_start = str_len;
560 let curr = sentence.char_indices().map(|x| x.0).rev();
561 for byte_start in curr {
562 let pair = dag
563 .iter_edges(byte_start)
564 .map(|(byte_end, word_id)| {
565 let log_freq = if word_id != sparse_dag::NO_MATCH {
566 self.records[word_id as usize].log_freq
567 } else {
568 0.0 };
570
571 (log_freq - logtotal + route[byte_end].0, byte_end)
572 })
573 .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal));
574
575 if let Some(p) = pair {
576 route[byte_start] = p;
577 } else {
578 let byte_end = prev_byte_start;
579 route[byte_start] = (log1 + route[byte_end].0, byte_end);
580 }
581
582 prev_byte_start = byte_start;
583 }
584 }
585
586 fn dag(&self, sentence: &str, dag: &mut StaticSparseDAG) {
587 for (byte_start, _) in sentence.char_indices() {
588 dag.start(byte_start);
589 let haystack = &sentence[byte_start..];
590
591 for (word_id, end_index) in self.cedar.common_prefix_iter(haystack) {
592 dag.insert(end_index + byte_start + 1, word_id);
593 }
594
595 dag.commit();
596 }
597 }
598
599 fn cut_all_tokens<'a>(&self, block: &'a str, base: usize, block_unicode_start: usize, tokens: &mut Vec<Token<'a>>) {
602 let str_len = block.len();
603 let mut dag = StaticSparseDAG::with_size_hint(block.len());
604 self.dag(block, &mut dag);
605
606 let block_base = block.as_ptr() as usize;
607 let byte_offset_in_sentence = block_base - base;
608
609 for (unicode_idx, (byte_start, _)) in block.char_indices().enumerate() {
610 let unicode_start = block_unicode_start + unicode_idx;
611 for (byte_end, _) in dag.iter_edges(byte_start) {
612 let word = if byte_end == str_len {
613 &block[byte_start..]
614 } else {
615 &block[byte_start..byte_end]
616 };
617 let char_count = char_count(word);
618 let bs = byte_offset_in_sentence + byte_start;
619 tokens.push(Token {
620 word,
621 start: unicode_start,
622 end: unicode_start + char_count,
623 byte_start: bs,
624 byte_end: bs + word.len(),
625 });
626 }
627 }
628 }
629
630 fn cut_dag_no_hmm<'a>(
631 &self,
632 sentence: &'a str,
633 words: &mut Vec<&'a str>,
634 route: &mut Vec<(f64, usize)>,
635 dag: &mut StaticSparseDAG,
636 ) {
637 self.dag(sentence, dag);
638 self.calc(sentence, dag, route);
639 let mut x = 0;
640 let mut left: Option<usize> = None;
641
642 while x < sentence.len() {
643 let y = route[x].1;
644 let l_str = &sentence[x..y];
645
646 if l_str.chars().nth(1).is_none() && l_str.as_bytes()[0].is_ascii_alphanumeric() {
647 if left.is_none() {
648 left = Some(x);
649 }
650 } else {
651 if let Some(byte_start) = left {
652 let word = &sentence[byte_start..x];
653 words.push(word);
654 left = None;
655 }
656
657 words.push(l_str);
658 }
659 x = y;
660 }
661
662 if let Some(byte_start) = left {
663 let word = &sentence[byte_start..];
664 words.push(word);
665 }
666
667 dag.clear();
668 route.clear();
669 }
670
671 #[inline]
672 fn hmm_cut<'a>(&self, word: &'a str, words: &mut Vec<&'a str>, hmm_context: &mut hmm::HmmContext) {
673 if let Some(ref model) = self.hmm_model {
674 hmm::cut_with_allocated_memory(word, words, model, hmm_context);
675 } else {
676 hmm::cut_with_allocated_memory(word, words, &hmm::builtin_hmm(), hmm_context);
677 }
678 }
679
680 #[allow(non_snake_case, clippy::too_many_arguments)]
681 fn cut_dag_hmm<'a>(
682 &self,
683 sentence: &'a str,
684 words: &mut Vec<&'a str>,
685 route: &mut Vec<(f64, usize)>,
686 dag: &mut StaticSparseDAG,
687 hmm_context: &mut hmm::HmmContext,
688 ) {
689 self.dag(sentence, dag);
690 self.calc(sentence, dag, route);
691 let mut x = 0;
692 let mut left: Option<usize> = None;
693
694 while x < sentence.len() {
695 let y = route[x].1;
696
697 if sentence[x..y].chars().nth(1).is_none() {
698 if left.is_none() {
699 left = Some(x);
700 }
701 } else {
702 if let Some(byte_start) = left {
703 let byte_end = x;
704 let word = &sentence[byte_start..byte_end];
705 if word.chars().nth(1).is_none() {
706 words.push(word);
707 } else if self.cedar.exact_match_search(word).is_none() {
708 self.hmm_cut(word, words, hmm_context);
709 } else {
710 let mut word_indices = word.char_indices().map(|x| x.0).peekable();
711 while let Some(byte_start) = word_indices.next() {
712 if let Some(byte_end) = word_indices.peek() {
713 words.push(&word[byte_start..*byte_end]);
714 } else {
715 words.push(&word[byte_start..]);
716 }
717 }
718 }
719 left = None;
720 }
721 let word = &sentence[x..y];
722 words.push(word);
723 }
724 x = y;
725 }
726
727 if let Some(byte_start) = left {
728 let word = &sentence[byte_start..];
729
730 if word.chars().nth(1).is_none() {
731 words.push(word);
732 } else if self.cedar.exact_match_search(word).is_none() {
733 self.hmm_cut(word, words, hmm_context);
734 } else {
735 let mut word_indices = word.char_indices().map(|x| x.0).peekable();
736 while let Some(byte_start) = word_indices.next() {
737 if let Some(byte_end) = word_indices.peek() {
738 words.push(&word[byte_start..*byte_end]);
739 } else {
740 words.push(&word[byte_start..]);
741 }
742 }
743 }
744 }
745
746 dag.clear();
747 route.clear();
748 }
749
750 #[inline]
753 fn make_token_incremental<'a>(word: &'a str, base: usize, unicode_offset: &mut usize) -> Token<'a> {
754 let ptr = word.as_ptr() as usize;
755 debug_assert!(ptr >= base, "word is not a subslice of sentence");
756 let byte_start = ptr - base;
757 let byte_end = byte_start + word.len();
758 let start = *unicode_offset;
759 let char_count = char_count(word);
761 *unicode_offset = start + char_count;
762 Token {
763 word,
764 start,
765 end: *unicode_offset,
766 byte_start,
767 byte_end,
768 }
769 }
770
771 #[allow(non_snake_case)]
772 fn cut_internal<'a>(&self, sentence: &'a str, cut_all: bool, hmm: bool) -> Vec<Token<'a>> {
773 if cut_all {
774 return self.cut_all_toplevel(sentence);
775 }
776 let base = sentence.as_ptr() as usize;
777 let mut unicode_offset = 0;
778
779 let heuristic_capacity = sentence.len() / 2;
780 let mut str_words = Vec::with_capacity(heuristic_capacity);
781 let mut tokens = Vec::with_capacity(heuristic_capacity);
782
783 let splitter = SplitByCharacterClass::new(sentence, is_han_default);
784 let mut route = Vec::with_capacity(heuristic_capacity);
785 let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);
786
787 for state in splitter {
788 match state {
789 SplitState::Matched(_) => {
790 let block = state.as_str();
791 assert!(!block.is_empty());
792
793 str_words.clear();
794 if hmm {
795 HMM_CONTEXT.with(|ctx| {
796 let mut hmm_context = ctx.borrow_mut();
797 self.cut_dag_hmm(block, &mut str_words, &mut route, &mut dag, &mut hmm_context);
798 });
799 } else {
800 self.cut_dag_no_hmm(block, &mut str_words, &mut route, &mut dag);
801 }
802 for &word in &str_words {
803 tokens.push(Self::make_token_incremental(word, base, &mut unicode_offset));
804 }
805 }
806 SplitState::Unmatched(_) => {
807 let block = state.as_str();
808 assert!(!block.is_empty());
809
810 let mut chars = block.char_indices().peekable();
811 while let Some((i, c)) = chars.next() {
812 let word = if c == '\r' {
814 if let Some(&(_, '\n')) = chars.peek() {
815 let _ = chars.next();
816 let end = i + 2;
817 &block[i..end]
818 } else {
819 let end = i + c.len_utf8();
820 &block[i..end]
821 }
822 } else {
823 let end = i + c.len_utf8();
824 &block[i..end]
825 };
826 tokens.push(Self::make_token_incremental(word, base, &mut unicode_offset));
827 }
828 }
829 }
830 }
831 tokens
832 }
833
834 fn cut_all_toplevel<'a>(&self, sentence: &'a str) -> Vec<Token<'a>> {
836 let base = sentence.as_ptr() as usize;
837 let mut unicode_offset = 0;
838
839 let heuristic_capacity = sentence.len() / 2;
840 let mut tokens = Vec::with_capacity(heuristic_capacity);
841
842 let splitter = SplitByCharacterClass::new(sentence, is_han_cut_all);
843
844 for state in splitter {
845 match state {
846 SplitState::Matched(_) => {
847 let block = state.as_str();
848 assert!(!block.is_empty());
849 let block_unicode_start = unicode_offset;
850 unicode_offset += char_count(block);
852 self.cut_all_tokens(block, base, block_unicode_start, &mut tokens);
853 }
854 SplitState::Unmatched(_) => {
855 let block = state.as_str();
856 assert!(!block.is_empty());
857
858 let skip_splitter = SplitByCharacterClass::new(block, is_skip_cut_all);
859 for skip_state in skip_splitter {
860 let word = skip_state.as_str();
861 if word.is_empty() {
862 continue;
863 }
864 if skip_state.is_matched() {
865 let mut indices = word.char_indices().peekable();
868 while let Some((i, _)) = indices.next() {
869 let end = indices.peek().map_or(word.len(), |&(j, _)| j);
870 tokens.push(Self::make_token_incremental(&word[i..end], base, &mut unicode_offset));
871 }
872 } else {
873 tokens.push(Self::make_token_incremental(word, base, &mut unicode_offset));
874 }
875 }
876 }
877 }
878 }
879 tokens
880 }
881
    /// Segments `sentence` into non-overlapping tokens.
    ///
    /// * `hmm` - when `true`, runs of single-character pieces that are not
    ///   dictionary words are re-segmented with the HMM.
    pub fn cut<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<Token<'a>> {
        self.cut_internal(sentence, false, hmm)
    }
892
    /// Segments `sentence` in cut-all mode: every dictionary word found at
    /// every position of a CJK run is returned, so tokens may overlap.
    /// The HMM is not used.
    pub fn cut_all<'a>(&self, sentence: &'a str) -> Vec<Token<'a>> {
        self.cut_internal(sentence, true, false)
    }
901
902 pub fn cut_for_search<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<Token<'a>> {
910 let words = self.cut(sentence, hmm);
911 let mut new_words = Vec::with_capacity(words.len());
912 let base = sentence.as_ptr() as usize;
913 let mut char_indices = Vec::new();
914 for token in words {
915 let word = token.word;
916 char_indices.clear();
917 char_indices.extend(word.char_indices().map(|x| x.0));
918 let char_count = char_indices.len();
919 if char_count > 2 {
920 for i in 0..char_count - 1 {
921 let local_byte_start = char_indices[i];
922 let gram2 = if i + 2 < char_count {
923 &word[local_byte_start..char_indices[i + 2]]
924 } else {
925 &word[local_byte_start..]
926 };
927 if self.cedar.exact_match_search(gram2).is_some() {
928 let byte_start = gram2.as_ptr() as usize - base;
929 let byte_end = byte_start + gram2.len();
930 new_words.push(Token {
931 word: gram2,
932 start: token.start + i,
933 end: token.start + i + 2,
934 byte_start,
935 byte_end,
936 });
937 }
938 }
939 }
940 if char_count > 3 {
941 for i in 0..char_count - 2 {
942 let local_byte_start = char_indices[i];
943 let gram3 = if i + 3 < char_count {
944 &word[local_byte_start..char_indices[i + 3]]
945 } else {
946 &word[local_byte_start..]
947 };
948 if self.cedar.exact_match_search(gram3).is_some() {
949 let byte_start = gram3.as_ptr() as usize - base;
950 let byte_end = byte_start + gram3.len();
951 new_words.push(Token {
952 word: gram3,
953 start: token.start + i,
954 end: token.start + i + 3,
955 byte_start,
956 byte_end,
957 });
958 }
959 }
960 }
961 new_words.push(token);
962 }
963 new_words
964 }
965
    /// Segments `sentence` according to `mode`: `TokenizeMode::Default` uses
    /// [`Jieba::cut`], `TokenizeMode::Search` uses [`Jieba::cut_for_search`].
    /// `hmm` is passed through unchanged.
    pub fn tokenize<'a>(&self, sentence: &'a str, mode: TokenizeMode, hmm: bool) -> Vec<Token<'a>> {
        match mode {
            TokenizeMode::Default => self.cut(sentence, hmm),
            TokenizeMode::Search => self.cut_for_search(sentence, hmm),
        }
    }
981
982 pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag<'a>> {
990 let tokens = self.cut(sentence, hmm);
991 tokens
992 .into_iter()
993 .map(|token| {
994 let word = token.word;
995 if let Some((word_id, _, _)) = self.cedar.exact_match_search(word) {
996 let t = &self.records[word_id as usize].tag;
997 return Tag {
998 word,
999 tag: t,
1000 start: token.start,
1001 end: token.end,
1002 byte_start: token.byte_start,
1003 byte_end: token.byte_end,
1004 };
1005 }
1006 let tag = self.guess_tag(word);
1007 Tag {
1008 word,
1009 tag,
1010 start: token.start,
1011 end: token.end,
1012 byte_start: token.byte_start,
1013 byte_end: token.byte_end,
1014 }
1015 })
1016 .collect()
1017 }
1018
1019 fn guess_tag(&self, word: &str) -> &'static str {
1024 let mut eng = 0;
1025 let mut m = 0;
1026 for chr in word.chars() {
1027 if chr.is_ascii_alphanumeric() {
1028 eng += 1;
1029 if chr.is_ascii_digit() {
1030 m += 1;
1031 }
1032 }
1033 }
1034 if eng > 0 {
1035 return if eng == m { "m" } else { "eng" };
1036 }
1037
1038 #[cfg(feature = "default-dict")]
1039 {
1040 if word.chars().any(|c| is_cjk(c)) {
1042 let results = posseg::cut_with_pos(word);
1043 if results.len() == 1 {
1044 return results[0].1;
1045 }
1046 if let Some((_w, tag)) = results.iter().max_by_key(|(w, _)| w.len()) {
1047 return tag;
1048 }
1049 }
1050 }
1051
1052 "x"
1053 }
1054}
1055
1056#[cfg(test)]
1057mod tests {
1058 use super::{Jieba, SplitByCharacterClass, SplitState, TokenizeMode, is_han_default};
1059 use expect_test::expect;
1060 use std::io::BufReader;
1061
    /// Smoke test: building a `Jieba` with the bundled dictionary must not panic.
    #[test]
    fn test_init_with_default_dict() {
        let _ = Jieba::new();
    }
1066
    /// `has_word` reports words present in the default dictionary and rejects
    /// a phrase that is expected to be absent from it.
    #[test]
    fn test_has_word() {
        let jieba = Jieba::new();
        assert!(jieba.has_word("中国"));
        assert!(jieba.has_word("开源"));
        assert!(!jieba.has_word("不存在的词"));
    }
1074
    /// The splitter never yields an empty block, whether matched or unmatched,
    /// on text that mixes emoji, ASCII, punctuation and CJK characters.
    #[test]
    fn test_split_matches() {
        let splitter = SplitByCharacterClass::new(
            "👪 PS: 我觉得开源有一个好处,就是能够敦促自己不断改进 👪,避免敞帚自珍",
            is_han_default,
        );
        for state in splitter {
            match state {
                SplitState::Matched(_) => {
                    let block = state.as_str();
                    assert!(!block.is_empty());
                }
                SplitState::Unmatched(_) => {
                    let block = state.as_str();
                    assert!(!block.is_empty());
                }
            }
        }
    }
1094
    /// Characters from the additional CJK ranges accepted by `is_cjk`
    /// (including one above U+FFFF) stay in a single block.
    #[test]
    fn test_split_matches_against_unicode_sip() {
        let splitter = SplitByCharacterClass::new("讥䶯䶰䶱䶲䶳䶴䶵𦡦", is_han_default);

        let result: Vec<&str> = splitter.map(|x| x.as_str()).collect();
        expect![[r#"["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]"#]].assert_eq(&format!("{:?}", result));
    }
1102
    /// In cut-all mode each skipped punctuation character becomes its own token.
    #[test]
    fn test_cut_all_skip_single_char() {
        let jieba = Jieba::new();
        let words: Vec<&str> = jieba.cut_all("a!!b").iter().map(|t| t.word).collect();
        assert_eq!(words, vec!["a", "!", "!", "b"]);
    }
1109
    /// In default mode `"\r\n"` is kept as one token while other whitespace
    /// characters are emitted one token each.
    #[test]
    fn test_cut_default_crlf_and_whitespace() {
        let jieba = Jieba::new();
        let words: Vec<&str> = jieba.cut("x\r\n\ty", false).iter().map(|t| t.word).collect();
        assert_eq!(words, vec!["x", "\r\n", "\t", "y"]);
    }
1116
    /// Snapshot test for `cut_all`: ASCII runs stay whole, and every dictionary
    /// word at every position of a CJK run is listed.
    #[test]
    fn test_cut_all() {
        let jieba = Jieba::new();
        let tokens = jieba.cut_all("abc网球拍卖会def");
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
        expect![[r#"["abc", "网", "网球", "网球拍", "球", "球拍", "拍", "拍卖", "拍卖会", "卖", "会", "def"]"#]]
            .assert_eq(&format!("{:?}", words));

        let tokens = jieba.cut_all("我来到北京清华大学");
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
        expect![[r#"["我", "来", "来到", "到", "北", "北京", "京", "清", "清华", "清华大学", "华", "华大", "大", "大学", "学"]"#]]
            .assert_eq(&format!("{:?}", words));
    }
1133
    /// Snapshot test for `cut` without HMM on mixed ASCII and CJK input.
    #[test]
    fn test_cut_no_hmm() {
        let jieba = Jieba::new();
        let tokens = jieba.cut("abc网球拍卖会def", false);
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
        expect![[r#"["abc", "网球", "拍卖会", "def"]"#]].assert_eq(&format!("{:?}", words));
    }
1141
    /// Same as `test_cut_no_hmm`, with trailing punctuation and whitespace:
    /// each punctuation character is its own token and `"\r\n"` stays together.
    #[test]
    fn test_cut_no_hmm1() {
        let jieba = Jieba::new();
        let tokens = jieba.cut("abc网球拍卖会def!!?\r\n\t", false);
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
        expect![[r#"["abc", "网球", "拍卖会", "def", "!", "!", "?", "\r\n", "\t"]"#]]
            .assert_eq(&format!("{:?}", words));
    }
1150
    /// Snapshot tests comparing `cut` with and without HMM: with HMM enabled,
    /// adjacent single characters may be joined into a word.
    #[test]
    fn test_cut_with_hmm() {
        let jieba = Jieba::new();
        let tokens = jieba.cut("我们中出了一个叛徒", false);
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
        expect![[r#"["我们", "中", "出", "了", "一个", "叛徒"]"#]].assert_eq(&format!("{:?}", words));
        let tokens = jieba.cut("我们中出了一个叛徒", true);
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
        expect![[r#"["我们", "中出", "了", "一个", "叛徒"]"#]].assert_eq(&format!("{:?}", words));
        let tokens = jieba.cut("我们中出了一个叛徒👪", true);
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
        expect![[r#"["我们", "中出", "了", "一个", "叛徒", "👪"]"#]].assert_eq(&format!("{:?}", words));

        let tokens = jieba.cut("我来到北京清华大学", true);
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
        expect![[r#"["我", "来到", "北京", "清华大学"]"#]].assert_eq(&format!("{:?}", words));

        let tokens = jieba.cut("他来到了网易杭研大厦", true);
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
        expect![[r#"["他", "来到", "了", "网易", "杭研", "大厦"]"#]].assert_eq(&format!("{:?}", words));
    }
1172
    /// Smoke test: cutting every line of the bundled example text with HMM
    /// enabled must not panic. The results themselves are not checked.
    #[test]
    fn test_cut_weicheng() {
        static WEICHENG_TXT: &str = include_str!("../../examples/weicheng/src/weicheng.txt");
        let jieba = Jieba::new();
        for line in WEICHENG_TXT.split('\n') {
            let _ = jieba.cut(line, true);
        }
    }
1181
    /// Snapshot tests for `cut_for_search`: shorter dictionary words found
    /// inside a longer word are listed before the word itself.
    #[test]
    fn test_cut_for_search() {
        let jieba = Jieba::new();
        let tokens = jieba.cut_for_search("南京市长江大桥", true);
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
        expect![[r#"["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]"#]].assert_eq(&format!("{:?}", words));

        let tokens = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true);
        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();

        expect![[r#"["小明", "硕士", "毕业", "于", "中国", "科学", "学院", "科学院", "中国科学院", "计算", "计算所", ",", "后", "在", "日本", "京都", "大学", "日本京都大学", "深造"]"#]]
            .assert_eq(&format!("{:?}", words));
    }
1197
1198 #[test]
1199 fn test_tag() {
1200 let jieba = Jieba::new();
1201 let tags = jieba.tag(
1202 "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。",
1203 true,
1204 );
1205 expect![[r#"
1206 [
1207 Tag {
1208 word: "我",
1209 tag: "r",
1210 start: 0,
1211 end: 1,
1212 byte_start: 0,
1213 byte_end: 3,
1214 },
1215 Tag {
1216 word: "是",
1217 tag: "v",
1218 start: 1,
1219 end: 2,
1220 byte_start: 3,
1221 byte_end: 6,
1222 },
1223 Tag {
1224 word: "拖拉机",
1225 tag: "n",
1226 start: 2,
1227 end: 5,
1228 byte_start: 6,
1229 byte_end: 15,
1230 },
1231 Tag {
1232 word: "学院",
1233 tag: "n",
1234 start: 5,
1235 end: 7,
1236 byte_start: 15,
1237 byte_end: 21,
1238 },
1239 Tag {
1240 word: "手扶拖拉机",
1241 tag: "n",
1242 start: 7,
1243 end: 12,
1244 byte_start: 21,
1245 byte_end: 36,
1246 },
1247 Tag {
1248 word: "专业",
1249 tag: "n",
1250 start: 12,
1251 end: 14,
1252 byte_start: 36,
1253 byte_end: 42,
1254 },
1255 Tag {
1256 word: "的",
1257 tag: "uj",
1258 start: 14,
1259 end: 15,
1260 byte_start: 42,
1261 byte_end: 45,
1262 },
1263 Tag {
1264 word: "。",
1265 tag: "x",
1266 start: 15,
1267 end: 16,
1268 byte_start: 45,
1269 byte_end: 48,
1270 },
1271 Tag {
1272 word: "不用",
1273 tag: "v",
1274 start: 16,
1275 end: 18,
1276 byte_start: 48,
1277 byte_end: 54,
1278 },
1279 Tag {
1280 word: "多久",
1281 tag: "m",
1282 start: 18,
1283 end: 20,
1284 byte_start: 54,
1285 byte_end: 60,
1286 },
1287 Tag {
1288 word: ",",
1289 tag: "x",
1290 start: 20,
1291 end: 21,
1292 byte_start: 60,
1293 byte_end: 63,
1294 },
1295 Tag {
1296 word: "我",
1297 tag: "r",
1298 start: 21,
1299 end: 22,
1300 byte_start: 63,
1301 byte_end: 66,
1302 },
1303 Tag {
1304 word: "就",
1305 tag: "d",
1306 start: 22,
1307 end: 23,
1308 byte_start: 66,
1309 byte_end: 69,
1310 },
1311 Tag {
1312 word: "会",
1313 tag: "v",
1314 start: 23,
1315 end: 24,
1316 byte_start: 69,
1317 byte_end: 72,
1318 },
1319 Tag {
1320 word: "升职",
1321 tag: "v",
1322 start: 24,
1323 end: 26,
1324 byte_start: 72,
1325 byte_end: 78,
1326 },
1327 Tag {
1328 word: "加薪",
1329 tag: "nr",
1330 start: 26,
1331 end: 28,
1332 byte_start: 78,
1333 byte_end: 84,
1334 },
1335 Tag {
1336 word: ",",
1337 tag: "x",
1338 start: 28,
1339 end: 29,
1340 byte_start: 84,
1341 byte_end: 87,
1342 },
1343 Tag {
1344 word: "当上",
1345 tag: "t",
1346 start: 29,
1347 end: 31,
1348 byte_start: 87,
1349 byte_end: 93,
1350 },
1351 Tag {
1352 word: "CEO",
1353 tag: "eng",
1354 start: 31,
1355 end: 34,
1356 byte_start: 93,
1357 byte_end: 96,
1358 },
1359 Tag {
1360 word: ",",
1361 tag: "x",
1362 start: 34,
1363 end: 35,
1364 byte_start: 96,
1365 byte_end: 99,
1366 },
1367 Tag {
1368 word: "走上",
1369 tag: "v",
1370 start: 35,
1371 end: 37,
1372 byte_start: 99,
1373 byte_end: 105,
1374 },
1375 Tag {
1376 word: "人生",
1377 tag: "n",
1378 start: 37,
1379 end: 39,
1380 byte_start: 105,
1381 byte_end: 111,
1382 },
1383 Tag {
1384 word: "巅峰",
1385 tag: "n",
1386 start: 39,
1387 end: 41,
1388 byte_start: 111,
1389 byte_end: 117,
1390 },
1391 Tag {
1392 word: "。",
1393 tag: "x",
1394 start: 41,
1395 end: 42,
1396 byte_start: 117,
1397 byte_end: 120,
1398 },
1399 ]"#]]
1400 .assert_eq(&format!("{:#?}", tags));
1401
1402 let tags = jieba.tag("今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。", true);
1403 expect![[r#"
1404 [
1405 Tag {
1406 word: "今天",
1407 tag: "t",
1408 start: 0,
1409 end: 2,
1410 byte_start: 0,
1411 byte_end: 6,
1412 },
1413 Tag {
1414 word: "纽约",
1415 tag: "ns",
1416 start: 2,
1417 end: 4,
1418 byte_start: 6,
1419 byte_end: 12,
1420 },
1421 Tag {
1422 word: "的",
1423 tag: "uj",
1424 start: 4,
1425 end: 5,
1426 byte_start: 12,
1427 byte_end: 15,
1428 },
1429 Tag {
1430 word: "天气",
1431 tag: "n",
1432 start: 5,
1433 end: 7,
1434 byte_start: 15,
1435 byte_end: 21,
1436 },
1437 Tag {
1438 word: "真好",
1439 tag: "d",
1440 start: 7,
1441 end: 9,
1442 byte_start: 21,
1443 byte_end: 27,
1444 },
1445 Tag {
1446 word: "啊",
1447 tag: "zg",
1448 start: 9,
1449 end: 10,
1450 byte_start: 27,
1451 byte_end: 30,
1452 },
1453 Tag {
1454 word: ",",
1455 tag: "x",
1456 start: 10,
1457 end: 11,
1458 byte_start: 30,
1459 byte_end: 33,
1460 },
1461 Tag {
1462 word: "京华",
1463 tag: "nz",
1464 start: 11,
1465 end: 13,
1466 byte_start: 33,
1467 byte_end: 39,
1468 },
1469 Tag {
1470 word: "大酒店",
1471 tag: "n",
1472 start: 13,
1473 end: 16,
1474 byte_start: 39,
1475 byte_end: 48,
1476 },
1477 Tag {
1478 word: "的",
1479 tag: "uj",
1480 start: 16,
1481 end: 17,
1482 byte_start: 48,
1483 byte_end: 51,
1484 },
1485 Tag {
1486 word: "张尧",
1487 tag: "nr",
1488 start: 17,
1489 end: 19,
1490 byte_start: 51,
1491 byte_end: 57,
1492 },
1493 Tag {
1494 word: "经理",
1495 tag: "n",
1496 start: 19,
1497 end: 21,
1498 byte_start: 57,
1499 byte_end: 63,
1500 },
1501 Tag {
1502 word: "吃",
1503 tag: "v",
1504 start: 21,
1505 end: 22,
1506 byte_start: 63,
1507 byte_end: 66,
1508 },
1509 Tag {
1510 word: "了",
1511 tag: "ul",
1512 start: 22,
1513 end: 23,
1514 byte_start: 66,
1515 byte_end: 69,
1516 },
1517 Tag {
1518 word: "一只",
1519 tag: "m",
1520 start: 23,
1521 end: 25,
1522 byte_start: 69,
1523 byte_end: 75,
1524 },
1525 Tag {
1526 word: "北京烤鸭",
1527 tag: "n",
1528 start: 25,
1529 end: 29,
1530 byte_start: 75,
1531 byte_end: 87,
1532 },
1533 Tag {
1534 word: "。",
1535 tag: "x",
1536 start: 29,
1537 end: 30,
1538 byte_start: 87,
1539 byte_end: 90,
1540 },
1541 ]"#]]
1542 .assert_eq(&format!("{:#?}", tags));
1543 }
1544
1545 #[test]
1546 fn test_tokenize() {
1547 let jieba = Jieba::new();
1548 let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Default, false);
1549 expect![[r#"
1550 [
1551 Token {
1552 word: "南京市",
1553 start: 0,
1554 end: 3,
1555 byte_start: 0,
1556 byte_end: 9,
1557 },
1558 Token {
1559 word: "长江大桥",
1560 start: 3,
1561 end: 7,
1562 byte_start: 9,
1563 byte_end: 21,
1564 },
1565 ]"#]]
1566 .assert_eq(&format!("{:#?}", tokens));
1567
1568 let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Search, false);
1569 expect![[r#"
1570 [
1571 Token {
1572 word: "南京",
1573 start: 0,
1574 end: 2,
1575 byte_start: 0,
1576 byte_end: 6,
1577 },
1578 Token {
1579 word: "京市",
1580 start: 1,
1581 end: 3,
1582 byte_start: 3,
1583 byte_end: 9,
1584 },
1585 Token {
1586 word: "南京市",
1587 start: 0,
1588 end: 3,
1589 byte_start: 0,
1590 byte_end: 9,
1591 },
1592 Token {
1593 word: "长江",
1594 start: 3,
1595 end: 5,
1596 byte_start: 9,
1597 byte_end: 15,
1598 },
1599 Token {
1600 word: "大桥",
1601 start: 5,
1602 end: 7,
1603 byte_start: 15,
1604 byte_end: 21,
1605 },
1606 Token {
1607 word: "长江大桥",
1608 start: 3,
1609 end: 7,
1610 byte_start: 9,
1611 byte_end: 21,
1612 },
1613 ]"#]]
1614 .assert_eq(&format!("{:#?}", tokens));
1615
1616 let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
1617 expect![[r#"
1618 [
1619 Token {
1620 word: "我们",
1621 start: 0,
1622 end: 2,
1623 byte_start: 0,
1624 byte_end: 6,
1625 },
1626 Token {
1627 word: "中",
1628 start: 2,
1629 end: 3,
1630 byte_start: 6,
1631 byte_end: 9,
1632 },
1633 Token {
1634 word: "出",
1635 start: 3,
1636 end: 4,
1637 byte_start: 9,
1638 byte_end: 12,
1639 },
1640 Token {
1641 word: "了",
1642 start: 4,
1643 end: 5,
1644 byte_start: 12,
1645 byte_end: 15,
1646 },
1647 Token {
1648 word: "一个",
1649 start: 5,
1650 end: 7,
1651 byte_start: 15,
1652 byte_end: 21,
1653 },
1654 Token {
1655 word: "叛徒",
1656 start: 7,
1657 end: 9,
1658 byte_start: 21,
1659 byte_end: 27,
1660 },
1661 ]"#]]
1662 .assert_eq(&format!("{:#?}", tokens));
1663 let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
1664 expect![[r#"
1665 [
1666 Token {
1667 word: "我们",
1668 start: 0,
1669 end: 2,
1670 byte_start: 0,
1671 byte_end: 6,
1672 },
1673 Token {
1674 word: "中出",
1675 start: 2,
1676 end: 4,
1677 byte_start: 6,
1678 byte_end: 12,
1679 },
1680 Token {
1681 word: "了",
1682 start: 4,
1683 end: 5,
1684 byte_start: 12,
1685 byte_end: 15,
1686 },
1687 Token {
1688 word: "一个",
1689 start: 5,
1690 end: 7,
1691 byte_start: 15,
1692 byte_end: 21,
1693 },
1694 Token {
1695 word: "叛徒",
1696 start: 7,
1697 end: 9,
1698 byte_start: 21,
1699 byte_end: 27,
1700 },
1701 ]"#]]
1702 .assert_eq(&format!("{:#?}", tokens));
1703
1704 let tokens = jieba.tokenize("永和服装饰品有限公司", TokenizeMode::Default, true);
1705 expect![[r#"
1706 [
1707 Token {
1708 word: "永和",
1709 start: 0,
1710 end: 2,
1711 byte_start: 0,
1712 byte_end: 6,
1713 },
1714 Token {
1715 word: "服装",
1716 start: 2,
1717 end: 4,
1718 byte_start: 6,
1719 byte_end: 12,
1720 },
1721 Token {
1722 word: "饰品",
1723 start: 4,
1724 end: 6,
1725 byte_start: 12,
1726 byte_end: 18,
1727 },
1728 Token {
1729 word: "有限公司",
1730 start: 6,
1731 end: 10,
1732 byte_start: 18,
1733 byte_end: 30,
1734 },
1735 ]"#]]
1736 .assert_eq(&format!("{:#?}", tokens));
1737 }
1738
1739 #[test]
1740 fn test_userdict() {
1741 let mut jieba = Jieba::new();
1742 let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
1743 expect![[r#"
1744 [
1745 Token {
1746 word: "我们",
1747 start: 0,
1748 end: 2,
1749 byte_start: 0,
1750 byte_end: 6,
1751 },
1752 Token {
1753 word: "中",
1754 start: 2,
1755 end: 3,
1756 byte_start: 6,
1757 byte_end: 9,
1758 },
1759 Token {
1760 word: "出",
1761 start: 3,
1762 end: 4,
1763 byte_start: 9,
1764 byte_end: 12,
1765 },
1766 Token {
1767 word: "了",
1768 start: 4,
1769 end: 5,
1770 byte_start: 12,
1771 byte_end: 15,
1772 },
1773 Token {
1774 word: "一个",
1775 start: 5,
1776 end: 7,
1777 byte_start: 15,
1778 byte_end: 21,
1779 },
1780 Token {
1781 word: "叛徒",
1782 start: 7,
1783 end: 9,
1784 byte_start: 21,
1785 byte_end: 27,
1786 },
1787 ]"#]]
1788 .assert_eq(&format!("{:#?}", tokens));
1789 let userdict = "中出 10000";
1790 jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1791 let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
1792 expect![[r#"
1793 [
1794 Token {
1795 word: "我们",
1796 start: 0,
1797 end: 2,
1798 byte_start: 0,
1799 byte_end: 6,
1800 },
1801 Token {
1802 word: "中出",
1803 start: 2,
1804 end: 4,
1805 byte_start: 6,
1806 byte_end: 12,
1807 },
1808 Token {
1809 word: "了",
1810 start: 4,
1811 end: 5,
1812 byte_start: 12,
1813 byte_end: 15,
1814 },
1815 Token {
1816 word: "一个",
1817 start: 5,
1818 end: 7,
1819 byte_start: 15,
1820 byte_end: 21,
1821 },
1822 Token {
1823 word: "叛徒",
1824 start: 7,
1825 end: 9,
1826 byte_start: 21,
1827 byte_end: 27,
1828 },
1829 ]"#]]
1830 .assert_eq(&format!("{:#?}", tokens));
1831 }
1832
1833 #[test]
1834 fn test_userdict_hmm() {
1835 let mut jieba = Jieba::new();
1836 let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
1837 expect![[r#"
1838 [
1839 Token {
1840 word: "我们",
1841 start: 0,
1842 end: 2,
1843 byte_start: 0,
1844 byte_end: 6,
1845 },
1846 Token {
1847 word: "中出",
1848 start: 2,
1849 end: 4,
1850 byte_start: 6,
1851 byte_end: 12,
1852 },
1853 Token {
1854 word: "了",
1855 start: 4,
1856 end: 5,
1857 byte_start: 12,
1858 byte_end: 15,
1859 },
1860 Token {
1861 word: "一个",
1862 start: 5,
1863 end: 7,
1864 byte_start: 15,
1865 byte_end: 21,
1866 },
1867 Token {
1868 word: "叛徒",
1869 start: 7,
1870 end: 9,
1871 byte_start: 21,
1872 byte_end: 27,
1873 },
1874 ]"#]]
1875 .assert_eq(&format!("{:#?}", tokens));
1876 let userdict = "出了 10000";
1877 jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1878 let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
1879 expect![[r#"
1880 [
1881 Token {
1882 word: "我们",
1883 start: 0,
1884 end: 2,
1885 byte_start: 0,
1886 byte_end: 6,
1887 },
1888 Token {
1889 word: "中",
1890 start: 2,
1891 end: 3,
1892 byte_start: 6,
1893 byte_end: 9,
1894 },
1895 Token {
1896 word: "出了",
1897 start: 3,
1898 end: 5,
1899 byte_start: 9,
1900 byte_end: 15,
1901 },
1902 Token {
1903 word: "一个",
1904 start: 5,
1905 end: 7,
1906 byte_start: 15,
1907 byte_end: 21,
1908 },
1909 Token {
1910 word: "叛徒",
1911 start: 7,
1912 end: 9,
1913 byte_start: 21,
1914 byte_end: 27,
1915 },
1916 ]"#]]
1917 .assert_eq(&format!("{:#?}", tokens));
1918 expect![[r#"
1919 [
1920 Token {
1921 word: "我们",
1922 start: 0,
1923 end: 2,
1924 byte_start: 0,
1925 byte_end: 6,
1926 },
1927 Token {
1928 word: "中",
1929 start: 2,
1930 end: 3,
1931 byte_start: 6,
1932 byte_end: 9,
1933 },
1934 Token {
1935 word: "出了",
1936 start: 3,
1937 end: 5,
1938 byte_start: 9,
1939 byte_end: 15,
1940 },
1941 Token {
1942 word: "一个",
1943 start: 5,
1944 end: 7,
1945 byte_start: 15,
1946 byte_end: 21,
1947 },
1948 Token {
1949 word: "叛徒",
1950 start: 7,
1951 end: 9,
1952 byte_start: 21,
1953 byte_end: 27,
1954 },
1955 ]"#]]
1956 .assert_eq(&format!("{:#?}", tokens));
1957 }
1958
1959 #[test]
1960 fn test_userdict_error() {
1961 let mut jieba = Jieba::empty();
1962 let userdict = "出了 not_a_int";
1963 let ret = jieba.load_dict(&mut BufReader::new(userdict.as_bytes()));
1964 assert!(ret.is_err());
1965 }
1966
1967 #[test]
1968 fn test_suggest_freq() {
1969 let mut jieba = Jieba::new();
1972 assert_eq!(jieba.suggest_freq("中出"), 348);
1974 assert_eq!(jieba.suggest_freq("出了"), 1263);
1975
1976 let userdict = "中出 300";
1978 jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1979 assert_eq!(jieba.suggest_freq("中出"), 348);
1981
1982 let userdict = "中出 500";
1983 jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1984 assert_eq!(jieba.suggest_freq("中出"), 500)
1986 }
1987
1988 #[test]
1989 fn test_custom_lower_freq() {
1990 let mut jieba = Jieba::new();
1991
1992 jieba.add_word("测试", Some(2445), None);
1993 jieba.add_word("测试", Some(10), None);
1994 let tokens = jieba.cut("测试", false);
1995 let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1996 expect![[r#"["测试"]"#]].assert_eq(&format!("{:?}", words));
1997 }
1998
1999 #[test]
2000 fn test_cut_dag_no_hmm_against_string_with_sip() {
2001 let mut jieba = Jieba::empty();
2002
2003 jieba.add_word("䶴䶵𦡦", Some(1000), None);
2005 jieba.add_word("讥䶯䶰䶱䶲䶳", Some(1000), None);
2006
2007 let tokens = jieba.cut("讥䶯䶰䶱䶲䶳䶴䶵𦡦", false);
2008 let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
2009 expect![[r#"["讥䶯䶰䶱䶲䶳", "䶴䶵𦡦"]"#]].assert_eq(&format!("{:?}", words));
2010 }
2011
2012 #[test]
2013 fn test_add_custom_word_with_underscrore() {
2014 let mut jieba = Jieba::empty();
2015 jieba.add_word("田-女士", Some(42), Some("n"));
2016 let tokens = jieba.cut("市民田-女士急匆匆", false);
2017 let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
2018 expect![[r#"["市", "民", "田-女士", "急", "匆", "匆"]"#]].assert_eq(&format!("{:?}", words));
2019 }
2020
2021 #[test]
2022 fn test_cut_with_custom_hmm_model() {
2023 use crate::hmm::HmmModel;
2024
2025 let hmm_data = include_str!("../../jieba-macros/src/hmm.model");
2027 let mut reader = BufReader::new(hmm_data.as_bytes());
2028 let model = HmmModel::load(&mut reader).unwrap();
2029
2030 let mut jieba_custom = Jieba::new();
2031 jieba_custom.set_hmm_model(model);
2032 let jieba_builtin = Jieba::new();
2033
2034 let sentences = [
2036 "我们中出了一个叛徒",
2037 "小明硕士毕业于中国科学院计算所后在日本京都大学深造",
2038 "他来到了网易杭研大厦",
2039 "我来到北京清华大学",
2040 ];
2041 for sentence in sentences {
2042 let builtin_words = jieba_builtin.cut(sentence, true);
2043 let custom_words = jieba_custom.cut(sentence, true);
2044 assert_eq!(custom_words, builtin_words, "mismatch for: {sentence}");
2045 }
2046 }
2047}