jieba_rs/
lib.rs

1//! The Jieba Chinese Word Segmentation Implemented in Rust
2//!
3//! ## Installation
4//!
5//! Add it to your `Cargo.toml`:
6//!
7//! ```toml
8//! [dependencies]
9//! jieba-rs = "0.9"
10//! ```
11//!
//! Then you are good to go. If you are using Rust 2015, you also have to add `extern crate jieba_rs;` to your crate root.
13//!
14//! ## Example
15//!
16//! ```rust
17//! use jieba_rs::Jieba;
18//!
19//! let jieba = Jieba::new();
20//! let words = jieba.cut("我们中出了一个叛徒", false);
21//! let words: Vec<&str> = words.iter().map(|t| t.word).collect();
22//! assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
23//! ```
24//!
25//! ```rust
26//! # #[cfg(feature = "tfidf")] {
27//! use jieba_rs::Jieba;
28//! use jieba_rs::{TfIdf, KeywordExtract};
29//!
30//! fn main() {
31//!     let jieba = Jieba::new();
32//!     let keyword_extractor = TfIdf::default();
33//!     let top_k = keyword_extractor.extract_keywords(
34//!         &jieba,
35//!         "今天纽约的天气真好啊，京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好，昨天纽约的天气也不好，北京烤鸭真好吃",
36//!         3,
37//!         vec![],
38//!     );
39//!     println!("{:?}", top_k);
40//! }
41//! # }
42//! ```
43//!
44//! ```rust
45//! # #[cfg(feature = "textrank")] {
46//! use jieba_rs::Jieba;
47//! use jieba_rs::{TextRank, KeywordExtract};
48//!
49//! fn main() {
50//!     let jieba = Jieba::new();
51//!     let keyword_extractor = TextRank::default();
52//!     let top_k = keyword_extractor.extract_keywords(
53//!         &jieba,
54//!         "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。",
55//!         6,
56//!         vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")],
57//!     );
58//!     println!("{:?}", top_k);
59//! }
60//! # }
61//! ```
62//!
63//! ## Enabling Additional Features
64//!
//! * `default-dict` feature enables the embedded dictionary; this feature is enabled by default
66//! * `tfidf` feature enables TF-IDF keywords extractor
67//! * `textrank` feature enables TextRank keywords extractor
68//!
69//! ```toml
70//! [dependencies]
//! jieba-rs = { version = "0.9", features = ["tfidf", "textrank"] }
72//! ```
73//!
74
75use std::cmp::Ordering;
76use std::collections::HashMap;
77use std::fmt;
78use std::io::BufRead;
79
80use cedarwood::Cedar;
81
82pub(crate) type FxHashMap<K, V> = HashMap<K, V, rustc_hash::FxBuildHasher>;
83
84pub use crate::errors::Error;
85pub use crate::hmm::HmmModel;
86#[cfg(feature = "textrank")]
87pub use crate::keywords::textrank::TextRank;
88#[cfg(feature = "tfidf")]
89pub use crate::keywords::tfidf::TfIdf;
90#[cfg(any(feature = "tfidf", feature = "textrank"))]
91pub use crate::keywords::{DEFAULT_STOP_WORDS, Keyword, KeywordExtract, KeywordExtractConfig};
92
93mod errors;
94mod hmm;
95#[cfg(any(feature = "tfidf", feature = "textrank"))]
96mod keywords;
97mod posseg;
98mod sparse_dag;
99
// Text of the bundled dictionary, taken from `src/data/dict.txt`.
// Only compiled in when the `default-dict` feature is enabled; it is read by
// `Jieba::load_default_dict`, which feeds its bytes to `Jieba::load_dict`.
#[cfg(feature = "default-dict")]
include_flate::flate!(static DEFAULT_DICT: str from "src/data/dict.txt");
102
103use sparse_dag::StaticSparseDAG;
104
// Per-thread `HmmContext`, created with `HmmContext::default()` on first use.
// `Jieba::cut_internal` borrows it mutably (via the `RefCell`) for each matched
// block when HMM segmentation is enabled, so the same context is reused across calls
// on a thread instead of being rebuilt every time.
thread_local! {
    static HMM_CONTEXT: std::cell::RefCell<hmm::HmmContext> = std::cell::RefCell::new(hmm::HmmContext::default());
}
108
/// Returns `true` when `c` falls inside one of the CJK ideograph blocks
/// recognised by the tokenizer.
#[inline]
fn is_cjk(c: char) -> bool {
    // Inclusive range test against the character being classified.
    let within = |first: char, last: char| first <= c && c <= last;

    within('\u{3400}', '\u{4DBF}') // CJK Unified Ideographs Extension A
        || within('\u{4E00}', '\u{9FFF}') // CJK Unified Ideographs
        || within('\u{F900}', '\u{FAFF}') // CJK Compatibility Ideographs
        || within('\u{20000}', '\u{2A6DF}') // Extension B
        || within('\u{2A700}', '\u{2B73F}') // Extension C
        || within('\u{2B740}', '\u{2B81F}') // Extension D
        || within('\u{2B820}', '\u{2CEAF}') // Extension E
        || within('\u{2CEB0}', '\u{2EBEF}') // Extension F
        || within('\u{2F800}', '\u{2FA1F}') // CJK Compatibility Ideographs Supplement
}
124
125/// RE_HAN_DEFAULT character class: CJK + ASCII alphanumeric + `+#&._%\-`
126#[inline]
127fn is_han_default(c: char) -> bool {
128    is_cjk(c) || c.is_ascii_alphanumeric() || matches!(c, '+' | '#' | '&' | '.' | '_' | '%' | '-')
129}
130
/// RE_HAN_CUT_ALL character class: CJK only
///
/// Thin alias over `is_cjk`, used as the block classifier by `cut_all_toplevel`
/// so that only ideograph runs are segmented against the dictionary.
#[inline]
fn is_han_cut_all(c: char) -> bool {
    is_cjk(c)
}
136
/// Returns `true` for every character that is *not* an ASCII letter, an ASCII
/// digit, `+`, `#`, or a line feed (the complement of `[a-zA-Z0-9+#\n]`).
#[inline]
fn is_skip_cut_all(c: char) -> bool {
    let kept = c.is_ascii_alphanumeric() || matches!(c, '+' | '#' | '\n');
    !kept
}
142
143#[inline]
144fn char_count(s: &str) -> usize {
145    if s.len() >= 16 {
146        bytecount::num_chars(s.as_bytes())
147    } else {
148        s.as_bytes().iter().filter(|&&b| (b as i8) >= -0x40).count()
149    }
150}
151
/// Iterator that partitions a string into alternating runs according to a
/// per-character predicate.
///
/// A *matched* run is a maximal sequence of characters for which `classify`
/// returns `true`; an *unmatched* run is the maximal sequence between two matched runs.
pub(crate) struct SplitByCharacterClass<'t, F> {
    /// The complete input being split.
    text: &'t str,
    /// Byte offset into `text` of the first character not yet yielded.
    pos: usize,
    /// Predicate deciding whether a character belongs to a matched run.
    classify: F,
}

impl<'t, F> SplitByCharacterClass<'t, F>
where
    F: Fn(char) -> bool,
{
    /// Create a splitter positioned at the start of `text`.
    #[inline]
    fn new(text: &'t str, classify: F) -> Self {
        Self {
            classify,
            pos: 0,
            text,
        }
    }
}
167
168impl<'t, F: Fn(char) -> bool> Iterator for SplitByCharacterClass<'t, F> {
169    type Item = SplitState<'t>;
170
171    fn next(&mut self) -> Option<SplitState<'t>> {
172        if self.pos >= self.text.len() {
173            return None;
174        }
175
176        let remaining = &self.text[self.pos..];
177        let first_char = remaining.chars().next().unwrap();
178
179        if (self.classify)(first_char) {
180            // Matched run: consume while classify is true
181            let start = self.pos;
182            let mut end = self.pos + first_char.len_utf8();
183            for c in remaining[first_char.len_utf8()..].chars() {
184                if (self.classify)(c) {
185                    end += c.len_utf8();
186                } else {
187                    break;
188                }
189            }
190            self.pos = end;
191            Some(SplitState::Matched(&self.text[start..end]))
192        } else {
193            // Unmatched run: consume while classify is false
194            let start = self.pos;
195            let mut end = self.pos + first_char.len_utf8();
196            for c in remaining[first_char.len_utf8()..].chars() {
197                if (self.classify)(c) {
198                    break;
199                }
200                end += c.len_utf8();
201            }
202            self.pos = end;
203            Some(SplitState::Unmatched(&self.text[start..end]))
204        }
205    }
206}
207
/// One run produced by `SplitByCharacterClass`, tagged with whether its
/// characters satisfied the classifier.
#[derive(Debug)]
pub(crate) enum SplitState<'t> {
    /// Run of characters for which the classifier returned `false`.
    Unmatched(&'t str),
    /// Run of characters for which the classifier returned `true`.
    Matched(&'t str),
}

impl<'t> SplitState<'t> {
    /// The text of the run, regardless of which variant it is.
    #[inline]
    fn as_str(&self) -> &'t str {
        match *self {
            SplitState::Unmatched(text) | SplitState::Matched(text) => text,
        }
    }

    /// `true` for [`SplitState::Matched`], `false` for [`SplitState::Unmatched`].
    #[inline]
    pub fn is_matched(&self) -> bool {
        match self {
            SplitState::Matched(_) => true,
            SplitState::Unmatched(_) => false,
        }
    }
}
228
/// Tokenization mode selector.
///
/// NOTE(review): the code that consumes this enum is not in the visible part of the
/// file; `Search` presumably corresponds to [`Jieba::cut_for_search`] — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizeMode {
    /// Default mode
    Default,
    /// Search mode
    Search,
}
236
/// A Token
///
/// A segmented word borrowed from the input, together with its location expressed
/// both in characters and in bytes. Both ranges are half-open: `end` / `byte_end`
/// point one past the last character / byte of the word.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token<'a> {
    /// Word of the token
    pub word: &'a str,
    /// Unicode start position of the token (counted in `char`s from the start of the input)
    pub start: usize,
    /// Unicode end position of the token (exclusive, counted in `char`s)
    pub end: usize,
    /// Byte start position of the token in the original input
    pub byte_start: usize,
    /// Byte end position of the token in the original input (exclusive)
    pub byte_end: usize,
}
251
/// A tagged word
///
/// Pairs a word with its tag and its location in the original input.
/// NOTE(review): values of this type are built outside the visible part of this file;
/// the end positions are presumably exclusive, as for [`Token`] — confirm.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Tag<'a> {
    /// Word
    pub word: &'a str,
    /// Word tag
    pub tag: &'a str,
    /// Unicode start position of the word in the original input
    pub start: usize,
    /// Unicode end position of the word in the original input
    pub end: usize,
    /// Byte start position of the word in the original input
    pub byte_start: usize,
    /// Byte end position of the word in the original input
    pub byte_end: usize,
}
268
/// A single dictionary entry: frequency, cached log-frequency and tag.
#[derive(Debug, Clone)]
struct Record {
    /// Raw frequency of the word.
    freq: usize,
    /// Natural logarithm of `freq`, cached so scoring does not recompute it.
    log_freq: f64,
    /// Tag stored alongside the word (empty when none was supplied).
    tag: Box<str>,
}

impl Record {
    /// Build a record for `freq` and `tag`, deriving the cached logarithm.
    #[inline(always)]
    fn new(freq: usize, tag: Box<str>) -> Self {
        let mut record = Record {
            freq: 0,
            log_freq: 0.0,
            tag,
        };
        record.set_freq(freq);
        record
    }

    /// Replace the frequency, keeping `log_freq` in sync with it.
    #[inline]
    fn set_freq(&mut self, freq: usize) {
        let as_float = freq as f64;
        self.log_freq = as_float.ln();
        self.freq = freq;
    }
}
292
/// Jieba segmentation
///
/// Holds the dictionary (a `Cedar` trie mapping each word to an index into `records`),
/// the running sum of all word frequencies, and an optional custom HMM model.
#[derive(Clone)]
pub struct Jieba {
    // Per-word data; a word's id stored in `cedar` is its index in this vector.
    records: Vec<Record>,
    // Trie from word to word id, used for exact-match and common-prefix lookups.
    cedar: Cedar,
    // Sum of the frequencies of all records; its logarithm normalises route scores.
    total: usize,
    // Custom HMM model; when `None`, `hmm_cut` falls back to `hmm::builtin_hmm()`.
    hmm_model: Option<HmmModel>,
}
301
302impl fmt::Debug for Jieba {
303    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
304        f.debug_struct("Jieba")
305            .field("records_len", &self.records.len())
306            .field("total_freq", &self.total)
307            .finish()
308    }
309}
310
#[cfg(feature = "default-dict")]
impl Default for Jieba {
    /// Same as [`Jieba::new`]: an instance with the embedded dictionary loaded.
    fn default() -> Self {
        Self::new()
    }
}
317
318impl Jieba {
319    /// Create a new instance with empty dict
320    pub fn empty() -> Self {
321        Jieba {
322            records: Vec::new(),
323            cedar: Cedar::new(),
324            total: 0,
325            hmm_model: None,
326        }
327    }
328
329    /// Create a new instance with embed dict
330    ///
331    /// Requires `default-dict` feature to be enabled.
332    #[cfg(feature = "default-dict")]
333    pub fn new() -> Self {
334        let mut instance = Self::empty();
335        instance.load_default_dict();
336        instance
337    }
338
339    /// Create a new instance with dict
340    pub fn with_dict<R: BufRead>(dict: &mut R) -> Result<Self, Error> {
341        let mut instance = Self::empty();
342        instance.load_dict(dict)?;
343        Ok(instance)
344    }
345
346    /// Loads the default dictionary into the instance.
347    ///
348    /// This method reads the default dictionary from a predefined byte slice (`DEFAULT_DICT`)
349    /// and loads it into the current instance using the `load_dict` method.
350    ///
351    /// # Arguments
352    ///
353    /// * `&mut self` - Mutable reference to the current instance.
354    ///
355    /// Requires `default-dict` feature to be enabled.
356    ///
357    /// # Examples
358    ///
359    /// ```
360    /// use jieba_rs::Jieba;
361    ///
362    /// let mut instance = Jieba::empty();
363    /// instance.load_default_dict(); // Loads the default dictionary into the instance
364    /// assert!(instance.has_word("我们"), "The word '我们' should be in the dictionary after loading the default dictionary");
365    /// ```
366    #[cfg(feature = "default-dict")]
367    pub fn load_default_dict(&mut self) {
368        use std::io::BufReader;
369
370        let mut default_dict = BufReader::new(DEFAULT_DICT.as_bytes());
371        self.load_dict(&mut default_dict).unwrap();
372    }
373
    /// Set a custom HMM model for segmentation.
    ///
    /// When set, the custom model is used instead of the compile-time embedded model
    /// for HMM-based segmentation of out-of-vocabulary words.
    ///
    /// Any previously set model is replaced. Note that [`Jieba::clear`] only resets the
    /// dictionary and leaves the model in place.
    ///
    /// The model can be trained using `scripts/train_hmm.py`.
    ///
    /// ## Example
    ///
    /// ```no_run
    /// use std::io::BufReader;
    /// use std::fs::File;
    /// use jieba_rs::{Jieba, HmmModel};
    ///
    /// let mut jieba = Jieba::new();
    /// let mut f = BufReader::new(File::open("my_hmm.model").unwrap());
    /// let model = HmmModel::load(&mut f).unwrap();
    /// jieba.set_hmm_model(model);
    /// ```
    pub fn set_hmm_model(&mut self, model: HmmModel) {
        self.hmm_model = Some(model);
    }
396
397    /// Clears all data
398    ///
399    /// This method performs the following actions:
400    /// 1. Clears the `records` list, removing all entries.
401    /// 2. Resets `cedar` to a new instance of `Cedar`.
402    /// 3. Sets `total` to 0, resetting the count.
403    ///
404    /// # Arguments
405    ///
406    /// * `&mut self` - Mutable reference to the current instance.
407    ///
408    /// # Examples
409    ///
410    /// ```
411    /// use jieba_rs::Jieba;
412    ///
413    /// let mut instance = Jieba::new();
414    /// assert!(instance.has_word("我们"), "The word '我们' should be in the dictionary after loading the default dictionary");
415    /// instance.clear(); // clear all dict data
416    /// assert!(!instance.has_word("我们"), "The word '我们' should not be in the dictionary after clearing the dictionary");
417    /// ```
418    pub fn clear(&mut self) {
419        self.records.clear();
420        self.cedar = Cedar::new();
421        self.total = 0;
422    }
423
424    /// Add word to dict, return `freq`
425    ///
426    /// `freq`: if `None`, will be given by [suggest_freq](#method.suggest_freq)
427    ///
428    /// `tag`: if `None`, will be given `""`
429    pub fn add_word(&mut self, word: &str, freq: Option<usize>, tag: Option<&str>) -> usize {
430        if word.is_empty() {
431            return 0;
432        }
433        let freq = freq.unwrap_or_else(|| self.suggest_freq(word));
434        let tag = tag.unwrap_or("");
435
436        match self.cedar.exact_match_search(word) {
437            Some((word_id, _, _)) => {
438                let old_freq = self.records[word_id as usize].freq;
439                self.records[word_id as usize].set_freq(freq);
440
441                self.total += freq;
442                self.total -= old_freq;
443            }
444            None => {
445                let word_id = self.records.len() as i32;
446                self.records.push(Record::new(freq, tag.into()));
447
448                self.cedar.update(word, word_id);
449                self.total += freq;
450            }
451        };
452
453        freq
454    }
455
    /// Checks if a word exists in the dictionary.
    ///
    /// The lookup is delegated to `Cedar::exact_match_search` on the whole of `word`.
    ///
    /// # Arguments
    ///
    /// * `word` - The word to check.
    ///
    /// # Returns
    ///
    /// * `bool` - Whether the word exists in the dictionary.
    pub fn has_word(&self, word: &str) -> bool {
        self.cedar.exact_match_search(word).is_some()
    }
468
469    /// Loads a dictionary by adding entries to the existing dictionary rather than resetting it.
470    ///
471    /// This function reads from a `BufRead` source, parsing each line as a dictionary entry. Each entry
472    /// is expected to contain a word, its frequency, and optionally a tag.
473    ///
474    /// # Type Parameters
475    ///
476    /// * `R`: A type that implements the `BufRead` trait, used for reading lines from the dictionary.
477    ///
478    /// # Arguments
479    ///
480    /// * `dict` - A mutable reference to a `BufRead` source containing the dictionary entries.
481    ///
482    /// # Returns
483    ///
484    /// * `Result<(), Error>` - Returns `Ok(())` if the dictionary is successfully loaded; otherwise,
485    ///   returns an error describing what went wrong.
486    ///
487    /// # Errors
488    ///
489    /// This function will return an error if:
490    /// * There is an issue reading from the provided `BufRead` source.
491    /// * A line in the dictionary file contains invalid frequency data (not a valid integer).
492    pub fn load_dict<R: BufRead>(&mut self, dict: &mut R) -> Result<(), Error> {
493        let mut buf = String::new();
494        self.total = 0;
495
496        let mut line_no = 0;
497        while dict.read_line(&mut buf)? > 0 {
498            {
499                line_no += 1;
500                let mut iter = buf.split_whitespace();
501                if let Some(word) = iter.next() {
502                    let freq = iter
503                        .next()
504                        .map(|x| {
505                            x.parse::<usize>().map_err(|e| {
506                                Error::InvalidDictEntry(format!(
507                                    "line {line_no} `{buf}` frequency {x} is not a valid integer: {e}"
508                                ))
509                            })
510                        })
511                        .unwrap_or(Ok(0))?;
512                    let tag = iter.next().unwrap_or("");
513
514                    match self.cedar.exact_match_search(word) {
515                        Some((word_id, _, _)) => {
516                            self.records[word_id as usize].set_freq(freq);
517                        }
518                        None => {
519                            let word_id = self.records.len() as i32;
520                            self.records.push(Record::new(freq, tag.into()));
521                            self.cedar.update(word, word_id);
522                        }
523                    };
524                }
525            }
526            buf.clear();
527        }
528        self.total = self.records.iter().map(|n| n.freq).sum();
529
530        Ok(())
531    }
532
533    fn get_word_freq(&self, word: &str, default: usize) -> usize {
534        match self.cedar.exact_match_search(word) {
535            Some((word_id, _, _)) => self.records[word_id as usize].freq,
536            _ => default,
537        }
538    }
539
540    /// Suggest word frequency to force the characters in a word to be joined or split.
541    pub fn suggest_freq(&self, segment: &str) -> usize {
542        let logtotal = (self.total as f64).ln();
543        let logfreq = self.cut(segment, false).iter().fold(0f64, |freq, token| {
544            freq + (self.get_word_freq(token.word, 1) as f64).ln() - logtotal
545        });
546        std::cmp::max((logfreq + logtotal).exp() as usize + 1, self.get_word_freq(segment, 1))
547    }
548
549    #[allow(clippy::ptr_arg)]
550    fn calc(&self, sentence: &str, dag: &StaticSparseDAG, route: &mut Vec<(f64, usize)>) {
551        let str_len = sentence.len();
552
553        if str_len + 1 > route.len() {
554            route.resize(str_len + 1, (0.0, 0));
555        }
556
557        let logtotal = (self.total as f64).ln();
558        let log1 = 0.0f64 - logtotal; // ln(1) - logtotal, precomputed for freq=1 case
559        let mut prev_byte_start = str_len;
560        let curr = sentence.char_indices().map(|x| x.0).rev();
561        for byte_start in curr {
562            let pair = dag
563                .iter_edges(byte_start)
564                .map(|(byte_end, word_id)| {
565                    let log_freq = if word_id != sparse_dag::NO_MATCH {
566                        self.records[word_id as usize].log_freq
567                    } else {
568                        0.0 // ln(1)
569                    };
570
571                    (log_freq - logtotal + route[byte_end].0, byte_end)
572                })
573                .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal));
574
575            if let Some(p) = pair {
576                route[byte_start] = p;
577            } else {
578                let byte_end = prev_byte_start;
579                route[byte_start] = (log1 + route[byte_end].0, byte_end);
580            }
581
582            prev_byte_start = byte_start;
583        }
584    }
585
586    fn dag(&self, sentence: &str, dag: &mut StaticSparseDAG) {
587        for (byte_start, _) in sentence.char_indices() {
588            dag.start(byte_start);
589            let haystack = &sentence[byte_start..];
590
591            for (word_id, end_index) in self.cedar.common_prefix_iter(haystack) {
592                dag.insert(end_index + byte_start + 1, word_id);
593            }
594
595            dag.commit();
596        }
597    }
598
599    /// Emits `Token`s directly with unicode positions for cut_all,
600    /// avoiding the need for a separate byte-to-unicode lookup table.
601    fn cut_all_tokens<'a>(&self, block: &'a str, base: usize, block_unicode_start: usize, tokens: &mut Vec<Token<'a>>) {
602        let str_len = block.len();
603        let mut dag = StaticSparseDAG::with_size_hint(block.len());
604        self.dag(block, &mut dag);
605
606        let block_base = block.as_ptr() as usize;
607        let byte_offset_in_sentence = block_base - base;
608
609        for (unicode_idx, (byte_start, _)) in block.char_indices().enumerate() {
610            let unicode_start = block_unicode_start + unicode_idx;
611            for (byte_end, _) in dag.iter_edges(byte_start) {
612                let word = if byte_end == str_len {
613                    &block[byte_start..]
614                } else {
615                    &block[byte_start..byte_end]
616                };
617                let char_count = char_count(word);
618                let bs = byte_offset_in_sentence + byte_start;
619                tokens.push(Token {
620                    word,
621                    start: unicode_start,
622                    end: unicode_start + char_count,
623                    byte_start: bs,
624                    byte_end: bs + word.len(),
625                });
626            }
627        }
628    }
629
630    fn cut_dag_no_hmm<'a>(
631        &self,
632        sentence: &'a str,
633        words: &mut Vec<&'a str>,
634        route: &mut Vec<(f64, usize)>,
635        dag: &mut StaticSparseDAG,
636    ) {
637        self.dag(sentence, dag);
638        self.calc(sentence, dag, route);
639        let mut x = 0;
640        let mut left: Option<usize> = None;
641
642        while x < sentence.len() {
643            let y = route[x].1;
644            let l_str = &sentence[x..y];
645
646            if l_str.chars().nth(1).is_none() && l_str.as_bytes()[0].is_ascii_alphanumeric() {
647                if left.is_none() {
648                    left = Some(x);
649                }
650            } else {
651                if let Some(byte_start) = left {
652                    let word = &sentence[byte_start..x];
653                    words.push(word);
654                    left = None;
655                }
656
657                words.push(l_str);
658            }
659            x = y;
660        }
661
662        if let Some(byte_start) = left {
663            let word = &sentence[byte_start..];
664            words.push(word);
665        }
666
667        dag.clear();
668        route.clear();
669    }
670
671    #[inline]
672    fn hmm_cut<'a>(&self, word: &'a str, words: &mut Vec<&'a str>, hmm_context: &mut hmm::HmmContext) {
673        if let Some(ref model) = self.hmm_model {
674            hmm::cut_with_allocated_memory(word, words, model, hmm_context);
675        } else {
676            hmm::cut_with_allocated_memory(word, words, &hmm::builtin_hmm(), hmm_context);
677        }
678    }
679
680    #[allow(non_snake_case, clippy::too_many_arguments)]
681    fn cut_dag_hmm<'a>(
682        &self,
683        sentence: &'a str,
684        words: &mut Vec<&'a str>,
685        route: &mut Vec<(f64, usize)>,
686        dag: &mut StaticSparseDAG,
687        hmm_context: &mut hmm::HmmContext,
688    ) {
689        self.dag(sentence, dag);
690        self.calc(sentence, dag, route);
691        let mut x = 0;
692        let mut left: Option<usize> = None;
693
694        while x < sentence.len() {
695            let y = route[x].1;
696
697            if sentence[x..y].chars().nth(1).is_none() {
698                if left.is_none() {
699                    left = Some(x);
700                }
701            } else {
702                if let Some(byte_start) = left {
703                    let byte_end = x;
704                    let word = &sentence[byte_start..byte_end];
705                    if word.chars().nth(1).is_none() {
706                        words.push(word);
707                    } else if self.cedar.exact_match_search(word).is_none() {
708                        self.hmm_cut(word, words, hmm_context);
709                    } else {
710                        let mut word_indices = word.char_indices().map(|x| x.0).peekable();
711                        while let Some(byte_start) = word_indices.next() {
712                            if let Some(byte_end) = word_indices.peek() {
713                                words.push(&word[byte_start..*byte_end]);
714                            } else {
715                                words.push(&word[byte_start..]);
716                            }
717                        }
718                    }
719                    left = None;
720                }
721                let word = &sentence[x..y];
722                words.push(word);
723            }
724            x = y;
725        }
726
727        if let Some(byte_start) = left {
728            let word = &sentence[byte_start..];
729
730            if word.chars().nth(1).is_none() {
731                words.push(word);
732            } else if self.cedar.exact_match_search(word).is_none() {
733                self.hmm_cut(word, words, hmm_context);
734            } else {
735                let mut word_indices = word.char_indices().map(|x| x.0).peekable();
736                while let Some(byte_start) = word_indices.next() {
737                    if let Some(byte_end) = word_indices.peek() {
738                        words.push(&word[byte_start..*byte_end]);
739                    } else {
740                        words.push(&word[byte_start..]);
741                    }
742                }
743            }
744        }
745
746        dag.clear();
747        route.clear();
748    }
749
750    /// Create a Token with incrementally tracked unicode offset.
751    /// Returns the updated unicode_offset (past the end of this token).
752    #[inline]
753    fn make_token_incremental<'a>(word: &'a str, base: usize, unicode_offset: &mut usize) -> Token<'a> {
754        let ptr = word.as_ptr() as usize;
755        debug_assert!(ptr >= base, "word is not a subslice of sentence");
756        let byte_start = ptr - base;
757        let byte_end = byte_start + word.len();
758        let start = *unicode_offset;
759        // Count UTF-8 leading bytes to get char count without allocating
760        let char_count = char_count(word);
761        *unicode_offset = start + char_count;
762        Token {
763            word,
764            start,
765            end: *unicode_offset,
766            byte_start,
767            byte_end,
768        }
769    }
770
771    #[allow(non_snake_case)]
772    fn cut_internal<'a>(&self, sentence: &'a str, cut_all: bool, hmm: bool) -> Vec<Token<'a>> {
773        if cut_all {
774            return self.cut_all_toplevel(sentence);
775        }
776        let base = sentence.as_ptr() as usize;
777        let mut unicode_offset = 0;
778
779        let heuristic_capacity = sentence.len() / 2;
780        let mut str_words = Vec::with_capacity(heuristic_capacity);
781        let mut tokens = Vec::with_capacity(heuristic_capacity);
782
783        let splitter = SplitByCharacterClass::new(sentence, is_han_default);
784        let mut route = Vec::with_capacity(heuristic_capacity);
785        let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);
786
787        for state in splitter {
788            match state {
789                SplitState::Matched(_) => {
790                    let block = state.as_str();
791                    assert!(!block.is_empty());
792
793                    str_words.clear();
794                    if hmm {
795                        HMM_CONTEXT.with(|ctx| {
796                            let mut hmm_context = ctx.borrow_mut();
797                            self.cut_dag_hmm(block, &mut str_words, &mut route, &mut dag, &mut hmm_context);
798                        });
799                    } else {
800                        self.cut_dag_no_hmm(block, &mut str_words, &mut route, &mut dag);
801                    }
802                    for &word in &str_words {
803                        tokens.push(Self::make_token_incremental(word, base, &mut unicode_offset));
804                    }
805                }
806                SplitState::Unmatched(_) => {
807                    let block = state.as_str();
808                    assert!(!block.is_empty());
809
810                    let mut chars = block.char_indices().peekable();
811                    while let Some((i, c)) = chars.next() {
812                        // Group \r\n as a single token, otherwise emit each char
813                        let word = if c == '\r' {
814                            if let Some(&(_, '\n')) = chars.peek() {
815                                let _ = chars.next();
816                                let end = i + 2;
817                                &block[i..end]
818                            } else {
819                                let end = i + c.len_utf8();
820                                &block[i..end]
821                            }
822                        } else {
823                            let end = i + c.len_utf8();
824                            &block[i..end]
825                        };
826                        tokens.push(Self::make_token_incremental(word, base, &mut unicode_offset));
827                    }
828                }
829            }
830        }
831        tokens
832    }
833
834    /// Dedicated top-level cut_all implementation that avoids allocating a byte-to-unicode table.
835    fn cut_all_toplevel<'a>(&self, sentence: &'a str) -> Vec<Token<'a>> {
836        let base = sentence.as_ptr() as usize;
837        let mut unicode_offset = 0;
838
839        let heuristic_capacity = sentence.len() / 2;
840        let mut tokens = Vec::with_capacity(heuristic_capacity);
841
842        let splitter = SplitByCharacterClass::new(sentence, is_han_cut_all);
843
844        for state in splitter {
845            match state {
846                SplitState::Matched(_) => {
847                    let block = state.as_str();
848                    assert!(!block.is_empty());
849                    let block_unicode_start = unicode_offset;
850                    // Advance unicode_offset past this block
851                    unicode_offset += char_count(block);
852                    self.cut_all_tokens(block, base, block_unicode_start, &mut tokens);
853                }
854                SplitState::Unmatched(_) => {
855                    let block = state.as_str();
856                    assert!(!block.is_empty());
857
858                    let skip_splitter = SplitByCharacterClass::new(block, is_skip_cut_all);
859                    for skip_state in skip_splitter {
860                        let word = skip_state.as_str();
861                        if word.is_empty() {
862                            continue;
863                        }
864                        if skip_state.is_matched() {
865                            // Emit each char individually to match old RE_SKIP_CUT_ALL
866                            // which matched single characters, not runs.
867                            let mut indices = word.char_indices().peekable();
868                            while let Some((i, _)) = indices.next() {
869                                let end = indices.peek().map_or(word.len(), |&(j, _)| j);
870                                tokens.push(Self::make_token_incremental(&word[i..end], base, &mut unicode_offset));
871                            }
872                        } else {
873                            tokens.push(Self::make_token_incremental(word, base, &mut unicode_offset));
874                        }
875                    }
876                }
877            }
878        }
879        tokens
880    }
881
    /// Cut the input text
    ///
    /// Runs of CJK ideographs, ASCII letters/digits and `+#&._%-` are segmented with the
    /// dictionary. Every other character becomes its own token, except that `\r\n` is kept
    /// together as one token. Tokens are returned in input order and borrow from `sentence`.
    ///
    /// ## Params
    ///
    /// `sentence`: input text
    ///
    /// `hmm`: enable HMM or not
    pub fn cut<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<Token<'a>> {
        self.cut_internal(sentence, false, hmm)
    }
892
893    /// Cut the input text, return all possible words
894    ///
895    /// ## Params
896    ///
897    /// `sentence`: input text
898    pub fn cut_all<'a>(&self, sentence: &'a str) -> Vec<Token<'a>> {
899        self.cut_internal(sentence, true, false)
900    }
901
902    /// Cut the input text in search mode
903    ///
904    /// ## Params
905    ///
906    /// `sentence`: input text
907    ///
908    /// `hmm`: enable HMM or not
909    pub fn cut_for_search<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<Token<'a>> {
910        let words = self.cut(sentence, hmm);
911        let mut new_words = Vec::with_capacity(words.len());
912        let base = sentence.as_ptr() as usize;
913        let mut char_indices = Vec::new();
914        for token in words {
915            let word = token.word;
916            char_indices.clear();
917            char_indices.extend(word.char_indices().map(|x| x.0));
918            let char_count = char_indices.len();
919            if char_count > 2 {
920                for i in 0..char_count - 1 {
921                    let local_byte_start = char_indices[i];
922                    let gram2 = if i + 2 < char_count {
923                        &word[local_byte_start..char_indices[i + 2]]
924                    } else {
925                        &word[local_byte_start..]
926                    };
927                    if self.cedar.exact_match_search(gram2).is_some() {
928                        let byte_start = gram2.as_ptr() as usize - base;
929                        let byte_end = byte_start + gram2.len();
930                        new_words.push(Token {
931                            word: gram2,
932                            start: token.start + i,
933                            end: token.start + i + 2,
934                            byte_start,
935                            byte_end,
936                        });
937                    }
938                }
939            }
940            if char_count > 3 {
941                for i in 0..char_count - 2 {
942                    let local_byte_start = char_indices[i];
943                    let gram3 = if i + 3 < char_count {
944                        &word[local_byte_start..char_indices[i + 3]]
945                    } else {
946                        &word[local_byte_start..]
947                    };
948                    if self.cedar.exact_match_search(gram3).is_some() {
949                        let byte_start = gram3.as_ptr() as usize - base;
950                        let byte_end = byte_start + gram3.len();
951                        new_words.push(Token {
952                            word: gram3,
953                            start: token.start + i,
954                            end: token.start + i + 3,
955                            byte_start,
956                            byte_end,
957                        });
958                    }
959                }
960            }
961            new_words.push(token);
962        }
963        new_words
964    }
965
966    /// Tokenize
967    ///
968    /// ## Params
969    ///
970    /// `sentence`: input text
971    ///
972    /// `mode`: tokenize mode
973    ///
974    /// `hmm`: enable HMM or not
975    pub fn tokenize<'a>(&self, sentence: &'a str, mode: TokenizeMode, hmm: bool) -> Vec<Token<'a>> {
976        match mode {
977            TokenizeMode::Default => self.cut(sentence, hmm),
978            TokenizeMode::Search => self.cut_for_search(sentence, hmm),
979        }
980    }
981
982    /// Tag the input text
983    ///
984    /// ## Params
985    ///
986    /// `sentence`: input text
987    ///
988    /// `hmm`: enable HMM or not
989    pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag<'a>> {
990        let tokens = self.cut(sentence, hmm);
991        tokens
992            .into_iter()
993            .map(|token| {
994                let word = token.word;
995                if let Some((word_id, _, _)) = self.cedar.exact_match_search(word) {
996                    let t = &self.records[word_id as usize].tag;
997                    return Tag {
998                        word,
999                        tag: t,
1000                        start: token.start,
1001                        end: token.end,
1002                        byte_start: token.byte_start,
1003                        byte_end: token.byte_end,
1004                    };
1005                }
1006                let tag = self.guess_tag(word);
1007                Tag {
1008                    word,
1009                    tag,
1010                    start: token.start,
1011                    end: token.end,
1012                    byte_start: token.byte_start,
1013                    byte_end: token.byte_end,
1014                }
1015            })
1016            .collect()
1017    }
1018
1019    /// Guess the POS tag for an OOV word.
1020    ///
1021    /// For CJK words, uses the posseg HMM model (when available) to predict the tag.
1022    /// For ASCII words, uses simple heuristics (digits → "m", alpha → "eng", else → "x").
1023    fn guess_tag(&self, word: &str) -> &'static str {
1024        let mut eng = 0;
1025        let mut m = 0;
1026        for chr in word.chars() {
1027            if chr.is_ascii_alphanumeric() {
1028                eng += 1;
1029                if chr.is_ascii_digit() {
1030                    m += 1;
1031                }
1032            }
1033        }
1034        if eng > 0 {
1035            return if eng == m { "m" } else { "eng" };
1036        }
1037
1038        #[cfg(feature = "default-dict")]
1039        {
1040            // Only use posseg HMM for words containing CJK characters
1041            if word.chars().any(|c| is_cjk(c)) {
1042                let results = posseg::cut_with_pos(word);
1043                if results.len() == 1 {
1044                    return results[0].1;
1045                }
1046                if let Some((_w, tag)) = results.iter().max_by_key(|(w, _)| w.len()) {
1047                    return tag;
1048                }
1049            }
1050        }
1051
1052        "x"
1053    }
1054}
1055
1056#[cfg(test)]
1057mod tests {
1058    use super::{Jieba, SplitByCharacterClass, SplitState, TokenizeMode, is_han_default};
1059    use expect_test::expect;
1060    use std::io::BufReader;
1061
1062    #[test]
1063    fn test_init_with_default_dict() {
1064        let _ = Jieba::new();
1065    }
1066
1067    #[test]
1068    fn test_has_word() {
1069        let jieba = Jieba::new();
1070        assert!(jieba.has_word("中国"));
1071        assert!(jieba.has_word("开源"));
1072        assert!(!jieba.has_word("不存在的词"));
1073    }
1074
1075    #[test]
1076    fn test_split_matches() {
1077        let splitter = SplitByCharacterClass::new(
1078            "👪 PS: 我觉得开源有一个好处，就是能够敦促自己不断改进 👪，避免敞帚自珍",
1079            is_han_default,
1080        );
1081        for state in splitter {
1082            match state {
1083                SplitState::Matched(_) => {
1084                    let block = state.as_str();
1085                    assert!(!block.is_empty());
1086                }
1087                SplitState::Unmatched(_) => {
1088                    let block = state.as_str();
1089                    assert!(!block.is_empty());
1090                }
1091            }
1092        }
1093    }
1094
1095    #[test]
1096    fn test_split_matches_against_unicode_sip() {
1097        let splitter = SplitByCharacterClass::new("讥䶯䶰䶱䶲䶳䶴䶵𦡦", is_han_default);
1098
1099        let result: Vec<&str> = splitter.map(|x| x.as_str()).collect();
1100        expect![[r#"["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]"#]].assert_eq(&format!("{:?}", result));
1101    }
1102
1103    #[test]
1104    fn test_cut_all_skip_single_char() {
1105        let jieba = Jieba::new();
1106        let words: Vec<&str> = jieba.cut_all("a！！b").iter().map(|t| t.word).collect();
1107        assert_eq!(words, vec!["a", "！", "！", "b"]);
1108    }
1109
1110    #[test]
1111    fn test_cut_default_crlf_and_whitespace() {
1112        let jieba = Jieba::new();
1113        let words: Vec<&str> = jieba.cut("x\r\n\ty", false).iter().map(|t| t.word).collect();
1114        assert_eq!(words, vec!["x", "\r\n", "\t", "y"]);
1115    }
1116
1117    #[test]
1118    fn test_cut_all() {
1119        let jieba = Jieba::new();
1120        let tokens = jieba.cut_all("abc网球拍卖会def");
1121        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1122        expect![[r#"["abc", "网", "网球", "网球拍", "球", "球拍", "拍", "拍卖", "拍卖会", "卖", "会", "def"]"#]]
1123            .assert_eq(&format!("{:?}", words));
1124
1125        // The cut_all from the python de-facto implementation is loosely defined,
1126        // And the answer "我, 来到, 北京, 清华, 清华大学, 华大, 大学" from the python implementation looks weird since it drops the single character word even though it is part of the DAG candidates.
1127        // For example, it includes "华大" but it doesn't include "清" and "学"
1128        let tokens = jieba.cut_all("我来到北京清华大学");
1129        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1130        expect![[r#"["我", "来", "来到", "到", "北", "北京", "京", "清", "清华", "清华大学", "华", "华大", "大", "大学", "学"]"#]]
1131            .assert_eq(&format!("{:?}", words));
1132    }
1133
1134    #[test]
1135    fn test_cut_no_hmm() {
1136        let jieba = Jieba::new();
1137        let tokens = jieba.cut("abc网球拍卖会def", false);
1138        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1139        expect![[r#"["abc", "网球", "拍卖会", "def"]"#]].assert_eq(&format!("{:?}", words));
1140    }
1141
1142    #[test]
1143    fn test_cut_no_hmm1() {
1144        let jieba = Jieba::new();
1145        let tokens = jieba.cut("abc网球拍卖会def！！？\r\n\t", false);
1146        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1147        expect![[r#"["abc", "网球", "拍卖会", "def", "！", "！", "？", "\r\n", "\t"]"#]]
1148            .assert_eq(&format!("{:?}", words));
1149    }
1150
1151    #[test]
1152    fn test_cut_with_hmm() {
1153        let jieba = Jieba::new();
1154        let tokens = jieba.cut("我们中出了一个叛徒", false);
1155        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1156        expect![[r#"["我们", "中", "出", "了", "一个", "叛徒"]"#]].assert_eq(&format!("{:?}", words));
1157        let tokens = jieba.cut("我们中出了一个叛徒", true);
1158        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1159        expect![[r#"["我们", "中出", "了", "一个", "叛徒"]"#]].assert_eq(&format!("{:?}", words));
1160        let tokens = jieba.cut("我们中出了一个叛徒👪", true);
1161        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1162        expect![[r#"["我们", "中出", "了", "一个", "叛徒", "👪"]"#]].assert_eq(&format!("{:?}", words));
1163
1164        let tokens = jieba.cut("我来到北京清华大学", true);
1165        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1166        expect![[r#"["我", "来到", "北京", "清华大学"]"#]].assert_eq(&format!("{:?}", words));
1167
1168        let tokens = jieba.cut("他来到了网易杭研大厦", true);
1169        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1170        expect![[r#"["他", "来到", "了", "网易", "杭研", "大厦"]"#]].assert_eq(&format!("{:?}", words));
1171    }
1172
1173    #[test]
1174    fn test_cut_weicheng() {
1175        static WEICHENG_TXT: &str = include_str!("../../examples/weicheng/src/weicheng.txt");
1176        let jieba = Jieba::new();
1177        for line in WEICHENG_TXT.split('\n') {
1178            let _ = jieba.cut(line, true);
1179        }
1180    }
1181
1182    #[test]
1183    fn test_cut_for_search() {
1184        let jieba = Jieba::new();
1185        let tokens = jieba.cut_for_search("南京市长江大桥", true);
1186        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1187        expect![[r#"["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]"#]].assert_eq(&format!("{:?}", words));
1188
1189        let tokens = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造", true);
1190        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1191
1192        // The python implementation silently filtered "，". but we include it here in the output
1193        // to let the library user to decide their own filtering strategy
1194        expect![[r#"["小明", "硕士", "毕业", "于", "中国", "科学", "学院", "科学院", "中国科学院", "计算", "计算所", "，", "后", "在", "日本", "京都", "大学", "日本京都大学", "深造"]"#]]
1195            .assert_eq(&format!("{:?}", words));
1196    }
1197
1198    #[test]
1199    fn test_tag() {
1200        let jieba = Jieba::new();
1201        let tags = jieba.tag(
1202            "我是拖拉机学院手扶拖拉机专业的。不用多久，我就会升职加薪，当上CEO，走上人生巅峰。",
1203            true,
1204        );
1205        expect![[r#"
1206            [
1207                Tag {
1208                    word: "我",
1209                    tag: "r",
1210                    start: 0,
1211                    end: 1,
1212                    byte_start: 0,
1213                    byte_end: 3,
1214                },
1215                Tag {
1216                    word: "是",
1217                    tag: "v",
1218                    start: 1,
1219                    end: 2,
1220                    byte_start: 3,
1221                    byte_end: 6,
1222                },
1223                Tag {
1224                    word: "拖拉机",
1225                    tag: "n",
1226                    start: 2,
1227                    end: 5,
1228                    byte_start: 6,
1229                    byte_end: 15,
1230                },
1231                Tag {
1232                    word: "学院",
1233                    tag: "n",
1234                    start: 5,
1235                    end: 7,
1236                    byte_start: 15,
1237                    byte_end: 21,
1238                },
1239                Tag {
1240                    word: "手扶拖拉机",
1241                    tag: "n",
1242                    start: 7,
1243                    end: 12,
1244                    byte_start: 21,
1245                    byte_end: 36,
1246                },
1247                Tag {
1248                    word: "专业",
1249                    tag: "n",
1250                    start: 12,
1251                    end: 14,
1252                    byte_start: 36,
1253                    byte_end: 42,
1254                },
1255                Tag {
1256                    word: "的",
1257                    tag: "uj",
1258                    start: 14,
1259                    end: 15,
1260                    byte_start: 42,
1261                    byte_end: 45,
1262                },
1263                Tag {
1264                    word: "。",
1265                    tag: "x",
1266                    start: 15,
1267                    end: 16,
1268                    byte_start: 45,
1269                    byte_end: 48,
1270                },
1271                Tag {
1272                    word: "不用",
1273                    tag: "v",
1274                    start: 16,
1275                    end: 18,
1276                    byte_start: 48,
1277                    byte_end: 54,
1278                },
1279                Tag {
1280                    word: "多久",
1281                    tag: "m",
1282                    start: 18,
1283                    end: 20,
1284                    byte_start: 54,
1285                    byte_end: 60,
1286                },
1287                Tag {
1288                    word: "，",
1289                    tag: "x",
1290                    start: 20,
1291                    end: 21,
1292                    byte_start: 60,
1293                    byte_end: 63,
1294                },
1295                Tag {
1296                    word: "我",
1297                    tag: "r",
1298                    start: 21,
1299                    end: 22,
1300                    byte_start: 63,
1301                    byte_end: 66,
1302                },
1303                Tag {
1304                    word: "就",
1305                    tag: "d",
1306                    start: 22,
1307                    end: 23,
1308                    byte_start: 66,
1309                    byte_end: 69,
1310                },
1311                Tag {
1312                    word: "会",
1313                    tag: "v",
1314                    start: 23,
1315                    end: 24,
1316                    byte_start: 69,
1317                    byte_end: 72,
1318                },
1319                Tag {
1320                    word: "升职",
1321                    tag: "v",
1322                    start: 24,
1323                    end: 26,
1324                    byte_start: 72,
1325                    byte_end: 78,
1326                },
1327                Tag {
1328                    word: "加薪",
1329                    tag: "nr",
1330                    start: 26,
1331                    end: 28,
1332                    byte_start: 78,
1333                    byte_end: 84,
1334                },
1335                Tag {
1336                    word: "，",
1337                    tag: "x",
1338                    start: 28,
1339                    end: 29,
1340                    byte_start: 84,
1341                    byte_end: 87,
1342                },
1343                Tag {
1344                    word: "当上",
1345                    tag: "t",
1346                    start: 29,
1347                    end: 31,
1348                    byte_start: 87,
1349                    byte_end: 93,
1350                },
1351                Tag {
1352                    word: "CEO",
1353                    tag: "eng",
1354                    start: 31,
1355                    end: 34,
1356                    byte_start: 93,
1357                    byte_end: 96,
1358                },
1359                Tag {
1360                    word: "，",
1361                    tag: "x",
1362                    start: 34,
1363                    end: 35,
1364                    byte_start: 96,
1365                    byte_end: 99,
1366                },
1367                Tag {
1368                    word: "走上",
1369                    tag: "v",
1370                    start: 35,
1371                    end: 37,
1372                    byte_start: 99,
1373                    byte_end: 105,
1374                },
1375                Tag {
1376                    word: "人生",
1377                    tag: "n",
1378                    start: 37,
1379                    end: 39,
1380                    byte_start: 105,
1381                    byte_end: 111,
1382                },
1383                Tag {
1384                    word: "巅峰",
1385                    tag: "n",
1386                    start: 39,
1387                    end: 41,
1388                    byte_start: 111,
1389                    byte_end: 117,
1390                },
1391                Tag {
1392                    word: "。",
1393                    tag: "x",
1394                    start: 41,
1395                    end: 42,
1396                    byte_start: 117,
1397                    byte_end: 120,
1398                },
1399            ]"#]]
1400        .assert_eq(&format!("{:#?}", tags));
1401
1402        let tags = jieba.tag("今天纽约的天气真好啊，京华大酒店的张尧经理吃了一只北京烤鸭。", true);
1403        expect![[r#"
1404            [
1405                Tag {
1406                    word: "今天",
1407                    tag: "t",
1408                    start: 0,
1409                    end: 2,
1410                    byte_start: 0,
1411                    byte_end: 6,
1412                },
1413                Tag {
1414                    word: "纽约",
1415                    tag: "ns",
1416                    start: 2,
1417                    end: 4,
1418                    byte_start: 6,
1419                    byte_end: 12,
1420                },
1421                Tag {
1422                    word: "的",
1423                    tag: "uj",
1424                    start: 4,
1425                    end: 5,
1426                    byte_start: 12,
1427                    byte_end: 15,
1428                },
1429                Tag {
1430                    word: "天气",
1431                    tag: "n",
1432                    start: 5,
1433                    end: 7,
1434                    byte_start: 15,
1435                    byte_end: 21,
1436                },
1437                Tag {
1438                    word: "真好",
1439                    tag: "d",
1440                    start: 7,
1441                    end: 9,
1442                    byte_start: 21,
1443                    byte_end: 27,
1444                },
1445                Tag {
1446                    word: "啊",
1447                    tag: "zg",
1448                    start: 9,
1449                    end: 10,
1450                    byte_start: 27,
1451                    byte_end: 30,
1452                },
1453                Tag {
1454                    word: "，",
1455                    tag: "x",
1456                    start: 10,
1457                    end: 11,
1458                    byte_start: 30,
1459                    byte_end: 33,
1460                },
1461                Tag {
1462                    word: "京华",
1463                    tag: "nz",
1464                    start: 11,
1465                    end: 13,
1466                    byte_start: 33,
1467                    byte_end: 39,
1468                },
1469                Tag {
1470                    word: "大酒店",
1471                    tag: "n",
1472                    start: 13,
1473                    end: 16,
1474                    byte_start: 39,
1475                    byte_end: 48,
1476                },
1477                Tag {
1478                    word: "的",
1479                    tag: "uj",
1480                    start: 16,
1481                    end: 17,
1482                    byte_start: 48,
1483                    byte_end: 51,
1484                },
1485                Tag {
1486                    word: "张尧",
1487                    tag: "nr",
1488                    start: 17,
1489                    end: 19,
1490                    byte_start: 51,
1491                    byte_end: 57,
1492                },
1493                Tag {
1494                    word: "经理",
1495                    tag: "n",
1496                    start: 19,
1497                    end: 21,
1498                    byte_start: 57,
1499                    byte_end: 63,
1500                },
1501                Tag {
1502                    word: "吃",
1503                    tag: "v",
1504                    start: 21,
1505                    end: 22,
1506                    byte_start: 63,
1507                    byte_end: 66,
1508                },
1509                Tag {
1510                    word: "了",
1511                    tag: "ul",
1512                    start: 22,
1513                    end: 23,
1514                    byte_start: 66,
1515                    byte_end: 69,
1516                },
1517                Tag {
1518                    word: "一只",
1519                    tag: "m",
1520                    start: 23,
1521                    end: 25,
1522                    byte_start: 69,
1523                    byte_end: 75,
1524                },
1525                Tag {
1526                    word: "北京烤鸭",
1527                    tag: "n",
1528                    start: 25,
1529                    end: 29,
1530                    byte_start: 75,
1531                    byte_end: 87,
1532                },
1533                Tag {
1534                    word: "。",
1535                    tag: "x",
1536                    start: 29,
1537                    end: 30,
1538                    byte_start: 87,
1539                    byte_end: 90,
1540                },
1541            ]"#]]
1542        .assert_eq(&format!("{:#?}", tags));
1543    }
1544
1545    #[test]
1546    fn test_tokenize() {
1547        let jieba = Jieba::new();
1548        let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Default, false);
1549        expect![[r#"
1550            [
1551                Token {
1552                    word: "南京市",
1553                    start: 0,
1554                    end: 3,
1555                    byte_start: 0,
1556                    byte_end: 9,
1557                },
1558                Token {
1559                    word: "长江大桥",
1560                    start: 3,
1561                    end: 7,
1562                    byte_start: 9,
1563                    byte_end: 21,
1564                },
1565            ]"#]]
1566        .assert_eq(&format!("{:#?}", tokens));
1567
1568        let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Search, false);
1569        expect![[r#"
1570            [
1571                Token {
1572                    word: "南京",
1573                    start: 0,
1574                    end: 2,
1575                    byte_start: 0,
1576                    byte_end: 6,
1577                },
1578                Token {
1579                    word: "京市",
1580                    start: 1,
1581                    end: 3,
1582                    byte_start: 3,
1583                    byte_end: 9,
1584                },
1585                Token {
1586                    word: "南京市",
1587                    start: 0,
1588                    end: 3,
1589                    byte_start: 0,
1590                    byte_end: 9,
1591                },
1592                Token {
1593                    word: "长江",
1594                    start: 3,
1595                    end: 5,
1596                    byte_start: 9,
1597                    byte_end: 15,
1598                },
1599                Token {
1600                    word: "大桥",
1601                    start: 5,
1602                    end: 7,
1603                    byte_start: 15,
1604                    byte_end: 21,
1605                },
1606                Token {
1607                    word: "长江大桥",
1608                    start: 3,
1609                    end: 7,
1610                    byte_start: 9,
1611                    byte_end: 21,
1612                },
1613            ]"#]]
1614        .assert_eq(&format!("{:#?}", tokens));
1615
1616        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
1617        expect![[r#"
1618            [
1619                Token {
1620                    word: "我们",
1621                    start: 0,
1622                    end: 2,
1623                    byte_start: 0,
1624                    byte_end: 6,
1625                },
1626                Token {
1627                    word: "中",
1628                    start: 2,
1629                    end: 3,
1630                    byte_start: 6,
1631                    byte_end: 9,
1632                },
1633                Token {
1634                    word: "出",
1635                    start: 3,
1636                    end: 4,
1637                    byte_start: 9,
1638                    byte_end: 12,
1639                },
1640                Token {
1641                    word: "了",
1642                    start: 4,
1643                    end: 5,
1644                    byte_start: 12,
1645                    byte_end: 15,
1646                },
1647                Token {
1648                    word: "一个",
1649                    start: 5,
1650                    end: 7,
1651                    byte_start: 15,
1652                    byte_end: 21,
1653                },
1654                Token {
1655                    word: "叛徒",
1656                    start: 7,
1657                    end: 9,
1658                    byte_start: 21,
1659                    byte_end: 27,
1660                },
1661            ]"#]]
1662        .assert_eq(&format!("{:#?}", tokens));
1663        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
1664        expect![[r#"
1665            [
1666                Token {
1667                    word: "我们",
1668                    start: 0,
1669                    end: 2,
1670                    byte_start: 0,
1671                    byte_end: 6,
1672                },
1673                Token {
1674                    word: "中出",
1675                    start: 2,
1676                    end: 4,
1677                    byte_start: 6,
1678                    byte_end: 12,
1679                },
1680                Token {
1681                    word: "了",
1682                    start: 4,
1683                    end: 5,
1684                    byte_start: 12,
1685                    byte_end: 15,
1686                },
1687                Token {
1688                    word: "一个",
1689                    start: 5,
1690                    end: 7,
1691                    byte_start: 15,
1692                    byte_end: 21,
1693                },
1694                Token {
1695                    word: "叛徒",
1696                    start: 7,
1697                    end: 9,
1698                    byte_start: 21,
1699                    byte_end: 27,
1700                },
1701            ]"#]]
1702        .assert_eq(&format!("{:#?}", tokens));
1703
1704        let tokens = jieba.tokenize("永和服装饰品有限公司", TokenizeMode::Default, true);
1705        expect![[r#"
1706            [
1707                Token {
1708                    word: "永和",
1709                    start: 0,
1710                    end: 2,
1711                    byte_start: 0,
1712                    byte_end: 6,
1713                },
1714                Token {
1715                    word: "服装",
1716                    start: 2,
1717                    end: 4,
1718                    byte_start: 6,
1719                    byte_end: 12,
1720                },
1721                Token {
1722                    word: "饰品",
1723                    start: 4,
1724                    end: 6,
1725                    byte_start: 12,
1726                    byte_end: 18,
1727                },
1728                Token {
1729                    word: "有限公司",
1730                    start: 6,
1731                    end: 10,
1732                    byte_start: 18,
1733                    byte_end: 30,
1734                },
1735            ]"#]]
1736        .assert_eq(&format!("{:#?}", tokens));
1737    }
1738
1739    #[test]
1740    fn test_userdict() {
1741        let mut jieba = Jieba::new();
1742        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
1743        expect![[r#"
1744            [
1745                Token {
1746                    word: "我们",
1747                    start: 0,
1748                    end: 2,
1749                    byte_start: 0,
1750                    byte_end: 6,
1751                },
1752                Token {
1753                    word: "中",
1754                    start: 2,
1755                    end: 3,
1756                    byte_start: 6,
1757                    byte_end: 9,
1758                },
1759                Token {
1760                    word: "出",
1761                    start: 3,
1762                    end: 4,
1763                    byte_start: 9,
1764                    byte_end: 12,
1765                },
1766                Token {
1767                    word: "了",
1768                    start: 4,
1769                    end: 5,
1770                    byte_start: 12,
1771                    byte_end: 15,
1772                },
1773                Token {
1774                    word: "一个",
1775                    start: 5,
1776                    end: 7,
1777                    byte_start: 15,
1778                    byte_end: 21,
1779                },
1780                Token {
1781                    word: "叛徒",
1782                    start: 7,
1783                    end: 9,
1784                    byte_start: 21,
1785                    byte_end: 27,
1786                },
1787            ]"#]]
1788        .assert_eq(&format!("{:#?}", tokens));
1789        let userdict = "中出 10000";
1790        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1791        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
1792        expect![[r#"
1793            [
1794                Token {
1795                    word: "我们",
1796                    start: 0,
1797                    end: 2,
1798                    byte_start: 0,
1799                    byte_end: 6,
1800                },
1801                Token {
1802                    word: "中出",
1803                    start: 2,
1804                    end: 4,
1805                    byte_start: 6,
1806                    byte_end: 12,
1807                },
1808                Token {
1809                    word: "了",
1810                    start: 4,
1811                    end: 5,
1812                    byte_start: 12,
1813                    byte_end: 15,
1814                },
1815                Token {
1816                    word: "一个",
1817                    start: 5,
1818                    end: 7,
1819                    byte_start: 15,
1820                    byte_end: 21,
1821                },
1822                Token {
1823                    word: "叛徒",
1824                    start: 7,
1825                    end: 9,
1826                    byte_start: 21,
1827                    byte_end: 27,
1828                },
1829            ]"#]]
1830        .assert_eq(&format!("{:#?}", tokens));
1831    }
1832
1833    #[test]
1834    fn test_userdict_hmm() {
1835        let mut jieba = Jieba::new();
1836        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
1837        expect![[r#"
1838            [
1839                Token {
1840                    word: "我们",
1841                    start: 0,
1842                    end: 2,
1843                    byte_start: 0,
1844                    byte_end: 6,
1845                },
1846                Token {
1847                    word: "中出",
1848                    start: 2,
1849                    end: 4,
1850                    byte_start: 6,
1851                    byte_end: 12,
1852                },
1853                Token {
1854                    word: "了",
1855                    start: 4,
1856                    end: 5,
1857                    byte_start: 12,
1858                    byte_end: 15,
1859                },
1860                Token {
1861                    word: "一个",
1862                    start: 5,
1863                    end: 7,
1864                    byte_start: 15,
1865                    byte_end: 21,
1866                },
1867                Token {
1868                    word: "叛徒",
1869                    start: 7,
1870                    end: 9,
1871                    byte_start: 21,
1872                    byte_end: 27,
1873                },
1874            ]"#]]
1875        .assert_eq(&format!("{:#?}", tokens));
1876        let userdict = "出了 10000";
1877        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1878        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
1879        expect![[r#"
1880            [
1881                Token {
1882                    word: "我们",
1883                    start: 0,
1884                    end: 2,
1885                    byte_start: 0,
1886                    byte_end: 6,
1887                },
1888                Token {
1889                    word: "中",
1890                    start: 2,
1891                    end: 3,
1892                    byte_start: 6,
1893                    byte_end: 9,
1894                },
1895                Token {
1896                    word: "出了",
1897                    start: 3,
1898                    end: 5,
1899                    byte_start: 9,
1900                    byte_end: 15,
1901                },
1902                Token {
1903                    word: "一个",
1904                    start: 5,
1905                    end: 7,
1906                    byte_start: 15,
1907                    byte_end: 21,
1908                },
1909                Token {
1910                    word: "叛徒",
1911                    start: 7,
1912                    end: 9,
1913                    byte_start: 21,
1914                    byte_end: 27,
1915                },
1916            ]"#]]
1917        .assert_eq(&format!("{:#?}", tokens));
1918        expect![[r#"
1919            [
1920                Token {
1921                    word: "我们",
1922                    start: 0,
1923                    end: 2,
1924                    byte_start: 0,
1925                    byte_end: 6,
1926                },
1927                Token {
1928                    word: "中",
1929                    start: 2,
1930                    end: 3,
1931                    byte_start: 6,
1932                    byte_end: 9,
1933                },
1934                Token {
1935                    word: "出了",
1936                    start: 3,
1937                    end: 5,
1938                    byte_start: 9,
1939                    byte_end: 15,
1940                },
1941                Token {
1942                    word: "一个",
1943                    start: 5,
1944                    end: 7,
1945                    byte_start: 15,
1946                    byte_end: 21,
1947                },
1948                Token {
1949                    word: "叛徒",
1950                    start: 7,
1951                    end: 9,
1952                    byte_start: 21,
1953                    byte_end: 27,
1954                },
1955            ]"#]]
1956        .assert_eq(&format!("{:#?}", tokens));
1957    }
1958
1959    #[test]
1960    fn test_userdict_error() {
1961        let mut jieba = Jieba::empty();
1962        let userdict = "出了 not_a_int";
1963        let ret = jieba.load_dict(&mut BufReader::new(userdict.as_bytes()));
1964        assert!(ret.is_err());
1965    }
1966
1967    #[test]
1968    fn test_suggest_freq() {
1969        // NOTE: Following behaviors are aligned with original Jieba
1970
1971        let mut jieba = Jieba::new();
1972        // These values were calculated by original Jieba
1973        assert_eq!(jieba.suggest_freq("中出"), 348);
1974        assert_eq!(jieba.suggest_freq("出了"), 1263);
1975
1976        // Freq in dict.txt was 3, which became 300 after loading user dict
1977        let userdict = "中出 300";
1978        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1979        // But it's less than calculated freq 348
1980        assert_eq!(jieba.suggest_freq("中出"), 348);
1981
1982        let userdict = "中出 500";
1983        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1984        // Now it's significant enough
1985        assert_eq!(jieba.suggest_freq("中出"), 500)
1986    }
1987
1988    #[test]
1989    fn test_custom_lower_freq() {
1990        let mut jieba = Jieba::new();
1991
1992        jieba.add_word("测试", Some(2445), None);
1993        jieba.add_word("测试", Some(10), None);
1994        let tokens = jieba.cut("测试", false);
1995        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
1996        expect![[r#"["测试"]"#]].assert_eq(&format!("{:?}", words));
1997    }
1998
1999    #[test]
2000    fn test_cut_dag_no_hmm_against_string_with_sip() {
2001        let mut jieba = Jieba::empty();
2002
2003        //add fake word into dictionary
2004        jieba.add_word("䶴䶵𦡦", Some(1000), None);
2005        jieba.add_word("讥䶯䶰䶱䶲䶳", Some(1000), None);
2006
2007        let tokens = jieba.cut("讥䶯䶰䶱䶲䶳䶴䶵𦡦", false);
2008        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
2009        expect![[r#"["讥䶯䶰䶱䶲䶳", "䶴䶵𦡦"]"#]].assert_eq(&format!("{:?}", words));
2010    }
2011
2012    #[test]
2013    fn test_add_custom_word_with_underscrore() {
2014        let mut jieba = Jieba::empty();
2015        jieba.add_word("田-女士", Some(42), Some("n"));
2016        let tokens = jieba.cut("市民田-女士急匆匆", false);
2017        let words: Vec<&str> = tokens.iter().map(|t| t.word).collect();
2018        expect![[r#"["市", "民", "田-女士", "急", "匆", "匆"]"#]].assert_eq(&format!("{:?}", words));
2019    }
2020
2021    #[test]
2022    fn test_cut_with_custom_hmm_model() {
2023        use crate::hmm::HmmModel;
2024
2025        // Load the builtin hmm.model at runtime
2026        let hmm_data = include_str!("../../jieba-macros/src/hmm.model");
2027        let mut reader = BufReader::new(hmm_data.as_bytes());
2028        let model = HmmModel::load(&mut reader).unwrap();
2029
2030        let mut jieba_custom = Jieba::new();
2031        jieba_custom.set_hmm_model(model);
2032        let jieba_builtin = Jieba::new();
2033
2034        // Runtime-loaded model should produce the same results as the builtin
2035        let sentences = [
2036            "我们中出了一个叛徒",
2037            "小明硕士毕业于中国科学院计算所后在日本京都大学深造",
2038            "他来到了网易杭研大厦",
2039            "我来到北京清华大学",
2040        ];
2041        for sentence in sentences {
2042            let builtin_words = jieba_builtin.cut(sentence, true);
2043            let custom_words = jieba_custom.cut(sentence, true);
2044            assert_eq!(custom_words, builtin_words, "mismatch for: {sentence}");
2045        }
2046    }
2047}
jieba_rs/lib.rs

jieba_rs/
lib.rs