msg_tool\output_scripts/
m3t.rs

1//! A simple text format that supports both original/llm/translated messages.
2//!
3//! A simple m3t file example:
4//! ```text
5//! ○ NAME: Example
6//!
7//! ○ Original message
8//! △ LLM message
9//! ● Translated message
10//! ```
11use crate::types::*;
12use anyhow::Result;
13
/// A parser for the M3T format.
///
/// The parser borrows the input text and consumes it line by line; the
/// lifetime `'a` ties every borrowed slice to that input.
pub struct M3tParser<'a> {
    /// The part of the input that has not been consumed yet.
    str: &'a str,
    /// Line counter that is reported in parse error messages.
    line: usize,
    /// Optional suffix appended to a message whose text is taken from (or is
    /// identical to) the LLM (`△`) line.
    llm_mark: Option<&'a str>,
    /// When `true`, `parse` falls back to the original (`○`) text if the
    /// translated (`●`) line is empty and no LLM text is available.
    use_original_text: bool,
}
21
22impl<'a> M3tParser<'a> {
23    /// Creates a new M3tParser with the given string.
24    pub fn new(str: &'a str, llm_mark: Option<&'a str>, use_original_text: bool) -> Self {
25        M3tParser {
26            str,
27            line: 1,
28            llm_mark,
29            use_original_text,
30        }
31    }
32
33    fn next_line(&mut self) -> Option<&'a str> {
34        match self.str.find('\n') {
35            Some(pos) => {
36                let line = &self.str[..pos];
37                self.str = &self.str[pos + 1..];
38                self.line += 1;
39                Some(line.trim())
40            }
41            None => {
42                if !self.str.is_empty() {
43                    let line = self.str;
44                    self.str = "";
45                    Some(line)
46                } else {
47                    None
48                }
49            }
50        }
51    }
52
53    pub fn parse_as_vec(&mut self) -> Result<Vec<(String, String)>> {
54        let mut map = Vec::new();
55        let mut ori = None;
56        let mut llm = None;
57        while let Some(line) = self.next_line() {
58            if line.is_empty() {
59                continue;
60            }
61            // Remove zero-width space characters
62            let line = line.trim().trim_matches('\u{200b}');
63            if line.starts_with("○") {
64                let line = line[3..].trim();
65                if !line.starts_with("NAME:") {
66                    ori = Some(line.to_string());
67                }
68            } else if line.starts_with("△") {
69                let line = line[3..].trim();
70                llm = Some(line);
71            } else if line.starts_with("●") {
72                let message = line[3..].trim();
73                let message = if message
74                    .trim_start_matches("「")
75                    .trim_end_matches("」")
76                    .is_empty()
77                {
78                    llm.take()
79                        .map(|s| {
80                            let mut s = s.to_string();
81                            if let Some(mark) = self.llm_mark {
82                                s.push_str(mark);
83                            }
84                            s
85                        })
86                        .unwrap_or_else(|| {
87                            String::from(if message.starts_with("「") {
88                                "「」"
89                            } else {
90                                ""
91                            })
92                        })
93                        .replace("\\n", "\n")
94                } else {
95                    let mut tmp = message.to_owned();
96                    if let Some(llm) = llm.take() {
97                        if tmp == llm {
98                            if let Some(mark) = self.llm_mark {
99                                tmp.push_str(mark);
100                            }
101                        }
102                    }
103                    tmp.replace("\\n", "\n")
104                };
105                if let Some(ori) = ori.take() {
106                    map.push((ori, message));
107                } else {
108                    return Err(anyhow::anyhow!(
109                        "Missing original message before translated message at line {}",
110                        self.line
111                    ));
112                }
113            } else {
114                return Err(anyhow::anyhow!(
115                    "Invalid line format at line {}: {}",
116                    self.line,
117                    line
118                ));
119            }
120        }
121        Ok(map)
122    }
123
124    /// Parses the M3T format and returns a vector of messages.
125    pub fn parse(&mut self) -> Result<Vec<Message>> {
126        let mut messages = Vec::new();
127        let mut name = None;
128        let mut llm = None;
129        let mut ori = None;
130        while let Some(line) = self.next_line() {
131            if line.is_empty() {
132                continue;
133            }
134            // Remove zero-width space characters
135            let line = line.trim().trim_matches('\u{200b}');
136            if line.starts_with("○") {
137                let line = line[3..].trim();
138                if line.starts_with("NAME:") {
139                    name = Some(line[5..].trim().to_string());
140                } else {
141                    ori = Some(line.to_string());
142                }
143            } else if line.starts_with("△") {
144                let line = line[3..].trim();
145                llm = Some(line);
146            } else if line.starts_with("●") {
147                let message = line[3..].trim();
148                let message = if message
149                    .trim_start_matches("「")
150                    .trim_end_matches("」")
151                    .is_empty()
152                {
153                    llm.take()
154                        .map(|s| {
155                            let mut s = s.to_string();
156                            if let Some(mark) = self.llm_mark {
157                                s.push_str(mark);
158                            }
159                            s
160                        })
161                        .unwrap_or_else(|| {
162                            let m = if self.use_original_text {
163                                ori.clone()
164                            } else {
165                                None
166                            };
167                            m.unwrap_or_else(|| {
168                                String::from(if message.starts_with("「") {
169                                    "「」"
170                                } else {
171                                    ""
172                                })
173                            })
174                        })
175                        .replace("\\n", "\n")
176                } else {
177                    let mut tmp = message.to_owned();
178                    if let Some(llm) = llm.take() {
179                        if tmp == llm {
180                            if let Some(mark) = self.llm_mark {
181                                tmp.push_str(mark);
182                            }
183                        }
184                    }
185                    tmp.replace("\\n", "\n")
186                };
187                messages.push(Message::new(message, name.take()));
188            } else {
189                return Err(anyhow::anyhow!(
190                    "Invalid line format at line {}: {}",
191                    self.line,
192                    line
193                ));
194            }
195        }
196        Ok(messages)
197    }
198
199    pub fn parse_as_extend(&mut self) -> Result<Vec<ExtendedMessage>> {
200        let mut messages = Vec::new();
201        let mut name = None;
202        let mut llm = None;
203        let mut source = None;
204        while let Some(line) = self.next_line() {
205            if line.is_empty() {
206                continue;
207            }
208            // Remove zero-width space characters
209            let line = line.trim().trim_matches('\u{200b}');
210            if line.starts_with("○") {
211                let line = line[3..].trim();
212                if line.starts_with("NAME:") {
213                    name = Some(line[5..].trim().to_string());
214                } else {
215                    source = Some(line.replace("\\n", "\n"));
216                }
217            } else if line.starts_with("△") {
218                let line = line[3..].trim();
219                llm = Some(line.replace("\\n", "\n"));
220            } else if line.starts_with("●") {
221                let message = line[3..].trim();
222                let source = match source.take() {
223                    Some(s) => s,
224                    None => {
225                        return Err(anyhow::anyhow!(
226                            "Missing original message before translated message at line {}",
227                            self.line
228                        ));
229                    }
230                };
231                let m = ExtendedMessage {
232                    name: name.take(),
233                    source,
234                    translated: message.replace("\\n", "\n"),
235                    llm: llm.take(),
236                };
237                messages.push(m);
238            }
239        }
240        Ok(messages)
241    }
242}
243
244/// A dumper for the M3T format.
245pub struct M3tDumper {}
246
247impl M3tDumper {
248    /// Dumps the messages in M3T format.
249    pub fn dump(messages: &[Message], no_quote: bool) -> String {
250        let mut result = String::new();
251        for message in messages {
252            if let Some(name) = &message.name {
253                result.push_str(&format!("○ NAME: {}\n\n", name));
254            }
255            result.push_str(&format!("○ {}\n", message.message.replace("\n", "\\n")));
256            if !no_quote && message.message.starts_with("「") {
257                result.push_str("● 「」\n\n");
258            } else {
259                result.push_str("●\n\n");
260            }
261        }
262        result
263    }
264
265    /// Dumps the extended messages in M3T format.
266    pub fn dump_extended(messages: &[ExtendedMessage]) -> String {
267        let mut result = String::new();
268        for message in messages {
269            if let Some(name) = &message.name {
270                result.push_str(&format!("○ NAME: {}\n\n", name));
271            }
272            result.push_str(&format!("○ {}\n", message.source.replace("\n", "\\n")));
273            if let Some(llm) = &message.llm {
274                result.push_str(&format!("△ {}\n", llm.replace("\n", "\\n")));
275            }
276            result.push_str(&format!(
277                "● {}\n\n",
278                message.translated.replace("\n", "\\n")
279            ));
280        }
281        result
282    }
283}
284
// A zero-width space in front of a marker must not stop the line from being
// recognised by either `parse` or `parse_as_vec`.
#[test]
fn test_zero_width_space() {
    let text = "○ NAME: Example\n\n○ Original message\n\u{200b}● 「」\n\n";

    let parsed = M3tParser::new(text, None, false).parse().unwrap();
    assert_eq!(parsed.len(), 1);

    let mut pair_parser = M3tParser::new(text, None, false);
    let pairs = pair_parser.parse_as_vec().unwrap();
    assert_eq!(pairs.len(), 1);
}