msg_tool\utils/
encoding.rs

1//! Encoding Utilities
2use crate::ext::atomic::*;
3use crate::types::*;
4use encoding::{ByteWriter, DecoderTrap, EncoderTrap, Encoding as EncodingTrait, RawEncoder};
5use std::sync::atomic::AtomicBool;
6
7/// Decodes a byte slice to a string using the specified encoding with BOM detection.
8///
9/// * `check` - If true, checks for decoding errors and returns an error if any.
10///
11/// Returns the decoded string and the detected BOM type.
12pub fn decode_with_bom_detect(
13    encoding: Encoding,
14    data: &[u8],
15    check: bool,
16) -> Result<(String, BomType), anyhow::Error> {
17    if data.len() >= 2 {
18        if data[0] == 0xFE && data[1] == 0xFF {
19            return Ok((
20                encoding::codec::utf_16::UTF_16BE_ENCODING
21                    .decode(
22                        &data[2..],
23                        if check {
24                            DecoderTrap::Strict
25                        } else {
26                            DecoderTrap::Replace
27                        },
28                    )
29                    .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16BE"))?,
30                BomType::Utf16BE,
31            ));
32        } else if data[0] == 0xFF && data[1] == 0xFE {
33            return Ok((
34                encoding::codec::utf_16::UTF_16LE_ENCODING
35                    .decode(
36                        &data[2..],
37                        if check {
38                            DecoderTrap::Strict
39                        } else {
40                            DecoderTrap::Replace
41                        },
42                    )
43                    .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16LE"))?,
44                BomType::Utf16LE,
45            ));
46        }
47    }
48    if data.len() >= 3 {
49        if data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
50            return Ok((String::from_utf8(data[3..].to_vec())?, BomType::Utf8));
51        }
52    }
53    #[cfg(feature = "kirikiri")]
54    {
55        use crate::ext::io::*;
56        use crate::scripts::kirikiri::mdf::Mdf;
57        use crate::scripts::kirikiri::simple_crypt::SimpleCrypt;
58        if data.len() >= 8 && data.starts_with(b"mdf\0") {
59            let reader = MemReaderRef::new(&data[4..]);
60            let decoded = Mdf::unpack(reader)?;
61            return decode_with_bom_detect(encoding, &decoded, check);
62        }
63        if data.len() >= 5
64            && data[0] == 0xFE
65            && data[1] == 0xFE
66            && (data[2] == 0 || data[2] == 1 || data[2] == 2)
67            && data[3] == 0xFF
68            && data[4] == 0xFE
69        {
70            let crypt = data[2];
71            let reader = MemReaderRef::new(data);
72            let decoded = SimpleCrypt::unpack(crypt, reader)?;
73            return decode_with_bom_detect(encoding, &decoded, check);
74        }
75    }
76    decode_to_string(encoding, data, check).map(|s| (s, BomType::None))
77}
78
79/// Decodes a byte slice to a string using the specified encoding.
80///
81/// * `check` - If true, checks for decoding errors and returns an error if any.
82pub fn decode_to_string(
83    encoding: Encoding,
84    data: &[u8],
85    check: bool,
86) -> Result<String, anyhow::Error> {
87    match encoding {
88        Encoding::Auto => decode_to_string(Encoding::Utf8, data, check)
89            .or_else(|_| decode_to_string(Encoding::Cp932, data, check))
90            .or_else(|_| decode_to_string(Encoding::Gb2312, data, check)),
91        Encoding::Utf8 => Ok(String::from_utf8(data.to_vec())?),
92        Encoding::Cp932 => {
93            let result = encoding::codec::japanese::Windows31JEncoding
94                .decode(
95                    data,
96                    if check {
97                        DecoderTrap::Strict
98                    } else {
99                        DecoderTrap::Call(|_, d, out| {
100                            if d.len() == 1 {
101                                if d[0] == 0xFF {
102                                    out.write_char('\u{f8f3}'); // PUA character for U+F8F3
103                                } else if d[0] == 0xFE {
104                                    out.write_char('\u{f8f2}'); // PUA character for U+F8F2
105                                } else if d[0] == 0xFD {
106                                    out.write_char('\u{f8f1}'); // PUA character for U+F8F1
107                                } else {
108                                    out.write_char('\u{FFFD}'); // Replacement character
109                                }
110                            } else {
111                                out.write_char('\u{FFFD}'); // Replacement character
112                            }
113                            true
114                        })
115                    },
116                )
117                .map_err(|_| anyhow::anyhow!("Failed to decode Shift-JIS"))?;
118            if result.contains('\u{FFFD}') {
119                eprintln!(
120                    "Warning: Some characters could not be decoded in Shift-JIS: {:?}",
121                    data
122                );
123                crate::COUNTER.inc_warning();
124            }
125            Ok(result)
126        }
127        Encoding::Gb2312 => {
128            let result = encoding::codec::simpchinese::GBK_ENCODING
129                .decode(
130                    data,
131                    if check {
132                        DecoderTrap::Strict
133                    } else {
134                        DecoderTrap::Replace
135                    },
136                )
137                .map_err(|_| anyhow::anyhow!("Failed to decode GB2312"))?;
138            if result.contains('\u{FFFD}') {
139                eprintln!(
140                    "Warning: Some characters could not be decoded in GB2312: {:?}",
141                    data
142                );
143                crate::COUNTER.inc_warning();
144            }
145            Ok(result)
146        }
147        Encoding::Utf16LE => Ok(encoding::codec::utf_16::UTF_16LE_ENCODING
148            .decode(
149                data,
150                if check {
151                    DecoderTrap::Strict
152                } else {
153                    DecoderTrap::Replace
154                },
155            )
156            .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16LE"))?),
157        Encoding::Utf16BE => Ok(encoding::codec::utf_16::UTF_16BE_ENCODING
158            .decode(
159                data,
160                if check {
161                    DecoderTrap::Strict
162                } else {
163                    DecoderTrap::Replace
164                },
165            )
166            .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16BE"))?),
167        #[cfg(windows)]
168        Encoding::CodePage(code_page) => Ok(super::encoding_win::decode_to_string(
169            code_page, data, check,
170        )?),
171    }
172}
173
thread_local! {
    /// Per-thread flag set by the lenient encoder traps when a character was
    /// substituted with `?`. `encode_string` clears it before encoding and
    /// reads it afterwards to decide whether to emit a warning.
    ///
    /// The `const` initializer lets the runtime skip lazy-initialization checks.
    static ENCODE_REPLACED: AtomicBool = const { AtomicBool::new(false) };
}
177
178fn jis_encoder_trap(_: &mut dyn RawEncoder, data: &str, out: &mut dyn ByteWriter) -> bool {
179    if data == "\u{f8f3}" {
180        out.write_byte(0xFF); // PUA character for U+F8F3
181    } else if data == "\u{f8f2}" {
182        out.write_byte(0xFE); // PUA character for U+F8F2
183    } else if data == "\u{f8f1}" {
184        out.write_byte(0xFD); // PUA character for U+F8F1
185    } else {
186        out.write_byte(b'?'); // Replacement character
187        ENCODE_REPLACED.with(|f| f.qsave(true));
188    }
189    true
190}
191
192fn gbk_encoder_trap(_: &mut dyn RawEncoder, _: &str, out: &mut dyn ByteWriter) -> bool {
193    out.write_byte(b'?'); // Replacement character
194    ENCODE_REPLACED.with(|f| f.qsave(true));
195    true
196}
197
198/// Encodes a string to a byte vector using the specified encoding.
199///
200/// * `check` - If true, checks for encoding errors and returns an error if any.
201pub fn encode_string(
202    encoding: Encoding,
203    data: &str,
204    check: bool,
205) -> Result<Vec<u8>, anyhow::Error> {
206    match encoding {
207        Encoding::Auto => Ok(data.as_bytes().to_vec()),
208        Encoding::Utf8 => Ok(data.as_bytes().to_vec()),
209        Encoding::Cp932 => {
210            ENCODE_REPLACED.with(|f| f.qsave(false));
211            let result = encoding::codec::japanese::Windows31JEncoding
212                .encode(
213                    data,
214                    if check {
215                        // Keep same behavior as Windows API (Code Page 932)
216                        EncoderTrap::Call(|_, data, out| {
217                            if data == "\u{f8f3}" {
218                                out.write_byte(0xFF); // PUA character for U+F8F3
219                                true
220                            } else if data == "\u{f8f2}" {
221                                out.write_byte(0xFE); // PUA character for U+F8F2
222                                true
223                            } else if data == "\u{f8f1}" {
224                                out.write_byte(0xFD); // PUA character for U+F8F1
225                                true
226                            } else {
227                                false
228                            }
229                        })
230                    } else {
231                        EncoderTrap::Call(jis_encoder_trap)
232                    },
233                )
234                .map_err(|_| anyhow::anyhow!("Failed to encode Shift-JIS"))?;
235            ENCODE_REPLACED.with(|f| {
236                if f.qload() {
237                    eprintln!(
238                        "Warning: Some characters could not be encoded in Shift-JIS: {}",
239                        data
240                    );
241                    crate::COUNTER.inc_warning();
242                }
243            });
244            Ok(result)
245        }
246        Encoding::Gb2312 => {
247            ENCODE_REPLACED.with(|f| f.qsave(false));
248            let result = encoding::codec::simpchinese::GBK_ENCODING
249                .encode(
250                    data,
251                    if check {
252                        EncoderTrap::Strict
253                    } else {
254                        EncoderTrap::Call(gbk_encoder_trap)
255                    },
256                )
257                .map_err(|_| anyhow::anyhow!("Failed to encode GB2312"))?;
258            ENCODE_REPLACED.with(|f| {
259                if f.qload() {
260                    eprintln!(
261                        "Warning: Some characters could not be encoded in GB2312: {}",
262                        data
263                    );
264                    crate::COUNTER.inc_warning();
265                }
266            });
267            Ok(result)
268        }
269        Encoding::Utf16LE => {
270            let re = utf16string::WString::<utf16string::LE>::from(data);
271            Ok(re.as_bytes().to_vec())
272        }
273        Encoding::Utf16BE => {
274            let re = utf16string::WString::<utf16string::BE>::from(data);
275            Ok(re.as_bytes().to_vec())
276        }
277        #[cfg(windows)]
278        Encoding::CodePage(code_page) => {
279            Ok(super::encoding_win::encode_string(code_page, data, check)?)
280        }
281    }
282}
283
284/// Encodes a string to a byte vector using the specified encoding with BOM.
285///
286/// * `bom` - The BOM type to use.
287/// * `check` - If true, checks for encoding errors and returns an error if any
288pub fn encode_string_with_bom(
289    encoding: Encoding,
290    data: &str,
291    check: bool,
292    bom: BomType,
293) -> Result<Vec<u8>, anyhow::Error> {
294    match bom {
295        BomType::None => encode_string(encoding, data, check),
296        BomType::Utf8 => {
297            let mut result = vec![0xEF, 0xBB, 0xBF];
298            result.extend_from_slice(data.as_bytes());
299            Ok(result)
300        }
301        BomType::Utf16LE => {
302            let mut result = vec![0xFF, 0xFE];
303            let re = utf16string::WString::<utf16string::LE>::from(data);
304            result.extend(re.as_bytes());
305            Ok(result)
306        }
307        BomType::Utf16BE => {
308            let mut result = vec![0xFE, 0xFF];
309            let re = utf16string::WString::<utf16string::BE>::from(data);
310            result.extend(re.as_bytes());
311            Ok(result)
312        }
313    }
314}
315
/// Strict decoding of known byte sequences, both with an explicit encoding
/// and through `Encoding::Auto`.
#[test]
fn test_decode_to_string() {
    const UTF8_BYTES: &[u8] = &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149];
    const SJIS_BYTES: &[u8] = &[
        130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198,
    ];
    const GBK_BYTES: &[u8] = &[214, 208, 206, 196];

    let cases: [(Encoding, &[u8], &str); 5] = [
        (Encoding::Utf8, UTF8_BYTES, "中文测试"),
        (Encoding::Cp932, SJIS_BYTES, "きゃべつそふと"),
        (Encoding::Gb2312, GBK_BYTES, "中文"),
        (Encoding::Auto, UTF8_BYTES, "中文测试"),
        (Encoding::Auto, SJIS_BYTES, "きゃべつそふと"),
    ];
    for (encoding, bytes, expected) in cases {
        assert_eq!(decode_to_string(encoding, bytes, true).unwrap(), expected);
    }
    #[cfg(windows)]
    assert_eq!(
        decode_to_string(Encoding::CodePage(936), GBK_BYTES, true).unwrap(),
        "中文"
    );
}
368
/// Strict encoding of known strings into their expected byte sequences.
#[test]
fn test_encode_string() {
    const GBK_BYTES: &[u8] = &[214, 208, 206, 196];

    let cases: [(Encoding, &str, &[u8]); 3] = [
        (
            Encoding::Utf8,
            "中文测试",
            &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149],
        ),
        (
            Encoding::Cp932,
            "きゃべつそふと",
            &[
                130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198,
            ],
        ),
        (Encoding::Gb2312, "中文", GBK_BYTES),
    ];
    for (encoding, text, expected) in cases {
        assert_eq!(encode_string(encoding, text, true).unwrap(), expected);
    }
    #[cfg(windows)]
    assert_eq!(
        encode_string(Encoding::CodePage(936), "中文", true).unwrap(),
        GBK_BYTES
    );
}
391
/// BOM detection: every plain input spells "中文" and must report the matching
/// BOM type; the `kirikiri` case covers a SimpleCrypt-wrapped UTF-16LE payload.
#[test]
fn test_decode_with_bom_detect() {
    const WITH_UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87];
    const WITH_UTF16LE_BOM: &[u8] = &[0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65];
    const WITH_UTF16BE_BOM: &[u8] = &[0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87];
    const WITHOUT_BOM: &[u8] = &[0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87];

    let cases: [(&[u8], BomType); 4] = [
        (WITH_UTF8_BOM, BomType::Utf8),
        (WITH_UTF16LE_BOM, BomType::Utf16LE),
        (WITH_UTF16BE_BOM, BomType::Utf16BE),
        (WITHOUT_BOM, BomType::None),
    ];
    for (bytes, expected_bom) in cases {
        let (text, bom) = decode_with_bom_detect(Encoding::Auto, bytes, true).unwrap();
        assert_eq!(text, "中文");
        assert_eq!(bom, expected_bom);
    }
    #[cfg(feature = "kirikiri")]
    {
        let wrapped: [u8; 19] = [
            0xFE, 0xFE, 0x01, 0xFF, 0xFE, // Header
            0x11, 0x00, 0x34, 0x00, 0x36, 0x00, 0x3a, 0x00, 0x11, 0x00, 0x0e, 0x00, 0x05, 0x00,
        ];
        let (text, bom) = decode_with_bom_detect(Encoding::Auto, &wrapped, true).unwrap();
        assert_eq!(text, "\"895\"\r\n");
        assert_eq!(bom, BomType::Utf16LE);
    }
}
426
/// Encoding "中文" with each BOM type; the BOM (when present) decides the
/// output encoding even though `Encoding::Utf8` is passed every time.
#[test]
fn test_encode_string_with_bom() {
    let cases: [(BomType, &[u8]); 4] = [
        (
            BomType::Utf8,
            &[0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
        ),
        (BomType::Utf16LE, &[0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65]),
        (BomType::Utf16BE, &[0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87]),
        (BomType::None, &[0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]),
    ];
    for (bom, expected) in cases {
        assert_eq!(
            encode_string_with_bom(Encoding::Utf8, "中文", true, bom).unwrap(),
            expected
        );
    }
}
446
/// Round-trip of the single bytes 0xFF/0xFE/0xFD through the private-use code
/// points U+F8F3/U+F8F2/U+F8F1, checked against the built-in CP932 codec and,
/// on Windows, against code page 932.
#[test]
fn shift_jis_pua_test() {
    // (lead byte, text expected for the bytes `[lead, 0x01]`)
    let pairs: [(u8, &str); 3] = [
        (0xFF, "\u{f8f3}\x01"),
        (0xFE, "\u{f8f2}\x01"),
        (0xFD, "\u{f8f1}\x01"),
    ];

    // Decoding: lenient mode maps to the PUA code point, strict mode fails.
    for (lead, text) in pairs {
        let raw = [lead, 0x01];
        #[cfg(windows)]
        assert_eq!(
            decode_to_string(Encoding::CodePage(932), &raw, false).unwrap(),
            text
        );
        assert_eq!(decode_to_string(Encoding::Cp932, &raw, false).unwrap(), text);
        #[cfg(windows)]
        assert!(decode_to_string(Encoding::CodePage(932), &raw, true).is_err());
        assert!(decode_to_string(Encoding::Cp932, &raw, true).is_err());
    }

    // Encoding: the PUA code points map back to the single byte in both modes.
    for (lead, text) in pairs {
        let raw = vec![lead, 0x01];
        #[cfg(windows)]
        assert_eq!(
            encode_string(Encoding::CodePage(932), text, false).unwrap(),
            raw
        );
        assert_eq!(encode_string(Encoding::Cp932, text, false).unwrap(), raw);
        #[cfg(windows)]
        assert_eq!(
            encode_string(Encoding::CodePage(932), text, true).unwrap(),
            raw
        );
        assert_eq!(encode_string(Encoding::Cp932, text, true).unwrap(), raw);
    }

    // A neighbouring PUA code point has no mapping and must fail in strict mode.
    let unmapped = "\u{f8f4}\x01";
    #[cfg(windows)]
    assert!(encode_string(Encoding::CodePage(932), unmapped, true).is_err());
    assert!(encode_string(Encoding::Cp932, unmapped, true).is_err());
}