1use crate::ext::atomic::*;
3use crate::types::*;
4use encoding::{ByteWriter, DecoderTrap, EncoderTrap, Encoding as EncodingTrait, RawEncoder};
5use std::sync::atomic::AtomicBool;
6
7pub fn decode_with_bom_detect(
13 encoding: Encoding,
14 data: &[u8],
15 check: bool,
16) -> Result<(String, BomType), anyhow::Error> {
17 if data.len() >= 2 {
18 if data[0] == 0xFE && data[1] == 0xFF {
19 return Ok((
20 encoding::codec::utf_16::UTF_16BE_ENCODING
21 .decode(
22 &data[2..],
23 if check {
24 DecoderTrap::Strict
25 } else {
26 DecoderTrap::Replace
27 },
28 )
29 .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16BE"))?,
30 BomType::Utf16BE,
31 ));
32 } else if data[0] == 0xFF && data[1] == 0xFE {
33 return Ok((
34 encoding::codec::utf_16::UTF_16LE_ENCODING
35 .decode(
36 &data[2..],
37 if check {
38 DecoderTrap::Strict
39 } else {
40 DecoderTrap::Replace
41 },
42 )
43 .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16LE"))?,
44 BomType::Utf16LE,
45 ));
46 }
47 }
48 if data.len() >= 3 {
49 if data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
50 return Ok((String::from_utf8(data[3..].to_vec())?, BomType::Utf8));
51 }
52 }
53 #[cfg(feature = "kirikiri")]
54 {
55 use crate::ext::io::*;
56 use crate::scripts::kirikiri::mdf::Mdf;
57 use crate::scripts::kirikiri::simple_crypt::SimpleCrypt;
58 if data.len() >= 8 && data.starts_with(b"mdf\0") {
59 let reader = MemReaderRef::new(&data[4..]);
60 let decoded = Mdf::unpack(reader)?;
61 return decode_with_bom_detect(encoding, &decoded, check);
62 }
63 if data.len() >= 5
64 && data[0] == 0xFE
65 && data[1] == 0xFE
66 && (data[2] == 0 || data[2] == 1 || data[2] == 2)
67 && data[3] == 0xFF
68 && data[4] == 0xFE
69 {
70 let crypt = data[2];
71 let reader = MemReaderRef::new(data);
72 let decoded = SimpleCrypt::unpack(crypt, reader)?;
73 return decode_with_bom_detect(encoding, &decoded, check);
74 }
75 }
76 decode_to_string(encoding, data, check).map(|s| (s, BomType::None))
77}
78
79pub fn decode_to_string(
83 encoding: Encoding,
84 data: &[u8],
85 check: bool,
86) -> Result<String, anyhow::Error> {
87 match encoding {
88 Encoding::Auto => decode_to_string(Encoding::Utf8, data, check)
89 .or_else(|_| decode_to_string(Encoding::Cp932, data, check))
90 .or_else(|_| decode_to_string(Encoding::Gb2312, data, check)),
91 Encoding::Utf8 => Ok(String::from_utf8(data.to_vec())?),
92 Encoding::Cp932 => {
93 let result = encoding::codec::japanese::Windows31JEncoding
94 .decode(
95 data,
96 if check {
97 DecoderTrap::Strict
98 } else {
99 DecoderTrap::Call(|_, d, out| {
100 if d.len() == 1 {
101 if d[0] == 0xFF {
102 out.write_char('\u{f8f3}'); } else if d[0] == 0xFE {
104 out.write_char('\u{f8f2}'); } else if d[0] == 0xFD {
106 out.write_char('\u{f8f1}'); } else {
108 out.write_char('\u{FFFD}'); }
110 } else {
111 out.write_char('\u{FFFD}'); }
113 true
114 })
115 },
116 )
117 .map_err(|_| anyhow::anyhow!("Failed to decode Shift-JIS"))?;
118 if result.contains('\u{FFFD}') {
119 eprintln!(
120 "Warning: Some characters could not be decoded in Shift-JIS: {:?}",
121 data
122 );
123 crate::COUNTER.inc_warning();
124 }
125 Ok(result)
126 }
127 Encoding::Gb2312 => {
128 let result = encoding::codec::simpchinese::GBK_ENCODING
129 .decode(
130 data,
131 if check {
132 DecoderTrap::Strict
133 } else {
134 DecoderTrap::Replace
135 },
136 )
137 .map_err(|_| anyhow::anyhow!("Failed to decode GB2312"))?;
138 if result.contains('\u{FFFD}') {
139 eprintln!(
140 "Warning: Some characters could not be decoded in GB2312: {:?}",
141 data
142 );
143 crate::COUNTER.inc_warning();
144 }
145 Ok(result)
146 }
147 #[cfg(windows)]
148 Encoding::CodePage(code_page) => Ok(super::encoding_win::decode_to_string(
149 code_page, data, check,
150 )?),
151 }
152}
153
thread_local! {
    /// Per-thread flag, initially `false`. The encoder traps in this file set
    /// it when they substitute `?` for an unencodable character, and
    /// `encode_string` clears it before encoding and reads it afterwards.
    static ENCODE_REPLACED: AtomicBool = const { AtomicBool::new(false) };
}
157
158fn jis_encoder_trap(_: &mut dyn RawEncoder, data: &str, out: &mut dyn ByteWriter) -> bool {
159 if data == "\u{f8f3}" {
160 out.write_byte(0xFF); } else if data == "\u{f8f2}" {
162 out.write_byte(0xFE); } else if data == "\u{f8f1}" {
164 out.write_byte(0xFD); } else {
166 out.write_byte(b'?'); ENCODE_REPLACED.with(|f| f.qsave(true));
168 }
169 true
170}
171
172fn gbk_encoder_trap(_: &mut dyn RawEncoder, _: &str, out: &mut dyn ByteWriter) -> bool {
173 out.write_byte(b'?'); ENCODE_REPLACED.with(|f| f.qsave(true));
175 true
176}
177
178pub fn encode_string(
182 encoding: Encoding,
183 data: &str,
184 check: bool,
185) -> Result<Vec<u8>, anyhow::Error> {
186 match encoding {
187 Encoding::Auto => Ok(data.as_bytes().to_vec()),
188 Encoding::Utf8 => Ok(data.as_bytes().to_vec()),
189 Encoding::Cp932 => {
190 ENCODE_REPLACED.with(|f| f.qsave(false));
191 let result = encoding::codec::japanese::Windows31JEncoding
192 .encode(
193 data,
194 if check {
195 EncoderTrap::Call(|_, data, out| {
197 if data == "\u{f8f3}" {
198 out.write_byte(0xFF); true
200 } else if data == "\u{f8f2}" {
201 out.write_byte(0xFE); true
203 } else if data == "\u{f8f1}" {
204 out.write_byte(0xFD); true
206 } else {
207 false
208 }
209 })
210 } else {
211 EncoderTrap::Call(jis_encoder_trap)
212 },
213 )
214 .map_err(|_| anyhow::anyhow!("Failed to encode Shift-JIS"))?;
215 ENCODE_REPLACED.with(|f| {
216 if f.qload() {
217 eprintln!(
218 "Warning: Some characters could not be encoded in Shift-JIS: {}",
219 data
220 );
221 crate::COUNTER.inc_warning();
222 }
223 });
224 Ok(result)
225 }
226 Encoding::Gb2312 => {
227 ENCODE_REPLACED.with(|f| f.qsave(false));
228 let result = encoding::codec::simpchinese::GBK_ENCODING
229 .encode(
230 data,
231 if check {
232 EncoderTrap::Strict
233 } else {
234 EncoderTrap::Call(gbk_encoder_trap)
235 },
236 )
237 .map_err(|_| anyhow::anyhow!("Failed to encode GB2312"))?;
238 ENCODE_REPLACED.with(|f| {
239 if f.qload() {
240 eprintln!(
241 "Warning: Some characters could not be encoded in GB2312: {}",
242 data
243 );
244 crate::COUNTER.inc_warning();
245 }
246 });
247 Ok(result)
248 }
249 #[cfg(windows)]
250 Encoding::CodePage(code_page) => {
251 Ok(super::encoding_win::encode_string(code_page, data, check)?)
252 }
253 }
254}
255
256pub fn encode_string_with_bom(
261 encoding: Encoding,
262 data: &str,
263 check: bool,
264 bom: BomType,
265) -> Result<Vec<u8>, anyhow::Error> {
266 match bom {
267 BomType::None => encode_string(encoding, data, check),
268 BomType::Utf8 => {
269 let mut result = vec![0xEF, 0xBB, 0xBF];
270 result.extend_from_slice(data.as_bytes());
271 Ok(result)
272 }
273 BomType::Utf16LE => {
274 let mut result = vec![0xFF, 0xFE];
275 let re = utf16string::WString::<utf16string::LE>::from(data);
276 result.extend(re.as_bytes());
277 Ok(result)
278 }
279 BomType::Utf16BE => {
280 let mut result = vec![0xFE, 0xFF];
281 let re = utf16string::WString::<utf16string::BE>::from(data);
282 result.extend(re.as_bytes());
283 Ok(result)
284 }
285 }
286}
287
/// Strict decoding of known byte sequences, both with an explicit encoding
/// and through `Encoding::Auto`.
#[test]
fn test_decode_to_string() {
    const ZH_UTF8: &[u8] = &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149];
    const JA_SJIS: &[u8] = &[
        130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198,
    ];
    const ZH_GBK: &[u8] = &[214, 208, 206, 196];

    let cases = [
        (Encoding::Utf8, ZH_UTF8, "中文测试"),
        (Encoding::Cp932, JA_SJIS, "きゃべつそふと"),
        (Encoding::Gb2312, ZH_GBK, "中文"),
        (Encoding::Auto, ZH_UTF8, "中文测试"),
        (Encoding::Auto, JA_SJIS, "きゃべつそふと"),
    ];
    for (codec, bytes, expected) in cases {
        let decoded = decode_to_string(codec, bytes, true).unwrap();
        assert_eq!(decoded, expected.to_string());
    }

    #[cfg(windows)]
    {
        let decoded = decode_to_string(Encoding::CodePage(936), ZH_GBK, true).unwrap();
        assert_eq!(decoded, "中文".to_string());
    }
}
340
/// Strict encoding of known strings into UTF-8, CP932 and GB2312 bytes.
#[test]
fn test_encode_string() {
    const ZH_GBK: &[u8] = &[214, 208, 206, 196];

    let cases: [(Encoding, &str, &[u8]); 3] = [
        (
            Encoding::Utf8,
            "中文测试",
            &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149],
        ),
        (
            Encoding::Cp932,
            "きゃべつそふと",
            &[
                130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198,
            ],
        ),
        (Encoding::Gb2312, "中文", ZH_GBK),
    ];
    for (codec, text, expected) in cases {
        let encoded = encode_string(codec, text, true).unwrap();
        assert_eq!(encoded, expected.to_vec());
    }

    #[cfg(windows)]
    {
        let encoded = encode_string(Encoding::CodePage(936), "中文", true).unwrap();
        assert_eq!(encoded, ZH_GBK.to_vec());
    }
}
363
/// BOM sniffing: each marker yields the matching `BomType`, and input without
/// a marker falls back to the requested encoding.
#[test]
fn test_decode_with_bom_detect() {
    // Every case decodes strictly with `Encoding::Auto` as the fallback.
    let detect = |bytes: &[u8]| decode_with_bom_detect(Encoding::Auto, bytes, true).unwrap();

    assert_eq!(
        detect(&[0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]),
        ("中文".to_string(), BomType::Utf8)
    );
    assert_eq!(
        detect(&[0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65]),
        ("中文".to_string(), BomType::Utf16LE)
    );
    assert_eq!(
        detect(&[0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87]),
        ("中文".to_string(), BomType::Utf16BE)
    );
    assert_eq!(
        detect(&[0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]),
        ("中文".to_string(), BomType::None)
    );

    #[cfg(feature = "kirikiri")]
    {
        // SimpleCrypt header (mode 1) followed by the encrypted payload.
        let encrypted: [u8; 19] = [
            0xFE, 0xFE, 0x01, 0xFF, 0xFE, 0x11, 0x00, 0x34, 0x00, 0x36, 0x00, 0x3a, 0x00, 0x11,
            0x00, 0x0e, 0x00, 0x05, 0x00,
        ];
        assert_eq!(
            detect(&encrypted),
            ("\"895\"\r\n".to_string(), BomType::Utf16LE)
        );
    }
}
398
/// Each `BomType` produces the expected marker followed by the text in the
/// encoding that the marker implies.
#[test]
fn test_encode_string_with_bom() {
    let cases: [(BomType, &[u8]); 4] = [
        (
            BomType::Utf8,
            &[0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
        ),
        (BomType::Utf16LE, &[0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65]),
        (BomType::Utf16BE, &[0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87]),
        (BomType::None, &[0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]),
    ];
    for (bom, expected) in cases {
        let encoded = encode_string_with_bom(Encoding::Utf8, "中文", true, bom).unwrap();
        assert_eq!(encoded, expected.to_vec());
    }
}
418
/// Round-trips the stray CP932 bytes 0xFF / 0xFE / 0xFD through their
/// private-use code points U+F8F3 / U+F8F2 / U+F8F1, and checks that strict
/// mode rejects the raw bytes on decode and an unmapped private-use
/// character on encode.
#[test]
fn shift_jis_pua_test() {
    let pairs: [(u8, char); 3] = [(0xFF, '\u{f8f3}'), (0xFE, '\u{f8f2}'), (0xFD, '\u{f8f1}')];

    // Decoding: lenient mode maps the byte to its PUA character, strict mode fails.
    for &(byte, pua) in &pairs {
        let bytes = [byte, 0x01];
        let expected = format!("{}\x01", pua);
        #[cfg(windows)]
        assert_eq!(
            decode_to_string(Encoding::CodePage(932), &bytes, false).unwrap(),
            expected
        );
        assert_eq!(
            decode_to_string(Encoding::Cp932, &bytes, false).unwrap(),
            expected
        );
        #[cfg(windows)]
        assert!(decode_to_string(Encoding::CodePage(932), &bytes, true).is_err());
        assert!(decode_to_string(Encoding::Cp932, &bytes, true).is_err());
    }

    // Encoding: the PUA character maps back to the byte in both modes.
    for &(byte, pua) in &pairs {
        let text = format!("{}\x01", pua);
        let expected = vec![byte, 0x01];
        for check in [false, true] {
            #[cfg(windows)]
            assert_eq!(
                encode_string(Encoding::CodePage(932), &text, check).unwrap(),
                expected
            );
            assert_eq!(
                encode_string(Encoding::Cp932, &text, check).unwrap(),
                expected
            );
        }
    }

    // U+F8F4 has no byte mapping, so strict encoding must fail.
    let unmapped = "\u{f8f4}\x01";
    #[cfg(windows)]
    assert!(encode_string(Encoding::CodePage(932), unmapped, true).is_err());
    assert!(encode_string(Encoding::Cp932, unmapped, true).is_err());
}