1use crate::ext::atomic::*;
3use crate::types::*;
4use encoding::{ByteWriter, DecoderTrap, EncoderTrap, Encoding as EncodingTrait, RawEncoder};
5use std::sync::atomic::AtomicBool;
6
7pub fn decode_with_bom_detect(
13 encoding: Encoding,
14 data: &[u8],
15 check: bool,
16) -> Result<(String, BomType), anyhow::Error> {
17 if data.len() >= 2 {
18 if data[0] == 0xFE && data[1] == 0xFF {
19 return Ok((
20 encoding::codec::utf_16::UTF_16BE_ENCODING
21 .decode(
22 &data[2..],
23 if check {
24 DecoderTrap::Strict
25 } else {
26 DecoderTrap::Replace
27 },
28 )
29 .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16BE"))?,
30 BomType::Utf16BE,
31 ));
32 } else if data[0] == 0xFF && data[1] == 0xFE {
33 return Ok((
34 encoding::codec::utf_16::UTF_16LE_ENCODING
35 .decode(
36 &data[2..],
37 if check {
38 DecoderTrap::Strict
39 } else {
40 DecoderTrap::Replace
41 },
42 )
43 .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16LE"))?,
44 BomType::Utf16LE,
45 ));
46 }
47 }
48 if data.len() >= 3 {
49 if data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
50 return Ok((String::from_utf8(data[3..].to_vec())?, BomType::Utf8));
51 }
52 }
53 #[cfg(feature = "kirikiri")]
54 {
55 use crate::ext::io::*;
56 use crate::scripts::kirikiri::mdf::Mdf;
57 use crate::scripts::kirikiri::simple_crypt::SimpleCrypt;
58 if data.len() >= 8 && data.starts_with(b"mdf\0") {
59 let reader = MemReaderRef::new(&data[4..]);
60 let decoded = Mdf::unpack(reader)?;
61 return decode_with_bom_detect(encoding, &decoded, check);
62 }
63 if data.len() >= 5
64 && data[0] == 0xFE
65 && data[1] == 0xFE
66 && (data[2] == 0 || data[2] == 1 || data[2] == 2)
67 && data[3] == 0xFF
68 && data[4] == 0xFE
69 {
70 let crypt = data[2];
71 let reader = MemReaderRef::new(data);
72 let decoded = SimpleCrypt::unpack(crypt, reader)?;
73 return decode_with_bom_detect(encoding, &decoded, check);
74 }
75 }
76 decode_to_string(encoding, data, check).map(|s| (s, BomType::None))
77}
78
79pub fn decode_to_string(
83 encoding: Encoding,
84 data: &[u8],
85 check: bool,
86) -> Result<String, anyhow::Error> {
87 match encoding {
88 Encoding::Auto => decode_to_string(Encoding::Utf8, data, check)
89 .or_else(|_| decode_to_string(Encoding::Cp932, data, check))
90 .or_else(|_| decode_to_string(Encoding::Gb2312, data, check)),
91 Encoding::Utf8 => Ok(String::from_utf8(data.to_vec())?),
92 Encoding::Cp932 => {
93 let result = encoding::codec::japanese::Windows31JEncoding
94 .decode(
95 data,
96 if check {
97 DecoderTrap::Strict
98 } else {
99 DecoderTrap::Call(|_, d, out| {
100 if d.len() == 1 {
101 if d[0] == 0xFF {
102 out.write_char('\u{f8f3}'); } else if d[0] == 0xFE {
104 out.write_char('\u{f8f2}'); } else if d[0] == 0xFD {
106 out.write_char('\u{f8f1}'); } else {
108 out.write_char('\u{FFFD}'); }
110 } else {
111 out.write_char('\u{FFFD}'); }
113 true
114 })
115 },
116 )
117 .map_err(|_| anyhow::anyhow!("Failed to decode Shift-JIS"))?;
118 if result.contains('\u{FFFD}') {
119 eprintln!(
120 "Warning: Some characters could not be decoded in Shift-JIS: {:?}",
121 data
122 );
123 crate::COUNTER.inc_warning();
124 }
125 Ok(result)
126 }
127 Encoding::Gb2312 => {
128 let result = encoding::codec::simpchinese::GBK_ENCODING
129 .decode(
130 data,
131 if check {
132 DecoderTrap::Strict
133 } else {
134 DecoderTrap::Replace
135 },
136 )
137 .map_err(|_| anyhow::anyhow!("Failed to decode GB2312"))?;
138 if result.contains('\u{FFFD}') {
139 eprintln!(
140 "Warning: Some characters could not be decoded in GB2312: {:?}",
141 data
142 );
143 crate::COUNTER.inc_warning();
144 }
145 Ok(result)
146 }
147 Encoding::Utf16LE => Ok(encoding::codec::utf_16::UTF_16LE_ENCODING
148 .decode(
149 data,
150 if check {
151 DecoderTrap::Strict
152 } else {
153 DecoderTrap::Replace
154 },
155 )
156 .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16LE"))?),
157 #[cfg(windows)]
158 Encoding::CodePage(code_page) => Ok(super::encoding_win::decode_to_string(
159 code_page, data, check,
160 )?),
161 }
162}
163
thread_local! {
    /// Per-thread flag recording that an encoder trap substituted `?` for a
    /// character the target encoding could not represent.
    ///
    /// `encode_string` clears it before encoding and reads it afterwards to decide
    /// whether to emit a warning; `jis_encoder_trap` and `gbk_encoder_trap` set it.
    // NOTE(review): presumably a thread-local is used because the trap callbacks are
    // plain functions with no way to capture state — confirm.
    static ENCODE_REPLACED: AtomicBool = AtomicBool::new(false);
}
167
168fn jis_encoder_trap(_: &mut dyn RawEncoder, data: &str, out: &mut dyn ByteWriter) -> bool {
169 if data == "\u{f8f3}" {
170 out.write_byte(0xFF); } else if data == "\u{f8f2}" {
172 out.write_byte(0xFE); } else if data == "\u{f8f1}" {
174 out.write_byte(0xFD); } else {
176 out.write_byte(b'?'); ENCODE_REPLACED.with(|f| f.qsave(true));
178 }
179 true
180}
181
182fn gbk_encoder_trap(_: &mut dyn RawEncoder, _: &str, out: &mut dyn ByteWriter) -> bool {
183 out.write_byte(b'?'); ENCODE_REPLACED.with(|f| f.qsave(true));
185 true
186}
187
188pub fn encode_string(
192 encoding: Encoding,
193 data: &str,
194 check: bool,
195) -> Result<Vec<u8>, anyhow::Error> {
196 match encoding {
197 Encoding::Auto => Ok(data.as_bytes().to_vec()),
198 Encoding::Utf8 => Ok(data.as_bytes().to_vec()),
199 Encoding::Cp932 => {
200 ENCODE_REPLACED.with(|f| f.qsave(false));
201 let result = encoding::codec::japanese::Windows31JEncoding
202 .encode(
203 data,
204 if check {
205 EncoderTrap::Call(|_, data, out| {
207 if data == "\u{f8f3}" {
208 out.write_byte(0xFF); true
210 } else if data == "\u{f8f2}" {
211 out.write_byte(0xFE); true
213 } else if data == "\u{f8f1}" {
214 out.write_byte(0xFD); true
216 } else {
217 false
218 }
219 })
220 } else {
221 EncoderTrap::Call(jis_encoder_trap)
222 },
223 )
224 .map_err(|_| anyhow::anyhow!("Failed to encode Shift-JIS"))?;
225 ENCODE_REPLACED.with(|f| {
226 if f.qload() {
227 eprintln!(
228 "Warning: Some characters could not be encoded in Shift-JIS: {}",
229 data
230 );
231 crate::COUNTER.inc_warning();
232 }
233 });
234 Ok(result)
235 }
236 Encoding::Gb2312 => {
237 ENCODE_REPLACED.with(|f| f.qsave(false));
238 let result = encoding::codec::simpchinese::GBK_ENCODING
239 .encode(
240 data,
241 if check {
242 EncoderTrap::Strict
243 } else {
244 EncoderTrap::Call(gbk_encoder_trap)
245 },
246 )
247 .map_err(|_| anyhow::anyhow!("Failed to encode GB2312"))?;
248 ENCODE_REPLACED.with(|f| {
249 if f.qload() {
250 eprintln!(
251 "Warning: Some characters could not be encoded in GB2312: {}",
252 data
253 );
254 crate::COUNTER.inc_warning();
255 }
256 });
257 Ok(result)
258 }
259 Encoding::Utf16LE => {
260 let re = utf16string::WString::<utf16string::LE>::from(data);
261 Ok(re.as_bytes().to_vec())
262 }
263 #[cfg(windows)]
264 Encoding::CodePage(code_page) => {
265 Ok(super::encoding_win::encode_string(code_page, data, check)?)
266 }
267 }
268}
269
270pub fn encode_string_with_bom(
275 encoding: Encoding,
276 data: &str,
277 check: bool,
278 bom: BomType,
279) -> Result<Vec<u8>, anyhow::Error> {
280 match bom {
281 BomType::None => encode_string(encoding, data, check),
282 BomType::Utf8 => {
283 let mut result = vec![0xEF, 0xBB, 0xBF];
284 result.extend_from_slice(data.as_bytes());
285 Ok(result)
286 }
287 BomType::Utf16LE => {
288 let mut result = vec![0xFF, 0xFE];
289 let re = utf16string::WString::<utf16string::LE>::from(data);
290 result.extend(re.as_bytes());
291 Ok(result)
292 }
293 BomType::Utf16BE => {
294 let mut result = vec![0xFE, 0xFF];
295 let re = utf16string::WString::<utf16string::BE>::from(data);
296 result.extend(re.as_bytes());
297 Ok(result)
298 }
299 }
300}
301
/// Strict decoding of known UTF-8, CP932 and GBK samples, both with an explicit
/// encoding and through `Encoding::Auto`.
#[test]
fn test_decode_to_string() {
    // "中文测试" in UTF-8.
    const UTF8_SAMPLE: &[u8] = &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149];
    // "きゃべつそふと" in CP932.
    const CP932_SAMPLE: &[u8] = &[
        130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198,
    ];
    // "中文" in GBK.
    const GBK_SAMPLE: &[u8] = &[214, 208, 206, 196];

    let utf8_text = decode_to_string(Encoding::Utf8, UTF8_SAMPLE, true).unwrap();
    assert_eq!(utf8_text, "中文测试".to_string());

    let cp932_text = decode_to_string(Encoding::Cp932, CP932_SAMPLE, true).unwrap();
    assert_eq!(cp932_text, "きゃべつそふと".to_string());

    let gbk_text = decode_to_string(Encoding::Gb2312, GBK_SAMPLE, true).unwrap();
    assert_eq!(gbk_text, "中文".to_string());

    // Auto-detection must pick UTF-8 first and fall back to CP932.
    let auto_utf8 = decode_to_string(Encoding::Auto, UTF8_SAMPLE, true).unwrap();
    assert_eq!(auto_utf8, "中文测试".to_string());

    let auto_cp932 = decode_to_string(Encoding::Auto, CP932_SAMPLE, true).unwrap();
    assert_eq!(auto_cp932, "きゃべつそふと".to_string());

    #[cfg(windows)]
    assert_eq!(
        decode_to_string(Encoding::CodePage(936), GBK_SAMPLE, true).unwrap(),
        "中文".to_string()
    );
}
354
/// Strict encoding of sample text to UTF-8, CP932 and GBK.
#[test]
fn test_encode_string() {
    let utf8_bytes = encode_string(Encoding::Utf8, "中文测试", true).unwrap();
    assert_eq!(
        utf8_bytes,
        vec![228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149]
    );

    let cp932_bytes = encode_string(Encoding::Cp932, "きゃべつそふと", true).unwrap();
    assert_eq!(
        cp932_bytes,
        vec![130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198]
    );

    let gbk_bytes = encode_string(Encoding::Gb2312, "中文", true).unwrap();
    assert_eq!(gbk_bytes, vec![214, 208, 206, 196]);

    #[cfg(windows)]
    assert_eq!(
        encode_string(Encoding::CodePage(936), "中文", true).unwrap(),
        vec![214, 208, 206, 196]
    );
}
377
/// BOM detection for UTF-8, UTF-16LE, UTF-16BE and BOM-less input, plus the
/// kirikiri SimpleCrypt header when that feature is enabled.
#[test]
fn test_decode_with_bom_detect() {
    /// Decodes `bytes` with auto-detection in strict mode and checks both results.
    fn expect_decoded(bytes: &[u8], text: &str, bom: BomType) {
        let (decoded, detected) = decode_with_bom_detect(Encoding::Auto, bytes, true).unwrap();
        assert_eq!(decoded, text);
        assert_eq!(detected, bom);
    }

    expect_decoded(
        &[0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
        "中文",
        BomType::Utf8,
    );
    expect_decoded(
        &[0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65],
        "中文",
        BomType::Utf16LE,
    );
    expect_decoded(
        &[0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87],
        "中文",
        BomType::Utf16BE,
    );
    expect_decoded(
        &[0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
        "中文",
        BomType::None,
    );
    #[cfg(feature = "kirikiri")]
    {
        // SimpleCrypt mode 1 payload that unpacks to a UTF-16LE document.
        expect_decoded(
            &[
                0xFE, 0xFE, 0x01, 0xFF, 0xFE, 0x11, 0x00, 0x34, 0x00, 0x36, 0x00, 0x3a, 0x00,
                0x11, 0x00, 0x0e, 0x00, 0x05, 0x00,
            ],
            "\"895\"\r\n",
            BomType::Utf16LE,
        );
    }
}
412
/// Each BOM type yields the matching prefix and payload encoding for "中文".
#[test]
fn test_encode_string_with_bom() {
    /// Encodes the fixed sample with `Encoding::Utf8`, strict mode, and the given BOM.
    fn encoded_with(bom: BomType) -> Vec<u8> {
        encode_string_with_bom(Encoding::Utf8, "中文", true, bom).unwrap()
    }

    assert_eq!(
        encoded_with(BomType::Utf8),
        vec![0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]
    );
    assert_eq!(
        encoded_with(BomType::Utf16LE),
        vec![0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65]
    );
    assert_eq!(
        encoded_with(BomType::Utf16BE),
        vec![0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87]
    );
    assert_eq!(
        encoded_with(BomType::None),
        vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]
    );
}
432
/// Round-trip of the lone CP932 bytes 0xFF / 0xFE / 0xFD through the private-use
/// characters U+F8F3 / U+F8F2 / U+F8F1, in both lenient and strict modes.
#[test]
fn shift_jis_pua_test() {
    // (stray byte, private-use character it maps to)
    const PUA_PAIRS: [(u8, char); 3] = [
        (0xFF, '\u{f8f3}'),
        (0xFE, '\u{f8f2}'),
        (0xFD, '\u{f8f1}'),
    ];

    // Decoding: lenient mode maps the byte to its PUA character, strict mode rejects it.
    for &(byte, pua) in PUA_PAIRS.iter() {
        let raw = [byte, 0x01];
        let expected = format!("{}\x01", pua);
        #[cfg(windows)]
        assert_eq!(
            decode_to_string(Encoding::CodePage(932), &raw, false).unwrap(),
            expected
        );
        assert_eq!(
            decode_to_string(Encoding::Cp932, &raw, false).unwrap(),
            expected
        );
        #[cfg(windows)]
        assert!(decode_to_string(Encoding::CodePage(932), &raw, true).is_err());
        assert!(decode_to_string(Encoding::Cp932, &raw, true).is_err());
    }

    // Encoding: the PUA character maps back to the byte in both modes.
    for &(byte, pua) in PUA_PAIRS.iter() {
        let text = format!("{}\x01", pua);
        let expected = vec![byte, 0x01];
        #[cfg(windows)]
        assert_eq!(
            encode_string(Encoding::CodePage(932), &text, false).unwrap(),
            expected
        );
        assert_eq!(
            encode_string(Encoding::Cp932, &text, false).unwrap(),
            expected
        );
        #[cfg(windows)]
        assert_eq!(
            encode_string(Encoding::CodePage(932), &text, true).unwrap(),
            expected
        );
        assert_eq!(
            encode_string(Encoding::Cp932, &text, true).unwrap(),
            expected
        );
    }

    // A neighbouring private-use character has no mapping and must fail in strict mode.
    let unmapped = "\u{f8f4}\x01";
    #[cfg(windows)]
    assert!(encode_string(Encoding::CodePage(932), unmapped, true).is_err());
    assert!(encode_string(Encoding::Cp932, unmapped, true).is_err());
}