1use crate::ext::atomic::*;
3use crate::types::*;
4use encoding::{ByteWriter, DecoderTrap, EncoderTrap, Encoding as EncodingTrait, RawEncoder};
5use std::sync::atomic::AtomicBool;
6
7pub fn decode_with_bom_detect(
13 encoding: Encoding,
14 data: &[u8],
15 check: bool,
16) -> Result<(String, BomType), anyhow::Error> {
17 if data.len() >= 2 {
18 if data[0] == 0xFE && data[1] == 0xFF {
19 return Ok((
20 encoding::codec::utf_16::UTF_16BE_ENCODING
21 .decode(
22 &data[2..],
23 if check {
24 DecoderTrap::Strict
25 } else {
26 DecoderTrap::Replace
27 },
28 )
29 .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16BE"))?,
30 BomType::Utf16BE,
31 ));
32 } else if data[0] == 0xFF && data[1] == 0xFE {
33 return Ok((
34 encoding::codec::utf_16::UTF_16LE_ENCODING
35 .decode(
36 &data[2..],
37 if check {
38 DecoderTrap::Strict
39 } else {
40 DecoderTrap::Replace
41 },
42 )
43 .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16LE"))?,
44 BomType::Utf16LE,
45 ));
46 }
47 }
48 if data.len() >= 3 {
49 if data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
50 return Ok((String::from_utf8(data[3..].to_vec())?, BomType::Utf8));
51 }
52 }
53 #[cfg(feature = "kirikiri")]
54 {
55 use crate::ext::io::*;
56 use crate::scripts::kirikiri::mdf::Mdf;
57 use crate::scripts::kirikiri::simple_crypt::SimpleCrypt;
58 if data.len() >= 8 && data.starts_with(b"mdf\0") {
59 let reader = MemReaderRef::new(&data[4..]);
60 let decoded = Mdf::unpack(reader)?;
61 return decode_with_bom_detect(encoding, &decoded, check);
62 }
63 if data.len() >= 5
64 && data[0] == 0xFE
65 && data[1] == 0xFE
66 && (data[2] == 0 || data[2] == 1 || data[2] == 2)
67 && data[3] == 0xFF
68 && data[4] == 0xFE
69 {
70 let crypt = data[2];
71 let reader = MemReaderRef::new(data);
72 let decoded = SimpleCrypt::unpack(crypt, reader)?;
73 return decode_with_bom_detect(encoding, &decoded, check);
74 }
75 }
76 decode_to_string(encoding, data, check).map(|s| (s, BomType::None))
77}
78
79pub fn decode_to_string(
83 encoding: Encoding,
84 data: &[u8],
85 check: bool,
86) -> Result<String, anyhow::Error> {
87 match encoding {
88 Encoding::Auto => decode_to_string(Encoding::Utf8, data, check)
89 .or_else(|_| decode_to_string(Encoding::Cp932, data, check))
90 .or_else(|_| decode_to_string(Encoding::Gb2312, data, check)),
91 Encoding::Utf8 => Ok(String::from_utf8(data.to_vec())?),
92 Encoding::Cp932 => {
93 let result = encoding::codec::japanese::Windows31JEncoding
94 .decode(
95 data,
96 if check {
97 DecoderTrap::Strict
98 } else {
99 DecoderTrap::Call(|_, d, out| {
100 if d.len() == 1 {
101 if d[0] == 0xFF {
102 out.write_char('\u{f8f3}'); } else if d[0] == 0xFE {
104 out.write_char('\u{f8f2}'); } else if d[0] == 0xFD {
106 out.write_char('\u{f8f1}'); } else {
108 out.write_char('\u{FFFD}'); }
110 } else {
111 out.write_char('\u{FFFD}'); }
113 true
114 })
115 },
116 )
117 .map_err(|_| anyhow::anyhow!("Failed to decode Shift-JIS"))?;
118 if result.contains('\u{FFFD}') {
119 eprintln!(
120 "Warning: Some characters could not be decoded in Shift-JIS: {:?}",
121 data
122 );
123 crate::COUNTER.inc_warning();
124 }
125 Ok(result)
126 }
127 Encoding::Gb2312 => {
128 let result = encoding::codec::simpchinese::GBK_ENCODING
129 .decode(
130 data,
131 if check {
132 DecoderTrap::Strict
133 } else {
134 DecoderTrap::Replace
135 },
136 )
137 .map_err(|_| anyhow::anyhow!("Failed to decode GB2312"))?;
138 if result.contains('\u{FFFD}') {
139 eprintln!(
140 "Warning: Some characters could not be decoded in GB2312: {:?}",
141 data
142 );
143 crate::COUNTER.inc_warning();
144 }
145 Ok(result)
146 }
147 Encoding::Utf16LE => Ok(encoding::codec::utf_16::UTF_16LE_ENCODING
148 .decode(
149 data,
150 if check {
151 DecoderTrap::Strict
152 } else {
153 DecoderTrap::Replace
154 },
155 )
156 .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16LE"))?),
157 Encoding::Utf16BE => Ok(encoding::codec::utf_16::UTF_16BE_ENCODING
158 .decode(
159 data,
160 if check {
161 DecoderTrap::Strict
162 } else {
163 DecoderTrap::Replace
164 },
165 )
166 .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16BE"))?),
167 #[cfg(windows)]
168 Encoding::CodePage(code_page) => Ok(super::encoding_win::decode_to_string(
169 code_page, data, check,
170 )?),
171 }
172}
173
thread_local! {
    /// Per-thread flag recording that a lenient encoder trap substituted `?`
    /// for a character the target encoding cannot represent.
    ///
    /// `encode_string` clears it before encoding and reads it afterwards to
    /// decide whether to print a warning; `jis_encoder_trap` and
    /// `gbk_encoder_trap` set it. The encoder traps are plain function
    /// pointers, so this flag is how they report back to `encode_string`.
    static ENCODE_REPLACED: AtomicBool = AtomicBool::new(false);
}
177
178fn jis_encoder_trap(_: &mut dyn RawEncoder, data: &str, out: &mut dyn ByteWriter) -> bool {
179 if data == "\u{f8f3}" {
180 out.write_byte(0xFF); } else if data == "\u{f8f2}" {
182 out.write_byte(0xFE); } else if data == "\u{f8f1}" {
184 out.write_byte(0xFD); } else {
186 out.write_byte(b'?'); ENCODE_REPLACED.with(|f| f.qsave(true));
188 }
189 true
190}
191
192fn gbk_encoder_trap(_: &mut dyn RawEncoder, _: &str, out: &mut dyn ByteWriter) -> bool {
193 out.write_byte(b'?'); ENCODE_REPLACED.with(|f| f.qsave(true));
195 true
196}
197
198pub fn encode_string(
202 encoding: Encoding,
203 data: &str,
204 check: bool,
205) -> Result<Vec<u8>, anyhow::Error> {
206 match encoding {
207 Encoding::Auto => Ok(data.as_bytes().to_vec()),
208 Encoding::Utf8 => Ok(data.as_bytes().to_vec()),
209 Encoding::Cp932 => {
210 ENCODE_REPLACED.with(|f| f.qsave(false));
211 let result = encoding::codec::japanese::Windows31JEncoding
212 .encode(
213 data,
214 if check {
215 EncoderTrap::Call(|_, data, out| {
217 if data == "\u{f8f3}" {
218 out.write_byte(0xFF); true
220 } else if data == "\u{f8f2}" {
221 out.write_byte(0xFE); true
223 } else if data == "\u{f8f1}" {
224 out.write_byte(0xFD); true
226 } else {
227 false
228 }
229 })
230 } else {
231 EncoderTrap::Call(jis_encoder_trap)
232 },
233 )
234 .map_err(|_| anyhow::anyhow!("Failed to encode Shift-JIS"))?;
235 ENCODE_REPLACED.with(|f| {
236 if f.qload() {
237 eprintln!(
238 "Warning: Some characters could not be encoded in Shift-JIS: {}",
239 data
240 );
241 crate::COUNTER.inc_warning();
242 }
243 });
244 Ok(result)
245 }
246 Encoding::Gb2312 => {
247 ENCODE_REPLACED.with(|f| f.qsave(false));
248 let result = encoding::codec::simpchinese::GBK_ENCODING
249 .encode(
250 data,
251 if check {
252 EncoderTrap::Strict
253 } else {
254 EncoderTrap::Call(gbk_encoder_trap)
255 },
256 )
257 .map_err(|_| anyhow::anyhow!("Failed to encode GB2312"))?;
258 ENCODE_REPLACED.with(|f| {
259 if f.qload() {
260 eprintln!(
261 "Warning: Some characters could not be encoded in GB2312: {}",
262 data
263 );
264 crate::COUNTER.inc_warning();
265 }
266 });
267 Ok(result)
268 }
269 Encoding::Utf16LE => {
270 let re = utf16string::WString::<utf16string::LE>::from(data);
271 Ok(re.as_bytes().to_vec())
272 }
273 Encoding::Utf16BE => {
274 let re = utf16string::WString::<utf16string::BE>::from(data);
275 Ok(re.as_bytes().to_vec())
276 }
277 #[cfg(windows)]
278 Encoding::CodePage(code_page) => {
279 Ok(super::encoding_win::encode_string(code_page, data, check)?)
280 }
281 }
282}
283
284pub fn encode_string_with_bom(
289 encoding: Encoding,
290 data: &str,
291 check: bool,
292 bom: BomType,
293) -> Result<Vec<u8>, anyhow::Error> {
294 match bom {
295 BomType::None => encode_string(encoding, data, check),
296 BomType::Utf8 => {
297 let mut result = vec![0xEF, 0xBB, 0xBF];
298 result.extend_from_slice(data.as_bytes());
299 Ok(result)
300 }
301 BomType::Utf16LE => {
302 let mut result = vec![0xFF, 0xFE];
303 let re = utf16string::WString::<utf16string::LE>::from(data);
304 result.extend(re.as_bytes());
305 Ok(result)
306 }
307 BomType::Utf16BE => {
308 let mut result = vec![0xFE, 0xFF];
309 let re = utf16string::WString::<utf16string::BE>::from(data);
310 result.extend(re.as_bytes());
311 Ok(result)
312 }
313 }
314}
315
/// Strict decoding of known byte sequences, including `Encoding::Auto`
/// detection of UTF-8 and Shift-JIS input.
#[test]
fn test_decode_to_string() {
    const UTF8_SAMPLE: &[u8] = &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149];
    const SJIS_SAMPLE: &[u8] = &[
        130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198,
    ];
    const GB_SAMPLE: &[u8] = &[214, 208, 206, 196];

    let cases = [
        (Encoding::Utf8, UTF8_SAMPLE, "中文测试"),
        (Encoding::Cp932, SJIS_SAMPLE, "きゃべつそふと"),
        (Encoding::Gb2312, GB_SAMPLE, "中文"),
        (Encoding::Auto, UTF8_SAMPLE, "中文测试"),
        (Encoding::Auto, SJIS_SAMPLE, "きゃべつそふと"),
    ];
    for (source_encoding, bytes, expected) in cases {
        let decoded = decode_to_string(source_encoding, bytes, true).unwrap();
        assert_eq!(decoded, expected);
    }

    #[cfg(windows)]
    assert_eq!(
        decode_to_string(Encoding::CodePage(936), GB_SAMPLE, true).unwrap(),
        "中文"
    );
}
368
/// Strict encoding of known strings into UTF-8, Shift-JIS and GB2312.
#[test]
fn test_encode_string() {
    const UTF8_BYTES: &[u8] = &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149];
    const SJIS_BYTES: &[u8] = &[
        130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198,
    ];
    const GB_BYTES: &[u8] = &[214, 208, 206, 196];

    let cases = [
        (Encoding::Utf8, "中文测试", UTF8_BYTES),
        (Encoding::Cp932, "きゃべつそふと", SJIS_BYTES),
        (Encoding::Gb2312, "中文", GB_BYTES),
    ];
    for (target_encoding, text, expected) in cases {
        let encoded = encode_string(target_encoding, text, true).unwrap();
        assert_eq!(encoded, expected);
    }

    #[cfg(windows)]
    assert_eq!(
        encode_string(Encoding::CodePage(936), "中文", true).unwrap(),
        GB_BYTES
    );
}
391
/// Each supported BOM is detected and stripped, and BOM-less input falls back
/// to the requested encoding.
#[test]
fn test_decode_with_bom_detect() {
    const UTF8_WITH_BOM: &[u8] = &[0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87];
    const UTF16LE_WITH_BOM: &[u8] = &[0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65];
    const UTF16BE_WITH_BOM: &[u8] = &[0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87];
    const UTF8_NO_BOM: &[u8] = &[0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87];

    let cases = [
        (UTF8_WITH_BOM, BomType::Utf8),
        (UTF16LE_WITH_BOM, BomType::Utf16LE),
        (UTF16BE_WITH_BOM, BomType::Utf16BE),
        (UTF8_NO_BOM, BomType::None),
    ];
    for (bytes, expected_bom) in cases {
        let (text, found_bom) = decode_with_bom_detect(Encoding::Auto, bytes, true).unwrap();
        assert_eq!(text, "中文");
        assert_eq!(found_bom, expected_bom);
    }

    #[cfg(feature = "kirikiri")]
    {
        // SimpleCrypt header (mode 1) followed by the scrambled payload.
        const SIMPLE_CRYPT: &[u8] = &[
            0xFE, 0xFE, 0x01, 0xFF, 0xFE, 0x11, 0x00, 0x34, 0x00, 0x36, 0x00, 0x3a, 0x00, 0x11,
            0x00, 0x0e, 0x00, 0x05, 0x00,
        ];
        let (text, found_bom) =
            decode_with_bom_detect(Encoding::Auto, SIMPLE_CRYPT, true).unwrap();
        assert_eq!(text, "\"895\"\r\n");
        assert_eq!(found_bom, BomType::Utf16LE);
    }
}
426
/// The requested BOM decides both the prefix and the output encoding;
/// `BomType::None` falls back to the given encoding.
#[test]
fn test_encode_string_with_bom() {
    const UTF8_WITH_BOM: &[u8] = &[0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87];
    const UTF16LE_WITH_BOM: &[u8] = &[0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65];
    const UTF16BE_WITH_BOM: &[u8] = &[0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87];
    const UTF8_NO_BOM: &[u8] = &[0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87];

    let cases = [
        (BomType::Utf8, UTF8_WITH_BOM),
        (BomType::Utf16LE, UTF16LE_WITH_BOM),
        (BomType::Utf16BE, UTF16BE_WITH_BOM),
        (BomType::None, UTF8_NO_BOM),
    ];
    for (bom, expected) in cases {
        let encoded = encode_string_with_bom(Encoding::Utf8, "中文", true, bom).unwrap();
        assert_eq!(encoded, expected);
    }
}
446
/// The lone bytes 0xFF/0xFE/0xFD round-trip through the private-use
/// characters U+F8F3/U+F8F2/U+F8F1 for CP932 (and code page 932 on Windows).
#[test]
fn shift_jis_pua_test() {
    // Each lone byte with the private-use character it maps to.
    let pairs = [
        (0xFF_u8, '\u{f8f3}'),
        (0xFE_u8, '\u{f8f2}'),
        (0xFD_u8, '\u{f8f1}'),
    ];

    // Decoding: lenient mode yields the private-use character, strict mode fails.
    for (byte, pua) in pairs {
        let raw = [byte, 0x01];
        let text = format!("{}\x01", pua);
        #[cfg(windows)]
        assert_eq!(
            decode_to_string(Encoding::CodePage(932), &raw, false).unwrap(),
            text
        );
        assert_eq!(
            decode_to_string(Encoding::Cp932, &raw, false).unwrap(),
            text
        );
        #[cfg(windows)]
        assert!(decode_to_string(Encoding::CodePage(932), &raw, true).is_err());
        assert!(decode_to_string(Encoding::Cp932, &raw, true).is_err());
    }

    // Encoding: the private-use character becomes the raw byte in both modes.
    for (byte, pua) in pairs {
        let text = format!("{}\x01", pua);
        #[cfg(windows)]
        assert_eq!(
            encode_string(Encoding::CodePage(932), &text, false).unwrap(),
            vec![byte, 0x01]
        );
        assert_eq!(
            encode_string(Encoding::Cp932, &text, false).unwrap(),
            vec![byte, 0x01]
        );
        #[cfg(windows)]
        assert_eq!(
            encode_string(Encoding::CodePage(932), &text, true).unwrap(),
            vec![byte, 0x01]
        );
        assert_eq!(
            encode_string(Encoding::Cp932, &text, true).unwrap(),
            vec![byte, 0x01]
        );
    }

    // A private-use character outside the mapped range is rejected in strict mode.
    let unmapped = "\u{f8f4}\x01";
    #[cfg(windows)]
    assert!(encode_string(Encoding::CodePage(932), unmapped, true).is_err());
    assert!(encode_string(Encoding::Cp932, unmapped, true).is_err());
}
549}