1use std::default::Default;
23use std::{char, mem, str};
24
25use futf::{self, Codepoint, Meaning};
26
/// Implementation details shared by the format marker types.
pub mod imp {
    use std::{iter, slice};

    /// Adjustment produced by a format's `fixup` function.
    ///
    /// NOTE(review): the field meanings below are inferred from the field names and from
    /// the one non-default `fixup` in this file (WTF-8 surrogate merging) — confirm against
    /// the code that consumes a `Fixup`.
    #[derive(Default)]
    pub struct Fixup {
        /// Presumably the number of bytes to drop from the end of the left-hand buffer.
        pub drop_left: u32,
        /// Presumably the number of bytes to drop from the start of the right-hand buffer.
        pub drop_right: u32,
        /// Number of leading bytes of `insert_bytes` that are meaningful.
        pub insert_len: u32,
        /// Storage for the bytes to insert; only the first `insert_len` bytes are used.
        pub insert_bytes: [u8; 4],
    }

    /// Iterator over a byte slice yielding `(index, char)` pairs, where each byte is mapped
    /// to the `char` with the same numeric value (U+0000..=U+00FF).
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // `u8 as char` is a safe, lossless cast, so no `unsafe` conversion is required.
            self.inner.next().map(|(i, &b)| (i, b as char))
        }

        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            // One item per remaining byte; forwarding lets `collect()` preallocate.
            self.inner.size_hint()
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Creates an iterator over the bytes of `buf`, starting at index 0.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}
87
88pub unsafe trait Format {
93 fn validate(buf: &[u8]) -> bool;
95
96 #[inline]
100 fn validate_prefix(buf: &[u8]) -> bool {
101 <Self as Format>::validate(buf)
102 }
103
104 #[inline]
108 fn validate_suffix(buf: &[u8]) -> bool {
109 <Self as Format>::validate(buf)
110 }
111
112 #[inline]
118 fn validate_subseq(buf: &[u8]) -> bool {
119 <Self as Format>::validate(buf)
120 }
121
122 #[inline(always)]
130 unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
131 Default::default()
132 }
133}
134
135pub unsafe trait SubsetOf<Super>: Format
140where
141 Super: Format,
142{
143 fn revalidate_subset(x: &[u8]) -> bool {
151 Self::validate(x)
152 }
153}
154
155pub unsafe trait SliceFormat: Format + Sized {
158 type Slice: ?Sized + Slice;
159}
160
161pub unsafe trait CharFormat<'a>: Format {
164 type Iter: Iterator<Item = (usize, char)>;
166
167 unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;
172
173 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
177 where
178 F: FnOnce(&[u8]);
179}
180
/// A slice type that can be viewed as, and reinterpreted from, raw bytes.
///
/// # Safety
///
/// NOTE(review): the obligations on implementors are not visible in the code — confirm
/// against the crate documentation.
pub unsafe trait Slice {
    /// Borrows the underlying bytes of this slice.
    fn as_bytes(&self) -> &[u8];

    /// Reinterprets `x` as a shared reference to this slice type.
    ///
    /// # Safety
    ///
    /// NOTE(review): presumably `x` must hold data that is valid for `Self` — confirm.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Reinterprets `x` as a mutable reference to this slice type.
    ///
    /// # Safety
    ///
    /// NOTE(review): presumably `x` must hold data that is valid for `Self` — confirm.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
198
199#[derive(Copy, Clone, Default, Debug)]
203pub struct Bytes;
204
205unsafe impl Format for Bytes {
206 #[inline(always)]
207 fn validate(_: &[u8]) -> bool {
208 true
209 }
210}
211
212unsafe impl SliceFormat for Bytes {
213 type Slice = [u8];
214}
215
216unsafe impl Slice for [u8] {
217 #[inline(always)]
218 fn as_bytes(&self) -> &[u8] {
219 self
220 }
221
222 #[inline(always)]
223 unsafe fn from_bytes(x: &[u8]) -> &[u8] {
224 x
225 }
226
227 #[inline(always)]
228 unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] {
229 x
230 }
231}
232
233#[derive(Copy, Clone, Default, Debug)]
235pub struct ASCII;
236
237unsafe impl Format for ASCII {
238 #[inline]
239 fn validate(buf: &[u8]) -> bool {
240 buf.iter().all(|&n| n <= 127)
241 }
242
243 #[inline(always)]
244 fn validate_prefix(_: &[u8]) -> bool {
245 true
246 }
247
248 #[inline(always)]
249 fn validate_suffix(_: &[u8]) -> bool {
250 true
251 }
252
253 #[inline(always)]
254 fn validate_subseq(_: &[u8]) -> bool {
255 true
256 }
257}
258
259unsafe impl SubsetOf<UTF8> for ASCII {}
260unsafe impl SubsetOf<Latin1> for ASCII {}
261
262unsafe impl<'a> CharFormat<'a> for ASCII {
263 type Iter = imp::SingleByteCharIndices<'a>;
264
265 #[inline]
266 unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
267 imp::SingleByteCharIndices::new(buf)
268 }
269
270 #[inline]
271 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
272 where
273 F: FnOnce(&[u8]),
274 {
275 let n = ch as u32;
276 if n > 0x7F {
277 return Err(());
278 }
279 cont(&[n as u8]);
280 Ok(())
281 }
282}
283
284#[derive(Copy, Clone, Default, Debug)]
286pub struct UTF8;
287
288unsafe impl Format for UTF8 {
289 #[inline]
290 fn validate(buf: &[u8]) -> bool {
291 str::from_utf8(buf).is_ok()
292 }
293
294 #[inline]
295 fn validate_prefix(buf: &[u8]) -> bool {
296 if buf.len() == 0 {
297 return true;
298 }
299 match futf::classify(buf, buf.len() - 1) {
300 Some(Codepoint {
301 meaning: Meaning::Whole(_),
302 ..
303 }) => true,
304 _ => false,
305 }
306 }
307
308 #[inline]
309 fn validate_suffix(buf: &[u8]) -> bool {
310 if buf.len() == 0 {
311 return true;
312 }
313 match futf::classify(buf, 0) {
314 Some(Codepoint {
315 meaning: Meaning::Whole(_),
316 ..
317 }) => true,
318 _ => false,
319 }
320 }
321
322 #[inline]
323 fn validate_subseq(buf: &[u8]) -> bool {
324 <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
325 }
326}
327
328unsafe impl SubsetOf<WTF8> for UTF8 {}
329
330unsafe impl SliceFormat for UTF8 {
331 type Slice = str;
332}
333
334unsafe impl Slice for str {
335 #[inline(always)]
336 fn as_bytes(&self) -> &[u8] {
337 str::as_bytes(self)
338 }
339
340 #[inline(always)]
341 unsafe fn from_bytes(x: &[u8]) -> &str {
342 str::from_utf8_unchecked(x)
343 }
344
345 #[inline(always)]
346 unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
347 mem::transmute(x)
348 }
349}
350
351unsafe impl<'a> CharFormat<'a> for UTF8 {
352 type Iter = str::CharIndices<'a>;
353
354 #[inline]
355 unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
356 str::from_utf8_unchecked(buf).char_indices()
357 }
358
359 #[inline]
360 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
361 where
362 F: FnOnce(&[u8]),
363 {
364 cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes());
365 Ok(())
366 }
367}
368
369#[derive(Copy, Clone, Default, Debug)]
373pub struct WTF8;
374
375#[inline]
376fn wtf8_meaningful(m: Meaning) -> bool {
377 match m {
378 Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true,
379 _ => false,
380 }
381}
382
383unsafe impl Format for WTF8 {
384 #[inline]
385 fn validate(buf: &[u8]) -> bool {
386 let mut i = 0;
387 let mut prev_lead = false;
388 while i < buf.len() {
389 let codept = unwrap_or_return!(futf::classify(buf, i), false);
390 if !wtf8_meaningful(codept.meaning) {
391 return false;
392 }
393 i += codept.bytes.len();
394 prev_lead = match codept.meaning {
395 Meaning::TrailSurrogate(_) if prev_lead => return false,
396 Meaning::LeadSurrogate(_) => true,
397 _ => false,
398 };
399 }
400
401 true
402 }
403
404 #[inline]
405 fn validate_prefix(buf: &[u8]) -> bool {
406 if buf.len() == 0 {
407 return true;
408 }
409 match futf::classify(buf, buf.len() - 1) {
410 Some(c) => wtf8_meaningful(c.meaning),
411 _ => false,
412 }
413 }
414
415 #[inline]
416 fn validate_suffix(buf: &[u8]) -> bool {
417 if buf.len() == 0 {
418 return true;
419 }
420 match futf::classify(buf, 0) {
421 Some(c) => wtf8_meaningful(c.meaning),
422 _ => false,
423 }
424 }
425
426 #[inline]
427 fn validate_subseq(buf: &[u8]) -> bool {
428 <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
429 }
430
431 #[inline]
432 unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
433 const ERR: &'static str = "WTF8: internal error";
434
435 if lhs.len() >= 3 && rhs.len() >= 3 {
436 if let (
437 Some(Codepoint {
438 meaning: Meaning::LeadSurrogate(hi),
439 ..
440 }),
441 Some(Codepoint {
442 meaning: Meaning::TrailSurrogate(lo),
443 ..
444 }),
445 ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0))
446 {
447 let mut fixup = imp::Fixup {
448 drop_left: 3,
449 drop_right: 3,
450 insert_len: 0,
451 insert_bytes: [0_u8; 4],
452 };
453
454 let n = 0x10000 + ((hi as u32) << 10) + (lo as u32);
455
456 let ch = char::from_u32(n).expect(ERR);
457 fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32;
458
459 return fixup;
460 }
461 }
462
463 Default::default()
464 }
465}
466
467#[derive(Copy, Clone, Default, Debug)]
475pub struct Latin1;
476
477unsafe impl Format for Latin1 {
478 #[inline(always)]
479 fn validate(_: &[u8]) -> bool {
480 true
481 }
482
483 #[inline(always)]
484 fn validate_prefix(_: &[u8]) -> bool {
485 true
486 }
487
488 #[inline(always)]
489 fn validate_suffix(_: &[u8]) -> bool {
490 true
491 }
492
493 #[inline(always)]
494 fn validate_subseq(_: &[u8]) -> bool {
495 true
496 }
497}
498
499unsafe impl<'a> CharFormat<'a> for Latin1 {
500 type Iter = imp::SingleByteCharIndices<'a>;
501
502 #[inline]
503 unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
504 imp::SingleByteCharIndices::new(buf)
505 }
506
507 #[inline]
508 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
509 where
510 F: FnOnce(&[u8]),
511 {
512 let n = ch as u32;
513 if n > 0xFF {
514 return Err(());
515 }
516 cont(&[n as u8]);
517 Ok(())
518 }
519}