tendril/
fmt.rs

1// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
4// option. This file may not be copied, modified, or distributed
5// except according to those terms.
6
7//! Marker types for formats.
8//!
9//! This module defines the types and traits used to mark a `Tendril`
10//! with the format of data it contains. It includes those formats
11//! for which `Tendril` supports at least some operations without
12//! conversion.
13//!
14//! To convert a string tendril to/from a byte tendril in an arbitrary
15//! character encoding, see the `encode` and `decode` methods on
16//! `Tendril`.
17//!
18//! `Tendril` operations may become memory-unsafe if data invalid for
19//! the format sneaks in. For that reason, these traits require
20//! `unsafe impl`.
21
22use std::default::Default;
23use std::{char, mem, str};
24
25use futf::{self, Codepoint, Meaning};
26
/// Implementation details.
///
/// You don't need these unless you are implementing
/// a new format.
pub mod imp {
    use std::{iter, slice};

    /// Describes how to fix up encodings when concatenating.
    ///
    /// Data can be dropped on either side of the splice, and up to
    /// 4 bytes can be inserted in the middle. The default value drops
    /// nothing and inserts nothing.
    #[derive(Debug, Clone, Default)]
    pub struct Fixup {
        /// Amount to drop from the left-hand side of the splice
        /// (presumably a byte count; the consumer is not in this file).
        pub drop_left: u32,
        /// Amount to drop from the right-hand side of the splice
        /// (presumably a byte count; the consumer is not in this file).
        pub drop_right: u32,
        /// Number of leading bytes of `insert_bytes` that are meaningful.
        pub insert_len: u32,
        /// Storage for the bytes to insert in the middle of the splice.
        pub insert_bytes: [u8; 4],
    }

    /// Iterator over a byte buffer that yields `(index, char)` pairs,
    /// mapping every byte to the `char` with the same numeric value.
    #[derive(Debug, Clone)]
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // Every `u8` value is a valid `char`, so the safe conversion
            // is sufficient; no `unsafe` or transmute is needed.
            self.inner.next().map(|(i, &b)| (i, char::from(b)))
        }

        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            // One item per byte, so the inner iterator's hint is exact.
            self.inner.size_hint()
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Creates an iterator over the bytes of `buf`.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}
87
88/// Trait for format marker types.
89///
90/// The type implementing this trait is usually not instantiated.
91/// It's used with a phantom type parameter of `Tendril`.
92pub unsafe trait Format {
93    /// Check whether the buffer is valid for this format.
94    fn validate(buf: &[u8]) -> bool;
95
96    /// Check whether the buffer is valid for this format.
97    ///
98    /// You may assume the buffer is a prefix of a valid buffer.
99    #[inline]
100    fn validate_prefix(buf: &[u8]) -> bool {
101        <Self as Format>::validate(buf)
102    }
103
104    /// Check whether the buffer is valid for this format.
105    ///
106    /// You may assume the buffer is a suffix of a valid buffer.
107    #[inline]
108    fn validate_suffix(buf: &[u8]) -> bool {
109        <Self as Format>::validate(buf)
110    }
111
112    /// Check whether the buffer is valid for this format.
113    ///
114    /// You may assume the buffer is a contiguous subsequence
115    /// of a valid buffer, but not necessarily a prefix or
116    /// a suffix.
117    #[inline]
118    fn validate_subseq(buf: &[u8]) -> bool {
119        <Self as Format>::validate(buf)
120    }
121
122    /// Compute any fixup needed when concatenating buffers.
123    ///
124    /// The default is to do nothing.
125    ///
126    /// The function is `unsafe` because it may assume the input
127    /// buffers are already valid for the format. Also, no
128    /// bounds-checking is performed on the return value!
129    #[inline(always)]
130    unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
131        Default::default()
132    }
133}
134
135/// Indicates that one format is a subset of another.
136///
137/// The subset format can be converted to the superset format
138/// for free.
139pub unsafe trait SubsetOf<Super>: Format
140where
141    Super: Format,
142{
143    /// Validate the *other* direction of conversion; check if
144    /// this buffer from the superset format conforms to the
145    /// subset format.
146    ///
147    /// The default calls `Self::validate`, but some conversions
148    /// may implement a check which is cheaper than validating
149    /// from scratch.
150    fn revalidate_subset(x: &[u8]) -> bool {
151        Self::validate(x)
152    }
153}
154
155/// Indicates a format which corresponds to a Rust slice type,
156/// representing exactly the same invariants.
157pub unsafe trait SliceFormat: Format + Sized {
158    type Slice: ?Sized + Slice;
159}
160
161/// Indicates a format which contains characters from Unicode
162/// (all of it, or some proper subset).
163pub unsafe trait CharFormat<'a>: Format {
164    /// Iterator for characters and their byte indices.
165    type Iter: Iterator<Item = (usize, char)>;
166
167    /// Iterate over the characters of the string and their byte
168    /// indices.
169    ///
170    /// You may assume the buffer is *already validated* for `Format`.
171    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;
172
173    /// Encode the character as bytes and pass them to a continuation.
174    ///
175    /// Returns `Err(())` iff the character cannot be represented.
176    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
177    where
178        F: FnOnce(&[u8]);
179}
180
/// Marks a Rust slice type whose in-memory representation is bytes.
pub unsafe trait Slice {
    /// Returns the raw bytes backing this slice.
    fn as_bytes(&self) -> &[u8];

    /// Reinterprets a shared byte slice as this kind of slice.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Reinterprets a mutable byte slice as this kind of slice.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
198
199/// Marker type for uninterpreted bytes.
200///
201/// Validation will never fail for this format.
202#[derive(Copy, Clone, Default, Debug)]
203pub struct Bytes;
204
205unsafe impl Format for Bytes {
206    #[inline(always)]
207    fn validate(_: &[u8]) -> bool {
208        true
209    }
210}
211
212unsafe impl SliceFormat for Bytes {
213    type Slice = [u8];
214}
215
216unsafe impl Slice for [u8] {
217    #[inline(always)]
218    fn as_bytes(&self) -> &[u8] {
219        self
220    }
221
222    #[inline(always)]
223    unsafe fn from_bytes(x: &[u8]) -> &[u8] {
224        x
225    }
226
227    #[inline(always)]
228    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] {
229        x
230    }
231}
232
233/// Marker type for ASCII text.
234#[derive(Copy, Clone, Default, Debug)]
235pub struct ASCII;
236
237unsafe impl Format for ASCII {
238    #[inline]
239    fn validate(buf: &[u8]) -> bool {
240        buf.iter().all(|&n| n <= 127)
241    }
242
243    #[inline(always)]
244    fn validate_prefix(_: &[u8]) -> bool {
245        true
246    }
247
248    #[inline(always)]
249    fn validate_suffix(_: &[u8]) -> bool {
250        true
251    }
252
253    #[inline(always)]
254    fn validate_subseq(_: &[u8]) -> bool {
255        true
256    }
257}
258
259unsafe impl SubsetOf<UTF8> for ASCII {}
260unsafe impl SubsetOf<Latin1> for ASCII {}
261
262unsafe impl<'a> CharFormat<'a> for ASCII {
263    type Iter = imp::SingleByteCharIndices<'a>;
264
265    #[inline]
266    unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
267        imp::SingleByteCharIndices::new(buf)
268    }
269
270    #[inline]
271    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
272    where
273        F: FnOnce(&[u8]),
274    {
275        let n = ch as u32;
276        if n > 0x7F {
277            return Err(());
278        }
279        cont(&[n as u8]);
280        Ok(())
281    }
282}
283
284/// Marker type for UTF-8 text.
285#[derive(Copy, Clone, Default, Debug)]
286pub struct UTF8;
287
288unsafe impl Format for UTF8 {
289    #[inline]
290    fn validate(buf: &[u8]) -> bool {
291        str::from_utf8(buf).is_ok()
292    }
293
294    #[inline]
295    fn validate_prefix(buf: &[u8]) -> bool {
296        if buf.len() == 0 {
297            return true;
298        }
299        match futf::classify(buf, buf.len() - 1) {
300            Some(Codepoint {
301                meaning: Meaning::Whole(_),
302                ..
303            }) => true,
304            _ => false,
305        }
306    }
307
308    #[inline]
309    fn validate_suffix(buf: &[u8]) -> bool {
310        if buf.len() == 0 {
311            return true;
312        }
313        match futf::classify(buf, 0) {
314            Some(Codepoint {
315                meaning: Meaning::Whole(_),
316                ..
317            }) => true,
318            _ => false,
319        }
320    }
321
322    #[inline]
323    fn validate_subseq(buf: &[u8]) -> bool {
324        <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
325    }
326}
327
328unsafe impl SubsetOf<WTF8> for UTF8 {}
329
330unsafe impl SliceFormat for UTF8 {
331    type Slice = str;
332}
333
334unsafe impl Slice for str {
335    #[inline(always)]
336    fn as_bytes(&self) -> &[u8] {
337        str::as_bytes(self)
338    }
339
340    #[inline(always)]
341    unsafe fn from_bytes(x: &[u8]) -> &str {
342        str::from_utf8_unchecked(x)
343    }
344
345    #[inline(always)]
346    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
347        mem::transmute(x)
348    }
349}
350
351unsafe impl<'a> CharFormat<'a> for UTF8 {
352    type Iter = str::CharIndices<'a>;
353
354    #[inline]
355    unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
356        str::from_utf8_unchecked(buf).char_indices()
357    }
358
359    #[inline]
360    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
361    where
362        F: FnOnce(&[u8]),
363    {
364        cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes());
365        Ok(())
366    }
367}
368
369/// Marker type for WTF-8 text.
370///
371/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
372#[derive(Copy, Clone, Default, Debug)]
373pub struct WTF8;
374
375#[inline]
376fn wtf8_meaningful(m: Meaning) -> bool {
377    match m {
378        Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true,
379        _ => false,
380    }
381}
382
383unsafe impl Format for WTF8 {
384    #[inline]
385    fn validate(buf: &[u8]) -> bool {
386        let mut i = 0;
387        let mut prev_lead = false;
388        while i < buf.len() {
389            let codept = unwrap_or_return!(futf::classify(buf, i), false);
390            if !wtf8_meaningful(codept.meaning) {
391                return false;
392            }
393            i += codept.bytes.len();
394            prev_lead = match codept.meaning {
395                Meaning::TrailSurrogate(_) if prev_lead => return false,
396                Meaning::LeadSurrogate(_) => true,
397                _ => false,
398            };
399        }
400
401        true
402    }
403
404    #[inline]
405    fn validate_prefix(buf: &[u8]) -> bool {
406        if buf.len() == 0 {
407            return true;
408        }
409        match futf::classify(buf, buf.len() - 1) {
410            Some(c) => wtf8_meaningful(c.meaning),
411            _ => false,
412        }
413    }
414
415    #[inline]
416    fn validate_suffix(buf: &[u8]) -> bool {
417        if buf.len() == 0 {
418            return true;
419        }
420        match futf::classify(buf, 0) {
421            Some(c) => wtf8_meaningful(c.meaning),
422            _ => false,
423        }
424    }
425
426    #[inline]
427    fn validate_subseq(buf: &[u8]) -> bool {
428        <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
429    }
430
431    #[inline]
432    unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
433        const ERR: &'static str = "WTF8: internal error";
434
435        if lhs.len() >= 3 && rhs.len() >= 3 {
436            if let (
437                Some(Codepoint {
438                    meaning: Meaning::LeadSurrogate(hi),
439                    ..
440                }),
441                Some(Codepoint {
442                    meaning: Meaning::TrailSurrogate(lo),
443                    ..
444                }),
445            ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0))
446            {
447                let mut fixup = imp::Fixup {
448                    drop_left: 3,
449                    drop_right: 3,
450                    insert_len: 0,
451                    insert_bytes: [0_u8; 4],
452                };
453
454                let n = 0x10000 + ((hi as u32) << 10) + (lo as u32);
455
456                let ch = char::from_u32(n).expect(ERR);
457                fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32;
458
459                return fixup;
460            }
461        }
462
463        Default::default()
464    }
465}
466
467/// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
468///
469/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the
470/// C0 and C1 control characters from ECMA-48 / ISO 6429.
471///
472/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the
473/// many other aliases), which actually stand for Windows-1252.
474#[derive(Copy, Clone, Default, Debug)]
475pub struct Latin1;
476
477unsafe impl Format for Latin1 {
478    #[inline(always)]
479    fn validate(_: &[u8]) -> bool {
480        true
481    }
482
483    #[inline(always)]
484    fn validate_prefix(_: &[u8]) -> bool {
485        true
486    }
487
488    #[inline(always)]
489    fn validate_suffix(_: &[u8]) -> bool {
490        true
491    }
492
493    #[inline(always)]
494    fn validate_subseq(_: &[u8]) -> bool {
495        true
496    }
497}
498
499unsafe impl<'a> CharFormat<'a> for Latin1 {
500    type Iter = imp::SingleByteCharIndices<'a>;
501
502    #[inline]
503    unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
504        imp::SingleByteCharIndices::new(buf)
505    }
506
507    #[inline]
508    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
509    where
510        F: FnOnce(&[u8]),
511    {
512        let n = ch as u32;
513        if n > 0xFF {
514            return Err(());
515        }
516        cont(&[n as u8]);
517        Ok(())
518    }
519}