tendril/
stream.rs

1// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
4// option. This file may not be copied, modified, or distributed
5// except according to those terms.
6
7//! Streams of tendrils.
8
9use fmt;
10use tendril::{Atomicity, NonAtomic, Tendril};
11
12use std::borrow::Cow;
13use std::fs::File;
14use std::io;
15use std::marker::PhantomData;
16use std::path::Path;
17
18#[cfg(feature = "encoding")]
19use encoding;
20#[cfg(feature = "encoding_rs")]
21use encoding_rs::{self, DecoderResult};
22use utf8;
23
24/// Trait for types that can process a tendril.
25///
26/// This is a "push" interface, unlike the "pull" interface of
27/// `Iterator<Item=Tendril<F>>`. The push interface matches
28/// [html5ever][] and other incremental parsers with a similar
29/// architecture.
30///
31/// [html5ever]: https://github.com/servo/html5ever
32pub trait TendrilSink<F, A = NonAtomic>
33where
34    F: fmt::Format,
35    A: Atomicity,
36{
37    /// Process this tendril.
38    fn process(&mut self, t: Tendril<F, A>);
39
40    /// Indicates that an error has occurred.
41    fn error(&mut self, desc: Cow<'static, str>);
42
43    /// What the overall result of processing is.
44    type Output;
45
46    /// Indicates the end of the stream.
47    fn finish(self) -> Self::Output;
48
49    /// Process one tendril and finish.
50    fn one<T>(mut self, t: T) -> Self::Output
51    where
52        Self: Sized,
53        T: Into<Tendril<F, A>>,
54    {
55        self.process(t.into());
56        self.finish()
57    }
58
59    /// Consume an iterator of tendrils, processing each item, then finish.
60    fn from_iter<I>(mut self, i: I) -> Self::Output
61    where
62        Self: Sized,
63        I: IntoIterator,
64        I::Item: Into<Tendril<F, A>>,
65    {
66        for t in i {
67            self.process(t.into())
68        }
69        self.finish()
70    }
71
72    /// Read from the given stream of bytes until exhaustion and process incrementally,
73    /// then finish. Return `Err` at the first I/O error.
74    fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output>
75    where
76        Self: Sized,
77        R: io::Read,
78        F: fmt::SliceFormat<Slice = [u8]>,
79    {
80        const BUFFER_SIZE: u32 = 4 * 1024;
81        loop {
82            let mut tendril = Tendril::<F, A>::new();
83            // FIXME: this exposes uninitialized bytes to a generic R type
84            // this is fine for R=File which never reads these bytes,
85            // but user-defined types might.
86            // The standard library pushes zeros to `Vec<u8>` for that reason.
87            unsafe {
88                tendril.push_uninitialized(BUFFER_SIZE);
89            }
90            loop {
91                match r.read(&mut tendril) {
92                    Ok(0) => return Ok(self.finish()),
93                    Ok(n) => {
94                        tendril.pop_back(BUFFER_SIZE - n as u32);
95                        self.process(tendril);
96                        break;
97                    }
98                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
99                    Err(e) => return Err(e),
100                }
101            }
102        }
103    }
104
105    /// Read from the file at the given path and process incrementally,
106    /// then finish. Return `Err` at the first I/O error.
107    fn from_file<P>(self, path: P) -> io::Result<Self::Output>
108    where
109        Self: Sized,
110        P: AsRef<Path>,
111        F: fmt::SliceFormat<Slice = [u8]>,
112    {
113        self.read_from(&mut File::open(path)?)
114    }
115}
116
117/// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8,
118/// lossily replace ill-formed byte sequences with U+FFFD replacement characters,
119/// and emits Unicode (`StrTendril`).
120///
121/// This does not allocate memory: the output is either subtendrils on the input,
122/// on inline tendrils for a single code point.
123pub struct Utf8LossyDecoder<Sink, A = NonAtomic>
124where
125    Sink: TendrilSink<fmt::UTF8, A>,
126    A: Atomicity,
127{
128    pub inner_sink: Sink,
129    incomplete: Option<utf8::Incomplete>,
130    marker: PhantomData<A>,
131}
132
133impl<Sink, A> Utf8LossyDecoder<Sink, A>
134where
135    Sink: TendrilSink<fmt::UTF8, A>,
136    A: Atomicity,
137{
138    /// Create a new incremental UTF-8 decoder.
139    #[inline]
140    pub fn new(inner_sink: Sink) -> Self {
141        Utf8LossyDecoder {
142            inner_sink: inner_sink,
143            incomplete: None,
144            marker: PhantomData,
145        }
146    }
147}
148
149impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A>
150where
151    Sink: TendrilSink<fmt::UTF8, A>,
152    A: Atomicity,
153{
154    #[inline]
155    fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) {
156        // FIXME: remove take() and map() when non-lexical borrows are stable.
157        if let Some(mut incomplete) = self.incomplete.take() {
158            let resume_at = incomplete.try_complete(&t).map(|(result, rest)| {
159                match result {
160                    Ok(s) => self.inner_sink.process(Tendril::from_slice(s)),
161                    Err(_) => {
162                        self.inner_sink.error("invalid byte sequence".into());
163                        self.inner_sink
164                            .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
165                    }
166                }
167                t.len() - rest.len()
168            });
169            match resume_at {
170                None => {
171                    self.incomplete = Some(incomplete);
172                    return;
173                }
174                Some(resume_at) => t.pop_front(resume_at as u32),
175            }
176        }
177        while !t.is_empty() {
178            let unborrowed_result = match utf8::decode(&t) {
179                Ok(s) => {
180                    debug_assert!(s.as_ptr() == t.as_ptr());
181                    debug_assert!(s.len() == t.len());
182                    Ok(())
183                }
184                Err(utf8::DecodeError::Invalid {
185                    valid_prefix,
186                    invalid_sequence,
187                    ..
188                }) => {
189                    debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
190                    debug_assert!(valid_prefix.len() <= t.len());
191                    Err((
192                        valid_prefix.len(),
193                        Err(valid_prefix.len() + invalid_sequence.len()),
194                    ))
195                }
196                Err(utf8::DecodeError::Incomplete {
197                    valid_prefix,
198                    incomplete_suffix,
199                }) => {
200                    debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
201                    debug_assert!(valid_prefix.len() <= t.len());
202                    Err((valid_prefix.len(), Ok(incomplete_suffix)))
203                }
204            };
205            match unborrowed_result {
206                Ok(()) => {
207                    unsafe { self.inner_sink.process(t.reinterpret_without_validating()) }
208                    return;
209                }
210                Err((valid_len, and_then)) => {
211                    if valid_len > 0 {
212                        let subtendril = t.subtendril(0, valid_len as u32);
213                        unsafe {
214                            self.inner_sink
215                                .process(subtendril.reinterpret_without_validating())
216                        }
217                    }
218                    match and_then {
219                        Ok(incomplete) => {
220                            self.incomplete = Some(incomplete);
221                            return;
222                        }
223                        Err(offset) => {
224                            self.inner_sink.error("invalid byte sequence".into());
225                            self.inner_sink
226                                .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
227                            t.pop_front(offset as u32);
228                        }
229                    }
230                }
231            }
232        }
233    }
234
235    #[inline]
236    fn error(&mut self, desc: Cow<'static, str>) {
237        self.inner_sink.error(desc);
238    }
239
240    type Output = Sink::Output;
241
242    #[inline]
243    fn finish(mut self) -> Sink::Output {
244        if self.incomplete.is_some() {
245            self.inner_sink
246                .error("incomplete byte sequence at end of stream".into());
247            self.inner_sink
248                .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
249        }
250        self.inner_sink.finish()
251    }
252}
253
/// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding,
/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters,
/// and emits Unicode (`StrTendril`).
///
/// This allocates new tendrils for encodings other than UTF-8.
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
pub struct LossyDecoder<Sink: TendrilSink<fmt::UTF8, A>, A: Atomicity = NonAtomic> {
    /// The decoding backend chosen at construction time, together with the sink.
    inner: LossyDecoderInner<Sink, A>,
}
267
/// The decoding backend behind a `LossyDecoder`.
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
enum LossyDecoderInner<Sink: TendrilSink<fmt::UTF8, A>, A: Atomicity> {
    /// UTF-8 input, handled by `Utf8LossyDecoder` (which owns the sink).
    Utf8(Utf8LossyDecoder<Sink, A>),
    /// A raw decoder from the `encoding` crate, paired with the sink it feeds.
    #[cfg(feature = "encoding")]
    Encoding(Box<encoding::RawDecoder>, Sink),
    /// A decoder from the `encoding_rs` crate, paired with the sink it feeds.
    #[cfg(feature = "encoding_rs")]
    EncodingRs(encoding_rs::Decoder, Sink),
}
280
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Create a new incremental decoder using the encoding crate.
    ///
    /// An encoding whose name is `"utf-8"` is routed to the UTF-8 decoder
    /// built by `LossyDecoder::utf8`.
    #[cfg(feature = "encoding")]
    #[inline]
    pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self {
        if encoding.name() != "utf-8" {
            return LossyDecoder {
                inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink),
            };
        }
        LossyDecoder::utf8(sink)
    }

    /// Create a new incremental decoder using the encoding_rs crate.
    ///
    /// `encoding_rs::UTF_8` is routed to the UTF-8 decoder built by
    /// `LossyDecoder::utf8`.
    #[cfg(feature = "encoding_rs")]
    #[inline]
    pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self {
        if encoding == encoding_rs::UTF_8 {
            Self::utf8(sink)
        } else {
            Self {
                inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink),
            }
        }
    }

    /// Create a new incremental decoder for the UTF-8 encoding.
    ///
    /// This is useful for content that is known at run-time to be UTF-8
    /// (whereas `Utf8LossyDecoder` requires knowing at compile-time.)
    #[inline]
    pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> {
        let utf8_decoder = Utf8LossyDecoder::new(sink);
        LossyDecoder {
            inner: LossyDecoderInner::Utf8(utf8_decoder),
        }
    }

    /// Give a reference to the inner sink.
    pub fn inner_sink(&self) -> &Sink {
        match &self.inner {
            &LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink,
            #[cfg(feature = "encoding")]
            &LossyDecoderInner::Encoding(_, ref sink) => sink,
            #[cfg(feature = "encoding_rs")]
            &LossyDecoderInner::EncodingRs(_, ref sink) => sink,
        }
    }

    /// Give a mutable reference to the inner sink.
    pub fn inner_sink_mut(&mut self) -> &mut Sink {
        match &mut self.inner {
            &mut LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink,
            #[cfg(feature = "encoding")]
            &mut LossyDecoderInner::Encoding(_, ref mut sink) => sink,
            #[cfg(feature = "encoding_rs")]
            &mut LossyDecoderInner::EncodingRs(_, ref mut sink) => sink,
        }
    }
}
345
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    type Output = Sink::Output;

    /// Decode one chunk of bytes with the selected backend and forward the
    /// resulting text to the sink.
    #[inline]
    fn process(&mut self, t: Tendril<fmt::Bytes, A>) {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut utf8) => utf8.process(t),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => {
                let mut remaining = t;
                let mut decoded: Tendril<fmt::UTF8, A> = Tendril::new();
                // Feed until the decoder stops reporting errors. After each
                // error, emit U+FFFD and resume after the offending bytes.
                while let (_, Some(err)) = decoder.raw_feed(&*remaining, &mut decoded) {
                    decoded.push_char('\u{fffd}');
                    sink.error(err.cause);
                    debug_assert!(err.upto >= 0);
                    remaining.pop_front(err.upto as u32);
                }
                // Do not hand empty tendrils to the sink.
                if !decoded.is_empty() {
                    sink.process(decoded);
                }
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => {
                if !t.is_empty() {
                    decode_to_sink(t, decoder, sink, false);
                }
            }
        }
    }

    /// Forward an error description to the sink unchanged.
    #[inline]
    fn error(&mut self, desc: Cow<'static, str>) {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc),
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc),
        }
    }

    /// Flush whatever the backend still holds, then finish the sink and
    /// return its output.
    #[inline]
    fn finish(self) -> Sink::Output {
        match self.inner {
            LossyDecoderInner::Utf8(utf8) => utf8.finish(),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(mut decoder, mut sink) => {
                let mut trailing: Tendril<fmt::UTF8, A> = Tendril::new();
                // An error at this point is reported like any other: U+FFFD
                // plus a call to `error`.
                if let Some(err) = decoder.raw_finish(&mut trailing) {
                    trailing.push_char('\u{fffd}');
                    sink.error(err.cause);
                }
                if !trailing.is_empty() {
                    sink.process(trailing);
                }
                sink.finish()
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(mut decoder, mut sink) => {
                // An empty input with `last = true` flushes the decoder.
                decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true);
                sink.finish()
            }
        }
    }
}
423
/// Decode `t` with an `encoding_rs` decoder and push the result into `sink`.
///
/// The input is decoded in rounds, each writing into a fresh output tendril of
/// at most 8192 bytes. Every malformed sequence is reported through
/// `sink.error` and replaced by U+FFFD. `last` is passed on to the decoder to
/// tell it whether this is the final piece of input.
#[cfg(feature = "encoding_rs")]
fn decode_to_sink<Sink, A>(
    mut t: Tendril<fmt::Bytes, A>,
    decoder: &mut encoding_rs::Decoder,
    sink: &mut Sink,
    last: bool,
) where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Upper bound on the size of one output buffer, in bytes.
    const MAX_OUTPUT_CHUNK: usize = 8192;

    loop {
        let mut out = <Tendril<fmt::Bytes, A>>::new();
        // `None` means the computation overflowed; fall back to the cap.
        let max_len = decoder
            .max_utf8_buffer_length_without_replacement(t.len())
            .unwrap_or(MAX_OUTPUT_CHUNK);
        // Clamp while still in `usize`, then narrow. Narrowing first would
        // truncate a length above `u32::MAX` and could yield a buffer too
        // small for the decoder to make any progress.
        let capacity = std::cmp::min(max_len, MAX_OUTPUT_CHUNK) as u32;
        // NOTE(review): the decoder is handed a buffer it has not written yet;
        // this presumes it only writes to the output and never reads it.
        unsafe {
            out.push_uninitialized(capacity);
        }
        let (result, bytes_read, bytes_written) =
            decoder.decode_to_utf8_without_replacement(&t, &mut out, last);
        if bytes_written > 0 {
            // SAFETY: the first `bytes_written` bytes were produced by
            // `decode_to_utf8_without_replacement`, whose output is UTF-8.
            sink.process(unsafe {
                out.subtendril(0, bytes_written as u32)
                    .reinterpret_without_validating()
            });
        }
        match result {
            DecoderResult::InputEmpty => return,
            // More input remains; go around again with a fresh buffer.
            DecoderResult::OutputFull => {}
            DecoderResult::Malformed(_, _) => {
                sink.error(Cow::Borrowed("invalid sequence"));
                sink.process("\u{FFFD}".into());
            }
        }
        // Skip what the decoder consumed, including any malformed bytes.
        t.pop_front(bytes_read as u32);
        if t.is_empty() {
            return;
        }
    }
}
464
#[cfg(test)]
mod test {
    use super::{TendrilSink, Utf8LossyDecoder};
    use fmt;
    use std::borrow::Cow;
    use tendril::{Atomicity, NonAtomic, Tendril};

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use super::LossyDecoder;
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use tendril::SliceExt;

    #[cfg(feature = "encoding")]
    use encoding::all as enc;
    #[cfg(feature = "encoding_rs")]
    use encoding_rs as enc_rs;

    /// Test sink that records every tendril and every error it is given.
    struct Accumulate<A>
    where
        A: Atomicity,
    {
        tendrils: Vec<Tendril<fmt::UTF8, A>>,
        errors: Vec<String>,
    }

    impl<A> Accumulate<A>
    where
        A: Atomicity,
    {
        /// Create a sink with nothing recorded yet.
        fn new() -> Accumulate<A> {
            Accumulate {
                tendrils: Vec::new(),
                errors: Vec::new(),
            }
        }
    }

    impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A>
    where
        A: Atomicity,
    {
        type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>);

        fn process(&mut self, t: Tendril<fmt::UTF8, A>) {
            self.tendrils.push(t);
        }

        fn error(&mut self, desc: Cow<'static, str>) {
            self.errors.push(desc.into_owned());
        }

        /// Hand back everything recorded, tendrils first.
        fn finish(self) -> Self::Output {
            (self.tendrils, self.errors)
        }
    }

    /// Feed `input` chunk by chunk through `Utf8LossyDecoder` and compare the
    /// emitted tendrils with `expected` and the error count with `errs`.
    fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) {
        let sink = Accumulate::<NonAtomic>::new();
        let (tendrils, errors) = Utf8LossyDecoder::new(sink).from_iter(input.iter().cloned());
        let decoded: Vec<&str> = tendrils.iter().map(|t| &**t).collect();
        assert_eq!(expected, &*decoded);
        assert_eq!(errs, errors.len());
    }

    #[test]
    fn utf8() {
        // Empty and plain ASCII input.
        check_utf8(&[], &[], 0);
        check_utf8(&[b""], &[], 0);
        check_utf8(&[b"xyz"], &["xyz"], 0);
        check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0);

        // One three-byte code point, split across chunks in various places.
        check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0);
        check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(
            &[b"xy\xEA", b"\x99", b"\xAEzw"],
            &["xy", "\u{a66e}z", "w"],
            0,
        );
        check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0);
        check_utf8(
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            &["\u{a66e}"],
            0,
        );

        // An invalid byte in the middle of a split code point.
        check_utf8(
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"],
            4,
        );
        check_utf8(
            &[b"xy\xEA\x99", b"\xFFz"],
            &["xy", "\u{fffd}", "\u{fffd}", "z"],
            2,
        );

        // Runs of two-byte code points.
        check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0);
        check_utf8(
            &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"],
            &["ő", "\u{fffd}", "\u{fffd}", "ő"],
            2,
        );

        // incomplete char at end of input
        check_utf8(&[b"\xC0"], &["\u{fffd}"], 1);
        check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1);
    }

    /// Feed `input` chunk by chunk through `decoder`, concatenate everything
    /// the sink received, and compare it with `expected`; `errs` is the
    /// expected number of reported errors.
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    fn check_decode(
        mut decoder: LossyDecoder<Accumulate<NonAtomic>>,
        input: &[&[u8]],
        expected: &str,
        errs: usize,
    ) {
        for chunk in input.iter() {
            decoder.process(chunk.to_tendril());
        }
        let (tendrils, errors) = decoder.finish();
        let mut joined: Tendril<fmt::UTF8> = Tendril::new();
        for piece in &tendrils {
            joined.push_tendril(piece);
        }
        assert_eq!(expected, &*joined);
        assert_eq!(errs, errors.len());
    }

    /// A table of cases: (input chunks, expected decoded text, expected error count).
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)];

    #[cfg(feature = "encoding")]
    const ASCII: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"xy", b"", b"", b"z"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xFF"], "\u{fffd}", 1),
        (&[b"x\xC0yz"], "x\u{fffd}yz", 1),
        (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1),
        (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_ascii() {
        for case in ASCII.iter() {
            let (input, expected, errs) = *case;
            check_decode(
                LossyDecoder::new(enc::ASCII, Accumulate::new()),
                input,
                expected,
                errs,
            );
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const UTF_8: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xEA\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0),
        (
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            "\u{a66e}",
            0,
        ),
        (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0),
        (
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z",
            4,
        ),
        (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2),
        // incomplete char at end of input
        (&[b"\xC0"], "\u{fffd}", 1),
        (&[b"\xEA\x99"], "\u{fffd}", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_utf8() {
        for case in UTF_8.iter() {
            let (input, expected, errs) = *case;
            check_decode(
                LossyDecoder::new(enc::UTF_8, Accumulate::new()),
                input,
                expected,
                errs,
            );
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_utf8_encoding_rs() {
        for case in UTF_8.iter() {
            let (input, expected, errs) = *case;
            check_decode(
                LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new()),
                input,
                expected,
                errs,
            );
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const KOI8_U: Tests = &[
        (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0),
        (
            &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""],
            "Энергия",
            0,
        ),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_koi8_u() {
        for case in KOI8_U.iter() {
            let (input, expected, errs) = *case;
            check_decode(
                LossyDecoder::new(enc::KOI8_U, Accumulate::new()),
                input,
                expected,
                errs,
            );
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_koi8_u_encoding_rs() {
        for case in KOI8_U.iter() {
            let (input, expected, errs) = *case;
            check_decode(
                LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new()),
                input,
                expected,
                errs,
            );
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const WINDOWS_949: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0),
        (
            &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"],
            "안녕하세요",
            0,
        ),
        (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1),
        (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1),
        (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_windows_949() {
        for case in WINDOWS_949.iter() {
            let (input, expected, errs) = *case;
            check_decode(
                LossyDecoder::new(enc::WINDOWS_949, Accumulate::new()),
                input,
                expected,
                errs,
            );
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_windows_949_encoding_rs() {
        for case in WINDOWS_949.iter() {
            let (input, expected, errs) = *case;
            check_decode(
                LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new()),
                input,
                expected,
                errs,
            );
        }
    }

    /// `read_from` on an in-memory reader: one invalid byte between two valid runs.
    #[test]
    fn read_from() {
        let mut bytes: &[u8] = b"foo\xffbar";
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap();
        let decoded: Vec<&str> = tendrils.iter().map(|t| &**t).collect();
        assert_eq!(&*decoded, &["foo", "\u{FFFD}", "bar"]);
        assert_eq!(errors, &["invalid byte sequence"]);
    }
}