html5ever\tokenizer/
mod.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The HTML5 tokenizer.
11
12pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use markup5ever::{ns, small_char_set, TokenizerResult};
28use std::borrow::Cow::{self, Borrowed};
29use std::cell::{Cell, RefCell, RefMut};
30use std::collections::BTreeMap;
31use std::mem;
32
33pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
34use crate::macros::{time, unwrap_or_return};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
/// Outcome of one step of the tokenizer state machine, consumed by the run loop.
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping for now; the run loop exits and reports `TokenizerResult::Done`.
    Suspend,
    /// Stop stepping and hand the carried handle back to the caller as
    /// `TokenizerResult::Script`.
    Script(Handle),
}
47
48fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
49    match *opt_str {
50        Some(ref mut s) => s.push_char(c),
51        None => *opt_str = Some(StrTendril::from_char(c)),
52    }
53}
54
/// Tokenizer options, with an impl for `Default`.
///
/// The per-field defaults quoted below are the values produced by
/// `TokenizerOpts::default()`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty?  Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream?  Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state?  Printed
    /// when `end()` is called.  Default: false
    pub profile: bool,

    /// Initial state override.  Only the test runner should use
    /// a non-`None` value!  Default: `None`
    pub initial_state: Option<states::State>,

    /// Last start tag.  Only the test runner should use a
    /// non-`None` value!  Default: `None`
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
81
82impl Default for TokenizerOpts {
83    fn default() -> TokenizerOpts {
84        TokenizerOpts {
85            exact_errors: false,
86            discard_bom: true,
87            profile: false,
88            initial_state: None,
89            last_start_tag_name: None,
90        }
91    }
92}
93
/// The HTML tokenizer.
///
/// All mutable state lives in `Cell`/`RefCell` fields, so the tokenizer's
/// methods operate through `&self`.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: Cell<states::State>,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,

    /// Current input character.  Just consumed, may reconsume.
    current_char: Cell<char>,

    /// Should we reconsume the current input character?
    reconsume: Cell<bool>,

    /// Did we just consume \r, translating it to \n?  In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
    /// beginning of the stream.
    discard_bom: Cell<bool>,

    /// Current tag kind.
    current_tag_kind: Cell<TagKind>,

    /// Current tag name.
    current_tag_name: RefCell<StrTendril>,

    /// Current tag is self-closing?
    current_tag_self_closing: Cell<bool>,

    /// Current tag attributes.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Current attribute name.  Empty when no attribute is pending.
    current_attr_name: RefCell<StrTendril>,

    /// Current attribute value.
    current_attr_value: RefCell<StrTendril>,

    /// Current comment.
    current_comment: RefCell<StrTendril>,

    /// Current doctype token.
    current_doctype: RefCell<Doctype>,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: RefCell<Option<LocalName>>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: RefCell<StrTendril>,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: RefCell<BTreeMap<states::State, u64>>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: Cell<u64>,

    /// Track current line.  Starts at 1 and is bumped for every newline
    /// produced by input preprocessing.
    current_line: Cell<u64>,
}
166
167impl<Sink: TokenSink> Tokenizer<Sink> {
168    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
169    pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
170        let start_tag_name = opts
171            .last_start_tag_name
172            .take()
173            .map(|s| LocalName::from(&*s));
174        let state = opts.initial_state.unwrap_or(states::Data);
175        let discard_bom = opts.discard_bom;
176        Tokenizer {
177            opts,
178            sink,
179            state: Cell::new(state),
180            char_ref_tokenizer: RefCell::new(None),
181            at_eof: Cell::new(false),
182            current_char: Cell::new('\0'),
183            reconsume: Cell::new(false),
184            ignore_lf: Cell::new(false),
185            discard_bom: Cell::new(discard_bom),
186            current_tag_kind: Cell::new(StartTag),
187            current_tag_name: RefCell::new(StrTendril::new()),
188            current_tag_self_closing: Cell::new(false),
189            current_tag_attrs: RefCell::new(vec![]),
190            current_attr_name: RefCell::new(StrTendril::new()),
191            current_attr_value: RefCell::new(StrTendril::new()),
192            current_comment: RefCell::new(StrTendril::new()),
193            current_doctype: RefCell::new(Doctype::default()),
194            last_start_tag_name: RefCell::new(start_tag_name),
195            temp_buf: RefCell::new(StrTendril::new()),
196            state_profile: RefCell::new(BTreeMap::new()),
197            time_in_sink: Cell::new(0),
198            current_line: Cell::new(1),
199        }
200    }
201
202    /// Feed an input string into the tokenizer.
203    pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
204        if input.is_empty() {
205            return TokenizerResult::Done;
206        }
207
208        if self.discard_bom.get() {
209            if let Some(c) = input.peek() {
210                if c == '\u{feff}' {
211                    input.next();
212                }
213            } else {
214                return TokenizerResult::Done;
215            }
216        };
217
218        self.run(input)
219    }
220
    /// Switch the state machine to `states::Plaintext`.
    pub fn set_plaintext_state(&self) {
        self.state.set(states::Plaintext);
    }
224
225    fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
226        if self.opts.profile {
227            let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
228            self.time_in_sink.set(self.time_in_sink.get() + dt);
229            ret
230        } else {
231            self.sink.process_token(token, self.current_line.get())
232        }
233    }
234
    /// Send `token` to the sink for tokens where the sink is expected to
    /// answer `TokenSinkResult::Continue`.
    ///
    /// # Panics
    ///
    /// Panics if the sink returns anything other than
    /// `TokenSinkResult::Continue`.
    fn process_token_and_continue(&self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
241
242    //§ preprocessing-the-input-stream
243    // Get the next input character, which might be the character
244    // 'c' that we already consumed from the buffers.
245    fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
246        if self.ignore_lf.get() {
247            self.ignore_lf.set(false);
248            if c == '\n' {
249                c = input.next()?;
250            }
251        }
252
253        if c == '\r' {
254            self.ignore_lf.set(true);
255            c = '\n';
256        }
257
258        if c == '\n' {
259            self.current_line.set(self.current_line.get() + 1);
260        }
261
262        if self.opts.exact_errors
263            && match c as u32 {
264                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
265                n if (n & 0xFFFE) == 0xFFFE => true,
266                _ => false,
267            }
268        {
269            let msg = format!("Bad character {c}");
270            self.emit_error(Cow::Owned(msg));
271        }
272
273        trace!("got character {c}");
274        self.current_char.set(c);
275        Some(c)
276    }
277
278    //§ tokenization
279    // Get the next input character, if one is available.
280    fn get_char(&self, input: &BufferQueue) -> Option<char> {
281        if self.reconsume.get() {
282            self.reconsume.set(false);
283            Some(self.current_char.get())
284        } else {
285            input
286                .next()
287                .and_then(|c| self.get_preprocessed_char(c, input))
288        }
289    }
290
291    fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
292        // Bail to the slow path for various corner cases.
293        // This means that `FromSet` can contain characters not in the set!
294        // It shouldn't matter because the fallback `FromSet` case should
295        // always do the same thing as the `NotFromSet` case.
296        if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
297            return self.get_char(input).map(FromSet);
298        }
299
300        let d = input.pop_except_from(set);
301        trace!("got characters {d:?}");
302        match d {
303            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
304
305            // NB: We don't set self.current_char for a run of characters not
306            // in the set.  It shouldn't matter for the codepaths that use
307            // this.
308            _ => d,
309        }
310    }
311
312    // Check if the next characters are an ASCII case-insensitive match.  See
313    // BufferQueue::eat.
314    //
315    // NB: this doesn't set the current input character.
316    fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
317        if self.ignore_lf.get() {
318            self.ignore_lf.set(false);
319            if self.peek(input) == Some('\n') {
320                self.discard_char(input);
321            }
322        }
323
324        input.push_front(mem::take(&mut self.temp_buf.borrow_mut()));
325        match input.eat(pat, eq) {
326            None if self.at_eof.get() => Some(false),
327            None => {
328                while let Some(data) = input.next() {
329                    self.temp_buf.borrow_mut().push_char(data);
330                }
331                None
332            },
333            Some(matched) => Some(matched),
334        }
335    }
336
337    /// Run the state machine for as long as we can.
338    fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
339        if self.opts.profile {
340            loop {
341                let state = self.state.get();
342                let old_sink = self.time_in_sink.get();
343                let (run, mut dt) = time!(self.step(input));
344                dt -= (self.time_in_sink.get() - old_sink);
345                let new = match self.state_profile.borrow_mut().get_mut(&state) {
346                    Some(x) => {
347                        *x += dt;
348                        false
349                    },
350                    None => true,
351                };
352                if new {
353                    // do this here because of borrow shenanigans
354                    self.state_profile.borrow_mut().insert(state, dt);
355                }
356                match run {
357                    ProcessResult::Continue => (),
358                    ProcessResult::Suspend => break,
359                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
360                }
361            }
362        } else {
363            loop {
364                match self.step(input) {
365                    ProcessResult::Continue => (),
366                    ProcessResult::Suspend => break,
367                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
368                }
369            }
370        }
371        TokenizerResult::Done
372    }
373
374    #[inline]
375    fn bad_char_error(&self) {
376        #[cfg(feature = "trace_tokenizer")]
377        trace!("  error");
378
379        let msg = if self.opts.exact_errors {
380            Cow::from("Bad character")
381        } else {
382            let c = self.current_char.get();
383            let state = self.state.get();
384            Cow::from(format!("Saw {c} in state {state:?}"))
385        };
386        self.emit_error(msg);
387    }
388
389    #[inline]
390    fn bad_eof_error(&self) {
391        #[cfg(feature = "trace_tokenizer")]
392        trace!("  error_eof");
393
394        let msg = if self.opts.exact_errors {
395            Cow::from("Unexpected EOF")
396        } else {
397            let state = self.state.get();
398            Cow::from(format!("Saw EOF in state {state:?}"))
399        };
400        self.emit_error(msg);
401    }
402
403    fn emit_char(&self, c: char) {
404        #[cfg(feature = "trace_tokenizer")]
405        trace!("  emit");
406
407        self.process_token_and_continue(match c {
408            '\0' => NullCharacterToken,
409            _ => CharacterTokens(StrTendril::from_char(c)),
410        });
411    }
412
413    // The string must not contain '\0'!
414    fn emit_chars(&self, b: StrTendril) {
415        self.process_token_and_continue(CharacterTokens(b));
416    }
417
418    fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
419        self.finish_attribute();
420
421        let name = LocalName::from(&**self.current_tag_name.borrow());
422        self.current_tag_name.borrow_mut().clear();
423
424        match self.current_tag_kind.get() {
425            StartTag => {
426                *self.last_start_tag_name.borrow_mut() = Some(name.clone());
427            },
428            EndTag => {
429                if !self.current_tag_attrs.borrow().is_empty() {
430                    self.emit_error(Borrowed("Attributes on an end tag"));
431                }
432                if self.current_tag_self_closing.get() {
433                    self.emit_error(Borrowed("Self-closing end tag"));
434                }
435            },
436        }
437
438        let token = TagToken(Tag {
439            kind: self.current_tag_kind.get(),
440            name,
441            self_closing: self.current_tag_self_closing.get(),
442            attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()),
443        });
444
445        match self.process_token(token) {
446            TokenSinkResult::Continue => ProcessResult::Continue,
447            TokenSinkResult::Plaintext => {
448                self.state.set(states::Plaintext);
449                ProcessResult::Continue
450            },
451            TokenSinkResult::Script(node) => {
452                self.state.set(states::Data);
453                ProcessResult::Script(node)
454            },
455            TokenSinkResult::RawData(kind) => {
456                self.state.set(states::RawData(kind));
457                ProcessResult::Continue
458            },
459        }
460    }
461
462    fn emit_temp_buf(&self) {
463        #[cfg(feature = "trace_tokenizer")]
464        trace!("  emit_temp");
465
466        // FIXME: Make sure that clearing on emit is spec-compatible.
467        let buf = mem::take(&mut *self.temp_buf.borrow_mut());
468        self.emit_chars(buf);
469    }
470
    /// Empty the temporary buffer in place.
    fn clear_temp_buf(&self) {
        // Do this without a new allocation.
        self.temp_buf.borrow_mut().clear();
    }
475
476    fn emit_current_comment(&self) {
477        let comment = mem::take(&mut *self.current_comment.borrow_mut());
478        self.process_token_and_continue(CommentToken(comment));
479    }
480
481    fn discard_tag(&self) {
482        self.current_tag_name.borrow_mut().clear();
483        self.current_tag_self_closing.set(false);
484        *self.current_tag_attrs.borrow_mut() = vec![];
485    }
486
487    fn create_tag(&self, kind: TagKind, c: char) {
488        self.discard_tag();
489        self.current_tag_name.borrow_mut().push_char(c);
490        self.current_tag_kind.set(kind);
491    }
492
493    fn have_appropriate_end_tag(&self) -> bool {
494        match self.last_start_tag_name.borrow().as_ref() {
495            Some(last) => {
496                (self.current_tag_kind.get() == EndTag)
497                    && (**self.current_tag_name.borrow() == **last)
498            },
499            None => false,
500        }
501    }
502
    /// Start a new attribute whose name begins with `c`.
    fn create_attribute(&self, c: char) {
        // Commit the attribute that was being built (if any) so the name
        // buffer is empty before the new name starts.
        self.finish_attribute();

        self.current_attr_name.borrow_mut().push_char(c);
    }
508
509    fn finish_attribute(&self) {
510        if self.current_attr_name.borrow().is_empty() {
511            return;
512        }
513
514        // Check for a duplicate attribute.
515        // FIXME: the spec says we should error as soon as the name is finished.
516        let dup = {
517            let name = &*self.current_attr_name.borrow();
518            self.current_tag_attrs
519                .borrow()
520                .iter()
521                .any(|a| *a.name.local == **name)
522        };
523
524        if dup {
525            self.emit_error(Borrowed("Duplicate attribute"));
526            self.current_attr_name.borrow_mut().clear();
527            self.current_attr_value.borrow_mut().clear();
528        } else {
529            let name = LocalName::from(&**self.current_attr_name.borrow());
530            self.current_attr_name.borrow_mut().clear();
531            self.current_tag_attrs.borrow_mut().push(Attribute {
532                // The tree builder will adjust the namespace if necessary.
533                // This only happens in foreign elements.
534                name: QualName::new(None, ns!(), name),
535                value: mem::take(&mut self.current_attr_value.borrow_mut()),
536            });
537        }
538    }
539
540    fn emit_current_doctype(&self) {
541        let doctype = self.current_doctype.take();
542        self.process_token_and_continue(DoctypeToken(doctype));
543    }
544
545    fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<'_, Option<StrTendril>> {
546        let current_doctype = self.current_doctype.borrow_mut();
547        match kind {
548            Public => RefMut::map(current_doctype, |d| &mut d.public_id),
549            System => RefMut::map(current_doctype, |d| &mut d.system_id),
550        }
551    }
552
553    fn clear_doctype_id(&self, kind: DoctypeIdKind) {
554        let mut id = self.doctype_id(kind);
555        match *id {
556            Some(ref mut s) => s.clear(),
557            None => *id = Some(StrTendril::new()),
558        }
559    }
560
561    fn consume_char_ref(&self) {
562        *self.char_ref_tokenizer.borrow_mut() = Some(Box::new(CharRefTokenizer::new(matches!(
563            self.state.get(),
564            states::AttributeValue(_)
565        ))));
566    }
567
    /// Send an `EOFToken` to the sink.
    fn emit_eof(&self) {
        self.process_token_and_continue(EOFToken);
    }
571
572    fn peek(&self, input: &BufferQueue) -> Option<char> {
573        if self.reconsume.get() {
574            Some(self.current_char.get())
575        } else {
576            input.peek()
577        }
578    }
579
580    fn discard_char(&self, input: &BufferQueue) {
581        // peek() deals in un-processed characters (no newline normalization), while get_char()
582        // does.
583        //
584        // since discard_char is supposed to be used in combination with peek(), discard_char must
585        // discard a single raw input character, not a normalized newline.
586        if self.reconsume.get() {
587            self.reconsume.set(false);
588        } else {
589            input.next();
590        }
591    }
592
    /// Report a parse error to the sink as a `ParseError` token.
    fn emit_error(&self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
596}
597//§ END
598
// Shorthand for common state machine behaviors.
//
// Each arm maps one command of the form `name args...` onto the corresponding
// method call or field update on the tokenizer `$me`.
macro_rules! shorthand (
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c)                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.borrow_mut().push_char($c)     );
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag()                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input)                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.borrow_mut().push_char($c)             );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf()                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c)                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.borrow_mut().push_char($c)    );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.borrow_mut().push_char($c)   );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.borrow_mut().push_char($c)      );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.borrow_mut().push_slice($c)     );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment()                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.borrow_mut().clear()            );
    ( $me:ident : create_doctype                   ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c)            );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k)                            );
    ( $me:ident : force_quirks                     ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype()                          );
);
622
// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// With the `trace_tokenizer` feature, every shorthand command is logged (as its
// stringified tokens) before it is executed.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    trace!("  {:?}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));

// Without the feature, `sh_trace!` simply forwards to `shorthand!`.
#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
633
// A little DSL for sequencing shorthand actions.
//
// `go!(self: cmd1; cmd2; ...; last)` runs each `;`-separated command through
// `sh_trace!` in order.  The final command may instead be one of the
// terminating forms below (`to`, `reconsume`, `consume_char_ref`, `emit_tag`,
// `eof`), each of which returns from the enclosing function.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // Switch to the named state (optionally parameterized) and keep stepping.
    ( $me:ident : to $s:ident                    ) => ({ $me.state.set(states::$s); return ProcessResult::Continue;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });

    // Like `to`, but flag the current character to be consumed again first.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume.set(true); go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume.set(true); go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    // Install the character reference tokenizer and keep stepping.
    ( $me:ident : consume_char_ref             ) => ({ $me.consume_char_ref(); return ProcessResult::Continue;         });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_current_tag();
    });

    // Emit the EOF token and stop stepping.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // or nothing.
    ( $me:ident : ) => (());
);
670
// Fetch the next input character via `Tokenizer::get_char`.
//
// This is a macro because it can cause early return
// from the function where it is used.
// NOTE(review): `unwrap_or_return!` lives in `crate::macros`; presumably it
// returns `ProcessResult::Suspend` from the caller when no character is
// available — confirm there.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
676
// Look at the next input character via `Tokenizer::peek`; like `get_char!`,
// this can return early from the enclosing function with
// `ProcessResult::Suspend` as the fallback value.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
680
// Call `Tokenizer::pop_except_from` with the given character set; can return
// early from the enclosing function with `ProcessResult::Suspend` as the
// fallback value.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
684
// Call `Tokenizer::eat` with an ASCII case-insensitive byte comparison; can
// return early from the enclosing function with `ProcessResult::Suspend` as
// the fallback value.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
688
// Call `Tokenizer::eat` with an exact byte comparison; can return early from
// the enclosing function with `ProcessResult::Suspend` as the fallback value.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
692
693impl<Sink: TokenSink> Tokenizer<Sink> {
694    // Run the state machine for a while.
    // Return `ProcessResult::Continue` if we should be immediately re-invoked
    // (this just simplifies control flow vs. break / continue).
697    #[allow(clippy::never_loop)]
698    fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
699        if self.char_ref_tokenizer.borrow().is_some() {
700            return self.step_char_ref_tokenizer(input);
701        }
702
703        trace!("processing in state {:?}", self.state);
704        match self.state.get() {
705            //§ data-state
706            states::Data => loop {
707                let set = small_char_set!('\r' '\0' '&' '<' '\n');
708
709                #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
710                let set_result = if !(self.opts.exact_errors
711                    || self.reconsume.get()
712                    || self.ignore_lf.get())
713                    && Self::is_supported_simd_feature_detected()
714                {
715                    let front_buffer = input.peek_front_chunk_mut();
716                    let Some(mut front_buffer) = front_buffer else {
717                        return ProcessResult::Suspend;
718                    };
719
720                    // Special case: The fast path is not worth taking if the first character is already in the set,
721                    // which is fairly common
722                    let first_char = front_buffer
723                        .chars()
724                        .next()
725                        .expect("Input buffers are never empty");
726
727                    if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
728                        drop(front_buffer);
729                        self.pop_except_from(input, set)
730                    } else {
731                        // SAFETY:
732                        // This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above
733                        let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
734
735                        if front_buffer.is_empty() {
736                            drop(front_buffer);
737                            input.pop_front();
738                        }
739
740                        result
741                    }
742                } else {
743                    self.pop_except_from(input, set)
744                };
745
746                #[cfg(not(any(
747                    target_arch = "x86",
748                    target_arch = "x86_64",
749                    target_arch = "aarch64"
750                )))]
751                let set_result = self.pop_except_from(input, set);
752
753                let Some(set_result) = set_result else {
754                    return ProcessResult::Suspend;
755                };
756                match set_result {
757                    FromSet('\0') => {
758                        self.bad_char_error();
759                        self.emit_char('\0');
760                    },
761                    FromSet('&') => go!(self: consume_char_ref),
762                    FromSet('<') => go!(self: to TagOpen),
763                    FromSet(c) => {
764                        self.emit_char(c);
765                    },
766                    NotFromSet(b) => self.emit_chars(b),
767                }
768            },
769
770            //§ rcdata-state
771            states::RawData(Rcdata) => loop {
772                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
773                    FromSet('\0') => {
774                        self.bad_char_error();
775                        self.emit_char('\u{fffd}');
776                    },
777                    FromSet('&') => go!(self: consume_char_ref),
778                    FromSet('<') => go!(self: to RawLessThanSign Rcdata),
779                    FromSet(c) => self.emit_char(c),
780                    NotFromSet(b) => self.emit_chars(b),
781                }
782            },
783
784            //§ rawtext-state
785            states::RawData(Rawtext) => loop {
786                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
787                    FromSet('\0') => {
788                        self.bad_char_error();
789                        self.emit_char('\u{fffd}');
790                    },
791                    FromSet('<') => go!(self: to RawLessThanSign Rawtext),
792                    FromSet(c) => self.emit_char(c),
793                    NotFromSet(b) => self.emit_chars(b),
794                }
795            },
796
797            //§ script-data-state
798            states::RawData(ScriptData) => loop {
799                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
800                    FromSet('\0') => {
801                        self.bad_char_error();
802                        self.emit_char('\u{fffd}');
803                    },
804                    FromSet('<') => go!(self: to RawLessThanSign ScriptData),
805                    FromSet(c) => self.emit_char(c),
806                    NotFromSet(b) => self.emit_chars(b),
807                }
808            },
809
810            //§ script-data-escaped-state
811            states::RawData(ScriptDataEscaped(Escaped)) => loop {
812                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
813                    FromSet('\0') => {
814                        self.bad_char_error();
815                        self.emit_char('\u{fffd}');
816                    },
817                    FromSet('-') => {
818                        self.emit_char('-');
819                        go!(self: to ScriptDataEscapedDash Escaped);
820                    },
821                    FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
822                    FromSet(c) => self.emit_char(c),
823                    NotFromSet(b) => self.emit_chars(b),
824                }
825            },
826
827            //§ script-data-double-escaped-state
828            states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
829                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
830                    FromSet('\0') => {
831                        self.bad_char_error();
832                        self.emit_char('\u{fffd}');
833                    },
834                    FromSet('-') => {
835                        self.emit_char('-');
836                        go!(self: to ScriptDataEscapedDash DoubleEscaped);
837                    },
838                    FromSet('<') => {
839                        self.emit_char('<');
840                        go!(self: to RawLessThanSign ScriptDataEscaped DoubleEscaped)
841                    },
842                    FromSet(c) => self.emit_char(c),
843                    NotFromSet(b) => self.emit_chars(b),
844                }
845            },
846
847            //§ plaintext-state
848            states::Plaintext => loop {
849                match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
850                    FromSet('\0') => {
851                        self.bad_char_error();
852                        self.emit_char('\u{fffd}');
853                    },
854                    FromSet(c) => self.emit_char(c),
855                    NotFromSet(b) => self.emit_chars(b),
856                }
857            },
858
859            //§ tag-open-state
860            states::TagOpen => loop {
861                match get_char!(self, input) {
862                    '!' => go!(self: to MarkupDeclarationOpen),
863                    '/' => go!(self: to EndTagOpen),
864                    '?' => {
865                        self.bad_char_error();
866                        go!(self: clear_comment; reconsume BogusComment)
867                    },
868                    c => match lower_ascii_letter(c) {
869                        Some(cl) => go!(self: create_tag StartTag cl; to TagName),
870                        None => {
871                            self.bad_char_error();
872                            self.emit_char('<');
873                            go!(self: reconsume Data)
874                        },
875                    },
876                }
877            },
878
879            //§ end-tag-open-state
880            states::EndTagOpen => loop {
881                match get_char!(self, input) {
882                    '>' => {
883                        self.bad_char_error();
884                        go!(self: to Data)
885                    },
886                    c => match lower_ascii_letter(c) {
887                        Some(cl) => go!(self: create_tag EndTag cl; to TagName),
888                        None => {
889                            self.bad_char_error();
890                            go!(self: clear_comment; reconsume BogusComment)
891                        },
892                    },
893                }
894            },
895
896            //§ tag-name-state
897            states::TagName => loop {
898                match get_char!(self, input) {
899                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
900                    '/' => go!(self: to SelfClosingStartTag),
901                    '>' => go!(self: emit_tag Data),
902                    '\0' => {
903                        self.bad_char_error();
904                        go!(self: push_tag '\u{fffd}')
905                    },
906                    c => go!(self: push_tag (c.to_ascii_lowercase())),
907                }
908            },
909
910            //§ script-data-escaped-less-than-sign-state
911            states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
912                match get_char!(self, input) {
913                    '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
914                    c => match lower_ascii_letter(c) {
915                        Some(cl) => {
916                            go!(self: clear_temp; push_temp cl);
917                            self.emit_char('<');
918                            self.emit_char(c);
919                            go!(self: to ScriptDataEscapeStart DoubleEscaped);
920                        },
921                        None => {
922                            self.emit_char('<');
923                            go!(self: reconsume RawData ScriptDataEscaped Escaped);
924                        },
925                    },
926                }
927            },
928
929            //§ script-data-double-escaped-less-than-sign-state
930            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
931                match get_char!(self, input) {
932                    '/' => {
933                        go!(self: clear_temp);
934                        self.emit_char('/');
935                        go!(self: to ScriptDataDoubleEscapeEnd);
936                    },
937                    _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
938                }
939            },
940
941            //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
942            // otherwise
943            states::RawLessThanSign(kind) => loop {
944                match get_char!(self, input) {
945                    '/' => go!(self: clear_temp; to RawEndTagOpen kind),
946                    '!' if kind == ScriptData => {
947                        self.emit_char('<');
948                        self.emit_char('!');
949                        go!(self: to ScriptDataEscapeStart Escaped);
950                    },
951                    _ => {
952                        self.emit_char('<');
953                        go!(self: reconsume RawData kind);
954                    },
955                }
956            },
957
958            //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
959            states::RawEndTagOpen(kind) => loop {
960                let c = get_char!(self, input);
961                match lower_ascii_letter(c) {
962                    Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
963                    None => {
964                        self.emit_char('<');
965                        self.emit_char('/');
966                        go!(self: reconsume RawData kind);
967                    },
968                }
969            },
970
971            //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
972            states::RawEndTagName(kind) => loop {
973                let c = get_char!(self, input);
974                if self.have_appropriate_end_tag() {
975                    match c {
976                        '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName),
977                        '/' => go!(self: clear_temp; to SelfClosingStartTag),
978                        '>' => go!(self: clear_temp; emit_tag Data),
979                        _ => (),
980                    }
981                }
982
983                match lower_ascii_letter(c) {
984                    Some(cl) => go!(self: push_tag cl; push_temp c),
985                    None => {
986                        go!(self: discard_tag);
987                        self.emit_char('<');
988                        self.emit_char('/');
989                        self.emit_temp_buf();
990                        go!(self: reconsume RawData kind);
991                    },
992                }
993            },
994
995            //§ script-data-double-escape-start-state
996            states::ScriptDataEscapeStart(DoubleEscaped) => loop {
997                let c = get_char!(self, input);
998                match c {
999                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1000                        let esc = if &**self.temp_buf.borrow() == "script" {
1001                            DoubleEscaped
1002                        } else {
1003                            Escaped
1004                        };
1005                        self.emit_char(c);
1006                        go!(self: to RawData ScriptDataEscaped esc);
1007                    },
1008                    _ => match lower_ascii_letter(c) {
1009                        Some(cl) => {
1010                            go!(self: push_temp cl);
1011                            self.emit_char(c);
1012                        },
1013                        None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
1014                    },
1015                }
1016            },
1017
1018            //§ script-data-escape-start-state
1019            states::ScriptDataEscapeStart(Escaped) => loop {
1020                match get_char!(self, input) {
1021                    '-' => {
1022                        self.emit_char('-');
1023                        go!(self: to ScriptDataEscapeStartDash);
1024                    },
1025                    _ => go!(self: reconsume RawData ScriptData),
1026                }
1027            },
1028
1029            //§ script-data-escape-start-dash-state
1030            states::ScriptDataEscapeStartDash => loop {
1031                match get_char!(self, input) {
1032                    '-' => {
1033                        self.emit_char('-');
1034                        go!(self: to ScriptDataEscapedDashDash Escaped);
1035                    },
1036                    _ => go!(self: reconsume RawData ScriptData),
1037                }
1038            },
1039
1040            //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
1041            states::ScriptDataEscapedDash(kind) => loop {
1042                match get_char!(self, input) {
1043                    '-' => {
1044                        self.emit_char('-');
1045                        go!(self: to ScriptDataEscapedDashDash kind);
1046                    },
1047                    '<' => {
1048                        if kind == DoubleEscaped {
1049                            self.emit_char('<');
1050                        }
1051                        go!(self: to RawLessThanSign ScriptDataEscaped kind);
1052                    },
1053                    '\0' => {
1054                        self.bad_char_error();
1055                        self.emit_char('\u{fffd}');
1056                        go!(self: to RawData ScriptDataEscaped kind)
1057                    },
1058                    c => {
1059                        self.emit_char(c);
1060                        go!(self: to RawData ScriptDataEscaped kind);
1061                    },
1062                }
1063            },
1064
1065            //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
1066            states::ScriptDataEscapedDashDash(kind) => loop {
1067                match get_char!(self, input) {
1068                    '-' => {
1069                        self.emit_char('-');
1070                    },
1071                    '<' => {
1072                        if kind == DoubleEscaped {
1073                            self.emit_char('<');
1074                        }
1075                        go!(self: to RawLessThanSign ScriptDataEscaped kind);
1076                    },
1077                    '>' => {
1078                        self.emit_char('>');
1079                        go!(self: to RawData ScriptData);
1080                    },
1081                    '\0' => {
1082                        self.bad_char_error();
1083                        self.emit_char('\u{fffd}');
1084                        go!(self: to RawData ScriptDataEscaped kind)
1085                    },
1086                    c => {
1087                        self.emit_char(c);
1088                        go!(self: to RawData ScriptDataEscaped kind);
1089                    },
1090                }
1091            },
1092
1093            //§ script-data-double-escape-end-state
1094            states::ScriptDataDoubleEscapeEnd => loop {
1095                let c = get_char!(self, input);
1096                match c {
1097                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1098                        let esc = if &**self.temp_buf.borrow() == "script" {
1099                            Escaped
1100                        } else {
1101                            DoubleEscaped
1102                        };
1103                        self.emit_char(c);
1104                        go!(self: to RawData ScriptDataEscaped esc);
1105                    },
1106                    _ => match lower_ascii_letter(c) {
1107                        Some(cl) => {
1108                            go!(self: push_temp cl);
1109                            self.emit_char(c);
1110                        },
1111                        None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
1112                    },
1113                }
1114            },
1115
1116            //§ before-attribute-name-state
1117            states::BeforeAttributeName => loop {
1118                match get_char!(self, input) {
1119                    '\t' | '\n' | '\x0C' | ' ' => (),
1120                    '/' => go!(self: to SelfClosingStartTag),
1121                    '>' => go!(self: emit_tag Data),
1122                    '\0' => {
1123                        self.bad_char_error();
1124                        go!(self: create_attr '\u{fffd}'; to AttributeName)
1125                    },
1126                    c => match lower_ascii_letter(c) {
1127                        Some(cl) => go!(self: create_attr cl; to AttributeName),
1128                        None => {
1129                            if matches!(c, '"' | '\'' | '<' | '=') {
1130                                self.bad_char_error();
1131                            }
1132
1133                            go!(self: create_attr c; to AttributeName);
1134                        },
1135                    },
1136                }
1137            },
1138
1139            //§ attribute-name-state
1140            states::AttributeName => loop {
1141                match get_char!(self, input) {
1142                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
1143                    '/' => go!(self: to SelfClosingStartTag),
1144                    '=' => go!(self: to BeforeAttributeValue),
1145                    '>' => go!(self: emit_tag Data),
1146                    '\0' => {
1147                        self.bad_char_error();
1148                        go!(self: push_name '\u{fffd}')
1149                    },
1150                    c => match lower_ascii_letter(c) {
1151                        Some(cl) => go!(self: push_name cl),
1152                        None => {
1153                            if matches!(c, '"' | '\'' | '<') {
1154                                self.bad_char_error();
1155                            }
1156                            go!(self: push_name c);
1157                        },
1158                    },
1159                }
1160            },
1161
1162            //§ after-attribute-name-state
1163            states::AfterAttributeName => loop {
1164                match get_char!(self, input) {
1165                    '\t' | '\n' | '\x0C' | ' ' => (),
1166                    '/' => go!(self: to SelfClosingStartTag),
1167                    '=' => go!(self: to BeforeAttributeValue),
1168                    '>' => go!(self: emit_tag Data),
1169                    '\0' => {
1170                        self.bad_char_error();
1171                        go!(self: create_attr '\u{fffd}'; to AttributeName)
1172                    },
1173                    c => match lower_ascii_letter(c) {
1174                        Some(cl) => go!(self: create_attr cl; to AttributeName),
1175                        None => {
1176                            if matches!(c, '"' | '\'' | '<') {
1177                                self.bad_char_error();
1178                            }
1179
1180                            go!(self: create_attr c; to AttributeName);
1181                        },
1182                    },
1183                }
1184            },
1185
1186            //§ before-attribute-value-state
1187            // Use peek so we can handle the first attr character along with the rest,
1188            // hopefully in the same zero-copy buffer.
1189            states::BeforeAttributeValue => loop {
1190                match peek!(self, input) {
1191                    '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1192                    '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1193                    '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1194                    '>' => {
1195                        go!(self: discard_char input);
1196                        self.bad_char_error();
1197                        go!(self: emit_tag Data)
1198                    },
1199                    _ => go!(self: to AttributeValue Unquoted),
1200                }
1201            },
1202
1203            //§ attribute-value-(double-quoted)-state
1204            states::AttributeValue(DoubleQuoted) => loop {
1205                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1206                    FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1207                    FromSet('&') => go!(self: consume_char_ref),
1208                    FromSet('\0') => {
1209                        self.bad_char_error();
1210                        go!(self: push_value '\u{fffd}')
1211                    },
1212                    FromSet(c) => go!(self: push_value c),
1213                    NotFromSet(ref b) => go!(self: append_value b),
1214                }
1215            },
1216
1217            //§ attribute-value-(single-quoted)-state
1218            states::AttributeValue(SingleQuoted) => loop {
1219                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1220                    FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1221                    FromSet('&') => go!(self: consume_char_ref),
1222                    FromSet('\0') => {
1223                        self.bad_char_error();
1224                        go!(self: push_value '\u{fffd}')
1225                    },
1226                    FromSet(c) => go!(self: push_value c),
1227                    NotFromSet(ref b) => go!(self: append_value b),
1228                }
1229            },
1230
1231            //§ attribute-value-(unquoted)-state
1232            states::AttributeValue(Unquoted) => loop {
1233                match pop_except_from!(
1234                    self,
1235                    input,
1236                    small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1237                ) {
1238                    FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1239                        go!(self: to BeforeAttributeName)
1240                    },
1241                    FromSet('&') => go!(self: consume_char_ref),
1242                    FromSet('>') => go!(self: emit_tag Data),
1243                    FromSet('\0') => {
1244                        self.bad_char_error();
1245                        go!(self: push_value '\u{fffd}')
1246                    },
1247                    FromSet(c) => {
1248                        if matches!(c, '"' | '\'' | '<' | '=' | '`') {
1249                            self.bad_char_error();
1250                        }
1251                        go!(self: push_value c);
1252                    },
1253                    NotFromSet(ref b) => go!(self: append_value b),
1254                }
1255            },
1256
1257            //§ after-attribute-value-(quoted)-state
1258            states::AfterAttributeValueQuoted => loop {
1259                match get_char!(self, input) {
1260                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1261                    '/' => go!(self: to SelfClosingStartTag),
1262                    '>' => go!(self: emit_tag Data),
1263                    _ => {
1264                        self.bad_char_error();
1265                        go!(self: reconsume BeforeAttributeName)
1266                    },
1267                }
1268            },
1269
1270            //§ self-closing-start-tag-state
1271            states::SelfClosingStartTag => loop {
1272                match get_char!(self, input) {
1273                    '>' => {
1274                        self.current_tag_self_closing.set(true);
1275                        go!(self: emit_tag Data);
1276                    },
1277                    _ => {
1278                        self.bad_char_error();
1279                        go!(self: reconsume BeforeAttributeName)
1280                    },
1281                }
1282            },
1283
1284            //§ comment-start-state
1285            states::CommentStart => loop {
1286                match get_char!(self, input) {
1287                    '-' => go!(self: to CommentStartDash),
1288                    '\0' => {
1289                        self.bad_char_error();
1290                        go!(self: push_comment '\u{fffd}'; to Comment)
1291                    },
1292                    '>' => {
1293                        self.bad_char_error();
1294                        go!(self: emit_comment; to Data)
1295                    },
1296                    c => go!(self: push_comment c; to Comment),
1297                }
1298            },
1299
1300            //§ comment-start-dash-state
1301            states::CommentStartDash => loop {
1302                match get_char!(self, input) {
1303                    '-' => go!(self: to CommentEnd),
1304                    '\0' => {
1305                        self.bad_char_error();
1306                        go!(self: append_comment "-\u{fffd}"; to Comment)
1307                    },
1308                    '>' => {
1309                        self.bad_char_error();
1310                        go!(self: emit_comment; to Data)
1311                    },
1312                    c => go!(self: push_comment '-'; push_comment c; to Comment),
1313                }
1314            },
1315
1316            //§ comment-state
1317            states::Comment => loop {
1318                match get_char!(self, input) {
1319                    c @ '<' => go!(self: push_comment c; to CommentLessThanSign),
1320                    '-' => go!(self: to CommentEndDash),
1321                    '\0' => {
1322                        self.bad_char_error();
1323                        go!(self: push_comment '\u{fffd}')
1324                    },
1325                    c => go!(self: push_comment c),
1326                }
1327            },
1328
1329            //§ comment-less-than-sign-state
1330            states::CommentLessThanSign => loop {
1331                match get_char!(self, input) {
1332                    c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang),
1333                    c @ '<' => go!(self: push_comment c),
1334                    _ => go!(self: reconsume Comment),
1335                }
1336            },
1337
1338            //§ comment-less-than-sign-bang
1339            states::CommentLessThanSignBang => loop {
1340                match get_char!(self, input) {
1341                    '-' => go!(self: to CommentLessThanSignBangDash),
1342                    _ => go!(self: reconsume Comment),
1343                }
1344            },
1345
1346            //§ comment-less-than-sign-bang-dash
1347            states::CommentLessThanSignBangDash => loop {
1348                match get_char!(self, input) {
1349                    '-' => go!(self: to CommentLessThanSignBangDashDash),
1350                    _ => go!(self: reconsume CommentEndDash),
1351                }
1352            },
1353
1354            //§ comment-less-than-sign-bang-dash-dash
1355            states::CommentLessThanSignBangDashDash => loop {
1356                match get_char!(self, input) {
1357                    '>' => go!(self: reconsume CommentEnd),
1358                    _ => {
1359                        self.bad_char_error();
1360                        go!(self: reconsume CommentEnd)
1361                    },
1362                }
1363            },
1364
1365            //§ comment-end-dash-state
1366            states::CommentEndDash => loop {
1367                match get_char!(self, input) {
1368                    '-' => go!(self: to CommentEnd),
1369                    '\0' => {
1370                        self.bad_char_error();
1371                        go!(self: append_comment "-\u{fffd}"; to Comment)
1372                    },
1373                    c => go!(self: push_comment '-'; push_comment c; to Comment),
1374                }
1375            },
1376
1377            //§ comment-end-state
1378            states::CommentEnd => loop {
1379                match get_char!(self, input) {
1380                    '>' => go!(self: emit_comment; to Data),
1381                    '!' => go!(self: to CommentEndBang),
1382                    '-' => go!(self: push_comment '-'),
1383                    _ => go!(self: append_comment "--"; reconsume Comment),
1384                }
1385            },
1386
1387            //§ comment-end-bang-state
1388            states::CommentEndBang => loop {
1389                match get_char!(self, input) {
1390                    '-' => go!(self: append_comment "--!"; to CommentEndDash),
1391                    '>' => {
1392                        self.bad_char_error();
1393                        go!(self: emit_comment; to Data)
1394                    },
1395                    '\0' => {
1396                        self.bad_char_error();
1397                        go!(self: append_comment "--!\u{fffd}"; to Comment)
1398                    },
1399                    c => go!(self: append_comment "--!"; push_comment c; to Comment),
1400                }
1401            },
1402
1403            //§ doctype-state
1404            states::Doctype => loop {
1405                match get_char!(self, input) {
1406                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1407                    '>' => go!(self: reconsume BeforeDoctypeName),
1408                    _ => {
1409                        self.bad_char_error();
1410                        go!(self: reconsume BeforeDoctypeName)
1411                    },
1412                }
1413            },
1414
1415            //§ before-doctype-name-state
1416            states::BeforeDoctypeName => loop {
1417                match get_char!(self, input) {
1418                    '\t' | '\n' | '\x0C' | ' ' => (),
1419                    '\0' => {
1420                        self.bad_char_error();
1421                        go!(self: create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1422                    },
1423                    '>' => {
1424                        self.bad_char_error();
1425                        go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1426                    },
1427                    c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1428                                  to DoctypeName),
1429                }
1430            },
1431
1432            //§ doctype-name-state
1433            states::DoctypeName => loop {
1434                match get_char!(self, input) {
1435                    '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1436                    '>' => go!(self: emit_doctype; to Data),
1437                    '\0' => {
1438                        self.bad_char_error();
1439                        go!(self: push_doctype_name '\u{fffd}')
1440                    },
1441                    c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1442                }
1443            },
1444
1445            //§ after-doctype-name-state
1446            states::AfterDoctypeName => loop {
1447                if eat!(self, input, "public") {
1448                    go!(self: to AfterDoctypeKeyword Public);
1449                } else if eat!(self, input, "system") {
1450                    go!(self: to AfterDoctypeKeyword System);
1451                } else {
1452                    match get_char!(self, input) {
1453                        '\t' | '\n' | '\x0C' | ' ' => (),
1454                        '>' => go!(self: emit_doctype; to Data),
1455                        _ => {
1456                            self.bad_char_error();
1457                            go!(self: force_quirks; reconsume BogusDoctype)
1458                        },
1459                    }
1460                }
1461            },
1462
1463            //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1464            states::AfterDoctypeKeyword(kind) => loop {
1465                match get_char!(self, input) {
1466                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1467                    '"' => {
1468                        self.bad_char_error();
1469                        go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1470                    },
1471                    '\'' => {
1472                        self.bad_char_error();
1473                        go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1474                    },
1475                    '>' => {
1476                        self.bad_char_error();
1477                        go!(self: force_quirks; emit_doctype; to Data)
1478                    },
1479                    _ => {
1480                        self.bad_char_error();
1481                        go!(self: force_quirks; reconsume BogusDoctype)
1482                    },
1483                }
1484            },
1485
1486            //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
1487            states::BeforeDoctypeIdentifier(kind) => loop {
1488                match get_char!(self, input) {
1489                    '\t' | '\n' | '\x0C' | ' ' => (),
1490                    '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1491                    '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1492                    '>' => {
1493                        self.bad_char_error();
1494                        go!(self: force_quirks; emit_doctype; to Data)
1495                    },
1496                    _ => {
1497                        self.bad_char_error();
1498                        go!(self: force_quirks; reconsume BogusDoctype)
1499                    },
1500                }
1501            },
1502
1503            //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1504            states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1505                match get_char!(self, input) {
1506                    '"' => go!(self: to AfterDoctypeIdentifier kind),
1507                    '\0' => {
1508                        self.bad_char_error();
1509                        go!(self: push_doctype_id kind '\u{fffd}')
1510                    },
1511                    '>' => {
1512                        self.bad_char_error();
1513                        go!(self: force_quirks; emit_doctype; to Data)
1514                    },
1515                    c => go!(self: push_doctype_id kind c),
1516                }
1517            },
1518
1519            //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1520            states::DoctypeIdentifierSingleQuoted(kind) => loop {
1521                match get_char!(self, input) {
1522                    '\'' => go!(self: to AfterDoctypeIdentifier kind),
1523                    '\0' => {
1524                        self.bad_char_error();
1525                        go!(self: push_doctype_id kind '\u{fffd}')
1526                    },
1527                    '>' => {
1528                        self.bad_char_error();
1529                        go!(self: force_quirks; emit_doctype; to Data)
1530                    },
1531                    c => go!(self: push_doctype_id kind c),
1532                }
1533            },
1534
1535            //§ after-doctype-public-identifier-state
1536            states::AfterDoctypeIdentifier(Public) => loop {
1537                match get_char!(self, input) {
1538                    '\t' | '\n' | '\x0C' | ' ' => {
1539                        go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1540                    },
1541                    '>' => go!(self: emit_doctype; to Data),
1542                    '"' => {
1543                        self.bad_char_error();
1544                        go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1545                    },
1546                    '\'' => {
1547                        self.bad_char_error();
1548                        go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1549                    },
1550                    _ => {
1551                        self.bad_char_error();
1552                        go!(self: force_quirks; reconsume BogusDoctype)
1553                    },
1554                }
1555            },
1556
1557            //§ after-doctype-system-identifier-state
1558            states::AfterDoctypeIdentifier(System) => loop {
1559                match get_char!(self, input) {
1560                    '\t' | '\n' | '\x0C' | ' ' => (),
1561                    '>' => go!(self: emit_doctype; to Data),
1562                    _ => {
1563                        self.bad_char_error();
1564                        go!(self: reconsume BogusDoctype)
1565                    },
1566                }
1567            },
1568
1569            //§ between-doctype-public-and-system-identifiers-state
1570            states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1571                match get_char!(self, input) {
1572                    '\t' | '\n' | '\x0C' | ' ' => (),
1573                    '>' => go!(self: emit_doctype; to Data),
1574                    '"' => {
1575                        go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1576                    },
1577                    '\'' => {
1578                        go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1579                    },
1580                    _ => {
1581                        self.bad_char_error();
1582                        go!(self: force_quirks; reconsume BogusDoctype)
1583                    },
1584                }
1585            },
1586
1587            //§ bogus-doctype-state
1588            states::BogusDoctype => loop {
1589                match get_char!(self, input) {
1590                    '>' => go!(self: emit_doctype; to Data),
1591                    '\0' => {
1592                        self.bad_char_error();
1593                    },
1594                    _ => (),
1595                }
1596            },
1597
1598            //§ bogus-comment-state
1599            states::BogusComment => loop {
1600                match get_char!(self, input) {
1601                    '>' => go!(self: emit_comment; to Data),
1602                    '\0' => {
1603                        self.bad_char_error();
1604                        go!(self: push_comment '\u{fffd}')
1605                    },
1606                    c => go!(self: push_comment c),
1607                }
1608            },
1609
1610            //§ markup-declaration-open-state
1611            states::MarkupDeclarationOpen => loop {
1612                if eat_exact!(self, input, "--") {
1613                    go!(self: clear_comment; to CommentStart);
1614                } else if eat!(self, input, "doctype") {
1615                    go!(self: to Doctype);
1616                } else {
1617                    if self
1618                        .sink
1619                        .adjusted_current_node_present_but_not_in_html_namespace()
1620                        && eat_exact!(self, input, "[CDATA[")
1621                    {
1622                        go!(self: clear_temp; to CdataSection);
1623                    }
1624                    self.bad_char_error();
1625                    go!(self: clear_comment; to BogusComment);
1626                }
1627            },
1628
1629            //§ cdata-section-state
1630            states::CdataSection => loop {
1631                match get_char!(self, input) {
1632                    ']' => go!(self: to CdataSectionBracket),
1633                    '\0' => {
1634                        self.emit_temp_buf();
1635                        self.emit_char('\0');
1636                    },
1637                    c => go!(self: push_temp c),
1638                }
1639            },
1640
1641            //§ cdata-section-bracket
1642            states::CdataSectionBracket => match get_char!(self, input) {
1643                ']' => go!(self: to CdataSectionEnd),
1644                _ => go!(self: push_temp ']'; reconsume CdataSection),
1645            },
1646
1647            //§ cdata-section-end
1648            states::CdataSectionEnd => loop {
1649                match get_char!(self, input) {
1650                    ']' => go!(self: push_temp ']'),
1651                    '>' => {
1652                        self.emit_temp_buf();
1653                        go!(self: to Data);
1654                    },
1655                    _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1656                }
1657            },
1658            //§ END
1659        }
1660    }
1661
1662    fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1663        // FIXME HACK: Take and replace the tokenizer so we don't
1664        // double-mut-borrow self.  This is why it's boxed.
1665        let mut tok = self.char_ref_tokenizer.take().unwrap();
1666        let outcome = tok.step(self, input);
1667
1668        let progress = match outcome {
1669            char_ref::Done => {
1670                self.process_char_ref(tok.get_result());
1671                return ProcessResult::Continue;
1672            },
1673
1674            char_ref::Stuck => ProcessResult::Suspend,
1675            char_ref::Progress => ProcessResult::Continue,
1676        };
1677
1678        *self.char_ref_tokenizer.borrow_mut() = Some(tok);
1679        progress
1680    }
1681
1682    fn process_char_ref(&self, char_ref: CharRef) {
1683        let CharRef {
1684            mut chars,
1685            mut num_chars,
1686        } = char_ref;
1687
1688        if num_chars == 0 {
1689            chars[0] = '&';
1690            num_chars = 1;
1691        }
1692
1693        for i in 0..num_chars {
1694            let c = chars[i as usize];
1695            match self.state.get() {
1696                states::Data | states::RawData(states::Rcdata) => self.emit_char(c),
1697
1698                states::AttributeValue(_) => go!(self: push_value c),
1699
1700                _ => panic!(
1701                    "state {:?} should not be reachable in process_char_ref",
1702                    self.state.get()
1703                ),
1704            }
1705        }
1706    }
1707
1708    /// Indicate that we have reached the end of the input.
1709    pub fn end(&self) {
1710        // Handle EOF in the char ref sub-tokenizer, if there is one.
1711        // Do this first because it might un-consume stuff.
1712        let input = BufferQueue::default();
1713        match self.char_ref_tokenizer.take() {
1714            None => (),
1715            Some(mut tok) => {
1716                tok.end_of_file(self, &input);
1717                self.process_char_ref(tok.get_result());
1718            },
1719        }
1720
1721        // Process all remaining buffered input.
1722        // If we're waiting for lookahead, we're not gonna get it.
1723        self.at_eof.set(true);
1724        assert!(matches!(self.run(&input), TokenizerResult::Done));
1725        assert!(input.is_empty());
1726
1727        loop {
1728            match self.eof_step() {
1729                ProcessResult::Continue => (),
1730                ProcessResult::Suspend => break,
1731                ProcessResult::Script(_) => unreachable!(),
1732            }
1733        }
1734
1735        self.sink.end();
1736
1737        if self.opts.profile {
1738            self.dump_profile();
1739        }
1740    }
1741
1742    fn dump_profile(&self) {
1743        let mut results: Vec<(states::State, u64)> = self
1744            .state_profile
1745            .borrow()
1746            .iter()
1747            .map(|(s, t)| (*s, *t))
1748            .collect();
1749        results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1750
1751        let total: u64 = results
1752            .iter()
1753            .map(|&(_, t)| t)
1754            .fold(0, ::std::ops::Add::add);
1755        println!("\nTokenizer profile, in nanoseconds");
1756        println!(
1757            "\n{:12}         total in token sink",
1758            self.time_in_sink.get()
1759        );
1760        println!("\n{total:12}         total in tokenizer");
1761
1762        for (k, v) in results.into_iter() {
1763            let pct = 100.0 * (v as f64) / (total as f64);
1764            println!("{v:12}  {pct:4.1}%  {k:?}");
1765        }
1766    }
1767
    /// Performs one step of end-of-file processing for the current state.
    ///
    /// `end` calls this in a loop: it keeps going while the result is
    /// `ProcessResult::Continue` and stops on `ProcessResult::Suspend`.
    /// Every arm is written with the `go!` macro (defined elsewhere in this
    /// crate), which presumably performs the listed commands and then returns
    /// from this function; arms that switch to another state rely on a later
    /// call to make further progress.
    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state.get());
        match self.state.get() {
            // Plain text content: issue the `eof` command.
            states::Data
            | states::RawData(Rcdata)
            | states::RawData(Rawtext)
            | states::RawData(ScriptData)
            | states::Plaintext => go!(self: eof),

            // EOF inside a tag or inside escaped script data: report it and
            // fall back to the data state. No tag emit command is issued here.
            states::TagName
            | states::RawData(ScriptDataEscaped(_))
            | states::BeforeAttributeName
            | states::AttributeName
            | states::AfterAttributeName
            | states::AttributeValue(_)
            | states::AfterAttributeValueQuoted
            | states::SelfClosingStartTag
            | states::ScriptDataEscapedDash(_)
            | states::ScriptDataEscapedDashDash(_) => {
                self.bad_eof_error();
                go!(self: to Data)
            },

            // No error is reported here; the unquoted attribute value state
            // handles EOF on a later call.
            states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),

            // The already consumed "<" is emitted as character data.
            states::TagOpen => {
                self.bad_eof_error();
                self.emit_char('<');
                go!(self: to Data);
            },

            // Likewise for an already consumed "</".
            states::EndTagOpen => {
                self.bad_eof_error();
                self.emit_char('<');
                self.emit_char('/');
                go!(self: to Data);
            },

            // This arm must stay ahead of the generic `RawLessThanSign(kind)`
            // arm below, which would otherwise match first and emit a '<'.
            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // The following three arms emit the characters consumed so far
            // and go back to the raw data state of the same kind.
            states::RawLessThanSign(kind) => {
                self.emit_char('<');
                go!(self: to RawData kind);
            },

            states::RawEndTagOpen(kind) => {
                self.emit_char('<');
                self.emit_char('/');
                go!(self: to RawData kind);
            },

            states::RawEndTagName(kind) => {
                self.emit_char('<');
                self.emit_char('/');
                self.emit_temp_buf();
                go!(self: to RawData kind)
            },

            // Script data helper states only switch to the corresponding raw
            // data state; that state handles EOF on a later call.
            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // EOF inside a comment: report it and emit the comment.
            states::CommentStart
            | states::CommentStartDash
            | states::Comment
            | states::CommentEndDash
            | states::CommentEnd
            | states::CommentEndBang => {
                self.bad_eof_error();
                go!(self: emit_comment; to Data)
            },

            // Comment sub-states are redirected to one of the comment states
            // handled by the arm above.
            states::CommentLessThanSign | states::CommentLessThanSignBang => {
                go!(self: reconsume Comment)
            },

            states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),

            states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),

            // Unlike the later doctype states, these two issue `create_doctype`
            // first, presumably because no doctype token exists yet at this point.
            states::Doctype | states::BeforeDoctypeName => {
                self.bad_eof_error();
                go!(self: create_doctype; force_quirks; emit_doctype; to Data)
            },

            // EOF inside a doctype: report it and emit the doctype with the
            // `force_quirks` command applied.
            states::DoctypeName
            | states::AfterDoctypeName
            | states::AfterDoctypeKeyword(_)
            | states::BeforeDoctypeIdentifier(_)
            | states::DoctypeIdentifierDoubleQuoted(_)
            | states::DoctypeIdentifierSingleQuoted(_)
            | states::AfterDoctypeIdentifier(_)
            | states::BetweenDoctypePublicAndSystemIdentifiers => {
                self.bad_eof_error();
                go!(self: force_quirks; emit_doctype; to Data)
            },

            // The bogus states emit what they have without a further error.
            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            // Note that this reports a bad character error, not a bad EOF error.
            states::MarkupDeclarationOpen => {
                self.bad_char_error();
                go!(self: to BogusComment)
            },

            // EOF inside a CDATA section: flush the temp buffer, then report it.
            states::CdataSection => {
                self.emit_temp_buf();
                self.bad_eof_error();
                go!(self: to Data)
            },

            // Closing brackets already consumed are pushed to the temp buffer
            // before returning to the CDATA section state.
            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
1891
1892    /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64.
1893    fn is_supported_simd_feature_detected() -> bool {
1894        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1895        {
1896            is_x86_feature_detected!("sse2")
1897        }
1898
1899        #[cfg(target_arch = "aarch64")]
1900        {
1901            std::arch::is_aarch64_feature_detected!("neon")
1902        }
1903
1904        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
1905        false
1906    }
1907
1908    #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
1909    /// Implements the [data state] with SIMD instructions.
1910    /// Calls SSE2- or NEON-specific function for chunks and processes any remaining bytes.
1911    ///
1912    /// The algorithm implemented is the naive SIMD approach described [here].
1913    ///
1914    /// ### SAFETY:
1915    /// Calling this function on a CPU that supports neither SSE2 nor NEON causes undefined behaviour.
1916    ///
1917    /// [data state]: https://html.spec.whatwg.org/#data-state
1918    /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1919    unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1920        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1921        let (mut i, mut n_newlines) = self.data_state_sse2_fast_path(input);
1922
1923        #[cfg(target_arch = "aarch64")]
1924        let (mut i, mut n_newlines) = self.data_state_neon_fast_path(input);
1925
1926        // Process any remaining bytes (less than STRIDE)
1927        while let Some(c) = input.as_bytes().get(i) {
1928            if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1929                break;
1930            }
1931            if *c == b'\n' {
1932                n_newlines += 1;
1933            }
1934
1935            i += 1;
1936        }
1937
1938        let set_result = if i == 0 {
1939            let first_char = input.pop_front_char().unwrap();
1940            debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
1941
1942            // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1943            // Still, it would be nice to not have to do that.
1944            // The same is true for the unwrap call.
1945            let preprocessed_char = self
1946                .get_preprocessed_char(first_char, &BufferQueue::default())
1947                .unwrap();
1948            SetResult::FromSet(preprocessed_char)
1949        } else {
1950            debug_assert!(
1951                input.len() >= i,
1952                "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1953                i,
1954                input.len()
1955            );
1956            let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1957            input.unsafe_pop_front(i as u32);
1958            SetResult::NotFromSet(consumed_chunk)
1959        };
1960
1961        self.current_line.set(self.current_line.get() + n_newlines);
1962
1963        Some(set_result)
1964    }
1965
1966    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1967    #[target_feature(enable = "sse2")]
1968    /// Implements the [data state] with SSE2 instructions for x86/x86_64.
1969    /// Returns a pair of the number of bytes processed and the number of newlines found.
1970    ///
1971    /// ### SAFETY:
1972    /// Calling this function on a CPU that does not support NEON causes undefined behaviour.
1973    ///
1974    /// [data state]: https://html.spec.whatwg.org/#data-state
1975    unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
1976        #[cfg(target_arch = "x86")]
1977        use std::arch::x86::{
1978            __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1979            _mm_set1_epi8,
1980        };
1981        #[cfg(target_arch = "x86_64")]
1982        use std::arch::x86_64::{
1983            __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1984            _mm_set1_epi8,
1985        };
1986
1987        debug_assert!(!input.is_empty());
1988
1989        let quote_mask = _mm_set1_epi8('<' as i8);
1990        let escape_mask = _mm_set1_epi8('&' as i8);
1991        let carriage_return_mask = _mm_set1_epi8('\r' as i8);
1992        let zero_mask = _mm_set1_epi8('\0' as i8);
1993        let newline_mask = _mm_set1_epi8('\n' as i8);
1994
1995        let raw_bytes: &[u8] = input.as_bytes();
1996        let start = raw_bytes.as_ptr();
1997
1998        const STRIDE: usize = 16;
1999        let mut i = 0;
2000        let mut n_newlines = 0;
2001        while i + STRIDE <= raw_bytes.len() {
2002            // Load a 16 byte chunk from the input
2003            let data = _mm_loadu_si128(start.add(i) as *const __m128i);
2004
2005            // Compare the chunk against each mask
2006            let quotes = _mm_cmpeq_epi8(data, quote_mask);
2007            let escapes = _mm_cmpeq_epi8(data, escape_mask);
2008            let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
2009            let zeros = _mm_cmpeq_epi8(data, zero_mask);
2010            let newlines = _mm_cmpeq_epi8(data, newline_mask);
2011
2012            // Combine all test results and create a bitmask from them.
2013            // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
2014            let test_result = _mm_or_si128(
2015                _mm_or_si128(quotes, zeros),
2016                _mm_or_si128(escapes, carriage_returns),
2017            );
2018            let bitmask = _mm_movemask_epi8(test_result);
2019            let newline_mask = _mm_movemask_epi8(newlines);
2020
2021            if (bitmask != 0) {
2022                // We have reached one of the characters that cause the state machine to transition
2023                let position = if cfg!(target_endian = "little") {
2024                    bitmask.trailing_zeros() as usize
2025                } else {
2026                    bitmask.leading_zeros() as usize
2027                };
2028
2029                n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
2030                i += position;
2031                break;
2032            } else {
2033                n_newlines += newline_mask.count_ones() as u64;
2034            }
2035
2036            i += STRIDE;
2037        }
2038
2039        (i, n_newlines)
2040    }
2041
2042    #[cfg(target_arch = "aarch64")]
2043    #[target_feature(enable = "neon")]
2044    /// Implements the [data state] with NEON SIMD instructions for AArch64.
2045    /// Returns a pair of the number of bytes processed and the number of newlines found.
2046    ///
2047    /// ### SAFETY:
2048    /// Calling this function on a CPU that does not support NEON causes undefined behaviour.
2049    ///
2050    /// [data state]: https://html.spec.whatwg.org/#data-state
2051    unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
2052        use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8};
2053
2054        debug_assert!(!input.is_empty());
2055
2056        let quote_mask = vdupq_n_u8(b'<');
2057        let escape_mask = vdupq_n_u8(b'&');
2058        let carriage_return_mask = vdupq_n_u8(b'\r');
2059        let zero_mask = vdupq_n_u8(b'\0');
2060        let newline_mask = vdupq_n_u8(b'\n');
2061
2062        let raw_bytes: &[u8] = input.as_bytes();
2063        let start = raw_bytes.as_ptr();
2064
2065        const STRIDE: usize = 16;
2066        let mut i = 0;
2067        let mut n_newlines = 0;
2068        while i + STRIDE <= raw_bytes.len() {
2069            // Load a 16 byte chunk from the input
2070            let data = vld1q_u8(start.add(i));
2071
2072            // Compare the chunk against each mask
2073            let quotes = vceqq_u8(data, quote_mask);
2074            let escapes = vceqq_u8(data, escape_mask);
2075            let carriage_returns = vceqq_u8(data, carriage_return_mask);
2076            let zeros = vceqq_u8(data, zero_mask);
2077            let newlines = vceqq_u8(data, newline_mask);
2078
2079            // Combine all test results and create a bitmask from them.
2080            // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
2081            let test_result =
2082                vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns));
2083            let bitmask = vmaxvq_u8(test_result);
2084            let newline_mask = vmaxvq_u8(newlines);
2085            if bitmask != 0 {
2086                // We have reached one of the characters that cause the state machine to transition
2087                let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2088                let position = chunk_bytes
2089                    .iter()
2090                    .position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0'))
2091                    .unwrap();
2092
2093                n_newlines += chunk_bytes[..position]
2094                    .iter()
2095                    .filter(|&&b| b == b'\n')
2096                    .count() as u64;
2097
2098                i += position;
2099                break;
2100            } else if newline_mask != 0 {
2101                let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2102                n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64;
2103            }
2104
2105            i += STRIDE;
2106        }
2107
2108        (i, n_newlines)
2109    }
2110}
2111
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::cell::RefCell;

    use crate::LocalName;

    // A TokenSink used to check that current_line is kept up to date when
    // process_token is called. `lines` records every non-character token
    // together with the line number it was reported on.
    struct LinesMatch {
        tokens: RefCell<Vec<Token>>,
        current_str: RefCell<StrTendril>,
        lines: RefCell<Vec<(Token, u64)>>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: RefCell::default(),
                current_str: RefCell::default(),
                lines: RefCell::default(),
            }
        }

        // Flush pending character data, then record the token with its line.
        fn push(&self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.borrow_mut().push((token, line_number));
        }

        // Move accumulated character data (if any) into `tokens`.
        fn finish_str(&self) {
            let pending = self.current_str.take();
            if !pending.is_empty() {
                self.tokens.borrow_mut().push(CharacterTokens(pending));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
            match token {
                CharacterTokens(text) => {
                    self.current_str.borrow_mut().push_slice(&text);
                },

                NullCharacterToken => {
                    self.current_str.borrow_mut().push_char('\0');
                },

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                TagToken(mut tag) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    if matches!(tag.kind, EndTag) {
                        tag.self_closing = false;
                        tag.attrs = vec![];
                    } else {
                        tag.attrs.sort_by(|lhs, rhs| lhs.name.cmp(&rhs.name));
                    }
                    self.push(TagToken(tag), line_number);
                },

                EOFToken => {},

                other => self.push(other, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Feed the chunks through a tokenizer one at a time and return every
    // recorded token paired with the line number it was reported on.
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let tok = Tokenizer::new(LinesMatch::new(), opts);
        let buffer = BufferQueue::default();
        for chunk in input {
            buffer.push_back(chunk);
            let _ = tok.feed(&buffer);
        }
        tok.end();
        tok.sink.lines.take()
    }

    // Build an attribute-less, non-self-closing tag token
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        TagToken(Tag {
            kind: tagkind,
            name: LocalName::from(&*token),
            self_closing: false,
            attrs: vec![],
        })
    }

    // Options shared by the line-number tests.
    fn line_test_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    // Tokenize four chunks holding <a>, <b>, </b>, </a> (one per line) and
    // check that the tags are reported on lines 1 through 4.
    fn assert_nested_tags_on_lines_1_to_4(chunks: [&str; 4]) {
        let input: Vec<StrTendril> = chunks
            .iter()
            .map(|&chunk| StrTendril::from(chunk))
            .collect();
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(input, line_test_opts());
        assert_eq!(results, expected);
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut target: Option<StrTendril> = None;
        option_push(&mut target, 'x');
        assert_eq!(target, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut target: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut target, 'x');
        assert_eq!(target, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut target: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut target, 'x');
        assert_eq!(target, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        assert_nested_tags_on_lines_1_to_4(["<a>\n", "<b>\n", "</b>\n", "</a>\n"]);
    }

    #[test]
    fn check_lines_with_new_line() {
        assert_nested_tags_on_lines_1_to_4(["<a>\r\n", "<b>\r\n", "</b>\r\n", "</a>\r\n"]);
    }
}
2295}