1pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use markup5ever::{ns, small_char_set, TokenizerResult};
28use std::borrow::Cow::{self, Borrowed};
29use std::cell::{Cell, RefCell, RefMut};
30use std::collections::BTreeMap;
31use std::mem;
32
33pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
34use crate::macros::{time, unwrap_or_return};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
42pub enum ProcessResult<Handle> {
43 Continue,
44 Suspend,
45 Script(Handle),
46}
47
48fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
49 match *opt_str {
50 Some(ref mut s) => s.push_char(c),
51 None => *opt_str = Some(StrTendril::from_char(c)),
52 }
53}
54
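/// Options for the HTML tokenizer.
///
/// A minimal construction sketch (the field values below are arbitrary
/// examples, not recommended defaults):
///
/// ```ignore
/// let opts = TokenizerOpts {
///     exact_errors: true,   // report the spec's exact parse errors
///     ..TokenizerOpts::default()
/// };
/// ```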
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some performance
    /// penalty? Default: false.
    pub exact_errors: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one at the beginning of the
    /// stream? Default: true.
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed when
    /// `end()` is called. Default: false.
    pub profile: bool,

    /// Overrides the initial state; when `None`, tokenization starts in the
    /// Data state.
    pub initial_state: Option<states::State>,

    /// The name of the last start tag emitted before this tokenizer was
    /// created, used to recognize an "appropriate end tag" in raw text.
    pub last_start_tag_name: Option<String>,
}
81
82impl Default for TokenizerOpts {
83 fn default() -> TokenizerOpts {
84 TokenizerOpts {
85 exact_errors: false,
86 discard_bom: true,
87 profile: false,
88 initial_state: None,
89 last_start_tag_name: None,
90 }
91 }
92}
93
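/// The HTML tokenizer state machine. It consumes characters from a
/// [`BufferQueue`] and pushes [`Token`]s into a [`TokenSink`].
///
/// A minimal usage sketch, illustrative only: `PrintSink` and the
/// `html5ever::tokenizer` / `html5ever::tendril` import paths are assumptions
/// made for the example, not items defined in this module.
///
/// ```ignore
/// use html5ever::tendril::StrTendril;
/// use html5ever::tokenizer::{
///     BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
/// };
///
/// // A sink that just prints every token it receives.
/// struct PrintSink;
///
/// impl TokenSink for PrintSink {
///     type Handle = ();
///
///     fn process_token(&self, token: Token, _line: u64) -> TokenSinkResult<()> {
///         println!("{token:?}");
///         TokenSinkResult::Continue
///     }
/// }
///
/// let tokenizer = Tokenizer::new(PrintSink, TokenizerOpts::default());
/// let input = BufferQueue::default();
/// input.push_back(StrTendril::from("<p class=\"intro\">Hello</p>"));
/// let _ = tokenizer.feed(&input); // may return Script(_) for a parser that runs scripts
/// tokenizer.end();
/// ```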
pub struct Tokenizer<Sink> {
    /// Options controlling the behaviour of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state, as described in the spec.
    state: Cell<states::State>,

    /// Are we at the end of the input?
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we're tokenizing one at the moment.
    char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,

    /// Current input character. Just consumed, may be reconsumed.
    current_char: Cell<char>,

    /// Should we reconsume the current input character?
    reconsume: Cell<bool>,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: Cell<bool>,

    /// Current tag kind.
    current_tag_kind: Cell<TagKind>,

    /// Current tag name.
    current_tag_name: RefCell<StrTendril>,

    /// Is the current tag self-closing?
    current_tag_self_closing: Cell<bool>,

    /// Current tag attributes.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Current attribute name.
    current_attr_name: RefCell<StrTendril>,

    /// Current attribute value.
    current_attr_value: RefCell<StrTendril>,

    /// Current comment.
    current_comment: RefCell<StrTendril>,

    /// Current doctype token.
    current_doctype: RefCell<Doctype>,

    /// The last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: RefCell<Option<LocalName>>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: RefCell<StrTendril>,

    /// Record of how many ns we spent in each state, if `profile` is enabled.
    state_profile: RefCell<BTreeMap<states::State, u64>>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: Cell<u64>,

    /// Track current line for error reporting.
    current_line: Cell<u64>,
}
166
167impl<Sink: TokenSink> Tokenizer<Sink> {
168 pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
170 let start_tag_name = opts
171 .last_start_tag_name
172 .take()
173 .map(|s| LocalName::from(&*s));
174 let state = opts.initial_state.unwrap_or(states::Data);
175 let discard_bom = opts.discard_bom;
176 Tokenizer {
177 opts,
178 sink,
179 state: Cell::new(state),
180 char_ref_tokenizer: RefCell::new(None),
181 at_eof: Cell::new(false),
182 current_char: Cell::new('\0'),
183 reconsume: Cell::new(false),
184 ignore_lf: Cell::new(false),
185 discard_bom: Cell::new(discard_bom),
186 current_tag_kind: Cell::new(StartTag),
187 current_tag_name: RefCell::new(StrTendril::new()),
188 current_tag_self_closing: Cell::new(false),
189 current_tag_attrs: RefCell::new(vec![]),
190 current_attr_name: RefCell::new(StrTendril::new()),
191 current_attr_value: RefCell::new(StrTendril::new()),
192 current_comment: RefCell::new(StrTendril::new()),
193 current_doctype: RefCell::new(Doctype::default()),
194 last_start_tag_name: RefCell::new(start_tag_name),
195 temp_buf: RefCell::new(StrTendril::new()),
196 state_profile: RefCell::new(BTreeMap::new()),
197 time_in_sink: Cell::new(0),
198 current_line: Cell::new(1),
199 }
200 }
201
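    /// Feed characters from `input` into the state machine, emitting tokens to
    /// the sink as they are completed. Returns when the queue is exhausted
    /// (`Done`) or when the sink asks for a script to be executed (`Script`).
    ///
    /// A hedged sketch of a driving loop; `run_script` is a hypothetical
    /// caller-supplied callback, not part of this crate:
    ///
    /// ```ignore
    /// loop {
    ///     match tokenizer.feed(&input) {
    ///         TokenizerResult::Done => break, // wait for more input
    ///         TokenizerResult::Script(handle) => {
    ///             run_script(handle); // hypothetical: execute, then resume feeding
    ///         },
    ///     }
    /// }
    /// ```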
202 pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
204 if input.is_empty() {
205 return TokenizerResult::Done;
206 }
207
208 if self.discard_bom.get() {
209 if let Some(c) = input.peek() {
210 if c == '\u{feff}' {
211 input.next();
212 }
213 } else {
214 return TokenizerResult::Done;
215 }
216 };
217
218 self.run(input)
219 }
220
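    /// Switch the tokenizer into the Plaintext state: from here on, all
    /// remaining input is emitted as character data (the tree builder
    /// typically calls this after seeing a `<plaintext>` start tag).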
221 pub fn set_plaintext_state(&self) {
222 self.state.set(states::Plaintext);
223 }
224
225 fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
226 if self.opts.profile {
227 let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
228 self.time_in_sink.set(self.time_in_sink.get() + dt);
229 ret
230 } else {
231 self.sink.process_token(token, self.current_line.get())
232 }
233 }
234
235 fn process_token_and_continue(&self, token: Token) {
236 assert!(matches!(
237 self.process_token(token),
238 TokenSinkResult::Continue
239 ));
240 }
241
242 fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
246 if self.ignore_lf.get() {
247 self.ignore_lf.set(false);
248 if c == '\n' {
249 c = input.next()?;
250 }
251 }
252
253 if c == '\r' {
254 self.ignore_lf.set(true);
255 c = '\n';
256 }
257
258 if c == '\n' {
259 self.current_line.set(self.current_line.get() + 1);
260 }
261
262 if self.opts.exact_errors
263 && match c as u32 {
264 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
265 n if (n & 0xFFFE) == 0xFFFE => true,
266 _ => false,
267 }
268 {
269 let msg = format!("Bad character {c}");
270 self.emit_error(Cow::Owned(msg));
271 }
272
273 trace!("got character {c}");
274 self.current_char.set(c);
275 Some(c)
276 }
277
278 fn get_char(&self, input: &BufferQueue) -> Option<char> {
281 if self.reconsume.get() {
282 self.reconsume.set(false);
283 Some(self.current_char.get())
284 } else {
285 input
286 .next()
287 .and_then(|c| self.get_preprocessed_char(c, input))
288 }
289 }
290
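    /// Pop either a single character that belongs to `set` (returned as
    /// `FromSet`, after CR/LF and error preprocessing) or a whole run of
    /// characters outside `set` (returned as `NotFromSet`). Falls back to the
    /// per-character path whenever exact errors, reconsumption, or pending
    /// CR/LF handling require per-character processing.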
291 fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
292 if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
297 return self.get_char(input).map(FromSet);
298 }
299
300 let d = input.pop_except_from(set);
301 trace!("got characters {d:?}");
302 match d {
303 Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
304
305 _ => d,
309 }
310 }
311
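    /// Try to consume the pattern `pat` from the input, comparing bytes with
    /// `eq` (e.g. `u8::eq_ignore_ascii_case`). Returns `Some(true)` on a
    /// match, `Some(false)` on a definite mismatch, and `None` when more input
    /// is needed; in that case the characters read so far are stashed in
    /// `temp_buf` and pushed back in front of the input on the next call.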
312 fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
317 if self.ignore_lf.get() {
318 self.ignore_lf.set(false);
319 if self.peek(input) == Some('\n') {
320 self.discard_char(input);
321 }
322 }
323
324 input.push_front(mem::take(&mut self.temp_buf.borrow_mut()));
325 match input.eat(pat, eq) {
326 None if self.at_eof.get() => Some(false),
327 None => {
328 while let Some(data) = input.next() {
329 self.temp_buf.borrow_mut().push_char(data);
330 }
331 None
332 },
333 Some(matched) => Some(matched),
334 }
335 }
336
337 fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
339 if self.opts.profile {
340 loop {
341 let state = self.state.get();
342 let old_sink = self.time_in_sink.get();
343 let (run, mut dt) = time!(self.step(input));
                dt -= self.time_in_sink.get() - old_sink;
345 let new = match self.state_profile.borrow_mut().get_mut(&state) {
346 Some(x) => {
347 *x += dt;
348 false
349 },
350 None => true,
351 };
352 if new {
353 self.state_profile.borrow_mut().insert(state, dt);
355 }
356 match run {
357 ProcessResult::Continue => (),
358 ProcessResult::Suspend => break,
359 ProcessResult::Script(node) => return TokenizerResult::Script(node),
360 }
361 }
362 } else {
363 loop {
364 match self.step(input) {
365 ProcessResult::Continue => (),
366 ProcessResult::Suspend => break,
367 ProcessResult::Script(node) => return TokenizerResult::Script(node),
368 }
369 }
370 }
371 TokenizerResult::Done
372 }
373
374 #[inline]
375 fn bad_char_error(&self) {
376 #[cfg(feature = "trace_tokenizer")]
377 trace!(" error");
378
379 let msg = if self.opts.exact_errors {
380 Cow::from("Bad character")
381 } else {
382 let c = self.current_char.get();
383 let state = self.state.get();
384 Cow::from(format!("Saw {c} in state {state:?}"))
385 };
386 self.emit_error(msg);
387 }
388
389 #[inline]
390 fn bad_eof_error(&self) {
391 #[cfg(feature = "trace_tokenizer")]
392 trace!(" error_eof");
393
394 let msg = if self.opts.exact_errors {
395 Cow::from("Unexpected EOF")
396 } else {
397 let state = self.state.get();
398 Cow::from(format!("Saw EOF in state {state:?}"))
399 };
400 self.emit_error(msg);
401 }
402
403 fn emit_char(&self, c: char) {
404 #[cfg(feature = "trace_tokenizer")]
405 trace!(" emit");
406
407 self.process_token_and_continue(match c {
408 '\0' => NullCharacterToken,
409 _ => CharacterTokens(StrTendril::from_char(c)),
410 });
411 }
412
413 fn emit_chars(&self, b: StrTendril) {
415 self.process_token_and_continue(CharacterTokens(b));
416 }
417
418 fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
419 self.finish_attribute();
420
421 let name = LocalName::from(&**self.current_tag_name.borrow());
422 self.current_tag_name.borrow_mut().clear();
423
424 match self.current_tag_kind.get() {
425 StartTag => {
426 *self.last_start_tag_name.borrow_mut() = Some(name.clone());
427 },
428 EndTag => {
429 if !self.current_tag_attrs.borrow().is_empty() {
430 self.emit_error(Borrowed("Attributes on an end tag"));
431 }
432 if self.current_tag_self_closing.get() {
433 self.emit_error(Borrowed("Self-closing end tag"));
434 }
435 },
436 }
437
438 let token = TagToken(Tag {
439 kind: self.current_tag_kind.get(),
440 name,
441 self_closing: self.current_tag_self_closing.get(),
442 attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()),
443 });
444
445 match self.process_token(token) {
446 TokenSinkResult::Continue => ProcessResult::Continue,
447 TokenSinkResult::Plaintext => {
448 self.state.set(states::Plaintext);
449 ProcessResult::Continue
450 },
451 TokenSinkResult::Script(node) => {
452 self.state.set(states::Data);
453 ProcessResult::Script(node)
454 },
455 TokenSinkResult::RawData(kind) => {
456 self.state.set(states::RawData(kind));
457 ProcessResult::Continue
458 },
459 }
460 }
461
462 fn emit_temp_buf(&self) {
463 #[cfg(feature = "trace_tokenizer")]
464 trace!(" emit_temp");
465
466 let buf = mem::take(&mut *self.temp_buf.borrow_mut());
468 self.emit_chars(buf);
469 }
470
471 fn clear_temp_buf(&self) {
472 self.temp_buf.borrow_mut().clear();
474 }
475
476 fn emit_current_comment(&self) {
477 let comment = mem::take(&mut *self.current_comment.borrow_mut());
478 self.process_token_and_continue(CommentToken(comment));
479 }
480
481 fn discard_tag(&self) {
482 self.current_tag_name.borrow_mut().clear();
483 self.current_tag_self_closing.set(false);
484 *self.current_tag_attrs.borrow_mut() = vec![];
485 }
486
487 fn create_tag(&self, kind: TagKind, c: char) {
488 self.discard_tag();
489 self.current_tag_name.borrow_mut().push_char(c);
490 self.current_tag_kind.set(kind);
491 }
492
493 fn have_appropriate_end_tag(&self) -> bool {
494 match self.last_start_tag_name.borrow().as_ref() {
495 Some(last) => {
496 (self.current_tag_kind.get() == EndTag)
497 && (**self.current_tag_name.borrow() == **last)
498 },
499 None => false,
500 }
501 }
502
503 fn create_attribute(&self, c: char) {
504 self.finish_attribute();
505
506 self.current_attr_name.borrow_mut().push_char(c);
507 }
508
509 fn finish_attribute(&self) {
510 if self.current_attr_name.borrow().is_empty() {
511 return;
512 }
513
514 let dup = {
517 let name = &*self.current_attr_name.borrow();
518 self.current_tag_attrs
519 .borrow()
520 .iter()
521 .any(|a| *a.name.local == **name)
522 };
523
524 if dup {
525 self.emit_error(Borrowed("Duplicate attribute"));
526 self.current_attr_name.borrow_mut().clear();
527 self.current_attr_value.borrow_mut().clear();
528 } else {
529 let name = LocalName::from(&**self.current_attr_name.borrow());
530 self.current_attr_name.borrow_mut().clear();
531 self.current_tag_attrs.borrow_mut().push(Attribute {
532 name: QualName::new(None, ns!(), name),
535 value: mem::take(&mut self.current_attr_value.borrow_mut()),
536 });
537 }
538 }
539
540 fn emit_current_doctype(&self) {
541 let doctype = self.current_doctype.take();
542 self.process_token_and_continue(DoctypeToken(doctype));
543 }
544
545 fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<'_, Option<StrTendril>> {
546 let current_doctype = self.current_doctype.borrow_mut();
547 match kind {
548 Public => RefMut::map(current_doctype, |d| &mut d.public_id),
549 System => RefMut::map(current_doctype, |d| &mut d.system_id),
550 }
551 }
552
553 fn clear_doctype_id(&self, kind: DoctypeIdKind) {
554 let mut id = self.doctype_id(kind);
555 match *id {
556 Some(ref mut s) => s.clear(),
557 None => *id = Some(StrTendril::new()),
558 }
559 }
560
561 fn consume_char_ref(&self) {
562 *self.char_ref_tokenizer.borrow_mut() = Some(Box::new(CharRefTokenizer::new(matches!(
563 self.state.get(),
564 states::AttributeValue(_)
565 ))));
566 }
567
568 fn emit_eof(&self) {
569 self.process_token_and_continue(EOFToken);
570 }
571
572 fn peek(&self, input: &BufferQueue) -> Option<char> {
573 if self.reconsume.get() {
574 Some(self.current_char.get())
575 } else {
576 input.peek()
577 }
578 }
579
580 fn discard_char(&self, input: &BufferQueue) {
581 if self.reconsume.get() {
587 self.reconsume.set(false);
588 } else {
589 input.next();
590 }
591 }
592
593 fn emit_error(&self, error: Cow<'static, str>) {
594 self.process_token_and_continue(ParseError(error));
595 }
596}
597macro_rules! shorthand (
601 ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
602 ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.borrow_mut().push_char($c) );
603 ( $me:ident : discard_tag ) => ( $me.discard_tag() );
604 ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
605 ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.borrow_mut().push_char($c) );
606 ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
607 ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
608 ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.borrow_mut().push_char($c) );
609 ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_char($c) );
610 ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
611 ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_char($c) );
612 ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_slice($c) );
613 ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
614 ( $me:ident : clear_comment ) => ( $me.current_comment.borrow_mut().clear() );
615 ( $me:ident : create_doctype ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
616 ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
617 ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c) );
618 ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
619 ( $me:ident : force_quirks ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
620 ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
621);
622
623#[cfg(feature = "trace_tokenizer")]
626macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
627 trace!(" {:?}", stringify!($($cmds)*));
628 shorthand!($me : $($cmds)*);
629}));
630
631#[cfg(not(feature = "trace_tokenizer"))]
632macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
633
634macro_rules! go (
636 ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
640 ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
641 ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
642 ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });
643
644 ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return ProcessResult::Continue; });
647 ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue; });
648 ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });
649
650 ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); });
651 ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
652 ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });
653
654 ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(); return ProcessResult::Continue; });
655
656 ( $me:ident : emit_tag $s:ident ) => ({
658 $me.state.set(states::$s);
659 return $me.emit_current_tag();
660 });
661
662 ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });
663
664 ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );
666
667 ( $me:ident : ) => (());
669);
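
// A hedged illustration of the `shorthand!`/`go!` DSL defined above. The
// expansion below is hand-written from the macro arms for clarity (`self` and
// `cl` are assumed to be in scope inside `step`):
//
//     go!(self: create_tag StartTag cl; to TagName);
//
// expands, roughly, to:
//
//     self.create_tag(StartTag, cl);
//     self.state.set(states::TagName);
//     return ProcessResult::Continue;
//
// Every `to` / `reconsume` / `emit_tag` / `consume_char_ref` / `eof` command
// returns from the current call to `step`, while plain shorthand commands
// fall through to the next command in the list.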
670
671macro_rules! get_char ( ($me:expr, $input:expr) => (
674 unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
675));
676
677macro_rules! peek ( ($me:expr, $input:expr) => (
678 unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
679));
680
681macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
682 unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
683));
684
685macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
686 unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
687));
688
689macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
690 unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
691));
692
693impl<Sink: TokenSink> Tokenizer<Sink> {
694 #[allow(clippy::never_loop)]
698 fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
699 if self.char_ref_tokenizer.borrow().is_some() {
700 return self.step_char_ref_tokenizer(input);
701 }
702
703 trace!("processing in state {:?}", self.state);
704 match self.state.get() {
705 states::Data => loop {
707 let set = small_char_set!('\r' '\0' '&' '<' '\n');
708
709 #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
710 let set_result = if !(self.opts.exact_errors
711 || self.reconsume.get()
712 || self.ignore_lf.get())
713 && Self::is_supported_simd_feature_detected()
714 {
715 let front_buffer = input.peek_front_chunk_mut();
716 let Some(mut front_buffer) = front_buffer else {
717 return ProcessResult::Suspend;
718 };
719
720 let first_char = front_buffer
723 .chars()
724 .next()
725 .expect("Input buffers are never empty");
726
727 if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
728 drop(front_buffer);
729 self.pop_except_from(input, set)
730 } else {
731 let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
734
735 if front_buffer.is_empty() {
736 drop(front_buffer);
737 input.pop_front();
738 }
739
740 result
741 }
742 } else {
743 self.pop_except_from(input, set)
744 };
745
746 #[cfg(not(any(
747 target_arch = "x86",
748 target_arch = "x86_64",
749 target_arch = "aarch64"
750 )))]
751 let set_result = self.pop_except_from(input, set);
752
753 let Some(set_result) = set_result else {
754 return ProcessResult::Suspend;
755 };
756 match set_result {
757 FromSet('\0') => {
758 self.bad_char_error();
759 self.emit_char('\0');
760 },
761 FromSet('&') => go!(self: consume_char_ref),
762 FromSet('<') => go!(self: to TagOpen),
763 FromSet(c) => {
764 self.emit_char(c);
765 },
766 NotFromSet(b) => self.emit_chars(b),
767 }
768 },
769
770 states::RawData(Rcdata) => loop {
772 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
773 FromSet('\0') => {
774 self.bad_char_error();
775 self.emit_char('\u{fffd}');
776 },
777 FromSet('&') => go!(self: consume_char_ref),
778 FromSet('<') => go!(self: to RawLessThanSign Rcdata),
779 FromSet(c) => self.emit_char(c),
780 NotFromSet(b) => self.emit_chars(b),
781 }
782 },
783
784 states::RawData(Rawtext) => loop {
786 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
787 FromSet('\0') => {
788 self.bad_char_error();
789 self.emit_char('\u{fffd}');
790 },
791 FromSet('<') => go!(self: to RawLessThanSign Rawtext),
792 FromSet(c) => self.emit_char(c),
793 NotFromSet(b) => self.emit_chars(b),
794 }
795 },
796
797 states::RawData(ScriptData) => loop {
799 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
800 FromSet('\0') => {
801 self.bad_char_error();
802 self.emit_char('\u{fffd}');
803 },
804 FromSet('<') => go!(self: to RawLessThanSign ScriptData),
805 FromSet(c) => self.emit_char(c),
806 NotFromSet(b) => self.emit_chars(b),
807 }
808 },
809
810 states::RawData(ScriptDataEscaped(Escaped)) => loop {
812 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
813 FromSet('\0') => {
814 self.bad_char_error();
815 self.emit_char('\u{fffd}');
816 },
817 FromSet('-') => {
818 self.emit_char('-');
819 go!(self: to ScriptDataEscapedDash Escaped);
820 },
821 FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
822 FromSet(c) => self.emit_char(c),
823 NotFromSet(b) => self.emit_chars(b),
824 }
825 },
826
827 states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
829 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
830 FromSet('\0') => {
831 self.bad_char_error();
832 self.emit_char('\u{fffd}');
833 },
834 FromSet('-') => {
835 self.emit_char('-');
836 go!(self: to ScriptDataEscapedDash DoubleEscaped);
837 },
838 FromSet('<') => {
839 self.emit_char('<');
840 go!(self: to RawLessThanSign ScriptDataEscaped DoubleEscaped)
841 },
842 FromSet(c) => self.emit_char(c),
843 NotFromSet(b) => self.emit_chars(b),
844 }
845 },
846
847 states::Plaintext => loop {
849 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
850 FromSet('\0') => {
851 self.bad_char_error();
852 self.emit_char('\u{fffd}');
853 },
854 FromSet(c) => self.emit_char(c),
855 NotFromSet(b) => self.emit_chars(b),
856 }
857 },
858
859 states::TagOpen => loop {
861 match get_char!(self, input) {
862 '!' => go!(self: to MarkupDeclarationOpen),
863 '/' => go!(self: to EndTagOpen),
864 '?' => {
865 self.bad_char_error();
866 go!(self: clear_comment; reconsume BogusComment)
867 },
868 c => match lower_ascii_letter(c) {
869 Some(cl) => go!(self: create_tag StartTag cl; to TagName),
870 None => {
871 self.bad_char_error();
872 self.emit_char('<');
873 go!(self: reconsume Data)
874 },
875 },
876 }
877 },
878
879 states::EndTagOpen => loop {
881 match get_char!(self, input) {
882 '>' => {
883 self.bad_char_error();
884 go!(self: to Data)
885 },
886 c => match lower_ascii_letter(c) {
887 Some(cl) => go!(self: create_tag EndTag cl; to TagName),
888 None => {
889 self.bad_char_error();
890 go!(self: clear_comment; reconsume BogusComment)
891 },
892 },
893 }
894 },
895
896 states::TagName => loop {
898 match get_char!(self, input) {
899 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
900 '/' => go!(self: to SelfClosingStartTag),
901 '>' => go!(self: emit_tag Data),
902 '\0' => {
903 self.bad_char_error();
904 go!(self: push_tag '\u{fffd}')
905 },
906 c => go!(self: push_tag (c.to_ascii_lowercase())),
907 }
908 },
909
910 states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
912 match get_char!(self, input) {
913 '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
914 c => match lower_ascii_letter(c) {
915 Some(cl) => {
916 go!(self: clear_temp; push_temp cl);
917 self.emit_char('<');
918 self.emit_char(c);
919 go!(self: to ScriptDataEscapeStart DoubleEscaped);
920 },
921 None => {
922 self.emit_char('<');
923 go!(self: reconsume RawData ScriptDataEscaped Escaped);
924 },
925 },
926 }
927 },
928
929 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
931 match get_char!(self, input) {
932 '/' => {
933 go!(self: clear_temp);
934 self.emit_char('/');
935 go!(self: to ScriptDataDoubleEscapeEnd);
936 },
937 _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
938 }
939 },
940
941 states::RawLessThanSign(kind) => loop {
944 match get_char!(self, input) {
945 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
946 '!' if kind == ScriptData => {
947 self.emit_char('<');
948 self.emit_char('!');
949 go!(self: to ScriptDataEscapeStart Escaped);
950 },
951 _ => {
952 self.emit_char('<');
953 go!(self: reconsume RawData kind);
954 },
955 }
956 },
957
958 states::RawEndTagOpen(kind) => loop {
960 let c = get_char!(self, input);
961 match lower_ascii_letter(c) {
962 Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
963 None => {
964 self.emit_char('<');
965 self.emit_char('/');
966 go!(self: reconsume RawData kind);
967 },
968 }
969 },
970
971 states::RawEndTagName(kind) => loop {
973 let c = get_char!(self, input);
974 if self.have_appropriate_end_tag() {
975 match c {
976 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName),
977 '/' => go!(self: clear_temp; to SelfClosingStartTag),
978 '>' => go!(self: clear_temp; emit_tag Data),
979 _ => (),
980 }
981 }
982
983 match lower_ascii_letter(c) {
984 Some(cl) => go!(self: push_tag cl; push_temp c),
985 None => {
986 go!(self: discard_tag);
987 self.emit_char('<');
988 self.emit_char('/');
989 self.emit_temp_buf();
990 go!(self: reconsume RawData kind);
991 },
992 }
993 },
994
995 states::ScriptDataEscapeStart(DoubleEscaped) => loop {
997 let c = get_char!(self, input);
998 match c {
999 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1000 let esc = if &**self.temp_buf.borrow() == "script" {
1001 DoubleEscaped
1002 } else {
1003 Escaped
1004 };
1005 self.emit_char(c);
1006 go!(self: to RawData ScriptDataEscaped esc);
1007 },
1008 _ => match lower_ascii_letter(c) {
1009 Some(cl) => {
1010 go!(self: push_temp cl);
1011 self.emit_char(c);
1012 },
1013 None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
1014 },
1015 }
1016 },
1017
1018 states::ScriptDataEscapeStart(Escaped) => loop {
1020 match get_char!(self, input) {
1021 '-' => {
1022 self.emit_char('-');
1023 go!(self: to ScriptDataEscapeStartDash);
1024 },
1025 _ => go!(self: reconsume RawData ScriptData),
1026 }
1027 },
1028
1029 states::ScriptDataEscapeStartDash => loop {
1031 match get_char!(self, input) {
1032 '-' => {
1033 self.emit_char('-');
1034 go!(self: to ScriptDataEscapedDashDash Escaped);
1035 },
1036 _ => go!(self: reconsume RawData ScriptData),
1037 }
1038 },
1039
1040 states::ScriptDataEscapedDash(kind) => loop {
1042 match get_char!(self, input) {
1043 '-' => {
1044 self.emit_char('-');
1045 go!(self: to ScriptDataEscapedDashDash kind);
1046 },
1047 '<' => {
1048 if kind == DoubleEscaped {
1049 self.emit_char('<');
1050 }
1051 go!(self: to RawLessThanSign ScriptDataEscaped kind);
1052 },
1053 '\0' => {
1054 self.bad_char_error();
1055 self.emit_char('\u{fffd}');
1056 go!(self: to RawData ScriptDataEscaped kind)
1057 },
1058 c => {
1059 self.emit_char(c);
1060 go!(self: to RawData ScriptDataEscaped kind);
1061 },
1062 }
1063 },
1064
1065 states::ScriptDataEscapedDashDash(kind) => loop {
1067 match get_char!(self, input) {
1068 '-' => {
1069 self.emit_char('-');
1070 },
1071 '<' => {
1072 if kind == DoubleEscaped {
1073 self.emit_char('<');
1074 }
1075 go!(self: to RawLessThanSign ScriptDataEscaped kind);
1076 },
1077 '>' => {
1078 self.emit_char('>');
1079 go!(self: to RawData ScriptData);
1080 },
1081 '\0' => {
1082 self.bad_char_error();
1083 self.emit_char('\u{fffd}');
1084 go!(self: to RawData ScriptDataEscaped kind)
1085 },
1086 c => {
1087 self.emit_char(c);
1088 go!(self: to RawData ScriptDataEscaped kind);
1089 },
1090 }
1091 },
1092
1093 states::ScriptDataDoubleEscapeEnd => loop {
1095 let c = get_char!(self, input);
1096 match c {
1097 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
1098 let esc = if &**self.temp_buf.borrow() == "script" {
1099 Escaped
1100 } else {
1101 DoubleEscaped
1102 };
1103 self.emit_char(c);
1104 go!(self: to RawData ScriptDataEscaped esc);
1105 },
1106 _ => match lower_ascii_letter(c) {
1107 Some(cl) => {
1108 go!(self: push_temp cl);
1109 self.emit_char(c);
1110 },
1111 None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
1112 },
1113 }
1114 },
1115
1116 states::BeforeAttributeName => loop {
1118 match get_char!(self, input) {
1119 '\t' | '\n' | '\x0C' | ' ' => (),
1120 '/' => go!(self: to SelfClosingStartTag),
1121 '>' => go!(self: emit_tag Data),
1122 '\0' => {
1123 self.bad_char_error();
1124 go!(self: create_attr '\u{fffd}'; to AttributeName)
1125 },
1126 c => match lower_ascii_letter(c) {
1127 Some(cl) => go!(self: create_attr cl; to AttributeName),
1128 None => {
1129 if matches!(c, '"' | '\'' | '<' | '=') {
1130 self.bad_char_error();
1131 }
1132
1133 go!(self: create_attr c; to AttributeName);
1134 },
1135 },
1136 }
1137 },
1138
1139 states::AttributeName => loop {
1141 match get_char!(self, input) {
1142 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
1143 '/' => go!(self: to SelfClosingStartTag),
1144 '=' => go!(self: to BeforeAttributeValue),
1145 '>' => go!(self: emit_tag Data),
1146 '\0' => {
1147 self.bad_char_error();
1148 go!(self: push_name '\u{fffd}')
1149 },
1150 c => match lower_ascii_letter(c) {
1151 Some(cl) => go!(self: push_name cl),
1152 None => {
1153 if matches!(c, '"' | '\'' | '<') {
1154 self.bad_char_error();
1155 }
1156 go!(self: push_name c);
1157 },
1158 },
1159 }
1160 },
1161
1162 states::AfterAttributeName => loop {
1164 match get_char!(self, input) {
1165 '\t' | '\n' | '\x0C' | ' ' => (),
1166 '/' => go!(self: to SelfClosingStartTag),
1167 '=' => go!(self: to BeforeAttributeValue),
1168 '>' => go!(self: emit_tag Data),
1169 '\0' => {
1170 self.bad_char_error();
1171 go!(self: create_attr '\u{fffd}'; to AttributeName)
1172 },
1173 c => match lower_ascii_letter(c) {
1174 Some(cl) => go!(self: create_attr cl; to AttributeName),
1175 None => {
1176 if matches!(c, '"' | '\'' | '<') {
1177 self.bad_char_error();
1178 }
1179
1180 go!(self: create_attr c; to AttributeName);
1181 },
1182 },
1183 }
1184 },
1185
1186 states::BeforeAttributeValue => loop {
1190 match peek!(self, input) {
1191 '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1192 '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1193 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1194 '>' => {
1195 go!(self: discard_char input);
1196 self.bad_char_error();
1197 go!(self: emit_tag Data)
1198 },
1199 _ => go!(self: to AttributeValue Unquoted),
1200 }
1201 },
1202
1203 states::AttributeValue(DoubleQuoted) => loop {
1205 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1206 FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1207 FromSet('&') => go!(self: consume_char_ref),
1208 FromSet('\0') => {
1209 self.bad_char_error();
1210 go!(self: push_value '\u{fffd}')
1211 },
1212 FromSet(c) => go!(self: push_value c),
1213 NotFromSet(ref b) => go!(self: append_value b),
1214 }
1215 },
1216
1217 states::AttributeValue(SingleQuoted) => loop {
1219 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1220 FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1221 FromSet('&') => go!(self: consume_char_ref),
1222 FromSet('\0') => {
1223 self.bad_char_error();
1224 go!(self: push_value '\u{fffd}')
1225 },
1226 FromSet(c) => go!(self: push_value c),
1227 NotFromSet(ref b) => go!(self: append_value b),
1228 }
1229 },
1230
1231 states::AttributeValue(Unquoted) => loop {
1233 match pop_except_from!(
1234 self,
1235 input,
1236 small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1237 ) {
1238 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1239 go!(self: to BeforeAttributeName)
1240 },
1241 FromSet('&') => go!(self: consume_char_ref),
1242 FromSet('>') => go!(self: emit_tag Data),
1243 FromSet('\0') => {
1244 self.bad_char_error();
1245 go!(self: push_value '\u{fffd}')
1246 },
1247 FromSet(c) => {
1248 if matches!(c, '"' | '\'' | '<' | '=' | '`') {
1249 self.bad_char_error();
1250 }
1251 go!(self: push_value c);
1252 },
1253 NotFromSet(ref b) => go!(self: append_value b),
1254 }
1255 },
1256
1257 states::AfterAttributeValueQuoted => loop {
1259 match get_char!(self, input) {
1260 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1261 '/' => go!(self: to SelfClosingStartTag),
1262 '>' => go!(self: emit_tag Data),
1263 _ => {
1264 self.bad_char_error();
1265 go!(self: reconsume BeforeAttributeName)
1266 },
1267 }
1268 },
1269
1270 states::SelfClosingStartTag => loop {
1272 match get_char!(self, input) {
1273 '>' => {
1274 self.current_tag_self_closing.set(true);
1275 go!(self: emit_tag Data);
1276 },
1277 _ => {
1278 self.bad_char_error();
1279 go!(self: reconsume BeforeAttributeName)
1280 },
1281 }
1282 },
1283
1284 states::CommentStart => loop {
1286 match get_char!(self, input) {
1287 '-' => go!(self: to CommentStartDash),
1288 '\0' => {
1289 self.bad_char_error();
1290 go!(self: push_comment '\u{fffd}'; to Comment)
1291 },
1292 '>' => {
1293 self.bad_char_error();
1294 go!(self: emit_comment; to Data)
1295 },
1296 c => go!(self: push_comment c; to Comment),
1297 }
1298 },
1299
1300 states::CommentStartDash => loop {
1302 match get_char!(self, input) {
1303 '-' => go!(self: to CommentEnd),
1304 '\0' => {
1305 self.bad_char_error();
1306 go!(self: append_comment "-\u{fffd}"; to Comment)
1307 },
1308 '>' => {
1309 self.bad_char_error();
1310 go!(self: emit_comment; to Data)
1311 },
1312 c => go!(self: push_comment '-'; push_comment c; to Comment),
1313 }
1314 },
1315
1316 states::Comment => loop {
1318 match get_char!(self, input) {
1319 c @ '<' => go!(self: push_comment c; to CommentLessThanSign),
1320 '-' => go!(self: to CommentEndDash),
1321 '\0' => {
1322 self.bad_char_error();
1323 go!(self: push_comment '\u{fffd}')
1324 },
1325 c => go!(self: push_comment c),
1326 }
1327 },
1328
1329 states::CommentLessThanSign => loop {
1331 match get_char!(self, input) {
1332 c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang),
1333 c @ '<' => go!(self: push_comment c),
1334 _ => go!(self: reconsume Comment),
1335 }
1336 },
1337
1338 states::CommentLessThanSignBang => loop {
1340 match get_char!(self, input) {
1341 '-' => go!(self: to CommentLessThanSignBangDash),
1342 _ => go!(self: reconsume Comment),
1343 }
1344 },
1345
1346 states::CommentLessThanSignBangDash => loop {
1348 match get_char!(self, input) {
1349 '-' => go!(self: to CommentLessThanSignBangDashDash),
1350 _ => go!(self: reconsume CommentEndDash),
1351 }
1352 },
1353
1354 states::CommentLessThanSignBangDashDash => loop {
1356 match get_char!(self, input) {
1357 '>' => go!(self: reconsume CommentEnd),
1358 _ => {
1359 self.bad_char_error();
1360 go!(self: reconsume CommentEnd)
1361 },
1362 }
1363 },
1364
1365 states::CommentEndDash => loop {
1367 match get_char!(self, input) {
1368 '-' => go!(self: to CommentEnd),
1369 '\0' => {
1370 self.bad_char_error();
1371 go!(self: append_comment "-\u{fffd}"; to Comment)
1372 },
1373 c => go!(self: push_comment '-'; push_comment c; to Comment),
1374 }
1375 },
1376
1377 states::CommentEnd => loop {
1379 match get_char!(self, input) {
1380 '>' => go!(self: emit_comment; to Data),
1381 '!' => go!(self: to CommentEndBang),
1382 '-' => go!(self: push_comment '-'),
1383 _ => go!(self: append_comment "--"; reconsume Comment),
1384 }
1385 },
1386
1387 states::CommentEndBang => loop {
1389 match get_char!(self, input) {
1390 '-' => go!(self: append_comment "--!"; to CommentEndDash),
1391 '>' => {
1392 self.bad_char_error();
1393 go!(self: emit_comment; to Data)
1394 },
1395 '\0' => {
1396 self.bad_char_error();
1397 go!(self: append_comment "--!\u{fffd}"; to Comment)
1398 },
1399 c => go!(self: append_comment "--!"; push_comment c; to Comment),
1400 }
1401 },
1402
1403 states::Doctype => loop {
1405 match get_char!(self, input) {
1406 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1407 '>' => go!(self: reconsume BeforeDoctypeName),
1408 _ => {
1409 self.bad_char_error();
1410 go!(self: reconsume BeforeDoctypeName)
1411 },
1412 }
1413 },
1414
1415 states::BeforeDoctypeName => loop {
1417 match get_char!(self, input) {
1418 '\t' | '\n' | '\x0C' | ' ' => (),
1419 '\0' => {
1420 self.bad_char_error();
1421 go!(self: create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1422 },
1423 '>' => {
1424 self.bad_char_error();
1425 go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1426 },
1427 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1428 to DoctypeName),
1429 }
1430 },
1431
1432 states::DoctypeName => loop {
1434 match get_char!(self, input) {
1435 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1436 '>' => go!(self: emit_doctype; to Data),
1437 '\0' => {
1438 self.bad_char_error();
1439 go!(self: push_doctype_name '\u{fffd}')
1440 },
1441 c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1442 }
1443 },
1444
1445 states::AfterDoctypeName => loop {
1447 if eat!(self, input, "public") {
1448 go!(self: to AfterDoctypeKeyword Public);
1449 } else if eat!(self, input, "system") {
1450 go!(self: to AfterDoctypeKeyword System);
1451 } else {
1452 match get_char!(self, input) {
1453 '\t' | '\n' | '\x0C' | ' ' => (),
1454 '>' => go!(self: emit_doctype; to Data),
1455 _ => {
1456 self.bad_char_error();
1457 go!(self: force_quirks; reconsume BogusDoctype)
1458 },
1459 }
1460 }
1461 },
1462
1463 states::AfterDoctypeKeyword(kind) => loop {
1465 match get_char!(self, input) {
1466 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1467 '"' => {
1468 self.bad_char_error();
1469 go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1470 },
1471 '\'' => {
1472 self.bad_char_error();
1473 go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1474 },
1475 '>' => {
1476 self.bad_char_error();
1477 go!(self: force_quirks; emit_doctype; to Data)
1478 },
1479 _ => {
1480 self.bad_char_error();
1481 go!(self: force_quirks; reconsume BogusDoctype)
1482 },
1483 }
1484 },
1485
1486 states::BeforeDoctypeIdentifier(kind) => loop {
1488 match get_char!(self, input) {
1489 '\t' | '\n' | '\x0C' | ' ' => (),
1490 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1491 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1492 '>' => {
1493 self.bad_char_error();
1494 go!(self: force_quirks; emit_doctype; to Data)
1495 },
1496 _ => {
1497 self.bad_char_error();
1498 go!(self: force_quirks; reconsume BogusDoctype)
1499 },
1500 }
1501 },
1502
1503 states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1505 match get_char!(self, input) {
1506 '"' => go!(self: to AfterDoctypeIdentifier kind),
1507 '\0' => {
1508 self.bad_char_error();
1509 go!(self: push_doctype_id kind '\u{fffd}')
1510 },
1511 '>' => {
1512 self.bad_char_error();
1513 go!(self: force_quirks; emit_doctype; to Data)
1514 },
1515 c => go!(self: push_doctype_id kind c),
1516 }
1517 },
1518
1519 states::DoctypeIdentifierSingleQuoted(kind) => loop {
1521 match get_char!(self, input) {
1522 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1523 '\0' => {
1524 self.bad_char_error();
1525 go!(self: push_doctype_id kind '\u{fffd}')
1526 },
1527 '>' => {
1528 self.bad_char_error();
1529 go!(self: force_quirks; emit_doctype; to Data)
1530 },
1531 c => go!(self: push_doctype_id kind c),
1532 }
1533 },
1534
1535 states::AfterDoctypeIdentifier(Public) => loop {
1537 match get_char!(self, input) {
1538 '\t' | '\n' | '\x0C' | ' ' => {
1539 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1540 },
1541 '>' => go!(self: emit_doctype; to Data),
1542 '"' => {
1543 self.bad_char_error();
1544 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1545 },
1546 '\'' => {
1547 self.bad_char_error();
1548 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1549 },
1550 _ => {
1551 self.bad_char_error();
1552 go!(self: force_quirks; reconsume BogusDoctype)
1553 },
1554 }
1555 },
1556
1557 states::AfterDoctypeIdentifier(System) => loop {
1559 match get_char!(self, input) {
1560 '\t' | '\n' | '\x0C' | ' ' => (),
1561 '>' => go!(self: emit_doctype; to Data),
1562 _ => {
1563 self.bad_char_error();
1564 go!(self: reconsume BogusDoctype)
1565 },
1566 }
1567 },
1568
1569 states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1571 match get_char!(self, input) {
1572 '\t' | '\n' | '\x0C' | ' ' => (),
1573 '>' => go!(self: emit_doctype; to Data),
1574 '"' => {
1575 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1576 },
1577 '\'' => {
1578 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1579 },
1580 _ => {
1581 self.bad_char_error();
1582 go!(self: force_quirks; reconsume BogusDoctype)
1583 },
1584 }
1585 },
1586
1587 states::BogusDoctype => loop {
1589 match get_char!(self, input) {
1590 '>' => go!(self: emit_doctype; to Data),
1591 '\0' => {
1592 self.bad_char_error();
1593 },
1594 _ => (),
1595 }
1596 },
1597
1598 states::BogusComment => loop {
1600 match get_char!(self, input) {
1601 '>' => go!(self: emit_comment; to Data),
1602 '\0' => {
1603 self.bad_char_error();
1604 go!(self: push_comment '\u{fffd}')
1605 },
1606 c => go!(self: push_comment c),
1607 }
1608 },
1609
1610 states::MarkupDeclarationOpen => loop {
1612 if eat_exact!(self, input, "--") {
1613 go!(self: clear_comment; to CommentStart);
1614 } else if eat!(self, input, "doctype") {
1615 go!(self: to Doctype);
1616 } else {
1617 if self
1618 .sink
1619 .adjusted_current_node_present_but_not_in_html_namespace()
1620 && eat_exact!(self, input, "[CDATA[")
1621 {
1622 go!(self: clear_temp; to CdataSection);
1623 }
1624 self.bad_char_error();
1625 go!(self: clear_comment; to BogusComment);
1626 }
1627 },
1628
1629 states::CdataSection => loop {
1631 match get_char!(self, input) {
1632 ']' => go!(self: to CdataSectionBracket),
1633 '\0' => {
1634 self.emit_temp_buf();
1635 self.emit_char('\0');
1636 },
1637 c => go!(self: push_temp c),
1638 }
1639 },
1640
1641 states::CdataSectionBracket => match get_char!(self, input) {
1643 ']' => go!(self: to CdataSectionEnd),
1644 _ => go!(self: push_temp ']'; reconsume CdataSection),
1645 },
1646
1647 states::CdataSectionEnd => loop {
1649 match get_char!(self, input) {
1650 ']' => go!(self: push_temp ']'),
1651 '>' => {
1652 self.emit_temp_buf();
1653 go!(self: to Data);
1654 },
1655 _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1656 }
1657 },
1658 }
1660 }
1661
1662 fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1663 let mut tok = self.char_ref_tokenizer.take().unwrap();
1666 let outcome = tok.step(self, input);
1667
1668 let progress = match outcome {
1669 char_ref::Done => {
1670 self.process_char_ref(tok.get_result());
1671 return ProcessResult::Continue;
1672 },
1673
1674 char_ref::Stuck => ProcessResult::Suspend,
1675 char_ref::Progress => ProcessResult::Continue,
1676 };
1677
1678 *self.char_ref_tokenizer.borrow_mut() = Some(tok);
1679 progress
1680 }
1681
1682 fn process_char_ref(&self, char_ref: CharRef) {
1683 let CharRef {
1684 mut chars,
1685 mut num_chars,
1686 } = char_ref;
1687
1688 if num_chars == 0 {
1689 chars[0] = '&';
1690 num_chars = 1;
1691 }
1692
1693 for i in 0..num_chars {
1694 let c = chars[i as usize];
1695 match self.state.get() {
1696 states::Data | states::RawData(states::Rcdata) => self.emit_char(c),
1697
1698 states::AttributeValue(_) => go!(self: push_value c),
1699
1700 _ => panic!(
1701 "state {:?} should not be reachable in process_char_ref",
1702 self.state.get()
1703 ),
1704 }
1705 }
1706 }
1707
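    /// Signal that the input stream has ended. Finishes any in-progress
    /// character reference, applies the EOF rules for the current state,
    /// notifies the sink via `sink.end()`, and prints the state profile if
    /// `profile` was enabled.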
1708 pub fn end(&self) {
1710 let input = BufferQueue::default();
1713 match self.char_ref_tokenizer.take() {
1714 None => (),
1715 Some(mut tok) => {
1716 tok.end_of_file(self, &input);
1717 self.process_char_ref(tok.get_result());
1718 },
1719 }
1720
1721 self.at_eof.set(true);
1724 assert!(matches!(self.run(&input), TokenizerResult::Done));
1725 assert!(input.is_empty());
1726
1727 loop {
1728 match self.eof_step() {
1729 ProcessResult::Continue => (),
1730 ProcessResult::Suspend => break,
1731 ProcessResult::Script(_) => unreachable!(),
1732 }
1733 }
1734
1735 self.sink.end();
1736
1737 if self.opts.profile {
1738 self.dump_profile();
1739 }
1740 }
1741
1742 fn dump_profile(&self) {
1743 let mut results: Vec<(states::State, u64)> = self
1744 .state_profile
1745 .borrow()
1746 .iter()
1747 .map(|(s, t)| (*s, *t))
1748 .collect();
1749 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1750
1751 let total: u64 = results
1752 .iter()
1753 .map(|&(_, t)| t)
1754 .fold(0, ::std::ops::Add::add);
1755 println!("\nTokenizer profile, in nanoseconds");
1756 println!(
1757 "\n{:12} total in token sink",
1758 self.time_in_sink.get()
1759 );
1760 println!("\n{total:12} total in tokenizer");
1761
1762 for (k, v) in results.into_iter() {
1763 let pct = 100.0 * (v as f64) / (total as f64);
1764 println!("{v:12} {pct:4.1}% {k:?}");
1765 }
1766 }
1767
1768 fn eof_step(&self) -> ProcessResult<Sink::Handle> {
1769 debug!("processing EOF in state {:?}", self.state.get());
1770 match self.state.get() {
1771 states::Data
1772 | states::RawData(Rcdata)
1773 | states::RawData(Rawtext)
1774 | states::RawData(ScriptData)
1775 | states::Plaintext => go!(self: eof),
1776
1777 states::TagName
1778 | states::RawData(ScriptDataEscaped(_))
1779 | states::BeforeAttributeName
1780 | states::AttributeName
1781 | states::AfterAttributeName
1782 | states::AttributeValue(_)
1783 | states::AfterAttributeValueQuoted
1784 | states::SelfClosingStartTag
1785 | states::ScriptDataEscapedDash(_)
1786 | states::ScriptDataEscapedDashDash(_) => {
1787 self.bad_eof_error();
1788 go!(self: to Data)
1789 },
1790
1791 states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),
1792
1793 states::TagOpen => {
1794 self.bad_eof_error();
1795 self.emit_char('<');
1796 go!(self: to Data);
1797 },
1798
1799 states::EndTagOpen => {
1800 self.bad_eof_error();
1801 self.emit_char('<');
1802 self.emit_char('/');
1803 go!(self: to Data);
1804 },
1805
1806 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
1807 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1808 },
1809
1810 states::RawLessThanSign(kind) => {
1811 self.emit_char('<');
1812 go!(self: to RawData kind);
1813 },
1814
1815 states::RawEndTagOpen(kind) => {
1816 self.emit_char('<');
1817 self.emit_char('/');
1818 go!(self: to RawData kind);
1819 },
1820
1821 states::RawEndTagName(kind) => {
1822 self.emit_char('<');
1823 self.emit_char('/');
1824 self.emit_temp_buf();
1825 go!(self: to RawData kind)
1826 },
1827
1828 states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
1829
1830 states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),
1831
1832 states::ScriptDataDoubleEscapeEnd => {
1833 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1834 },
1835
1836 states::CommentStart
1837 | states::CommentStartDash
1838 | states::Comment
1839 | states::CommentEndDash
1840 | states::CommentEnd
1841 | states::CommentEndBang => {
1842 self.bad_eof_error();
1843 go!(self: emit_comment; to Data)
1844 },
1845
1846 states::CommentLessThanSign | states::CommentLessThanSignBang => {
1847 go!(self: reconsume Comment)
1848 },
1849
1850 states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),
1851
1852 states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),
1853
1854 states::Doctype | states::BeforeDoctypeName => {
1855 self.bad_eof_error();
1856 go!(self: create_doctype; force_quirks; emit_doctype; to Data)
1857 },
1858
1859 states::DoctypeName
1860 | states::AfterDoctypeName
1861 | states::AfterDoctypeKeyword(_)
1862 | states::BeforeDoctypeIdentifier(_)
1863 | states::DoctypeIdentifierDoubleQuoted(_)
1864 | states::DoctypeIdentifierSingleQuoted(_)
1865 | states::AfterDoctypeIdentifier(_)
1866 | states::BetweenDoctypePublicAndSystemIdentifiers => {
1867 self.bad_eof_error();
1868 go!(self: force_quirks; emit_doctype; to Data)
1869 },
1870
1871 states::BogusDoctype => go!(self: emit_doctype; to Data),
1872
1873 states::BogusComment => go!(self: emit_comment; to Data),
1874
1875 states::MarkupDeclarationOpen => {
1876 self.bad_char_error();
1877 go!(self: to BogusComment)
1878 },
1879
1880 states::CdataSection => {
1881 self.emit_temp_buf();
1882 self.bad_eof_error();
1883 go!(self: to Data)
1884 },
1885
1886 states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),
1887
1888 states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
1889 }
1890 }
1891
1892 fn is_supported_simd_feature_detected() -> bool {
1894 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1895 {
1896 is_x86_feature_detected!("sse2")
1897 }
1898
1899 #[cfg(target_arch = "aarch64")]
1900 {
1901 std::arch::is_aarch64_feature_detected!("neon")
1902 }
1903
1904 #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
1905 false
1906 }
1907
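    /// Scan the front input buffer for the next character the Data state must
    /// handle specially (`<`, `&`, `\r` or `\0`), skipping plain text 16 bytes
    /// at a time with SIMD and finishing the tail with a scalar loop. Returns
    /// `NotFromSet` with the plain-text prefix that was consumed, or `FromSet`
    /// with the (preprocessed) special character when it is the very first
    /// byte. Newlines are counted along the way so `current_line` stays
    /// accurate.
    ///
    /// Safety: the caller must have verified that the required SIMD feature
    /// (SSE2 or NEON) is available; see `is_supported_simd_feature_detected`.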
1908 #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
1909 unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1920 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1921 let (mut i, mut n_newlines) = self.data_state_sse2_fast_path(input);
1922
1923 #[cfg(target_arch = "aarch64")]
1924 let (mut i, mut n_newlines) = self.data_state_neon_fast_path(input);
1925
1926 while let Some(c) = input.as_bytes().get(i) {
1928 if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1929 break;
1930 }
1931 if *c == b'\n' {
1932 n_newlines += 1;
1933 }
1934
1935 i += 1;
1936 }
1937
1938 let set_result = if i == 0 {
1939 let first_char = input.pop_front_char().unwrap();
1940 debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
1941
1942 let preprocessed_char = self
1946 .get_preprocessed_char(first_char, &BufferQueue::default())
1947 .unwrap();
1948 SetResult::FromSet(preprocessed_char)
1949 } else {
1950 debug_assert!(
1951 input.len() >= i,
1952 "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1953 i,
1954 input.len()
1955 );
1956 let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1957 input.unsafe_pop_front(i as u32);
1958 SetResult::NotFromSet(consumed_chunk)
1959 };
1960
1961 self.current_line.set(self.current_line.get() + n_newlines);
1962
1963 Some(set_result)
1964 }
1965
1966 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1967 #[target_feature(enable = "sse2")]
1968 unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
1976 #[cfg(target_arch = "x86")]
1977 use std::arch::x86::{
1978 __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1979 _mm_set1_epi8,
1980 };
1981 #[cfg(target_arch = "x86_64")]
1982 use std::arch::x86_64::{
1983 __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1984 _mm_set1_epi8,
1985 };
1986
1987 debug_assert!(!input.is_empty());
1988
1989 let quote_mask = _mm_set1_epi8('<' as i8);
1990 let escape_mask = _mm_set1_epi8('&' as i8);
1991 let carriage_return_mask = _mm_set1_epi8('\r' as i8);
1992 let zero_mask = _mm_set1_epi8('\0' as i8);
1993 let newline_mask = _mm_set1_epi8('\n' as i8);
1994
1995 let raw_bytes: &[u8] = input.as_bytes();
1996 let start = raw_bytes.as_ptr();
1997
1998 const STRIDE: usize = 16;
1999 let mut i = 0;
2000 let mut n_newlines = 0;
2001 while i + STRIDE <= raw_bytes.len() {
2002 let data = _mm_loadu_si128(start.add(i) as *const __m128i);
2004
2005 let quotes = _mm_cmpeq_epi8(data, quote_mask);
2007 let escapes = _mm_cmpeq_epi8(data, escape_mask);
2008 let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
2009 let zeros = _mm_cmpeq_epi8(data, zero_mask);
2010 let newlines = _mm_cmpeq_epi8(data, newline_mask);
2011
2012 let test_result = _mm_or_si128(
2015 _mm_or_si128(quotes, zeros),
2016 _mm_or_si128(escapes, carriage_returns),
2017 );
2018 let bitmask = _mm_movemask_epi8(test_result);
2019 let newline_mask = _mm_movemask_epi8(newlines);
2020
            if bitmask != 0 {
2022 let position = if cfg!(target_endian = "little") {
2024 bitmask.trailing_zeros() as usize
2025 } else {
2026 bitmask.leading_zeros() as usize
2027 };
2028
2029 n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
2030 i += position;
2031 break;
2032 } else {
2033 n_newlines += newline_mask.count_ones() as u64;
2034 }
2035
2036 i += STRIDE;
2037 }
2038
2039 (i, n_newlines)
2040 }
2041
2042 #[cfg(target_arch = "aarch64")]
2043 #[target_feature(enable = "neon")]
2044 unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
2052 use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8};
2053
2054 debug_assert!(!input.is_empty());
2055
2056 let quote_mask = vdupq_n_u8(b'<');
2057 let escape_mask = vdupq_n_u8(b'&');
2058 let carriage_return_mask = vdupq_n_u8(b'\r');
2059 let zero_mask = vdupq_n_u8(b'\0');
2060 let newline_mask = vdupq_n_u8(b'\n');
2061
2062 let raw_bytes: &[u8] = input.as_bytes();
2063 let start = raw_bytes.as_ptr();
2064
2065 const STRIDE: usize = 16;
2066 let mut i = 0;
2067 let mut n_newlines = 0;
2068 while i + STRIDE <= raw_bytes.len() {
2069 let data = vld1q_u8(start.add(i));
2071
2072 let quotes = vceqq_u8(data, quote_mask);
2074 let escapes = vceqq_u8(data, escape_mask);
2075 let carriage_returns = vceqq_u8(data, carriage_return_mask);
2076 let zeros = vceqq_u8(data, zero_mask);
2077 let newlines = vceqq_u8(data, newline_mask);
2078
2079 let test_result =
2082 vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns));
2083 let bitmask = vmaxvq_u8(test_result);
2084 let newline_mask = vmaxvq_u8(newlines);
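            // Unlike SSE2 there is no movemask on NEON: `vmaxvq_u8` reduces
            // the comparison vector to its maximum lane, so a non-zero value
            // only tells us that *some* byte matched. The exact position is
            // found with the scalar search below.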
2085 if bitmask != 0 {
2086 let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2088 let position = chunk_bytes
2089 .iter()
2090 .position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0'))
2091 .unwrap();
2092
2093 n_newlines += chunk_bytes[..position]
2094 .iter()
2095 .filter(|&&b| b == b'\n')
2096 .count() as u64;
2097
2098 i += position;
2099 break;
2100 } else if newline_mask != 0 {
2101 let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2102 n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64;
2103 }
2104
2105 i += STRIDE;
2106 }
2107
2108 (i, n_newlines)
2109 }
2110}
2111
2112#[cfg(test)]
2113#[allow(non_snake_case)]
2114mod test {
    use super::option_push;
    use crate::tendril::{SliceExt, StrTendril};
2117
2118 use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
2119
2120 use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
2121 use super::interface::{EndTag, StartTag, Tag, TagKind};
2122 use super::interface::{TagToken, Token};
2123
2124 use markup5ever::buffer_queue::BufferQueue;
2125 use std::cell::RefCell;
2126
2127 use crate::LocalName;
2128
2129 struct LinesMatch {
2133 tokens: RefCell<Vec<Token>>,
2134 current_str: RefCell<StrTendril>,
2135 lines: RefCell<Vec<(Token, u64)>>,
2136 }
2137
2138 impl LinesMatch {
2139 fn new() -> LinesMatch {
2140 LinesMatch {
2141 tokens: RefCell::new(vec![]),
2142 current_str: RefCell::new(StrTendril::new()),
2143 lines: RefCell::new(vec![]),
2144 }
2145 }
2146
2147 fn push(&self, token: Token, line_number: u64) {
2148 self.finish_str();
2149 self.lines.borrow_mut().push((token, line_number));
2150 }
2151
2152 fn finish_str(&self) {
2153 if !self.current_str.borrow().is_empty() {
2154 let s = self.current_str.take();
2155 self.tokens.borrow_mut().push(CharacterTokens(s));
2156 }
2157 }
2158 }
2159
2160 impl TokenSink for LinesMatch {
2161 type Handle = ();
2162
2163 fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
2164 match token {
2165 CharacterTokens(b) => {
2166 self.current_str.borrow_mut().push_slice(&b);
2167 },
2168
2169 NullCharacterToken => {
2170 self.current_str.borrow_mut().push_char('\0');
2171 },
2172
2173 ParseError(_) => {
2174 panic!("unexpected parse error");
2175 },
2176
2177 TagToken(mut t) => {
2178 match t.kind {
2182 EndTag => {
2183 t.self_closing = false;
2184 t.attrs = vec![];
2185 },
2186 _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
2187 }
2188 self.push(TagToken(t), line_number);
2189 },
2190
2191 EOFToken => (),
2192
2193 _ => self.push(token, line_number),
2194 }
2195 TokenSinkResult::Continue
2196 }
2197 }
2198
2199 fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
2202 let sink = LinesMatch::new();
2203 let tok = Tokenizer::new(sink, opts);
2204 let buffer = BufferQueue::default();
2205 for chunk in input.into_iter() {
2206 buffer.push_back(chunk);
2207 let _ = tok.feed(&buffer);
2208 }
2209 tok.end();
2210 tok.sink.lines.take()
2211 }
2212
2213 fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
2215 let name = LocalName::from(&*token);
2216
2217 TagToken(Tag {
2218 kind: tagkind,
2219 name,
2220 self_closing: false,
2221 attrs: vec![],
2222 })
2223 }
2224
2225 #[test]
2226 fn push_to_None_gives_singleton() {
2227 let mut s: Option<StrTendril> = None;
2228 option_push(&mut s, 'x');
2229 assert_eq!(s, Some("x".to_tendril()));
2230 }
2231
2232 #[test]
2233 fn push_to_empty_appends() {
2234 let mut s: Option<StrTendril> = Some(StrTendril::new());
2235 option_push(&mut s, 'x');
2236 assert_eq!(s, Some("x".to_tendril()));
2237 }
2238
2239 #[test]
2240 fn push_to_nonempty_appends() {
2241 let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
2242 option_push(&mut s, 'x');
2243 assert_eq!(s, Some("yx".to_tendril()));
2244 }
2245
2246 #[test]
2247 fn check_lines() {
2248 let opts = TokenizerOpts {
2249 exact_errors: false,
2250 discard_bom: true,
2251 profile: false,
2252 initial_state: None,
2253 last_start_tag_name: None,
2254 };
2255 let vector = vec![
2256 StrTendril::from("<a>\n"),
2257 StrTendril::from("<b>\n"),
2258 StrTendril::from("</b>\n"),
2259 StrTendril::from("</a>\n"),
2260 ];
2261 let expected = vec![
2262 (create_tag(StrTendril::from("a"), StartTag), 1),
2263 (create_tag(StrTendril::from("b"), StartTag), 2),
2264 (create_tag(StrTendril::from("b"), EndTag), 3),
2265 (create_tag(StrTendril::from("a"), EndTag), 4),
2266 ];
2267 let results = tokenize(vector, opts);
2268 assert_eq!(results, expected);
2269 }
2270
2271 #[test]
2272 fn check_lines_with_new_line() {
2273 let opts = TokenizerOpts {
2274 exact_errors: false,
2275 discard_bom: true,
2276 profile: false,
2277 initial_state: None,
2278 last_start_tag_name: None,
2279 };
2280 let vector = vec![
2281 StrTendril::from("<a>\r\n"),
2282 StrTendril::from("<b>\r\n"),
2283 StrTendril::from("</b>\r\n"),
2284 StrTendril::from("</a>\r\n"),
2285 ];
2286 let expected = vec![
2287 (create_tag(StrTendril::from("a"), StartTag), 1),
2288 (create_tag(StrTendril::from("b"), StartTag), 2),
2289 (create_tag(StrTendril::from("b"), EndTag), 3),
2290 (create_tag(StrTendril::from("a"), EndTag), 4),
2291 ];
2292 let results = tokenize(vector, opts);
2293 assert_eq!(results, expected);
2294 }
2295}