html5ever/
driver.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! High-level interface to the parser.
11
12use crate::buffer_queue::BufferQueue;
13use crate::tokenizer::{Tokenizer, TokenizerOpts};
14use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
15use crate::{Attribute, QualName};
16use markup5ever::TokenizerResult;
17use std::borrow::Cow;
18
19use crate::tendril;
20use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
21use crate::tendril::StrTendril;
22
23/// All-encompassing options struct for the parser.
24#[derive(Clone, Default)]
25pub struct ParseOpts {
26    /// Tokenizer options.
27    pub tokenizer: TokenizerOpts,
28
29    /// Tree builder options.
30    pub tree_builder: TreeBuilderOpts,
31}
32
33/// Parse an HTML document
34///
35/// The returned value implements `tendril::TendrilSink`
36/// so that Unicode input may be provided incrementally,
37/// or all at once with the `one` method.
38///
39/// If your input is bytes, use `Parser::from_utf8`.
40pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink>
41where
42    Sink: TreeSink,
43{
44    let tb = TreeBuilder::new(sink, opts.tree_builder);
45    let tok = Tokenizer::new(tb, opts.tokenizer);
46    Parser {
47        tokenizer: tok,
48        input_buffer: BufferQueue::default(),
49    }
50}
51
52/// Parse an HTML fragment
53///
54/// The returned value implements `tendril::TendrilSink`
55/// so that Unicode input may be provided incrementally,
56/// or all at once with the `one` method.
57///
58/// If your input is bytes, use `Parser::from_utf8`.
59pub fn parse_fragment<Sink>(
60    sink: Sink,
61    opts: ParseOpts,
62    context_name: QualName,
63    context_attrs: Vec<Attribute>,
64    context_element_allows_scripting: bool,
65) -> Parser<Sink>
66where
67    Sink: TreeSink,
68{
69    let context_elem = create_element(&sink, context_name, context_attrs);
70    parse_fragment_for_element(
71        sink,
72        opts,
73        context_elem,
74        context_element_allows_scripting,
75        None,
76    )
77}
78
79/// Like `parse_fragment`, but with an existing context element
80/// and optionally a form element.
81pub fn parse_fragment_for_element<Sink>(
82    sink: Sink,
83    opts: ParseOpts,
84    context_element: Sink::Handle,
85    context_element_allows_scripting: bool,
86    form_element: Option<Sink::Handle>,
87) -> Parser<Sink>
88where
89    Sink: TreeSink,
90{
91    let tree_builder =
92        TreeBuilder::new_for_fragment(sink, context_element, form_element, opts.tree_builder);
93    let tokenizer_options = TokenizerOpts {
94        initial_state: Some(
95            tree_builder.tokenizer_state_for_context_elem(context_element_allows_scripting),
96        ),
97        ..opts.tokenizer
98    };
99    let tokenizer = Tokenizer::new(tree_builder, tokenizer_options);
100    Parser {
101        tokenizer,
102        input_buffer: BufferQueue::default(),
103    }
104}
105
106/// An HTML parser,
107/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
108pub struct Parser<Sink>
109where
110    Sink: TreeSink,
111{
112    pub tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>,
113    pub input_buffer: BufferQueue,
114}
115
116impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
117    fn process(&mut self, t: StrTendril) {
118        self.input_buffer.push_back(t);
119        // FIXME: Properly support </script> somehow.
120        while let TokenizerResult::Script(_) = self.tokenizer.feed(&self.input_buffer) {}
121    }
122
123    // FIXME: Is it too noisy to report every character decoding error?
124    fn error(&mut self, desc: Cow<'static, str>) {
125        self.tokenizer.sink.sink.parse_error(desc)
126    }
127
128    type Output = Sink::Output;
129
130    fn finish(self) -> Self::Output {
131        // FIXME: Properly support </script> somehow.
132        while let TokenizerResult::Script(_) = self.tokenizer.feed(&self.input_buffer) {}
133        assert!(self.input_buffer.is_empty());
134        self.tokenizer.end();
135        self.tokenizer.sink.sink.finish()
136    }
137}
138
139impl<Sink: TreeSink> Parser<Sink> {
140    /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
141    ///
142    /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
143    /// Decoding is lossy, like `String::from_utf8_lossy`.
144    #[allow(clippy::wrong_self_convention)]
145    pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
146        Utf8LossyDecoder::new(self)
147    }
148}