msg_tool\utils/
html5ever_arcdom.rs

1use std::borrow::Cow;
2use std::collections::{HashSet, VecDeque};
3use std::fmt;
4use std::io;
5use std::mem;
6use std::sync::{Arc, Mutex, Weak};
7
8use crossbeam::atomic::AtomicCell;
9use tendril::StrTendril;
10
11use markup5ever::Attribute;
12use markup5ever::ExpandedName;
13use markup5ever::QualName;
14use markup5ever::interface::tree_builder;
15use markup5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
16use markup5ever::serialize::TraversalScope;
17use markup5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
18use markup5ever::serialize::{Serialize, Serializer};
19use xml5ever::interface::ElemName;
20use xml5ever::local_name;
21
22#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)]
23pub struct AtomicAttribute {
24    pub name: QualName,
25    pub value: String,
26}
27
28impl From<Attribute> for AtomicAttribute {
29    fn from(attr: Attribute) -> Self {
30        AtomicAttribute {
31            name: attr.name,
32            value: attr.value.to_string(),
33        }
34    }
35}
36
37#[derive(Debug, Clone)]
38pub enum NodeData {
39    /// The `Document` itself - the root node of a HTML document.
40    Document,
41
42    /// A `DOCTYPE` with name, public id, and system id. See
43    /// [document type declaration on wikipedia][dtd wiki].
44    ///
45    /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration
46    Doctype {
47        name: String,
48        public_id: String,
49        system_id: String,
50    },
51
52    /// A text node.
53    Text { contents: Arc<Mutex<String>> },
54
55    /// A comment.
56    Comment { contents: String },
57
58    /// An element with attributes.
59    Element {
60        name: QualName,
61        attrs: Arc<Mutex<Vec<AtomicAttribute>>>,
62
63        /// For HTML \<template\> elements, the [template contents].
64        ///
65        /// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents
66        template_contents: Arc<Mutex<Option<Handle>>>,
67
68        /// Whether the node is a [HTML integration point].
69        ///
70        /// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point
71        mathml_annotation_xml_integration_point: bool,
72    },
73
74    /// A Processing instruction.
75    ProcessingInstruction { target: String, contents: String },
76}
77
78/// A DOM node.
79pub struct Node {
80    /// Parent node.
81    pub parent: AtomicCell<Option<WeakHandle>>,
82    /// Child nodes of this node.
83    pub children: Mutex<Vec<Handle>>,
84    /// Represents this node's data.
85    pub data: NodeData,
86}
87
88impl Node {
89    /// Create a new node from its contents
90    pub fn new(data: NodeData) -> Arc<Self> {
91        Arc::new(Node {
92            data,
93            parent: AtomicCell::new(None),
94            children: Mutex::new(Vec::new()),
95        })
96    }
97
98    /// <https://html.spec.whatwg.org/#option-element-nearest-ancestor-select>
99    fn get_option_element_nearest_ancestor_select(&self) -> Option<Arc<Self>> {
100        // Step 1. Let ancestorOptgroup be null.
101        // NOTE: The algorithm doesn't actually need the value, so a boolean is enough.
102        let mut did_see_ancestor_optgroup = false;
103
104        // Step 2. For each ancestor of option's ancestors, in reverse tree order:
105        let mut current = self.parent().and_then(|parent| parent.upgrade())?;
106        loop {
107            if let NodeData::Element { name, .. } = &current.data {
108                // Step 2.1 If ancestor is a datalist, hr, or option element, then return null.
109                if matches!(
110                    name.local_name(),
111                    &local_name!("datalist") | &local_name!("hr") | &local_name!("option")
112                ) {
113                    return None;
114                }
115
116                // Step 2.2 If ancestor is an optgroup element:
117                if name.local_name() == &local_name!("optgroup") {
118                    // Step 2.2.1 If ancestorOptgroup is not null, then return null.
119                    if did_see_ancestor_optgroup {
120                        return None;
121                    }
122
123                    // Step 2.2.2 Set ancestorOptgroup to ancestor.
124                    did_see_ancestor_optgroup = true;
125                }
126
127                // Step 2.3 If ancestor is a select, then return ancestor.
128                if name.local_name() == &local_name!("select") {
129                    return Some(current);
130                }
131            };
132
133            // Move on to the next ancestor
134            let Some(next_ancestor) = current.parent().and_then(|parent| parent.upgrade()) else {
135                break;
136            };
137            current = next_ancestor;
138        }
139
140        // Step 3. Return null.
141        None
142    }
143
144    fn parent(&self) -> Option<Weak<Self>> {
145        let parent = self.parent.take();
146        self.parent.store(parent.clone());
147        parent
148    }
149
150    /// <https://html.spec.whatwg.org/#select-enabled-selectedcontent>
151    fn get_a_selects_enabled_selectedcontent(&self) -> Option<Arc<Self>> {
152        // Step 1. If select has the multiple attribute, then return null.
153        let NodeData::Element { name, attrs, .. } = &self.data else {
154            panic!("Trying to get selectedcontent of non-element");
155        };
156        debug_assert_eq!(name.local_name(), &local_name!("select"));
157        if attrs
158            .lock()
159            .unwrap()
160            .iter()
161            .any(|attribute| attribute.name.local == local_name!("multiple"))
162        {
163            return None;
164        }
165
166        // Step 2. Let selectedcontent be the first selectedcontent element descendant of select in tree order
167        // if any such element exists; otherwise return null.
168        // FIXME: This does not visit the nodes in tree order
169        let mut remaining = VecDeque::default();
170        remaining.extend(self.children.lock().unwrap().iter().cloned());
171        let mut selectedcontent = None;
172        while let Some(node) = remaining.pop_front() {
173            remaining.extend(node.children.lock().unwrap().iter().cloned());
174
175            let NodeData::Element { name, .. } = &self.data else {
176                continue;
177            };
178            if name.local_name() == &local_name!("selectedcontent") {
179                selectedcontent = Some(node);
180                break;
181            }
182        }
183        let selectedcontent = selectedcontent?;
184
185        // Step 3. If selectedcontent's disabled is true, then return null.
186        // FIXME: This step is unimplemented for now to reduce complexity.
187
188        // Step 4. Return selectedcontent.
189        Some(selectedcontent)
190    }
191
192    /// <https://html.spec.whatwg.org/#clone-an-option-into-a-selectedcontent>
193    fn clone_an_option_into_selectedcontent(&self, selectedcontent: Arc<Self>) {
194        // Step 1. Let documentFragment be a new DocumentFragment whose node document is option's node document.
195        // NOTE: We just remember the children of said fragment, thats good enough.
196        let mut document_fragment = Vec::new();
197
198        // Step 2. For each child of option's children:
199        for child in self.children.lock().unwrap().iter() {
200            // Step 2.1 Let childClone be the result of running clone given child with subtree set to true.
201            let child_clone = child.clone_with_subtree();
202
203            // Step 2.2 Append childClone to documentFragment.
204            document_fragment.push(child_clone);
205        }
206
207        // Step 3. Replace all with documentFragment within selectedcontent.
208        *selectedcontent.children.lock().unwrap() = document_fragment;
209    }
210
211    /// Clones the node and all of its descendants, returning a handle to the new subtree.
212    ///
213    /// This function will run into infinite recursion when the DOM tree contains cycles and it makes
214    /// no attempts to guard against that.
215    fn clone_with_subtree(&self) -> Arc<Self> {
216        let children = self
217            .children
218            .lock()
219            .unwrap()
220            .iter()
221            .map(|child| child.clone_with_subtree())
222            .collect();
223        Arc::new(Self {
224            parent: AtomicCell::new(self.parent()),
225            data: self.data.clone(),
226            children: Mutex::new(children),
227        })
228    }
229}
230
231impl fmt::Debug for Node {
232    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
233        fmt.debug_struct("Node")
234            .field("data", &self.data)
235            .field("children", &self.children)
236            .finish()
237    }
238}
239
240/// Reference to a DOM node.
241pub type Handle = Arc<Node>;
242
243/// Weak reference to a DOM node, used for parent pointers.
244pub type WeakHandle = Weak<Node>;
245
246/// Append a parentless node to another nodes' children
247fn append(new_parent: &Handle, child: Handle) {
248    let previous_parent = child.parent.swap(Some(Arc::downgrade(new_parent)));
249    // Invariant: child cannot have existing parent
250    assert!(previous_parent.is_none());
251    new_parent.children.lock().unwrap().push(child);
252}
253
254/// If the node has a parent, get it and this node's position in its children
255fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> {
256    if let Some(weak) = target.parent.take() {
257        let parent = weak.upgrade().expect("dangling weak pointer");
258        target.parent.store(Some(weak));
259        let i = match parent
260            .children
261            .lock()
262            .unwrap()
263            .iter()
264            .enumerate()
265            .find(|&(_, child)| Arc::ptr_eq(child, target))
266        {
267            Some((i, _)) => i,
268            None => panic!("have parent but couldn't find in parent's children!"),
269        };
270        Some((parent, i))
271    } else {
272        None
273    }
274}
275
276fn append_to_existing_text(prev: &Handle, text: &str) -> bool {
277    match prev.data {
278        NodeData::Text { ref contents } => {
279            contents.lock().unwrap().push_str(text);
280            true
281        }
282        _ => false,
283    }
284}
285
286fn remove_from_parent(target: &Handle) {
287    if let Some((parent, i)) = get_parent_and_index(target) {
288        parent.children.lock().unwrap().remove(i);
289        target.parent.store(None);
290    }
291}
292
293/// The DOM itself; the result of parsing.
294pub struct ArcDom {
295    /// The `Document` itself.
296    pub document: Handle,
297
298    /// Errors that occurred during parsing.
299    pub errors: Mutex<Vec<Cow<'static, str>>>,
300
301    /// The document's quirks mode.
302    pub quirks_mode: AtomicCell<QuirksMode>,
303}
304
305impl TreeSink for ArcDom {
306    type Output = Self;
307    fn finish(self) -> Self {
308        self
309    }
310
311    type Handle = Handle;
312
313    type ElemName<'a>
314        = ExpandedName<'a>
315    where
316        Self: 'a;
317
318    fn parse_error(&self, msg: Cow<'static, str>) {
319        self.errors.lock().unwrap().push(msg);
320    }
321
322    fn get_document(&self) -> Handle {
323        self.document.clone()
324    }
325
326    fn get_template_contents(&self, target: &Handle) -> Handle {
327        if let NodeData::Element {
328            ref template_contents,
329            ..
330        } = target.data
331        {
332            template_contents
333                .lock()
334                .unwrap()
335                .as_ref()
336                .expect("not a template element!")
337                .clone()
338        } else {
339            panic!("not a template element!")
340        }
341    }
342
343    fn set_quirks_mode(&self, mode: QuirksMode) {
344        self.quirks_mode.store(mode);
345    }
346
347    fn same_node(&self, x: &Handle, y: &Handle) -> bool {
348        Arc::ptr_eq(x, y)
349    }
350
351    fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> {
352        match target.data {
353            NodeData::Element { ref name, .. } => name.expanded(),
354            _ => panic!("not an element!"),
355        }
356    }
357
358    fn create_element(&self, name: QualName, attrs: Vec<Attribute>, flags: ElementFlags) -> Handle {
359        Node::new(NodeData::Element {
360            name,
361            attrs: Arc::new(Mutex::new(
362                attrs.into_iter().map(AtomicAttribute::from).collect(),
363            )),
364            template_contents: Arc::new(Mutex::new(if flags.template {
365                Some(Node::new(NodeData::Document))
366            } else {
367                None
368            })),
369            mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point,
370        })
371    }
372
373    fn create_comment(&self, text: StrTendril) -> Handle {
374        Node::new(NodeData::Comment {
375            contents: text.to_string(),
376        })
377    }
378
379    fn create_pi(&self, target: StrTendril, data: StrTendril) -> Handle {
380        Node::new(NodeData::ProcessingInstruction {
381            target: target.to_string(),
382            contents: data.to_string(),
383        })
384    }
385
386    fn append(&self, parent: &Handle, child: NodeOrText<Handle>) {
387        // Append to an existing Text node if we have one.
388        if let NodeOrText::AppendText(text) = &child {
389            if let Some(h) = parent.children.lock().unwrap().last() {
390                if append_to_existing_text(h, text) {
391                    return;
392                }
393            }
394        }
395
396        append(
397            parent,
398            match child {
399                NodeOrText::AppendText(text) => Node::new(NodeData::Text {
400                    contents: Arc::new(Mutex::new(text.to_string())),
401                }),
402                NodeOrText::AppendNode(node) => node,
403            },
404        );
405    }
406
407    fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) {
408        let (parent, i) = get_parent_and_index(sibling)
409            .expect("append_before_sibling called on node without parent");
410
411        let child = match (child, i) {
412            // No previous node.
413            (NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text {
414                contents: Arc::new(Mutex::new(text.to_string())),
415            }),
416
417            // Look for a text node before the insertion point.
418            (NodeOrText::AppendText(text), i) => {
419                let children = parent.children.lock().unwrap();
420                let prev = &children[i - 1];
421                if append_to_existing_text(prev, &text) {
422                    return;
423                }
424                Node::new(NodeData::Text {
425                    contents: Arc::new(Mutex::new(text.to_string())),
426                })
427            }
428
429            // The tree builder promises we won't have a text node after
430            // the insertion point.
431
432            // Any other kind of node.
433            (NodeOrText::AppendNode(node), _) => node,
434        };
435
436        remove_from_parent(&child);
437
438        child.parent.store(Some(Arc::downgrade(&parent)));
439        parent.children.lock().unwrap().insert(i, child);
440    }
441
442    fn append_based_on_parent_node(
443        &self,
444        element: &Self::Handle,
445        prev_element: &Self::Handle,
446        child: NodeOrText<Self::Handle>,
447    ) {
448        let parent = element.parent.take();
449        let has_parent = parent.is_some();
450        element.parent.store(parent);
451
452        if has_parent {
453            self.append_before_sibling(element, child);
454        } else {
455            self.append(prev_element, child);
456        }
457    }
458
459    fn append_doctype_to_document(
460        &self,
461        name: StrTendril,
462        public_id: StrTendril,
463        system_id: StrTendril,
464    ) {
465        append(
466            &self.document,
467            Node::new(NodeData::Doctype {
468                name: name.to_string(),
469                public_id: public_id.to_string(),
470                system_id: system_id.to_string(),
471            }),
472        );
473    }
474
475    fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec<Attribute>) {
476        let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
477            attrs.lock().unwrap()
478        } else {
479            panic!("not an element")
480        };
481
482        let existing_names = existing
483            .iter()
484            .map(|e| e.name.clone())
485            .collect::<HashSet<_>>();
486        existing.extend(
487            attrs
488                .into_iter()
489                .filter(|attr| !existing_names.contains(&attr.name))
490                .map(AtomicAttribute::from),
491        );
492    }
493
494    fn remove_from_parent(&self, target: &Handle) {
495        remove_from_parent(target);
496    }
497
498    fn reparent_children(&self, node: &Handle, new_parent: &Handle) {
499        let mut children = node.children.lock().unwrap();
500        let mut new_children = new_parent.children.lock().unwrap();
501        for child in children.iter() {
502            let previous_parent = child.parent.swap(Some(Arc::downgrade(new_parent)));
503            assert!(Arc::ptr_eq(
504                node,
505                &previous_parent.unwrap().upgrade().expect("dangling weak")
506            ))
507        }
508        new_children.extend(mem::take(&mut *children));
509    }
510
511    fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool {
512        if let NodeData::Element {
513            mathml_annotation_xml_integration_point,
514            ..
515        } = target.data
516        {
517            mathml_annotation_xml_integration_point
518        } else {
519            panic!("not an element!")
520        }
521    }
522
523    fn maybe_clone_an_option_into_selectedcontent(&self, option: &Self::Handle) {
524        let NodeData::Element { name, attrs, .. } = &option.data else {
525            panic!("\"maybe clone an option into selectedcontent\" called with non-element node");
526        };
527        debug_assert_eq!(name.local_name(), &local_name!("option"));
528
529        // Step 1. Let select be option's option element nearest ancestor select.
530        let select = option.get_option_element_nearest_ancestor_select();
531
532        // Step 2. If all of the following conditions are true:
533        // * select is not null;
534        // * option's selectedness is true; and
535        // * select's enabled selectedcontent is not null,
536        // then run clone an option into a selectedcontent given option and select's enabled selectedcontent.
537        if let Some(selectedcontent) =
538            select.and_then(|select| select.get_a_selects_enabled_selectedcontent())
539        {
540            if attrs
541                .lock()
542                .unwrap()
543                .iter()
544                .any(|attribute| attribute.name.local == local_name!("selected"))
545            {
546                option.clone_an_option_into_selectedcontent(selectedcontent);
547            }
548        }
549    }
550}
551
552impl Default for ArcDom {
553    fn default() -> ArcDom {
554        ArcDom {
555            document: Node::new(NodeData::Document),
556            errors: Default::default(),
557            quirks_mode: AtomicCell::new(tree_builder::NoQuirks),
558        }
559    }
560}
561
562enum SerializeOp {
563    Open(Handle),
564    Close(QualName),
565}
566
567pub struct SerializableHandle(Handle);
568
569impl From<Handle> for SerializableHandle {
570    fn from(h: Handle) -> SerializableHandle {
571        SerializableHandle(h)
572    }
573}
574
575impl Serialize for SerializableHandle {
576    fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
577    where
578        S: Serializer,
579    {
580        let mut ops = VecDeque::new();
581        match traversal_scope {
582            IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())),
583            ChildrenOnly(_) => ops.extend(
584                self.0
585                    .children
586                    .lock()
587                    .unwrap()
588                    .iter()
589                    .map(|h| SerializeOp::Open(h.clone())),
590            ),
591        }
592
593        while let Some(op) = ops.pop_front() {
594            match op {
595                SerializeOp::Open(handle) => match handle.data {
596                    NodeData::Element {
597                        ref name,
598                        ref attrs,
599                        ..
600                    } => {
601                        serializer.start_elem(
602                            name.clone(),
603                            attrs
604                                .lock()
605                                .unwrap()
606                                .iter()
607                                .map(|at| (&at.name, &at.value[..])),
608                        )?;
609
610                        ops.reserve(1 + handle.children.lock().unwrap().len());
611                        ops.push_front(SerializeOp::Close(name.clone()));
612
613                        for child in handle.children.lock().unwrap().iter().rev() {
614                            ops.push_front(SerializeOp::Open(child.clone()));
615                        }
616                    }
617
618                    NodeData::Doctype { ref name, .. } => serializer.write_doctype(name)?,
619
620                    NodeData::Text { ref contents } => {
621                        serializer.write_text(&contents.lock().unwrap())?
622                    }
623
624                    NodeData::Comment { ref contents } => serializer.write_comment(contents)?,
625
626                    NodeData::ProcessingInstruction {
627                        ref target,
628                        ref contents,
629                    } => serializer.write_processing_instruction(target, contents)?,
630
631                    NodeData::Document => panic!("Can't serialize Document node itself"),
632                },
633
634                SerializeOp::Close(name) => {
635                    serializer.end_elem(name)?;
636                }
637            }
638        }
639
640        Ok(())
641    }
642}