markup5ever\util/
buffer_queue.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The `BufferQueue` struct and helper types.
11//!
12//! This type is designed for the efficient parsing of string data, especially where many
13//! significant characters are from the ascii range 0-63. This includes, for example, important
14//! characters in xml/html parsing.
15//!
16//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero
17//! copy).
18//!
19//! [`BufferQueue`]: struct.BufferQueue.html
20
21use std::{
22    cell::{RefCell, RefMut},
23    collections::VecDeque,
24    mem,
25};
26
27use tendril::StrTendril;
28
29pub use self::SetResult::{FromSet, NotFromSet};
30use crate::util::smallcharset::SmallCharSet;
31
32/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a
33/// string buffer of characters not from the set.
34///
35/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from
36/// [`SmallCharSet`]: ../struct.SmallCharSet.html
37#[derive(PartialEq, Eq, Debug)]
38pub enum SetResult {
39    /// A character from the `SmallCharSet`.
40    FromSet(char),
41    /// A string buffer containing no characters from the `SmallCharSet`.
42    NotFromSet(StrTendril),
43}
44
45/// A queue of owned string buffers, which supports incrementally consuming characters.
46///
47/// Internally it uses [`VecDeque`] and has the same complexity properties.
48///
49/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
50#[derive(Clone, Debug)]
51pub struct BufferQueue {
52    /// Buffers to process.
53    buffers: RefCell<VecDeque<StrTendril>>,
54}
55
56impl Default for BufferQueue {
57    /// Create an empty BufferQueue.
58    #[inline]
59    fn default() -> Self {
60        Self {
61            buffers: RefCell::new(VecDeque::with_capacity(16)),
62        }
63    }
64}
65
66impl BufferQueue {
67    /// Returns whether the queue is empty.
68    #[inline]
69    pub fn is_empty(&self) -> bool {
70        self.buffers.borrow().is_empty()
71    }
72
73    /// Get the buffer at the beginning of the queue.
74    #[inline]
75    pub fn pop_front(&self) -> Option<StrTendril> {
76        self.buffers.borrow_mut().pop_front()
77    }
78
79    /// Add a buffer to the beginning of the queue.
80    ///
81    /// If the buffer is empty, it will be skipped.
82    pub fn push_front(&self, buf: StrTendril) {
83        if buf.len32() == 0 {
84            return;
85        }
86        self.buffers.borrow_mut().push_front(buf);
87    }
88
89    /// Add a buffer to the end of the queue.
90    ///
91    /// If the buffer is empty, it will be skipped.
92    pub fn push_back(&self, buf: StrTendril) {
93        if buf.len32() == 0 {
94            return;
95        }
96        self.buffers.borrow_mut().push_back(buf);
97    }
98
99    /// Look at the next available character without removing it, if the queue is not empty.
100    pub fn peek(&self) -> Option<char> {
101        debug_assert!(
102            !self.buffers.borrow().iter().any(|el| el.len32() == 0),
103            "invariant \"all buffers in the queue are non-empty\" failed"
104        );
105        self.buffers
106            .borrow()
107            .front()
108            .map(|b| b.chars().next().unwrap())
109    }
110
111    /// Pops and returns either a single character from the given set, or
112    /// a buffer of characters none of which are in the set.
113    ///
114    /// # Examples
115    ///
116    /// ```
117    /// # #[macro_use] extern crate markup5ever;
118    /// # #[macro_use] extern crate tendril;
119    /// # fn main() {
120    /// use markup5ever::buffer_queue::{BufferQueue, SetResult};
121    ///
122    /// let mut queue = BufferQueue::default();
123    /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#));
124    /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/');
125    /// let tag = format_tendril!("some_tag");
126    /// let attr = format_tendril!("attr");
127    /// let attr_val = format_tendril!("text");
128    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<')));
129    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag)));
130    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' ')));
131    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr)));
132    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('=')));
133    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"')));
134    /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val)));
135    /// // ...
136    /// # }
137    /// ```
138    pub fn pop_except_from(&self, set: SmallCharSet) -> Option<SetResult> {
139        let (result, now_empty) = match self.buffers.borrow_mut().front_mut() {
140            None => (None, false),
141            Some(buf) => {
142                let n = set.nonmember_prefix_len(buf);
143                if n > 0 {
144                    let out;
145                    unsafe {
146                        out = buf.unsafe_subtendril(0, n);
147                        buf.unsafe_pop_front(n);
148                    }
149                    (Some(NotFromSet(out)), buf.is_empty())
150                } else {
151                    let c = buf.pop_front_char().expect("empty buffer in queue");
152                    (Some(FromSet(c)), buf.is_empty())
153                }
154            },
155        };
156
157        // Unborrow self for this part.
158        if now_empty {
159            self.buffers.borrow_mut().pop_front();
160        }
161
162        result
163    }
164
165    /// Consume bytes matching the pattern, using a custom comparison function `eq`.
166    ///
167    /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if
168    /// it wasn't possible to know (more data is needed).
169    ///
170    /// The custom comparison function is used elsewhere to compare ascii-case-insensitively.
171    ///
172    /// # Examples
173    ///
174    /// ```
175    /// # extern crate markup5ever;
176    /// # #[macro_use] extern crate tendril;
177    /// # fn main() {
178    /// use markup5ever::buffer_queue::BufferQueue;
179    ///
180    /// let mut queue = BufferQueue::default();
181    /// queue.push_back(format_tendril!("testtext"));
182    /// let test_str = "test";
183    /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true));
184    /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true));
185    /// assert!(queue.is_empty());
186    /// # }
187    /// ```
188    pub fn eat<F: Fn(&u8, &u8) -> bool>(&self, pat: &str, eq: F) -> Option<bool> {
189        let mut buffers_exhausted = 0;
190        let mut consumed_from_last = 0;
191
192        self.buffers.borrow().front()?;
193
194        for pattern_byte in pat.bytes() {
195            if buffers_exhausted >= self.buffers.borrow().len() {
196                return None;
197            }
198            let buf = &self.buffers.borrow()[buffers_exhausted];
199
200            if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) {
201                return Some(false);
202            }
203
204            consumed_from_last += 1;
205            if consumed_from_last >= buf.len() {
206                buffers_exhausted += 1;
207                consumed_from_last = 0;
208            }
209        }
210
211        // We have a match. Commit changes to the BufferQueue.
212        for _ in 0..buffers_exhausted {
213            self.buffers.borrow_mut().pop_front();
214        }
215
216        match self.buffers.borrow_mut().front_mut() {
217            None => assert_eq!(consumed_from_last, 0),
218            Some(ref mut buf) => buf.pop_front(consumed_from_last as u32),
219        }
220
221        Some(true)
222    }
223
224    /// Get the next character if one is available, removing it from the queue.
225    ///
226    /// This function manages the buffers, removing them as they become empty.
227    pub fn next(&self) -> Option<char> {
228        let (result, now_empty) = match self.buffers.borrow_mut().front_mut() {
229            None => (None, false),
230            Some(buf) => {
231                let c = buf.pop_front_char().expect("empty buffer in queue");
232                (Some(c), buf.is_empty())
233            },
234        };
235
236        if now_empty {
237            self.buffers.borrow_mut().pop_front();
238        }
239
240        result
241    }
242
243    pub fn replace_with(&self, other: BufferQueue) {
244        let _ = mem::replace(&mut *self.buffers.borrow_mut(), other.buffers.take());
245    }
246
247    pub fn swap_with(&self, other: &BufferQueue) {
248        mem::swap(
249            &mut *self.buffers.borrow_mut(),
250            &mut *other.buffers.borrow_mut(),
251        );
252    }
253
254    /// Return a mutable reference to the first tendril in the queue.
255    pub fn peek_front_chunk_mut(&self) -> Option<RefMut<'_, StrTendril>> {
256        let buffers = self.buffers.borrow_mut();
257        if buffers.is_empty() {
258            return None;
259        }
260
261        let front_buffer = RefMut::map(buffers, |buffers| {
262            buffers.front_mut().expect("there is at least one buffer")
263        });
264        Some(front_buffer)
265    }
266}
267
268#[cfg(test)]
269#[allow(non_snake_case)]
270mod test {
271    use tendril::SliceExt;
272
273    use super::BufferQueue;
274    use super::SetResult::{FromSet, NotFromSet};
275
276    #[test]
277    fn smoke_test() {
278        let bq = BufferQueue::default();
279        assert_eq!(bq.peek(), None);
280        assert_eq!(bq.next(), None);
281
282        bq.push_back("abc".to_tendril());
283        assert_eq!(bq.peek(), Some('a'));
284        assert_eq!(bq.next(), Some('a'));
285        assert_eq!(bq.peek(), Some('b'));
286        assert_eq!(bq.peek(), Some('b'));
287        assert_eq!(bq.next(), Some('b'));
288        assert_eq!(bq.peek(), Some('c'));
289        assert_eq!(bq.next(), Some('c'));
290        assert_eq!(bq.peek(), None);
291        assert_eq!(bq.next(), None);
292    }
293
294    #[test]
295    fn can_unconsume() {
296        let bq = BufferQueue::default();
297        bq.push_back("abc".to_tendril());
298        assert_eq!(bq.next(), Some('a'));
299
300        bq.push_front("xy".to_tendril());
301        assert_eq!(bq.next(), Some('x'));
302        assert_eq!(bq.next(), Some('y'));
303        assert_eq!(bq.next(), Some('b'));
304        assert_eq!(bq.next(), Some('c'));
305        assert_eq!(bq.next(), None);
306    }
307
308    #[test]
309    fn can_pop_except_set() {
310        let bq = BufferQueue::default();
311        bq.push_back("abc&def".to_tendril());
312        let pop = || bq.pop_except_from(small_char_set!('&'));
313        assert_eq!(pop(), Some(NotFromSet("abc".to_tendril())));
314        assert_eq!(pop(), Some(FromSet('&')));
315        assert_eq!(pop(), Some(NotFromSet("def".to_tendril())));
316        assert_eq!(pop(), None);
317    }
318
319    #[test]
320    fn can_eat() {
321        // This is not very comprehensive.  We rely on the tokenizer
322        // integration tests for more thorough testing with many
323        // different input buffer splits.
324        let bq = BufferQueue::default();
325        bq.push_back("a".to_tendril());
326        bq.push_back("bc".to_tendril());
327        assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None);
328        assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
329        assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true));
330        assert_eq!(bq.next(), Some('c'));
331        assert_eq!(bq.next(), None);
332    }
333}