markup5ever\util/buffer_queue.rs
1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The `BufferQueue` struct and helper types.
11//!
12//! This type is designed for the efficient parsing of string data, especially where many
13//! significant characters are from the ascii range 0-63. This includes, for example, important
14//! characters in xml/html parsing.
15//!
16//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero
17//! copy).
18//!
19//! [`BufferQueue`]: struct.BufferQueue.html
20
21use std::{
22 cell::{RefCell, RefMut},
23 collections::VecDeque,
24 mem,
25};
26
27use tendril::StrTendril;
28
29pub use self::SetResult::{FromSet, NotFromSet};
30use crate::util::smallcharset::SmallCharSet;
31
32/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a
33/// string buffer of characters not from the set.
34///
35/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from
36/// [`SmallCharSet`]: ../struct.SmallCharSet.html
37#[derive(PartialEq, Eq, Debug)]
38pub enum SetResult {
39 /// A character from the `SmallCharSet`.
40 FromSet(char),
41 /// A string buffer containing no characters from the `SmallCharSet`.
42 NotFromSet(StrTendril),
43}
44
45/// A queue of owned string buffers, which supports incrementally consuming characters.
46///
47/// Internally it uses [`VecDeque`] and has the same complexity properties.
48///
49/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
50#[derive(Clone, Debug)]
51pub struct BufferQueue {
52 /// Buffers to process.
53 buffers: RefCell<VecDeque<StrTendril>>,
54}
55
56impl Default for BufferQueue {
57 /// Create an empty BufferQueue.
58 #[inline]
59 fn default() -> Self {
60 Self {
61 buffers: RefCell::new(VecDeque::with_capacity(16)),
62 }
63 }
64}
65
66impl BufferQueue {
67 /// Returns whether the queue is empty.
68 #[inline]
69 pub fn is_empty(&self) -> bool {
70 self.buffers.borrow().is_empty()
71 }
72
73 /// Get the buffer at the beginning of the queue.
74 #[inline]
75 pub fn pop_front(&self) -> Option<StrTendril> {
76 self.buffers.borrow_mut().pop_front()
77 }
78
79 /// Add a buffer to the beginning of the queue.
80 ///
81 /// If the buffer is empty, it will be skipped.
82 pub fn push_front(&self, buf: StrTendril) {
83 if buf.len32() == 0 {
84 return;
85 }
86 self.buffers.borrow_mut().push_front(buf);
87 }
88
89 /// Add a buffer to the end of the queue.
90 ///
91 /// If the buffer is empty, it will be skipped.
92 pub fn push_back(&self, buf: StrTendril) {
93 if buf.len32() == 0 {
94 return;
95 }
96 self.buffers.borrow_mut().push_back(buf);
97 }
98
99 /// Look at the next available character without removing it, if the queue is not empty.
100 pub fn peek(&self) -> Option<char> {
101 debug_assert!(
102 !self.buffers.borrow().iter().any(|el| el.len32() == 0),
103 "invariant \"all buffers in the queue are non-empty\" failed"
104 );
105 self.buffers
106 .borrow()
107 .front()
108 .map(|b| b.chars().next().unwrap())
109 }
110
111 /// Pops and returns either a single character from the given set, or
112 /// a buffer of characters none of which are in the set.
113 ///
114 /// # Examples
115 ///
116 /// ```
117 /// # #[macro_use] extern crate markup5ever;
118 /// # #[macro_use] extern crate tendril;
119 /// # fn main() {
120 /// use markup5ever::buffer_queue::{BufferQueue, SetResult};
121 ///
122 /// let mut queue = BufferQueue::default();
123 /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#));
124 /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/');
125 /// let tag = format_tendril!("some_tag");
126 /// let attr = format_tendril!("attr");
127 /// let attr_val = format_tendril!("text");
128 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<')));
129 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag)));
130 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' ')));
131 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr)));
132 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('=')));
133 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"')));
134 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val)));
135 /// // ...
136 /// # }
137 /// ```
138 pub fn pop_except_from(&self, set: SmallCharSet) -> Option<SetResult> {
139 let (result, now_empty) = match self.buffers.borrow_mut().front_mut() {
140 None => (None, false),
141 Some(buf) => {
142 let n = set.nonmember_prefix_len(buf);
143 if n > 0 {
144 let out;
145 unsafe {
146 out = buf.unsafe_subtendril(0, n);
147 buf.unsafe_pop_front(n);
148 }
149 (Some(NotFromSet(out)), buf.is_empty())
150 } else {
151 let c = buf.pop_front_char().expect("empty buffer in queue");
152 (Some(FromSet(c)), buf.is_empty())
153 }
154 },
155 };
156
157 // Unborrow self for this part.
158 if now_empty {
159 self.buffers.borrow_mut().pop_front();
160 }
161
162 result
163 }
164
165 /// Consume bytes matching the pattern, using a custom comparison function `eq`.
166 ///
167 /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if
168 /// it wasn't possible to know (more data is needed).
169 ///
170 /// The custom comparison function is used elsewhere to compare ascii-case-insensitively.
171 ///
172 /// # Examples
173 ///
174 /// ```
175 /// # extern crate markup5ever;
176 /// # #[macro_use] extern crate tendril;
177 /// # fn main() {
178 /// use markup5ever::buffer_queue::BufferQueue;
179 ///
180 /// let mut queue = BufferQueue::default();
181 /// queue.push_back(format_tendril!("testtext"));
182 /// let test_str = "test";
183 /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true));
184 /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true));
185 /// assert!(queue.is_empty());
186 /// # }
187 /// ```
188 pub fn eat<F: Fn(&u8, &u8) -> bool>(&self, pat: &str, eq: F) -> Option<bool> {
189 let mut buffers_exhausted = 0;
190 let mut consumed_from_last = 0;
191
192 self.buffers.borrow().front()?;
193
194 for pattern_byte in pat.bytes() {
195 if buffers_exhausted >= self.buffers.borrow().len() {
196 return None;
197 }
198 let buf = &self.buffers.borrow()[buffers_exhausted];
199
200 if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) {
201 return Some(false);
202 }
203
204 consumed_from_last += 1;
205 if consumed_from_last >= buf.len() {
206 buffers_exhausted += 1;
207 consumed_from_last = 0;
208 }
209 }
210
211 // We have a match. Commit changes to the BufferQueue.
212 for _ in 0..buffers_exhausted {
213 self.buffers.borrow_mut().pop_front();
214 }
215
216 match self.buffers.borrow_mut().front_mut() {
217 None => assert_eq!(consumed_from_last, 0),
218 Some(ref mut buf) => buf.pop_front(consumed_from_last as u32),
219 }
220
221 Some(true)
222 }
223
224 /// Get the next character if one is available, removing it from the queue.
225 ///
226 /// This function manages the buffers, removing them as they become empty.
227 pub fn next(&self) -> Option<char> {
228 let (result, now_empty) = match self.buffers.borrow_mut().front_mut() {
229 None => (None, false),
230 Some(buf) => {
231 let c = buf.pop_front_char().expect("empty buffer in queue");
232 (Some(c), buf.is_empty())
233 },
234 };
235
236 if now_empty {
237 self.buffers.borrow_mut().pop_front();
238 }
239
240 result
241 }
242
243 pub fn replace_with(&self, other: BufferQueue) {
244 let _ = mem::replace(&mut *self.buffers.borrow_mut(), other.buffers.take());
245 }
246
247 pub fn swap_with(&self, other: &BufferQueue) {
248 mem::swap(
249 &mut *self.buffers.borrow_mut(),
250 &mut *other.buffers.borrow_mut(),
251 );
252 }
253
254 /// Return a mutable reference to the first tendril in the queue.
255 pub fn peek_front_chunk_mut(&self) -> Option<RefMut<'_, StrTendril>> {
256 let buffers = self.buffers.borrow_mut();
257 if buffers.is_empty() {
258 return None;
259 }
260
261 let front_buffer = RefMut::map(buffers, |buffers| {
262 buffers.front_mut().expect("there is at least one buffer")
263 });
264 Some(front_buffer)
265 }
266}
267
268#[cfg(test)]
269#[allow(non_snake_case)]
270mod test {
271 use tendril::SliceExt;
272
273 use super::BufferQueue;
274 use super::SetResult::{FromSet, NotFromSet};
275
276 #[test]
277 fn smoke_test() {
278 let bq = BufferQueue::default();
279 assert_eq!(bq.peek(), None);
280 assert_eq!(bq.next(), None);
281
282 bq.push_back("abc".to_tendril());
283 assert_eq!(bq.peek(), Some('a'));
284 assert_eq!(bq.next(), Some('a'));
285 assert_eq!(bq.peek(), Some('b'));
286 assert_eq!(bq.peek(), Some('b'));
287 assert_eq!(bq.next(), Some('b'));
288 assert_eq!(bq.peek(), Some('c'));
289 assert_eq!(bq.next(), Some('c'));
290 assert_eq!(bq.peek(), None);
291 assert_eq!(bq.next(), None);
292 }
293
294 #[test]
295 fn can_unconsume() {
296 let bq = BufferQueue::default();
297 bq.push_back("abc".to_tendril());
298 assert_eq!(bq.next(), Some('a'));
299
300 bq.push_front("xy".to_tendril());
301 assert_eq!(bq.next(), Some('x'));
302 assert_eq!(bq.next(), Some('y'));
303 assert_eq!(bq.next(), Some('b'));
304 assert_eq!(bq.next(), Some('c'));
305 assert_eq!(bq.next(), None);
306 }
307
308 #[test]
309 fn can_pop_except_set() {
310 let bq = BufferQueue::default();
311 bq.push_back("abc&def".to_tendril());
312 let pop = || bq.pop_except_from(small_char_set!('&'));
313 assert_eq!(pop(), Some(NotFromSet("abc".to_tendril())));
314 assert_eq!(pop(), Some(FromSet('&')));
315 assert_eq!(pop(), Some(NotFromSet("def".to_tendril())));
316 assert_eq!(pop(), None);
317 }
318
319 #[test]
320 fn can_eat() {
321 // This is not very comprehensive. We rely on the tokenizer
322 // integration tests for more thorough testing with many
323 // different input buffer splits.
324 let bq = BufferQueue::default();
325 bq.push_back("a".to_tendril());
326 bq.push_back("bc".to_tendril());
327 assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None);
328 assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
329 assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true));
330 assert_eq!(bq.next(), Some('c'));
331 assert_eq!(bq.next(), None);
332 }
333}