cfgrammar/
newlinecache.rs

1use crate::Span;
2use std::str::FromStr;
3
4/// Cache newlines from an input. These can be used to turn UTF-8 byte offsets into human-friendly
5/// line numbers (and vice versa) without having to store the full input. The cache stores only
6/// newline positions, and not the actual user input; the cache can only be filled incrementally
7/// using the [NewlineCache::feed] method.
8///
9/// It is easy to to intermix bytes and human-friendly line numbers so `NewlineCache` uses the
10/// following terminology:
11///   * `byte` and `byte`s: a UTF-8 byte offset.
12///   * `line_byte` and `line_byte`s: the UTF-8 byte offset of a line start or end.
13///   * `line_num`: a human-friendly line number.
14///   * `col_num`: a human-friendly column number.
15pub struct NewlineCache {
16    newlines: Vec<usize>,
17    trailing_bytes: usize,
18}
19
20impl NewlineCache {
21    /// Create an empty `NewlineCache`.
22    pub fn new() -> Self {
23        Self {
24            newlines: vec![0],
25            trailing_bytes: 0,
26        }
27    }
28
29    /// Feed further input into the cache. This input is considered to be a direct continuation of
30    /// any previous input fed into the cache. Feeding new data thus appends to the cache. If the
31    /// previous input contained a partial line (i.e. did not end in a newline), then the new input
32    /// (unless it starts with a newline) will be considered to be a continuation of that partial
33    /// line.
34    pub fn feed(&mut self, src: &str) {
35        let start_pos = self.newlines.last().unwrap() + self.trailing_bytes;
36        self.newlines
37            .extend(src.char_indices().filter_map(|c| match c {
38                (offset, '\n') => {
39                    self.trailing_bytes = 0;
40                    Some(start_pos + offset + 1)
41                }
42                (_, c) => {
43                    self.trailing_bytes += c.len_utf8();
44                    None
45                }
46            }));
47    }
48
49    /// Number of bytes fed into the newline cache.
50    fn feed_len(&self) -> usize {
51        self.newlines.last().unwrap() + self.trailing_bytes
52    }
53
54    /// Convert a byte offset in the input to a logical line number (i.e. a "human friendly" line
55    /// number, starting from 1). Returns None if the byte offset exceeds the known input length.
56    pub fn byte_to_line_num(&self, byte: usize) -> Option<usize> {
57        if byte > self.feed_len() {
58            return None;
59        }
60
61        let last_newline = self.newlines.last().unwrap();
62        let last_byte = last_newline + self.trailing_bytes;
63
64        if byte < last_byte && byte > *last_newline {
65            Some(self.newlines.len())
66        } else {
67            let (line_m1, _) = self
68                .newlines
69                .iter()
70                .enumerate()
71                .rev()
72                .find(|&(_, &line_off)| line_off <= byte)
73                .unwrap();
74            Some(line_m1 + 1)
75        }
76    }
77
78    /// Convert a logical line number into a byte offset.
79    /// Returns None if the line number exceeds the known line count,
80    /// or the input or the line_num is zero.
81    fn line_num_to_byte(&self, line_num: usize) -> Option<usize> {
82        if line_num > self.newlines.len() || line_num == 0 {
83            None
84        } else {
85            Some(self.newlines[line_num - 1])
86        }
87    }
88
89    /// Convert a byte offset in the input to the byte offset of the beginning of its line.
90    /// Returns None if the byte offset exceeds the known input length.
91    pub fn byte_to_line_byte(&self, byte: usize) -> Option<usize> {
92        self.byte_to_line_num(byte)
93            .and_then(|line_num| self.line_num_to_byte(line_num))
94    }
95
96    /// A convenience method to return the logical line and logical column number of a byte. This
97    /// requires passing a `&str` which *must* be equivalent to the string(s) passed to `feed`:
98    /// if not, nondeterminstic results, including panics, are possible.
99    //
100    /// # Panics
101    ///
102    /// May panic if `src` is different than the string(s) passed to `feed` (or might not panic and
103    /// return non-deterministic results).
104    pub fn byte_to_line_num_and_col_num(&self, src: &str, byte: usize) -> Option<(usize, usize)> {
105        if byte > self.feed_len() || src.len() != self.feed_len() {
106            return None;
107        }
108
109        self.byte_to_line_num(byte).map(|line_num| {
110            if byte == src.len() {
111                let line_byte = *self.newlines.last().unwrap();
112                return (self.newlines.len(), src[line_byte..].chars().count() + 1);
113            }
114
115            let line_byte = self.line_num_to_byte(line_num).unwrap();
116            let mut column = 0;
117            let mut skip_char = None;
118
119            for (c_off, c) in src[line_byte..].char_indices() {
120                if Some(c) != skip_char {
121                    column += 1;
122                    skip_char = None;
123                }
124                if c == '\r' {
125                    skip_char = Some('\n');
126                }
127                if c_off == byte - line_byte {
128                    break;
129                }
130            }
131            (line_num, column)
132        })
133    }
134
135    /// Return the (start byte, end byte) of the lines containing `span`. This will always cover
136    /// at least 1 logical line.
137    pub fn span_line_bytes(&self, span: Span) -> (usize, usize) {
138        let (st, st_line) = match self.newlines.binary_search(&span.start()) {
139            Ok(j) => (self.newlines[j], j + 1),
140            Err(j) => (self.newlines[j - 1], j),
141        };
142        let en = match self.newlines[st_line..].binary_search(&span.end()) {
143            Ok(j) if st_line + j == self.newlines.len() - st_line => {
144                self.newlines.last().unwrap() + self.trailing_bytes
145            }
146            Ok(j) => self.newlines[st_line + j + 1] - 1,
147            Err(j) if st_line + j == self.newlines.len() => {
148                self.newlines.last().unwrap() + self.trailing_bytes
149            }
150            Err(j) => self.newlines[st_line + j] - 1,
151        };
152        (st, en)
153    }
154}
155
156impl FromStr for NewlineCache {
157    type Err = ();
158
159    /// Construct a `NewlineCache` directly from a `&str`. This is equivalent to creating a blank
160    /// `NewlineCache` and [Self::feed()]ing the string directly in.
161    fn from_str(s: &str) -> Result<Self, Self::Err> {
162        let mut x = Self::new();
163        x.feed(s);
164        Ok(x)
165    }
166}
167
168impl<'a> FromIterator<&'a str> for NewlineCache {
169    fn from_iter<I>(iter: I) -> Self
170    where
171        I: IntoIterator<Item = &'a str>,
172    {
173        let mut nlcache = NewlineCache::new();
174        for s in iter.into_iter() {
175            nlcache.feed(s)
176        }
177        nlcache
178    }
179}
180
181#[cfg(test)]
182mod tests {
183    use super::NewlineCache;
184    use crate::Span;
185    use std::str::FromStr;
186
187    fn newline_test_helper(feed: &[&str], tests: &[(usize, usize)]) {
188        let cache: NewlineCache = feed.iter().copied().collect();
189        let full_string = feed.concat();
190
191        let mut src_pos = 0;
192        let mut result = Vec::new();
193        for f in feed {
194            for (offset, _) in f.char_indices() {
195                let line_col =
196                    cache.byte_to_line_num_and_col_num(full_string.as_str(), offset + src_pos);
197                result.push(line_col.unwrap())
198            }
199            src_pos += f.len();
200        }
201        assert_eq!(result, tests)
202    }
203
204    #[test]
205    fn newline_cache_various() {
206        let abc = &[(1, 1), (1, 2), (1, 3)];
207        newline_test_helper(&["a", "b", "c"], abc);
208        newline_test_helper(&["abc"], abc);
209        newline_test_helper(&["ab", "c"], abc);
210
211        let ab = &[(1, 1), (1, 2), (2, 1)];
212        newline_test_helper(&["a\n", "b"], ab);
213        newline_test_helper(&["a", "\nb"], ab);
214
215        let cr = &[(1, 1), (1, 2), (1, 2), (2, 1)];
216        newline_test_helper(&["a\r\n", "b"], cr);
217        newline_test_helper(&["a", "\r\nb"], cr);
218        newline_test_helper(&["a\r", "\nb"], cr);
219
220        newline_test_helper(&["\r\n\n"], &[(1, 1), (1, 1), (2, 1)]);
221
222        let suits = &[(1, 1), (1, 2), (1, 3), (1, 4)];
223        newline_test_helper(&["", "♠♥♦♣"], suits);
224        newline_test_helper(&["♠", "♥♦♣"], suits);
225        newline_test_helper(&["♠♥", "♦♣"], suits);
226        newline_test_helper(&["♠♥♦", "♣"], suits);
227        newline_test_helper(&["♠♥♦♣", ""], suits);
228
229        let suits_nl = &[(1, 1), (1, 2), (1, 3), (2, 1), (2, 2)];
230        newline_test_helper(&["", "♠♥\n♦♣"], suits_nl);
231        newline_test_helper(&["♠", "♥\n♦♣"], suits_nl);
232        newline_test_helper(&["♠♥", "\n♦♣"], suits_nl);
233        newline_test_helper(&["♠♥\n", "♦♣"], suits_nl);
234        newline_test_helper(&["♠♥\n♦", "♣"], suits_nl);
235        newline_test_helper(&["♠♥\n♦♣", ""], suits_nl);
236
237        #[rustfmt::skip]
238        let multi_line = &[
239            (1, 1), (1, 2), (2, 1), (2, 2),
240            (3, 1), (3, 2), (4, 1), (4, 2),
241            (5, 1), (5, 2), (6, 1), (6, 2),
242        ];
243        newline_test_helper(&["", "1\n2\n3\n4\n5\n6\n"], multi_line);
244        newline_test_helper(&["1\n2\n3\n4\n5\n6\n"], multi_line);
245        newline_test_helper(&["1\n2\n3\n4\n5\n6\n", ""], multi_line);
246
247        newline_test_helper(&["1", "\n2\n3\n4\n5\n6\n"], multi_line);
248        newline_test_helper(&["1\n", "2\n3\n4\n5\n6\n"], multi_line);
249        newline_test_helper(&["1\n2", "\n3\n4\n5\n6\n"], multi_line);
250        newline_test_helper(&["1\n2\n", "3\n4\n5\n6\n"], multi_line);
251        newline_test_helper(&["1\n2\n3", "\n4\n5\n6\n"], multi_line);
252        newline_test_helper(&["1\n2\n3\n", "4\n5\n6\n"], multi_line);
253        newline_test_helper(&["1\n2\n3\n4", "\n5\n6\n"], multi_line);
254        newline_test_helper(&["1\n2\n3\n4\n", "5\n6\n"], multi_line);
255        newline_test_helper(&["1\n2\n3\n4\n5", "\n6\n"], multi_line);
256        newline_test_helper(&["1\n2\n3\n4\n5\n", "6\n"], multi_line);
257        newline_test_helper(&["1\n2\n", "3\n4\n", "5\n6\n"], multi_line);
258        newline_test_helper(&["1\n2\n", "3\n4", "\n5\n6\n"], multi_line);
259        newline_test_helper(&["1\n2\n", "", "3\n4", "", "\n5\n6\n"], multi_line);
260
261        newline_test_helper(&["", "", ""], &[]);
262        newline_test_helper(&["", ""], &[]);
263        newline_test_helper(&[""], &[]);
264        newline_test_helper(&["", "", "", "a"], &[(1, 1)]);
265        newline_test_helper(&["", "", "a"], &[(1, 1)]);
266        newline_test_helper(&["", "a"], &[(1, 1)]);
267
268        // Positive tests for the string we'll use for negative tests.
269        newline_test_helper(&["1", "\n23"], &[(1, 1), (1, 2), (2, 1), (2, 2)]);
270    }
271
272    #[test]
273    fn newline_cache_negative() {
274        let mut cache = NewlineCache::new();
275
276        // Byte exceeds input length
277        cache.feed("1");
278        assert_eq!(None, cache.byte_to_line_num(2));
279        assert_eq!(None, cache.byte_to_line_num_and_col_num("1", 2));
280        cache.feed("\n23");
281        assert_eq!(None, cache.byte_to_line_num(5));
282        assert_eq!(None, cache.byte_to_line_num_and_col_num("1\n23", 5));
283
284        // Byte valid, but src.len() exceeds input length.
285        assert_eq!(None, cache.byte_to_line_num_and_col_num("1\n234", 1));
286    }
287
288    #[test]
289    fn spanlines_str() {
290        fn span_line_eq(input: &str, byte_st: usize, byte_en: usize, substr: &str) {
291            let nlc = NewlineCache::from_str(input).unwrap();
292            let (lines_st, lines_en) = nlc.span_line_bytes(Span::new(byte_st, byte_en));
293            assert_eq!(&input[lines_st..lines_en], substr);
294        }
295
296        span_line_eq("a b c", 2, 3, "a b c");
297        span_line_eq("a b c", 4, 5, "a b c");
298
299        span_line_eq("a b c\n", 2, 3, "a b c");
300        span_line_eq("a b c\n", 4, 5, "a b c");
301        span_line_eq("a b c\n", 5, 5, "a b c");
302        span_line_eq("a b c\n", 6, 6, "");
303
304        span_line_eq(" a\nb\n  c d", 1, 2, " a");
305        span_line_eq(" a\nb\n  c d", 3, 4, "b");
306        span_line_eq(" a\nb\n  c d", 7, 8, "  c d");
307        span_line_eq(" a\nb\n  c d", 9, 10, "  c d");
308
309        span_line_eq("ab\n", 2, 3, "ab\n");
310        span_line_eq("ab\ncd", 2, 3, "ab\ncd");
311        span_line_eq("ab\ncd\nef", 2, 3, "ab\ncd");
312
313        let s = "\na\na a\na a a\na a a a";
314        span_line_eq(s, 0, 0, "");
315        span_line_eq(s, 0, 2, "\na");
316        span_line_eq(s, 0, 4, "\na\na a");
317        span_line_eq(s, 0, 7, "\na\na a\na a a");
318        span_line_eq(s, 4, 7, "a a\na a a");
319
320        span_line_eq(" a\n❤ b", 1, 2, " a");
321        span_line_eq(" a\n❤ b", 3, 4, "❤ b");
322        span_line_eq(" a\n❤ b", 5, 6, "❤ b");
323    }
324}