lrpar/parser.rs

#![allow(clippy::derive_partial_eq_without_eq)]
use std::{
    error::Error,
    fmt::{self, Debug, Display, Write as _},
    hash::Hash,
    marker::PhantomData,
    vec,
};

// `web_time` also works on non-wasm32 targets, but we use `std` there to avoid the extra dependency.
#[cfg(not(target_arch = "wasm32"))]
use std::time::{Duration, Instant};
#[cfg(target_arch = "wasm32")]
use web_time::{Duration, Instant};

use cactus::Cactus;
use cfgrammar::{RIdx, Span, TIdx, header::Value, span::Location, yacc::YaccGrammar};
use lrtable::{Action, StIdx, StateTable};
use num_traits::{AsPrimitive, PrimInt, Unsigned};
use proc_macro2::TokenStream;
use quote::quote;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use crate::{LexError, Lexeme, LexerTypes, NonStreamingLexer, cpctplus};

#[cfg(test)]
const RECOVERY_TIME_BUDGET: u64 = 60_000; // milliseconds
#[cfg(not(test))]
const RECOVERY_TIME_BUDGET: u64 = 500; // milliseconds

#[deprecated(
    since = "0.14.0",
    note = "Use the version of `Node` exported from your `lrpar_mod!`"
)]
pub type Node<T, S> = _deprecated_moved_::Node<T, S>;

#[doc(hidden)]
pub mod _deprecated_moved_ {
    use super::*;
    /// A generic parse tree.
    #[derive(Debug, Clone, PartialEq)]
    pub enum Node<LexemeT: Lexeme<StorageT>, StorageT> {
        /// Terminals store a single lexeme.
        Term { lexeme: LexemeT },
        /// Nonterminals reference a rule and have zero or more `Node`s as children.
        Nonterm {
            ridx: RIdx<StorageT>,
            nodes: Vec<Node<LexemeT, StorageT>>,
        },
    }
}

#[allow(deprecated)]
impl<LexemeT: Lexeme<StorageT>, StorageT: 'static + PrimInt + Unsigned> Node<LexemeT, StorageT>
where
    usize: AsPrimitive<StorageT>,
{
    /// Return a pretty-printed version of this node.
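    ///
    /// For example, with the grammar `E: T '+' E | T; T: 'ID';` from this module's tests,
    /// parsing the input `a+b` pretty-prints as (exact names depend on your grammar):
    ///
    /// ```text
    /// E
    ///  T
    ///   ID a
    ///  + +
    ///  E
    ///   T
    ///    ID b
    /// ```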
    pub fn pp(&self, grm: &YaccGrammar<StorageT>, input: &str) -> String {
        let mut st = vec![(0, self)]; // Stack of (indent level, node) pairs
        let mut s = String::new();
        while let Some((indent, e)) = st.pop() {
            for _ in 0..indent {
                s.push(' ');
            }
            match *e {
                Node::Term { lexeme } => {
                    let tidx = TIdx(lexeme.tok_id());
                    let tn = grm.token_name(tidx).unwrap();
                    let lt = &input[lexeme.span().start()..lexeme.span().end()];
                    writeln!(s, "{} {}", tn, lt).ok();
                }
                Node::Nonterm { ridx, ref nodes } => {
                    writeln!(s, "{}", grm.rule_name_str(ridx)).ok();
                    for x in nodes.iter().rev() {
                        st.push((indent + 1, x));
                    }
                }
            }
        }
        s
    }
}

type PStack<StorageT> = Vec<StIdx<StorageT>>; // Parse stack
type TokenCostFn<'a, StorageT> = &'a (dyn Fn(TIdx<StorageT>) -> u8 + 'a);
type ActionFn<'a, 'b, 'input, StorageT, LexerTypesT, ActionT, ParamT> = &'a dyn Fn(
    RIdx<StorageT>,
    &'b dyn NonStreamingLexer<'input, LexerTypesT>,
    Span,
    vec::Drain<AStackType<<LexerTypesT as LexerTypes>::LexemeT, ActionT>>,
    ParamT,
) -> ActionT;

#[derive(Debug)]
pub enum AStackType<LexemeT, ActionT> {
    ActionType(ActionT),
    Lexeme(LexemeT),
}

pub(super) struct Parser<
    'a,
    'b: 'a,
    'input: 'b,
    StorageT: 'static + Eq + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
    ActionT: 'a,
    ParamT: Clone,
> where
    usize: AsPrimitive<StorageT>,
{
    rcvry_kind: RecoveryKind,
    pub(super) grm: &'a YaccGrammar<StorageT>,
    pub(super) token_cost: Box<TokenCostFn<'a, StorageT>>,
    pub(super) stable: &'a StateTable<StorageT>,
    lexer: &'b dyn NonStreamingLexer<'input, LexerTypesT>,
    // In the long term, we should remove the `lexemes` field entirely, as the `NonStreamingLexer` API is
    // powerful enough to allow us to incrementally obtain lexemes and buffer them when necessary.
    pub(super) lexemes: Vec<LexerTypesT::LexemeT>,
    actions: &'a [ActionFn<'a, 'b, 'input, LexerTypesT::StorageT, LexerTypesT, ActionT, ParamT>],
    param: ParamT,
}

impl<
    'a,
    'b: 'a,
    'input: 'b,
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
    Node,
>
    Parser<
        'a,
        'b,
        'input,
        StorageT,
        LexerTypesT,
        Node,
        (
            &dyn Fn(LexerTypesT::LexemeT) -> Node,
            &dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
        ),
    >
where
    usize: AsPrimitive<StorageT>,
{
    fn parse_map(
        rcvry_kind: RecoveryKind,
        grm: &YaccGrammar<StorageT>,
        token_cost: TokenCostFn<'a, StorageT>,
        stable: &StateTable<StorageT>,
        lexer: &'b dyn NonStreamingLexer<'input, LexerTypesT>,
        lexemes: Vec<LexerTypesT::LexemeT>,
        fterm: &'a dyn Fn(LexerTypesT::LexemeT) -> Node,
        fnonterm: &'a dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
    ) -> (Option<Node>, Vec<LexParseError<StorageT, LexerTypesT>>) {
        for tidx in grm.iter_tidxs() {
            assert!(token_cost(tidx) > 0);
        }
        let mut actions: Vec<
            ActionFn<
                'a,
                'b,
                'input,
                StorageT,
                LexerTypesT,
                Node,
                (
                    &'a dyn Fn(LexerTypesT::LexemeT) -> Node,
                    &'a dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
                ),
            >,
        > = Vec::new();
        actions.resize(usize::from(grm.prods_len()), &action_map);
        let psr = Parser {
            rcvry_kind,
            grm,
            token_cost: Box::new(token_cost),
            stable,
            lexer,
            lexemes,
            actions: actions.as_slice(),
            param: (fterm, fnonterm),
        };
        let mut pstack = vec![stable.start_state()];
        let mut astack = Vec::new();
        let mut errors = Vec::new();
        let mut spans = Vec::new();
        let accpt = psr.lr(0, &mut pstack, &mut astack, &mut errors, &mut spans);
        (accpt, errors)
    }
}

fn action_map<StorageT, LexerTypesT: LexerTypes, Node>(
    ridx: RIdx<StorageT>,
    _lexer: &dyn NonStreamingLexer<LexerTypesT>,
    _span: Span,
    astack: vec::Drain<AStackType<LexerTypesT::LexemeT, Node>>,
    param: (
        &dyn Fn(LexerTypesT::LexemeT) -> Node,
        &dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
    ),
) -> Node
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
    LexerTypesT::LexemeT: Lexeme<StorageT>,
{
    let (fterm, fnonterm) = param;
    let mut nodes = Vec::with_capacity(astack.len());
    for a in astack {
        nodes.push(match a {
            AStackType::ActionType(n) => n,
            AStackType::Lexeme(lexeme) => fterm(lexeme),
        });
    }
    fnonterm(ridx, nodes)
}

#[deprecated(
    since = "0.14.0",
    note = "Deprecated along with `parse_generictree`; there is no direct replacement besides a custom action"
)]
#[allow(deprecated)]
/// The action which implements [`cfgrammar::yacc::YaccOriginalActionKind::GenericParseTree`].
/// Usually you should just use the action kind directly, but you can also call this from
/// within a custom action to return a generic parse tree with custom behavior.
pub fn action_generictree<StorageT, LexerTypesT: LexerTypes>(
    ridx: RIdx<StorageT>,
    _lexer: &dyn NonStreamingLexer<LexerTypesT>,
    _span: Span,
    astack: vec::Drain<AStackType<LexerTypesT::LexemeT, Node<LexerTypesT::LexemeT, StorageT>>>,
    _param: (),
) -> Node<LexerTypesT::LexemeT, StorageT>
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
    LexerTypesT::LexemeT: Lexeme<StorageT>,
{
    let mut nodes = Vec::with_capacity(astack.len());
    for a in astack {
        nodes.push(match a {
            AStackType::ActionType(n) => n,
            AStackType::Lexeme(lexeme) => Node::Term { lexeme },
        });
    }
    Node::Nonterm { ridx, nodes }
}

impl<
    'a,
    'b: 'a,
    'input: 'b,
    StorageT: 'static + Debug + Eq + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
    ActionT: 'a,
    ParamT: Clone,
> Parser<'a, 'b, 'input, StorageT, LexerTypesT, ActionT, ParamT>
where
    usize: AsPrimitive<StorageT>,
{
    fn parse_actions(
        rcvry_kind: RecoveryKind,
        grm: &'a YaccGrammar<StorageT>,
        token_cost: TokenCostFn<'a, StorageT>,
        stable: &'a StateTable<StorageT>,
        lexer: &'b dyn NonStreamingLexer<'input, LexerTypesT>,
        lexemes: Vec<LexerTypesT::LexemeT>,
        actions: &'a [ActionFn<'a, 'b, 'input, StorageT, LexerTypesT, ActionT, ParamT>],
        param: ParamT,
    ) -> (Option<ActionT>, Vec<LexParseError<StorageT, LexerTypesT>>) {
        for tidx in grm.iter_tidxs() {
            assert!(token_cost(tidx) > 0);
        }
        let psr = Parser {
            rcvry_kind,
            grm,
            token_cost: Box::new(token_cost),
            stable,
            lexer,
            lexemes,
            actions,
            param,
        };
        let mut pstack = vec![stable.start_state()];
        let mut astack = Vec::new();
        let mut errors = Vec::new();
        let mut spans = Vec::new();
        let accpt = psr.lr(0, &mut pstack, &mut astack, &mut errors, &mut spans);
        (accpt, errors)
    }

    /// Start parsing text at `laidx`. Parsing continues for as long as possible, assuming that
    /// any errors encountered can be recovered from; as soon as an error cannot be recovered
    /// from (or if `RecoveryKind::None` is in force), this function returns.
    ///
    /// Return `Some(value)` if the parse reached an accept state (i.e. all the input was consumed,
    /// possibly after making repairs) or `None` (i.e. some of the input was not consumed, even
    /// after possibly making repairs) otherwise.
    fn lr(
        &self,
        mut laidx: usize,
        pstack: &mut PStack<StorageT>,
        astack: &mut Vec<AStackType<LexerTypesT::LexemeT, ActionT>>,
        errors: &mut Vec<LexParseError<StorageT, LexerTypesT>>,
        spans: &mut Vec<Span>,
    ) -> Option<ActionT> {
        let mut recoverer = None;
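        // All error recovery for a single parse shares one time budget: each recovery attempt
        // subtracts the time it took from `recovery_budget`, so pathological inputs cannot make
        // the parser spend unbounded time in recovery.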
        let mut recovery_budget = Duration::from_millis(RECOVERY_TIME_BUDGET);
        loop {
            debug_assert_eq!(astack.len(), spans.len());
            let stidx = *pstack.last().unwrap();
            let la_tidx = self.next_tidx(laidx);

            match self.stable.action(stidx, la_tidx) {
                Action::Reduce(pidx) => {
                    let ridx = self.grm.prod_to_rule(pidx);
                    let pop_idx = pstack.len() - self.grm.prod(pidx).len();

                    pstack.drain(pop_idx..);
                    let prior = *pstack.last().unwrap();
                    pstack.push(self.stable.goto(prior, ridx).unwrap());

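                    // Compute the span covering every symbol popped for this production: from
                    // the start of the first popped span to the end of the last. An empty
                    // production reuses the most recent span (or (0, 0) at the start of input).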
                    let span = if spans.is_empty() {
                        Span::new(0, 0)
                    } else if pop_idx - 1 < spans.len() {
                        Span::new(spans[pop_idx - 1].start(), spans[spans.len() - 1].end())
                    } else {
                        Span::new(spans[spans.len() - 1].start(), spans[spans.len() - 1].end())
                    };
                    spans.truncate(pop_idx - 1);
                    spans.push(span);

                    let v = AStackType::ActionType(self.actions[usize::from(pidx)](
                        ridx,
                        self.lexer,
                        span,
                        astack.drain(pop_idx - 1..),
                        self.param.clone(),
                    ));
                    astack.push(v);
                }
                Action::Shift(state_id) => {
                    let la_lexeme = self.next_lexeme(laidx);
                    pstack.push(state_id);
                    astack.push(AStackType::Lexeme(la_lexeme));

                    spans.push(la_lexeme.span());
                    laidx += 1;
                }
                Action::Accept => {
                    debug_assert_eq!(la_tidx, self.grm.eof_token_idx());
                    debug_assert_eq!(astack.len(), 1);
                    match astack.drain(..).next().unwrap() {
                        AStackType::ActionType(v) => return Some(v),
                        _ => unreachable!(),
                    }
                }
                Action::Error => {
                    if recoverer.is_none() {
                        recoverer = Some(match self.rcvry_kind {
                            RecoveryKind::CPCTPlus => cpctplus::recoverer(self),
                            RecoveryKind::None => {
                                let la_lexeme = self.next_lexeme(laidx);
                                errors.push(
                                    ParseError {
                                        stidx,
                                        lexeme: la_lexeme,
                                        repairs: vec![],
                                    }
                                    .into(),
                                );
                                return None;
                            }
                        });
                    }

                    let before = Instant::now();
                    let finish_by = before + recovery_budget;
                    let (new_laidx, repairs) = recoverer
                        .as_ref()
                        .unwrap()
                        .as_ref()
                        .recover(finish_by, self, laidx, pstack, astack, spans);
                    let after = Instant::now();
                    recovery_budget = recovery_budget
                        .checked_sub(after - before)
                        .unwrap_or_else(|| Duration::new(0, 0));
                    let keep_going = !repairs.is_empty();
                    let la_lexeme = self.next_lexeme(laidx);
                    errors.push(
                        ParseError {
                            stidx,
                            lexeme: la_lexeme,
                            repairs,
                        }
                        .into(),
                    );
                    if !keep_going {
                        return None;
                    }
                    laidx = new_laidx;
                }
            }
        }
    }

    /// Parse from `laidx` up to (but excluding) `end_laidx`, mutating `pstack` as parsing occurs.
    /// Returns the index of the token it parsed up to (by definition <= `end_laidx`: it can be
    /// less if the input ends before `end_laidx`, or if an error is encountered). Does not do any
    /// form of error recovery.
    pub(super) fn lr_upto(
        &self,
        lexeme_prefix: Option<LexerTypesT::LexemeT>,
        mut laidx: usize,
        end_laidx: usize,
        pstack: &mut PStack<StorageT>,
        astack: &mut Option<&mut Vec<AStackType<LexerTypesT::LexemeT, ActionT>>>,
        spans: &mut Option<&mut Vec<Span>>,
    ) -> usize {
        assert!(lexeme_prefix.is_none() || end_laidx == laidx + 1);
        while laidx != end_laidx && laidx <= self.lexemes.len() {
            let stidx = *pstack.last().unwrap();
            let la_tidx = if let Some(l) = lexeme_prefix {
                TIdx(l.tok_id())
            } else {
                self.next_tidx(laidx)
            };

            match self.stable.action(stidx, la_tidx) {
                Action::Reduce(pidx) => {
                    let ridx = self.grm.prod_to_rule(pidx);
                    let pop_idx = pstack.len() - self.grm.prod(pidx).len();
                    if let Some(ref mut astack_uw) = *astack {
                        if let Some(ref mut spans_uw) = *spans {
                            let span = if spans_uw.is_empty() {
                                Span::new(0, 0)
                            } else if pop_idx - 1 < spans_uw.len() {
                                Span::new(
                                    spans_uw[pop_idx - 1].start(),
                                    spans_uw[spans_uw.len() - 1].end(),
                                )
                            } else {
                                Span::new(
                                    spans_uw[spans_uw.len() - 1].start(),
                                    spans_uw[spans_uw.len() - 1].end(),
                                )
                            };
                            spans_uw.truncate(pop_idx - 1);
                            spans_uw.push(span);

                            let v = AStackType::ActionType(self.actions[usize::from(pidx)](
                                ridx,
                                self.lexer,
                                span,
                                astack_uw.drain(pop_idx - 1..),
                                self.param.clone(),
                            ));
                            astack_uw.push(v);
                        } else {
                            unreachable!();
                        }
                    }

                    pstack.drain(pop_idx..);
                    let prior = *pstack.last().unwrap();
                    pstack.push(self.stable.goto(prior, ridx).unwrap());
                }
                Action::Shift(state_id) => {
                    if let Some(ref mut astack_uw) = *astack {
                        if let Some(spans_uw) = spans {
                            let la_lexeme = if let Some(l) = lexeme_prefix {
                                l
                            } else {
                                self.next_lexeme(laidx)
                            };
                            astack_uw.push(AStackType::Lexeme(la_lexeme));
                            spans_uw.push(la_lexeme.span());
                        }
                    }
                    pstack.push(state_id);
                    laidx += 1;
                }
                Action::Accept => {
                    break;
                }
                Action::Error => {
                    break;
                }
            }
        }
        laidx
    }

    /// Return a `Lexeme` for the next lexeme (if `laidx` == `self.lexemes.len()` this will be
    /// a lexeme constructed to look as if it contains the EOF token).
    pub(super) fn next_lexeme(&self, laidx: usize) -> LexerTypesT::LexemeT {
        let llen = self.lexemes.len();
        debug_assert!(laidx <= llen);
        if laidx < llen {
            self.lexemes[laidx]
        } else {
            // We have to artificially construct a Lexeme for the EOF lexeme.
            let last_la_end = if llen == 0 {
                0
            } else {
                debug_assert!(laidx > 0);
                let last_la = self.lexemes[laidx - 1];
                last_la.span().end()
            };

            Lexeme::new_faulty(
                StorageT::from(u32::from(self.grm.eof_token_idx())).unwrap(),
                last_la_end,
                0,
            )
        }
    }

    /// Return the `TIdx` of the next lexeme (if `laidx` == `self.lexemes.len()` this will be the
    /// EOF `TIdx`).
    pub(super) fn next_tidx(&self, laidx: usize) -> TIdx<StorageT> {
        let ll = self.lexemes.len();
        debug_assert!(laidx <= ll);
        if laidx < ll {
            TIdx(self.lexemes[laidx].tok_id())
        } else {
            self.grm.eof_token_idx()
        }
    }

    /// Start parsing text at `laidx` (using the lexeme in `lexeme_prefix`, if it is not `None`,
    /// as the first lexeme) up to (but excluding) `end_laidx`. If an error is encountered, parsing
    /// immediately terminates (without recovery).
    ///
    /// Note that if `lexeme_prefix` is specified, `laidx` will still be incremented, and thus
    /// `end_laidx` *must* be set to `laidx + 1` in order that the parser doesn't skip the real
    /// lexeme at position `laidx`.
    #[allow(deprecated)]
    pub(super) fn lr_cactus(
        &self,
        lexeme_prefix: Option<LexerTypesT::LexemeT>,
        mut laidx: usize,
        end_laidx: usize,
        mut pstack: Cactus<StIdx<StorageT>>,
        tstack: &mut Option<&mut Vec<Node<LexerTypesT::LexemeT, StorageT>>>,
    ) -> (usize, Cactus<StIdx<StorageT>>) {
        assert!(lexeme_prefix.is_none() || end_laidx == laidx + 1);
        while laidx != end_laidx {
            let stidx = *pstack.val().unwrap();
            let la_tidx = if let Some(l) = lexeme_prefix {
                TIdx(l.tok_id())
            } else {
                self.next_tidx(laidx)
            };

            match self.stable.action(stidx, la_tidx) {
                Action::Reduce(pidx) => {
                    let ridx = self.grm.prod_to_rule(pidx);
                    let pop_num = self.grm.prod(pidx).len();
                    if let Some(ref mut tstack_uw) = *tstack {
                        let nodes = tstack_uw
                            .drain(pstack.len() - pop_num - 1..)
                            .collect::<Vec<Node<LexerTypesT::LexemeT, StorageT>>>();
                        tstack_uw.push(Node::Nonterm { ridx, nodes });
                    }

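                    // `pstack` is a parent-pointer (cactus) stack, so popping means walking
                    // `pop_num` steps back up the parent chain.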
                    for _ in 0..pop_num {
                        pstack = pstack.parent().unwrap();
                    }
                    let prior = *pstack.val().unwrap();
                    pstack = pstack.child(self.stable.goto(prior, ridx).unwrap());
                }
                Action::Shift(state_id) => {
                    if let Some(ref mut tstack_uw) = *tstack {
                        let la_lexeme = if let Some(l) = lexeme_prefix {
                            l
                        } else {
                            self.next_lexeme(laidx)
                        };
                        tstack_uw.push(Node::Term { lexeme: la_lexeme });
                    }
                    pstack = pstack.child(state_id);
                    laidx += 1;
                }
                Action::Accept => {
                    debug_assert_eq!(la_tidx, self.grm.eof_token_idx());
                    if let Some(ref tstack_uw) = *tstack {
                        debug_assert_eq!(tstack_uw.len(), 1);
                    }
                    break;
                }
                Action::Error => {
                    break;
                }
            }
        }
        (laidx, pstack)
    }
}

pub(super) trait Recoverer<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
    ActionT,
    ParamT: Clone,
> where
    usize: AsPrimitive<StorageT>,
{
    fn recover(
        &self,
        finish_by: Instant,
        parser: &Parser<StorageT, LexerTypesT, ActionT, ParamT>,
        in_laidx: usize,
        in_pstack: &mut PStack<StorageT>,
        astack: &mut Vec<AStackType<LexerTypesT::LexemeT, ActionT>>,
        spans: &mut Vec<Span>,
    ) -> (usize, Vec<Vec<ParseRepair<LexerTypesT::LexemeT, StorageT>>>);
}

/// What recovery algorithm should be used when a syntax error is encountered?
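///
/// A minimal sketch of selecting a recoverer at run-time (assuming `grm` and `stable` are a
/// `YaccGrammar` and `StateTable` you have already built):
///
/// ```text
/// let pb = RTParserBuilder::new(&grm, &stable).recoverer(RecoveryKind::None);
/// ```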
#[derive(Clone, Copy, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub enum RecoveryKind {
    /// The CPCT+ algorithm from Diekmann/Tratt "Don't Panic! Better, Fewer, Syntax Errors for LR
    /// Parsers".
    CPCTPlus,
    /// Don't use error recovery: return as soon as the first syntax error is encountered.
    None,
}

impl TryFrom<RecoveryKind> for Value<Location> {
    type Error = cfgrammar::header::HeaderError<Location>;
    fn try_from(rk: RecoveryKind) -> Result<Value<Location>, Self::Error> {
        use cfgrammar::{
            Location,
            header::{Namespaced, Setting},
        };
        let from_loc = Location::Other("From<RecoveryKind>".to_string());
        Ok(match rk {
            RecoveryKind::CPCTPlus => Value::Setting(Setting::Unitary(Namespaced {
                namespace: Some(("RecoveryKind".to_string(), from_loc.clone())),
                member: ("CPCTPlus".to_string(), from_loc.clone()),
            })),
            RecoveryKind::None => Value::Setting(Setting::Unitary(Namespaced {
                namespace: Some(("RecoveryKind".to_string(), from_loc.clone())),
                member: ("None".to_string(), from_loc.clone()),
            })),
        })
    }
}

impl TryFrom<&Value<Location>> for RecoveryKind {
    type Error = cfgrammar::header::HeaderError<Location>;
    fn try_from(rk: &Value<Location>) -> Result<RecoveryKind, Self::Error> {
        use cfgrammar::header::{HeaderError, HeaderErrorKind, Namespaced, Setting};

        match rk {
            Value::Flag(_, loc) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "RecoveryKind",
                    "Cannot convert boolean to RecoveryKind",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Num(_, loc)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "RecoveryKind",
                    "Cannot convert number to RecoveryKind",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::String(_, loc)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "RecoveryKind",
                    "Cannot convert string to RecoveryKind",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Unitary(Namespaced {
                namespace,
                member: (kind, kind_loc),
            })) => {
                match namespace {
                    Some((ns, loc)) if ns.to_lowercase() != "recoverykind" => {
                        return Err(HeaderError {
                            kind: HeaderErrorKind::ConversionError(
                                "RecoveryKind",
                                "Unknown namespace",
                            ),
                            locations: vec![loc.clone()],
                        });
                    }
                    _ => {}
                }
                match kind.to_lowercase().as_ref() {
                    "cpctplus" => Ok(RecoveryKind::CPCTPlus),
                    "none" => Ok(RecoveryKind::None),
                    _ => Err(HeaderError {
                        kind: HeaderErrorKind::ConversionError("RecoveryKind", "Unknown variant"),
                        locations: vec![kind_loc.clone()],
                    }),
                }
            }
            Value::Setting(Setting::Constructor {
                ctor: _,
                arg:
                    Namespaced {
                        namespace: _,
                        member: (_, arg_loc),
                    },
            }) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "RecoveryKind",
                    "Cannot convert constructor to RecoveryKind",
                ),
                locations: vec![arg_loc.clone()],
            }),
            Value::Setting(Setting::Array(_, arr_loc, _)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "RecoveryKind",
                    "Cannot convert array to RecoveryKind",
                ),
                locations: vec![arr_loc.clone()],
            }),
        }
    }
}

impl quote::ToTokens for RecoveryKind {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        tokens.extend(match *self {
            RecoveryKind::CPCTPlus => quote!(::lrpar::RecoveryKind::CPCTPlus),
            RecoveryKind::None => quote!(::lrpar::RecoveryKind::None),
        })
    }
}

/// A lexing or parsing error. Although the two are quite distinct in terms of what can be reported
/// to users, both can (at least conceptually) occur at any point of the intertwined lexing/parsing
/// process.
#[derive(Debug)]
pub enum LexParseError<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> where
    usize: AsPrimitive<StorageT>,
{
    LexError(LexerTypesT::LexErrorT),
    ParseError(ParseError<LexerTypesT::LexemeT, StorageT>),
}

impl<StorageT: Debug + Hash + PrimInt + Unsigned, LexerTypesT: LexerTypes<StorageT = StorageT>>
    LexParseError<StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    /// A pretty-printer of a lexer/parser error. This isn't suitable for all purposes, but it's
    /// often good enough. The output format is not guaranteed to be stable but is likely to be of
    /// the form:
    ///
    /// ```text
    /// Parsing error at line 3 column 8. Repair sequences found:
    ///   1: Insert ID
    ///   2: Delete +, Shift 3
    /// ```
    ///
    /// If you are using the compile-time parse system, your `grm_y` module exposes a suitable
    /// [`epp`](../ctbuilder/struct.CTParserBuilder.html#method.process_file) function; if you are
    /// using the run-time system,
    /// [`YaccGrammar`](../../cfgrammar/yacc/grammar/struct.YaccGrammar.html) exposes a suitable
    /// [`epp`](../../cfgrammar/yacc/grammar/struct.YaccGrammar.html#method.token_epp) function,
    /// though you will have to wrap it in a closure e.g. `&|t| grm.token_epp(t)`.
    pub fn pp<'a>(
        &self,
        lexer: &dyn NonStreamingLexer<LexerTypesT>,
        epp: &dyn Fn(TIdx<StorageT>) -> Option<&'a str>,
    ) -> String {
        match self {
            LexParseError::LexError(e) => {
                let ((line, col), _) = lexer.line_col(e.span());
                format!("Lexing error at line {} column {}.", line, col)
            }
            LexParseError::ParseError(e) => {
                let ((line, col), _) = lexer.line_col(e.lexeme().span());
                let mut out = format!("Parsing error at line {} column {}.", line, col);
                let repairs_len = e.repairs().len();
                if repairs_len == 0 {
                    out.push_str(" No repair sequences found.");
                } else {
                    out.push_str(" Repair sequences found:");
                    for (i, rs) in e.repairs().iter().enumerate() {
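                        // Right-align the repair sequence numbers: pad each index by the
                        // difference in digit count between it and the largest index.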
                        let padding = ((repairs_len as f64).log10() as usize)
                            - (((i + 1) as f64).log10() as usize)
                            + 1;
                        write!(out, "\n  {}{}: ", " ".repeat(padding), i + 1).ok();
                        let mut rs_out = Vec::new();

                        // Merge together Deletes iff they are consecutive (if they are separated
                        // by even a single character, they will not be merged).
                        let mut i = 0;
                        while i < rs.len() {
                            match rs[i] {
                                ParseRepair::Delete(l) => {
                                    let mut j = i + 1;
                                    let mut last_end = l.span().end();
                                    while j < rs.len() {
                                        if let ParseRepair::Delete(next_l) = rs[j] {
                                            if next_l.span().start() == last_end {
                                                last_end = next_l.span().end();
                                                j += 1;
                                                continue;
                                            }
                                        }
                                        break;
                                    }
                                    let t = &lexer
                                        .span_str(Span::new(l.span().start(), last_end))
                                        .replace('\n', "\\n");
                                    rs_out.push(format!("Delete {}", t));
                                    i = j;
                                }
                                ParseRepair::Insert(tidx) => {
                                    rs_out.push(format!("Insert {}", epp(tidx).unwrap()));
                                    i += 1;
                                }
                                ParseRepair::Shift(l) => {
                                    let t = &lexer.span_str(l.span()).replace('\n', "\\n");
                                    rs_out.push(format!("Shift {}", t));
                                    i += 1;
                                }
                            }
                        }

                        out.push_str(&rs_out.join(", "));
                    }
                }
                out
            }
        }
    }
}

impl<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> fmt::Display for LexParseError<StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            LexParseError::LexError(ref e) => Display::fmt(e, f),
            LexParseError::ParseError(ref e) => Display::fmt(e, f),
        }
    }
}

impl<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> Error for LexParseError<StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
}

impl<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT, LexErrorT = T>,
    T: LexError,
> From<T> for LexParseError<StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    fn from(err: T) -> LexParseError<StorageT, LexerTypesT> {
        LexParseError::LexError(err)
    }
}

impl<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> From<ParseError<LexerTypesT::LexemeT, StorageT>> for LexParseError<StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    fn from(
        err: ParseError<LexerTypesT::LexemeT, StorageT>,
    ) -> LexParseError<StorageT, LexerTypesT> {
        LexParseError::ParseError(err)
    }
}

/// A run-time parser builder.
pub struct RTParserBuilder<
    'a,
    StorageT: 'static + Eq + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> where
    usize: AsPrimitive<StorageT>,
{
    grm: &'a YaccGrammar<StorageT>,
    stable: &'a StateTable<StorageT>,
    recoverer: RecoveryKind,
    term_costs: &'a dyn Fn(TIdx<StorageT>) -> u8,
    phantom: PhantomData<(StorageT, LexerTypesT)>,
}

impl<
    'a,
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> RTParserBuilder<'a, StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    /// Create a new run-time parser from a `YaccGrammar` and a `StateTable`.
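    ///
    /// A typical construction, mirroring this module's tests (`from_yacc` and `Minimiser` come
    /// from the `lrtable` crate; `grammar_src` is assumed to be a Yacc grammar string):
    ///
    /// ```text
    /// let grm = str::parse::<YaccGrammar<u16>>(grammar_src)?;
    /// let (_, stable) = lrtable::from_yacc(&grm, lrtable::Minimiser::Pager)?;
    /// let pb = RTParserBuilder::new(&grm, &stable);
    /// ```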
    pub fn new(grm: &'a YaccGrammar<StorageT>, stable: &'a StateTable<StorageT>) -> Self {
        RTParserBuilder {
            grm,
            stable,
            recoverer: RecoveryKind::CPCTPlus,
            term_costs: &|_| 1,
            phantom: PhantomData,
        }
    }

    /// Set the recoverer for this parser to `rk`.
    pub fn recoverer(mut self, rk: RecoveryKind) -> Self {
        self.recoverer = rk;
        self
    }

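    /// Set the function used to compute a token's cost (used when ranking repair sequences
    /// during error recovery). Every token's cost must be greater than zero: this is asserted
    /// when parsing starts.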
    pub fn term_costs(mut self, f: &'a dyn Fn(TIdx<StorageT>) -> u8) -> Self {
        self.term_costs = f;
        self
    }

    #[deprecated(
        since = "0.14.0",
        note = "Use `parse_map` to return a `Node` from your `lrpar_mod!` instead"
    )]
    #[allow(deprecated)]
    /// Parse input, and (if possible) return a generic parse tree. See the arguments for
    /// [`parse_actions`](#method.parse_actions) for more details about the return value.
    pub fn parse_generictree(
        &self,
        lexer: &dyn NonStreamingLexer<LexerTypesT>,
    ) -> (
        Option<Node<LexerTypesT::LexemeT, StorageT>>,
        Vec<LexParseError<StorageT, LexerTypesT>>,
    ) {
        self.parse_map(lexer, &|lexeme| Node::Term { lexeme }, &|ridx, nodes| {
            Node::Nonterm { ridx, nodes }
        })
    }

    /// Parse input and (if possible) return a value built by calling `fterm` on each terminal
    /// and `fnonterm` on each nonterminal. See the arguments for
    /// [`parse_actions`](#method.parse_actions) for more details about the return value.
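    ///
    /// A minimal sketch that counts the nodes in the parse tree (assuming `pb` is an
    /// `RTParserBuilder` and `lexer` a `NonStreamingLexer` for this grammar's lexemes):
    ///
    /// ```text
    /// let (count, errs) = pb.parse_map(&lexer, &|_| 1usize, &|_, nodes| {
    ///     1 + nodes.iter().sum::<usize>()
    /// });
    /// ```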
    pub fn parse_map<Node>(
        &self,
        lexer: &dyn NonStreamingLexer<LexerTypesT>,
        fterm: &dyn Fn(LexerTypesT::LexemeT) -> Node,
        fnonterm: &dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
    ) -> (Option<Node>, Vec<LexParseError<StorageT, LexerTypesT>>) {
        let mut lexemes = vec![];
        for e in lexer.iter().collect::<Vec<_>>() {
            match e {
                Ok(l) => lexemes.push(l),
                Err(e) => return (None, vec![e.into()]),
            }
        }
        Parser::<
            StorageT,
            LexerTypesT,
            Node,
            (
                &dyn Fn(LexerTypesT::LexemeT) -> Node,
                &dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
            ),
        >::parse_map(
            self.recoverer,
            self.grm,
            self.term_costs,
            self.stable,
            lexer,
            lexemes,
            fterm,
            fnonterm,
        )
    }

    #[deprecated(since = "0.14.0", note = "Use `parse_map` instead")]
    /// Parse input, returning any errors found. See the arguments for
    /// [`parse_actions`](#method.parse_actions) for more details about the return value.
    pub fn parse_noaction(
        &self,
        lexer: &dyn NonStreamingLexer<LexerTypesT>,
    ) -> Vec<LexParseError<StorageT, LexerTypesT>> {
        self.parse_map(lexer, &|_| (), &|_, _| ()).1
    }

    /// Parse input, execute actions, and return the associated value (if possible) and/or any
    /// lexing/parsing errors encountered. Note that the two parts of the (value, errors) return
    /// pair are entirely independent: one can encounter errors without a value being produced
    /// (`None, [...]`), errors and a value (`Some(...), [...]`), as well as a value and no errors
    /// (`Some(...), []`). Errors are sorted by the position they were found in the input and can
    /// be a mix of lexing and parsing errors.
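    ///
    /// A sketch of handling the return pair (the `actions` slice and `param` would normally come
    /// from generated code, so both are assumptions here):
    ///
    /// ```text
    /// let (res, errs) = pb.parse_actions(lexer, actions, param);
    /// for e in &errs {
    ///     eprintln!("{}", e.pp(lexer, &|t| grm.token_epp(t)));
    /// }
    /// if let Some(v) = res { /* the parse succeeded, possibly after repairs */ }
    /// ```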
    pub fn parse_actions<'b: 'a, 'input: 'b, ActionT: 'a, ParamT: Clone>(
        &self,
        lexer: &'b dyn NonStreamingLexer<'input, LexerTypesT>,
        actions: &'a [ActionFn<'a, 'b, 'input, StorageT, LexerTypesT, ActionT, ParamT>],
        param: ParamT,
    ) -> (Option<ActionT>, Vec<LexParseError<StorageT, LexerTypesT>>) {
        let mut lexemes = vec![];
        for e in lexer.iter().collect::<Vec<_>>() {
            match e {
                Ok(l) => lexemes.push(l),
                Err(e) => return (None, vec![e.into()]),
            }
        }
        Parser::parse_actions(
            self.recoverer,
            self.grm,
            self.term_costs,
            self.stable,
            lexer,
            lexemes,
            actions,
            param,
        )
    }

    pub fn grammar(&self) -> &YaccGrammar<StorageT> {
        self.grm
    }
}

/// After a parse error is encountered, the parser attempts to find a way of recovering. Each entry
/// in the sequence of repairs is represented by a `ParseRepair`.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum ParseRepair<LexemeT: Lexeme<StorageT>, StorageT: Hash> {
    /// Insert a `Symbol::Token`.
    Insert(TIdx<StorageT>),
    /// Delete a symbol.
    Delete(LexemeT),
    /// Shift a symbol.
    Shift(LexemeT),
}

/// Records a single parse error.
#[derive(Clone, Debug, PartialEq)]
pub struct ParseError<LexemeT: Lexeme<StorageT>, StorageT: Hash> {
    stidx: StIdx<StorageT>,
    lexeme: LexemeT,
    repairs: Vec<Vec<ParseRepair<LexemeT, StorageT>>>,
}

impl<LexemeT: Lexeme<StorageT>, StorageT: Debug + Hash> Display for ParseError<LexemeT, StorageT> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Parse error at lexeme {:?}", self.lexeme)
    }
}

impl<LexemeT: Lexeme<StorageT>, StorageT: Debug + Hash> Error for ParseError<LexemeT, StorageT> {}

impl<LexemeT: Lexeme<StorageT>, StorageT: Hash + PrimInt + Unsigned> ParseError<LexemeT, StorageT> {
    /// Return the state table index where this error was detected.
    pub fn stidx(&self) -> StIdx<StorageT> {
        self.stidx
    }

    /// Return the lexeme where this error was detected.
    pub fn lexeme(&self) -> &LexemeT {
        &self.lexeme
    }

    /// Return the repairs found that would fix this error. Note that there are an infinite
    /// number of possible repairs for any error, so this is by definition a (finite) subset.
    pub fn repairs(&self) -> &Vec<Vec<ParseRepair<LexemeT, StorageT>>> {
        &self.repairs
    }
}

#[cfg(test)]
pub(crate) mod test {
    use std::collections::HashMap;

    use cfgrammar::{
        Span,
        yacc::{YaccGrammar, YaccKind, YaccOriginalActionKind},
    };
    use lrtable::{Minimiser, from_yacc};
    use num_traits::ToPrimitive;
    use regex::Regex;

    use super::*;
    use crate::{
        Lexeme, Lexer,
        test_utils::{TestLexError, TestLexeme, TestLexerTypes},
    };

    #[allow(deprecated)]
    pub(crate) fn do_parse<'input>(
        rcvry_kind: RecoveryKind,
        lexs: &str,
        grms: &str,
        input: &'input str,
    ) -> (
        YaccGrammar<u16>,
        SmallLexer<'input>,
        Result<
            Node<TestLexeme, u16>,
            (
                Option<Node<TestLexeme, u16>>,
                Vec<LexParseError<u16, TestLexerTypes>>,
            ),
        >,
    ) {
        do_parse_with_costs(rcvry_kind, lexs, grms, input, &HashMap::new())
    }

    #[allow(deprecated)]
    fn do_parse_with_costs<'input>(
        rcvry_kind: RecoveryKind,
        lexs: &str,
        grms: &str,
        input: &'input str,
        costs: &HashMap<&str, u8>,
    ) -> (
        YaccGrammar<u16>,
        SmallLexer<'input>,
        Result<
            Node<TestLexeme, u16>,
            (
                Option<Node<TestLexeme, u16>>,
                Vec<LexParseError<u16, TestLexerTypes>>,
            ),
        >,
    ) {
        let grm = YaccGrammar::<u16>::new_with_storaget(
            YaccKind::Original(YaccOriginalActionKind::GenericParseTree),
            grms,
        )
        .unwrap();
        let (_, stable) = from_yacc(&grm, Minimiser::Pager).unwrap();
        let rule_ids = grm
            .tokens_map()
            .iter()
            .map(|(&n, &i)| (n.to_owned(), u32::from(i).to_u16().unwrap()))
            .collect();
        let lexer_rules = small_lexer(lexs, rule_ids);
        let lexemes = small_lex(lexer_rules, input);
        let lexer = SmallLexer { lexemes, s: input };
        let costs_tidx = costs
            .iter()
            .map(|(k, v)| (grm.token_idx(k).unwrap(), v))
            .collect::<HashMap<_, _>>();
        let (r, errs) = RTParserBuilder::new(&grm, &stable)
            .recoverer(rcvry_kind)
            .term_costs(&|tidx| **costs_tidx.get(&tidx).unwrap_or(&&1))
            .parse_generictree(&lexer);
        if let Some(node) = r {
            if errs.is_empty() {
                (grm, lexer, Ok(node))
            } else {
                (grm, lexer, Err((Some(node), errs)))
            }
        } else {
            (grm, lexer, Err((None, errs)))
        }
    }

    fn check_parse_output(lexs: &str, grms: &str, input: &str, expected: &str) {
        let (grm, _, pt) = do_parse(RecoveryKind::CPCTPlus, lexs, grms, input);
        assert_eq!(expected, pt.unwrap().pp(&grm, input));
    }

    // SmallLexer is our highly simplified version of lrlex (which saves us from making lrlex a
    // dependency of lrpar). The format is the same as lrlex *except*:
    //   * The initial "%%" isn't needed, and only "'" is valid as a rule name delimiter.
    //   * "Unnamed" rules aren't allowed (e.g. you can't have a rule which discards whitespace).
    pub(crate) struct SmallLexer<'input> {
        lexemes: Vec<TestLexeme>,
        s: &'input str,
    }

    impl Lexer<TestLexerTypes> for SmallLexer<'_> {
        fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = Result<TestLexeme, TestLexError>> + 'a> {
            Box::new(self.lexemes.iter().map(|x| Ok(*x)))
        }
    }

    impl<'input> NonStreamingLexer<'input, TestLexerTypes> for SmallLexer<'input> {
        fn span_str(&self, span: Span) -> &'input str {
            &self.s[span.start()..span.end()]
        }

        fn span_lines_str(&self, _: Span) -> &'input str {
            unreachable!();
        }

        fn line_col(&self, _: Span) -> ((usize, usize), (usize, usize)) {
            unreachable!();
        }
    }

    fn small_lexer(lexs: &str, ids_map: HashMap<String, u16>) -> Vec<(u16, Regex)> {
        let mut rules = Vec::new();
        for l in lexs.split('\n').map(|x| x.trim()).filter(|x| !x.is_empty()) {
            assert!(l.rfind('\'') == Some(l.len() - 1));
            let i = l[..l.len() - 1].rfind('\'').unwrap();
            let name = &l[i + 1..l.len() - 1];
            let re = &l[..i - 1].trim();
            rules.push((
                ids_map[name],
                Regex::new(&format!("\\A(?:{})", re)).unwrap(),
            ));
        }
        rules
    }

    fn small_lex(rules: Vec<(u16, Regex)>, input: &str) -> Vec<TestLexeme> {
        let mut lexemes = vec![];
        let mut i = 0;
        while i < input.len() {
            let mut longest = 0; // Length of the longest match
            let mut longest_tok_id = 0; // This is only valid iff longest != 0
            for (tok_id, r) in rules.iter() {
                if let Some(m) = r.find(&input[i..]) {
                    let len = m.end();
                    if len > longest {
                        longest = len;
                        longest_tok_id = *tok_id;
                    }
                }
            }
            assert!(longest > 0);
            lexemes.push(Lexeme::new(longest_tok_id, i, longest));
            i += longest;
        }
        lexemes
    }

    #[test]
    fn simple_parse() {
        // From p4 of https://www.cs.umd.edu/class/spring2014/cmsc430/lectures/lec07.pdf
        check_parse_output(
            "[a-zA-Z_] 'ID'
             \\+ '+'",
            "
%start E
%%
E: T '+' E
 | T;
T: 'ID';
",
            "a+b",
            "E
 T
  ID a
 + +
 E
  T
   ID b
",
        );
    }

    #[test]
    fn parse_empty_rules() {
        let lexs = "[a-zA-Z_] 'ID'";
        let grms = "%start S
%%
S: L;
L: 'ID'
 | ;
";
        check_parse_output(
            lexs, grms, "", "S
 L
",
        );

        check_parse_output(
            lexs,
            grms,
            "x",
            "S
 L
  ID x
",
        );
    }

    #[test]
    fn recursive_parse() {
        let lexs = "\\+ '+'
                    \\* '*'
                    [0-9]+ 'INT'";
        let grms = "%start Expr
%%
Expr : Expr '+' Term | Term;
Term : Term '*' Factor | Factor;
Factor : 'INT';";

        check_parse_output(
            lexs,
            grms,
            "2+3*4",
            "Expr
 Expr
  Term
   Factor
    INT 2
 + +
 Term
  Term
   Factor
    INT 3
  * *
  Factor
   INT 4
",
        );
        check_parse_output(
            lexs,
            grms,
            "2*3+4",
            "Expr
 Expr
  Term
   Term
    Factor
     INT 2
   * *
   Factor
    INT 3
 + +
 Term
  Factor
   INT 4
",
        );
    }

    #[test]
    fn parse_error() {
        let lexs = "\\( '('
                    \\) ')'
                    [a-zA-Z_][a-zA-Z_0-9]* 'ID'";
        let grms = "%start Call
%%
Call: 'ID' '(' ')';";

        check_parse_output(
            lexs,
            grms,
            "f()",
            "Call
 ID f
 ( (
 ) )
",
        );

        let (grm, _, pr) = do_parse(RecoveryKind::CPCTPlus, lexs, grms, "f(");
        let (_, errs) = pr.unwrap_err();
        assert_eq!(errs.len(), 1);
        let err_tok_id = usize::from(grm.eof_token_idx()).to_u16().unwrap();
        match &errs[0] {
            LexParseError::ParseError(e) => {
                assert_eq!(e.lexeme(), &Lexeme::new_faulty(err_tok_id, 2, 0));
                assert!(e.lexeme().faulty());
            }
            _ => unreachable!(),
        }

        let (grm, _, pr) = do_parse(RecoveryKind::CPCTPlus, lexs, grms, "f(f(");
        let (_, errs) = pr.unwrap_err();
        assert_eq!(errs.len(), 1);
        let err_tok_id = usize::from(grm.token_idx("ID").unwrap()).to_u16().unwrap();
        match &errs[0] {
            LexParseError::ParseError(e) => {
                assert_eq!(e.lexeme(), &Lexeme::new(err_tok_id, 2, 1));
                assert!(!e.lexeme().faulty());
            }
            _ => unreachable!(),
        }
    }

    #[test]
    fn test_parse_map() {
        #[derive(PartialEq, Debug)]
        enum TestParseMap<'a> {
            Term(&'a str, &'a str),
            NonTerm(&'a str, Vec<TestParseMap<'a>>),
        }
        let lex_src = r#"[0-9]+ 'INT'
\+ '+'
\* '*'
"#;
        let grammar_src = "
%grmtools{YaccKind: Original(NoAction)}
%start Expr
%%
Expr : Expr '+' Term | Term;
Term : Term '*' Factor | Factor;
Factor : 'INT';";
        let input_src = "2*3+4";
        let grm = str::parse::<YaccGrammar<u16>>(grammar_src).unwrap();
        let (_, stable) = lrtable::from_yacc(&grm, lrtable::Minimiser::Pager).unwrap();
        let rt_parser = RTParserBuilder::new(&grm, &stable);
        let rule_ids = grm
            .tokens_map()
            .iter()
            .map(|(&n, &i)| (n.to_owned(), u32::from(i).to_u16().unwrap()))
            .collect();
        let lexer_rules = small_lexer(lex_src, rule_ids);
        let lexemes = small_lex(lexer_rules, input_src);
        let lexer = SmallLexer {
            lexemes,
            s: input_src,
        };

        let (parse_map, errs) = rt_parser.parse_map(
            &lexer,
            &|lexeme: TestLexeme| {
                let tidx = TIdx(lexeme.tok_id());
                let tn = &grm.token_name(tidx).unwrap();
                let lt = &input_src[lexeme.span().start()..lexeme.span().end()];
                TestParseMap::Term(tn, lt)
            },
            &|ridx, nodes| {
                let rule_name = &grm.rule_name_str(ridx);
                TestParseMap::NonTerm(rule_name, nodes)
            },
        );
        assert!(parse_map.is_some() && errs.is_empty());
        // Sanity check the `parse_generictree` output.
        check_parse_output(
            lex_src,
            grammar_src,
            input_src,
            "Expr
 Expr
  Term
   Term
    Factor
     INT 2
   * *
   Factor
    INT 3
 + +
 Term
  Factor
   INT 4
",
        );

        let expected_parse_map = {
            use TestParseMap::*;
            NonTerm(
                "Expr",
                vec![
                    NonTerm(
                        "Expr",
                        vec![NonTerm(
                            "Term",
                            vec![
                                NonTerm("Term", vec![NonTerm("Factor", vec![Term("INT", "2")])]),
                                Term("*", "*"),
                                NonTerm("Factor", vec![Term("INT", "3")]),
                            ],
                        )],
                    ),
                    Term("+", "+"),
                    NonTerm("Term", vec![NonTerm("Factor", vec![Term("INT", "4")])]),
                ],
            )
        };
        assert_eq!(parse_map, Some(expected_parse_map));
    }
}