lrpar/parser.rs

#![allow(clippy::derive_partial_eq_without_eq)]
use std::{
    error::Error,
    fmt::{self, Debug, Display, Write as _},
    hash::Hash,
    marker::PhantomData,
    vec,
};

// `web_time` also works on non-wasm32 targets, but we use `std` there to avoid the extra dependency.
#[cfg(not(target_arch = "wasm32"))]
use std::time::{Duration, Instant};
#[cfg(target_arch = "wasm32")]
use web_time::{Duration, Instant};

use cactus::Cactus;
use cfgrammar::{RIdx, Span, TIdx, header::Value, span::Location, yacc::YaccGrammar};
use lrtable::{Action, StIdx, StateTable};
use num_traits::{AsPrimitive, PrimInt, Unsigned};
use proc_macro2::TokenStream;
use quote::quote;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use crate::{LexError, Lexeme, LexerTypes, NonStreamingLexer, cpctplus};

#[cfg(test)]
const RECOVERY_TIME_BUDGET: u64 = 60_000; // milliseconds
#[cfg(not(test))]
const RECOVERY_TIME_BUDGET: u64 = 500; // milliseconds

#[deprecated(
    since = "0.14.0",
    note = "Use the version of `Node` exported from your `lrpar_mod!`"
)]
pub type Node<T, S> = _deprecated_moved_::Node<T, S>;

#[doc(hidden)]
pub mod _deprecated_moved_ {
    use super::*;
    /// A generic parse tree.
    #[derive(Debug, Clone, PartialEq)]
    pub enum Node<LexemeT: Lexeme<StorageT>, StorageT> {
        /// Terminals store a single lexeme.
        Term { lexeme: LexemeT },
        /// Nonterminals reference a rule and have zero or more `Node`s as children.
        Nonterm {
            ridx: RIdx<StorageT>,
            nodes: Vec<Node<LexemeT, StorageT>>,
        },
    }
}

#[allow(deprecated)]
impl<LexemeT: Lexeme<StorageT>, StorageT: 'static + PrimInt + Unsigned> Node<LexemeT, StorageT>
where
    usize: AsPrimitive<StorageT>,
{
    /// Return a pretty-printed version of this node.
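    ///
    /// For example, with the grammar `E: T '+' E | T; T: 'ID';` from this module's tests,
    /// parsing the input `a+b` pretty-prints as (exact names depend on your grammar):
    ///
    /// ```text
    /// E
    ///  T
    ///   ID a
    ///  + +
    ///  E
    ///   T
    ///    ID b
    /// ```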
    pub fn pp(&self, grm: &YaccGrammar<StorageT>, input: &str) -> String {
        let mut st = vec![(0, self)]; // Stack of (indent level, node) pairs
        let mut s = String::new();
        while let Some((indent, e)) = st.pop() {
            for _ in 0..indent {
                s.push(' ');
            }
            match *e {
                Node::Term { lexeme } => {
                    let tidx = TIdx(lexeme.tok_id());
                    let tn = grm.token_name(tidx).unwrap();
                    let lt = &input[lexeme.span().start()..lexeme.span().end()];
                    writeln!(s, "{} {}", tn, lt).ok();
                }
                Node::Nonterm { ridx, ref nodes } => {
                    writeln!(s, "{}", grm.rule_name_str(ridx)).ok();
                    for x in nodes.iter().rev() {
                        st.push((indent + 1, x));
                    }
                }
            }
        }
        s
    }
}

type PStack<StorageT> = Vec<StIdx<StorageT>>; // Parse stack
type TokenCostFn<'a, StorageT> = &'a (dyn Fn(TIdx<StorageT>) -> u8 + 'a);
type ActionFn<'a, 'b, 'input, StorageT, LexerTypesT, ActionT, ParamT> = &'a dyn Fn(
    RIdx<StorageT>,
    &'b dyn NonStreamingLexer<'input, LexerTypesT>,
    Span,
    vec::Drain<AStackType<<LexerTypesT as LexerTypes>::LexemeT, ActionT>>,
    ParamT,
) -> ActionT;

#[derive(Debug)]
pub enum AStackType<LexemeT, ActionT> {
    ActionType(ActionT),
    Lexeme(LexemeT),
}

pub(super) struct Parser<
    'a,
    'b: 'a,
    'input: 'b,
    StorageT: 'static + Eq + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
    ActionT: 'a,
    ParamT: Clone,
> where
    usize: AsPrimitive<StorageT>,
{
    rcvry_kind: RecoveryKind,
    pub(super) grm: &'a YaccGrammar<StorageT>,
    pub(super) token_cost: Box<TokenCostFn<'a, StorageT>>,
    pub(super) stable: &'a StateTable<StorageT>,
    lexer: &'b dyn NonStreamingLexer<'input, LexerTypesT>,
    // In the long term, we should remove the `lexemes` field entirely, as the `NonStreamingLexer` API is
    // powerful enough to allow us to incrementally obtain lexemes and buffer them when necessary.
    pub(super) lexemes: Vec<LexerTypesT::LexemeT>,
    actions: &'a [ActionFn<'a, 'b, 'input, LexerTypesT::StorageT, LexerTypesT, ActionT, ParamT>],
    param: ParamT,
}

impl<
    'a,
    'b: 'a,
    'input: 'b,
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
    Node,
>
    Parser<
        'a,
        'b,
        'input,
        StorageT,
        LexerTypesT,
        Node,
        (
            &dyn Fn(LexerTypesT::LexemeT) -> Node,
            &dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
        ),
    >
where
    usize: AsPrimitive<StorageT>,
{
    fn parse_map(
        rcvry_kind: RecoveryKind,
        grm: &YaccGrammar<StorageT>,
        token_cost: TokenCostFn<'a, StorageT>,
        stable: &StateTable<StorageT>,
        lexer: &'b dyn NonStreamingLexer<'input, LexerTypesT>,
        lexemes: Vec<LexerTypesT::LexemeT>,
        fterm: &'a dyn Fn(LexerTypesT::LexemeT) -> Node,
        fnonterm: &'a dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
    ) -> (Option<Node>, Vec<LexParseError<StorageT, LexerTypesT>>) {
        for tidx in grm.iter_tidxs() {
            assert!(token_cost(tidx) > 0);
        }
        let mut actions: Vec<
            ActionFn<
                'a,
                'b,
                'input,
                StorageT,
                LexerTypesT,
                Node,
                (
                    &'a dyn Fn(LexerTypesT::LexemeT) -> Node,
                    &'a dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
                ),
            >,
        > = Vec::new();
        actions.resize(usize::from(grm.prods_len()), &action_map);
        let psr = Parser {
            rcvry_kind,
            grm,
            token_cost: Box::new(token_cost),
            stable,
            lexer,
            lexemes,
            actions: actions.as_slice(),
            param: (fterm, fnonterm),
        };
        let mut pstack = vec![stable.start_state()];
        let mut astack = Vec::new();
        let mut errors = Vec::new();
        let mut spans = Vec::new();
        let accpt = psr.lr(0, &mut pstack, &mut astack, &mut errors, &mut spans);
        (accpt, errors)
    }
}

fn action_map<StorageT, LexerTypesT: LexerTypes, Node>(
    ridx: RIdx<StorageT>,
    _lexer: &dyn NonStreamingLexer<LexerTypesT>,
    _span: Span,
    astack: vec::Drain<AStackType<LexerTypesT::LexemeT, Node>>,
    param: (
        &dyn Fn(LexerTypesT::LexemeT) -> Node,
        &dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
    ),
) -> Node
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
    LexerTypesT::LexemeT: Lexeme<StorageT>,
{
    let (fterm, fnonterm) = param;
    let mut nodes = Vec::with_capacity(astack.len());
    for a in astack {
        nodes.push(match a {
            AStackType::ActionType(n) => n,
            AStackType::Lexeme(lexeme) => fterm(lexeme),
        });
    }
    fnonterm(ridx, nodes)
}

#[deprecated(
    since = "0.14.0",
    note = "Deprecated along with `parse_generictree`; there is no direct replacement besides a custom action"
)]
#[allow(deprecated)]
/// The action which implements [`cfgrammar::yacc::YaccOriginalActionKind::GenericParseTree`].
/// Usually you should just use the action kind directly, but you can also call this from
/// within a custom action to return a generic parse tree with custom behavior.
pub fn action_generictree<StorageT, LexerTypesT: LexerTypes>(
    ridx: RIdx<StorageT>,
    _lexer: &dyn NonStreamingLexer<LexerTypesT>,
    _span: Span,
    astack: vec::Drain<AStackType<LexerTypesT::LexemeT, Node<LexerTypesT::LexemeT, StorageT>>>,
    _param: (),
) -> Node<LexerTypesT::LexemeT, StorageT>
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
    LexerTypesT::LexemeT: Lexeme<StorageT>,
{
    let mut nodes = Vec::with_capacity(astack.len());
    for a in astack {
        nodes.push(match a {
            AStackType::ActionType(n) => n,
            AStackType::Lexeme(lexeme) => Node::Term { lexeme },
        });
    }
    Node::Nonterm { ridx, nodes }
}

impl<
    'a,
    'b: 'a,
    'input: 'b,
    StorageT: 'static + Debug + Eq + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
    ActionT: 'a,
    ParamT: Clone,
> Parser<'a, 'b, 'input, StorageT, LexerTypesT, ActionT, ParamT>
where
    usize: AsPrimitive<StorageT>,
{
    fn parse_actions(
        rcvry_kind: RecoveryKind,
        grm: &'a YaccGrammar<StorageT>,
        token_cost: TokenCostFn<'a, StorageT>,
        stable: &'a StateTable<StorageT>,
        lexer: &'b dyn NonStreamingLexer<'input, LexerTypesT>,
        lexemes: Vec<LexerTypesT::LexemeT>,
        actions: &'a [ActionFn<'a, 'b, 'input, StorageT, LexerTypesT, ActionT, ParamT>],
        param: ParamT,
    ) -> (Option<ActionT>, Vec<LexParseError<StorageT, LexerTypesT>>) {
        for tidx in grm.iter_tidxs() {
            assert!(token_cost(tidx) > 0);
        }
        let psr = Parser {
            rcvry_kind,
            grm,
            token_cost: Box::new(token_cost),
            stable,
            lexer,
            lexemes,
            actions,
            param,
        };
        let mut pstack = vec![stable.start_state()];
        let mut astack = Vec::new();
        let mut errors = Vec::new();
        let mut spans = Vec::new();
        let accpt = psr.lr(0, &mut pstack, &mut astack, &mut errors, &mut spans);
        (accpt, errors)
    }

    /// Start parsing text at `laidx`. Parsing continues for as long as possible, assuming that
    /// any errors encountered can be recovered from; as soon as an error cannot be recovered
    /// from (or if `RecoveryKind::None` is in force), this function returns.
    ///
    /// Return `Some(value)` if the parse reached an accept state (i.e. all the input was consumed,
    /// possibly after making repairs) or `None` (i.e. some of the input was not consumed, even
    /// after possibly making repairs) otherwise.
    fn lr(
        &self,
        mut laidx: usize,
        pstack: &mut PStack<StorageT>,
        astack: &mut Vec<AStackType<LexerTypesT::LexemeT, ActionT>>,
        errors: &mut Vec<LexParseError<StorageT, LexerTypesT>>,
        spans: &mut Vec<Span>,
    ) -> Option<ActionT> {
        let mut recoverer = None;
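        // All error recovery for a single parse shares one time budget: each recovery attempt
        // subtracts the time it took from `recovery_budget`, so pathological inputs cannot make
        // the parser spend unbounded time in recovery.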
        let mut recovery_budget = Duration::from_millis(RECOVERY_TIME_BUDGET);
        loop {
            debug_assert_eq!(astack.len(), spans.len());
            let stidx = *pstack.last().unwrap();
            let la_tidx = self.next_tidx(laidx);

            match self.stable.action(stidx, la_tidx) {
                Action::Reduce(pidx) => {
                    let ridx = self.grm.prod_to_rule(pidx);
                    let pop_idx = pstack.len() - self.grm.prod(pidx).len();

                    pstack.drain(pop_idx..);
                    let prior = *pstack.last().unwrap();
                    pstack.push(self.stable.goto(prior, ridx).unwrap());

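                    // Compute the span covering every symbol popped for this production: from
                    // the start of the first popped span to the end of the last. An empty
                    // production reuses the most recent span (or (0, 0) at the start of input).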
                    let span = if spans.is_empty() {
                        Span::new(0, 0)
                    } else if pop_idx - 1 < spans.len() {
                        Span::new(spans[pop_idx - 1].start(), spans[spans.len() - 1].end())
                    } else {
                        Span::new(spans[spans.len() - 1].start(), spans[spans.len() - 1].end())
                    };
                    spans.truncate(pop_idx - 1);
                    spans.push(span);

                    let v = AStackType::ActionType(self.actions[usize::from(pidx)](
                        ridx,
                        self.lexer,
                        span,
                        astack.drain(pop_idx - 1..),
                        self.param.clone(),
                    ));
                    astack.push(v);
                }
                Action::Shift(state_id) => {
                    let la_lexeme = self.next_lexeme(laidx);
                    pstack.push(state_id);
                    astack.push(AStackType::Lexeme(la_lexeme));

                    spans.push(la_lexeme.span());
                    laidx += 1;
                }
                Action::Accept => {
                    debug_assert_eq!(la_tidx, self.grm.eof_token_idx());
                    debug_assert_eq!(astack.len(), 1);
                    match astack.drain(..).next().unwrap() {
                        AStackType::ActionType(v) => return Some(v),
                        _ => unreachable!(),
                    }
                }
                Action::Error => {
                    if recoverer.is_none() {
                        recoverer = Some(match self.rcvry_kind {
                            RecoveryKind::CPCTPlus => cpctplus::recoverer(self),
                            RecoveryKind::None => {
                                let la_lexeme = self.next_lexeme(laidx);
                                errors.push(
                                    ParseError {
                                        stidx,
                                        lexeme: la_lexeme,
                                        repairs: vec![],
                                    }
                                    .into(),
                                );
                                return None;
                            }
                        });
                    }

                    let before = Instant::now();
                    let finish_by = before + recovery_budget;
                    let (new_laidx, repairs) = recoverer
                        .as_ref()
                        .unwrap()
                        .as_ref()
                        .recover(finish_by, self, laidx, pstack, astack, spans);
                    let after = Instant::now();
                    recovery_budget = recovery_budget
                        .checked_sub(after - before)
                        .unwrap_or_else(|| Duration::new(0, 0));
                    let keep_going = !repairs.is_empty();
                    let la_lexeme = self.next_lexeme(laidx);
                    errors.push(
                        ParseError {
                            stidx,
                            lexeme: la_lexeme,
                            repairs,
                        }
                        .into(),
                    );
                    if !keep_going {
                        return None;
                    }
                    laidx = new_laidx;
                }
            }
        }
    }

    /// Parse from `laidx` up to (but excluding) `end_laidx`, mutating `pstack` as parsing occurs.
    /// Returns the index of the token it parsed up to (by definition <= `end_laidx`: it can be
    /// less if the input ends before `end_laidx`, or if an error is encountered). Does not do any
    /// form of error recovery.
    pub(super) fn lr_upto(
        &self,
        lexeme_prefix: Option<LexerTypesT::LexemeT>,
        mut laidx: usize,
        end_laidx: usize,
        pstack: &mut PStack<StorageT>,
        astack: &mut Option<&mut Vec<AStackType<LexerTypesT::LexemeT, ActionT>>>,
        spans: &mut Option<&mut Vec<Span>>,
    ) -> usize {
        assert!(lexeme_prefix.is_none() || end_laidx == laidx + 1);
        while laidx != end_laidx && laidx <= self.lexemes.len() {
            let stidx = *pstack.last().unwrap();
            let la_tidx = if let Some(l) = lexeme_prefix {
                TIdx(l.tok_id())
            } else {
                self.next_tidx(laidx)
            };

            match self.stable.action(stidx, la_tidx) {
                Action::Reduce(pidx) => {
                    let ridx = self.grm.prod_to_rule(pidx);
                    let pop_idx = pstack.len() - self.grm.prod(pidx).len();
                    if let Some(ref mut astack_uw) = *astack {
                        if let Some(ref mut spans_uw) = *spans {
                            let span = if spans_uw.is_empty() {
                                Span::new(0, 0)
                            } else if pop_idx - 1 < spans_uw.len() {
                                Span::new(
                                    spans_uw[pop_idx - 1].start(),
                                    spans_uw[spans_uw.len() - 1].end(),
                                )
                            } else {
                                Span::new(
                                    spans_uw[spans_uw.len() - 1].start(),
                                    spans_uw[spans_uw.len() - 1].end(),
                                )
                            };
                            spans_uw.truncate(pop_idx - 1);
                            spans_uw.push(span);

                            let v = AStackType::ActionType(self.actions[usize::from(pidx)](
                                ridx,
                                self.lexer,
                                span,
                                astack_uw.drain(pop_idx - 1..),
                                self.param.clone(),
                            ));
                            astack_uw.push(v);
                        } else {
                            unreachable!();
                        }
                    }

                    pstack.drain(pop_idx..);
                    let prior = *pstack.last().unwrap();
                    pstack.push(self.stable.goto(prior, ridx).unwrap());
                }
                Action::Shift(state_id) => {
                    if let Some(ref mut astack_uw) = *astack {
                        if let Some(spans_uw) = spans {
                            let la_lexeme = if let Some(l) = lexeme_prefix {
                                l
                            } else {
                                self.next_lexeme(laidx)
                            };
                            astack_uw.push(AStackType::Lexeme(la_lexeme));
                            spans_uw.push(la_lexeme.span());
                        }
                    }
                    pstack.push(state_id);
                    laidx += 1;
                }
                Action::Accept => {
                    break;
                }
                Action::Error => {
                    break;
                }
            }
        }
        laidx
    }

    /// Return a `Lexeme` for the next lexeme (if `laidx` == `self.lexemes.len()` this will be
    /// a lexeme constructed to look as if it contains the EOF token).
    pub(super) fn next_lexeme(&self, laidx: usize) -> LexerTypesT::LexemeT {
        let llen = self.lexemes.len();
        debug_assert!(laidx <= llen);
        if laidx < llen {
            self.lexemes[laidx]
        } else {
            // We have to artificially construct a Lexeme for the EOF lexeme.
            let last_la_end = if llen == 0 {
                0
            } else {
                debug_assert!(laidx > 0);
                let last_la = self.lexemes[laidx - 1];
                last_la.span().end()
            };

            Lexeme::new_faulty(
                StorageT::from(u32::from(self.grm.eof_token_idx())).unwrap(),
                last_la_end,
                0,
            )
        }
    }

    /// Return the `TIdx` of the next lexeme (if `laidx` == `self.lexemes.len()` this will be the
    /// EOF `TIdx`).
    pub(super) fn next_tidx(&self, laidx: usize) -> TIdx<StorageT> {
        let ll = self.lexemes.len();
        debug_assert!(laidx <= ll);
        if laidx < ll {
            TIdx(self.lexemes[laidx].tok_id())
        } else {
            self.grm.eof_token_idx()
        }
    }

    /// Start parsing text at `laidx` (using the lexeme in `lexeme_prefix`, if it is not `None`,
    /// as the first lexeme) up to (but excluding) `end_laidx`. If an error is encountered, parsing
    /// immediately terminates (without recovery).
    ///
    /// Note that if `lexeme_prefix` is specified, `laidx` will still be incremented, and thus
    /// `end_laidx` *must* be set to `laidx + 1` in order that the parser doesn't skip the real
    /// lexeme at position `laidx`.
    #[allow(deprecated)]
    pub(super) fn lr_cactus(
        &self,
        lexeme_prefix: Option<LexerTypesT::LexemeT>,
        mut laidx: usize,
        end_laidx: usize,
        mut pstack: Cactus<StIdx<StorageT>>,
        tstack: &mut Option<&mut Vec<Node<LexerTypesT::LexemeT, StorageT>>>,
    ) -> (usize, Cactus<StIdx<StorageT>>) {
        assert!(lexeme_prefix.is_none() || end_laidx == laidx + 1);
        while laidx != end_laidx {
            let stidx = *pstack.val().unwrap();
            let la_tidx = if let Some(l) = lexeme_prefix {
                TIdx(l.tok_id())
            } else {
                self.next_tidx(laidx)
            };

            match self.stable.action(stidx, la_tidx) {
                Action::Reduce(pidx) => {
                    let ridx = self.grm.prod_to_rule(pidx);
                    let pop_num = self.grm.prod(pidx).len();
                    if let Some(ref mut tstack_uw) = *tstack {
                        let nodes = tstack_uw
                            .drain(pstack.len() - pop_num - 1..)
                            .collect::<Vec<Node<LexerTypesT::LexemeT, StorageT>>>();
                        tstack_uw.push(Node::Nonterm { ridx, nodes });
                    }

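                    // `pstack` is a parent-pointer (cactus) stack, so popping means walking
                    // `pop_num` steps back up the parent chain.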
                    for _ in 0..pop_num {
                        pstack = pstack.parent().unwrap();
                    }
                    let prior = *pstack.val().unwrap();
                    pstack = pstack.child(self.stable.goto(prior, ridx).unwrap());
                }
                Action::Shift(state_id) => {
                    if let Some(ref mut tstack_uw) = *tstack {
                        let la_lexeme = if let Some(l) = lexeme_prefix {
                            l
                        } else {
                            self.next_lexeme(laidx)
                        };
                        tstack_uw.push(Node::Term { lexeme: la_lexeme });
                    }
                    pstack = pstack.child(state_id);
                    laidx += 1;
                }
                Action::Accept => {
                    debug_assert_eq!(la_tidx, self.grm.eof_token_idx());
                    if let Some(ref tstack_uw) = *tstack {
                        debug_assert_eq!(tstack_uw.len(), 1);
                    }
                    break;
                }
                Action::Error => {
                    break;
                }
            }
        }
        (laidx, pstack)
    }
}

pub(super) trait Recoverer<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
    ActionT,
    ParamT: Clone,
> where
    usize: AsPrimitive<StorageT>,
{
    fn recover(
        &self,
        finish_by: Instant,
        parser: &Parser<StorageT, LexerTypesT, ActionT, ParamT>,
        in_laidx: usize,
        in_pstack: &mut PStack<StorageT>,
        astack: &mut Vec<AStackType<LexerTypesT::LexemeT, ActionT>>,
        spans: &mut Vec<Span>,
    ) -> (usize, Vec<Vec<ParseRepair<LexerTypesT::LexemeT, StorageT>>>);
}

/// What recovery algorithm should be used when a syntax error is encountered?
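///
/// A minimal sketch of selecting a recoverer at run-time (assuming `grm` and `stable` are a
/// `YaccGrammar` and `StateTable` you have already built):
///
/// ```text
/// let pb = RTParserBuilder::new(&grm, &stable).recoverer(RecoveryKind::None);
/// ```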
#[derive(Clone, Copy, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub enum RecoveryKind {
    /// The CPCT+ algorithm from Diekmann/Tratt "Don't Panic! Better, Fewer, Syntax Errors for LR
    /// Parsers".
    CPCTPlus,
    /// Don't use error recovery: return as soon as the first syntax error is encountered.
    None,
}

impl TryFrom<RecoveryKind> for Value<Location> {
    type Error = cfgrammar::header::HeaderError<Location>;
    fn try_from(rk: RecoveryKind) -> Result<Value<Location>, Self::Error> {
        use cfgrammar::{
            Location,
            header::{Namespaced, Setting},
        };
        let from_loc = Location::Other("From<RecoveryKind>".to_string());
        Ok(match rk {
            RecoveryKind::CPCTPlus => Value::Setting(Setting::Unitary(Namespaced {
                namespace: Some(("RecoveryKind".to_string(), from_loc.clone())),
                member: ("CPCTPlus".to_string(), from_loc.clone()),
            })),
            RecoveryKind::None => Value::Setting(Setting::Unitary(Namespaced {
                namespace: Some(("RecoveryKind".to_string(), from_loc.clone())),
                member: ("None".to_string(), from_loc.clone()),
            })),
        })
    }
}

impl TryFrom<&Value<Location>> for RecoveryKind {
    type Error = cfgrammar::header::HeaderError<Location>;
    fn try_from(rk: &Value<Location>) -> Result<RecoveryKind, Self::Error> {
        use cfgrammar::header::{HeaderError, HeaderErrorKind, Namespaced, Setting};

        match rk {
            Value::Flag(_, loc) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "RecoveryKind",
                    "Cannot convert boolean to RecoveryKind",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Num(_, loc)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "RecoveryKind",
                    "Cannot convert number to RecoveryKind",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::String(_, loc)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "RecoveryKind",
                    "Cannot convert string to RecoveryKind",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Unitary(Namespaced {
                namespace,
                member: (kind, kind_loc),
            })) => {
                match namespace {
                    Some((ns, loc)) if ns.to_lowercase() != "recoverykind" => {
                        return Err(HeaderError {
                            kind: HeaderErrorKind::ConversionError(
                                "RecoveryKind",
                                "Unknown namespace",
                            ),
                            locations: vec![loc.clone()],
                        });
                    }
                    _ => {}
                }
                match kind.to_lowercase().as_ref() {
                    "cpctplus" => Ok(RecoveryKind::CPCTPlus),
                    "none" => Ok(RecoveryKind::None),
                    _ => Err(HeaderError {
                        kind: HeaderErrorKind::ConversionError("RecoveryKind", "Unknown variant"),
                        locations: vec![kind_loc.clone()],
                    }),
                }
            }
            Value::Setting(Setting::Constructor {
                ctor: _,
                arg:
                    Namespaced {
                        namespace: _,
                        member: (_, arg_loc),
                    },
            }) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "RecoveryKind",
                    "Cannot convert constructor to RecoveryKind",
                ),
                locations: vec![arg_loc.clone()],
            }),
            Value::Setting(Setting::Array(_, arr_loc, _)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "RecoveryKind",
                    "Cannot convert array to RecoveryKind",
                ),
                locations: vec![arr_loc.clone()],
            }),
        }
    }
}

impl quote::ToTokens for RecoveryKind {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        tokens.extend(match *self {
            RecoveryKind::CPCTPlus => quote!(::lrpar::RecoveryKind::CPCTPlus),
            RecoveryKind::None => quote!(::lrpar::RecoveryKind::None),
        })
    }
}

/// A lexing or parsing error. Although the two are quite distinct in terms of what can be reported
/// to users, both can (at least conceptually) occur at any point of the intertwined lexing/parsing
/// process.
#[derive(Debug)]
pub enum LexParseError<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> where
    usize: AsPrimitive<StorageT>,
{
    LexError(LexerTypesT::LexErrorT),
    ParseError(ParseError<LexerTypesT::LexemeT, StorageT>),
}

impl<StorageT: Debug + Hash + PrimInt + Unsigned, LexerTypesT: LexerTypes<StorageT = StorageT>>
    LexParseError<StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    /// A pretty-printer of a lexer/parser error. This isn't suitable for all purposes, but it's
    /// often good enough. The output format is not guaranteed to be stable but is likely to be of
    /// the form:
    ///
    /// ```text
    /// Parsing error at line 3 column 8. Repair sequences found:
    ///   1: Insert ID
    ///   2: Delete +, Shift 3
    /// ```
    ///
    /// If you are using the compile-time parse system, your `grm_y` module exposes a suitable
    /// [`epp`](../ctbuilder/struct.CTParserBuilder.html#method.process_file) function; if you are
    /// using the run-time system,
    /// [`YaccGrammar`](../../cfgrammar/yacc/grammar/struct.YaccGrammar.html) exposes a suitable
    /// [`epp`](../../cfgrammar/yacc/grammar/struct.YaccGrammar.html#method.token_epp) function,
    /// though you will have to wrap it in a closure e.g. `&|t| grm.token_epp(t)`.
    pub fn pp<'a>(
        &self,
        lexer: &dyn NonStreamingLexer<LexerTypesT>,
        epp: &dyn Fn(TIdx<StorageT>) -> Option<&'a str>,
    ) -> String {
        match self {
            LexParseError::LexError(e) => {
                let ((line, col), _) = lexer.line_col(e.span());
                format!("Lexing error at line {} column {}.", line, col)
            }
            LexParseError::ParseError(e) => {
                let ((line, col), _) = lexer.line_col(e.lexeme().span());
                let mut out = format!("Parsing error at line {} column {}.", line, col);
                let repairs_len = e.repairs().len();
                if repairs_len == 0 {
                    out.push_str(" No repair sequences found.");
                } else {
                    out.push_str(" Repair sequences found:");
                    for (i, rs) in e.repairs().iter().enumerate() {
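                        // Right-align the repair sequence numbers: pad each index by the
                        // difference in digit count between it and the largest index.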
                        let padding = ((repairs_len as f64).log10() as usize)
                            - (((i + 1) as f64).log10() as usize)
                            + 1;
                        write!(out, "\n  {}{}: ", " ".repeat(padding), i + 1).ok();
                        let mut rs_out = Vec::new();

                        // Merge together Deletes iff they are consecutive (if they are separated
                        // by even a single character, they will not be merged).
                        let mut i = 0;
                        while i < rs.len() {
                            match rs[i] {
                                ParseRepair::Delete(l) => {
                                    let mut j = i + 1;
                                    let mut last_end = l.span().end();
                                    while j < rs.len() {
                                        if let ParseRepair::Delete(next_l) = rs[j] {
                                            if next_l.span().start() == last_end {
                                                last_end = next_l.span().end();
                                                j += 1;
                                                continue;
                                            }
                                        }
                                        break;
                                    }
                                    let t = &lexer
                                        .span_str(Span::new(l.span().start(), last_end))
                                        .replace('\n', "\\n");
                                    rs_out.push(format!("Delete {}", t));
                                    i = j;
                                }
                                ParseRepair::Insert(tidx) => {
                                    rs_out.push(format!("Insert {}", epp(tidx).unwrap()));
                                    i += 1;
                                }
                                ParseRepair::Shift(l) => {
                                    let t = &lexer.span_str(l.span()).replace('\n', "\\n");
                                    rs_out.push(format!("Shift {}", t));
                                    i += 1;
                                }
                            }
                        }

                        out.push_str(&rs_out.join(", "));
                    }
                }
                out
            }
        }
    }
}

impl<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> fmt::Display for LexParseError<StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            LexParseError::LexError(ref e) => Display::fmt(e, f),
            LexParseError::ParseError(ref e) => Display::fmt(e, f),
        }
    }
}

impl<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> Error for LexParseError<StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
}

impl<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT, LexErrorT = T>,
    T: LexError,
> From<T> for LexParseError<StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    fn from(err: T) -> LexParseError<StorageT, LexerTypesT> {
        LexParseError::LexError(err)
    }
}

impl<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> From<ParseError<LexerTypesT::LexemeT, StorageT>> for LexParseError<StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    fn from(
        err: ParseError<LexerTypesT::LexemeT, StorageT>,
    ) -> LexParseError<StorageT, LexerTypesT> {
        LexParseError::ParseError(err)
    }
}

/// A run-time parser builder.
pub struct RTParserBuilder<
    'a,
    StorageT: 'static + Eq + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> where
    usize: AsPrimitive<StorageT>,
{
    grm: &'a YaccGrammar<StorageT>,
    stable: &'a StateTable<StorageT>,
    recoverer: RecoveryKind,
    term_costs: &'a dyn Fn(TIdx<StorageT>) -> u8,
    phantom: PhantomData<(StorageT, LexerTypesT)>,
}

impl<
    'a,
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> RTParserBuilder<'a, StorageT, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    /// Create a new run-time parser from a `YaccGrammar` and a `StateTable`.
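    ///
    /// A typical construction, mirroring this module's tests (`from_yacc` and `Minimiser` come
    /// from the `lrtable` crate; `grammar_src` is assumed to be a Yacc grammar string):
    ///
    /// ```text
    /// let grm = str::parse::<YaccGrammar<u16>>(grammar_src)?;
    /// let (_, stable) = lrtable::from_yacc(&grm, lrtable::Minimiser::Pager)?;
    /// let pb = RTParserBuilder::new(&grm, &stable);
    /// ```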
    pub fn new(grm: &'a YaccGrammar<StorageT>, stable: &'a StateTable<StorageT>) -> Self {
        RTParserBuilder {
            grm,
            stable,
            recoverer: RecoveryKind::CPCTPlus,
            term_costs: &|_| 1,
            phantom: PhantomData,
        }
    }

    /// Set the recoverer for this parser to `rk`.
    pub fn recoverer(mut self, rk: RecoveryKind) -> Self {
        self.recoverer = rk;
        self
    }

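    /// Set the function used to compute a token's cost (used when ranking repair sequences
    /// during error recovery). Every token's cost must be greater than zero: this is asserted
    /// when parsing starts.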
    pub fn term_costs(mut self, f: &'a dyn Fn(TIdx<StorageT>) -> u8) -> Self {
        self.term_costs = f;
        self
    }

    #[deprecated(
        since = "0.14.0",
        note = "Use `parse_map` to return a `Node` from your `lrpar_mod!` instead"
    )]
    #[allow(deprecated)]
    /// Parse input, and (if possible) return a generic parse tree. See the arguments for
    /// [`parse_actions`](#method.parse_actions) for more details about the return value.
    pub fn parse_generictree(
        &self,
        lexer: &dyn NonStreamingLexer<LexerTypesT>,
    ) -> (
        Option<Node<LexerTypesT::LexemeT, StorageT>>,
        Vec<LexParseError<StorageT, LexerTypesT>>,
    ) {
        self.parse_map(lexer, &|lexeme| Node::Term { lexeme }, &|ridx, nodes| {
            Node::Nonterm { ridx, nodes }
        })
    }

    /// Parse input and (if possible) return a value built by calling `fterm` on each terminal
    /// and `fnonterm` on each nonterminal. See the arguments for
    /// [`parse_actions`](#method.parse_actions) for more details about the return value.
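    ///
    /// A minimal sketch that counts the nodes in the parse tree (assuming `pb` is an
    /// `RTParserBuilder` and `lexer` a `NonStreamingLexer` for this grammar's lexemes):
    ///
    /// ```text
    /// let (count, errs) = pb.parse_map(&lexer, &|_| 1usize, &|_, nodes| {
    ///     1 + nodes.iter().sum::<usize>()
    /// });
    /// ```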
    pub fn parse_map<Node>(
        &self,
        lexer: &dyn NonStreamingLexer<LexerTypesT>,
        fterm: &dyn Fn(LexerTypesT::LexemeT) -> Node,
        fnonterm: &dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
    ) -> (Option<Node>, Vec<LexParseError<StorageT, LexerTypesT>>) {
        let mut lexemes = vec![];
        for e in lexer.iter().collect::<Vec<_>>() {
            match e {
                Ok(l) => lexemes.push(l),
                Err(e) => return (None, vec![e.into()]),
            }
        }
        Parser::<
            StorageT,
            LexerTypesT,
            Node,
            (
                &dyn Fn(LexerTypesT::LexemeT) -> Node,
                &dyn Fn(RIdx<StorageT>, Vec<Node>) -> Node,
            ),
        >::parse_map(
            self.recoverer,
            self.grm,
            self.term_costs,
            self.stable,
            lexer,
            lexemes,
            fterm,
            fnonterm,
        )
    }

    #[deprecated(since = "0.14.0", note = "Use `parse_map` instead")]
    /// Parse input, returning any errors found. See the arguments for
    /// [`parse_actions`](#method.parse_actions) for more details about the return value.
    pub fn parse_noaction(
        &self,
        lexer: &dyn NonStreamingLexer<LexerTypesT>,
    ) -> Vec<LexParseError<StorageT, LexerTypesT>> {
        self.parse_map(lexer, &|_| (), &|_, _| ()).1
    }

    /// Parse input, execute actions, and return the associated value (if possible) and/or any
    /// lexing/parsing errors encountered. Note that the two parts of the (value, errors) return
    /// pair are entirely independent: one can encounter errors without a value being produced
    /// (`None, [...]`), errors and a value (`Some(...), [...]`), as well as a value and no errors
    /// (`Some(...), []`). Errors are sorted by the position they were found in the input and can
    /// be a mix of lexing and parsing errors.
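    ///
    /// A sketch of handling the return pair (the `actions` slice and `param` would normally come
    /// from generated code, so both are assumptions here):
    ///
    /// ```text
    /// let (res, errs) = pb.parse_actions(lexer, actions, param);
    /// for e in &errs {
    ///     eprintln!("{}", e.pp(lexer, &|t| grm.token_epp(t)));
    /// }
    /// if let Some(v) = res { /* the parse succeeded, possibly after repairs */ }
    /// ```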
    pub fn parse_actions<'b: 'a, 'input: 'b, ActionT: 'a, ParamT: Clone>(
        &self,
        lexer: &'b dyn NonStreamingLexer<'input, LexerTypesT>,
        actions: &'a [ActionFn<'a, 'b, 'input, StorageT, LexerTypesT, ActionT, ParamT>],
        param: ParamT,
    ) -> (Option<ActionT>, Vec<LexParseError<StorageT, LexerTypesT>>) {
        let mut lexemes = vec![];
        for e in lexer.iter().collect::<Vec<_>>() {
            match e {
                Ok(l) => lexemes.push(l),
                Err(e) => return (None, vec![e.into()]),
            }
        }
        Parser::parse_actions(
            self.recoverer,
            self.grm,
            self.term_costs,
            self.stable,
            lexer,
            lexemes,
            actions,
            param,
        )
    }

    pub fn grammar(&self) -> &YaccGrammar<StorageT> {
        self.grm
    }
}

/// After a parse error is encountered, the parser attempts to find a way of recovering. Each entry
/// in the sequence of repairs is represented by a `ParseRepair`.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum ParseRepair<LexemeT: Lexeme<StorageT>, StorageT: Hash> {
    /// Insert a `Symbol::Token`.
    Insert(TIdx<StorageT>),
    /// Delete a symbol.
    Delete(LexemeT),
    /// Shift a symbol.
    Shift(LexemeT),
}

/// Records a single parse error.
#[derive(Clone, Debug, PartialEq)]
pub struct ParseError<LexemeT: Lexeme<StorageT>, StorageT: Hash> {
    stidx: StIdx<StorageT>,
    lexeme: LexemeT,
    repairs: Vec<Vec<ParseRepair<LexemeT, StorageT>>>,
}

impl<LexemeT: Lexeme<StorageT>, StorageT: Debug + Hash> Display for ParseError<LexemeT, StorageT> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Parse error at lexeme {:?}", self.lexeme)
    }
}

impl<LexemeT: Lexeme<StorageT>, StorageT: Debug + Hash> Error for ParseError<LexemeT, StorageT> {}

impl<LexemeT: Lexeme<StorageT>, StorageT: Hash + PrimInt + Unsigned> ParseError<LexemeT, StorageT> {
    /// Return the state table index where this error was detected.
    pub fn stidx(&self) -> StIdx<StorageT> {
        self.stidx
    }

    /// Return the lexeme where this error was detected.
    pub fn lexeme(&self) -> &LexemeT {
        &self.lexeme
    }

    /// Return the repairs found that would fix this error. Note that there are an infinite
    /// number of possible repairs for any error, so this is by definition a (finite) subset.
    pub fn repairs(&self) -> &Vec<Vec<ParseRepair<LexemeT, StorageT>>> {
        &self.repairs
    }
}

#[cfg(test)]
pub(crate) mod test {
    use std::collections::HashMap;

    use cfgrammar::{
        Span,
        yacc::{YaccGrammar, YaccKind, YaccOriginalActionKind},
    };
    use lrtable::{Minimiser, from_yacc};
    use num_traits::ToPrimitive;
    use regex::Regex;

    use super::*;
    use crate::{
        Lexeme, Lexer,
        test_utils::{TestLexError, TestLexeme, TestLexerTypes},
    };

    #[allow(deprecated)]
    pub(crate) fn do_parse<'input>(
        rcvry_kind: RecoveryKind,
        lexs: &str,
        grms: &str,
        input: &'input str,
    ) -> (
        YaccGrammar<u16>,
        SmallLexer<'input>,
        Result<
            Node<TestLexeme, u16>,
            (
                Option<Node<TestLexeme, u16>>,
                Vec<LexParseError<u16, TestLexerTypes>>,
            ),
        >,
    ) {
        do_parse_with_costs(rcvry_kind, lexs, grms, input, &HashMap::new())
    }

    #[allow(deprecated)]
    fn do_parse_with_costs<'input>(
        rcvry_kind: RecoveryKind,
        lexs: &str,
        grms: &str,
        input: &'input str,
        costs: &HashMap<&str, u8>,
    ) -> (
        YaccGrammar<u16>,
        SmallLexer<'input>,
        Result<
            Node<TestLexeme, u16>,
            (
                Option<Node<TestLexeme, u16>>,
                Vec<LexParseError<u16, TestLexerTypes>>,
            ),
        >,
    ) {
        let grm = YaccGrammar::<u16>::new_with_storaget(
            YaccKind::Original(YaccOriginalActionKind::GenericParseTree),
            grms,
        )
        .unwrap();
        let (_, stable) = from_yacc(&grm, Minimiser::Pager).unwrap();
        let rule_ids = grm
            .tokens_map()
            .iter()
            .map(|(&n, &i)| (n.to_owned(), u32::from(i).to_u16().unwrap()))
            .collect();
        let lexer_rules = small_lexer(lexs, rule_ids);
        let lexemes = small_lex(lexer_rules, input);
        let lexer = SmallLexer { lexemes, s: input };
        let costs_tidx = costs
            .iter()
            .map(|(k, v)| (grm.token_idx(k).unwrap(), v))
            .collect::<HashMap<_, _>>();
        let (r, errs) = RTParserBuilder::new(&grm, &stable)
            .recoverer(rcvry_kind)
            .term_costs(&|tidx| **costs_tidx.get(&tidx).unwrap_or(&&1))
            .parse_generictree(&lexer);
        if let Some(node) = r {
            if errs.is_empty() {
                (grm, lexer, Ok(node))
            } else {
                (grm, lexer, Err((Some(node), errs)))
            }
        } else {
            (grm, lexer, Err((None, errs)))
        }
    }

    fn check_parse_output(lexs: &str, grms: &str, input: &str, expected: &str) {
        let (grm, _, pt) = do_parse(RecoveryKind::CPCTPlus, lexs, grms, input);
        assert_eq!(expected, pt.unwrap().pp(&grm, input));
    }

    // SmallLexer is our highly simplified version of lrlex (which saves us from making lrlex a
    // dependency of lrpar). The format is the same as lrlex *except*:
    //   * The initial "%%" isn't needed, and only "'" is valid as a rule name delimiter.
    //   * "Unnamed" rules aren't allowed (e.g. you can't have a rule which discards whitespace).
    pub(crate) struct SmallLexer<'input> {
        lexemes: Vec<TestLexeme>,
        s: &'input str,
    }

    impl Lexer<TestLexerTypes> for SmallLexer<'_> {
        fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = Result<TestLexeme, TestLexError>> + 'a> {
            Box::new(self.lexemes.iter().map(|x| Ok(*x)))
        }
    }

    impl<'input> NonStreamingLexer<'input, TestLexerTypes> for SmallLexer<'input> {
        fn span_str(&self, span: Span) -> &'input str {
            &self.s[span.start()..span.end()]
        }

        fn span_lines_str(&self, _: Span) -> &'input str {
            unreachable!();
        }

        fn line_col(&self, _: Span) -> ((usize, usize), (usize, usize)) {
            unreachable!();
        }
    }

    fn small_lexer(lexs: &str, ids_map: HashMap<String, u16>) -> Vec<(u16, Regex)> {
        let mut rules = Vec::new();
        for l in lexs.split('\n').map(|x| x.trim()).filter(|x| !x.is_empty()) {
            assert!(l.rfind('\'') == Some(l.len() - 1));
            let i = l[..l.len() - 1].rfind('\'').unwrap();
            let name = &l[i + 1..l.len() - 1];
            let re = &l[..i - 1].trim();
            rules.push((
                ids_map[name],
                Regex::new(&format!("\\A(?:{})", re)).unwrap(),
            ));
        }
        rules
    }

    fn small_lex(rules: Vec<(u16, Regex)>, input: &str) -> Vec<TestLexeme> {
        let mut lexemes = vec![];
        let mut i = 0;
        while i < input.len() {
            let mut longest = 0; // Length of the longest match
            let mut longest_tok_id = 0; // This is only valid iff longest != 0
            for (tok_id, r) in rules.iter() {
                if let Some(m) = r.find(&input[i..]) {
                    let len = m.end();
                    if len > longest {
                        longest = len;
                        longest_tok_id = *tok_id;
                    }
                }
            }
            assert!(longest > 0);
            lexemes.push(Lexeme::new(longest_tok_id, i, longest));
            i += longest;
        }
        lexemes
    }

    #[test]
    fn simple_parse() {
        // From p4 of https://www.cs.umd.edu/class/spring2014/cmsc430/lectures/lec07.pdf
        check_parse_output(
            "[a-zA-Z_] 'ID'
             \\+ '+'",
            "
%start E
%%
E: T '+' E
 | T;
T: 'ID';
",
            "a+b",
            "E
 T
  ID a
 + +
 E
  T
   ID b
",
        );
    }

    #[test]
    fn parse_empty_rules() {
        let lexs = "[a-zA-Z_] 'ID'";
        let grms = "%start S
%%
S: L;
L: 'ID'
 | ;
";
        check_parse_output(
            lexs, grms, "", "S
 L
",
        );

        check_parse_output(
            lexs,
            grms,
            "x",
            "S
 L
  ID x
",
        );
    }

    #[test]
    fn recursive_parse() {
        let lexs = "\\+ '+'
                    \\* '*'
                    [0-9]+ 'INT'";
        let grms = "%start Expr
%%
Expr : Expr '+' Term | Term;
Term : Term '*' Factor | Factor;
Factor : 'INT';";

        check_parse_output(
            lexs,
            grms,
            "2+3*4",
            "Expr
 Expr
  Term
   Factor
    INT 2
 + +
 Term
  Term
   Factor
    INT 3
  * *
  Factor
   INT 4
",
        );
        check_parse_output(
            lexs,
            grms,
            "2*3+4",
            "Expr
 Expr
  Term
   Term
    Factor
     INT 2
   * *
   Factor
    INT 3
 + +
 Term
  Factor
   INT 4
",
        );
    }

    #[test]
    fn parse_error() {
        let lexs = "\\( '('
                    \\) ')'
                    [a-zA-Z_][a-zA-Z_0-9]* 'ID'";
        let grms = "%start Call
%%
Call: 'ID' '(' ')';";

        check_parse_output(
            lexs,
            grms,
            "f()",
            "Call
 ID f
 ( (
 ) )
",
        );

        let (grm, _, pr) = do_parse(RecoveryKind::CPCTPlus, lexs, grms, "f(");
        let (_, errs) = pr.unwrap_err();
        assert_eq!(errs.len(), 1);
        let err_tok_id = usize::from(grm.eof_token_idx()).to_u16().unwrap();
        match &errs[0] {
            LexParseError::ParseError(e) => {
                assert_eq!(e.lexeme(), &Lexeme::new_faulty(err_tok_id, 2, 0));
                assert!(e.lexeme().faulty());
            }
            _ => unreachable!(),
        }

        let (grm, _, pr) = do_parse(RecoveryKind::CPCTPlus, lexs, grms, "f(f(");
        let (_, errs) = pr.unwrap_err();
        assert_eq!(errs.len(), 1);
        let err_tok_id = usize::from(grm.token_idx("ID").unwrap()).to_u16().unwrap();
        match &errs[0] {
            LexParseError::ParseError(e) => {
                assert_eq!(e.lexeme(), &Lexeme::new(err_tok_id, 2, 1));
                assert!(!e.lexeme().faulty());
            }
            _ => unreachable!(),
        }
    }

    #[test]
    fn test_parse_map() {
        #[derive(PartialEq, Debug)]
        enum TestParseMap<'a> {
            Term(&'a str, &'a str),
            NonTerm(&'a str, Vec<TestParseMap<'a>>),
        }
        let lex_src = r#"[0-9]+ 'INT'
\+ '+'
\* '*'
"#;
        let grammar_src = "
%grmtools{YaccKind: Original(NoAction)}
%start Expr
%%
Expr : Expr '+' Term | Term;
Term : Term '*' Factor | Factor;
Factor : 'INT';";
        let input_src = "2*3+4";
        let grm = str::parse::<YaccGrammar<u16>>(grammar_src).unwrap();
        let (_, stable) = lrtable::from_yacc(&grm, lrtable::Minimiser::Pager).unwrap();
        let rt_parser = RTParserBuilder::new(&grm, &stable);
        let rule_ids = grm
            .tokens_map()
            .iter()
            .map(|(&n, &i)| (n.to_owned(), u32::from(i).to_u16().unwrap()))
            .collect();
        let lexer_rules = small_lexer(lex_src, rule_ids);
        let lexemes = small_lex(lexer_rules, input_src);
        let lexer = SmallLexer {
            lexemes,
            s: input_src,
        };

        let (parse_map, errs) = rt_parser.parse_map(
            &lexer,
            &|lexeme: TestLexeme| {
                let tidx = TIdx(lexeme.tok_id());
                let tn = &grm.token_name(tidx).unwrap();
                let lt = &input_src[lexeme.span().start()..lexeme.span().end()];
                TestParseMap::Term(tn, lt)
            },
            &|ridx, nodes| {
                let rule_name = &grm.rule_name_str(ridx);
                TestParseMap::NonTerm(rule_name, nodes)
            },
        );
        assert!(parse_map.is_some() && errs.is_empty());
        // Sanity check the `parse_generictree` output.
        check_parse_output(
            lex_src,
            grammar_src,
            input_src,
            "Expr
 Expr
  Term
   Term
    Factor
     INT 2
   * *
   Factor
    INT 3
 + +
 Term
  Factor
   INT 4
",
        );

        let expected_parse_map = {
            use TestParseMap::*;
            NonTerm(
                "Expr",
                vec![
                    NonTerm(
                        "Expr",
                        vec![NonTerm(
                            "Term",
                            vec![
                                NonTerm("Term", vec![NonTerm("Factor", vec![Term("INT", "2")])]),
                                Term("*", "*"),
                                NonTerm("Factor", vec![Term("INT", "3")]),
                            ],
                        )],
                    ),
                    Term("+", "+"),
                    NonTerm("Term", vec![NonTerm("Factor", vec![Term("INT", "4")])]),
                ],
            )
        };
        assert_eq!(parse_map, Some(expected_parse_map));
    }
}