1use std::{
2 collections::{HashMap, HashSet},
3 fmt::Debug,
4 hash::Hash,
5 marker::PhantomData,
6 slice::Iter,
7 str::FromStr,
8};
9
10use cfgrammar::{
11 header::{GrmtoolsSectionParser, Header, HeaderError, HeaderErrorKind, HeaderValue, Value},
12 span::Location,
13 NewlineCache, Span,
14};
15use num_traits::{AsPrimitive, PrimInt, Unsigned};
16use regex::{Regex, RegexBuilder};
17
18use lrpar::{Lexeme, Lexer, LexerTypes, NonStreamingLexer};
19
20use crate::{
21 parser::{LexParser, StartState, StartStateOperation},
22 LRLexError, LexBuildError, LexBuildResult, StartStateId,
23};
24
#[doc(hidden)]
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct LexFlags {
    // Flags controlling how rule regexes are compiled (see `Rule::new`).
    // `None` means "unspecified": `Rule::new` requires the first three below
    // to be `Some` and `unwrap`s them.
    pub dot_matches_new_line: Option<bool>,
    pub multi_line: Option<bool>,
    pub octal: Option<bool>,
    // NOTE(review): `posix_escapes` and `allow_wholeline_comments` are not
    // referenced in `Rule::new`; presumably consumed by `LexParser` when
    // interpreting the lex source — confirm against the parser module.
    pub posix_escapes: Option<bool>,
    pub allow_wholeline_comments: Option<bool>,

    // Optional settings forwarded verbatim to `regex::RegexBuilder` when
    // `Some`; when `None` the regex crate's defaults are left in place.
    pub case_insensitive: Option<bool>,
    pub swap_greed: Option<bool>,
    pub ignore_whitespace: Option<bool>,
    pub unicode: Option<bool>,
    pub size_limit: Option<usize>,
    pub dfa_size_limit: Option<usize>,
    pub nest_limit: Option<u32>,
}
46
impl<T: Clone> TryFrom<&mut Header<T>> for LexFlags {
    type Error = HeaderError<T>;
    /// Extract `LexFlags` from a parsed `%grmtools` header section.
    ///
    /// Starts from `UNSPECIFIED_LEX_FLAGS` and fills in every flag which has
    /// an entry in `header`, marking each looked-up key as used. Returns an
    /// error if a key is present but has the wrong kind of value (e.g. a
    /// numeric value where a boolean flag is expected).
    fn try_from(header: &mut Header<T>) -> Result<LexFlags, HeaderError<T>> {
        use cfgrammar::header::Setting;
        let mut lex_flags = UNSPECIFIED_LEX_FLAGS;
        // Destructure into per-field `&mut` bindings so the macros below can
        // write each flag by its field name.
        let LexFlags {
            dot_matches_new_line,
            multi_line,
            octal,
            posix_escapes,
            allow_wholeline_comments,
            case_insensitive,
            swap_greed,
            ignore_whitespace,
            unicode,
            size_limit,
            dfa_size_limit,
            nest_limit,
        } = &mut lex_flags;
        // Convert a boolean flag: missing keys stay `None`; non-flag values
        // are a conversion error.
        macro_rules! cvt_flag {
            ($it:ident) => {
                header.mark_used(&stringify!($it).to_string());
                *$it = match header.get(stringify!($it)) {
                    Some(HeaderValue(_, Value::Flag(flag, _))) => Some(*flag),
                    Some(HeaderValue(loc, _)) => Err(HeaderError {
                        kind: HeaderErrorKind::ConversionError("LexFlags", "Expected boolean"),
                        locations: vec![loc.clone()],
                    })?,
                    None => None,
                }
            };
        }
        cvt_flag!(dot_matches_new_line);
        cvt_flag!(multi_line);
        cvt_flag!(octal);
        cvt_flag!(posix_escapes);
        cvt_flag!(allow_wholeline_comments);
        cvt_flag!(case_insensitive);
        cvt_flag!(swap_greed);
        cvt_flag!(ignore_whitespace);
        cvt_flag!(unicode);
        // Convert a numeric setting to the field's integer type.
        // NOTE(review): the `as $num_ty` cast truncates silently if the
        // header value exceeds the target type's range — confirm whether
        // range-checking is wanted here.
        macro_rules! cvt_num {
            ($it:ident, $num_ty: ty) => {
                header.mark_used(&stringify!($it).to_string());
                *$it = match header.get(stringify!($it)) {
                    Some(HeaderValue(_, Value::Setting(Setting::Num(n, _)))) => Some(*n as $num_ty),
                    Some(HeaderValue(loc, _)) => Err(HeaderError {
                        kind: HeaderErrorKind::ConversionError("LexFlags", "Expected numeric"),
                        locations: vec![loc.clone()],
                    })?,
                    None => None,
                }
            };
        }
        cvt_num!(size_limit, usize);
        cvt_num!(dfa_size_limit, usize);
        cvt_num!(nest_limit, u32);
        Ok(lex_flags)
    }
}
107
108impl From<&LexFlags> for Header<Location> {
109 fn from(flags: &LexFlags) -> Header<Location> {
110 let mut header = Header::new();
111 let LexFlags {
112 dot_matches_new_line,
113 multi_line,
114 octal,
115 posix_escapes,
116 allow_wholeline_comments,
117 case_insensitive,
118 swap_greed,
119 ignore_whitespace,
120 unicode,
121 size_limit,
122 dfa_size_limit,
123 nest_limit,
124 } = flags;
125 macro_rules! cvt_flag {
126 ($it: ident) => {
127 $it.map(|x| {
128 header.insert(
129 stringify!($it).to_string(),
130 HeaderValue(
131 Location::Other("From<&LexFlags".to_string()),
132 Value::Flag(x, Location::Other("From<&LexFlags>".to_string())),
133 ),
134 )
135 });
136 };
137 }
138 cvt_flag!(dot_matches_new_line);
139 cvt_flag!(multi_line);
140 cvt_flag!(octal);
141 cvt_flag!(posix_escapes);
142 cvt_flag!(allow_wholeline_comments);
143 cvt_flag!(case_insensitive);
144 cvt_flag!(swap_greed);
145 cvt_flag!(ignore_whitespace);
146 cvt_flag!(unicode);
147
148 macro_rules! cvt_num {
149 ($it: ident) => {
150 $it.map(|x| {
151 use cfgrammar::header::Setting;
152 header.insert(
153 stringify!($it).to_string(),
154 HeaderValue(
155 Location::Other("From<&LexFlags".to_string()),
156 Value::Setting(Setting::Num(
157 x as u64,
158 Location::Other("From<&LexFlags>".to_string()),
159 )),
160 ),
161 )
162 });
163 };
164 }
165 cvt_num!(size_limit);
166 cvt_num!(dfa_size_limit);
167 cvt_num!(nest_limit);
168
169 header
170 }
171}
172
/// The flag values lrlex uses when the lex source does not specify them.
/// The regex-crate-specific settings are left `None` so that the regex
/// crate's own defaults apply.
pub const DEFAULT_LEX_FLAGS: LexFlags = LexFlags {
    allow_wholeline_comments: Some(false),
    dot_matches_new_line: Some(true),
    multi_line: Some(true),
    octal: Some(true),
    posix_escapes: Some(false),
    case_insensitive: None,
    ignore_whitespace: None,
    swap_greed: None,
    unicode: None,
    size_limit: None,
    dfa_size_limit: None,
    nest_limit: None,
};
188
/// A `LexFlags` value with every flag left unspecified; used as the starting
/// point when flags are read from a `%grmtools` header.
pub const UNSPECIFIED_LEX_FLAGS: LexFlags = LexFlags {
    allow_wholeline_comments: None,
    dot_matches_new_line: None,
    multi_line: None,
    octal: None,
    posix_escapes: None,
    case_insensitive: None,
    ignore_whitespace: None,
    swap_greed: None,
    unicode: None,
    size_limit: None,
    dfa_size_limit: None,
    nest_limit: None,
};
204
#[derive(Debug)]
#[doc(hidden)]
pub struct Rule<StorageT> {
    // Token ID produced when this rule matches. `None` either because the
    // rule is anonymous or because `set_rule_ids*` found no ID for its name.
    pub(super) tok_id: Option<StorageT>,
    #[deprecated(note = "Use the name() function")]
    pub name: Option<String>,
    #[deprecated(note = "Use the name_span() function")]
    pub name_span: Span,
    // The regex exactly as written in the lex source, without the
    // "\A(?:...)" anchoring wrapper added at compile time.
    pub(super) re_str: String,
    re: Regex,
    // Start-state IDs this rule is restricted to; empty means "all
    // non-exclusive states" (see `state_matches`).
    #[deprecated(note = "Use the start_states() function")]
    pub start_states: Vec<usize>,
    // Optional (state-id, operation) applied to the state stack on a match.
    #[deprecated(note = "Use the target_state() function")]
    pub target_state: Option<(usize, StartStateOperation)>,
}
232
233impl<StorageT: PrimInt> Rule<StorageT> {
234 #[doc(hidden)]
237 #[allow(private_interfaces)]
238 #[allow(clippy::too_many_arguments)]
239 pub fn new(
240 _: crate::unstable_api::InternalPublicApi,
241 tok_id: Option<StorageT>,
242 name: Option<String>,
243 name_span: Span,
244 re_str: String,
245 start_states: Vec<usize>,
246 target_state: Option<(usize, StartStateOperation)>,
247 lex_flags: &LexFlags,
248 ) -> Result<Rule<StorageT>, regex::Error> {
249 let mut re = RegexBuilder::new(&format!("\\A(?:{})", re_str));
250 let mut re = re
251 .octal(lex_flags.octal.unwrap())
252 .multi_line(lex_flags.multi_line.unwrap())
253 .dot_matches_new_line(lex_flags.dot_matches_new_line.unwrap());
254
255 if let Some(flag) = lex_flags.ignore_whitespace {
256 re = re.ignore_whitespace(flag)
257 }
258 if let Some(flag) = lex_flags.unicode {
259 re = re.unicode(flag)
260 }
261 if let Some(flag) = lex_flags.case_insensitive {
262 re = re.case_insensitive(flag)
263 }
264 if let Some(flag) = lex_flags.swap_greed {
265 re = re.swap_greed(flag)
266 }
267 if let Some(sz) = lex_flags.size_limit {
268 re = re.size_limit(sz)
269 }
270 if let Some(sz) = lex_flags.dfa_size_limit {
271 re = re.dfa_size_limit(sz)
272 }
273 if let Some(lim) = lex_flags.nest_limit {
274 re = re.nest_limit(lim)
275 }
276
277 let re = re.build()?;
278 #[allow(deprecated)]
279 Ok(Rule {
280 tok_id,
281 name,
282 name_span,
283 re_str,
284 re,
285 start_states,
286 target_state,
287 })
288 }
289
290 pub fn tok_id(&self) -> Option<StorageT> {
295 self.tok_id
296 }
297
298 pub fn name(&self) -> Option<&str> {
301 #[allow(deprecated)]
302 self.name.as_deref()
303 }
304
305 pub fn name_span(&self) -> Span {
307 #[allow(deprecated)]
308 self.name_span
309 }
310
311 pub fn re_str(&self) -> &str {
313 &self.re_str
314 }
315
316 pub fn start_states(&self) -> &[usize] {
318 #[allow(deprecated)]
319 self.start_states.as_slice()
320 }
321
322 pub fn target_state(&self) -> Option<(usize, StartStateOperation)> {
324 #[allow(deprecated)]
325 self.target_state.clone()
326 }
327}
328
/// The operations every lexer definition must support.
pub trait LexerDef<LexerTypesT: LexerTypes>
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
{
    /// Instantiate a definition directly from start states and rules,
    /// bypassing lex-source parsing.
    #[doc(hidden)]
    fn from_rules(start_states: Vec<StartState>, rules: Vec<Rule<LexerTypesT::StorageT>>) -> Self
    where
        Self: Sized;

    /// Instantiate a definition from the lex source `s`.
    fn from_str(s: &str) -> LexBuildResult<Self>
    where
        Self: Sized;

    /// Get the `Rule` at index `idx`, if one exists.
    fn get_rule(&self, idx: usize) -> Option<&Rule<LexerTypesT::StorageT>>;

    /// Get the `Rule` with token ID `tok_id`.
    // NOTE(review): the provided `LRNonStreamingLexerDef` impl `unwrap`s,
    // i.e. panics when no such rule exists — confirm this is the intended
    // contract for all implementations.
    fn get_rule_by_id(&self, tok_id: LexerTypesT::StorageT) -> &Rule<LexerTypesT::StorageT>;

    /// Get the `Rule` named `n`, if one exists.
    fn get_rule_by_name(&self, n: &str) -> Option<&Rule<LexerTypesT::StorageT>>;

    /// Set the token IDs of named rules from `rule_ids_map` (name -> ID).
    /// Returns `(missing_from_lexer, missing_from_parser)`: map names with no
    /// matching rule, and named rules absent from the map.
    fn set_rule_ids<'a>(
        &'a mut self,
        rule_ids_map: &HashMap<&'a str, LexerTypesT::StorageT>,
    ) -> (Option<HashSet<&'a str>>, Option<HashSet<&'a str>>);

    /// Like `set_rule_ids`, but each missing-from-parser entry also carries
    /// the span of the rule's name in the lex source.
    fn set_rule_ids_spanned<'a>(
        &'a mut self,
        rule_ids_map: &HashMap<&'a str, LexerTypesT::StorageT>,
    ) -> (Option<HashSet<&'a str>>, Option<HashSet<(&'a str, Span)>>);

    /// Iterate over all this definition's rules.
    fn iter_rules(&self) -> Iter<Rule<LexerTypesT::StorageT>>;

    /// Iterate over all this definition's start states.
    fn iter_start_states(&self) -> Iter<StartState>;
}
388
/// A lexer definition for non-streaming (whole-input-in-memory) lexing,
/// matching rules via the `regex` crate.
#[derive(Debug)]
pub struct LRNonStreamingLexerDef<LexerTypesT: LexerTypes>
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
{
    rules: Vec<Rule<LexerTypesT::StorageT>>,
    start_states: Vec<StartState>,
    // The flags this definition was built with; exposed via `lex_flags()`.
    lex_flags: LexFlags,
    phantom: PhantomData<LexerTypesT>,
}
401
402impl<LexerTypesT: LexerTypes> LexerDef<LexerTypesT> for LRNonStreamingLexerDef<LexerTypesT>
403where
404 usize: AsPrimitive<LexerTypesT::StorageT>,
405 LexerTypesT::StorageT: TryFrom<usize>,
406{
407 fn from_rules(
408 start_states: Vec<StartState>,
409 rules: Vec<Rule<LexerTypesT::StorageT>>,
410 ) -> LRNonStreamingLexerDef<LexerTypesT> {
411 LRNonStreamingLexerDef {
412 rules,
413 start_states,
414 lex_flags: DEFAULT_LEX_FLAGS,
415 phantom: PhantomData,
416 }
417 }
418
419 fn from_str(s: &str) -> LexBuildResult<LRNonStreamingLexerDef<LexerTypesT>> {
422 let (mut header, pos) = GrmtoolsSectionParser::new(s, false)
423 .parse()
424 .map_err(|mut errs| errs.drain(..).map(LexBuildError::from).collect::<Vec<_>>())?;
425 let flags = LexFlags::try_from(&mut header).map_err(|e| vec![e.into()])?;
426 LexParser::<LexerTypesT>::new_with_lex_flags(s[pos..].to_string(), flags.clone()).map(|p| {
427 LRNonStreamingLexerDef {
428 rules: p.rules,
429 start_states: p.start_states,
430 lex_flags: flags,
431 phantom: PhantomData,
432 }
433 })
434 }
435
436 fn get_rule(&self, idx: usize) -> Option<&Rule<LexerTypesT::StorageT>> {
437 self.rules.get(idx)
438 }
439
440 fn get_rule_by_id(&self, tok_id: LexerTypesT::StorageT) -> &Rule<LexerTypesT::StorageT> {
441 self.rules
442 .iter()
443 .find(|r| r.tok_id == Some(tok_id))
444 .unwrap()
445 }
446
447 fn get_rule_by_name(&self, n: &str) -> Option<&Rule<LexerTypesT::StorageT>> {
448 self.rules.iter().find(|r| r.name() == Some(n))
449 }
450
451 fn set_rule_ids<'a>(
452 &'a mut self,
453 rule_ids_map: &HashMap<&'a str, LexerTypesT::StorageT>,
454 ) -> (Option<HashSet<&'a str>>, Option<HashSet<&'a str>>) {
455 let (missing_from_parser, missing_from_lexer) = self.set_rule_ids_spanned(rule_ids_map);
456 let missing_from_lexer =
457 missing_from_lexer.map(|missing| missing.iter().map(|(name, _)| *name).collect());
458 (missing_from_parser, missing_from_lexer)
459 }
460
461 fn set_rule_ids_spanned<'a>(
462 &'a mut self,
463 rule_ids_map: &HashMap<&'a str, LexerTypesT::StorageT>,
464 ) -> (Option<HashSet<&'a str>>, Option<HashSet<(&'a str, Span)>>) {
465 let mut missing_from_parser_idxs = Vec::new();
472 let mut rules_with_names = 0;
473 for (i, r) in self.rules.iter_mut().enumerate() {
474 if let Some(n) = r.name() {
475 match rule_ids_map.get(n) {
476 Some(tok_id) => r.tok_id = Some(*tok_id),
477 None => {
478 r.tok_id = None;
479 missing_from_parser_idxs.push(i);
480 }
481 }
482 rules_with_names += 1;
483 }
484 }
485
486 let missing_from_parser = if missing_from_parser_idxs.is_empty() {
487 None
488 } else {
489 let mut mfp = HashSet::with_capacity(missing_from_parser_idxs.len());
490 for i in &missing_from_parser_idxs {
491 mfp.insert((self.rules[*i].name().unwrap(), self.rules[*i].name_span()));
492 }
493 Some(mfp)
494 };
495
496 let missing_from_lexer =
497 if rules_with_names - missing_from_parser_idxs.len() == rule_ids_map.len() {
498 None
499 } else {
500 Some(
501 rule_ids_map
502 .keys()
503 .cloned()
504 .collect::<HashSet<&str>>()
505 .difference(
506 &self
507 .rules
508 .iter()
509 .filter_map(|x| x.name())
510 .collect::<HashSet<&str>>(),
511 )
512 .cloned()
513 .collect::<HashSet<&str>>(),
514 )
515 };
516
517 (missing_from_lexer, missing_from_parser)
518 }
519
520 fn iter_rules(&self) -> Iter<Rule<LexerTypesT::StorageT>> {
521 self.rules.iter()
522 }
523
524 fn iter_start_states(&self) -> Iter<StartState> {
525 self.start_states.iter()
526 }
527}
528
impl<
        StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
        LexerTypesT: LexerTypes<StorageT = StorageT>,
    > LRNonStreamingLexerDef<LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
    LexerTypesT::StorageT: TryFrom<usize>,
{
    /// Build a definition from lex source `s` using the caller-supplied
    /// `lex_flags` instead of any flags found in the source's header.
    // NOTE(review): the `%grmtools` section is parsed only to find where the
    // rules begin (`pos`); its settings are discarded, and `parse()` is
    // `unwrap`ed — confirm callers guarantee a parseable header.
    pub fn new_with_options(
        s: &str,
        lex_flags: LexFlags,
    ) -> LexBuildResult<LRNonStreamingLexerDef<LexerTypesT>> {
        let (_, pos) = GrmtoolsSectionParser::new(s, false).parse().unwrap();
        LexParser::<LexerTypesT>::new_with_lex_flags(s[pos..].to_string(), lex_flags.clone()).map(
            |p| LRNonStreamingLexerDef {
                rules: p.rules,
                start_states: p.start_states,
                lex_flags,
                phantom: PhantomData,
            },
        )
    }

    /// Lex the whole of `s`, returning a lexer holding the resulting lexemes
    /// (and/or errors). Lexing stops at the first error; a matched rule with
    /// no name produces no lexeme (e.g. whitespace-skipping rules).
    pub fn lexer<'lexer, 'input: 'lexer>(
        &'lexer self,
        s: &'input str,
    ) -> LRNonStreamingLexer<'lexer, 'input, LexerTypesT> {
        let mut lexemes = vec![];
        let mut i = 0;
        // Stack of (repeat count, state): consecutive pushes of the same
        // state are collapsed into a counter rather than stacked.
        let mut state_stack: Vec<(usize, &StartState)> = Vec::new();
        // State 0 is the INITIAL state; if it is missing the definition is
        // unusable, so emit a single error lexeme and give up.
        let initial_state = match self.get_start_state_by_id(0) {
            None => {
                lexemes.push(Err(LRLexError::new(Span::new(i, i))));
                return LRNonStreamingLexer::new(s, lexemes, NewlineCache::from_str(s).unwrap());
            }
            Some(state) => state,
        };
        state_stack.push((1, initial_state));

        while i < s.len() {
            let old_i = i;
            // Find, among rules applicable in the current state, the one with
            // the longest match at position `old_i` (ties go to the earlier
            // rule, since only strictly longer matches replace the current
            // candidate).
            let mut longest = 0; let mut longest_ridx = 0; let current_state = match state_stack.last() {
                None => {
                    lexemes.push(Err(LRLexError::new(Span::new(i, i))));
                    return LRNonStreamingLexer::new(
                        s,
                        lexemes,
                        NewlineCache::from_str(s).unwrap(),
                    );
                }
                Some((_, s)) => s,
            };
            for (ridx, r) in self.iter_rules().enumerate() {
                if !Self::state_matches(current_state, r.start_states()) {
                    continue;
                }
                // Rule regexes are "\A"-anchored, so `find` can only match at
                // the start of the remaining input.
                if let Some(m) = r.re.find(&s[old_i..]) {
                    let len = m.end();
                    if len > longest {
                        longest = len;
                        longest_ridx = ridx;
                    }
                }
            }
            if longest > 0 {
                let r = self.get_rule(longest_ridx).unwrap();
                // Anonymous rules consume input but emit nothing; a named
                // rule without a token ID means `set_rule_ids` found no ID
                // for it, which is an error.
                if r.name().is_some() {
                    match r.tok_id {
                        Some(tok_id) => {
                            lexemes.push(Ok(Lexeme::new(tok_id, old_i, longest)));
                        }
                        None => {
                            lexemes.push(Err(LRLexError::new(Span::new(old_i, old_i))));
                            break;
                        }
                    }
                }
                // Apply the rule's start-state operation, if any.
                if let Some((target_state_id, op)) = &r.target_state() {
                    let state = match self.get_start_state_by_id(*target_state_id) {
                        None => {
                            lexemes.push(Err(LRLexError::new(Span::new(old_i, old_i))));
                            break;
                        }
                        Some(state) => state,
                    };
                    let head = state_stack.last_mut();
                    match op {
                        StartStateOperation::ReplaceStack => {
                            state_stack.clear();
                            state_stack.push((1, state));
                        }
                        // Pushing the state already on top just bumps its
                        // repeat counter.
                        StartStateOperation::Push => match head {
                            Some((count, s)) if s.id == state.id => *count += 1,
                            _ => state_stack.push((1, state)),
                        },
                        // Popping decrements the counter first; an empty
                        // stack falls back to the initial state.
                        StartStateOperation::Pop => match head {
                            Some((count, _s)) if *count > 1 => {
                                *count -= 1;
                            }
                            Some(_) => {
                                state_stack.pop();
                                if state_stack.is_empty() {
                                    state_stack.push((1, initial_state));
                                }
                            }
                            None => {
                                lexemes.push(Err(LRLexError::new(Span::new(old_i, old_i))));
                                break;
                            }
                        },
                    }
                }
                i += longest;
            } else {
                // No rule matched: report an error carrying the lexing state
                // so callers can diagnose state-dependent failures.
                lexemes.push(Err(LRLexError::new_with_lexing_state(
                    Span::new(old_i, old_i),
                    StartStateId::new(current_state.id),
                )));
                break;
            }
        }
        LRNonStreamingLexer::new(s, lexemes, NewlineCache::from_str(s).unwrap())
    }

    /// A rule with no explicit start states applies in every non-exclusive
    /// state; otherwise it applies only in the states it lists.
    fn state_matches(state: &StartState, rule_states: &[usize]) -> bool {
        if rule_states.is_empty() {
            !state.exclusive
        } else {
            rule_states.contains(&state.id)
        }
    }

    /// Linear search for the start state with the given ID.
    fn get_start_state_by_id(&self, id: usize) -> Option<&StartState> {
        self.start_states.iter().find(|state| state.id == id)
    }

    /// The flags this definition was built with.
    // NOTE(review): always returns `Some`; the `Option` wrapper may exist for
    // trait/API symmetry — confirm before simplifying.
    pub fn lex_flags(&self) -> Option<&LexFlags> {
        Some(&self.lex_flags)
    }
}
679
/// A lexer over an in-memory input string: holds the input, the precomputed
/// lexing results, and a newline cache for line/column lookups.
pub struct LRNonStreamingLexer<'lexer, 'input: 'lexer, LexerTypesT: LexerTypes>
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
    LexerTypesT::StorageT: 'static + Debug + PrimInt,
{
    s: &'input str,
    lexemes: Vec<Result<LexerTypesT::LexemeT, LRLexError>>,
    // Byte offsets of newlines in `s`, used by the span/line/col queries.
    newlines: NewlineCache,
    phantom: PhantomData<(&'lexer (), LexerTypesT::StorageT)>,
}
693
impl<
        'lexer,
        'input: 'lexer,
        StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
        LexerTypesT: LexerTypes<StorageT = StorageT>,
    > LRNonStreamingLexer<'lexer, 'input, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    /// Create a lexer over input `s` with precomputed `lexemes` and a
    /// `newlines` cache. `newlines` must have been built from `s` for the
    /// line/column queries to be meaningful.
    pub fn new(
        s: &'input str,
        lexemes: Vec<Result<LexerTypesT::LexemeT, LRLexError>>,
        newlines: NewlineCache,
    ) -> LRNonStreamingLexer<'lexer, 'input, LexerTypesT> {
        LRNonStreamingLexer {
            s,
            lexemes,
            newlines,
            phantom: PhantomData,
        }
    }
}
721
impl<
        'lexer,
        'input: 'lexer,
        StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
        LexerTypesT: LexerTypes<StorageT = StorageT, LexErrorT = LRLexError>,
    > Lexer<LexerTypesT> for LRNonStreamingLexer<'lexer, 'input, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    /// Iterate over the precomputed lexing results, cloning each one.
    fn iter<'a>(
        &'a self,
    ) -> Box<dyn Iterator<Item = Result<LexerTypesT::LexemeT, LexerTypesT::LexErrorT>> + 'a> {
        Box::new(self.lexemes.iter().cloned())
    }
}
737
738impl<'lexer, 'input: 'lexer, LexerTypesT: LexerTypes<LexErrorT = LRLexError>>
739 NonStreamingLexer<'input, LexerTypesT> for LRNonStreamingLexer<'lexer, 'input, LexerTypesT>
740where
741 usize: AsPrimitive<LexerTypesT::StorageT>,
742{
743 fn span_str(&self, span: Span) -> &'input str {
744 if span.end() > self.s.len() {
745 panic!(
746 "Span {:?} exceeds known input length {}",
747 span,
748 self.s.len()
749 );
750 }
751 &self.s[span.start()..span.end()]
752 }
753
754 fn span_lines_str(&self, span: Span) -> &'input str {
755 debug_assert!(span.end() >= span.start());
756 if span.end() > self.s.len() {
757 panic!(
758 "Span {:?} exceeds known input length {}",
759 span,
760 self.s.len()
761 );
762 }
763
764 let (st, en) = self.newlines.span_line_bytes(span);
765 &self.s[st..en]
766 }
767
768 fn line_col(&self, span: Span) -> ((usize, usize), (usize, usize)) {
769 debug_assert!(span.end() >= span.start());
770 if span.end() > self.s.len() {
771 panic!(
772 "Span {:?} exceeds known input length {}",
773 span,
774 self.s.len()
775 );
776 }
777
778 (
779 self.newlines
780 .byte_to_line_num_and_col_num(self.s, span.start())
781 .unwrap(),
782 self.newlines
783 .byte_to_line_num_and_col_num(self.s, span.end())
784 .unwrap(),
785 )
786 }
787}
788
#[cfg(test)]
mod test {
    use super::*;
    use crate::{DefaultLexeme, DefaultLexerTypes};
    use lrpar::LexError;
    use std::collections::HashMap;

    // Basic end-to-end lexing: two named rules plus an anonymous
    // whitespace-skipping rule.
    #[test]
    fn test_basic() {
        let src = r"
%%
[0-9]+ 'int'
[a-zA-Z]+ 'id'
[ \t] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("int", 0);
        map.insert("id", 1);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexemes = lexerdef
            .lexer("abc 123")
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(lexemes.len(), 2);
        let lex1 = lexemes[0];
        assert_eq!(lex1.tok_id(), 1u8);
        assert_eq!(lex1.span().start(), 0);
        assert_eq!(lex1.span().len(), 3);
        let lex2 = lexemes[1];
        assert_eq!(lex2.tok_id(), 0);
        assert_eq!(lex2.span().start(), 4);
        assert_eq!(lex2.span().len(), 3);
    }

    // With `posix_escapes` enabled, escapes like \a and \v are interpreted
    // as their POSIX control characters.
    #[test]
    fn test_posix_escapes() {
        let src = r#"%%
\\ 'slash'
\a 'alert'
\b 'backspace'
\f 'feed'
\n 'newline'
\r 'return'
\t 'tab'
\v 'vtab'
\q 'normal_char'
"#
        .to_string();
        let mut options = DEFAULT_LEX_FLAGS;
        options.posix_escapes = Some(true);
        let lexerdef =
            LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
                .unwrap();
        let lexemes = lexerdef
            .lexer("\\\x07\x08\x0c\n\r\t\x0bq")
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(lexemes.len(), 9);
        for i in 0..9u8 {
            let lexeme = lexemes[i as usize];
            assert_eq!(lexeme.tok_id(), i);
        }
    }

    // With `posix_escapes` disabled, \b is a regex word boundary rather than
    // a backspace character.
    #[test]
    fn test_non_posix_escapes() {
        let src = r#"%%
\\ 'slash'
\a 'alert'
a\b a 'work_break'
\f 'feed'
\n 'newline'
\r 'return'
\t 'tab'
\v 'vtab'
\q 'normal_char'
"#
        .to_string();
        let mut options = DEFAULT_LEX_FLAGS;
        options.posix_escapes = Some(false);
        let lexerdef =
            LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
                .unwrap();
        let lexemes = lexerdef
            .lexer("\\\x07a a\x0c\n\r\t\x0bq")
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(lexemes.len(), 9);
        for i in 0..9u8 {
            let lexeme = lexemes[i as usize];
            assert_eq!(lexeme.tok_id(), i);
        }
    }

    // Unlexable input produces an error with a zero-width span at the
    // failure position.
    #[test]
    fn test_basic_error() {
        let src = "
%%
[0-9]+ 'int'"
            .to_string();
        let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        match lexerdef.lexer("abc").iter().next().unwrap() {
            Ok(_) => panic!("Invalid input lexed"),
            Err(e) => {
                if e.span().start() != 0 || e.span().end() != 0 {
                    panic!("Incorrect span returned {:?}", e.span());
                }
            }
        };
    }

    // "iff" must lex as one ID (longest match), not as IF followed by "f".
    #[test]
    fn test_longest_match() {
        let src = "%%
if 'IF'
[a-z]+ 'ID'
[ ] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("IF", 0);
        map.insert("ID", 1);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexemes = lexerdef
            .lexer("iff if")
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<DefaultLexeme<u8>>>();
        assert_eq!(lexemes.len(), 2);
        let lex1 = lexemes[0];
        assert_eq!(lex1.tok_id(), 1u8);
        assert_eq!(lex1.span().start(), 0);
        assert_eq!(lex1.span().len(), 3);
        let lex2 = lexemes[1];
        assert_eq!(lex2.tok_id(), 0);
        assert_eq!(lex2.span().start(), 4);
        assert_eq!(lex2.span().len(), 2);
    }

    // Spans are byte offsets, so a multi-byte character occupies multiple
    // span positions.
    #[test]
    fn test_multibyte() {
        let src = "%%
[a❤]+ 'ID'
[ ] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("ID", 0u8);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexer = lexerdef.lexer("a ❤ a");
        let lexemes = lexer
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<DefaultLexeme<u8>>>();
        assert_eq!(lexemes.len(), 3);
        let lex1 = lexemes[0];
        assert_eq!(lex1.span().start(), 0);
        assert_eq!(lex1.span().len(), 1);
        assert_eq!(lexer.span_str(lex1.span()), "a");
        let lex2 = lexemes[1];
        assert_eq!(lex2.span().start(), 2);
        assert_eq!(lex2.span().len(), 3);
        assert_eq!(lexer.span_str(lex2.span()), "❤");
        let lex3 = lexemes[2];
        assert_eq!(lex3.span().start(), 6);
        assert_eq!(lex3.span().len(), 1);
        assert_eq!(lexer.span_str(lex3.span()), "a");
    }

    // line_col/span_lines_str across single- and multi-line inputs,
    // including an input large enough to exercise the newline cache.
    #[test]
    fn test_line_col() {
        let src = "%%
[a-z]+ 'ID'
[ \\n] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("ID", 0u8);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexer = lexerdef.lexer("a b c");
        let lexemes = lexer
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<DefaultLexeme<u8>>>();
        assert_eq!(lexemes.len(), 3);
        assert_eq!(lexer.line_col(lexemes[1].span()), ((1, 3), (1, 4)));
        assert_eq!(lexer.span_lines_str(lexemes[1].span()), "a b c");
        assert_eq!(lexer.span_lines_str(lexemes[2].span()), "a b c");

        let lexer = lexerdef.lexer("a b c\n");
        let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
        assert_eq!(lexemes.len(), 3);
        assert_eq!(lexer.line_col(lexemes[1].span()), ((1, 3), (1, 4)));
        assert_eq!(lexer.span_lines_str(lexemes[1].span()), "a b c");
        assert_eq!(lexer.span_lines_str(lexemes[2].span()), "a b c");

        let lexer = lexerdef.lexer(" a\nb\n  c d");
        let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
        assert_eq!(lexemes.len(), 4);
        assert_eq!(lexer.line_col(lexemes[0].span()), ((1, 2), (1, 3)));
        assert_eq!(lexer.line_col(lexemes[1].span()), ((2, 1), (2, 2)));
        assert_eq!(lexer.line_col(lexemes[2].span()), ((3, 3), (3, 4)));
        assert_eq!(lexer.line_col(lexemes[3].span()), ((3, 5), (3, 6)));
        assert_eq!(lexer.span_lines_str(lexemes[0].span()), " a");
        assert_eq!(lexer.span_lines_str(lexemes[1].span()), "b");
        assert_eq!(lexer.span_lines_str(lexemes[2].span()), "  c d");
        assert_eq!(lexer.span_lines_str(lexemes[3].span()), "  c d");

        // Build an input of 71 lines where line i contains i space-separated
        // "a"s; offs[i] is the number of lexemes before line i+1.
        let mut s = Vec::new();
        let mut offs = vec![0];
        for i in 0..71 {
            offs.push(offs[i] + i + 1);
            s.push(vec!["a"; i].join(" "));
        }
        let s = s.join("\n");
        let lexer = lexerdef.lexer(&s);
        let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
        assert_eq!(lexemes.len(), offs[70]);
        assert_eq!(lexer.span_lines_str(Span::new(0, 0)), "");
        assert_eq!(lexer.span_lines_str(Span::new(0, 2)), "\na");
        assert_eq!(lexer.span_lines_str(Span::new(0, 4)), "\na\na a");
        assert_eq!(lexer.span_lines_str(Span::new(0, 7)), "\na\na a\na a a");
        assert_eq!(lexer.span_lines_str(Span::new(4, 7)), "a a\na a a");
        assert_eq!(lexer.span_lines_str(lexemes[0].span()), "a");
        assert_eq!(lexer.span_lines_str(lexemes[1].span()), "a a");
        assert_eq!(lexer.span_lines_str(lexemes[3].span()), "a a a");
        for i in 0..70 {
            assert_eq!(
                lexer.span_lines_str(lexemes[offs[i]].span()),
                vec!["a"; i + 1].join(" ")
            );
        }
    }

    // Columns count characters (not bytes) even when lines contain
    // multi-byte characters.
    #[test]
    fn test_line_col_multibyte() {
        let src = "%%
[a-z❤]+ 'ID'
[ \\n] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("ID", 0u8);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexer = lexerdef.lexer(" a\n❤ b");
        let lexemes = lexer
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<DefaultLexeme<u8>>>();
        assert_eq!(lexemes.len(), 3);
        assert_eq!(lexer.line_col(lexemes[0].span()), ((1, 2), (1, 3)));
        assert_eq!(lexer.line_col(lexemes[1].span()), ((2, 1), (2, 2)));
        assert_eq!(lexer.line_col(lexemes[2].span()), ((2, 3), (2, 4)));
        assert_eq!(lexer.span_lines_str(lexemes[0].span()), " a");
        assert_eq!(lexer.span_lines_str(lexemes[1].span()), "❤ b");
        assert_eq!(lexer.span_lines_str(lexemes[2].span()), "❤ b");
    }

    // line_col must panic on a span past the end of the input.
    #[test]
    #[should_panic]
    fn test_bad_line_col() {
        let src = "%%
[a-z]+ 'ID'
[ \\n] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("ID", 0u8);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexer = lexerdef.lexer("a b c");

        lexer.line_col(Span::new(100, 100));
    }

    // set_rule_ids reports both directions of mismatch, and rules whose IDs
    // were cleared subsequently produce lexing errors.
    #[test]
    fn test_missing_from_lexer_and_parser() {
        let src = "%%
[a-z]+ 'ID'
[ \\n] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("INT", 0u8);
        let mut missing_from_lexer = HashSet::new();
        missing_from_lexer.insert("INT");
        let mut missing_from_parser = HashSet::new();
        missing_from_parser.insert("ID");
        assert_eq!(
            lexerdef.set_rule_ids(&map),
            (Some(missing_from_lexer), Some(missing_from_parser))
        );

        match lexerdef.lexer(" a ").iter().next().unwrap() {
            Ok(_) => panic!("Invalid input lexed"),
            Err(e) => {
                if e.span().start() != 1 || e.span().end() != 1 {
                    panic!("Incorrect span returned {:?}", e.span());
                }
            }
        };
    }

    // A single lexeme may span multiple lines ('.' matches newlines by
    // default via `dot_matches_new_line`).
    #[test]
    fn test_multiline_lexeme() {
        let src = "%%
'.*' 'STR'
[ \\n] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("STR", 0u8);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexer = lexerdef.lexer("'a\nb'\n");
        let lexemes = lexer
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<DefaultLexeme<u8>>>();
        assert_eq!(lexemes.len(), 1);
        assert_eq!(lexer.line_col(lexemes[0].span()), ((1, 1), (2, 3)));
        assert_eq!(lexer.span_lines_str(lexemes[0].span()), "'a\nb'");
    }

    // name_span reports where each rule's name appears in the lex source;
    // anonymous rules get a zero-width span.
    #[test]
    fn test_token_span() {
        let src = "%%
a 'A'
b 'B'
[ \\n] ;"
            .to_string();
        let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        assert_eq!(
            lexerdef.get_rule_by_name("A").unwrap().name_span(),
            Span::new(6, 7)
        );
        assert_eq!(
            lexerdef.get_rule_by_name("B").unwrap().name_span(),
            Span::new(12, 13)
        );
        let anonymous_rules = lexerdef
            .iter_rules()
            .filter(|rule| rule.name().is_none())
            .collect::<Vec<_>>();
        assert_eq!(anonymous_rules[0].name_span(), Span::new(21, 21));
    }

    // name_span offsets account for start-state declarations preceding `%%`.
    #[test]
    fn test_token_start_states() {
        let src = "%x EXCLUSIVE_START
%s INCLUSIVE_START
%%
a 'A'
b 'B'
[ \\n] ;"
            .to_string();
        let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        assert_eq!(
            lexerdef.get_rule_by_name("A").unwrap().name_span(),
            Span::new(44, 45)
        );
        assert_eq!(
            lexerdef.get_rule_by_name("B").unwrap().name_span(),
            Span::new(50, 51)
        );
    }

    // A rule's `<STATE>` prefix is not part of its regex.
    #[test]
    fn test_rule_start_states() {
        let src = "%x EXCLUSIVE_START
%s INCLUSIVE_START
%%
<EXCLUSIVE_START>a 'A'
<INCLUSIVE_START>b 'B'
[ \\n] ;"
            .to_string();
        let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let a_rule = lexerdef.get_rule_by_name("A").unwrap();
        assert_eq!(a_rule.name_span(), Span::new(61, 62));
        assert_eq!(a_rule.re_str, "a");

        let b_rule = lexerdef.get_rule_by_name("B").unwrap();
        assert_eq!(b_rule.name_span(), Span::new(84, 85));
        assert_eq!(b_rule.re_str, "b");
    }

    // A rule without start states applies in a non-exclusive state.
    #[test]
    fn test_state_matches_regular_no_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![];
        let current_state = &all_states[0];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(m);
    }

    // A rule without start states does NOT apply in an exclusive state.
    #[test]
    fn test_state_matches_exclusive_no_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![];
        let current_state = &all_states[1];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(!m);
    }

    // A rule listing the current (non-exclusive) state applies.
    #[test]
    fn test_state_matches_regular_matching_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![0];
        let current_state = &all_states[0];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(m);
    }

    // A rule listing the current (exclusive) state applies.
    #[test]
    fn test_state_matches_exclusive_matching_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![1];
        let current_state = &all_states[1];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(m);
    }

    // A rule listing only other states does not apply in a non-exclusive
    // state.
    #[test]
    fn test_state_matches_regular_other_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![1];
        let current_state = &all_states[0];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(!m);
    }

    // A rule listing only other states does not apply in an exclusive state.
    #[test]
    fn test_state_matches_exclusive_other_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![0];
        let current_state = &all_states[1];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(!m);
    }
}