1use std::{
2 collections::{HashMap, HashSet},
3 fmt::Debug,
4 hash::Hash,
5 marker::PhantomData,
6 slice::Iter,
7 str::FromStr,
8};
9
10use cfgrammar::{
11 NewlineCache, Span,
12 header::{GrmtoolsSectionParser, Header, HeaderError, HeaderErrorKind, HeaderValue, Value},
13 span::Location,
14};
15use num_traits::{AsPrimitive, PrimInt, Unsigned};
16use regex::{Regex, RegexBuilder};
17
18use lrpar::{Lexeme, Lexer, LexerTypes, NonStreamingLexer};
19
20use crate::{
21 LRLexError, LexBuildError, LexBuildResult, StartStateId,
22 parser::{LexParser, StartState, StartStateOperation},
23};
24
#[doc(hidden)]
#[derive(Clone, Debug)]
#[non_exhaustive]
/// The set of options which influence how lex rules are parsed and how their
/// regular expressions are compiled. Every field is an `Option` so that
/// "unspecified" can be distinguished from an explicit `true`/`false` (see
/// `UNSPECIFIED_LEX_FLAGS` / `DEFAULT_LEX_FLAGS`).
pub struct LexFlags {
    // Flags handled by this crate's lex parser / regex construction.
    // `dot_matches_new_line`, `multi_line` and `octal` are passed straight
    // through to `regex::RegexBuilder` in `Rule::new` and *must* be `Some`
    // by the time a `Rule` is built.
    pub dot_matches_new_line: Option<bool>,
    pub multi_line: Option<bool>,
    pub octal: Option<bool>,
    // Handled during lex-source parsing (not by `RegexBuilder`);
    // presumably consumed in `parser::LexParser` — not visible in this file.
    pub posix_escapes: Option<bool>,
    pub allow_wholeline_comments: Option<bool>,

    // Optional flags forwarded to `regex::RegexBuilder` only when `Some`.
    pub case_insensitive: Option<bool>,
    pub swap_greed: Option<bool>,
    pub ignore_whitespace: Option<bool>,
    pub unicode: Option<bool>,
    pub size_limit: Option<usize>,
    pub dfa_size_limit: Option<usize>,
    pub nest_limit: Option<u32>,
}
46
impl<T: Clone> TryFrom<&mut Header<T>> for LexFlags {
    type Error = HeaderError<T>;

    /// Extract `LexFlags` from a parsed `%grmtools` section header.
    ///
    /// Starts from `UNSPECIFIED_LEX_FLAGS` (all `None`) and fills in only the
    /// settings actually present in `header`. Each looked-up key is marked as
    /// used on the header (hence `&mut`).
    ///
    /// # Errors
    ///
    /// Returns a `HeaderError` with `HeaderErrorKind::ConversionError` if a
    /// known key is present but has the wrong value type (e.g. a numeric
    /// value where a boolean flag is expected).
    fn try_from(header: &mut Header<T>) -> Result<LexFlags, HeaderError<T>> {
        use cfgrammar::header::Setting;
        let mut lex_flags = UNSPECIFIED_LEX_FLAGS;
        // Destructure into one mutable binding per field so that adding a new
        // field to `LexFlags` without handling it here fails to compile.
        let LexFlags {
            dot_matches_new_line,
            multi_line,
            octal,
            posix_escapes,
            allow_wholeline_comments,
            case_insensitive,
            swap_greed,
            ignore_whitespace,
            unicode,
            size_limit,
            dfa_size_limit,
            nest_limit,
        } = &mut lex_flags;
        // Reads a boolean flag whose header key is the flag's identifier.
        macro_rules! cvt_flag {
            ($it:ident) => {
                header.mark_used(&stringify!($it).to_string());
                *$it = match header.get(stringify!($it)) {
                    Some(HeaderValue(_, Value::Flag(flag, _))) => Some(*flag),
                    // Key present but not a boolean flag: conversion error.
                    Some(HeaderValue(loc, _)) => Err(HeaderError {
                        kind: HeaderErrorKind::ConversionError("LexFlags", "Expected boolean"),
                        locations: vec![loc.clone()],
                    })?,
                    None => None,
                }
            };
        }
        cvt_flag!(dot_matches_new_line);
        cvt_flag!(multi_line);
        cvt_flag!(octal);
        cvt_flag!(posix_escapes);
        cvt_flag!(allow_wholeline_comments);
        cvt_flag!(case_insensitive);
        cvt_flag!(swap_greed);
        cvt_flag!(ignore_whitespace);
        cvt_flag!(unicode);
        // Reads a numeric setting, cast to the field's concrete integer type.
        macro_rules! cvt_num {
            ($it:ident, $num_ty: ty) => {
                header.mark_used(&stringify!($it).to_string());
                *$it = match header.get(stringify!($it)) {
                    Some(HeaderValue(_, Value::Setting(Setting::Num(n, _)))) => Some(*n as $num_ty),
                    // Key present but not numeric: conversion error.
                    Some(HeaderValue(loc, _)) => Err(HeaderError {
                        kind: HeaderErrorKind::ConversionError("LexFlags", "Expected numeric"),
                        locations: vec![loc.clone()],
                    })?,
                    None => None,
                }
            };
        }
        cvt_num!(size_limit, usize);
        cvt_num!(dfa_size_limit, usize);
        cvt_num!(nest_limit, u32);
        Ok(lex_flags)
    }
}
107
108impl From<&LexFlags> for Header<Location> {
109 fn from(flags: &LexFlags) -> Header<Location> {
110 let mut header = Header::new();
111 let LexFlags {
112 dot_matches_new_line,
113 multi_line,
114 octal,
115 posix_escapes,
116 allow_wholeline_comments,
117 case_insensitive,
118 swap_greed,
119 ignore_whitespace,
120 unicode,
121 size_limit,
122 dfa_size_limit,
123 nest_limit,
124 } = flags;
125 macro_rules! cvt_flag {
126 ($it: ident) => {
127 $it.map(|x| {
128 header.insert(
129 stringify!($it).to_string(),
130 HeaderValue(
131 Location::Other("From<&LexFlags".to_string()),
132 Value::Flag(x, Location::Other("From<&LexFlags>".to_string())),
133 ),
134 )
135 });
136 };
137 }
138 cvt_flag!(dot_matches_new_line);
139 cvt_flag!(multi_line);
140 cvt_flag!(octal);
141 cvt_flag!(posix_escapes);
142 cvt_flag!(allow_wholeline_comments);
143 cvt_flag!(case_insensitive);
144 cvt_flag!(swap_greed);
145 cvt_flag!(ignore_whitespace);
146 cvt_flag!(unicode);
147
148 macro_rules! cvt_num {
149 ($it: ident) => {
150 $it.map(|x| {
151 use cfgrammar::header::Setting;
152 header.insert(
153 stringify!($it).to_string(),
154 HeaderValue(
155 Location::Other("From<&LexFlags".to_string()),
156 Value::Setting(Setting::Num(
157 x as u64,
158 Location::Other("From<&LexFlags>".to_string()),
159 )),
160 ),
161 )
162 });
163 };
164 }
165 cvt_num!(size_limit);
166 cvt_num!(dfa_size_limit);
167 cvt_num!(nest_limit);
168
169 header
170 }
171}
172
#[doc(hidden)]
/// The flag values used when the user does not specify any: the lex-level
/// options get concrete defaults, while the pass-through `RegexBuilder`
/// options are left unspecified (deferring to the `regex` crate's own
/// defaults).
pub const DEFAULT_LEX_FLAGS: LexFlags = LexFlags {
    allow_wholeline_comments: Some(false),
    dot_matches_new_line: Some(true),
    multi_line: Some(true),
    octal: Some(true),
    posix_escapes: Some(false),
    case_insensitive: None,
    ignore_whitespace: None,
    swap_greed: None,
    unicode: None,
    size_limit: None,
    dfa_size_limit: None,
    nest_limit: None,
};
189
#[doc(hidden)]
/// A `LexFlags` value with every option unspecified (`None`); used as the
/// starting point when converting from a header, so only explicitly-given
/// settings end up `Some`.
pub const UNSPECIFIED_LEX_FLAGS: LexFlags = LexFlags {
    allow_wholeline_comments: None,
    dot_matches_new_line: None,
    multi_line: None,
    octal: None,
    posix_escapes: None,
    case_insensitive: None,
    ignore_whitespace: None,
    swap_greed: None,
    unicode: None,
    size_limit: None,
    dfa_size_limit: None,
    nest_limit: None,
};
206
#[derive(Debug, Clone)]
#[doc(hidden)]
/// A single lex rule: an optional token name/id, the regex it matches, and
/// the start states it participates in. Several fields are public for
/// historical reasons but deprecated in favour of accessor functions.
pub struct Rule<StorageT> {
    /// The token id this rule produces, if any. `None` both for anonymous
    /// (unnamed) rules and for named rules whose name was absent from the
    /// map passed to `set_rule_ids`.
    pub(super) tok_id: Option<StorageT>,
    /// The name of the rule, if any. Anonymous rules (`;` rules) have `None`.
    #[deprecated(note = "Use the name() function")]
    pub name: Option<String>,
    /// The span of the rule's name in the lex source.
    #[deprecated(note = "Use the name_span() function")]
    pub name_span: Span,
    /// The user-written regex source (without the `\A(?:…)` anchoring that is
    /// added when compiling `re`).
    pub(super) re_str: String,
    // The compiled, anchored regex (see `Rule::new`).
    re: Regex,
    /// Start state ids this rule is restricted to; empty means "all
    /// non-exclusive states" (see `state_matches`).
    #[deprecated(note = "Use the start_states() function")]
    pub start_states: Vec<usize>,
    /// Optional (target start state id, stack operation) applied when this
    /// rule matches.
    #[deprecated(note = "Use the target_state() function")]
    pub target_state: Option<(usize, StartStateOperation)>,
}
234
impl<StorageT: PrimInt> Rule<StorageT> {
    /// Create a new `Rule`, compiling `re_str` into an anchored regex.
    ///
    /// The regex is wrapped as `\A(?:re_str)` so matches can only occur at
    /// the current input position.
    ///
    /// # Panics
    ///
    /// `lex_flags.octal`, `.multi_line` and `.dot_matches_new_line` are
    /// `unwrap()`ed, so they must be `Some` (as they are in
    /// `DEFAULT_LEX_FLAGS`); all other flags are only applied when `Some`.
    ///
    /// # Errors
    ///
    /// Returns `regex::Error` if the regex fails to build.
    #[doc(hidden)]
    #[allow(private_interfaces)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        _: crate::unstable_api::InternalPublicApi,
        tok_id: Option<StorageT>,
        name: Option<String>,
        name_span: Span,
        re_str: String,
        start_states: Vec<usize>,
        target_state: Option<(usize, StartStateOperation)>,
        lex_flags: &LexFlags,
    ) -> Result<Rule<StorageT>, regex::Error> {
        // Anchor the user's regex at the start of the haystack.
        let mut re = RegexBuilder::new(&format!("\\A(?:{})", re_str));
        let mut re = re
            .octal(lex_flags.octal.unwrap())
            .multi_line(lex_flags.multi_line.unwrap())
            .dot_matches_new_line(lex_flags.dot_matches_new_line.unwrap());

        // The remaining flags are optional: only forwarded when specified.
        if let Some(flag) = lex_flags.ignore_whitespace {
            re = re.ignore_whitespace(flag)
        }
        if let Some(flag) = lex_flags.unicode {
            re = re.unicode(flag)
        }
        if let Some(flag) = lex_flags.case_insensitive {
            re = re.case_insensitive(flag)
        }
        if let Some(flag) = lex_flags.swap_greed {
            re = re.swap_greed(flag)
        }
        if let Some(sz) = lex_flags.size_limit {
            re = re.size_limit(sz)
        }
        if let Some(sz) = lex_flags.dfa_size_limit {
            re = re.dfa_size_limit(sz)
        }
        if let Some(lim) = lex_flags.nest_limit {
            re = re.nest_limit(lim)
        }

        let re = re.build()?;
        #[allow(deprecated)]
        Ok(Rule {
            tok_id,
            name,
            name_span,
            re_str,
            re,
            start_states,
            target_state,
        })
    }

    /// Return this rule's token id, if any.
    pub fn tok_id(&self) -> Option<StorageT> {
        self.tok_id
    }

    /// Return this rule's name, or `None` for anonymous rules.
    pub fn name(&self) -> Option<&str> {
        #[allow(deprecated)]
        self.name.as_deref()
    }

    /// Return the span of this rule's name in the lex source.
    pub fn name_span(&self) -> Span {
        #[allow(deprecated)]
        self.name_span
    }

    /// Return the rule's regex source as written by the user (without the
    /// `\A(?:…)` anchoring added at compile time).
    pub fn re_str(&self) -> &str {
        &self.re_str
    }

    /// Return the start state ids this rule is restricted to; empty means
    /// the rule applies in all non-exclusive states.
    pub fn start_states(&self) -> &[usize] {
        #[allow(deprecated)]
        self.start_states.as_slice()
    }

    /// Return the (target start state id, stack operation) applied when this
    /// rule matches, if any.
    pub fn target_state(&self) -> Option<(usize, StartStateOperation)> {
        #[allow(deprecated)]
        self.target_state.clone()
    }
}
330
/// The trait which all lexer definitions (collections of rules and start
/// states built from a `.l` file) must implement.
pub trait LexerDef<LexerTypesT: LexerTypes>
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
{
    #[doc(hidden)]
    /// Instantiate a lexer definition from pre-built start states and rules.
    fn from_rules(start_states: Vec<StartState>, rules: Vec<Rule<LexerTypesT::StorageT>>) -> Self
    where
        Self: Sized;

    /// Instantiate a lexer definition from lex source text.
    fn from_str(s: &str) -> LexBuildResult<Self>
    where
        Self: Sized;

    /// Get the `idx`th rule, or `None` if out of bounds.
    fn get_rule(&self, idx: usize) -> Option<&Rule<LexerTypesT::StorageT>>;

    /// Get the rule with token id `tok_id`. May panic if no rule has that id
    /// (the implementation in this file unwraps).
    fn get_rule_by_id(&self, tok_id: LexerTypesT::StorageT) -> &Rule<LexerTypesT::StorageT>;

    /// Get the rule named `n`, or `None` if there is no such rule.
    fn get_rule_by_name(&self, n: &str) -> Option<&Rule<LexerTypesT::StorageT>>;

    /// Set the token ids of named rules from `rule_ids_map` (typically built
    /// from a grammar's token names). Returns
    /// `(missing_from_lexer, missing_from_parser)`: names in the map with no
    /// matching rule, and rule names absent from the map, respectively
    /// (`None` when a set would be empty).
    fn set_rule_ids<'a>(
        &'a mut self,
        rule_ids_map: &HashMap<&'a str, LexerTypesT::StorageT>,
    ) -> (Option<HashSet<&'a str>>, Option<HashSet<&'a str>>);

    /// As `set_rule_ids`, but the missing-from-parser set also carries each
    /// rule name's `Span`.
    fn set_rule_ids_spanned<'a>(
        &'a mut self,
        rule_ids_map: &HashMap<&'a str, LexerTypesT::StorageT>,
    ) -> (Option<HashSet<&'a str>>, Option<HashSet<(&'a str, Span)>>);

    /// Iterate over all rules in definition order.
    fn iter_rules(&self) -> Iter<'_, Rule<LexerTypesT::StorageT>>;

    /// Iterate over all start states.
    fn iter_start_states(&self) -> Iter<'_, StartState>;
}
390
#[derive(Debug, Clone)]
/// A lexer definition which can instantiate non-streaming lexers: the whole
/// input must be available before lexing begins.
pub struct LRNonStreamingLexerDef<LexerTypesT: LexerTypes>
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
{
    rules: Vec<Rule<LexerTypesT::StorageT>>,
    start_states: Vec<StartState>,
    // Flags the rules were (or will be) compiled with.
    lex_flags: LexFlags,
    phantom: PhantomData<LexerTypesT>,
}
403
404impl<LexerTypesT: LexerTypes> LexerDef<LexerTypesT> for LRNonStreamingLexerDef<LexerTypesT>
405where
406 usize: AsPrimitive<LexerTypesT::StorageT>,
407 LexerTypesT::StorageT: TryFrom<usize>,
408{
409 fn from_rules(
410 start_states: Vec<StartState>,
411 rules: Vec<Rule<LexerTypesT::StorageT>>,
412 ) -> LRNonStreamingLexerDef<LexerTypesT> {
413 LRNonStreamingLexerDef {
414 rules,
415 start_states,
416 lex_flags: DEFAULT_LEX_FLAGS,
417 phantom: PhantomData,
418 }
419 }
420
421 fn from_str(s: &str) -> LexBuildResult<LRNonStreamingLexerDef<LexerTypesT>> {
424 let (mut header, pos) = GrmtoolsSectionParser::new(s, false)
425 .parse()
426 .map_err(|mut errs| errs.drain(..).map(LexBuildError::from).collect::<Vec<_>>())?;
427 let flags = LexFlags::try_from(&mut header).map_err(|e| vec![e.into()])?;
428 LexParser::<LexerTypesT>::new_with_lex_flags(s[pos..].to_string(), flags.clone()).map(|p| {
429 LRNonStreamingLexerDef {
430 rules: p.rules,
431 start_states: p.start_states,
432 lex_flags: flags,
433 phantom: PhantomData,
434 }
435 })
436 }
437
438 fn get_rule(&self, idx: usize) -> Option<&Rule<LexerTypesT::StorageT>> {
439 self.rules.get(idx)
440 }
441
442 fn get_rule_by_id(&self, tok_id: LexerTypesT::StorageT) -> &Rule<LexerTypesT::StorageT> {
443 self.rules
444 .iter()
445 .find(|r| r.tok_id == Some(tok_id))
446 .unwrap()
447 }
448
449 fn get_rule_by_name(&self, n: &str) -> Option<&Rule<LexerTypesT::StorageT>> {
450 self.rules.iter().find(|r| r.name() == Some(n))
451 }
452
453 fn set_rule_ids<'a>(
454 &'a mut self,
455 rule_ids_map: &HashMap<&'a str, LexerTypesT::StorageT>,
456 ) -> (Option<HashSet<&'a str>>, Option<HashSet<&'a str>>) {
457 let (missing_from_parser, missing_from_lexer) = self.set_rule_ids_spanned(rule_ids_map);
458 let missing_from_lexer =
459 missing_from_lexer.map(|missing| missing.iter().map(|(name, _)| *name).collect());
460 (missing_from_parser, missing_from_lexer)
461 }
462
463 fn set_rule_ids_spanned<'a>(
464 &'a mut self,
465 rule_ids_map: &HashMap<&'a str, LexerTypesT::StorageT>,
466 ) -> (Option<HashSet<&'a str>>, Option<HashSet<(&'a str, Span)>>) {
467 let mut missing_from_parser_idxs = Vec::new();
474 let mut rules_with_names = 0;
475 for (i, r) in self.rules.iter_mut().enumerate() {
476 if let Some(n) = r.name() {
477 match rule_ids_map.get(n) {
478 Some(tok_id) => r.tok_id = Some(*tok_id),
479 None => {
480 r.tok_id = None;
481 missing_from_parser_idxs.push(i);
482 }
483 }
484 rules_with_names += 1;
485 }
486 }
487
488 let missing_from_parser = if missing_from_parser_idxs.is_empty() {
489 None
490 } else {
491 let mut mfp = HashSet::with_capacity(missing_from_parser_idxs.len());
492 for i in &missing_from_parser_idxs {
493 mfp.insert((self.rules[*i].name().unwrap(), self.rules[*i].name_span()));
494 }
495 Some(mfp)
496 };
497
498 let missing_from_lexer =
499 if rules_with_names - missing_from_parser_idxs.len() == rule_ids_map.len() {
500 None
501 } else {
502 Some(
503 rule_ids_map
504 .keys()
505 .cloned()
506 .collect::<HashSet<&str>>()
507 .difference(
508 &self
509 .rules
510 .iter()
511 .filter_map(|x| x.name())
512 .collect::<HashSet<&str>>(),
513 )
514 .cloned()
515 .collect::<HashSet<&str>>(),
516 )
517 };
518
519 (missing_from_lexer, missing_from_parser)
520 }
521
522 fn iter_rules(&self) -> Iter<'_, Rule<LexerTypesT::StorageT>> {
523 self.rules.iter()
524 }
525
526 fn iter_start_states(&self) -> Iter<'_, StartState> {
527 self.start_states.iter()
528 }
529}
530
impl<
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> LRNonStreamingLexerDef<LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
    LexerTypesT::StorageT: TryFrom<usize>,
{
    /// As `from_str`, but with caller-supplied `lex_flags` instead of flags
    /// extracted from the `%grmtools` header.
    ///
    /// NOTE(review): the header parse result is `unwrap()`ed here (only
    /// `pos` is used), so malformed header sections panic rather than
    /// returning an error — confirm this is intended.
    pub fn new_with_options(
        s: &str,
        lex_flags: LexFlags,
    ) -> LexBuildResult<LRNonStreamingLexerDef<LexerTypesT>> {
        let (_, pos) = GrmtoolsSectionParser::new(s, false).parse().unwrap();
        LexParser::<LexerTypesT>::new_with_lex_flags(s[pos..].to_string(), lex_flags.clone()).map(
            |p| LRNonStreamingLexerDef {
                rules: p.rules,
                start_states: p.start_states,
                lex_flags,
                phantom: PhantomData,
            },
        )
    }

    /// Lex the entirety of `s`, returning a non-streaming lexer over the
    /// resulting lexemes (and/or errors).
    ///
    /// Uses longest-match with earliest-rule tie-breaking: at each position
    /// every rule valid in the current start state is tried, and the
    /// earliest rule with the longest match wins. On any failure an `Err`
    /// lexeme is pushed and lexing stops.
    pub fn lexer<'lexer, 'input: 'lexer>(
        &'lexer self,
        s: &'input str,
    ) -> LRNonStreamingLexer<'lexer, 'input, LexerTypesT> {
        let mut lexemes = vec![];
        let mut i = 0;
        // Stack of (repeat count, state): consecutive pushes of the same
        // state are run-length encoded rather than stacked individually.
        let mut state_stack: Vec<(usize, &StartState)> = Vec::new();
        // State id 0 is the INITIAL state; if it's absent the definition is
        // unusable and we bail out with an error lexeme immediately.
        let initial_state = match self.get_start_state_by_id(0) {
            None => {
                lexemes.push(Err(LRLexError::new(Span::new(i, i))));
                return LRNonStreamingLexer::new(s, lexemes, NewlineCache::from_str(s).unwrap());
            }
            Some(state) => state,
        };
        state_stack.push((1, initial_state));

        while i < s.len() {
            let old_i = i;
            let mut longest = 0; // Length of the longest match so far.
            let mut longest_ridx = 0; // Only meaningful when longest > 0.
            let current_state = match state_stack.last() {
                // The stack is never left empty below, but handle it anyway.
                None => {
                    lexemes.push(Err(LRLexError::new(Span::new(i, i))));
                    return LRNonStreamingLexer::new(
                        s,
                        lexemes,
                        NewlineCache::from_str(s).unwrap(),
                    );
                }
                Some((_, s)) => s,
            };
            // Try every rule applicable in the current state; `>` (not `>=`)
            // means the earliest rule wins ties.
            for (ridx, r) in self.iter_rules().enumerate() {
                if !Self::state_matches(current_state, r.start_states()) {
                    continue;
                }
                if let Some(m) = r.re.find(&s[old_i..]) {
                    let len = m.end();
                    if len > longest {
                        longest = len;
                        longest_ridx = ridx;
                    }
                }
            }
            if longest > 0 {
                let r = self.get_rule(longest_ridx).unwrap();
                // Anonymous rules match-and-discard; named rules produce a
                // lexeme (or an error if no token id was assigned).
                if r.name().is_some() {
                    match r.tok_id {
                        Some(tok_id) => {
                            lexemes.push(Ok(Lexeme::new(tok_id, old_i, longest)));
                        }
                        None => {
                            lexemes.push(Err(LRLexError::new(Span::new(old_i, old_i))));
                            break;
                        }
                    }
                }
                // Apply the rule's start-state operation, if any.
                if let Some((target_state_id, op)) = &r.target_state() {
                    let state = match self.get_start_state_by_id(*target_state_id) {
                        None => {
                            lexemes.push(Err(LRLexError::new(Span::new(old_i, old_i))));
                            break;
                        }
                        Some(state) => state,
                    };
                    let head = state_stack.last_mut();
                    match op {
                        StartStateOperation::ReplaceStack => {
                            state_stack.clear();
                            state_stack.push((1, state));
                        }
                        // Pushing the state already on top just bumps its
                        // run-length count.
                        StartStateOperation::Push => match head {
                            Some((count, s)) if s.id == state.id => *count += 1,
                            _ => state_stack.push((1, state)),
                        },
                        StartStateOperation::Pop => match head {
                            Some((count, _s)) if *count > 1 => {
                                *count -= 1;
                            }
                            Some(_) => {
                                state_stack.pop();
                                // Popping the last entry falls back to the
                                // initial state rather than an empty stack.
                                if state_stack.is_empty() {
                                    state_stack.push((1, initial_state));
                                }
                            }
                            None => {
                                lexemes.push(Err(LRLexError::new(Span::new(old_i, old_i))));
                                break;
                            }
                        },
                    }
                }
                i += longest;
            } else {
                // No rule matched at this position: record which start state
                // we were in and stop.
                lexemes.push(Err(LRLexError::new_with_lexing_state(
                    Span::new(old_i, old_i),
                    StartStateId::new(current_state.id),
                )));
                break;
            }
        }
        LRNonStreamingLexer::new(s, lexemes, NewlineCache::from_str(s).unwrap())
    }

    /// Does `state` permit a rule restricted to `rule_states`? A rule with
    /// no explicit states applies in every non-exclusive state.
    fn state_matches(state: &StartState, rule_states: &[usize]) -> bool {
        if rule_states.is_empty() {
            !state.exclusive
        } else {
            rule_states.contains(&state.id)
        }
    }

    /// Linear search for the start state with id `id`.
    fn get_start_state_by_id(&self, id: usize) -> Option<&StartState> {
        self.start_states.iter().find(|state| state.id == id)
    }

    /// The flags this definition was built with. Always `Some` for this
    /// type; the `Option` presumably matches a wider internal interface.
    pub(crate) fn lex_flags(&self) -> Option<&LexFlags> {
        Some(&self.lex_flags)
    }
}
681
/// A lexer over a complete, in-memory input: the full list of lexemes (and
/// errors) is computed up-front and merely iterated over on demand.
pub struct LRNonStreamingLexer<'lexer, 'input: 'lexer, LexerTypesT: LexerTypes>
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
    LexerTypesT::StorageT: 'static + Debug + PrimInt,
{
    // The input the lexemes' spans index into.
    s: &'input str,
    lexemes: Vec<Result<LexerTypesT::LexemeT, LRLexError>>,
    // Byte-offset -> line/column lookup for `s`.
    newlines: NewlineCache,
    phantom: PhantomData<(&'lexer (), LexerTypesT::StorageT)>,
}
695
impl<
    'lexer,
    'input: 'lexer,
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT>,
> LRNonStreamingLexer<'lexer, 'input, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    /// Create a lexer over pre-computed `lexemes` for input `s`.
    ///
    /// `newlines` must be a `NewlineCache` built from `s`, as it is used to
    /// translate the lexemes' byte spans into line/column positions.
    pub fn new(
        s: &'input str,
        lexemes: Vec<Result<LexerTypesT::LexemeT, LRLexError>>,
        newlines: NewlineCache,
    ) -> LRNonStreamingLexer<'lexer, 'input, LexerTypesT> {
        LRNonStreamingLexer {
            s,
            lexemes,
            newlines,
            phantom: PhantomData,
        }
    }
}
723
impl<
    'lexer,
    'input: 'lexer,
    StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
    LexerTypesT: LexerTypes<StorageT = StorageT, LexErrorT = LRLexError>,
> Lexer<LexerTypesT> for LRNonStreamingLexer<'lexer, 'input, LexerTypesT>
where
    usize: AsPrimitive<StorageT>,
{
    /// Iterate over the pre-computed lexemes, cloning each on the way out.
    fn iter<'a>(
        &'a self,
    ) -> Box<dyn Iterator<Item = Result<LexerTypesT::LexemeT, LexerTypesT::LexErrorT>> + 'a> {
        Box::new(self.lexemes.iter().cloned())
    }
}
739
impl<'lexer, 'input: 'lexer, LexerTypesT: LexerTypes<LexErrorT = LRLexError>>
    NonStreamingLexer<'input, LexerTypesT> for LRNonStreamingLexer<'lexer, 'input, LexerTypesT>
where
    usize: AsPrimitive<LexerTypesT::StorageT>,
{
    /// Return the input substring covered by `span`.
    ///
    /// # Panics
    ///
    /// Panics if `span` extends beyond the input.
    fn span_str(&self, span: Span) -> &'input str {
        if span.end() > self.s.len() {
            panic!(
                "Span {:?} exceeds known input length {}",
                span,
                self.s.len()
            );
        }
        &self.s[span.start()..span.end()]
    }

    /// Return the full line(s) of input containing `span` (from the start of
    /// the first line touched to the end of the last).
    ///
    /// # Panics
    ///
    /// Panics if `span` extends beyond the input.
    fn span_lines_str(&self, span: Span) -> &'input str {
        debug_assert!(span.end() >= span.start());
        if span.end() > self.s.len() {
            panic!(
                "Span {:?} exceeds known input length {}",
                span,
                self.s.len()
            );
        }

        let (st, en) = self.newlines.span_line_bytes(span);
        &self.s[st..en]
    }

    /// Translate `span` into ((start line, start col), (end line, end col)),
    /// 1-indexed, via the newline cache.
    ///
    /// # Panics
    ///
    /// Panics if `span` extends beyond the input.
    fn line_col(&self, span: Span) -> ((usize, usize), (usize, usize)) {
        debug_assert!(span.end() >= span.start());
        if span.end() > self.s.len() {
            panic!(
                "Span {:?} exceeds known input length {}",
                span,
                self.s.len()
            );
        }

        (
            self.newlines
                .byte_to_line_num_and_col_num(self.s, span.start())
                .unwrap(),
            self.newlines
                .byte_to_line_num_and_col_num(self.s, span.end())
                .unwrap(),
        )
    }
}
790
#[cfg(test)]
mod test {
    use super::*;
    use crate::{DefaultLexeme, DefaultLexerTypes};
    use lrpar::LexError;
    use std::collections::HashMap;

    // End-to-end: build a definition from source, assign ids, lex a string.
    #[test]
    fn test_basic() {
        let src = r"
%%
[0-9]+ 'int'
[a-zA-Z]+ 'id'
[ \t] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("int", 0);
        map.insert("id", 1);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexemes = lexerdef
            .lexer("abc 123")
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(lexemes.len(), 2);
        let lex1 = lexemes[0];
        assert_eq!(lex1.tok_id(), 1u8);
        assert_eq!(lex1.span().start(), 0);
        assert_eq!(lex1.span().len(), 3);
        let lex2 = lexemes[1];
        assert_eq!(lex2.tok_id(), 0);
        assert_eq!(lex2.span().start(), 4);
        assert_eq!(lex2.span().len(), 3);
    }

    // With posix_escapes on, `\a`, `\b` etc. lex as their control characters.
    #[test]
    fn test_posix_escapes() {
        let src = r#"%%
\\ 'slash'
\a 'alert'
\b 'backspace'
\f 'feed'
\n 'newline'
\r 'return'
\t 'tab'
\v 'vtab'
\q 'normal_char'
"#
        .to_string();
        let mut options = DEFAULT_LEX_FLAGS;
        options.posix_escapes = Some(true);
        let lexerdef =
            LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
                .unwrap();
        let lexemes = lexerdef
            .lexer("\\\x07\x08\x0c\n\r\t\x0bq")
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(lexemes.len(), 9);
        for i in 0..9u8 {
            let lexeme = lexemes[i as usize];
            assert_eq!(lexeme.tok_id(), i);
        }
    }

    // With posix_escapes off, escapes keep their `regex`-crate meaning
    // (e.g. `\b` is a word boundary, hence the `a\b a` rule).
    #[test]
    fn test_non_posix_escapes() {
        let src = r#"%%
\\ 'slash'
\a 'alert'
a\b a 'work_break'
\f 'feed'
\n 'newline'
\r 'return'
\t 'tab'
\v 'vtab'
\q 'normal_char'
"#
        .to_string();
        let mut options = DEFAULT_LEX_FLAGS;
        options.posix_escapes = Some(false);
        let lexerdef =
            LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
                .unwrap();
        let lexemes = lexerdef
            .lexer("\\\x07a a\x0c\n\r\t\x0bq")
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(lexemes.len(), 9);
        for i in 0..9u8 {
            let lexeme = lexemes[i as usize];
            assert_eq!(lexeme.tok_id(), i);
        }
    }

    // Unlexable input produces an Err lexeme with an empty span at the
    // failure position.
    #[test]
    fn test_basic_error() {
        let src = "
%%
[0-9]+ 'int'"
            .to_string();
        let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        match lexerdef.lexer("abc").iter().next().unwrap() {
            Ok(_) => panic!("Invalid input lexed"),
            Err(e) => {
                if e.span().start() != 0 || e.span().end() != 0 {
                    panic!("Incorrect span returned {:?}", e.span());
                }
            }
        };
    }

    // Longest match wins: "iff" matches ID (3 chars), not IF (2 chars).
    #[test]
    fn test_longest_match() {
        let src = "%%
if 'IF'
[a-z]+ 'ID'
[ ] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("IF", 0);
        map.insert("ID", 1);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexemes = lexerdef
            .lexer("iff if")
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<DefaultLexeme<u8>>>();
        assert_eq!(lexemes.len(), 2);
        let lex1 = lexemes[0];
        assert_eq!(lex1.tok_id(), 1u8);
        assert_eq!(lex1.span().start(), 0);
        assert_eq!(lex1.span().len(), 3);
        let lex2 = lexemes[1];
        assert_eq!(lex2.tok_id(), 0);
        assert_eq!(lex2.span().start(), 4);
        assert_eq!(lex2.span().len(), 2);
    }

    // Spans are byte offsets, so a multi-byte char yields a 3-byte span.
    #[test]
    fn test_multibyte() {
        let src = "%%
[a❤]+ 'ID'
[ ] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("ID", 0u8);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexer = lexerdef.lexer("a ❤ a");
        let lexemes = lexer
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<DefaultLexeme<u8>>>();
        assert_eq!(lexemes.len(), 3);
        let lex1 = lexemes[0];
        assert_eq!(lex1.span().start(), 0);
        assert_eq!(lex1.span().len(), 1);
        assert_eq!(lexer.span_str(lex1.span()), "a");
        let lex2 = lexemes[1];
        assert_eq!(lex2.span().start(), 2);
        assert_eq!(lex2.span().len(), 3);
        assert_eq!(lexer.span_str(lex2.span()), "❤");
        let lex3 = lexemes[2];
        assert_eq!(lex3.span().start(), 6);
        assert_eq!(lex3.span().len(), 1);
        assert_eq!(lexer.span_str(lex3.span()), "a");
    }

    // line_col / span_lines_str across single- and multi-line inputs,
    // including a generated input large enough to exercise the cache.
    #[test]
    fn test_line_col() {
        let src = "%%
[a-z]+ 'ID'
[ \\n] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("ID", 0u8);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexer = lexerdef.lexer("a b c");
        let lexemes = lexer
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<DefaultLexeme<u8>>>();
        assert_eq!(lexemes.len(), 3);
        assert_eq!(lexer.line_col(lexemes[1].span()), ((1, 3), (1, 4)));
        assert_eq!(lexer.span_lines_str(lexemes[1].span()), "a b c");
        assert_eq!(lexer.span_lines_str(lexemes[2].span()), "a b c");

        let lexer = lexerdef.lexer("a b c\n");
        let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
        assert_eq!(lexemes.len(), 3);
        assert_eq!(lexer.line_col(lexemes[1].span()), ((1, 3), (1, 4)));
        assert_eq!(lexer.span_lines_str(lexemes[1].span()), "a b c");
        assert_eq!(lexer.span_lines_str(lexemes[2].span()), "a b c");

        let lexer = lexerdef.lexer(" a\nb\n  c d");
        let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
        assert_eq!(lexemes.len(), 4);
        assert_eq!(lexer.line_col(lexemes[0].span()), ((1, 2), (1, 3)));
        assert_eq!(lexer.line_col(lexemes[1].span()), ((2, 1), (2, 2)));
        assert_eq!(lexer.line_col(lexemes[2].span()), ((3, 3), (3, 4)));
        assert_eq!(lexer.line_col(lexemes[3].span()), ((3, 5), (3, 6)));
        assert_eq!(lexer.span_lines_str(lexemes[0].span()), " a");
        assert_eq!(lexer.span_lines_str(lexemes[1].span()), "b");
        assert_eq!(lexer.span_lines_str(lexemes[2].span()), "  c d");
        assert_eq!(lexer.span_lines_str(lexemes[3].span()), "  c d");

        // Generated input: line i (0-based) holds i space-separated "a"s;
        // offs[i] is the index of the first lexeme on line i.
        let mut s = Vec::new();
        let mut offs = vec![0];
        for i in 0..71 {
            offs.push(offs[i] + i + 1);
            s.push(vec!["a"; i].join(" "));
        }
        let s = s.join("\n");
        let lexer = lexerdef.lexer(&s);
        let lexemes = lexer.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
        assert_eq!(lexemes.len(), offs[70]);
        assert_eq!(lexer.span_lines_str(Span::new(0, 0)), "");
        assert_eq!(lexer.span_lines_str(Span::new(0, 2)), "\na");
        assert_eq!(lexer.span_lines_str(Span::new(0, 4)), "\na\na a");
        assert_eq!(lexer.span_lines_str(Span::new(0, 7)), "\na\na a\na a a");
        assert_eq!(lexer.span_lines_str(Span::new(4, 7)), "a a\na a a");
        assert_eq!(lexer.span_lines_str(lexemes[0].span()), "a");
        assert_eq!(lexer.span_lines_str(lexemes[1].span()), "a a");
        assert_eq!(lexer.span_lines_str(lexemes[3].span()), "a a a");
        for i in 0..70 {
            assert_eq!(
                lexer.span_lines_str(lexemes[offs[i]].span()),
                vec!["a"; i + 1].join(" ")
            );
        }
    }

    // line/col positions are counted in chars-as-positions while spans are
    // bytes; multi-byte chars must not skew columns.
    #[test]
    fn test_line_col_multibyte() {
        let src = "%%
[a-z❤]+ 'ID'
[ \\n] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("ID", 0u8);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexer = lexerdef.lexer(" a\n❤ b");
        let lexemes = lexer
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<DefaultLexeme<u8>>>();
        assert_eq!(lexemes.len(), 3);
        assert_eq!(lexer.line_col(lexemes[0].span()), ((1, 2), (1, 3)));
        assert_eq!(lexer.line_col(lexemes[1].span()), ((2, 1), (2, 2)));
        assert_eq!(lexer.line_col(lexemes[2].span()), ((2, 3), (2, 4)));
        assert_eq!(lexer.span_lines_str(lexemes[0].span()), " a");
        assert_eq!(lexer.span_lines_str(lexemes[1].span()), "❤ b");
        assert_eq!(lexer.span_lines_str(lexemes[2].span()), "❤ b");
    }

    // line_col panics on spans past the end of the input.
    #[test]
    #[should_panic]
    fn test_bad_line_col() {
        let src = "%%
[a-z]+ 'ID'
[ \\n] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("ID", 0u8);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexer = lexerdef.lexer("a b c");

        lexer.line_col(Span::new(100, 100));
    }

    // set_rule_ids reports names only in the map / only in the lexer, and a
    // named rule without an id then lexes to an error.
    #[test]
    fn test_missing_from_lexer_and_parser() {
        let src = "%%
[a-z]+ 'ID'
[ \\n] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("INT", 0u8);
        let mut missing_from_lexer = HashSet::new();
        missing_from_lexer.insert("INT");
        let mut missing_from_parser = HashSet::new();
        missing_from_parser.insert("ID");
        assert_eq!(
            lexerdef.set_rule_ids(&map),
            (Some(missing_from_lexer), Some(missing_from_parser))
        );

        match lexerdef.lexer(" a ").iter().next().unwrap() {
            Ok(_) => panic!("Invalid input lexed"),
            Err(e) => {
                if e.span().start() != 1 || e.span().end() != 1 {
                    panic!("Incorrect span returned {:?}", e.span());
                }
            }
        };
    }

    // dot_matches_new_line default lets a single lexeme span multiple lines.
    #[test]
    fn test_multiline_lexeme() {
        let src = "%%
'.*' 'STR'
[ \\n] ;"
            .to_string();
        let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let mut map = HashMap::new();
        map.insert("STR", 0u8);
        assert_eq!(lexerdef.set_rule_ids(&map), (None, None));

        let lexer = lexerdef.lexer("'a\nb'\n");
        let lexemes = lexer
            .iter()
            .map(|x| x.unwrap())
            .collect::<Vec<DefaultLexeme<u8>>>();
        assert_eq!(lexemes.len(), 1);
        assert_eq!(lexer.line_col(lexemes[0].span()), ((1, 1), (2, 3)));
        assert_eq!(lexer.span_lines_str(lexemes[0].span()), "'a\nb'");
    }

    // name_span points at the token name inside the lex source; anonymous
    // rules get an empty span after their regex.
    #[test]
    fn test_token_span() {
        let src = "%%
a 'A'
b 'B'
[ \\n] ;"
            .to_string();
        let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        assert_eq!(
            lexerdef.get_rule_by_name("A").unwrap().name_span(),
            Span::new(6, 7)
        );
        assert_eq!(
            lexerdef.get_rule_by_name("B").unwrap().name_span(),
            Span::new(12, 13)
        );
        let anonymous_rules = lexerdef
            .iter_rules()
            .filter(|rule| rule.name().is_none())
            .collect::<Vec<_>>();
        assert_eq!(anonymous_rules[0].name_span(), Span::new(21, 21));
    }

    // name_span offsets account for %x/%s start-state declarations.
    #[test]
    fn test_token_start_states() {
        let src = "%x EXCLUSIVE_START
%s INCLUSIVE_START
%%
a 'A'
b 'B'
[ \\n] ;"
            .to_string();
        let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        assert_eq!(
            lexerdef.get_rule_by_name("A").unwrap().name_span(),
            Span::new(44, 45)
        );
        assert_eq!(
            lexerdef.get_rule_by_name("B").unwrap().name_span(),
            Span::new(50, 51)
        );
    }

    // <STATE> prefixes are stripped from the stored regex source.
    #[test]
    fn test_rule_start_states() {
        let src = "%x EXCLUSIVE_START
%s INCLUSIVE_START
%%
<EXCLUSIVE_START>a 'A'
<INCLUSIVE_START>b 'B'
[ \\n] ;"
            .to_string();
        let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
        let a_rule = lexerdef.get_rule_by_name("A").unwrap();
        assert_eq!(a_rule.name_span(), Span::new(61, 62));
        assert_eq!(a_rule.re_str, "a");

        let b_rule = lexerdef.get_rule_by_name("B").unwrap();
        assert_eq!(b_rule.name_span(), Span::new(84, 85));
        assert_eq!(b_rule.re_str, "b");
    }

    // state_matches: a rule with no explicit states applies in
    // non-exclusive states…
    #[test]
    fn test_state_matches_regular_no_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![];
        let current_state = &all_states[0];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(m);
    }

    // …but not in exclusive states.
    #[test]
    fn test_state_matches_exclusive_no_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![];
        let current_state = &all_states[1];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(!m);
    }

    // A rule listing the current (non-exclusive) state's id matches.
    #[test]
    fn test_state_matches_regular_matching_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![0];
        let current_state = &all_states[0];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(m);
    }

    // A rule listing the current (exclusive) state's id matches too.
    #[test]
    fn test_state_matches_exclusive_matching_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![1];
        let current_state = &all_states[1];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(m);
    }

    // A rule restricted to a different state does not match…
    #[test]
    fn test_state_matches_regular_other_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![1];
        let current_state = &all_states[0];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(!m);
    }

    // …regardless of whether the current state is exclusive.
    #[test]
    fn test_state_matches_exclusive_other_rule_states() {
        let all_states = &[
            StartState::new(0, "INITIAL", false, Span::new(0, 0)),
            StartState::new(1, "EXCLUSIVE", true, Span::new(0, 0)),
        ];
        let rule_states = vec![0];
        let current_state = &all_states[1];
        let m = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::state_matches(
            current_state,
            &rule_states,
        );
        assert!(!m);
    }
}