1use bincode::Encode;
4use cfgrammar::{
5 header::{
6 GrmtoolsSectionParser, Header, HeaderError, HeaderErrorKind, HeaderValue, Namespaced,
7 Setting, Value,
8 },
9 markmap::MergeBehavior,
10 span::{Location, Span},
11};
12use glob::glob;
13use lrpar::{
14 CTParserBuilder, LexerTypes,
15 diagnostics::{DiagnosticFormatter, SpannedDiagnosticFormatter},
16};
17use num_traits::{AsPrimitive, PrimInt, Unsigned};
18use proc_macro2::{Ident, TokenStream};
19use quote::{ToTokens, TokenStreamExt, format_ident, quote};
20use regex::Regex;
21use std::marker::PhantomData;
22use std::{
23 any::type_name,
24 borrow::Borrow,
25 collections::{HashMap, HashSet},
26 env::{current_dir, var},
27 error::Error,
28 fmt::{self, Debug, Display, Write as _},
29 fs::{self, File, create_dir_all, read_to_string},
30 hash::Hash,
31 io::Write,
32 path::{Path, PathBuf},
33 sync::{LazyLock, Mutex},
34};
35
36use crate::{DefaultLexerTypes, LRNonStreamingLexer, LRNonStreamingLexerDef, LexFlags, LexerDef};
37
/// File extension appended to the names of generated Rust source files.
const RUST_FILE_EXT: &str = "rs";

/// Prefix placed in front of error diagnostics.
const ERROR: &str = "[Error]";
/// Prefix placed in front of warning diagnostics.
const WARNING: &str = "[Warning]";

/// Matches names made only of ASCII letters, digits and underscores that do not start with a
/// digit. `CTLexerBuilder::build` only emits a token constant for names matching this.
static RE_TOKEN_ID: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^[a-zA-Z_][a-zA-Z_0-9]*$").unwrap());

/// Output paths already claimed by a `CTLexerBuilder::build` call in this process; used to
/// reject generating two lexers to the same file.
static GENERATED_PATHS: LazyLock<Mutex<HashSet<PathBuf>>> =
    LazyLock::new(|| Mutex::new(HashSet::new()));
48
/// The kind of lexer that `CTLexerBuilder` generates. Only one variant exists at present; the
/// enum is `#[non_exhaustive]` so more can be added later.
#[non_exhaustive]
pub enum LexerKind {
    /// Selects generation of an `LRNonStreamingLexerDef`.
    LRNonStreamingLexer,
}
53
54impl<T: Clone> TryFrom<&Value<T>> for LexerKind {
55 type Error = cfgrammar::header::HeaderError<T>;
56 fn try_from(it: &Value<T>) -> Result<LexerKind, Self::Error> {
57 match it {
58 Value::Flag(_, loc) => Err(HeaderError {
59 kind: HeaderErrorKind::ConversionError(
60 "LexerKind",
61 "Expected `LexerKind` found bool",
62 ),
63 locations: vec![loc.clone()],
64 }),
65 Value::Setting(Setting::Num(_, loc)) => Err(HeaderError {
66 kind: HeaderErrorKind::ConversionError(
67 "LexerKind",
68 "Expected `LexerKind` found numeric",
69 ),
70 locations: vec![loc.clone()],
71 }),
72 Value::Setting(Setting::String(_, loc)) => Err(HeaderError {
73 kind: HeaderErrorKind::ConversionError(
74 "LexerKind",
75 "Expected `LexerKind` found string",
76 ),
77 locations: vec![loc.clone()],
78 }),
79 Value::Setting(Setting::Constructor {
80 ctor:
81 Namespaced {
82 namespace: _,
83 member: (_, loc),
84 },
85 arg: _,
86 }) => Err(HeaderError {
87 kind: HeaderErrorKind::ConversionError(
88 "LexerKind",
89 "Expected `LexerKind` found constructor",
90 ),
91 locations: vec![loc.clone()],
92 }),
93 Value::Setting(Setting::Array(_, arr_loc, _)) => Err(HeaderError {
94 kind: HeaderErrorKind::ConversionError(
95 "LexerKind",
96 "Expected `LexerKind` found array",
97 ),
98 locations: vec![arr_loc.clone()],
99 }),
100 Value::Setting(Setting::Unitary(Namespaced {
101 namespace,
102 member: (member, member_loc),
103 })) => {
104 if let Some((ns, loc)) = namespace {
105 if ns.to_lowercase() != "lexerkind" {
106 return Err(HeaderError {
107 kind: HeaderErrorKind::ConversionError(
108 "LexerKind",
109 "Expected namespace `LexerKind`",
110 ),
111 locations: vec![loc.clone()],
112 });
113 }
114 }
115 if member.to_lowercase() != "lrnonstreaminglexer" {
116 return Err(HeaderError {
117 kind: HeaderErrorKind::ConversionError(
118 "LexerKind",
119 "Unknown `LexerKind` Variant",
120 ),
121 locations: vec![member_loc.clone()],
122 });
123 }
124
125 Ok(LexerKind::LRNonStreamingLexer)
126 }
127 }
128 }
129}
130
/// The visibility qualifier placed in front of the generated lexer module.
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum Visibility {
    /// No qualifier is emitted.
    Private,
    /// Emitted as `pub`.
    Public,
    /// Emitted as `pub(super)`.
    PublicSuper,
    /// Emitted as `pub(self)`.
    PublicSelf,
    /// Emitted as `pub(crate)`.
    PublicCrate,
    /// Emitted as `pub(in <path>)`, where the string is parsed as the path tokens.
    PublicIn(String),
}
148
149impl ToTokens for Visibility {
150 fn to_tokens(&self, tokens: &mut TokenStream) {
151 tokens.extend(match self {
152 Visibility::Private => quote!(),
153 Visibility::Public => quote! {pub},
154 Visibility::PublicSuper => quote! {pub(super)},
155 Visibility::PublicSelf => quote! {pub(self)},
156 Visibility::PublicCrate => quote! {pub(crate)},
157 Visibility::PublicIn(data) => {
158 let other = str::parse::<TokenStream>(data).unwrap();
159 quote! {pub(in #other)}
160 }
161 })
162 }
163}
164
/// The Rust edition a builder is configured for; stored by `CTLexerBuilder::rust_edition`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum RustEdition {
    /// The 2015 edition.
    Rust2015,
    /// The 2018 edition.
    Rust2018,
    /// The 2021 edition (the value `CTLexerBuilder::new_with_lexemet` starts with).
    Rust2021,
}
175
176struct QuoteOption<T>(Option<T>);
182
183impl<T: ToTokens> ToTokens for QuoteOption<T> {
184 fn to_tokens(&self, tokens: &mut TokenStream) {
185 tokens.append_all(match self.0 {
186 Some(ref t) => quote! { ::std::option::Option::Some(#t) },
187 None => quote! { ::std::option::Option::None },
188 });
189 }
190}
191
192struct QuoteTuple<T>(T);
195
196impl<A: ToTokens, B: ToTokens> ToTokens for QuoteTuple<(A, B)> {
197 fn to_tokens(&self, tokens: &mut TokenStream) {
198 let (a, b) = &self.0;
199 tokens.append_all(quote!((#a, #b)));
200 }
201}
202
203struct QuoteToString<'a>(&'a str);
205
206impl ToTokens for QuoteToString<'_> {
207 fn to_tokens(&self, tokens: &mut TokenStream) {
208 let x = &self.0;
209 tokens.append_all(quote! { #x.to_string() });
210 }
211}
212
/// An error that is nothing more than a pre-formatted message.
///
/// Both `Display` and `Debug` print the message verbatim (no quoting or escaping), so the
/// text looks the same whichever way the error ends up being reported.
struct ErrorString(String);

impl fmt::Display for ErrorString {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.0)
    }
}

impl fmt::Debug for ErrorString {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.0)
    }
}

impl Error for ErrorString {}
228
/// A builder that statically compiles a lexer description into a Rust module.
///
/// Configure it with the builder methods, then call `build`, which reads the lexer file at
/// `lexer_path` and writes generated Rust code to `output_path`.
pub struct CTLexerBuilder<'a, LexerTypesT: LexerTypes = DefaultLexerTypes<u32>>
where
    LexerTypesT::StorageT: Debug + Eq + Hash + ToTokens,
    usize: num_traits::AsPrimitive<LexerTypesT::StorageT>,
{
    /// Optional callback that configures a `CTParserBuilder`; when present, `build` also
    /// builds the parser and takes its token map as the rule-ID map.
    lrpar_config:
        Option<Box<dyn Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT> + 'a>>,
    /// Path of the lexer source file to read.
    lexer_path: Option<PathBuf>,
    /// Path the generated Rust code is written to.
    output_path: Option<PathBuf>,
    /// Explicitly requested lexer kind; if `None`, `build` consults the `lexerkind` header
    /// entry and otherwise falls back to `LexerKind::LRNonStreamingLexer`.
    lexerkind: Option<LexerKind>,
    /// Name of the generated module; if `None`, `build` derives one from the lexer file name.
    mod_name: Option<&'a str>,
    /// Visibility qualifier of the generated module.
    visibility: Visibility,
    /// Rust edition set via `rust_edition`.
    rust_edition: RustEdition,
    /// Mapping from rule names to token IDs, if one has been supplied or derived.
    rule_ids_map: Option<HashMap<String, LexerTypesT::StorageT>>,
    /// When `false`, tokens missing from the lexer are reported and abort `build`.
    allow_missing_terms_in_lexer: bool,
    /// When `false` (and warnings are shown), tokens missing from the parser are reported.
    allow_missing_tokens_in_parser: bool,
    /// When `true`, the missing-from-parser report is treated as an error.
    warnings_are_errors: bool,
    /// When `false`, the missing-from-parser report is suppressed entirely.
    show_warnings: bool,
    /// Header values set through the builder; merged with the lexer file's `%grmtools` section.
    header: Header<Location>,
    /// Test-only hook invoked with the lexer kind that `build` settled on.
    #[cfg(test)]
    inspect_lexerkind_cb: Option<Box<dyn Fn(&LexerKind) -> Result<(), Box<dyn Error>>>>,
}
253
254impl CTLexerBuilder<'_, DefaultLexerTypes<u32>> {
255 pub fn new() -> Self {
257 CTLexerBuilder::<DefaultLexerTypes<u32>>::new_with_lexemet()
258 }
259}
260
261impl<'a, LexerTypesT: LexerTypes<LexErrorT = crate::LRLexError> + 'static>
262 CTLexerBuilder<'a, LexerTypesT>
263where
264 LexerTypesT::StorageT:
265 'static + Debug + Eq + Hash + PrimInt + Encode + TryFrom<usize> + Unsigned + ToTokens,
266 usize: AsPrimitive<LexerTypesT::StorageT>,
267{
268 pub fn new_with_lexemet() -> Self {
285 let mut header = Header::new();
286 header.set_default_merge_behavior(MergeBehavior::Ours);
287 CTLexerBuilder {
288 lrpar_config: None,
289 lexer_path: None,
290 output_path: None,
291 lexerkind: None,
292 mod_name: None,
293 visibility: Visibility::Private,
294 rust_edition: RustEdition::Rust2021,
295 rule_ids_map: None,
296 allow_missing_terms_in_lexer: false,
297 allow_missing_tokens_in_parser: false,
298 warnings_are_errors: false,
299 show_warnings: true,
300 header,
301 #[cfg(test)]
302 inspect_lexerkind_cb: None,
303 }
304 }
305
306 pub fn lrpar_config<F>(mut self, config_func: F) -> Self
325 where
326 F: Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT> + 'a,
327 {
328 self.lrpar_config = Some(Box::new(config_func));
329 self
330 }
331
332 pub fn lexer_in_src_dir<P>(mut self, srcp: P) -> Result<Self, Box<dyn Error>>
350 where
351 P: AsRef<Path>,
352 {
353 if !srcp.as_ref().is_relative() {
354 return Err(format!(
355 "Lexer path '{}' must be a relative path.",
356 srcp.as_ref().to_str().unwrap_or("<invalid UTF-8>")
357 )
358 .into());
359 }
360
361 let mut lexp = current_dir()?;
362 lexp.push("src");
363 lexp.push(srcp.as_ref());
364 self.lexer_path = Some(lexp);
365
366 let mut outp = PathBuf::new();
367 outp.push(var("OUT_DIR").unwrap());
368 outp.push(srcp.as_ref().parent().unwrap().to_str().unwrap());
369 create_dir_all(&outp)?;
370 let mut leaf = srcp
371 .as_ref()
372 .file_name()
373 .unwrap()
374 .to_str()
375 .unwrap()
376 .to_owned();
377 write!(leaf, ".{}", RUST_FILE_EXT).ok();
378 outp.push(leaf);
379 Ok(self.output_path(outp))
380 }
381
382 pub fn lexer_path<P>(mut self, inp: P) -> Self
386 where
387 P: AsRef<Path>,
388 {
389 self.lexer_path = Some(inp.as_ref().to_owned());
390 self
391 }
392
393 pub fn output_path<P>(mut self, outp: P) -> Self
398 where
399 P: AsRef<Path>,
400 {
401 self.output_path = Some(outp.as_ref().to_owned());
402 self
403 }
404
405 pub fn lexerkind(mut self, lexerkind: LexerKind) -> Self {
407 self.lexerkind = Some(lexerkind);
408 self
409 }
410
411 pub fn mod_name(mut self, mod_name: &'a str) -> Self {
415 self.mod_name = Some(mod_name);
416 self
417 }
418
419 pub fn visibility(mut self, vis: Visibility) -> Self {
421 self.visibility = vis;
422 self
423 }
424
425 pub fn rust_edition(mut self, edition: RustEdition) -> Self {
428 self.rust_edition = edition;
429 self
430 }
431
432 pub fn rule_ids_map<T: std::borrow::Borrow<HashMap<String, LexerTypesT::StorageT>> + Clone>(
437 mut self,
438 rule_ids_map: T,
439 ) -> Self {
440 self.rule_ids_map = Some(rule_ids_map.borrow().to_owned());
441 self
442 }
443
444 pub fn build(mut self) -> Result<CTLexer, Box<dyn Error>> {
464 let lexerp = self
465 .lexer_path
466 .as_ref()
467 .expect("lexer_path must be specified before processing.");
468 let outp = self
469 .output_path
470 .as_ref()
471 .expect("output_path must be specified before processing.");
472
473 {
474 let mut lk = GENERATED_PATHS.lock().unwrap();
475 if lk.contains(outp.as_path()) {
476 return Err(format!("Generating two lexers to the same path ('{}') is not allowed: use CTLexerBuilder::output_path (and, optionally, CTLexerBuilder::mod_name) to differentiate them.", &outp.to_str().unwrap()).into());
477 }
478 lk.insert(outp.clone());
479 }
480 let lex_src = read_to_string(lexerp)
481 .map_err(|e| format!("When reading '{}': {e}", lexerp.display()))?;
482 let lex_diag = SpannedDiagnosticFormatter::new(&lex_src, lexerp);
483 let mut header = self.header;
484 let (parsed_header, _) = GrmtoolsSectionParser::new(&lex_src, false)
485 .parse()
486 .map_err(|es| {
487 let mut out = String::new();
488 out.push_str(&format!(
489 "\n{ERROR}{}\n",
490 lex_diag.file_location_msg(" parsing the `%grmtools` section", None)
491 ));
492 for e in es {
493 out.push_str(&indent(" ", &lex_diag.format_error(e).to_string()));
494 out.push('\n');
495 }
496 ErrorString(out)
497 })?;
498 header.merge_from(parsed_header)?;
499 header.mark_used(&"lexerkind".to_string());
500 let lexerkind = match self.lexerkind {
501 Some(lexerkind) => lexerkind,
502 None => {
503 if let Some(HeaderValue(_, lk_val)) = header.get("lexerkind") {
504 LexerKind::try_from(lk_val)?
505 } else {
506 LexerKind::LRNonStreamingLexer
507 }
508 }
509 };
510 #[cfg(test)]
511 if let Some(inspect_lexerkind_cb) = self.inspect_lexerkind_cb {
512 inspect_lexerkind_cb(&lexerkind)?
513 }
514 let (lexerdef, lex_flags): (LRNonStreamingLexerDef<LexerTypesT>, LexFlags) =
515 match lexerkind {
516 LexerKind::LRNonStreamingLexer => {
517 let lex_flags = LexFlags::try_from(&mut header)?;
518 let lexerdef = LRNonStreamingLexerDef::<LexerTypesT>::new_with_options(
519 &lex_src, lex_flags,
520 )
521 .map_err(|errs| {
522 let mut out = String::new();
523 out.push_str(&format!(
524 "\n{ERROR}{}\n",
525 lex_diag.file_location_msg("", None)
526 ));
527 for e in errs {
528 out.push_str(&indent(" ", &lex_diag.format_error(e).to_string()));
529 out.push('\n');
530 }
531 ErrorString(out)
532 })?;
533 let lex_flags = lexerdef.lex_flags().cloned();
534 (lexerdef, lex_flags.unwrap())
535 }
536 };
537
538 let ct_parser = if let Some(ref lrcfg) = self.lrpar_config {
539 let mut closure_lexerdef = lexerdef.clone();
540 let mut ctp = CTParserBuilder::<LexerTypesT>::new().inspect_rt(Box::new(
541 move |yacc_header, rtpb, rule_ids_map, grm_path| {
542 let owned_map = rule_ids_map
543 .iter()
544 .map(|(x, y)| (&**x, *y))
545 .collect::<HashMap<_, _>>();
546 closure_lexerdef.set_rule_ids(&owned_map);
547 yacc_header.mark_used(&"test_files".to_string());
548 let grammar = rtpb.grammar();
549 let test_glob = yacc_header.get("test_files");
550 let mut err_str = None;
551 let add_error_line = |err_str: &mut Option<String>, line| {
552 if let Some(err_str) = err_str {
553 err_str.push_str(&format!("{}\n", line));
554 } else {
555 let _ = err_str.insert(format!("{}\n", line));
556 }
557 };
558 match test_glob {
559 Some(HeaderValue(_, Value::Setting(Setting::Array(test_globs, _, _)))) => {
560 for setting in test_globs {
561 match setting {
562 Setting::String(test_files, _) => {
563 let path_joined = grm_path.parent().unwrap().join(test_files);
564 let path_str = &path_joined.to_string_lossy();
565 let mut glob_paths = glob(path_str).map_err(|e| e.to_string())?.peekable();
566 if glob_paths.peek().is_none() {
567 return Err(format!("'test_files' glob '{}' matched no paths", path_str)
568 .to_string()
569 .into(),
570 );
571 }
572
573 for path in glob_paths {
574 let path = path?;
575 if let Some(ext) = path.extension() {
576 if let Some(ext) = ext.to_str() {
577 if ext.starts_with("grm") {
578 add_error_line(&mut err_str, "test_files extensions beginning with `grm` are reserved.".into());
579 }
580 }
581 }
582 let input = fs::read_to_string(&path)?;
583 let l: LRNonStreamingLexer<LexerTypesT> =
584 closure_lexerdef.lexer(&input);
585 let errs = rtpb.parse_map(&l, &|_| (), &|_, _| ()).1;
586 if !errs.is_empty() {
587 add_error_line(&mut err_str, format!("While parsing {}:", path.display()));
588 for e in errs {
589 let e_pp = e.pp(&l, &|t| grammar.token_epp(t));
590 let e_lines = e_pp.split("\n");
591 for e in e_lines {
592 add_error_line(&mut err_str, format!("\t{}", e));
593 }
594 }
595 }
596 }
597 }
598 _ => return Err("Invalid value for setting 'test_files'".into()),
599 }
600 }
601 if let Some(err_str) = err_str {
602 Err(ErrorString(err_str))?
603 } else {
604 Ok(())
605 }
606
607 }
608 Some(_) => Err("Invalid value for setting 'test_files'".into()),
609 None => Ok(()),
610 }
611 },
612 ));
613 ctp = lrcfg(ctp);
614 let ct_parser = ctp.build()?;
615 self.rule_ids_map = Some(ct_parser.token_map().to_owned());
616 Some(ct_parser)
617 } else {
618 None
619 };
620
621 let mut lexerdef = Box::new(lexerdef);
622 let unused_header_values = header.unused();
623 if !unused_header_values.is_empty() {
624 return Err(
625 format!("Unused header values: {}", unused_header_values.join(", ")).into(),
626 );
627 }
628
629 let (missing_from_lexer, missing_from_parser) = match self.rule_ids_map {
630 Some(ref rim) => {
631 let owned_map = rim
633 .iter()
634 .map(|(x, y)| (&**x, *y))
635 .collect::<HashMap<_, _>>();
636 let (x, y) = lexerdef.set_rule_ids_spanned(&owned_map);
637 (
638 x.map(|a| a.iter().map(|&b| b.to_string()).collect::<HashSet<_>>()),
639 y.map(|a| {
640 a.iter()
641 .map(|(b, span)| (b.to_string(), *span))
642 .collect::<HashSet<_>>()
643 }),
644 )
645 }
646 None => (None, None),
647 };
648
649 let mut has_unallowed_missing = false;
650 let err_indent = " ".repeat(ERROR.len());
651 if !self.allow_missing_terms_in_lexer {
652 if let Some(ref mfl) = missing_from_lexer {
653 if let Some(ct_parser) = &ct_parser {
654 let grm = ct_parser.yacc_grammar();
655 let token_spans = mfl
656 .iter()
657 .map(|name| {
658 ct_parser
659 .yacc_grammar()
660 .token_span(*grm.tokens_map().get(name.as_str()).unwrap())
661 .expect("Given token should have a span")
662 })
663 .collect::<Vec<_>>();
664
665 let yacc_diag = SpannedDiagnosticFormatter::new(
666 ct_parser.grammar_src(),
667 ct_parser.grammar_path(),
668 );
669
670 eprintln!(
671 "{ERROR} these tokens are not referenced in the lexer but defined as follows"
672 );
673 eprintln!(
674 "{err_indent} {}",
675 yacc_diag.file_location_msg("in the grammar", None)
676 );
677 for span in token_spans {
678 eprintln!(
679 "{}",
680 yacc_diag.underline_span_with_text(
681 span,
682 "Missing from lexer".to_string(),
683 '^'
684 )
685 );
686 }
687 eprintln!();
688 } else {
689 eprintln!(
690 "{ERROR} the following tokens are used in the grammar but are not defined in the lexer:"
691 );
692 for n in mfl {
693 eprintln!(" {}", n);
694 }
695 }
696 has_unallowed_missing = true;
697 }
698 }
699 if !self.allow_missing_tokens_in_parser && self.show_warnings {
700 if let Some(ref mfp) = missing_from_parser {
701 let error_prefix = if self.warnings_are_errors {
702 ERROR
703 } else {
704 WARNING
705 };
706 let err_indent = " ".repeat(error_prefix.len());
707 let mut outs = Vec::new();
708 outs.push(format!("{error_prefix} these tokens are not referenced in the grammar but defined as follows"));
709 outs.push(format!(
710 "{err_indent} {}",
711 lex_diag.file_location_msg("in the lexer", None)
712 ));
713 for (_, span) in mfp {
714 let error_contents = lex_diag.underline_span_with_text(
715 *span,
716 "Missing from parser".to_string(),
717 '^',
718 );
719 outs.extend(error_contents.lines().map(|s| s.to_string()));
720 }
721
722 for s in outs {
723 if !self.warnings_are_errors && std::env::var("OUT_DIR").is_ok() {
724 println!("cargo:warning={}", s)
725 } else {
726 eprintln!("{}", s);
727 }
728 }
729
730 has_unallowed_missing |= self.warnings_are_errors;
731 }
732 }
733 if has_unallowed_missing {
734 fs::remove_file(outp).ok();
735 panic!();
736 }
737
738 let mod_name = match self.mod_name {
739 Some(s) => s.to_owned(),
740 None => {
741 let mut stem = lexerp.to_str().unwrap();
746 loop {
747 let new_stem = Path::new(stem).file_stem().unwrap().to_str().unwrap();
748 if stem == new_stem {
749 break;
750 }
751 stem = new_stem;
752 }
753 format!("{}_l", stem)
754 }
755 };
756 let mod_name =
757 match syn::parse_str::<proc_macro2::Ident>(&mod_name) {
758 Ok(s) => s,
759 Err(e) => return Err(format!(
760 "CTLexerBuilder::mod_name(\"{}\") is not a valid rust identifier due to '{}'",
761 mod_name, e
762 )
763 .into()),
764 };
765 let mut lexerdef_func_impl = {
766 let LexFlags {
767 allow_wholeline_comments,
768 dot_matches_new_line,
769 multi_line,
770 octal,
771 posix_escapes,
772 case_insensitive,
773 unicode,
774 swap_greed,
775 ignore_whitespace,
776 size_limit,
777 dfa_size_limit,
778 nest_limit,
779 } = lex_flags;
780 let allow_wholeline_comments = QuoteOption(allow_wholeline_comments);
781 let dot_matches_new_line = QuoteOption(dot_matches_new_line);
782 let multi_line = QuoteOption(multi_line);
783 let octal = QuoteOption(octal);
784 let posix_escapes = QuoteOption(posix_escapes);
785 let case_insensitive = QuoteOption(case_insensitive);
786 let unicode = QuoteOption(unicode);
787 let swap_greed = QuoteOption(swap_greed);
788 let ignore_whitespace = QuoteOption(ignore_whitespace);
789 let size_limit = QuoteOption(size_limit);
790 let dfa_size_limit = QuoteOption(dfa_size_limit);
791 let nest_limit = QuoteOption(nest_limit);
792
793 quote! {
795 let mut lex_flags = ::lrlex::DEFAULT_LEX_FLAGS;
796 lex_flags.allow_wholeline_comments = #allow_wholeline_comments.or(::lrlex::DEFAULT_LEX_FLAGS.allow_wholeline_comments);
797 lex_flags.dot_matches_new_line = #dot_matches_new_line.or(::lrlex::DEFAULT_LEX_FLAGS.dot_matches_new_line);
798 lex_flags.multi_line = #multi_line.or(::lrlex::DEFAULT_LEX_FLAGS.multi_line);
799 lex_flags.octal = #octal.or(::lrlex::DEFAULT_LEX_FLAGS.octal);
800 lex_flags.posix_escapes = #posix_escapes.or(::lrlex::DEFAULT_LEX_FLAGS.posix_escapes);
801 lex_flags.case_insensitive = #case_insensitive.or(::lrlex::DEFAULT_LEX_FLAGS.case_insensitive);
802 lex_flags.unicode = #unicode.or(::lrlex::DEFAULT_LEX_FLAGS.unicode);
803 lex_flags.swap_greed = #swap_greed.or(::lrlex::DEFAULT_LEX_FLAGS.swap_greed);
804 lex_flags.ignore_whitespace = #ignore_whitespace.or(::lrlex::DEFAULT_LEX_FLAGS.ignore_whitespace);
805 lex_flags.size_limit = #size_limit.or(::lrlex::DEFAULT_LEX_FLAGS.size_limit);
806 lex_flags.dfa_size_limit = #dfa_size_limit.or(::lrlex::DEFAULT_LEX_FLAGS.dfa_size_limit);
807 lex_flags.nest_limit = #nest_limit.or(::lrlex::DEFAULT_LEX_FLAGS.nest_limit);
808 let lex_flags = lex_flags;
809 }
810 };
811 {
812 let start_states = lexerdef.iter_start_states();
813 let rules = lexerdef.iter_rules().map(|r| {
814 let tok_id = QuoteOption(r.tok_id);
815 let n = QuoteOption(r.name().map(QuoteToString));
816 let target_state =
817 QuoteOption(r.target_state().map(|(x, y)| QuoteTuple((x, y))));
818 let n_span = r.name_span();
819 let regex = QuoteToString(&r.re_str);
820 let start_states = r.start_states();
821 quote! {
826 Rule::new(::lrlex::unstable_api::InternalPublicApi, #tok_id, #n, #n_span, #regex,
827 vec![#(#start_states),*], #target_state, &lex_flags).unwrap()
828 }
829 });
830 lexerdef_func_impl.append_all(quote! {
832 let start_states: Vec<StartState> = vec![#(#start_states),*];
833 let rules = vec![#(#rules),*];
834 });
835 }
836 let lexerdef_ty = match lexerkind {
837 LexerKind::LRNonStreamingLexer => {
838 quote!(::lrlex::LRNonStreamingLexerDef)
839 }
840 };
841 lexerdef_func_impl.append_all(quote! {
843 #lexerdef_ty::from_rules(start_states, rules)
844 });
845
846 let mut token_consts = TokenStream::new();
847 if let Some(rim) = self.rule_ids_map {
848 let mut rim_sorted = Vec::from_iter(rim.iter());
849 rim_sorted.sort_by_key(|(k, _)| *k);
850 for (name, id) in rim_sorted {
851 if RE_TOKEN_ID.is_match(name) {
852 let tok_ident = format_ident!("N_{}", name.to_ascii_uppercase());
853 let storaget =
854 str::parse::<TokenStream>(type_name::<LexerTypesT::StorageT>()).unwrap();
855 let tok_const = quote! {
857 #[allow(dead_code)]
858 pub const #tok_ident: #storaget = #id;
859 };
860 token_consts.extend(tok_const)
861 }
862 }
863 }
864 let token_consts = token_consts.into_iter();
865 let out_tokens = {
866 let lexerdef_param = str::parse::<TokenStream>(type_name::<LexerTypesT>()).unwrap();
867 let mod_vis = self.visibility;
868 quote! {
870 #mod_vis mod #mod_name {
871 use ::lrlex::{LexerDef, Rule, StartState};
872 #[allow(dead_code)]
873 pub fn lexerdef() -> #lexerdef_ty<#lexerdef_param> {
874 #lexerdef_func_impl
875 }
876
877 #(#token_consts)*
878 }
879 }
880 };
881 let unformatted = out_tokens.to_string();
883 let mut outs = String::new();
884 let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
888 write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
889 outs.push_str(
890 &syn::parse_str(&unformatted)
891 .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
892 .unwrap_or(unformatted),
893 );
894 if let Ok(curs) = read_to_string(outp) {
898 if curs == outs {
899 return Ok(CTLexer {
900 missing_from_lexer,
901 missing_from_parser,
902 });
903 }
904 }
905 let mut f = File::create(outp)?;
906 f.write_all(outs.as_bytes())?;
907 Ok(CTLexer {
908 missing_from_lexer,
909 missing_from_parser,
910 })
911 }
912
913 #[deprecated(
920 since = "0.11.0",
921 note = "Please use lexer_in_src_dir() and build() instead"
922 )]
923 #[allow(deprecated)]
924 pub fn process_file_in_src(
925 self,
926 srcp: &str,
927 ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>> {
928 let mut inp = current_dir()?;
929 inp.push("src");
930 inp.push(srcp);
931 let mut outp = PathBuf::new();
932 outp.push(var("OUT_DIR").unwrap());
933 outp.push(Path::new(srcp).parent().unwrap().to_str().unwrap());
934 create_dir_all(&outp)?;
935 let mut leaf = Path::new(srcp)
936 .file_name()
937 .unwrap()
938 .to_str()
939 .unwrap()
940 .to_owned();
941 write!(leaf, ".{}", RUST_FILE_EXT).ok();
942 outp.push(leaf);
943 self.process_file(inp, outp)
944 }
945
946 #[deprecated(
964 since = "0.11.0",
965 note = "Please use lexer_in_src_dir() and build() instead"
966 )]
967 pub fn process_file<P, Q>(
968 mut self,
969 inp: P,
970 outp: Q,
971 ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>>
972 where
973 P: AsRef<Path>,
974 Q: AsRef<Path>,
975 {
976 self.lexer_path = Some(inp.as_ref().to_owned());
977 self.output_path = Some(outp.as_ref().to_owned());
978 let cl = self.build()?;
979 Ok((
980 cl.missing_from_lexer().map(|x| x.to_owned()),
981 cl.missing_from_parser()
982 .map(|x| x.iter().map(|(n, _)| n.to_owned()).collect::<HashSet<_>>()),
983 ))
984 }
985
986 pub fn allow_missing_terms_in_lexer(mut self, allow: bool) -> Self {
989 self.allow_missing_terms_in_lexer = allow;
990 self
991 }
992
993 pub fn allow_missing_tokens_in_parser(mut self, allow: bool) -> Self {
997 self.allow_missing_tokens_in_parser = allow;
998 self
999 }
1000
1001 pub fn warnings_are_errors(mut self, flag: bool) -> Self {
1004 self.warnings_are_errors = flag;
1005 self
1006 }
1007
1008 pub fn show_warnings(mut self, flag: bool) -> Self {
1011 self.show_warnings = flag;
1012 self
1013 }
1014
1015 pub fn allow_wholeline_comments(mut self, flag: bool) -> Self {
1023 let key = "allow_wholeline_comments".to_string();
1024 self.header.insert(
1025 key,
1026 HeaderValue(
1027 Location::Other("CTLexerBuilder".to_string()),
1028 Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1029 ),
1030 );
1031 self
1032 }
1033
1034 pub fn dot_matches_new_line(mut self, flag: bool) -> Self {
1039 let key = "dot_matches_new_line".to_string();
1040 self.header.insert(
1041 key,
1042 HeaderValue(
1043 Location::Other("CTLexerBuilder".to_string()),
1044 Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1045 ),
1046 );
1047 self
1048 }
1049
1050 pub fn multi_line(mut self, flag: bool) -> Self {
1055 let key = "multi_line".to_string();
1056 self.header.insert(
1057 key,
1058 HeaderValue(
1059 Location::Other("CTLexerBuilder".to_string()),
1060 Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1061 ),
1062 );
1063 self
1064 }
1065
1066 pub fn posix_escapes(mut self, flag: bool) -> Self {
1071 let key = "posix_escapes".to_string();
1072 self.header.insert(
1073 key,
1074 HeaderValue(
1075 Location::Other("CTLexerBuilder".to_string()),
1076 Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1077 ),
1078 );
1079 self
1080 }
1081
1082 pub fn octal(mut self, flag: bool) -> Self {
1087 let key = "octal".to_string();
1088 self.header.insert(
1089 key,
1090 HeaderValue(
1091 Location::Other("CTLexerBuilder".to_string()),
1092 Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1093 ),
1094 );
1095 self
1096 }
1097
1098 pub fn swap_greed(mut self, flag: bool) -> Self {
1103 let key = "swap_greed".to_string();
1104 self.header.insert(
1105 key,
1106 HeaderValue(
1107 Location::Other("CTLexerBuilder".to_string()),
1108 Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1109 ),
1110 );
1111 self
1112 }
1113
1114 pub fn ignore_whitespace(mut self, flag: bool) -> Self {
1119 let key = "ignore_whitespace".to_string();
1120 self.header.insert(
1121 key,
1122 HeaderValue(
1123 Location::Other("CTLexerBuilder".to_string()),
1124 Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1125 ),
1126 );
1127 self
1128 }
1129
1130 pub fn unicode(mut self, flag: bool) -> Self {
1135 let key = "unicode".to_string();
1136 self.header.insert(
1137 key,
1138 HeaderValue(
1139 Location::Other("CTLexerBuilder".to_string()),
1140 Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1141 ),
1142 );
1143 self
1144 }
1145
1146 pub fn case_insensitive(mut self, flag: bool) -> Self {
1151 let key = "case_insensitive".to_string();
1152 self.header.insert(
1153 key,
1154 HeaderValue(
1155 Location::Other("CTLexerBuilder".to_string()),
1156 Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1157 ),
1158 );
1159 self
1160 }
1161
1162 pub fn size_limit(mut self, sz: usize) -> Self {
1167 let key = "size_limit".to_string();
1168 self.header.insert(
1169 key,
1170 HeaderValue(
1171 Location::Other("CTLexerBuilder".to_string()),
1172 Value::Setting(Setting::Num(
1173 sz as u64,
1174 Location::Other("CTLexerBuilder".to_string()),
1175 )),
1176 ),
1177 );
1178 self
1179 }
1180
1181 pub fn dfa_size_limit(mut self, sz: usize) -> Self {
1186 let key = "dfa_size_limit".to_string();
1187 self.header.insert(
1188 key,
1189 HeaderValue(
1190 Location::Other("CTLexerBuilder".to_string()),
1191 Value::Setting(Setting::Num(
1192 sz as u64,
1193 Location::Other("CTLexerBuilder".to_string()),
1194 )),
1195 ),
1196 );
1197 self
1198 }
1199
1200 pub fn nest_limit(mut self, lim: u32) -> Self {
1205 let key = "nest_limit".to_string();
1206 self.header.insert(
1207 key,
1208 HeaderValue(
1209 Location::Other("CTLexerBuilder".to_string()),
1210 Value::Setting(Setting::Num(
1211 lim as u64,
1212 Location::Other("CTLexerBuilder".to_string()),
1213 )),
1214 ),
1215 );
1216 self
1217 }
1218
1219 #[cfg(test)]
1220 pub fn inspect_lexerkind(
1221 mut self,
1222 cb: Box<dyn Fn(&LexerKind) -> Result<(), Box<dyn Error>>>,
1223 ) -> Self {
1224 self.inspect_lexerkind_cb = Some(cb);
1225 self
1226 }
1227}
1228
1229pub struct CTLexer {
1231 missing_from_lexer: Option<HashSet<String>>,
1232 missing_from_parser: Option<HashSet<(String, Span)>>,
1233}
1234
1235impl CTLexer {
1236 fn missing_from_lexer(&self) -> Option<&HashSet<String>> {
1237 self.missing_from_lexer.as_ref()
1238 }
1239
1240 fn missing_from_parser(&self) -> Option<&HashSet<(String, Span)>> {
1241 self.missing_from_parser.as_ref()
1242 }
1243}
1244
/// A builder that writes a Rust module of token-ID constants to `OUT_DIR`.
///
/// Each token becomes a `pub const T_<NAME>` (name upper-cased), and all IDs are collected in
/// a `TOK_IDS` slice.
#[derive(Debug, Clone)]
pub struct CTTokenMapBuilder<StorageT: Display + ToTokens> {
    /// Name of the generated module; also used as the output file's stem.
    mod_name: String,
    /// Token names paired with their IDs, already rendered as tokens.
    token_map: Vec<(String, TokenStream)>,
    /// Optional replacements for token names, applied before identifiers are formed.
    rename_map: Option<HashMap<String, String>>,
    /// Whether the generated module is annotated with `#[allow(dead_code)]`.
    allow_dead_code: bool,
    /// Ties the builder to the storage type without storing a value of it.
    _marker: PhantomData<StorageT>,
}
1275
1276impl<StorageT: Display + ToTokens> CTTokenMapBuilder<StorageT> {
1277 pub fn new(
1283 mod_name: impl Into<String>,
1284 token_map: impl Borrow<HashMap<String, StorageT>>,
1285 ) -> Self {
1286 Self {
1287 mod_name: mod_name.into(),
1288 token_map: token_map
1289 .borrow()
1290 .iter()
1291 .map(|(tok_name, tok_value)| (tok_name.clone(), tok_value.to_token_stream()))
1292 .collect(),
1293 rename_map: None,
1294 allow_dead_code: false,
1295 _marker: PhantomData,
1296 }
1297 }
1298
1299 pub fn rename_map<M, I, K, V>(mut self, rename_map: Option<M>) -> Self
1313 where
1314 M: IntoIterator<Item = I>,
1315 I: Borrow<(K, V)>,
1316 K: AsRef<str>,
1317 V: AsRef<str>,
1318 {
1319 self.rename_map = rename_map.map(|rename_map| {
1320 rename_map
1321 .into_iter()
1322 .map(|it| {
1323 let (k, v) = it.borrow();
1324 let k = k.as_ref().into();
1325 let v = v.as_ref().into();
1326 (k, v)
1327 })
1328 .collect()
1329 });
1330 self
1331 }
1332
1333 pub fn allow_dead_code(mut self, allow_dead_code: bool) -> Self {
1340 self.allow_dead_code = allow_dead_code;
1341 self
1342 }
1343
1344 pub fn build(&self) -> Result<(), Box<dyn Error>> {
1346 let mut outs = String::new();
1350 let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
1351 let mod_ident = format_ident!("{}", self.mod_name);
1352 write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
1353 let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
1354 let mut token_map_sorted = self.token_map.clone();
1357 token_map_sorted.sort_by(|(l, _), (r, _)| l.cmp(r));
1358 let (token_array, tokens) = token_map_sorted
1359 .iter()
1360 .map(|(k, id)| {
1361 let name = match &self.rename_map {
1362 Some(rmap) => rmap.get(k).unwrap_or(k),
1363 _ => k,
1364 };
1365 let tok_ident: Ident = syn::parse_str(&format!("T_{}", name.to_ascii_uppercase()))
1366 .map_err(|e| {
1367 format!(
1368 "token name {:?} is not a valid Rust identifier: {}; \
1369 consider renaming it via `CTTokenMapBuilder::rename_map`.",
1370 name, e
1371 )
1372 })?;
1373 Ok((
1374 quote! {
1378 #id,
1379 },
1380 quote! {
1381 pub const #tok_ident: #storaget = #id;
1382 },
1383 ))
1384 })
1385 .collect::<Result<(TokenStream, TokenStream), Box<dyn Error>>>()?;
1386 let unused_annotation = if self.allow_dead_code {
1387 quote! {#[allow(dead_code)]}
1388 } else {
1389 quote! {}
1390 };
1391 let unformatted = quote! {
1394 #unused_annotation
1395 mod #mod_ident {
1396 #tokens
1397 #[allow(dead_code)]
1398 pub const TOK_IDS: &[#storaget] = &[#token_array];
1399 }
1400 }
1401 .to_string();
1402 let out_mod = syn::parse_str(&unformatted)
1403 .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
1404 .unwrap_or(unformatted);
1405 outs.push_str(&out_mod);
1406 let mut outp = PathBuf::from(var("OUT_DIR")?);
1407 outp.push(&self.mod_name);
1408 outp.set_extension("rs");
1409
1410 if let Ok(curs) = read_to_string(&outp) {
1414 if curs == outs {
1415 return Ok(());
1416 }
1417 }
1418
1419 let mut f = File::create(outp)?;
1420 f.write_all(outs.as_bytes())?;
1421 Ok(())
1422 }
1423}
1424
1425#[deprecated(since = "0.14.0", note = "use `lrlex::CTTokenMapBuilder` instead")]
1430pub fn ct_token_map<StorageT: Display + ToTokens>(
1431 mod_name: &str,
1432 token_map: impl Borrow<HashMap<String, StorageT>>,
1433 rename_map: Option<&HashMap<&str, &str>>,
1434) -> Result<(), Box<dyn Error>> {
1435 CTTokenMapBuilder::new(mod_name, token_map)
1436 .rename_map(rename_map)
1437 .allow_dead_code(true)
1438 .build()
1439}
1440
/// Prefixes every line of `s` with `indent`, after trimming trailing newlines from `s`.
///
/// The result always ends with a newline followed by `indent`, i.e. the line after the last
/// one is indented too.
fn indent(indent: &str, s: &str) -> String {
    let newline_with_indent = ["\n", indent].concat();
    let mut text = String::with_capacity(indent.len() + s.len() + 1);
    text.push_str(indent);
    text.push_str(s.trim_end_matches('\n'));
    text.push('\n');
    // Re-joining the pieces between newlines with "\n<indent>" indents every line break.
    text.split('\n')
        .collect::<Vec<_>>()
        .join(&newline_with_indent)
}
1454
// Tests for `CTLexerBuilder`; compiled only for test builds on non-wasm32 targets.
#[cfg(all(not(target_arch = "wasm32"), test))]
mod test {
    use std::fs::File;
    use std::io::Write;

    use super::{CTLexerBuilder, LexerKind};
    /// Every accepted spelling of the `lexerkind` header value (with or without the
    /// `LexerKind` namespace, in differing case) must resolve to
    /// `LexerKind::LRNonStreamingLexer`.
    #[test]
    fn test_grmtools_section_lexerkind() {
        let lexerkinds = [
            "LRNonStreamingLexer",
            "lrnonstreaminglexer",
            "LexerKind::lrnonstreaminglexer",
            "lexerkind::LRNonStreamingLexer",
        ];
        for (i, kind) in lexerkinds.iter().enumerate() {
            let lex_src = format!(
                "
%grmtools{{lexerkind: {}}}
%%
. ;
",
                kind
            );
            // Each case gets its own file, since two builds to one output path are rejected.
            let lex_path = format!(
                "{}/test_grmtools_section_lexerkind_{}.l",
                env!("OUT_DIR"),
                i
            );
            let mut l_file = File::create(lex_path.clone()).unwrap();
            l_file.write_all(lex_src.as_bytes()).unwrap();
            CTLexerBuilder::new()
                .output_path(format!("{}.rs", lex_path.clone()))
                .lexer_path(lex_path.clone())
                .inspect_lexerkind(Box::new(move |lexerkind| {
                    assert!(matches!(lexerkind, &LexerKind::LRNonStreamingLexer));
                    Ok(())
                }))
                .build()
                .unwrap();
        }
    }

    /// A lexer file whose name contains a dash yields a derived module name that is not a
    /// valid identifier; `build` must report that as an error rather than succeed.
    #[test]
    fn test_invalid_identifier_in_derived_mod_name() {
        let mut lex_path = std::path::PathBuf::from(env!("OUT_DIR"));
        lex_path.push("contains-a-dash.l");
        let mut f = File::create(&lex_path).unwrap();
        let _ = f.write_all(
            r#"
%%
A "A"
"#
            .as_bytes(),
        );
        match CTLexerBuilder::new()
            .output_path(format!("{}.rs", lex_path.display()))
            .lexer_path(lex_path.clone())
            .build()
        {
            Ok(_) => panic!("Expected error"),
            Err(e) => {
                let err_string = e.to_string();
                assert_eq!(
                    err_string,
                    "CTLexerBuilder::mod_name(\"contains-a-dash_l\") is not a valid rust identifier due to 'unexpected token'"
                );
            }
        }
    }
}