use bincode::Encode;
use cfgrammar::{
    header::{
        GrmtoolsSectionParser, Header, HeaderError, HeaderErrorKind, HeaderValue, Namespaced,
        Setting, Value,
    },
    markmap::MergeBehavior,
    span::{Location, Span},
};
use glob::glob;
use lrpar::{
    CTParserBuilder, LexerTypes,
    diagnostics::{DiagnosticFormatter, SpannedDiagnosticFormatter},
};
use num_traits::{AsPrimitive, PrimInt, Unsigned};
use proc_macro2::{Ident, TokenStream};
use quote::{ToTokens, TokenStreamExt, format_ident, quote};
use regex::Regex;
use std::marker::PhantomData;
use std::{
    any::type_name,
    borrow::Borrow,
    collections::{HashMap, HashSet},
    env::{current_dir, var},
    error::Error,
    fmt::{self, Debug, Display, Write as _},
    fs::{self, File, create_dir_all, read_to_string},
    hash::Hash,
    io::Write,
    path::{Path, PathBuf},
    sync::{LazyLock, Mutex},
};

use crate::{DefaultLexerTypes, LRNonStreamingLexer, LRNonStreamingLexerDef, LexFlags, LexerDef};

const RUST_FILE_EXT: &str = "rs";

const ERROR: &str = "[Error]";
const WARNING: &str = "[Warning]";

static RE_TOKEN_ID: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^[a-zA-Z_][a-zA-Z_0-9]*$").unwrap());

static GENERATED_PATHS: LazyLock<Mutex<HashSet<PathBuf>>> =
    LazyLock::new(|| Mutex::new(HashSet::new()));

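/// The kind of lexer to generate. At present the only variant is
/// `LRNonStreamingLexer`, a lexer which operates on an input held entirely in
/// memory.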
#[non_exhaustive]
pub enum LexerKind {
    LRNonStreamingLexer,
}

impl<T: Clone> TryFrom<&Value<T>> for LexerKind {
    type Error = cfgrammar::header::HeaderError<T>;
    fn try_from(it: &Value<T>) -> Result<LexerKind, Self::Error> {
        match it {
            Value::Flag(_, loc) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind`, found bool",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Num(_, loc)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind`, found numeric",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::String(_, loc)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind`, found string",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Constructor {
                ctor:
                    Namespaced {
                        namespace: _,
                        member: (_, loc),
                    },
                arg: _,
            }) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind`, found constructor",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Unitary(Namespaced {
                namespace,
                member: (member, member_loc),
            })) => {
                if let Some((ns, loc)) = namespace {
                    if ns.to_lowercase() != "lexerkind" {
                        return Err(HeaderError {
                            kind: HeaderErrorKind::ConversionError(
                                "LexerKind",
                                "Expected namespace `LexerKind`",
                            ),
                            locations: vec![loc.clone()],
                        });
                    }
                }
                if member.to_lowercase() != "lrnonstreaminglexer" {
                    return Err(HeaderError {
                        kind: HeaderErrorKind::ConversionError(
                            "LexerKind",
                            "Unknown `LexerKind` variant",
                        ),
                        locations: vec![member_loc.clone()],
                    });
                }

                Ok(LexerKind::LRNonStreamingLexer)
            }
        }
    }
}

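/// The Rust visibility the generated module is given: each variant corresponds
/// to one of Rust's visibility modifiers (`Private` emits no modifier at all,
/// while e.g. `PublicIn(p)` emits `pub(in p)`).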
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum Visibility {
    Private,
    Public,
    PublicSuper,
    PublicSelf,
    PublicCrate,
    PublicIn(String),
}

impl ToTokens for Visibility {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        tokens.extend(match self {
            Visibility::Private => quote!(),
            Visibility::Public => quote! {pub},
            Visibility::PublicSuper => quote! {pub(super)},
            Visibility::PublicSelf => quote! {pub(self)},
            Visibility::PublicCrate => quote! {pub(crate)},
            Visibility::PublicIn(data) => {
                let other = str::parse::<TokenStream>(data).unwrap();
                quote! {pub(in #other)}
            }
        })
    }
}

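/// The particular Rust edition that generated code must be compatible with.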
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum RustEdition {
    Rust2015,
    Rust2018,
    Rust2021,
}

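/// Wraps an `Option` so that it quotes to a fully qualified
/// `::std::option::Option::{Some, None}`, avoiding any reliance on the prelude
/// in generated code.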
struct QuoteOption<T>(Option<T>);

impl<T: ToTokens> ToTokens for QuoteOption<T> {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        tokens.append_all(match self.0 {
            Some(ref t) => quote! { ::std::option::Option::Some(#t) },
            None => quote! { ::std::option::Option::None },
        });
    }
}

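/// Wraps a two-element tuple so that it quotes to the literal tuple expression
/// `(a, b)`.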
struct QuoteTuple<T>(T);

impl<A: ToTokens, B: ToTokens> ToTokens for QuoteTuple<(A, B)> {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        let (a, b) = &self.0;
        tokens.append_all(quote!((#a, #b)));
    }
}

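/// Wraps a `&str` so that it quotes to an owned-`String`-producing expression.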
struct QuoteToString<'a>(&'a str);

impl ToTokens for QuoteToString<'_> {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        let x = &self.0;
        tokens.append_all(quote! { #x.to_string() });
    }
}

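/// A pre-formatted (possibly multi-line) error message which implements
/// [`Error`] and displays verbatim, without further decoration.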
struct ErrorString(String);
impl fmt::Display for ErrorString {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let ErrorString(s) = self;
        write!(f, "{}", s)
    }
}
impl fmt::Debug for ErrorString {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let ErrorString(s) = self;
        write!(f, "{}", s)
    }
}
impl Error for ErrorString {}

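/// A `CTLexerBuilder` builds a lexer at compile time from a `.l` file,
/// generating Rust source that can then be included with lrlex's `lrlex_mod!`
/// macro. A minimal `build.rs` sketch (the file name `calc.l`, which must live
/// under `src/`, is illustrative):
///
/// ```text
/// CTLexerBuilder::new()
///     .lexer_in_src_dir("calc.l")?
///     .build()?;
/// ```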
pub struct CTLexerBuilder<'a, LexerTypesT: LexerTypes = DefaultLexerTypes<u32>>
where
    LexerTypesT::StorageT: Debug + Eq + Hash + ToTokens,
    usize: num_traits::AsPrimitive<LexerTypesT::StorageT>,
{
    lrpar_config: Option<Box<dyn Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT>>>,
    lexer_path: Option<PathBuf>,
    output_path: Option<PathBuf>,
    lexerkind: Option<LexerKind>,
    mod_name: Option<&'a str>,
    visibility: Visibility,
    rust_edition: RustEdition,
    rule_ids_map: Option<HashMap<String, LexerTypesT::StorageT>>,
    allow_missing_terms_in_lexer: bool,
    allow_missing_tokens_in_parser: bool,
    warnings_are_errors: bool,
    show_warnings: bool,
    header: Header<Location>,
    #[cfg(test)]
    inspect_lexerkind_cb: Option<Box<dyn Fn(LexerKind) -> Result<(), Box<dyn Error>>>>,
}

impl CTLexerBuilder<'_, DefaultLexerTypes<u32>> {
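    /// Create a new [CTLexerBuilder] using the default lexeme/token ID storage
    /// type, `u32`.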
    pub fn new() -> Self {
        CTLexerBuilder::<DefaultLexerTypes<u32>>::new_with_lexemet()
    }
}

impl<'a, LexerTypesT: LexerTypes<LexErrorT = crate::LRLexError> + 'static>
    CTLexerBuilder<'a, LexerTypesT>
where
    LexerTypesT::StorageT:
        'static + Debug + Eq + Hash + PrimInt + Encode + TryFrom<usize> + Unsigned + ToTokens,
    usize: AsPrimitive<LexerTypesT::StorageT>,
{
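    /// Create a new [CTLexerBuilder] where lexeme/token IDs are stored as
    /// `LexerTypesT::StorageT`, which must be an unsigned integer type wide
    /// enough to hold an ID for every token.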
    pub fn new_with_lexemet() -> Self {
        let mut header = Header::new();
        header.set_default_merge_behavior(MergeBehavior::Ours);
        CTLexerBuilder {
            lrpar_config: None,
            lexer_path: None,
            output_path: None,
            lexerkind: None,
            mod_name: None,
            visibility: Visibility::Private,
            rust_edition: RustEdition::Rust2021,
            rule_ids_map: None,
            allow_missing_terms_in_lexer: false,
            allow_missing_tokens_in_parser: false,
            warnings_are_errors: false,
            show_warnings: true,
            header,
            #[cfg(test)]
            inspect_lexerkind_cb: None,
        }
    }

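    /// Set a function which will be used to configure an lrpar
    /// [CTParserBuilder]: building this lexer will then also build the
    /// corresponding parser, and the parser's token map is fed back into the
    /// lexer automatically.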
    pub fn lrpar_config<F>(mut self, config_func: F) -> Self
    where
        F: 'static + Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT>,
    {
        self.lrpar_config = Some(Box::new(config_func));
        self
    }

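    /// Set the input lexer path to a file relative to this project's `src`
    /// directory, deriving the output path from it: an input of `a/b/c.l` maps
    /// to `$OUT_DIR/a/b/c.l.rs`. Errors if `srcp` is not a relative path.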
    pub fn lexer_in_src_dir<P>(mut self, srcp: P) -> Result<Self, Box<dyn Error>>
    where
        P: AsRef<Path>,
    {
        if !srcp.as_ref().is_relative() {
            return Err(format!(
                "Lexer path '{}' must be a relative path.",
                srcp.as_ref().to_str().unwrap_or("<invalid UTF-8>")
            )
            .into());
        }

        let mut lexp = current_dir()?;
        lexp.push("src");
        lexp.push(srcp.as_ref());
        self.lexer_path = Some(lexp);

        let mut outp = PathBuf::new();
        outp.push(var("OUT_DIR").unwrap());
        outp.push(srcp.as_ref().parent().unwrap().to_str().unwrap());
        create_dir_all(&outp)?;
        let mut leaf = srcp
            .as_ref()
            .file_name()
            .unwrap()
            .to_str()
            .unwrap()
            .to_owned();
        write!(leaf, ".{}", RUST_FILE_EXT).ok();
        outp.push(leaf);
        Ok(self.output_path(outp))
    }

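    /// Set the input lexer path to an arbitrary file path. In most cases
    /// [`lexer_in_src_dir`](Self::lexer_in_src_dir) is the more convenient
    /// choice.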
    pub fn lexer_path<P>(mut self, inp: P) -> Self
    where
        P: AsRef<Path>,
    {
        self.lexer_path = Some(inp.as_ref().to_owned());
        self
    }

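    /// Set the path the generated Rust source will be written to.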
    pub fn output_path<P>(mut self, outp: P) -> Self
    where
        P: AsRef<Path>,
    {
        self.output_path = Some(outp.as_ref().to_owned());
        self
    }

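    /// Set the kind of lexer to be generated, taking precedence over any
    /// `lexerkind` setting in the lexer file's `%grmtools` section.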
    pub fn lexerkind(mut self, lexerkind: LexerKind) -> Self {
        self.lexerkind = Some(lexerkind);
        self
    }

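    /// Set the name of the generated module. If not set, the module is named
    /// `<filestem>_l`, where `<filestem>` is the lexer file's name with all
    /// extensions removed.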
    pub fn mod_name(mut self, mod_name: &'a str) -> Self {
        self.mod_name = Some(mod_name);
        self
    }

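    /// Set the visibility of the generated module (default:
    /// [Visibility::Private]).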
    pub fn visibility(mut self, vis: Visibility) -> Self {
        self.visibility = vis;
        self
    }

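    /// Set the Rust edition the generated code must be compilable with
    /// (default: [RustEdition::Rust2021]).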
    pub fn rust_edition(mut self, edition: RustEdition) -> Self {
        self.rust_edition = edition;
        self
    }

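    /// Set this lexer's rule IDs from a map of token names to IDs, as produced
    /// by lrpar's token map. Note that [`lrpar_config`](Self::lrpar_config)
    /// sets this automatically.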
    pub fn rule_ids_map<T: std::borrow::Borrow<HashMap<String, LexerTypesT::StorageT>> + Clone>(
        mut self,
        rule_ids_map: T,
    ) -> Self {
        self.rule_ids_map = Some(rule_ids_map.borrow().to_owned());
        self
    }

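    /// Statically compile the lexer file set by
    /// [`lexer_path`](Self::lexer_path) or
    /// [`lexer_in_src_dir`](Self::lexer_in_src_dir) into Rust, writing the
    /// result to the configured output path. The generated module exposes a
    /// single public function, `lexerdef()`, returning the compiled lexer
    /// definition, plus one `N_<TOKEN>` constant per token shared with the
    /// parser. Returns a [CTLexer] recording any tokens missing from the lexer
    /// and/or parser.
    ///
    /// # Panics
    ///
    /// If `lexer_path` or `output_path` have not been set, or if tokens are
    /// missing and the relevant `allow_missing_*` options do not permit that.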
    pub fn build(mut self) -> Result<CTLexer, Box<dyn Error>> {
        let lexerp = self
            .lexer_path
            .as_ref()
            .expect("lexer_path must be specified before processing.");
        let outp = self
            .output_path
            .as_ref()
            .expect("output_path must be specified before processing.");

        {
            let mut lk = GENERATED_PATHS.lock().unwrap();
            if lk.contains(outp.as_path()) {
                return Err(format!("Generating two lexers to the same path ('{}') is not allowed: use CTLexerBuilder::output_path (and, optionally, CTLexerBuilder::mod_name) to differentiate them.", &outp.to_str().unwrap()).into());
            }
            lk.insert(outp.clone());
        }
        let lex_src = read_to_string(lexerp)
            .map_err(|e| format!("When reading '{}': {e}", lexerp.display()))?;
        let lex_diag = SpannedDiagnosticFormatter::new(&lex_src, lexerp);
        let mut header = self.header;
        let (parsed_header, _) = GrmtoolsSectionParser::new(&lex_src, false)
            .parse()
            .map_err(|es| {
                let mut out = String::new();
                out.push_str(&format!(
                    "\n{ERROR}{}\n",
                    lex_diag.file_location_msg(" parsing the `%grmtools` section", None)
                ));
                for e in es {
                    out.push_str(&indent(" ", &lex_diag.format_error(e).to_string()));
                    out.push('\n');
                }
                ErrorString(out)
            })?;
        header.merge_from(parsed_header)?;
        header.mark_used(&"lexerkind".to_string());
        let lexerkind = match self.lexerkind {
            Some(lexerkind) => lexerkind,
            None => {
                if let Some(HeaderValue(_, lk_val)) = header.get("lexerkind") {
                    LexerKind::try_from(lk_val)?
                } else {
                    LexerKind::LRNonStreamingLexer
                }
            }
        };
        #[cfg(test)]
        if let Some(inspect_lexerkind_cb) = self.inspect_lexerkind_cb {
            inspect_lexerkind_cb(lexerkind)?
        }
        let (lexerdef, lex_flags): (LRNonStreamingLexerDef<LexerTypesT>, LexFlags) =
            match lexerkind {
                LexerKind::LRNonStreamingLexer => {
                    let lex_flags = LexFlags::try_from(&mut header)?;
                    let lexerdef = LRNonStreamingLexerDef::<LexerTypesT>::new_with_options(
                        &lex_src, lex_flags,
                    )
                    .map_err(|errs| {
                        let mut out = String::new();
                        out.push_str(&format!(
                            "\n{ERROR}{}\n",
                            lex_diag.file_location_msg("", None)
                        ));
                        for e in errs {
                            out.push_str(&indent(" ", &lex_diag.format_error(e).to_string()));
                            out.push('\n');
                        }
                        ErrorString(out)
                    })?;
                    let lex_flags = lexerdef.lex_flags().cloned();
                    (lexerdef, lex_flags.unwrap())
                }
            };

        let ct_parser = if let Some(ref lrcfg) = self.lrpar_config {
            let mut closure_lexerdef = lexerdef.clone();
            let mut ctp = CTParserBuilder::<LexerTypesT>::new().inspect_rt(Box::new(
                move |yacc_header, rtpb, rule_ids_map, grm_path| {
                    let owned_map = rule_ids_map
                        .iter()
                        .map(|(x, y)| (&**x, *y))
                        .collect::<HashMap<_, _>>();
                    closure_lexerdef.set_rule_ids(&owned_map);
                    yacc_header.mark_used(&"test_files".to_string());
                    let test_glob = yacc_header.get("test_files");
                    match test_glob {
                        Some(HeaderValue(_, Value::Setting(Setting::String(test_files, _)))) => {
                            let path_joined = grm_path.parent().unwrap().join(test_files);
                            for path in
                                glob(&path_joined.to_string_lossy()).map_err(|e| e.to_string())?
                            {
                                let path = path?;
                                if let Some(ext) = path.extension() {
                                    if let Some(ext) = ext.to_str() {
                                        if ext.starts_with("grm") {
                                            Err(ErrorString("test_files extensions beginning with `grm` are reserved.".into()))?
                                        }
                                    }
                                }
                                let input = fs::read_to_string(&path)?;
                                let l: LRNonStreamingLexer<LexerTypesT> =
                                    closure_lexerdef.lexer(&input);
                                for e in rtpb.parse_map(&l, &|_| (), &|_, _| ()).1 {
                                    Err(format!("parsing {}: {}", path.display(), e))?
                                }
                            }
                            Ok(())
                        }
                        Some(_) => Err("Invalid value for setting 'test_files'".into()),
                        None => Ok(()),
                    }
                },
            ));
            ctp = lrcfg(ctp);
            let ct_parser = ctp.build()?;
            self.rule_ids_map = Some(ct_parser.token_map().to_owned());
            Some(ct_parser)
        } else {
            None
        };

        let mut lexerdef = Box::new(lexerdef);
        let unused_header_values = header.unused();
        if !unused_header_values.is_empty() {
            return Err(
                format!("Unused header values: {}", unused_header_values.join(", ")).into(),
            );
        }

        let (missing_from_lexer, missing_from_parser) = match self.rule_ids_map {
            Some(ref rim) => {
                let owned_map = rim
                    .iter()
                    .map(|(x, y)| (&**x, *y))
                    .collect::<HashMap<_, _>>();
                let (x, y) = lexerdef.set_rule_ids_spanned(&owned_map);
                (
                    x.map(|a| a.iter().map(|&b| b.to_string()).collect::<HashSet<_>>()),
                    y.map(|a| {
                        a.iter()
                            .map(|(b, span)| (b.to_string(), *span))
                            .collect::<HashSet<_>>()
                    }),
                )
            }
            None => (None, None),
        };

        let mut has_unallowed_missing = false;
        let err_indent = " ".repeat(ERROR.len());
        if !self.allow_missing_terms_in_lexer {
            if let Some(ref mfl) = missing_from_lexer {
                if let Some(ct_parser) = &ct_parser {
                    let grm = ct_parser.yacc_grammar();
                    let token_spans = mfl
                        .iter()
                        .map(|name| {
                            ct_parser
                                .yacc_grammar()
                                .token_span(*grm.tokens_map().get(name.as_str()).unwrap())
                                .expect("Given token should have a span")
                        })
                        .collect::<Vec<_>>();

                    let yacc_diag = SpannedDiagnosticFormatter::new(
                        ct_parser.grammar_src(),
                        ct_parser.grammar_path(),
                    );

                    eprintln!(
                        "{ERROR} these tokens are not referenced in the lexer but defined as follows"
                    );
                    eprintln!(
                        "{err_indent} {}",
                        yacc_diag.file_location_msg("in the grammar", None)
                    );
                    for span in token_spans {
                        eprintln!(
                            "{}",
                            yacc_diag.underline_span_with_text(
                                span,
                                "Missing from lexer".to_string(),
                                '^'
                            )
                        );
                    }
                    eprintln!();
                } else {
                    eprintln!(
                        "{ERROR} the following tokens are used in the grammar but are not defined in the lexer:"
                    );
                    for n in mfl {
                        eprintln!(" {}", n);
                    }
                }
                has_unallowed_missing = true;
            }
        }
        if !self.allow_missing_tokens_in_parser && self.show_warnings {
            if let Some(ref mfp) = missing_from_parser {
                let error_prefix = if self.warnings_are_errors {
                    ERROR
                } else {
                    WARNING
                };
                let err_indent = " ".repeat(error_prefix.len());
                let mut outs = Vec::new();
                outs.push(format!("{error_prefix} these tokens are not referenced in the grammar but defined as follows"));
                outs.push(format!(
                    "{err_indent} {}",
                    lex_diag.file_location_msg("in the lexer", None)
                ));
                for (_, span) in mfp {
                    let error_contents = lex_diag.underline_span_with_text(
                        *span,
                        "Missing from parser".to_string(),
                        '^',
                    );
                    outs.extend(error_contents.lines().map(|s| s.to_string()));
                }

                for s in outs {
                    if !self.warnings_are_errors && std::env::var("OUT_DIR").is_ok() {
                        println!("cargo:warning={}", s)
                    } else {
                        eprintln!("{}", s);
                    }
                }

                has_unallowed_missing |= self.warnings_are_errors;
            }
        }
        if has_unallowed_missing {
            fs::remove_file(outp).ok();
            panic!();
        }

        let mod_name = match self.mod_name {
            Some(s) => s.to_owned(),
            None => {
                let mut stem = lexerp.to_str().unwrap();
                loop {
                    let new_stem = Path::new(stem).file_stem().unwrap().to_str().unwrap();
                    if stem == new_stem {
                        break;
                    }
                    stem = new_stem;
                }
                format!("{}_l", stem)
            }
        };
        let mod_name = format_ident!("{}", mod_name);
        let mut lexerdef_func_impl = {
            let LexFlags {
                allow_wholeline_comments,
                dot_matches_new_line,
                multi_line,
                octal,
                posix_escapes,
                case_insensitive,
                unicode,
                swap_greed,
                ignore_whitespace,
                size_limit,
                dfa_size_limit,
                nest_limit,
            } = lex_flags;
            let allow_wholeline_comments = QuoteOption(allow_wholeline_comments);
            let dot_matches_new_line = QuoteOption(dot_matches_new_line);
            let multi_line = QuoteOption(multi_line);
            let octal = QuoteOption(octal);
            let posix_escapes = QuoteOption(posix_escapes);
            let case_insensitive = QuoteOption(case_insensitive);
            let unicode = QuoteOption(unicode);
            let swap_greed = QuoteOption(swap_greed);
            let ignore_whitespace = QuoteOption(ignore_whitespace);
            let size_limit = QuoteOption(size_limit);
            let dfa_size_limit = QuoteOption(dfa_size_limit);
            let nest_limit = QuoteOption(nest_limit);

            quote! {
                let mut lex_flags = ::lrlex::DEFAULT_LEX_FLAGS;
                lex_flags.allow_wholeline_comments = #allow_wholeline_comments.or(::lrlex::DEFAULT_LEX_FLAGS.allow_wholeline_comments);
                lex_flags.dot_matches_new_line = #dot_matches_new_line.or(::lrlex::DEFAULT_LEX_FLAGS.dot_matches_new_line);
                lex_flags.multi_line = #multi_line.or(::lrlex::DEFAULT_LEX_FLAGS.multi_line);
                lex_flags.octal = #octal.or(::lrlex::DEFAULT_LEX_FLAGS.octal);
                lex_flags.posix_escapes = #posix_escapes.or(::lrlex::DEFAULT_LEX_FLAGS.posix_escapes);
                lex_flags.case_insensitive = #case_insensitive.or(::lrlex::DEFAULT_LEX_FLAGS.case_insensitive);
                lex_flags.unicode = #unicode.or(::lrlex::DEFAULT_LEX_FLAGS.unicode);
                lex_flags.swap_greed = #swap_greed.or(::lrlex::DEFAULT_LEX_FLAGS.swap_greed);
                lex_flags.ignore_whitespace = #ignore_whitespace.or(::lrlex::DEFAULT_LEX_FLAGS.ignore_whitespace);
                lex_flags.size_limit = #size_limit.or(::lrlex::DEFAULT_LEX_FLAGS.size_limit);
                lex_flags.dfa_size_limit = #dfa_size_limit.or(::lrlex::DEFAULT_LEX_FLAGS.dfa_size_limit);
                lex_flags.nest_limit = #nest_limit.or(::lrlex::DEFAULT_LEX_FLAGS.nest_limit);
                let lex_flags = lex_flags;
            }
        };
        {
            let start_states = lexerdef.iter_start_states();
            let rules = lexerdef.iter_rules().map(|r| {
                let tok_id = QuoteOption(r.tok_id);
                let n = QuoteOption(r.name().map(QuoteToString));
                let target_state =
                    QuoteOption(r.target_state().map(|(x, y)| QuoteTuple((x, y))));
                let n_span = r.name_span();
                let regex = QuoteToString(&r.re_str);
                let start_states = r.start_states();
                quote! {
                    Rule::new(::lrlex::unstable_api::InternalPublicApi, #tok_id, #n, #n_span, #regex.to_string(),
                        vec![#(#start_states),*], #target_state, &lex_flags).unwrap()
                }
            });
            lexerdef_func_impl.append_all(quote! {
                let start_states: Vec<StartState> = vec![#(#start_states),*];
                let rules = vec![#(#rules),*];
            });
        }
        let lexerdef_ty = match lexerkind {
            LexerKind::LRNonStreamingLexer => {
                quote!(::lrlex::LRNonStreamingLexerDef)
            }
        };
        lexerdef_func_impl.append_all(quote! {
            #lexerdef_ty::from_rules(start_states, rules)
        });

        let mut token_consts = TokenStream::new();
        if let Some(rim) = self.rule_ids_map {
            let mut rim_sorted = Vec::from_iter(rim.iter());
            rim_sorted.sort_by_key(|(k, _)| *k);
            for (name, id) in rim_sorted {
                if RE_TOKEN_ID.is_match(name) {
                    let tok_ident = format_ident!("N_{}", name.to_ascii_uppercase());
                    let storaget =
                        str::parse::<TokenStream>(type_name::<LexerTypesT::StorageT>()).unwrap();
                    let tok_const = quote! {
                        #[allow(dead_code)]
                        pub const #tok_ident: #storaget = #id;
                    };
                    token_consts.extend(tok_const)
                }
            }
        }
        let token_consts = token_consts.into_iter();
        let out_tokens = {
            let lexerdef_param = str::parse::<TokenStream>(type_name::<LexerTypesT>()).unwrap();
            let mod_vis = self.visibility;
            quote! {
                #mod_vis mod #mod_name {
                    use ::lrlex::{LexerDef, Rule, StartState};
                    #[allow(dead_code)]
                    pub fn lexerdef() -> #lexerdef_ty<#lexerdef_param> {
                        #lexerdef_func_impl
                    }

                    #(#token_consts)*
                }
            }
        };
        let unformatted = out_tokens.to_string();
        let mut outs = String::new();
        let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
        write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
        outs.push_str(
            &syn::parse_str(&unformatted)
                .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
                .unwrap_or(unformatted),
        );
        if let Ok(curs) = read_to_string(outp) {
            if curs == outs {
                return Ok(CTLexer {
                    missing_from_lexer,
                    missing_from_parser,
                });
            }
        }
        let mut f = File::create(outp)?;
        f.write_all(outs.as_bytes())?;
        Ok(CTLexer {
            missing_from_lexer,
            missing_from_parser,
        })
    }

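    /// Given the filename `a.l`, statically compile `src/a.l` into Rust,
    /// placing the output in `$OUT_DIR/a.l.rs`.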
    #[deprecated(
        since = "0.11.0",
        note = "Please use lexer_in_src_dir() and build() instead"
    )]
    #[allow(deprecated)]
    pub fn process_file_in_src(
        self,
        srcp: &str,
    ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>> {
        let mut inp = current_dir()?;
        inp.push("src");
        inp.push(srcp);
        let mut outp = PathBuf::new();
        outp.push(var("OUT_DIR").unwrap());
        outp.push(Path::new(srcp).parent().unwrap().to_str().unwrap());
        create_dir_all(&outp)?;
        let mut leaf = Path::new(srcp)
            .file_name()
            .unwrap()
            .to_str()
            .unwrap()
            .to_owned();
        write!(leaf, ".{}", RUST_FILE_EXT).ok();
        outp.push(leaf);
        self.process_file(inp, outp)
    }

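    /// Statically compile the `.l` file `inp` into Rust, placing the output in
    /// `outp`. Returns the sets of token names missing from the lexer and from
    /// the parser, respectively.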
    #[deprecated(
        since = "0.11.0",
        note = "Please use lexer_in_src_dir() and build() instead"
    )]
    pub fn process_file<P, Q>(
        mut self,
        inp: P,
        outp: Q,
    ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>>
    where
        P: AsRef<Path>,
        Q: AsRef<Path>,
    {
        self.lexer_path = Some(inp.as_ref().to_owned());
        self.output_path = Some(outp.as_ref().to_owned());
        let cl = self.build()?;
        Ok((
            cl.missing_from_lexer().map(|x| x.to_owned()),
            cl.missing_from_parser()
                .map(|x| x.iter().map(|(n, _)| n.to_owned()).collect::<HashSet<_>>()),
        ))
    }

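    /// If set to true, tokens used in the grammar but not matched in the lexer
    /// do not cause [`build`](Self::build) to fail (default: false).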
    pub fn allow_missing_terms_in_lexer(mut self, allow: bool) -> Self {
        self.allow_missing_terms_in_lexer = allow;
        self
    }

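    /// If set to true, tokens defined in the lexer but not used in the grammar
    /// do not produce warnings or errors (default: false).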
    pub fn allow_missing_tokens_in_parser(mut self, allow: bool) -> Self {
        self.allow_missing_tokens_in_parser = allow;
        self
    }

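    /// If set to true, warnings are treated as errors (default: false).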
    pub fn warnings_are_errors(mut self, flag: bool) -> Self {
        self.warnings_are_errors = flag;
        self
    }

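    /// If set to true, warnings are printed, either to stderr or, when running
    /// under cargo (i.e. `OUT_DIR` is set), as `cargo:warning=` lines on stdout
    /// (default: true).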
    pub fn show_warnings(mut self, flag: bool) -> Self {
        self.show_warnings = flag;
        self
    }

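    /// If set to true, the lexer file may contain whole-line (`//`-style)
    /// comments. Builder-set flags take precedence over the equivalent
    /// `%grmtools` section settings.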
    pub fn allow_wholeline_comments(mut self, flag: bool) -> Self {
        let key = "allow_wholeline_comments".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

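    /// If set to true, `.` in lexer regexes also matches `\n` (mirrors the
    /// `regex` crate option of the same name).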
    pub fn dot_matches_new_line(mut self, flag: bool) -> Self {
        let key = "dot_matches_new_line".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

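    /// If set to true, `^` and `$` in lexer regexes match at line boundaries
    /// rather than only at the start/end of the input.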
    pub fn multi_line(mut self, flag: bool) -> Self {
        let key = "multi_line".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

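    /// If set to true, POSIX lex-style escape sequences are recognised in
    /// lexer regexes.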
    pub fn posix_escapes(mut self, flag: bool) -> Self {
        let key = "posix_escapes".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

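    /// If set to true, octal escape sequences (e.g. `\123`) are allowed in
    /// lexer regexes.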
    pub fn octal(mut self, flag: bool) -> Self {
        let key = "octal".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

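    /// If set to true, the meanings of greedy (e.g. `a*`) and lazy (e.g.
    /// `a*?`) repetition operators are swapped.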
    pub fn swap_greed(mut self, flag: bool) -> Self {
        let key = "swap_greed".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

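    /// If set to true, whitespace in lexer regexes is ignored unless escaped
    /// or inside a character class.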
    pub fn ignore_whitespace(mut self, flag: bool) -> Self {
        let key = "ignore_whitespace".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

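    /// If set to true, lexer regexes match with full Unicode semantics.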
    pub fn unicode(mut self, flag: bool) -> Self {
        let key = "unicode".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

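    /// If set to true, lexer regexes match case-insensitively.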
    pub fn case_insensitive(mut self, flag: bool) -> Self {
        let key = "case_insensitive".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

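    /// Set the approximate size limit, in bytes, of each compiled regex.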
    pub fn size_limit(mut self, sz: usize) -> Self {
        let key = "size_limit".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Setting(Setting::Num(
                    sz as u64,
                    Location::Other("CTLexerBuilder".to_string()),
                )),
            ),
        );
        self
    }

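    /// Set the approximate capacity, in bytes, of the cache used by each
    /// regex's lazy DFA.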
    pub fn dfa_size_limit(mut self, sz: usize) -> Self {
        let key = "dfa_size_limit".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Setting(Setting::Num(
                    sz as u64,
                    Location::Other("CTLexerBuilder".to_string()),
                )),
            ),
        );
        self
    }

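    /// Set the maximum permitted nesting depth of a parsed regex.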
    pub fn nest_limit(mut self, lim: u32) -> Self {
        let key = "nest_limit".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Setting(Setting::Num(
                    lim as u64,
                    Location::Other("CTLexerBuilder".to_string()),
                )),
            ),
        );
        self
    }

    #[cfg(test)]
    pub fn inspect_lexerkind(
        mut self,
        cb: Box<dyn Fn(LexerKind) -> Result<(), Box<dyn Error>>>,
    ) -> Self {
        self.inspect_lexerkind_cb = Some(cb);
        self
    }
}

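/// An interface to the result of [CTLexerBuilder::build()], recording which
/// tokens (if any) were missing from the lexer and which were missing from the
/// parser.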
pub struct CTLexer {
    missing_from_lexer: Option<HashSet<String>>,
    missing_from_parser: Option<HashSet<(String, Span)>>,
}

impl CTLexer {
    fn missing_from_lexer(&self) -> Option<&HashSet<String>> {
        self.missing_from_lexer.as_ref()
    }

    fn missing_from_parser(&self) -> Option<&HashSet<(String, Span)>> {
        self.missing_from_parser.as_ref()
    }
}

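/// A `CTTokenMapBuilder` generates, at compile time, a module mapping token
/// names to token IDs, which is useful when hand-writing a lexer against an
/// lrpar-generated parser. The module is written to `$OUT_DIR/<mod_name>.rs`
/// and contains one `T_<NAME>` constant per token plus a `TOK_IDS` array of
/// all IDs. A minimal sketch (the module name and rename pair are
/// illustrative):
///
/// ```text
/// CTTokenMapBuilder::<u8>::new("token_map", &token_map)
///     .rename_map(Some(&[("+", "PLUS")]))
///     .build()?;
/// ```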
#[derive(Debug, Clone)]
pub struct CTTokenMapBuilder<StorageT: Display + ToTokens> {
    mod_name: String,
    token_map: Vec<(String, TokenStream)>,
    rename_map: Option<HashMap<String, String>>,
    allow_dead_code: bool,
    _marker: PhantomData<StorageT>,
}

impl<StorageT: Display + ToTokens> CTTokenMapBuilder<StorageT> {
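    /// Create a new `CTTokenMapBuilder`. `mod_name` is used both as the
    /// generated module's name and as the output file's stem; `token_map` maps
    /// token names to their IDs.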
    pub fn new(
        mod_name: impl Into<String>,
        token_map: impl Borrow<HashMap<String, StorageT>>,
    ) -> Self {
        Self {
            mod_name: mod_name.into(),
            token_map: token_map
                .borrow()
                .iter()
                .map(|(tok_name, tok_value)| (tok_name.clone(), tok_value.to_token_stream()))
                .collect(),
            rename_map: None,
            allow_dead_code: false,
            _marker: PhantomData,
        }
    }

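    /// Supply a map from token names to the Rust identifiers they should be
    /// emitted as, for tokens whose names are not valid identifiers (e.g. `+`
    /// might map to `PLUS`). Passing `None` removes any previously set map.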
    pub fn rename_map<M, I, K, V>(mut self, rename_map: Option<M>) -> Self
    where
        M: IntoIterator<Item = I>,
        I: Borrow<(K, V)>,
        K: AsRef<str>,
        V: AsRef<str>,
    {
        self.rename_map = rename_map.map(|rename_map| {
            rename_map
                .into_iter()
                .map(|it| {
                    let (k, v) = it.borrow();
                    let k = k.as_ref().into();
                    let v = v.as_ref().into();
                    (k, v)
                })
                .collect()
        });
        self
    }

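    /// If true, annotate the generated module with `#[allow(dead_code)]` so
    /// that unreferenced constants do not cause warnings (default: false).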
    pub fn allow_dead_code(mut self, allow_dead_code: bool) -> Self {
        self.allow_dead_code = allow_dead_code;
        self
    }

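    /// Write the token map module to `$OUT_DIR/<mod_name>.rs`, leaving the
    /// file untouched if its contents would be unchanged. Errors if a token
    /// name is not a valid Rust identifier after any renaming.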
    pub fn build(&self) -> Result<(), Box<dyn Error>> {
        let mut outs = String::new();
        let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
        let mod_ident = format_ident!("{}", self.mod_name);
        write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
        let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
        let mut token_map_sorted = self.token_map.clone();
        token_map_sorted.sort_by(|(l, _), (r, _)| l.cmp(r));
        let (token_array, tokens) = token_map_sorted
            .iter()
            .map(|(k, id)| {
                let name = match &self.rename_map {
                    Some(rmap) => rmap.get(k).unwrap_or(k),
                    _ => k,
                };
                let tok_ident: Ident = syn::parse_str(&format!("T_{}", name.to_ascii_uppercase()))
                    .map_err(|e| {
                        format!(
                            "token name {:?} is not a valid Rust identifier: {}; \
                             consider renaming it via `CTTokenMapBuilder::rename_map`.",
                            name, e
                        )
                    })?;
                Ok((
                    quote! {
                        #id,
                    },
                    quote! {
                        pub const #tok_ident: #storaget = #id;
                    },
                ))
            })
            .collect::<Result<(TokenStream, TokenStream), Box<dyn Error>>>()?;
        let unused_annotation = if self.allow_dead_code {
            quote! {#[allow(dead_code)]}
        } else {
            quote! {}
        };
        let unformatted = quote! {
            #unused_annotation
            mod #mod_ident {
                #tokens
                #[allow(dead_code)]
                pub const TOK_IDS: &[#storaget] = &[#token_array];
            }
        }
        .to_string();
        let out_mod = syn::parse_str(&unformatted)
            .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
            .unwrap_or(unformatted);
        outs.push_str(&out_mod);
        let mut outp = PathBuf::from(var("OUT_DIR")?);
        outp.push(&self.mod_name);
        outp.set_extension("rs");

        if let Ok(curs) = read_to_string(&outp) {
            if curs == outs {
                return Ok(());
            }
        }

        let mut f = File::create(outp)?;
        f.write_all(outs.as_bytes())?;
        Ok(())
    }
}

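/// Generate a module `mod_name` mapping token names to IDs. This is a thin
/// wrapper around [CTTokenMapBuilder], retained for backwards compatibility.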
#[deprecated(since = "0.14.0", note = "use `lrlex::CTTokenMapBuilder` instead")]
pub fn ct_token_map<StorageT: Display + ToTokens>(
    mod_name: &str,
    token_map: impl Borrow<HashMap<String, StorageT>>,
    rename_map: Option<&HashMap<&str, &str>>,
) -> Result<(), Box<dyn Error>> {
    CTTokenMapBuilder::new(mod_name, token_map)
        .rename_map(rename_map)
        .allow_dead_code(true)
        .build()
}

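/// Return `s` with `indent` prefixed to each of its lines (used to align
/// multi-line diagnostics under their `[Error]`/`[Warning]` prefix).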
fn indent(indent: &str, s: &str) -> String {
    format!("{indent}{}\n", s.trim_end_matches('\n')).replace('\n', &format!("\n{}", indent))
}

#[cfg(test)]
mod test {
    use std::fs::File;
    use std::io::Write;

    use super::{CTLexerBuilder, LexerKind};
    #[test]
    fn test_grmtools_section_lexerkind() {
        let lexerkinds = [
            "LRNonStreamingLexer",
            "lrnonstreaminglexer",
            "LexerKind::lrnonstreaminglexer",
            "lexerkind::LRNonStreamingLexer",
        ];
        for (i, kind) in lexerkinds.iter().enumerate() {
            let lex_src = format!(
                "
%grmtools{{lexerkind: {}}}
%%
. ;
",
                kind
            );
            let lex_path = format!(
                "{}/test_grmtools_section_lexerkind_{}.l",
                env!("OUT_DIR"),
                i
            );
            let mut l_file = File::create(lex_path.clone()).unwrap();
            l_file.write_all(lex_src.as_bytes()).unwrap();
            CTLexerBuilder::new()
                .output_path(format!("{}.rs", lex_path.clone()))
                .lexer_path(lex_path.clone())
                .inspect_lexerkind(Box::new(move |lexerkind| {
                    assert!(matches!(lexerkind, LexerKind::LRNonStreamingLexer));
                    Ok(())
                }))
                .build()
                .unwrap();
        }
    }
}