// lrlex/ctbuilder.rs
1//! Build grammars at run-time.
2
3use bincode::Encode;
4use cfgrammar::{
5    header::{
6        GrmtoolsSectionParser, Header, HeaderError, HeaderErrorKind, HeaderValue, Namespaced,
7        Setting, Value,
8    },
9    markmap::MergeBehavior,
10    span::{Location, Span},
11};
12use glob::glob;
13use lrpar::{
14    CTParserBuilder, LexerTypes,
15    diagnostics::{DiagnosticFormatter, SpannedDiagnosticFormatter},
16};
17use num_traits::{AsPrimitive, PrimInt, Unsigned};
18use proc_macro2::{Ident, TokenStream};
19use quote::{ToTokens, TokenStreamExt, format_ident, quote};
20use regex::Regex;
21use std::marker::PhantomData;
22use std::{
23    any::type_name,
24    borrow::Borrow,
25    collections::{HashMap, HashSet},
26    env::{current_dir, var},
27    error::Error,
28    fmt::{self, Debug, Display, Write as _},
29    fs::{self, File, create_dir_all, read_to_string},
30    hash::Hash,
31    io::Write,
32    path::{Path, PathBuf},
33    sync::{LazyLock, Mutex},
34};
35
36use crate::{DefaultLexerTypes, LRNonStreamingLexer, LRNonStreamingLexerDef, LexFlags, LexerDef};
37
/// File extension given to the generated Rust output file.
const RUST_FILE_EXT: &str = "rs";

// Prefixes used when printing diagnostics to the user.
const ERROR: &str = "[Error]";
const WARNING: &str = "[Warning]";

// Matches identifier-like token names: letters/underscore followed by
// letters, digits, or underscores.
// NOTE(review): used to validate token identifiers during code generation;
// the use site is outside this chunk — confirm.
static RE_TOKEN_ID: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^[a-zA-Z_][a-zA-Z_0-9]*$").unwrap());

// Output paths already written to by `CTLexerBuilder::build`; used to reject
// generating two lexers to the same path within one build.
static GENERATED_PATHS: LazyLock<Mutex<HashSet<PathBuf>>> =
    LazyLock::new(|| Mutex::new(HashSet::new()));
48
/// The kind of lexer that [CTLexerBuilder] should generate.
#[non_exhaustive]
pub enum LexerKind {
    /// Generate a lexer based on `LRNonStreamingLexerDef` (the only kind
    /// currently supported; see [CTLexerBuilder::build]).
    LRNonStreamingLexer,
}
53
54impl<T: Clone> TryFrom<&Value<T>> for LexerKind {
55    type Error = cfgrammar::header::HeaderError<T>;
56    fn try_from(it: &Value<T>) -> Result<LexerKind, Self::Error> {
57        match it {
58            Value::Flag(_, loc) => Err(HeaderError {
59                kind: HeaderErrorKind::ConversionError(
60                    "LexerKind",
61                    "Expected `LexerKind` found bool",
62                ),
63                locations: vec![loc.clone()],
64            }),
65            Value::Setting(Setting::Num(_, loc)) => Err(HeaderError {
66                kind: HeaderErrorKind::ConversionError(
67                    "LexerKind",
68                    "Expected `LexerKind` found numeric",
69                ),
70                locations: vec![loc.clone()],
71            }),
72            Value::Setting(Setting::String(_, loc)) => Err(HeaderError {
73                kind: HeaderErrorKind::ConversionError(
74                    "LexerKind",
75                    "Expected `LexerKind` found string",
76                ),
77                locations: vec![loc.clone()],
78            }),
79            Value::Setting(Setting::Constructor {
80                ctor:
81                    Namespaced {
82                        namespace: _,
83                        member: (_, loc),
84                    },
85                arg: _,
86            }) => Err(HeaderError {
87                kind: HeaderErrorKind::ConversionError(
88                    "LexerKind",
89                    "Expected `LexerKind` found constructor",
90                ),
91                locations: vec![loc.clone()],
92            }),
93            Value::Setting(Setting::Array(_, arr_loc, _)) => Err(HeaderError {
94                kind: HeaderErrorKind::ConversionError(
95                    "LexerKind",
96                    "Expected `LexerKind` found array",
97                ),
98                locations: vec![arr_loc.clone()],
99            }),
100            Value::Setting(Setting::Unitary(Namespaced {
101                namespace,
102                member: (member, member_loc),
103            })) => {
104                if let Some((ns, loc)) = namespace {
105                    if ns.to_lowercase() != "lexerkind" {
106                        return Err(HeaderError {
107                            kind: HeaderErrorKind::ConversionError(
108                                "LexerKind",
109                                "Expected namespace `LexerKind`",
110                            ),
111                            locations: vec![loc.clone()],
112                        });
113                    }
114                }
115                if member.to_lowercase() != "lrnonstreaminglexer" {
116                    return Err(HeaderError {
117                        kind: HeaderErrorKind::ConversionError(
118                            "LexerKind",
119                            "Unknown `LexerKind` Variant",
120                        ),
121                        locations: vec![member_loc.clone()],
122                    });
123                }
124
125                Ok(LexerKind::LRNonStreamingLexer)
126            }
127        }
128    }
129}
130
/// Specify the visibility of the module generated by [CTLexerBuilder].
///
/// The builder's default is [Visibility::Private] (see
/// [CTLexerBuilder::visibility]).
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum Visibility {
    /// Module-level visibility only.
    Private,
    /// `pub`
    Public,
    /// `pub(super)`
    PublicSuper,
    /// `pub(self)`
    PublicSelf,
    /// `pub(crate)`
    PublicCrate,
    /// `pub(in {arg})`
    PublicIn(String),
}
148
149impl ToTokens for Visibility {
150    fn to_tokens(&self, tokens: &mut TokenStream) {
151        tokens.extend(match self {
152            Visibility::Private => quote!(),
153            Visibility::Public => quote! {pub},
154            Visibility::PublicSuper => quote! {pub(super)},
155            Visibility::PublicSelf => quote! {pub(self)},
156            Visibility::PublicCrate => quote! {pub(crate)},
157            Visibility::PublicIn(data) => {
158                let other = str::parse::<TokenStream>(data).unwrap();
159                quote! {pub(in #other)}
160            }
161        })
162    }
163}
164
/// Specifies the [Rust Edition] that will be emitted during code generation.
///
/// The builder's default is [RustEdition::Rust2021] (see
/// [CTLexerBuilder::rust_edition]).
///
/// [Rust Edition]: https://doc.rust-lang.org/edition-guide/rust-2021/index.html
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum RustEdition {
    /// The 2015 edition of Rust.
    Rust2015,
    /// The 2018 edition of Rust.
    Rust2018,
    /// The 2021 edition of Rust.
    Rust2021,
}
175
176/// The quote impl of `ToTokens` for `Option` prints an empty string for `None`
177/// and the inner value for `Some(inner_value)`.
178///
179/// This wrapper instead emits both `Some` and `None` variants.
180/// See: [quote #20](https://github.com/dtolnay/quote/issues/20)
181struct QuoteOption<T>(Option<T>);
182
183impl<T: ToTokens> ToTokens for QuoteOption<T> {
184    fn to_tokens(&self, tokens: &mut TokenStream) {
185        tokens.append_all(match self.0 {
186            Some(ref t) => quote! { ::std::option::Option::Some(#t) },
187            None => quote! { ::std::option::Option::None },
188        });
189    }
190}
191
192/// This wrapper adds a missing impl of `ToTokens` for tuples.
193/// For a tuple `(a, b)` emits `(a.to_tokens(), b.to_tokens())`
194struct QuoteTuple<T>(T);
195
196impl<A: ToTokens, B: ToTokens> ToTokens for QuoteTuple<(A, B)> {
197    fn to_tokens(&self, tokens: &mut TokenStream) {
198        let (a, b) = &self.0;
199        tokens.append_all(quote!((#a, #b)));
200    }
201}
202
203/// The wrapped `&str` value will be emitted with a call to `to_string()`
204struct QuoteToString<'a>(&'a str);
205
206impl ToTokens for QuoteToString<'_> {
207    fn to_tokens(&self, tokens: &mut TokenStream) {
208        let x = &self.0;
209        tokens.append_all(quote! { #x.to_string() });
210    }
211}
212
/// A string whose `Debug` output is identical to its `Display` output.
///
/// Useful for returning human-readable multi-line errors from a build script,
/// where the error is typically shown via `Debug` and would otherwise be
/// quoted and escaped.
struct ErrorString(String);

impl fmt::Display for ErrorString {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.0)
    }
}

impl fmt::Debug for ErrorString {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Delegate to `Display` so the text is printed verbatim.
        fmt::Display::fmt(self, f)
    }
}

impl Error for ErrorString {}
228
/// A `CTLexerBuilder` allows one to specify the criteria for building a statically generated
/// lexer.
pub struct CTLexerBuilder<'a, LexerTypesT: LexerTypes = DefaultLexerTypes<u32>>
where
    LexerTypesT::StorageT: Debug + Eq + Hash + ToTokens,
    usize: num_traits::AsPrimitive<LexerTypesT::StorageT>,
{
    // Closure applied to a fresh `CTParserBuilder` during `build`, allowing a
    // parser to be configured and built alongside the lexer.
    lrpar_config:
        Option<Box<dyn Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT> + 'a>>,
    // Path of the input `.l` file; must be set before `build` is called.
    lexer_path: Option<PathBuf>,
    // Path the generated Rust source is written to; must be set before `build`.
    output_path: Option<PathBuf>,
    // Explicitly requested lexer kind; if `None`, `build` consults the
    // `%grmtools` header and then falls back to `LRNonStreamingLexer`.
    lexerkind: Option<LexerKind>,
    // Module name for the generated code; if `None`, `build` derives one from
    // the input filename (leaf name plus `_l`).
    mod_name: Option<&'a str>,
    // Visibility of the generated module; defaults to `Visibility::Private`.
    visibility: Visibility,
    // Rust edition targeted by the generated code; defaults to `Rust2021`.
    rust_edition: RustEdition,
    // Map from rule names to token IDs, used to synchronise lexer and parser.
    rule_ids_map: Option<HashMap<String, LexerTypesT::StorageT>>,
    // If true, tokens used in the grammar but missing from the lexer do not
    // cause `build` to fail.
    allow_missing_terms_in_lexer: bool,
    // If true, tokens defined in the lexer but unused by the grammar do not
    // produce warnings (or errors when `warnings_are_errors` is set).
    allow_missing_tokens_in_parser: bool,
    // Promote warnings to hard errors.
    warnings_are_errors: bool,
    // Whether warnings are emitted at all.
    show_warnings: bool,
    // Header settings merged with the `%grmtools` section of the input file.
    header: Header<Location>,
    // Test-only hook letting tests observe which `LexerKind` was selected.
    #[cfg(test)]
    inspect_lexerkind_cb: Option<Box<dyn Fn(&LexerKind) -> Result<(), Box<dyn Error>>>>,
}
253
254impl CTLexerBuilder<'_, DefaultLexerTypes<u32>> {
255    /// Create a new [CTLexerBuilder].
256    pub fn new() -> Self {
257        CTLexerBuilder::<DefaultLexerTypes<u32>>::new_with_lexemet()
258    }
259}
260
261impl<'a, LexerTypesT: LexerTypes<LexErrorT = crate::LRLexError> + 'static>
262    CTLexerBuilder<'a, LexerTypesT>
263where
264    LexerTypesT::StorageT:
265        'static + Debug + Eq + Hash + PrimInt + Encode + TryFrom<usize> + Unsigned + ToTokens,
266    usize: AsPrimitive<LexerTypesT::StorageT>,
267{
268    /// Create a new [CTLexerBuilder].
269    ///
270    /// `LexerTypesT::StorageT` must be an unsigned integer type (e.g. `u8`, `u16`) which is big enough
271    /// to index all the tokens, rules, and productions in the lexer and less than or equal in size
272    /// to `usize` (e.g. on a 64-bit machine `u128` would be too big). If you are lexing large
273    /// files, the additional storage requirements of larger integer types can be noticeable, and
274    /// in such cases it can be worth specifying a smaller type. `StorageT` defaults to `u32` if
275    /// unspecified.
276    ///
277    /// # Examples
278    ///
279    /// ```text
280    /// CTLexerBuilder::<DefaultLexerTypes<u8>>::new_with_lexemet()
281    ///     .lexer_in_src_dir("grm.l", None)?
282    ///     .build()?;
283    /// ```
284    pub fn new_with_lexemet() -> Self {
285        let mut header = Header::new();
286        header.set_default_merge_behavior(MergeBehavior::Ours);
287        CTLexerBuilder {
288            lrpar_config: None,
289            lexer_path: None,
290            output_path: None,
291            lexerkind: None,
292            mod_name: None,
293            visibility: Visibility::Private,
294            rust_edition: RustEdition::Rust2021,
295            rule_ids_map: None,
296            allow_missing_terms_in_lexer: false,
297            allow_missing_tokens_in_parser: false,
298            warnings_are_errors: false,
299            show_warnings: true,
300            header,
301            #[cfg(test)]
302            inspect_lexerkind_cb: None,
303        }
304    }
305
306    /// An optional convenience function to make it easier to create an (lrlex) lexer and (lrpar)
307    /// parser in one shot. The closure passed to this function will be called during
308    /// [CTLexerBuilder::build]: it will be passed an lrpar `CTParserBuilder` instance upon which
309    /// it can set whatever lrpar options are desired. [`CTLexerBuilder`] will then create both the
310    /// compiler and lexer and link them together as required.
311    ///
312    /// # Examples
313    ///
314    /// ```text
315    /// CTLexerBuilder:::new()
316    ///     .lrpar_config(|ctp| {
317    ///         ctp.yacckind(YaccKind::Grmtools)
318    ///             .grammar_in_src_dir("calc.y")
319    ///             .unwrap()
320    ///     })
321    ///     .lexer_in_src_dir("calc.l")?
322    ///     .build()?;
323    /// ```
324    pub fn lrpar_config<F>(mut self, config_func: F) -> Self
325    where
326        F: Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT> + 'a,
327    {
328        self.lrpar_config = Some(Box::new(config_func));
329        self
330    }
331
332    /// Set the input lexer path to a file relative to this project's `src` directory. This will
333    /// also set the output path (i.e. you do not need to call [CTLexerBuilder::output_path]).
334    ///
335    /// For example if `a/b.l` is passed as `inp` then [CTLexerBuilder::build] will:
336    ///   * use `src/a/b.l` as the input file.
337    ///   * write output to a file which can then be imported by calling `lrlex_mod!("a/b.l")`.
338    ///   * create a module in that output file named `b_l`.
339    ///
340    /// You can override the output path and/or module name by calling
341    /// [CTLexerBuilder::output_path] and/or [CTLexerBuilder::mod_name], respectively, after
342    /// calling this function.
343    ///
344    /// This is a convenience function that makes it easier to compile lexer files stored in a
345    /// project's `src/` directory: please see [CTLexerBuilder::build] for additional constraints
346    /// and information about the generated files. Note also that each `.l` file can only be
347    /// processed once using this function: if you want to generate multiple lexers from a single
348    /// `.l` file, you will need to use [CTLexerBuilder::output_path].
349    pub fn lexer_in_src_dir<P>(mut self, srcp: P) -> Result<Self, Box<dyn Error>>
350    where
351        P: AsRef<Path>,
352    {
353        if !srcp.as_ref().is_relative() {
354            return Err(format!(
355                "Lexer path '{}' must be a relative path.",
356                srcp.as_ref().to_str().unwrap_or("<invalid UTF-8>")
357            )
358            .into());
359        }
360
361        let mut lexp = current_dir()?;
362        lexp.push("src");
363        lexp.push(srcp.as_ref());
364        self.lexer_path = Some(lexp);
365
366        let mut outp = PathBuf::new();
367        outp.push(var("OUT_DIR").unwrap());
368        outp.push(srcp.as_ref().parent().unwrap().to_str().unwrap());
369        create_dir_all(&outp)?;
370        let mut leaf = srcp
371            .as_ref()
372            .file_name()
373            .unwrap()
374            .to_str()
375            .unwrap()
376            .to_owned();
377        write!(leaf, ".{}", RUST_FILE_EXT).ok();
378        outp.push(leaf);
379        Ok(self.output_path(outp))
380    }
381
382    /// Set the input lexer path to `inp`. If specified, you must also call
383    /// [CTLexerBuilder::output_path]. In general it is easier to use
384    /// [CTLexerBuilder::lexer_in_src_dir].
385    pub fn lexer_path<P>(mut self, inp: P) -> Self
386    where
387        P: AsRef<Path>,
388    {
389        self.lexer_path = Some(inp.as_ref().to_owned());
390        self
391    }
392
393    /// Set the output lexer path to `outp`. Note that there are no requirements on `outp`: the
394    /// file can exist anywhere you can create a valid [Path] to. However, if you wish to use
395    /// [crate::lrlex_mod!] you will need to make sure that `outp` is in
396    /// [std::env::var]`("OUT_DIR")` or one of its subdirectories.
397    pub fn output_path<P>(mut self, outp: P) -> Self
398    where
399        P: AsRef<Path>,
400    {
401        self.output_path = Some(outp.as_ref().to_owned());
402        self
403    }
404
405    /// Set the type of lexer to be generated to `lexerkind`.
406    pub fn lexerkind(mut self, lexerkind: LexerKind) -> Self {
407        self.lexerkind = Some(lexerkind);
408        self
409    }
410
411    /// Set the generated module name to `mod_name`. If no module name is specified,
412    /// [`process_file`](#method.process_file) will attempt to create a sensible default based on
413    /// the input filename.
414    pub fn mod_name(mut self, mod_name: &'a str) -> Self {
415        self.mod_name = Some(mod_name);
416        self
417    }
418
419    /// Set the visibility of the generated module to `vis`. Defaults to `Visibility::Private`.
420    pub fn visibility(mut self, vis: Visibility) -> Self {
421        self.visibility = vis;
422        self
423    }
424
425    /// Sets the rust edition to be used for generated code. Defaults to the latest edition of
426    /// rust supported by grmtools.
427    pub fn rust_edition(mut self, edition: RustEdition) -> Self {
428        self.rust_edition = edition;
429        self
430    }
431
432    /// Set this lexer builder's map of rule IDs to `rule_ids_map`. By default, lexing rules have
433    /// arbitrary, but distinct, IDs. Setting the map of rule IDs (from rule names to `StorageT`)
434    /// allows users to synchronise a lexer and parser and to check that all rules are used by both
435    /// parts).
436    pub fn rule_ids_map<T: std::borrow::Borrow<HashMap<String, LexerTypesT::StorageT>> + Clone>(
437        mut self,
438        rule_ids_map: T,
439    ) -> Self {
440        self.rule_ids_map = Some(rule_ids_map.borrow().to_owned());
441        self
442    }
443
444    /// Statically compile the `.l` file specified by [CTLexerBuilder::lexer_path()] into Rust,
445    /// placing the output into the file specified by [CTLexerBuilder::output_path()].
446    ///
447    /// The generated module follows the form:
448    ///
449    /// ```text
450    ///    mod modname {
451    ///      pub fn lexerdef() -> LexerDef<LexerTypesT> { ... }
452    ///
453    ///      ...
454    ///    }
455    /// ```
456    ///
457    /// where:
458    ///  * `modname` is either:
459    ///    * the module name specified by [CTLexerBuilder::mod_name()]
460    ///    * or, if no module name was explicitly specified, then for the file `/a/b/c.l` the
461    ///      module name is `c_l` (i.e. the file's leaf name, minus its extension, with a prefix of
462    ///      `_l`).
463    pub fn build(mut self) -> Result<CTLexer, Box<dyn Error>> {
464        let lexerp = self
465            .lexer_path
466            .as_ref()
467            .expect("lexer_path must be specified before processing.");
468        let outp = self
469            .output_path
470            .as_ref()
471            .expect("output_path must be specified before processing.");
472
473        {
474            let mut lk = GENERATED_PATHS.lock().unwrap();
475            if lk.contains(outp.as_path()) {
476                return Err(format!("Generating two lexers to the same path ('{}') is not allowed: use CTLexerBuilder::output_path (and, optionally, CTLexerBuilder::mod_name) to differentiate them.", &outp.to_str().unwrap()).into());
477            }
478            lk.insert(outp.clone());
479        }
480        let lex_src = read_to_string(lexerp)
481            .map_err(|e| format!("When reading '{}': {e}", lexerp.display()))?;
482        let lex_diag = SpannedDiagnosticFormatter::new(&lex_src, lexerp);
483        let mut header = self.header;
484        let (parsed_header, _) = GrmtoolsSectionParser::new(&lex_src, false)
485            .parse()
486            .map_err(|es| {
487                let mut out = String::new();
488                out.push_str(&format!(
489                    "\n{ERROR}{}\n",
490                    lex_diag.file_location_msg(" parsing the `%grmtools` section", None)
491                ));
492                for e in es {
493                    out.push_str(&indent("     ", &lex_diag.format_error(e).to_string()));
494                    out.push('\n');
495                }
496                ErrorString(out)
497            })?;
498        header.merge_from(parsed_header)?;
499        header.mark_used(&"lexerkind".to_string());
500        let lexerkind = match self.lexerkind {
501            Some(lexerkind) => lexerkind,
502            None => {
503                if let Some(HeaderValue(_, lk_val)) = header.get("lexerkind") {
504                    LexerKind::try_from(lk_val)?
505                } else {
506                    LexerKind::LRNonStreamingLexer
507                }
508            }
509        };
510        #[cfg(test)]
511        if let Some(inspect_lexerkind_cb) = self.inspect_lexerkind_cb {
512            inspect_lexerkind_cb(&lexerkind)?
513        }
514        let (lexerdef, lex_flags): (LRNonStreamingLexerDef<LexerTypesT>, LexFlags) =
515            match lexerkind {
516                LexerKind::LRNonStreamingLexer => {
517                    let lex_flags = LexFlags::try_from(&mut header)?;
518                    let lexerdef = LRNonStreamingLexerDef::<LexerTypesT>::new_with_options(
519                        &lex_src, lex_flags,
520                    )
521                    .map_err(|errs| {
522                        let mut out = String::new();
523                        out.push_str(&format!(
524                            "\n{ERROR}{}\n",
525                            lex_diag.file_location_msg("", None)
526                        ));
527                        for e in errs {
528                            out.push_str(&indent("     ", &lex_diag.format_error(e).to_string()));
529                            out.push('\n');
530                        }
531                        ErrorString(out)
532                    })?;
533                    let lex_flags = lexerdef.lex_flags().cloned();
534                    (lexerdef, lex_flags.unwrap())
535                }
536            };
537
538        let ct_parser = if let Some(ref lrcfg) = self.lrpar_config {
539            let mut closure_lexerdef = lexerdef.clone();
540            let mut ctp = CTParserBuilder::<LexerTypesT>::new().inspect_rt(Box::new(
541                move |yacc_header, rtpb, rule_ids_map, grm_path| {
542                    let owned_map = rule_ids_map
543                        .iter()
544                        .map(|(x, y)| (&**x, *y))
545                        .collect::<HashMap<_, _>>();
546                    closure_lexerdef.set_rule_ids(&owned_map);
547                    yacc_header.mark_used(&"test_files".to_string());
548                    let grammar = rtpb.grammar();
549                    let test_glob = yacc_header.get("test_files");
550                    let mut err_str = None;
551                    let add_error_line = |err_str: &mut Option<String>, line| {
552                        if let Some(err_str) = err_str {
553                            err_str.push_str(&format!("{}\n", line));
554                        } else {
555                            let _ = err_str.insert(format!("{}\n", line));
556                        }
557                    };
558                    match test_glob {
559                        Some(HeaderValue(_, Value::Setting(Setting::Array(test_globs, _, _)))) => {
560                            for setting in test_globs {
561                                match setting {
562                                    Setting::String(test_files, _) => {
563                                        let path_joined = grm_path.parent().unwrap().join(test_files);
564                                        let path_str = &path_joined.to_string_lossy();
565                                        let mut glob_paths = glob(path_str).map_err(|e| e.to_string())?.peekable();
566                                        if glob_paths.peek().is_none() {
567                                            return Err(format!("'test_files' glob '{}' matched no paths", path_str)
568                                                .to_string()
569                                                .into(),
570                                            );
571                                        }
572
573                                        for path in glob_paths {
574                                            let path = path?;
575                                            if let Some(ext) = path.extension() {
576                                                if let Some(ext) = ext.to_str() {
577                                                    if ext.starts_with("grm") {
578                                                        add_error_line(&mut err_str, "test_files extensions beginning with `grm` are reserved.".into());
579                                                    }
580                                                }
581                                            }
582                                            let input = fs::read_to_string(&path)?;
583                                            let l: LRNonStreamingLexer<LexerTypesT> =
584                                                closure_lexerdef.lexer(&input);
585                                            let errs = rtpb.parse_map(&l, &|_| (), &|_, _| ()).1;
586                                            if !errs.is_empty() {
587                                                add_error_line(&mut err_str, format!("While parsing {}:", path.display()));
588                                                for e in errs {
589                                                    let e_pp = e.pp(&l, &|t| grammar.token_epp(t));
590                                                    let e_lines = e_pp.split("\n");
591                                                    for e in e_lines {
592                                                        add_error_line(&mut err_str, format!("\t{}", e));
593                                                    }
594                                                }
595                                            }
596                                        }
597                                    }
598                                    _ => return Err("Invalid value for setting 'test_files'".into()),
599                                }
600                            }
601                            if let Some(err_str) = err_str {
602                                Err(ErrorString(err_str))?
603                            } else {
604                                Ok(())
605                            }
606
607                        }
608                        Some(_) => Err("Invalid value for setting 'test_files'".into()),
609                        None => Ok(()),
610                    }
611                },
612            ));
613            ctp = lrcfg(ctp);
614            let ct_parser = ctp.build()?;
615            self.rule_ids_map = Some(ct_parser.token_map().to_owned());
616            Some(ct_parser)
617        } else {
618            None
619        };
620
621        let mut lexerdef = Box::new(lexerdef);
622        let unused_header_values = header.unused();
623        if !unused_header_values.is_empty() {
624            return Err(
625                format!("Unused header values: {}", unused_header_values.join(", ")).into(),
626            );
627        }
628
629        let (missing_from_lexer, missing_from_parser) = match self.rule_ids_map {
630            Some(ref rim) => {
631                // Convert from HashMap<String, _> to HashMap<&str, _>
632                let owned_map = rim
633                    .iter()
634                    .map(|(x, y)| (&**x, *y))
635                    .collect::<HashMap<_, _>>();
636                let (x, y) = lexerdef.set_rule_ids_spanned(&owned_map);
637                (
638                    x.map(|a| a.iter().map(|&b| b.to_string()).collect::<HashSet<_>>()),
639                    y.map(|a| {
640                        a.iter()
641                            .map(|(b, span)| (b.to_string(), *span))
642                            .collect::<HashSet<_>>()
643                    }),
644                )
645            }
646            None => (None, None),
647        };
648
649        let mut has_unallowed_missing = false;
650        let err_indent = " ".repeat(ERROR.len());
651        if !self.allow_missing_terms_in_lexer {
652            if let Some(ref mfl) = missing_from_lexer {
653                if let Some(ct_parser) = &ct_parser {
654                    let grm = ct_parser.yacc_grammar();
655                    let token_spans = mfl
656                        .iter()
657                        .map(|name| {
658                            ct_parser
659                                .yacc_grammar()
660                                .token_span(*grm.tokens_map().get(name.as_str()).unwrap())
661                                .expect("Given token should have a span")
662                        })
663                        .collect::<Vec<_>>();
664
665                    let yacc_diag = SpannedDiagnosticFormatter::new(
666                        ct_parser.grammar_src(),
667                        ct_parser.grammar_path(),
668                    );
669
670                    eprintln!(
671                        "{ERROR} these tokens are not referenced in the lexer but defined as follows"
672                    );
673                    eprintln!(
674                        "{err_indent} {}",
675                        yacc_diag.file_location_msg("in the grammar", None)
676                    );
677                    for span in token_spans {
678                        eprintln!(
679                            "{}",
680                            yacc_diag.underline_span_with_text(
681                                span,
682                                "Missing from lexer".to_string(),
683                                '^'
684                            )
685                        );
686                    }
687                    eprintln!();
688                } else {
689                    eprintln!(
690                        "{ERROR} the following tokens are used in the grammar but are not defined in the lexer:"
691                    );
692                    for n in mfl {
693                        eprintln!("    {}", n);
694                    }
695                }
696                has_unallowed_missing = true;
697            }
698        }
699        if !self.allow_missing_tokens_in_parser && self.show_warnings {
700            if let Some(ref mfp) = missing_from_parser {
701                let error_prefix = if self.warnings_are_errors {
702                    ERROR
703                } else {
704                    WARNING
705                };
706                let err_indent = " ".repeat(error_prefix.len());
707                let mut outs = Vec::new();
708                outs.push(format!("{error_prefix} these tokens are not referenced in the grammar but defined as follows"));
709                outs.push(format!(
710                    "{err_indent} {}",
711                    lex_diag.file_location_msg("in the lexer", None)
712                ));
713                for (_, span) in mfp {
714                    let error_contents = lex_diag.underline_span_with_text(
715                        *span,
716                        "Missing from parser".to_string(),
717                        '^',
718                    );
719                    outs.extend(error_contents.lines().map(|s| s.to_string()));
720                }
721
722                for s in outs {
723                    if !self.warnings_are_errors && std::env::var("OUT_DIR").is_ok() {
724                        println!("cargo:warning={}", s)
725                    } else {
726                        eprintln!("{}", s);
727                    }
728                }
729
730                has_unallowed_missing |= self.warnings_are_errors;
731            }
732        }
733        if has_unallowed_missing {
734            fs::remove_file(outp).ok();
735            panic!();
736        }
737
738        let mod_name = match self.mod_name {
739            Some(s) => s.to_owned(),
740            None => {
741                // The user hasn't specified a module name, so we create one automatically: what we
742                // do is strip off all the filename extensions (note that it's likely that inp ends
743                // with `l.rs`, so we potentially have to strip off more than one extension) and
744                // then add `_l` to the end.
745                let mut stem = lexerp.to_str().unwrap();
746                loop {
747                    let new_stem = Path::new(stem).file_stem().unwrap().to_str().unwrap();
748                    if stem == new_stem {
749                        break;
750                    }
751                    stem = new_stem;
752                }
753                format!("{}_l", stem)
754            }
755        };
756        let mod_name =
757            match syn::parse_str::<proc_macro2::Ident>(&mod_name) {
758                Ok(s) => s,
759                Err(e) => return Err(format!(
760                    "CTLexerBuilder::mod_name(\"{}\") is not a valid rust identifier due to '{}'",
761                    mod_name, e
762                )
763                .into()),
764            };
765        let mut lexerdef_func_impl = {
766            let LexFlags {
767                allow_wholeline_comments,
768                dot_matches_new_line,
769                multi_line,
770                octal,
771                posix_escapes,
772                case_insensitive,
773                unicode,
774                swap_greed,
775                ignore_whitespace,
776                size_limit,
777                dfa_size_limit,
778                nest_limit,
779            } = lex_flags;
780            let allow_wholeline_comments = QuoteOption(allow_wholeline_comments);
781            let dot_matches_new_line = QuoteOption(dot_matches_new_line);
782            let multi_line = QuoteOption(multi_line);
783            let octal = QuoteOption(octal);
784            let posix_escapes = QuoteOption(posix_escapes);
785            let case_insensitive = QuoteOption(case_insensitive);
786            let unicode = QuoteOption(unicode);
787            let swap_greed = QuoteOption(swap_greed);
788            let ignore_whitespace = QuoteOption(ignore_whitespace);
789            let size_limit = QuoteOption(size_limit);
790            let dfa_size_limit = QuoteOption(dfa_size_limit);
791            let nest_limit = QuoteOption(nest_limit);
792
793            // Code gen for the lexerdef() `lex_flags` variable.
794            quote! {
795                let mut lex_flags = ::lrlex::DEFAULT_LEX_FLAGS;
796                lex_flags.allow_wholeline_comments = #allow_wholeline_comments.or(::lrlex::DEFAULT_LEX_FLAGS.allow_wholeline_comments);
797                lex_flags.dot_matches_new_line = #dot_matches_new_line.or(::lrlex::DEFAULT_LEX_FLAGS.dot_matches_new_line);
798                lex_flags.multi_line = #multi_line.or(::lrlex::DEFAULT_LEX_FLAGS.multi_line);
799                lex_flags.octal = #octal.or(::lrlex::DEFAULT_LEX_FLAGS.octal);
800                lex_flags.posix_escapes = #posix_escapes.or(::lrlex::DEFAULT_LEX_FLAGS.posix_escapes);
801                lex_flags.case_insensitive = #case_insensitive.or(::lrlex::DEFAULT_LEX_FLAGS.case_insensitive);
802                lex_flags.unicode = #unicode.or(::lrlex::DEFAULT_LEX_FLAGS.unicode);
803                lex_flags.swap_greed = #swap_greed.or(::lrlex::DEFAULT_LEX_FLAGS.swap_greed);
804                lex_flags.ignore_whitespace = #ignore_whitespace.or(::lrlex::DEFAULT_LEX_FLAGS.ignore_whitespace);
805                lex_flags.size_limit = #size_limit.or(::lrlex::DEFAULT_LEX_FLAGS.size_limit);
806                lex_flags.dfa_size_limit = #dfa_size_limit.or(::lrlex::DEFAULT_LEX_FLAGS.dfa_size_limit);
807                lex_flags.nest_limit = #nest_limit.or(::lrlex::DEFAULT_LEX_FLAGS.nest_limit);
808                let lex_flags = lex_flags;
809            }
810        };
811        {
812            let start_states = lexerdef.iter_start_states();
813            let rules = lexerdef.iter_rules().map(|r| {
814                    let tok_id = QuoteOption(r.tok_id);
815                    let n = QuoteOption(r.name().map(QuoteToString));
816                    let target_state =
817                        QuoteOption(r.target_state().map(|(x, y)| QuoteTuple((x, y))));
818                    let n_span = r.name_span();
819                    let regex = QuoteToString(&r.re_str);
820                    let start_states = r.start_states();
821                    // Code gen to construct a rule.
822                    //
823                    // We cannot `impl ToToken for Rule` because `Rule` never stores `lex_flags`,
824                    // Thus we reference the local lex_flags variable bound earlier.
825                    quote! {
826                        Rule::new(::lrlex::unstable_api::InternalPublicApi, #tok_id, #n, #n_span, #regex,
827                                vec![#(#start_states),*], #target_state, &lex_flags).unwrap()
828                    }
829                });
830            // Code gen for `lexerdef()`s rules and the stack of `start_states`.
831            lexerdef_func_impl.append_all(quote! {
832                let start_states: Vec<StartState> = vec![#(#start_states),*];
833                let rules = vec![#(#rules),*];
834            });
835        }
836        let lexerdef_ty = match lexerkind {
837            LexerKind::LRNonStreamingLexer => {
838                quote!(::lrlex::LRNonStreamingLexerDef)
839            }
840        };
841        // Code gen for the lexerdef() return value referencing variables bound earlier.
842        lexerdef_func_impl.append_all(quote! {
843            #lexerdef_ty::from_rules(start_states, rules)
844        });
845
846        let mut token_consts = TokenStream::new();
847        if let Some(rim) = self.rule_ids_map {
848            let mut rim_sorted = Vec::from_iter(rim.iter());
849            rim_sorted.sort_by_key(|(k, _)| *k);
850            for (name, id) in rim_sorted {
851                if RE_TOKEN_ID.is_match(name) {
852                    let tok_ident = format_ident!("N_{}", name.to_ascii_uppercase());
853                    let storaget =
854                        str::parse::<TokenStream>(type_name::<LexerTypesT::StorageT>()).unwrap();
855                    // Code gen for the constant token values.
856                    let tok_const = quote! {
857                        #[allow(dead_code)]
858                        pub const #tok_ident: #storaget = #id;
859                    };
860                    token_consts.extend(tok_const)
861                }
862            }
863        }
864        let token_consts = token_consts.into_iter();
865        let out_tokens = {
866            let lexerdef_param = str::parse::<TokenStream>(type_name::<LexerTypesT>()).unwrap();
867            let mod_vis = self.visibility;
868            // Code gen for the generated module.
869            quote! {
870                #mod_vis mod #mod_name {
871                    use ::lrlex::{LexerDef, Rule, StartState};
872                    #[allow(dead_code)]
873                    pub fn lexerdef() -> #lexerdef_ty<#lexerdef_param> {
874                        #lexerdef_func_impl
875                    }
876
877                    #(#token_consts)*
878                }
879            }
880        };
881        // Try and run a code formatter on the generated code.
882        let unformatted = out_tokens.to_string();
883        let mut outs = String::new();
884        // Record the time that this version of lrlex was built. If the source code changes and rustc
885        // forces a recompile, this will change this value, causing anything which depends on this
886        // build of lrlex to be recompiled too.
887        let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
888        write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
889        outs.push_str(
890            &syn::parse_str(&unformatted)
891                .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
892                .unwrap_or(unformatted),
893        );
894        // If the file we're about to write out already exists with the same contents, then we
895        // don't overwrite it (since that will force a recompile of the file, and relinking of the
896        // binary etc).
897        if let Ok(curs) = read_to_string(outp) {
898            if curs == outs {
899                return Ok(CTLexer {
900                    missing_from_lexer,
901                    missing_from_parser,
902                });
903            }
904        }
905        let mut f = File::create(outp)?;
906        f.write_all(outs.as_bytes())?;
907        Ok(CTLexer {
908            missing_from_lexer,
909            missing_from_parser,
910        })
911    }
912
913    /// Given the filename `a/b.l` as input, statically compile the file `src/a/b.l` into a Rust
914    /// module which can then be imported using `lrlex_mod!("a/b.l")`. This is a convenience
915    /// function around [`process_file`](struct.CTLexerBuilder.html#method.process_file) which makes
916    /// it easier to compile `.l` files stored in a project's `src/` directory: please see
917    /// [`process_file`](#method.process_file) for additional constraints and information about the
918    /// generated files.
919    #[deprecated(
920        since = "0.11.0",
921        note = "Please use lexer_in_src_dir() and build() instead"
922    )]
923    #[allow(deprecated)]
924    pub fn process_file_in_src(
925        self,
926        srcp: &str,
927    ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>> {
928        let mut inp = current_dir()?;
929        inp.push("src");
930        inp.push(srcp);
931        let mut outp = PathBuf::new();
932        outp.push(var("OUT_DIR").unwrap());
933        outp.push(Path::new(srcp).parent().unwrap().to_str().unwrap());
934        create_dir_all(&outp)?;
935        let mut leaf = Path::new(srcp)
936            .file_name()
937            .unwrap()
938            .to_str()
939            .unwrap()
940            .to_owned();
941        write!(leaf, ".{}", RUST_FILE_EXT).ok();
942        outp.push(leaf);
943        self.process_file(inp, outp)
944    }
945
946    /// Statically compile the `.l` file `inp` into Rust, placing the output into the file `outp`.
947    /// The latter defines a module as follows:
948    ///
949    /// ```text
950    ///    mod modname {
951    ///      pub fn lexerdef() -> LexerDef<LexerTypesT::StorageT> { ... }
952    ///
953    ///      ...
954    ///    }
955    /// ```
956    ///
957    /// where:
958    ///  * `modname` is either:
959    ///    * the module name specified [`mod_name`](#method.mod_name)
960    ///    * or, if no module name was explicitly specified, then for the file `/a/b/c.l` the
961    ///      module name is `c_l` (i.e. the file's leaf name, minus its extension, with a prefix of
962    ///      `_l`).
963    #[deprecated(
964        since = "0.11.0",
965        note = "Please use lexer_in_src_dir() and build() instead"
966    )]
967    pub fn process_file<P, Q>(
968        mut self,
969        inp: P,
970        outp: Q,
971    ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>>
972    where
973        P: AsRef<Path>,
974        Q: AsRef<Path>,
975    {
976        self.lexer_path = Some(inp.as_ref().to_owned());
977        self.output_path = Some(outp.as_ref().to_owned());
978        let cl = self.build()?;
979        Ok((
980            cl.missing_from_lexer().map(|x| x.to_owned()),
981            cl.missing_from_parser()
982                .map(|x| x.iter().map(|(n, _)| n.to_owned()).collect::<HashSet<_>>()),
983        ))
984    }
985
986    /// If passed false, tokens used in the grammar but not defined in the lexer will cause a
987    /// panic at lexer generation time. Defaults to false.
988    pub fn allow_missing_terms_in_lexer(mut self, allow: bool) -> Self {
989        self.allow_missing_terms_in_lexer = allow;
990        self
991    }
992
993    /// If passed false, tokens defined in the lexer but not used in the grammar will cause a
994    /// warning at lexer generation time. Defaults to false (since lexers sometimes define tokens such
995    /// as reserved words, which are intentionally not in the grammar).
996    pub fn allow_missing_tokens_in_parser(mut self, allow: bool) -> Self {
997        self.allow_missing_tokens_in_parser = allow;
998        self
999    }
1000
1001    /// If set to true, [CTLexerBuilder::build] will return an error if the given lexer contains
1002    /// any warnings. Defaults to `true`.
1003    pub fn warnings_are_errors(mut self, flag: bool) -> Self {
1004        self.warnings_are_errors = flag;
1005        self
1006    }
1007
    /// If set to true, [CTLexerBuilder::build] will print warnings to stderr, or via cargo when
    /// running under cargo. Defaults to `true`.
    pub fn show_warnings(mut self, flag: bool) -> Self {
        self.show_warnings = flag;
        self
    }
1014
1015    /// Enables `// comment` style parsing according to `flag``.
1016    /// When enabled comments can appear at the beginning of a line,
1017    /// and regular expressions with the `/` character should be escaped via `\/`.
1018    ///
1019    /// The default value is `false`.
1020    ///
1021    /// Setting this flag will override the same flag within a `%grmtools` section.
1022    pub fn allow_wholeline_comments(mut self, flag: bool) -> Self {
1023        let key = "allow_wholeline_comments".to_string();
1024        self.header.insert(
1025            key,
1026            HeaderValue(
1027                Location::Other("CTLexerBuilder".to_string()),
1028                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1029            ),
1030        );
1031        self
1032    }
1033
1034    /// Sets the `regex::RegexBuilder` option of the same name.
1035    /// The default value is `true`.
1036    ///
1037    /// Setting this flag will override the same flag within a `%grmtools` section.
1038    pub fn dot_matches_new_line(mut self, flag: bool) -> Self {
1039        let key = "dot_matches_new_line".to_string();
1040        self.header.insert(
1041            key,
1042            HeaderValue(
1043                Location::Other("CTLexerBuilder".to_string()),
1044                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1045            ),
1046        );
1047        self
1048    }
1049
1050    /// Sets the `regex::RegexBuilder` option of the same name.
1051    /// The default value is `true`.
1052    ///
1053    /// Setting this flag will override the same flag within a `%grmtools` section.
1054    pub fn multi_line(mut self, flag: bool) -> Self {
1055        let key = "multi_line".to_string();
1056        self.header.insert(
1057            key,
1058            HeaderValue(
1059                Location::Other("CTLexerBuilder".to_string()),
1060                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1061            ),
1062        );
1063        self
1064    }
1065
1066    /// Enables posix lex compatible escape sequences according to `flag`.
1067    /// The default value is `false`.
1068    ///
1069    /// Setting this flag will override the same flag within a `%grmtools` section.
1070    pub fn posix_escapes(mut self, flag: bool) -> Self {
1071        let key = "posix_escapes".to_string();
1072        self.header.insert(
1073            key,
1074            HeaderValue(
1075                Location::Other("CTLexerBuilder".to_string()),
1076                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1077            ),
1078        );
1079        self
1080    }
1081
1082    /// Sets the `regex::RegexBuilder` option of the same name.
1083    /// The default value is `true`.
1084    ///
1085    /// Setting this flag will override the same flag within a `%grmtools` section.
1086    pub fn octal(mut self, flag: bool) -> Self {
1087        let key = "octal".to_string();
1088        self.header.insert(
1089            key,
1090            HeaderValue(
1091                Location::Other("CTLexerBuilder".to_string()),
1092                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1093            ),
1094        );
1095        self
1096    }
1097
1098    /// Sets the `regex::RegexBuilder` option of the same name.
1099    /// Default value is specified by regex.
1100    ///
1101    /// Setting this flag will override the same flag within a `%grmtools` section.
1102    pub fn swap_greed(mut self, flag: bool) -> Self {
1103        let key = "swap_greed".to_string();
1104        self.header.insert(
1105            key,
1106            HeaderValue(
1107                Location::Other("CTLexerBuilder".to_string()),
1108                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1109            ),
1110        );
1111        self
1112    }
1113
1114    /// Sets the `regex::RegexBuilder` option of the same name.
1115    /// Default value is specified by regex.
1116    ///
1117    /// Setting this flag will override the same flag within a `%grmtools` section.
1118    pub fn ignore_whitespace(mut self, flag: bool) -> Self {
1119        let key = "ignore_whitespace".to_string();
1120        self.header.insert(
1121            key,
1122            HeaderValue(
1123                Location::Other("CTLexerBuilder".to_string()),
1124                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1125            ),
1126        );
1127        self
1128    }
1129
1130    /// Sets the `regex::RegexBuilder` option of the same name.
1131    /// Default value is specified by regex.
1132    ///
1133    /// Setting this flag will override the same flag within a `%grmtools` section.
1134    pub fn unicode(mut self, flag: bool) -> Self {
1135        let key = "unicode".to_string();
1136        self.header.insert(
1137            key,
1138            HeaderValue(
1139                Location::Other("CTLexerBuilder".to_string()),
1140                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1141            ),
1142        );
1143        self
1144    }
1145
1146    /// Sets the `regex::RegexBuilder` option of the same name.
1147    /// Default value is specified by regex.
1148    ///
1149    /// Setting this flag will override the same flag within a `%grmtools` section.
1150    pub fn case_insensitive(mut self, flag: bool) -> Self {
1151        let key = "case_insensitive".to_string();
1152        self.header.insert(
1153            key,
1154            HeaderValue(
1155                Location::Other("CTLexerBuilder".to_string()),
1156                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
1157            ),
1158        );
1159        self
1160    }
1161
1162    /// Sets the `regex::RegexBuilder` option of the same name.
1163    /// Default value is specified by regex.
1164    ///
1165    /// Setting this flag will override the same flag within a `%grmtools` section.
1166    pub fn size_limit(mut self, sz: usize) -> Self {
1167        let key = "size_limit".to_string();
1168        self.header.insert(
1169            key,
1170            HeaderValue(
1171                Location::Other("CTLexerBuilder".to_string()),
1172                Value::Setting(Setting::Num(
1173                    sz as u64,
1174                    Location::Other("CTLexerBuilder".to_string()),
1175                )),
1176            ),
1177        );
1178        self
1179    }
1180
1181    /// Sets the `regex::RegexBuilder` option of the same name.
1182    /// Default value is specified by regex.
1183    ///
1184    /// Setting this flag will override the same flag within a `%grmtools` section.
1185    pub fn dfa_size_limit(mut self, sz: usize) -> Self {
1186        let key = "dfa_size_limit".to_string();
1187        self.header.insert(
1188            key,
1189            HeaderValue(
1190                Location::Other("CTLexerBuilder".to_string()),
1191                Value::Setting(Setting::Num(
1192                    sz as u64,
1193                    Location::Other("CTLexerBuilder".to_string()),
1194                )),
1195            ),
1196        );
1197        self
1198    }
1199
1200    /// Sets the `regex::RegexBuilder` option of the same name.
1201    /// Default value is specified by regex.
1202    ///
1203    /// Setting this flag will override the same flag within a `%grmtools` section.
1204    pub fn nest_limit(mut self, lim: u32) -> Self {
1205        let key = "nest_limit".to_string();
1206        self.header.insert(
1207            key,
1208            HeaderValue(
1209                Location::Other("CTLexerBuilder".to_string()),
1210                Value::Setting(Setting::Num(
1211                    lim as u64,
1212                    Location::Other("CTLexerBuilder".to_string()),
1213                )),
1214            ),
1215        );
1216        self
1217    }
1218
1219    #[cfg(test)]
1220    pub fn inspect_lexerkind(
1221        mut self,
1222        cb: Box<dyn Fn(&LexerKind) -> Result<(), Box<dyn Error>>>,
1223    ) -> Self {
1224        self.inspect_lexerkind_cb = Some(cb);
1225        self
1226    }
1227}
1228
/// An interface to the result of [CTLexerBuilder::build()].
pub struct CTLexer {
    // Names of tokens used in the grammar but not defined in the lexer.
    missing_from_lexer: Option<HashSet<String>>,
    // Names of tokens defined in the lexer but not referenced in the grammar, paired with the
    // span of each token's definition in the lexer source.
    missing_from_parser: Option<HashSet<(String, Span)>>,
}
1234
impl CTLexer {
    /// Names of tokens used in the grammar but not defined in the lexer, if any were found.
    fn missing_from_lexer(&self) -> Option<&HashSet<String>> {
        self.missing_from_lexer.as_ref()
    }

    /// Names (and lexer-definition spans) of tokens defined in the lexer but not referenced in
    /// the grammar, if any were found.
    fn missing_from_parser(&self) -> Option<&HashSet<(String, Span)>> {
        self.missing_from_parser.as_ref()
    }
}
1244
/// Exports all token IDs used by a parser as a separate Rust module.
///
/// This builder will create a Rust module named `mod_name`
/// that can be imported with [`lrlex_mod!(mod_name)`](crate::lrlex_mod).
/// The module will contain one `const` `StorageT` per token in `token_map`,
/// with the token prefixed by `T_`. In addition, it will contain
/// an array of all token IDs `TOK_IDS`.
///
/// For example, if `StorageT` is `u8`, `mod_name` is `x`, and `token_map` is
/// `HashMap{"ID": 0, "INT": 1}` the generated module will look roughly as follows:
///
/// ```rust,ignore
/// mod x {
///   pub const T_ID: u8 = 0;
///   pub const T_INT: u8 = 1;
///   pub const TOK_IDS: &[u8] = &[T_ID, T_INT];
/// }
/// ```
///
/// See the [custom lexer example] for more usage details.
///
/// [custom lexer example]: https://github.com/softdevteam/grmtools/tree/master/lrlex/examples/calc_manual_lex
#[derive(Debug, Clone)]
pub struct CTTokenMapBuilder<StorageT: Display + ToTokens> {
    // Name of the module to generate (also used as the output file's stem).
    mod_name: String,
    // Token name -> token value, with each value pre-rendered as a `TokenStream`.
    token_map: Vec<(String, TokenStream)>,
    // Optional token name -> Rust identifier map for tokens whose names are not valid
    // identifiers (see `rename_map`).
    rename_map: Option<HashMap<String, String>>,
    // Whether to add `#[allow(dead_code)]` to the generated module.
    allow_dead_code: bool,
    // `StorageT` is only needed at build time to render the constants' type.
    _marker: PhantomData<StorageT>,
}
1275
1276impl<StorageT: Display + ToTokens> CTTokenMapBuilder<StorageT> {
1277    /// Create a new token map builder.
1278    ///
1279    /// See the [builder documentation] for more info.
1280    ///
1281    /// [builder documentation]: CTTokenMapBuilder
1282    pub fn new(
1283        mod_name: impl Into<String>,
1284        token_map: impl Borrow<HashMap<String, StorageT>>,
1285    ) -> Self {
1286        Self {
1287            mod_name: mod_name.into(),
1288            token_map: token_map
1289                .borrow()
1290                .iter()
1291                .map(|(tok_name, tok_value)| (tok_name.clone(), tok_value.to_token_stream()))
1292                .collect(),
1293            rename_map: None,
1294            allow_dead_code: false,
1295            _marker: PhantomData,
1296        }
1297    }
1298
1299    /// Set a token rename map.
1300    ///
1301    /// Rename map is used to specify identifier names for tokens whose names
1302    /// are not valid Rust identifiers. For example, if `token_map`
1303    /// is `HashMap{"+": 0, "ID": 1}` and `rename_map` is `HashMap{"+": "PLUS"}`
1304    /// then the generated module will look roughly as follows:
1305    ///
1306    /// ```rust,ignore
1307    /// mod x {
1308    ///   pub const T_PLUS: u8 = 0;
1309    ///   pub const T_ID: u8 = 1;
1310    /// }
1311    /// ```
1312    pub fn rename_map<M, I, K, V>(mut self, rename_map: Option<M>) -> Self
1313    where
1314        M: IntoIterator<Item = I>,
1315        I: Borrow<(K, V)>,
1316        K: AsRef<str>,
1317        V: AsRef<str>,
1318    {
1319        self.rename_map = rename_map.map(|rename_map| {
1320            rename_map
1321                .into_iter()
1322                .map(|it| {
1323                    let (k, v) = it.borrow();
1324                    let k = k.as_ref().into();
1325                    let v = v.as_ref().into();
1326                    (k, v)
1327                })
1328                .collect()
1329        });
1330        self
1331    }
1332
1333    /// Control whether the builder will add `#[allow(dead_code)]`
1334    /// to the generated module.
1335    ///
1336    /// By default, all tokens are `#[deny(dead_code)]`, meaning that you'll
1337    /// get a warning if your custom lexer doesn't use any of them.
1338    /// This function can be used to disable this behavior.
1339    pub fn allow_dead_code(mut self, allow_dead_code: bool) -> Self {
1340        self.allow_dead_code = allow_dead_code;
1341        self
1342    }
1343
1344    /// Build the token map module.
1345    pub fn build(&self) -> Result<(), Box<dyn Error>> {
1346        // Record the time that this version of lrlex was built. If the source code changes and rustc
1347        // forces a recompile, this will change this value, causing anything which depends on this
1348        // build of lrlex to be recompiled too.
1349        let mut outs = String::new();
1350        let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
1351        let mod_ident = format_ident!("{}", self.mod_name);
1352        write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
1353        let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
1354        // Sort the tokens so that they're always in the same order.
1355        // This will prevent unneeded rebuilds.
1356        let mut token_map_sorted = self.token_map.clone();
1357        token_map_sorted.sort_by(|(l, _), (r, _)| l.cmp(r));
1358        let (token_array, tokens) = token_map_sorted
1359            .iter()
1360            .map(|(k, id)| {
1361                let name = match &self.rename_map {
1362                    Some(rmap) => rmap.get(k).unwrap_or(k),
1363                    _ => k,
1364                };
1365                let tok_ident: Ident = syn::parse_str(&format!("T_{}", name.to_ascii_uppercase()))
1366                    .map_err(|e| {
1367                        format!(
1368                            "token name {:?} is not a valid Rust identifier: {}; \
1369                            consider renaming it via `CTTokenMapBuilder::rename_map`.",
1370                            name, e
1371                        )
1372                    })?;
1373                Ok((
1374                    // Note: the array of all tokens can't use `tok_ident` because
1375                    // it will confuse the dead code checker. For this reason,
1376                    // we use `id` here.
1377                    quote! {
1378                        #id,
1379                    },
1380                    quote! {
1381                        pub const #tok_ident: #storaget = #id;
1382                    },
1383                ))
1384            })
1385            .collect::<Result<(TokenStream, TokenStream), Box<dyn Error>>>()?;
1386        let unused_annotation = if self.allow_dead_code {
1387            quote! {#[allow(dead_code)]}
1388        } else {
1389            quote! {}
1390        };
1391        // Since the formatter doesn't preserve comments and we don't want to lose build time,
1392        // just format the module contents.
1393        let unformatted = quote! {
1394            #unused_annotation
1395            mod #mod_ident {
1396                #tokens
1397                #[allow(dead_code)]
1398                pub const TOK_IDS: &[#storaget] = &[#token_array];
1399            }
1400        }
1401        .to_string();
1402        let out_mod = syn::parse_str(&unformatted)
1403            .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
1404            .unwrap_or(unformatted);
1405        outs.push_str(&out_mod);
1406        let mut outp = PathBuf::from(var("OUT_DIR")?);
1407        outp.push(&self.mod_name);
1408        outp.set_extension("rs");
1409
1410        // If the file we're about to write out already exists with the same contents, then we
1411        // don't overwrite it (since that will force a recompile of the file, and relinking of the
1412        // binary etc).
1413        if let Ok(curs) = read_to_string(&outp) {
1414            if curs == outs {
1415                return Ok(());
1416            }
1417        }
1418
1419        let mut f = File::create(outp)?;
1420        f.write_all(outs.as_bytes())?;
1421        Ok(())
1422    }
1423}
1424
1425/// Create a Rust module named `mod_name` that can be imported with
1426/// [`lrlex_mod!(mod_name)`](crate::lrlex_mod).
1427///
1428/// This function is deprecated in favour of [`CTTokenMapBuilder`].
1429#[deprecated(since = "0.14.0", note = "use `lrlex::CTTokenMapBuilder` instead")]
1430pub fn ct_token_map<StorageT: Display + ToTokens>(
1431    mod_name: &str,
1432    token_map: impl Borrow<HashMap<String, StorageT>>,
1433    rename_map: Option<&HashMap<&str, &str>>,
1434) -> Result<(), Box<dyn Error>> {
1435    CTTokenMapBuilder::new(mod_name, token_map)
1436        .rename_map(rename_map)
1437        .allow_dead_code(true)
1438        .build()
1439}
1440
/// Indents a multi-line string and trims any trailing newline.
/// This currently assumes that indentation on blank lines does not matter.
///
/// The algorithm used by this function is:
/// 1. Prefix `s` with the indentation, indenting the first line.
/// 2. Trim any trailing newlines.
/// 3. Replace all newlines with `\n{indent}` to indent all lines after the first.
///
/// It is plausible that we should add a step 4, but currently do not:
/// 4. Replace all `\n{indent}\n` with `\n\n`
fn indent(indent: &str, s: &str) -> String {
    let body = s.trim_end_matches('\n');
    let mut prefixed = String::with_capacity(indent.len() + body.len() + 1);
    prefixed.push_str(indent);
    prefixed.push_str(body);
    prefixed.push('\n');
    // Indent every line after the first (the re-added trailing newline is followed by an
    // indent too, matching this function's historic output).
    prefixed.replace('\n', &format!("\n{}", indent))
}
1454
// It isn't clear to me why this test isn't working on wasm32,
// as the `workspace_runner` should allow access to `OUT_DIR`
// perhaps it is related to absolute paths
#[cfg(all(not(target_arch = "wasm32"), test))]
mod test {
    use std::fs::File;
    use std::io::Write;

    use super::{CTLexerBuilder, LexerKind};
    #[test]
    fn test_grmtools_section_lexerkind() {
        // All of these spellings should resolve to `LexerKind::LRNonStreamingLexer`: the
        // variant name is matched case-insensitively, with or without a `LexerKind::` prefix.
        let lexerkinds = [
            "LRNonStreamingLexer",
            "lrnonstreaminglexer",
            "LexerKind::lrnonstreaminglexer",
            "lexerkind::LRNonStreamingLexer",
        ];
        for (i, kind) in lexerkinds.iter().enumerate() {
            // Write a minimal lexer with a `%grmtools` header declaring the lexerkind.
            let lex_src = format!(
                "
%grmtools{{lexerkind: {}}}
%%
. ;
",
                kind
            );
            // Each iteration gets its own file so the tests don't clobber one another.
            let lex_path = format!(
                "{}/test_grmtools_section_lexerkind_{}.l",
                env!("OUT_DIR"),
                i
            );
            let mut l_file = File::create(lex_path.clone()).unwrap();
            l_file.write_all(lex_src.as_bytes()).unwrap();
            // Use the test-only hook to observe which lexerkind the builder resolved.
            CTLexerBuilder::new()
                .output_path(format!("{}.rs", lex_path.clone()))
                .lexer_path(lex_path.clone())
                .inspect_lexerkind(Box::new(move |lexerkind| {
                    assert!(matches!(lexerkind, &LexerKind::LRNonStreamingLexer));
                    Ok(())
                }))
                .build()
                .unwrap();
        }
    }

    #[test]
    /// Tests a yacc .y filename containing a dash character leading to an invalid rust identifier
    /// when that dash is subsequently used as the default `CTParserBuilder::mod_name`.
    fn test_invalid_identifier_in_derived_mod_name() {
        let mut lex_path = std::path::PathBuf::from(env!("OUT_DIR"));
        lex_path.push("contains-a-dash.l");
        let mut f = File::create(&lex_path).unwrap();
        let _ = f.write_all(
            r#"
%%
A  "A"
"#
            .as_bytes(),
        );
        // No explicit mod_name is given, so the builder derives `contains-a-dash_l` from the
        // filename, which must be rejected as an invalid Rust identifier.
        match CTLexerBuilder::new()
            .output_path(format!("{}.rs", lex_path.display()))
            .lexer_path(lex_path.clone())
            .build()
        {
            Ok(_) => panic!("Expected error"),
            Err(e) => {
                let err_string = e.to_string();
                assert_eq!(
                    err_string,
                    "CTLexerBuilder::mod_name(\"contains-a-dash_l\") is not a valid rust identifier due to 'unexpected token'"
                );
            }
        }
    }
}