lrlex/ctbuilder.rs

//! Build lexers at compile-time.

use std::{
    any::type_name,
    collections::{HashMap, HashSet},
    env::{current_dir, var},
    error::Error,
    fmt::{Debug, Display, Write as _},
    fs::{self, create_dir_all, read_to_string, File},
    hash::Hash,
    io::Write,
    path::{Path, PathBuf},
    str::FromStr,
    sync::Mutex,
};

use bincode::Encode;
use cfgrammar::{
    header::{GrmtoolsSectionParser, HeaderError, HeaderErrorKind, Namespaced, Setting, Value},
    newlinecache::NewlineCache,
    Spanned,
};
use lazy_static::lazy_static;
use lrpar::{CTParserBuilder, LexerTypes};
use num_traits::{AsPrimitive, PrimInt, Unsigned};
use proc_macro2::TokenStream;
use quote::{format_ident, quote, ToTokens, TokenStreamExt};
use regex::Regex;

use crate::{DefaultLexerTypes, LRNonStreamingLexerDef, LexFlags, LexerDef, UNSPECIFIED_LEX_FLAGS};

const RUST_FILE_EXT: &str = "rs";

lazy_static! {
    static ref RE_TOKEN_ID: Regex = Regex::new(r"^[a-zA-Z_][a-zA-Z_0-9]*$").unwrap();
    static ref GENERATED_PATHS: Mutex<HashSet<PathBuf>> = Mutex::new(HashSet::new());
}

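/// The kind of lexer that should be generated.
///
/// This can also be selected via a `lexerkind` entry in a `%grmtools` section at the top of a
/// `.l` file, e.g.:
///
/// ```text
/// %grmtools{lexerkind: LRNonStreamingLexer}
/// ```
///
/// If neither is specified, `LRNonStreamingLexer` is used.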
#[non_exhaustive]
pub enum LexerKind {
    LRNonStreamingLexer,
}

impl TryFrom<&Value> for LexerKind {
    type Error = cfgrammar::header::HeaderError;
    fn try_from(it: &Value) -> Result<LexerKind, Self::Error> {
        match it {
            Value::Flag(_, loc) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind` found bool",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Num(_, loc)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind` found numeric",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Constructor {
                ctor:
                    Namespaced {
                        namespace: _,
                        member: (_, loc),
                    },
                arg: _,
            }) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind` found constructor",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Unitary(Namespaced {
                namespace,
                member: (member, member_loc),
            })) => {
                if let Some((ns, loc)) = namespace {
                    if ns.to_lowercase() != "lexerkind" {
                        return Err(HeaderError {
                            kind: HeaderErrorKind::ConversionError(
                                "LexerKind",
                                "Expected namespace `LexerKind`",
                            ),
                            locations: vec![loc.clone()],
                        });
                    }
                }
                if member.to_lowercase() != "lrnonstreaminglexer" {
                    return Err(HeaderError {
                        kind: HeaderErrorKind::ConversionError(
                            "LexerKind",
                            "Unknown `LexerKind` Variant",
                        ),
                        locations: vec![member_loc.clone()],
                    });
                }

                Ok(LexerKind::LRNonStreamingLexer)
            }
        }
    }
}

/// Specify the visibility of the module generated by [CTLexerBuilder].
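///
/// For example, a minimal sketch (the lexer path is illustrative) making the generated module
/// `pub(crate)`:
///
/// ```text
/// CTLexerBuilder::new()
///     .lexer_in_src_dir("grm.l")?
///     .visibility(Visibility::PublicCrate)
///     .build()?;
/// ```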
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum Visibility {
    /// Module-level visibility only.
    Private,
    /// `pub`
    Public,
    /// `pub(super)`
    PublicSuper,
    /// `pub(self)`
    PublicSelf,
    /// `pub(crate)`
    PublicCrate,
    /// `pub(in {arg})`
    PublicIn(String),
}

impl ToTokens for Visibility {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        tokens.extend(match self {
            Visibility::Private => quote!(),
            Visibility::Public => quote! {pub},
            Visibility::PublicSuper => quote! {pub(super)},
            Visibility::PublicSelf => quote! {pub(self)},
            Visibility::PublicCrate => quote! {pub(crate)},
            Visibility::PublicIn(data) => {
                let other = str::parse::<TokenStream>(data).unwrap();
                quote! {pub(in #other)}
            }
        })
    }
}

/// Specifies the [Rust Edition] that will be emitted during code generation.
///
/// [Rust Edition]: https://doc.rust-lang.org/edition-guide/rust-2021/index.html
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum RustEdition {
    Rust2015,
    Rust2018,
    Rust2021,
}

/// The quote impl of `ToTokens` for `Option` prints an empty string for `None`
/// and the inner value for `Some(inner_value)`.
///
/// This wrapper instead emits both `Some` and `None` variants.
/// See: [quote #20](https://github.com/dtolnay/quote/issues/20)
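///
/// For example, `QuoteOption(Some(1u8))` emits `::std::option::Option::Some(1u8)`, while
/// `QuoteOption::<u8>(None)` emits `::std::option::Option::None`.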
struct QuoteOption<T>(Option<T>);

impl<T: ToTokens> ToTokens for QuoteOption<T> {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        tokens.append_all(match self.0 {
            Some(ref t) => quote! { ::std::option::Option::Some(#t) },
            None => quote! { ::std::option::Option::None },
        });
    }
}

/// This wrapper adds a missing impl of `ToTokens` for tuples.
/// For a tuple `(a, b)`, it emits `(a.to_tokens(), b.to_tokens())`.
struct QuoteTuple<T>(T);

impl<A: ToTokens, B: ToTokens> ToTokens for QuoteTuple<(A, B)> {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        let (a, b) = &self.0;
        tokens.append_all(quote!((#a, #b)));
    }
}

/// The wrapped `&str` value will be emitted with a call to `to_string()`.
struct QuoteToString<'a>(&'a str);

impl ToTokens for QuoteToString<'_> {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        let x = &self.0;
        tokens.append_all(quote! { #x.to_string() });
    }
}
/// A `CTLexerBuilder` allows one to specify the criteria for building a statically generated
/// lexer.
pub struct CTLexerBuilder<'a, LexerTypesT: LexerTypes = DefaultLexerTypes<u32>>
where
    LexerTypesT::StorageT: Debug + Eq + Hash + ToTokens,
    usize: num_traits::AsPrimitive<LexerTypesT::StorageT>,
{
    lrpar_config: Option<Box<dyn Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT>>>,
    lexer_path: Option<PathBuf>,
    output_path: Option<PathBuf>,
    lexerkind: Option<LexerKind>,
    mod_name: Option<&'a str>,
    visibility: Visibility,
    rust_edition: RustEdition,
    rule_ids_map: Option<HashMap<String, LexerTypesT::StorageT>>,
    allow_missing_terms_in_lexer: bool,
    allow_missing_tokens_in_parser: bool,
    force_lex_flags: LexFlags,
    default_lex_flags: LexFlags,
    #[cfg(test)]
    inspect_lexerkind_cb: Option<Box<dyn Fn(LexerKind) -> Result<(), Box<dyn Error>>>>,
}

impl CTLexerBuilder<'_, DefaultLexerTypes<u32>> {
    /// Create a new [CTLexerBuilder].
    pub fn new() -> Self {
        CTLexerBuilder::<DefaultLexerTypes<u32>>::new_with_lexemet()
    }
}

impl<'a, LexerTypesT: LexerTypes> CTLexerBuilder<'a, LexerTypesT>
where
    LexerTypesT::StorageT:
        'static + Debug + Eq + Hash + PrimInt + Encode + TryFrom<usize> + Unsigned + ToTokens,
    usize: AsPrimitive<LexerTypesT::StorageT>,
{
    /// Create a new [CTLexerBuilder].
    ///
    /// `LexerTypesT::StorageT` must be an unsigned integer type (e.g. `u8`, `u16`) which is big
    /// enough to index all the tokens and rules in the lexer and less than or equal in size to
    /// `usize` (e.g. on a 64-bit machine `u128` would be too big). If you are lexing large
    /// files, the additional storage requirements of larger integer types can be noticeable, and
    /// in such cases it can be worth specifying a smaller type. `StorageT` defaults to `u32` if
    /// unspecified.
    ///
    /// # Examples
    ///
    /// ```text
    /// CTLexerBuilder::<DefaultLexerTypes<u8>>::new_with_lexemet()
    ///     .lexer_in_src_dir("grm.l")?
    ///     .build()?;
    /// ```
    pub fn new_with_lexemet() -> Self {
        CTLexerBuilder {
            lrpar_config: None,
            lexer_path: None,
            output_path: None,
            lexerkind: None,
            mod_name: None,
            visibility: Visibility::Private,
            rust_edition: RustEdition::Rust2021,
            rule_ids_map: None,
            allow_missing_terms_in_lexer: false,
            allow_missing_tokens_in_parser: true,
            force_lex_flags: UNSPECIFIED_LEX_FLAGS,
            default_lex_flags: UNSPECIFIED_LEX_FLAGS,
            #[cfg(test)]
            inspect_lexerkind_cb: None,
        }
    }

    /// An optional convenience function to make it easier to create an (lrlex) lexer and (lrpar)
    /// parser in one shot. The closure passed to this function will be called during
    /// [CTLexerBuilder::build]: it will be passed an lrpar `CTParserBuilder` instance upon which
    /// it can set whatever lrpar options are desired. [`CTLexerBuilder`] will then create both the
    /// parser and lexer and link them together as required.
    ///
    /// # Examples
    ///
    /// ```text
    /// CTLexerBuilder::new()
    ///     .lrpar_config(|ctp| {
    ///         ctp.yacckind(YaccKind::Grmtools)
    ///             .grammar_in_src_dir("calc.y")
    ///             .unwrap()
    ///     })
    ///     .lexer_in_src_dir("calc.l")?
    ///     .build()?;
    /// ```
    pub fn lrpar_config<F>(mut self, config_func: F) -> Self
    where
        F: 'static + Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT>,
    {
        self.lrpar_config = Some(Box::new(config_func));
        self
    }

    /// Set the input lexer path to a file relative to this project's `src` directory. This will
    /// also set the output path (i.e. you do not need to call [CTLexerBuilder::output_path]).
    ///
    /// For example if `a/b.l` is passed as `inp` then [CTLexerBuilder::build] will:
    ///   * use `src/a/b.l` as the input file.
    ///   * write output to a file which can then be imported by calling `lrlex_mod!("a/b.l")`.
    ///   * create a module in that output file named `b_l`.
    ///
    /// You can override the output path and/or module name by calling
    /// [CTLexerBuilder::output_path] and/or [CTLexerBuilder::mod_name], respectively, after
    /// calling this function.
    ///
    /// This is a convenience function that makes it easier to compile lexer files stored in a
    /// project's `src/` directory: please see [CTLexerBuilder::build] for additional constraints
    /// and information about the generated files. Note also that each `.l` file can only be
    /// processed once using this function: if you want to generate multiple lexers from a single
    /// `.l` file, you will need to use [CTLexerBuilder::output_path].
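    ///
    /// # Examples
    ///
    /// A minimal sketch, assuming a lexer file exists at `src/grm.l`:
    ///
    /// ```text
    /// CTLexerBuilder::new()
    ///     .lexer_in_src_dir("grm.l")?
    ///     .build()?;
    /// ```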
    pub fn lexer_in_src_dir<P>(mut self, srcp: P) -> Result<Self, Box<dyn Error>>
    where
        P: AsRef<Path>,
    {
        if !srcp.as_ref().is_relative() {
            return Err(format!(
                "Lexer path '{}' must be a relative path.",
                srcp.as_ref().to_str().unwrap_or("<invalid UTF-8>")
            )
            .into());
        }

        let mut lexp = current_dir()?;
        lexp.push("src");
        lexp.push(srcp.as_ref());
        self.lexer_path = Some(lexp);

        let mut outp = PathBuf::new();
        outp.push(var("OUT_DIR").unwrap());
        outp.push(srcp.as_ref().parent().unwrap().to_str().unwrap());
        create_dir_all(&outp)?;
        let mut leaf = srcp
            .as_ref()
            .file_name()
            .unwrap()
            .to_str()
            .unwrap()
            .to_owned();
        write!(leaf, ".{}", RUST_FILE_EXT).ok();
        outp.push(leaf);
        Ok(self.output_path(outp))
    }

    /// Set the input lexer path to `inp`. If specified, you must also call
    /// [CTLexerBuilder::output_path]. In general it is easier to use
    /// [CTLexerBuilder::lexer_in_src_dir].
    pub fn lexer_path<P>(mut self, inp: P) -> Self
    where
        P: AsRef<Path>,
    {
        self.lexer_path = Some(inp.as_ref().to_owned());
        self
    }

    /// Set the output lexer path to `outp`. Note that there are no requirements on `outp`: the
    /// file can exist anywhere you can create a valid [Path] to. However, if you wish to use
    /// [crate::lrlex_mod!] you will need to make sure that `outp` is in
    /// [std::env::var]`("OUT_DIR")` or one of its subdirectories.
    pub fn output_path<P>(mut self, outp: P) -> Self
    where
        P: AsRef<Path>,
    {
        self.output_path = Some(outp.as_ref().to_owned());
        self
    }

    /// Set the type of lexer to be generated to `lexerkind`.
    pub fn lexerkind(mut self, lexerkind: LexerKind) -> Self {
        self.lexerkind = Some(lexerkind);
        self
    }

    /// Set the generated module name to `mod_name`. If no module name is specified,
    /// [CTLexerBuilder::build] will attempt to create a sensible default based on the input
    /// filename.
    pub fn mod_name(mut self, mod_name: &'a str) -> Self {
        self.mod_name = Some(mod_name);
        self
    }

    /// Set the visibility of the generated module to `vis`. Defaults to `Visibility::Private`.
    pub fn visibility(mut self, vis: Visibility) -> Self {
        self.visibility = vis;
        self
    }

    /// Sets the Rust edition to be used for generated code. Defaults to the latest edition of
    /// Rust supported by grmtools.
    pub fn rust_edition(mut self, edition: RustEdition) -> Self {
        self.rust_edition = edition;
        self
    }

    /// Set this lexer builder's map of rule IDs to `rule_ids_map`. By default, lexing rules have
    /// arbitrary, but distinct, IDs. Setting the map of rule IDs (from rule names to `StorageT`)
    /// allows users to synchronise a lexer and parser, and to check that all rules are used by
    /// both parts.
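    ///
    /// # Examples
    ///
    /// A minimal sketch, assuming `token_map` is a `HashMap<String, StorageT>` obtained from a
    /// previously built parser (e.g. the token map returned by lrpar's `CTParserBuilder::build`):
    ///
    /// ```text
    /// CTLexerBuilder::new()
    ///     .rule_ids_map(&token_map)
    ///     .lexer_in_src_dir("grm.l")?
    ///     .build()?;
    /// ```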
    pub fn rule_ids_map<T: std::borrow::Borrow<HashMap<String, LexerTypesT::StorageT>> + Clone>(
        mut self,
        rule_ids_map: T,
    ) -> Self {
        self.rule_ids_map = Some(rule_ids_map.borrow().to_owned());
        self
    }

    /// Statically compile the `.l` file specified by [CTLexerBuilder::lexer_path()] into Rust,
    /// placing the output into the file specified by [CTLexerBuilder::output_path()].
    ///
    /// The generated module follows the form:
    ///
    /// ```text
    ///    mod modname {
    ///      pub fn lexerdef() -> LexerDef<LexerTypesT> { ... }
    ///
    ///      ...
    ///    }
    /// ```
    ///
    /// where:
    ///  * `modname` is either:
    ///    * the module name specified by [CTLexerBuilder::mod_name()]
    ///    * or, if no module name was explicitly specified, then for the file `/a/b/c.l` the
    ///      module name is `c_l` (i.e. the file's leaf name, minus its extension, with a suffix
    ///      of `_l`).
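    ///
    /// A sketch of importing and using the generated module (assuming the input was set via
    /// `lexer_in_src_dir("a/b.l")`):
    ///
    /// ```text
    /// lrlex_mod!("a/b.l");
    /// let lexerdef = b_l::lexerdef();
    /// ```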
    pub fn build(mut self) -> Result<CTLexer, Box<dyn Error>> {
        if let Some(ref lrcfg) = self.lrpar_config {
            let mut ctp = CTParserBuilder::<LexerTypesT>::new();
            ctp = lrcfg(ctp);
            let map = ctp.build()?;
            self.rule_ids_map = Some(map.token_map().to_owned());
        }

        let lexerp = self
            .lexer_path
            .as_ref()
            .expect("lexer_path must be specified before processing.");
        let outp = self
            .output_path
            .as_ref()
            .expect("output_path must be specified before processing.");

        {
            let mut lk = GENERATED_PATHS.lock().unwrap();
            if lk.contains(outp.as_path()) {
                return Err(format!("Generating two lexers to the same path ('{}') is not allowed: use CTLexerBuilder::output_path (and, optionally, CTLexerBuilder::mod_name) to differentiate them.", &outp.to_str().unwrap()).into());
            }
            lk.insert(outp.clone());
        }
        let lex_src = read_to_string(lexerp)
            .map_err(|e| format!("When reading '{}': {e}", lexerp.display()))?;
        let (header, _) = GrmtoolsSectionParser::new(&lex_src, false)
            .parse()
            .map_err(|es| {
                es.iter()
                    .map(|e| e.to_string())
                    .collect::<Vec<_>>()
                    .join("\n")
            })?;
        let lexerkind = match self.lexerkind {
            Some(lexerkind) => lexerkind,
            None => {
                if let Some((_, lk_val)) = header.get("lexerkind") {
                    LexerKind::try_from(lk_val)?
                } else {
                    LexerKind::LRNonStreamingLexer
                }
            }
        };
        let line_cache = NewlineCache::from_str(&lex_src).unwrap();
        #[cfg(test)]
        if let Some(inspect_lexerkind_cb) = self.inspect_lexerkind_cb {
            inspect_lexerkind_cb(lexerkind)?
        }
        let (mut lexerdef, lex_flags): (Box<dyn LexerDef<LexerTypesT>>, LexFlags) = match lexerkind
        {
            LexerKind::LRNonStreamingLexer => {
                let lexerdef = LRNonStreamingLexerDef::<LexerTypesT>::new_with_options(
                    &lex_src,
                    self.force_lex_flags.clone(),
                    self.default_lex_flags.clone(),
                )
                .map_err(|errs| {
                    errs.iter()
                        .map(|e| {
                            if let Some((line, column)) = line_cache.byte_to_line_num_and_col_num(
                                &lex_src,
                                e.spans().first().unwrap().start(),
                            ) {
                                format!("{} at line {line} column {column}", e)
                            } else {
                                format!("{}", e)
                            }
                        })
                        .collect::<Vec<_>>()
                        .join("\n")
                })?;
                let lex_flags = lexerdef.lex_flags().cloned();
                (Box::new(lexerdef), lex_flags.unwrap())
            }
        };
        let (missing_from_lexer, missing_from_parser) = match self.rule_ids_map {
            Some(ref rim) => {
                // Convert from HashMap<String, _> to HashMap<&str, _>
                let owned_map = rim
                    .iter()
                    .map(|(x, y)| (&**x, *y))
                    .collect::<HashMap<_, _>>();
                let (x, y) = lexerdef.set_rule_ids(&owned_map);
                (
                    x.map(|a| a.iter().map(|&b| b.to_string()).collect::<HashSet<_>>()),
                    y.map(|a| a.iter().map(|&b| b.to_string()).collect::<HashSet<_>>()),
                )
            }
            None => (None, None),
        };

        let mut has_unallowed_missing = false;
        if !self.allow_missing_terms_in_lexer {
            if let Some(ref mfl) = missing_from_lexer {
                eprintln!("Error: the following tokens are used in the grammar but are not defined in the lexer:");
                for n in mfl {
                    eprintln!("    {}", n);
                }
                has_unallowed_missing = true;
            }
        }
        if !self.allow_missing_tokens_in_parser {
            if let Some(ref mfp) = missing_from_parser {
                eprintln!("Error: the following tokens are defined in the lexer but not used in the grammar:");
                for n in mfp {
                    eprintln!("    {}", n);
                }
                has_unallowed_missing = true;
            }
        }
        if has_unallowed_missing {
            fs::remove_file(outp).ok();
            panic!();
        }

        let mod_name = match self.mod_name {
            Some(s) => s.to_owned(),
            None => {
                // The user hasn't specified a module name, so we create one automatically: what we
                // do is strip off all the filename extensions (note that it's likely that inp ends
                // with `l.rs`, so we potentially have to strip off more than one extension) and
                // then add `_l` to the end.
                let mut stem = lexerp.to_str().unwrap();
                loop {
                    let new_stem = Path::new(stem).file_stem().unwrap().to_str().unwrap();
                    if stem == new_stem {
                        break;
                    }
                    stem = new_stem;
                }
                format!("{}_l", stem)
            }
        };
        let mod_name = format_ident!("{}", mod_name);
        let mut lexerdef_func_impl = {
            let LexFlags {
                allow_wholeline_comments,
                dot_matches_new_line,
                multi_line,
                octal,
                posix_escapes,
                case_insensitive,
                unicode,
                swap_greed,
                ignore_whitespace,
                size_limit,
                dfa_size_limit,
                nest_limit,
            } = lex_flags;
            let allow_wholeline_comments = QuoteOption(allow_wholeline_comments);
            let dot_matches_new_line = QuoteOption(dot_matches_new_line);
            let multi_line = QuoteOption(multi_line);
            let octal = QuoteOption(octal);
            let posix_escapes = QuoteOption(posix_escapes);
            let case_insensitive = QuoteOption(case_insensitive);
            let unicode = QuoteOption(unicode);
            let swap_greed = QuoteOption(swap_greed);
            let ignore_whitespace = QuoteOption(ignore_whitespace);
            let size_limit = QuoteOption(size_limit);
            let dfa_size_limit = QuoteOption(dfa_size_limit);
            let nest_limit = QuoteOption(nest_limit);

            // Code gen for the lexerdef() `lex_flags` variable.
            quote! {
                let mut lex_flags = ::lrlex::DEFAULT_LEX_FLAGS;
                lex_flags.allow_wholeline_comments = #allow_wholeline_comments;
                lex_flags.dot_matches_new_line = #dot_matches_new_line;
                lex_flags.multi_line = #multi_line;
                lex_flags.octal = #octal;
                lex_flags.posix_escapes = #posix_escapes;
                lex_flags.case_insensitive = #case_insensitive;
                lex_flags.unicode = #unicode;
                lex_flags.swap_greed = #swap_greed;
                lex_flags.ignore_whitespace = #ignore_whitespace;
                lex_flags.size_limit = #size_limit;
                lex_flags.dfa_size_limit = #dfa_size_limit;
                lex_flags.nest_limit = #nest_limit;
                let lex_flags = lex_flags;
            }
        };
        {
            let start_states = lexerdef.iter_start_states();
            let rules = lexerdef.iter_rules().map(|r| {
                    let tok_id = QuoteOption(r.tok_id);
                    let n = QuoteOption(r.name().map(QuoteToString));
                    let target_state =
                        QuoteOption(r.target_state().map(|(x, y)| QuoteTuple((x, y))));
                    let n_span = r.name_span();
                    let regex = QuoteToString(&r.re_str);
                    let start_states = r.start_states();
                    // Code gen to construct a rule.
                    //
                    // We cannot `impl ToTokens for Rule` because `Rule` never stores `lex_flags`;
                    // thus we reference the local `lex_flags` variable bound earlier.
                    quote! {
                        Rule::new(::lrlex::unstable_api::InternalPublicApi, #tok_id, #n, #n_span, #regex.to_string(),
                                vec![#(#start_states),*], #target_state, &lex_flags).unwrap()
                    }
                });
            // Code gen for `lexerdef()`'s rules and the stack of `start_states`.
            lexerdef_func_impl.append_all(quote! {
                let start_states: Vec<StartState> = vec![#(#start_states),*];
                let rules = vec![#(#rules),*];
            });
        }
        let lexerdef_ty = match lexerkind {
            LexerKind::LRNonStreamingLexer => {
                quote!(::lrlex::LRNonStreamingLexerDef)
            }
        };
        // Code gen for the lexerdef() return value referencing variables bound earlier.
        lexerdef_func_impl.append_all(quote! {
            #lexerdef_ty::from_rules(start_states, rules)
        });

        let mut token_consts = TokenStream::new();
        if let Some(rim) = self.rule_ids_map {
            for (name, id) in rim {
                if RE_TOKEN_ID.is_match(&name) {
                    let tok_ident = format_ident!("N_{}", name.to_ascii_uppercase());
                    let storaget =
                        str::parse::<TokenStream>(type_name::<LexerTypesT::StorageT>()).unwrap();
                    // Code gen for the constant token values.
                    let tok_const = quote! {
                        #[allow(dead_code)]
                        pub const #tok_ident: #storaget = #id;
                    };
                    token_consts.extend(tok_const)
                }
            }
        }
        let token_consts = token_consts.into_iter();
        let out_tokens = {
            let lexerdef_param = str::parse::<TokenStream>(type_name::<LexerTypesT>()).unwrap();
            let mod_vis = self.visibility;
            // Code gen for the generated module.
            quote! {
                #mod_vis mod #mod_name {
                    use ::lrlex::{LexerDef, Rule, StartState};
                    #[allow(dead_code)]
                    pub fn lexerdef() -> #lexerdef_ty<#lexerdef_param> {
                        #lexerdef_func_impl
                    }

                    #(#token_consts)*
                }
            }
        };
        // Try to run a code formatter on the generated code.
        let unformatted = out_tokens.to_string();
        let outs = syn::parse_str(&unformatted)
            .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
            .unwrap_or(unformatted);
        // If the file we're about to write out already exists with the same contents, then we
        // don't overwrite it (since that will force a recompile of the file, and relinking of the
        // binary etc).
        if let Ok(curs) = read_to_string(outp) {
            if curs == outs {
                return Ok(CTLexer {
                    missing_from_lexer,
                    missing_from_parser,
                });
            }
        }
        let mut f = File::create(outp)?;
        f.write_all(outs.as_bytes())?;
        Ok(CTLexer {
            missing_from_lexer,
            missing_from_parser,
        })
    }

    /// Given the filename `a/b.l` as input, statically compile the file `src/a/b.l` into a Rust
    /// module which can then be imported using `lrlex_mod!("a/b.l")`. This is a convenience
    /// function around [`process_file`](struct.CTLexerBuilder.html#method.process_file) which makes
    /// it easier to compile `.l` files stored in a project's `src/` directory: please see
    /// [`process_file`](#method.process_file) for additional constraints and information about the
    /// generated files.
    #[deprecated(
        since = "0.11.0",
        note = "Please use lexer_in_src_dir() and build() instead"
    )]
    #[allow(deprecated)]
    pub fn process_file_in_src(
        self,
        srcp: &str,
    ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>> {
        let mut inp = current_dir()?;
        inp.push("src");
        inp.push(srcp);
        let mut outp = PathBuf::new();
        outp.push(var("OUT_DIR").unwrap());
        outp.push(Path::new(srcp).parent().unwrap().to_str().unwrap());
        create_dir_all(&outp)?;
        let mut leaf = Path::new(srcp)
            .file_name()
            .unwrap()
            .to_str()
            .unwrap()
            .to_owned();
        write!(leaf, ".{}", RUST_FILE_EXT).ok();
        outp.push(leaf);
        self.process_file(inp, outp)
    }

    /// Statically compile the `.l` file `inp` into Rust, placing the output into the file `outp`.
    /// The latter defines a module as follows:
    ///
    /// ```text
    ///    mod modname {
    ///      pub fn lexerdef() -> LexerDef<LexerTypesT::StorageT> { ... }
    ///
    ///      ...
    ///    }
    /// ```
    ///
    /// where:
    ///  * `modname` is either:
    ///    * the module name specified by [`mod_name`](#method.mod_name)
    ///    * or, if no module name was explicitly specified, then for the file `/a/b/c.l` the
    ///      module name is `c_l` (i.e. the file's leaf name, minus its extension, with a suffix
    ///      of `_l`).
    #[deprecated(
        since = "0.11.0",
        note = "Please use lexer_in_src_dir() and build() instead"
    )]
    pub fn process_file<P, Q>(
        mut self,
        inp: P,
        outp: Q,
    ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>>
    where
        P: AsRef<Path>,
        Q: AsRef<Path>,
    {
        self.lexer_path = Some(inp.as_ref().to_owned());
        self.output_path = Some(outp.as_ref().to_owned());
        let cl = self.build()?;
        Ok((
            cl.missing_from_lexer().map(|x| x.to_owned()),
            cl.missing_from_parser().map(|x| x.to_owned()),
        ))
    }

    /// If passed false, tokens used in the grammar but not defined in the lexer will cause a
    /// panic at lexer generation time. Defaults to false.
    pub fn allow_missing_terms_in_lexer(mut self, allow: bool) -> Self {
        self.allow_missing_terms_in_lexer = allow;
        self
    }

    /// If passed false, tokens defined in the lexer but not used in the grammar will cause a
    /// panic at lexer generation time. Defaults to true (since lexers sometimes define tokens such
    /// as reserved words, which are intentionally not in the grammar).
    pub fn allow_missing_tokens_in_parser(mut self, allow: bool) -> Self {
        self.allow_missing_tokens_in_parser = allow;
        self
    }

    /// Enables `// comment` style parsing according to `flag`.
    /// When enabled, comments can appear at the beginning of a line, and regular expressions
    /// containing the `/` character should escape it as `\/`.
    ///
    /// The default value is `false`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
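    ///
    /// For example, with this option enabled a lexer file may contain lines such as (a sketch;
    /// the rule itself is illustrative):
    ///
    /// ```text
    /// // Integer literals.
    /// [0-9]+ "INT"
    /// ```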
    pub fn allow_wholeline_comments(mut self, flag: bool) -> Self {
        self.force_lex_flags.allow_wholeline_comments = Some(flag);
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is `true`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn dot_matches_new_line(mut self, flag: bool) -> Self {
        self.force_lex_flags.dot_matches_new_line = Some(flag);
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is `true`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn multi_line(mut self, flag: bool) -> Self {
        self.force_lex_flags.multi_line = Some(flag);
        self
    }

    /// Enables POSIX lex compatible escape sequences according to `flag`.
    /// The default value is `false`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn posix_escapes(mut self, flag: bool) -> Self {
        self.force_lex_flags.posix_escapes = Some(flag);
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is `true`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn octal(mut self, flag: bool) -> Self {
        self.force_lex_flags.octal = Some(flag);
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is specified by `regex`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn swap_greed(mut self, flag: bool) -> Self {
        self.force_lex_flags.swap_greed = Some(flag);
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is specified by `regex`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn ignore_whitespace(mut self, flag: bool) -> Self {
        self.force_lex_flags.ignore_whitespace = Some(flag);
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is specified by `regex`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn unicode(mut self, flag: bool) -> Self {
        self.force_lex_flags.unicode = Some(flag);
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is specified by `regex`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn case_insensitive(mut self, flag: bool) -> Self {
        self.force_lex_flags.case_insensitive = Some(flag);
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is specified by `regex`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn size_limit(mut self, sz: usize) -> Self {
        self.force_lex_flags.size_limit = Some(sz);
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is specified by `regex`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn dfa_size_limit(mut self, sz: usize) -> Self {
        self.force_lex_flags.dfa_size_limit = Some(sz);
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is specified by `regex`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn nest_limit(mut self, lim: u32) -> Self {
        self.force_lex_flags.nest_limit = Some(lim);
        self
    }

    /// `Some` values in `flags` will be used as default values unless the value has already been
    /// set via `CTLexerBuilder` or in the `%grmtools` section of a *.l* file.
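    ///
    /// A minimal sketch, starting from `UNSPECIFIED_LEX_FLAGS` (the field chosen is illustrative,
    /// and assumes `LexFlags` fields are settable by the caller):
    ///
    /// ```text
    /// let mut flags = UNSPECIFIED_LEX_FLAGS;
    /// flags.octal = Some(false);
    /// CTLexerBuilder::new()
    ///     .default_lex_flags(flags)
    ///     .lexer_in_src_dir("grm.l")?
    ///     .build()?;
    /// ```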
    pub fn default_lex_flags(mut self, flags: LexFlags) -> Self {
        self.default_lex_flags = flags;
        self
    }

    #[cfg(test)]
    pub fn inspect_lexerkind(
        mut self,
        cb: Box<dyn Fn(LexerKind) -> Result<(), Box<dyn Error>>>,
    ) -> Self {
        self.inspect_lexerkind_cb = Some(cb);
        self
    }
}

/// An interface to the result of [CTLexerBuilder::build()].
pub struct CTLexer {
    missing_from_lexer: Option<HashSet<String>>,
    missing_from_parser: Option<HashSet<String>>,
}

impl CTLexer {
    fn missing_from_lexer(&self) -> Option<&HashSet<String>> {
        self.missing_from_lexer.as_ref()
    }

    fn missing_from_parser(&self) -> Option<&HashSet<String>> {
        self.missing_from_parser.as_ref()
    }
}

/// Create a Rust module named `mod_name` that can be imported with
/// [`lrlex_mod!(mod_name)`](crate::lrlex_mod). The module contains one `const` `StorageT` per
/// token in `token_map`, with the token prefixed by `T_`. For example with `StorageT` `u8`,
/// `mod_name` `x`, and `token_map` `HashMap{"ID": 0, "INT": 1}` the generated module will look
/// roughly as follows:
///
/// ```rust,ignore
/// mod x {
///   pub const T_ID: u8 = 0;
///   pub const T_INT: u8 = 1;
/// }
/// ```
///
/// You can optionally remap names (for example, because the parser's token names do not lead to
/// valid Rust identifiers) by specifying the `rename_map` `HashMap`. For example, if `token_map`
/// is `HashMap{"+": 0, "ID": 1}` and `rename_map` is `HashMap{"+": "PLUS"}` then the generated
/// module will look roughly as follows:
///
/// ```rust,ignore
/// mod x {
///   pub const T_PLUS: u8 = 0;
///   pub const T_ID: u8 = 1;
/// }
/// ```
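///
/// A sketch of calling this function from a `build.rs` (the maps are illustrative):
///
/// ```rust,ignore
/// let token_map: HashMap<String, u8> =
///     [("+".to_string(), 0), ("ID".to_string(), 1)].into_iter().collect();
/// let rename_map: HashMap<&str, &str> = [("+", "PLUS")].into_iter().collect();
/// ct_token_map("x", &token_map, Some(&rename_map))?;
/// ```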
pub fn ct_token_map<StorageT: Display>(
    mod_name: &str,
    token_map: &HashMap<String, StorageT>,
    rename_map: Option<&HashMap<&str, &str>>,
) -> Result<(), Box<dyn Error>> {
    // Record the time that this version of lrlex was built. If the source code changes and rustc
    // forces a recompile, the value will change, causing anything which depends on this build of
    // lrlex to be recompiled too.
    let mut outs = String::new();
    let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
    write!(
        outs,
        "// lrlex build time: {}\n\nmod {} {{\n",
        quote!(#timestamp),
        mod_name
    )
    .ok();
    outs.push_str(
        &token_map
            .iter()
            .map(|(k, v)| {
                let k = match rename_map {
                    Some(rmap) => *rmap.get(k.as_str()).unwrap_or(&k.as_str()),
                    _ => k,
                };
                format!(
                    "    #[allow(dead_code)] pub const T_{}: {} = {};",
                    k,
                    type_name::<StorageT>(),
                    v
                )
            })
            .collect::<Vec<_>>()
            .join("\n"),
    );
    outs.push_str("\n}");

    let mut outp = PathBuf::from(var("OUT_DIR")?);
    outp.push(mod_name);
    outp.set_extension("rs");

    // If the file we're about to write out already exists with the same contents, then we
    // don't overwrite it (since that will force a recompile of the file, and relinking of the
    // binary etc).
    if let Ok(curs) = read_to_string(&outp) {
        if curs == outs {
            return Ok(());
        }
    }

    let mut f = File::create(outp)?;
    f.write_all(outs.as_bytes())?;
    Ok(())
}

#[cfg(test)]
mod test {
    use std::fs::File;
    use std::io::Write;

    use super::{CTLexerBuilder, LexerKind};
    #[test]
    fn test_grmtools_section_lexerkind() {
        let lexerkinds = [
            "LRNonStreamingLexer",
            "lrnonstreaminglexer",
            "LexerKind::lrnonstreaminglexer",
            "lexerkind::LRNonStreamingLexer",
        ];
        for (i, kind) in lexerkinds.iter().enumerate() {
            let lex_src = format!(
                "
%grmtools{{lexerkind: {}}}
%%
. ;
",
                kind
            );
            let lex_path = format!(
                "{}/test_grmtools_section_lexerkind_{}.l",
                env!("OUT_DIR"),
                i
            );
            let mut l_file = File::create(lex_path.clone()).unwrap();
            l_file.write_all(lex_src.as_bytes()).unwrap();
            CTLexerBuilder::new()
                .output_path(format!("{}.rs", lex_path.clone()))
                .lexer_path(lex_path.clone())
                .inspect_lexerkind(Box::new(move |lexerkind| {
                    assert!(matches!(lexerkind, LexerKind::LRNonStreamingLexer));
                    Ok(())
                }))
                .build()
                .unwrap();
        }
    }
}