lrlex/
ctbuilder.rs

//! Build lexers at compile-time so that they can be statically included into a binary.

use std::{
    any::type_name,
    collections::{HashMap, HashSet},
    env::{current_dir, var},
    error::Error,
    fmt::{Debug, Display, Write as _},
    fs::{self, create_dir_all, read_to_string, File},
    hash::Hash,
    io::Write,
    path::{Path, PathBuf},
    str::FromStr,
    sync::Mutex,
};

use bincode::Encode;
use cfgrammar::{
    header::{
        GrmtoolsSectionParser, Header, HeaderError, HeaderErrorKind, HeaderValue, Namespaced,
        Setting, Value,
    },
    markmap::MergeBehavior,
    newlinecache::NewlineCache,
    span::Location,
    Spanned,
};
use glob::glob;
use lazy_static::lazy_static;
use lrpar::{CTParserBuilder, LexerTypes};
use num_traits::{AsPrimitive, PrimInt, Unsigned};
use proc_macro2::TokenStream;
use quote::{format_ident, quote, ToTokens, TokenStreamExt};
use regex::Regex;

use crate::{DefaultLexerTypes, LRNonStreamingLexer, LRNonStreamingLexerDef, LexFlags, LexerDef};

const RUST_FILE_EXT: &str = "rs";

lazy_static! {
    static ref RE_TOKEN_ID: Regex = Regex::new(r"^[a-zA-Z_][a-zA-Z_0-9]*$").unwrap();
    static ref GENERATED_PATHS: Mutex<HashSet<PathBuf>> = Mutex::new(HashSet::new());
}

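/// The kind of lexer to be generated. This can be set programmatically via
/// [CTLexerBuilder::lexerkind], or from within a lex file's `%grmtools` section
/// (the syntax exercised by the test at the bottom of this file), e.g.:
///
/// ```text
/// %grmtools{lexerkind: LRNonStreamingLexer}
/// ```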
#[non_exhaustive]
pub enum LexerKind {
    LRNonStreamingLexer,
}

impl<T: Clone> TryFrom<&Value<T>> for LexerKind {
    type Error = cfgrammar::header::HeaderError<T>;
    fn try_from(it: &Value<T>) -> Result<LexerKind, Self::Error> {
        match it {
            Value::Flag(_, loc) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind` found bool",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Num(_, loc)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind` found numeric",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::String(_, loc)) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind` found string",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Constructor {
                ctor:
                    Namespaced {
                        namespace: _,
                        member: (_, loc),
                    },
                arg: _,
            }) => Err(HeaderError {
                kind: HeaderErrorKind::ConversionError(
                    "LexerKind",
                    "Expected `LexerKind` found constructor",
                ),
                locations: vec![loc.clone()],
            }),
            Value::Setting(Setting::Unitary(Namespaced {
                namespace,
                member: (member, member_loc),
            })) => {
                if let Some((ns, loc)) = namespace {
                    if ns.to_lowercase() != "lexerkind" {
                        return Err(HeaderError {
                            kind: HeaderErrorKind::ConversionError(
                                "LexerKind",
                                "Expected namespace `LexerKind`",
                            ),
                            locations: vec![loc.clone()],
                        });
                    }
                }
                if member.to_lowercase() != "lrnonstreaminglexer" {
                    return Err(HeaderError {
                        kind: HeaderErrorKind::ConversionError(
                            "LexerKind",
                            "Unknown `LexerKind` Variant",
                        ),
                        locations: vec![member_loc.clone()],
                    });
                }

                Ok(LexerKind::LRNonStreamingLexer)
            }
        }
    }
}

/// Specify the visibility of the module generated by [CTLexerBuilder].
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum Visibility {
    /// Module-level visibility only.
    Private,
    /// `pub`
    Public,
    /// `pub(super)`
    PublicSuper,
    /// `pub(self)`
    PublicSelf,
    /// `pub(crate)`
    PublicCrate,
    /// `pub(in {arg})`
    PublicIn(String),
}

impl ToTokens for Visibility {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        tokens.extend(match self {
            Visibility::Private => quote!(),
            Visibility::Public => quote! {pub},
            Visibility::PublicSuper => quote! {pub(super)},
            Visibility::PublicSelf => quote! {pub(self)},
            Visibility::PublicCrate => quote! {pub(crate)},
            Visibility::PublicIn(data) => {
                let other = str::parse::<TokenStream>(data).unwrap();
                quote! {pub(in #other)}
            }
        })
    }
}

/// Specifies the [Rust Edition] that will be emitted during code generation.
///
/// [Rust Edition]: https://doc.rust-lang.org/edition-guide/rust-2021/index.html
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub enum RustEdition {
    Rust2015,
    Rust2018,
    Rust2021,
}

/// The quote impl of `ToTokens` for `Option` prints an empty string for `None`
/// and the inner value for `Some(inner_value)`.
///
/// This wrapper instead emits both `Some` and `None` variants.
/// See: [quote #20](https://github.com/dtolnay/quote/issues/20)
struct QuoteOption<T>(Option<T>);

impl<T: ToTokens> ToTokens for QuoteOption<T> {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        tokens.append_all(match self.0 {
            Some(ref t) => quote! { ::std::option::Option::Some(#t) },
            None => quote! { ::std::option::Option::None },
        });
    }
}

/// This wrapper adds a missing impl of `ToTokens` for tuples.
/// For a tuple `(a, b)` emits `(a.to_tokens(), b.to_tokens())`
struct QuoteTuple<T>(T);

impl<A: ToTokens, B: ToTokens> ToTokens for QuoteTuple<(A, B)> {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        let (a, b) = &self.0;
        tokens.append_all(quote!((#a, #b)));
    }
}

/// The wrapped `&str` value will be emitted with a call to `to_string()`
struct QuoteToString<'a>(&'a str);

impl ToTokens for QuoteToString<'_> {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        let x = &self.0;
        tokens.append_all(quote! { #x.to_string() });
    }
}

/// A `CTLexerBuilder` allows one to specify the criteria for building a statically generated
/// lexer.
pub struct CTLexerBuilder<'a, LexerTypesT: LexerTypes = DefaultLexerTypes<u32>>
where
    LexerTypesT::StorageT: Debug + Eq + Hash + ToTokens,
    usize: num_traits::AsPrimitive<LexerTypesT::StorageT>,
{
    lrpar_config: Option<Box<dyn Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT>>>,
    lexer_path: Option<PathBuf>,
    output_path: Option<PathBuf>,
    lexerkind: Option<LexerKind>,
    mod_name: Option<&'a str>,
    visibility: Visibility,
    rust_edition: RustEdition,
    rule_ids_map: Option<HashMap<String, LexerTypesT::StorageT>>,
    allow_missing_terms_in_lexer: bool,
    allow_missing_tokens_in_parser: bool,
    header: Header<Location>,
    #[cfg(test)]
    inspect_lexerkind_cb: Option<Box<dyn Fn(LexerKind) -> Result<(), Box<dyn Error>>>>,
}

impl CTLexerBuilder<'_, DefaultLexerTypes<u32>> {
    /// Create a new [CTLexerBuilder].
    pub fn new() -> Self {
        CTLexerBuilder::<DefaultLexerTypes<u32>>::new_with_lexemet()
    }
}

impl<'a, LexerTypesT: LexerTypes<LexErrorT = crate::LRLexError> + 'static>
    CTLexerBuilder<'a, LexerTypesT>
where
    LexerTypesT::StorageT:
        'static + Debug + Eq + Hash + PrimInt + Encode + TryFrom<usize> + Unsigned + ToTokens,
    usize: AsPrimitive<LexerTypesT::StorageT>,
{
    /// Create a new [CTLexerBuilder].
    ///
    /// `LexerTypesT::StorageT` must be an unsigned integer type (e.g. `u8`, `u16`) which is big
    /// enough to index all the tokens and rules in the lexer and less than or equal in size
    /// to `usize` (e.g. on a 64-bit machine `u128` would be too big). If you are lexing large
    /// files, the additional storage requirements of larger integer types can be noticeable, and
    /// in such cases it can be worth specifying a smaller type. `StorageT` defaults to `u32` if
    /// unspecified.
    ///
    /// # Examples
    ///
    /// ```text
    /// CTLexerBuilder::<DefaultLexerTypes<u8>>::new_with_lexemet()
    ///     .lexer_in_src_dir("grm.l")?
    ///     .build()?;
    /// ```
    pub fn new_with_lexemet() -> Self {
        let mut header = Header::new();
        header.set_default_merge_behavior(MergeBehavior::Ours);
        CTLexerBuilder {
            lrpar_config: None,
            lexer_path: None,
            output_path: None,
            lexerkind: None,
            mod_name: None,
            visibility: Visibility::Private,
            rust_edition: RustEdition::Rust2021,
            rule_ids_map: None,
            allow_missing_terms_in_lexer: false,
            allow_missing_tokens_in_parser: true,
            header,
            #[cfg(test)]
            inspect_lexerkind_cb: None,
        }
    }

    /// An optional convenience function to make it easier to create an (lrlex) lexer and (lrpar)
    /// parser in one shot. The closure passed to this function will be called during
    /// [CTLexerBuilder::build]: it will be passed an lrpar `CTParserBuilder` instance upon which
    /// it can set whatever lrpar options are desired. [`CTLexerBuilder`] will then create both the
    /// parser and lexer and link them together as required.
    ///
    /// # Examples
    ///
    /// ```text
    /// CTLexerBuilder::new()
    ///     .lrpar_config(|ctp| {
    ///         ctp.yacckind(YaccKind::Grmtools)
    ///             .grammar_in_src_dir("calc.y")
    ///             .unwrap()
    ///     })
    ///     .lexer_in_src_dir("calc.l")?
    ///     .build()?;
    /// ```
    pub fn lrpar_config<F>(mut self, config_func: F) -> Self
    where
        F: 'static + Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT>,
    {
        self.lrpar_config = Some(Box::new(config_func));
        self
    }

    /// Set the input lexer path to a file relative to this project's `src` directory. This will
    /// also set the output path (i.e. you do not need to call [CTLexerBuilder::output_path]).
    ///
    /// For example if `a/b.l` is passed as `srcp` then [CTLexerBuilder::build] will:
    ///   * use `src/a/b.l` as the input file.
    ///   * write output to a file which can then be imported by calling `lrlex_mod!("a/b.l")`.
    ///   * create a module in that output file named `b_l`.
    ///
    /// You can override the output path and/or module name by calling
    /// [CTLexerBuilder::output_path] and/or [CTLexerBuilder::mod_name], respectively, after
    /// calling this function.
    ///
    /// This is a convenience function that makes it easier to compile lexer files stored in a
    /// project's `src/` directory: please see [CTLexerBuilder::build] for additional constraints
    /// and information about the generated files. Note also that each `.l` file can only be
    /// processed once using this function: if you want to generate multiple lexers from a single
    /// `.l` file, you will need to use [CTLexerBuilder::output_path].
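    ///
    /// # Examples
    ///
    /// ```text
    /// // A minimal sketch, assuming a lexer file at `src/calc.l`:
    /// CTLexerBuilder::new()
    ///     .lexer_in_src_dir("calc.l")?
    ///     .build()?;
    /// ```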
    pub fn lexer_in_src_dir<P>(mut self, srcp: P) -> Result<Self, Box<dyn Error>>
    where
        P: AsRef<Path>,
    {
        if !srcp.as_ref().is_relative() {
            return Err(format!(
                "Lexer path '{}' must be a relative path.",
                srcp.as_ref().to_str().unwrap_or("<invalid UTF-8>")
            )
            .into());
        }

        let mut lexp = current_dir()?;
        lexp.push("src");
        lexp.push(srcp.as_ref());
        self.lexer_path = Some(lexp);

        let mut outp = PathBuf::new();
        outp.push(var("OUT_DIR").unwrap());
        outp.push(srcp.as_ref().parent().unwrap().to_str().unwrap());
        create_dir_all(&outp)?;
        let mut leaf = srcp
            .as_ref()
            .file_name()
            .unwrap()
            .to_str()
            .unwrap()
            .to_owned();
        write!(leaf, ".{}", RUST_FILE_EXT).ok();
        outp.push(leaf);
        Ok(self.output_path(outp))
    }

    /// Set the input lexer path to `inp`. If specified, you must also call
    /// [CTLexerBuilder::output_path]. In general it is easier to use
    /// [CTLexerBuilder::lexer_in_src_dir].
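    ///
    /// # Examples
    ///
    /// ```text
    /// // A sketch with a hypothetical absolute path: unlike
    /// // `lexer_in_src_dir`, an output path must be set explicitly.
    /// CTLexerBuilder::new()
    ///     .lexer_path("/path/to/calc.l")
    ///     .output_path(format!("{}/calc_l.rs", std::env::var("OUT_DIR")?))
    ///     .build()?;
    /// ```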
    pub fn lexer_path<P>(mut self, inp: P) -> Self
    where
        P: AsRef<Path>,
    {
        self.lexer_path = Some(inp.as_ref().to_owned());
        self
    }

    /// Set the output lexer path to `outp`. Note that there are no requirements on `outp`: the
    /// file can exist anywhere you can create a valid [Path] to. However, if you wish to use
    /// [crate::lrlex_mod!] you will need to make sure that `outp` is in
    /// [std::env::var]`("OUT_DIR")` or one of its subdirectories.
    pub fn output_path<P>(mut self, outp: P) -> Self
    where
        P: AsRef<Path>,
    {
        self.output_path = Some(outp.as_ref().to_owned());
        self
    }

    /// Set the type of lexer to be generated to `lexerkind`.
    pub fn lexerkind(mut self, lexerkind: LexerKind) -> Self {
        self.lexerkind = Some(lexerkind);
        self
    }

    /// Set the generated module name to `mod_name`. If no module name is specified,
    /// [CTLexerBuilder::build] will attempt to create a sensible default based on
    /// the input filename.
    pub fn mod_name(mut self, mod_name: &'a str) -> Self {
        self.mod_name = Some(mod_name);
        self
    }

    /// Set the visibility of the generated module to `vis`. Defaults to `Visibility::Private`.
    pub fn visibility(mut self, vis: Visibility) -> Self {
        self.visibility = vis;
        self
    }

    /// Sets the Rust edition to be used for generated code. Defaults to the latest edition of
    /// Rust supported by grmtools.
    pub fn rust_edition(mut self, edition: RustEdition) -> Self {
        self.rust_edition = edition;
        self
    }

    /// Set this lexer builder's map of rule IDs to `rule_ids_map`. By default, lexing rules have
    /// arbitrary, but distinct, IDs. Setting the map of rule IDs (from rule names to `StorageT`)
    /// allows users to synchronise a lexer and parser and to check that all rules are used by
    /// both parts.
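    ///
    /// # Examples
    ///
    /// ```text
    /// // A sketch, assuming `calc.y`/`calc.l` exist in `src/`: build the
    /// // parser first, then pass its token map to the lexer builder.
    /// let ctp = CTParserBuilder::new().grammar_in_src_dir("calc.y")?.build()?;
    /// CTLexerBuilder::new()
    ///     .rule_ids_map(ctp.token_map())
    ///     .lexer_in_src_dir("calc.l")?
    ///     .build()?;
    /// ```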
    pub fn rule_ids_map<T: std::borrow::Borrow<HashMap<String, LexerTypesT::StorageT>> + Clone>(
        mut self,
        rule_ids_map: T,
    ) -> Self {
        self.rule_ids_map = Some(rule_ids_map.borrow().to_owned());
        self
    }

    /// Statically compile the `.l` file specified by [CTLexerBuilder::lexer_path()] into Rust,
    /// placing the output into the file specified by [CTLexerBuilder::output_path()].
    ///
    /// The generated module follows the form:
    ///
    /// ```text
    ///    mod modname {
    ///      pub fn lexerdef() -> LexerDef<LexerTypesT> { ... }
    ///
    ///      ...
    ///    }
    /// ```
    ///
    /// where:
    ///  * `modname` is either:
    ///    * the module name specified by [CTLexerBuilder::mod_name()]
    ///    * or, if no module name was explicitly specified, then for the file `/a/b/c.l` the
    ///      module name is `c_l` (i.e. the file's leaf name, minus its extension, with a suffix
    ///      of `_l`).
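    ///
    /// # Examples
    ///
    /// ```text
    /// // A sketch of importing the generated module, assuming the lexer was
    /// // built from `src/calc.l` via `lexer_in_src_dir`:
    /// lrlex_mod!("calc.l");
    /// let lexerdef = calc_l::lexerdef();
    /// ```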
    pub fn build(mut self) -> Result<CTLexer, Box<dyn Error>> {
        let lexerp = self
            .lexer_path
            .as_ref()
            .expect("lexer_path must be specified before processing.");
        let outp = self
            .output_path
            .as_ref()
            .expect("output_path must be specified before processing.");

        {
            let mut lk = GENERATED_PATHS.lock().unwrap();
            if lk.contains(outp.as_path()) {
                return Err(format!("Generating two lexers to the same path ('{}') is not allowed: use CTLexerBuilder::output_path (and, optionally, CTLexerBuilder::mod_name) to differentiate them.", &outp.to_str().unwrap()).into());
            }
            lk.insert(outp.clone());
        }
        let lex_src = read_to_string(lexerp)
            .map_err(|e| format!("When reading '{}': {e}", lexerp.display()))?;
        let mut header = self.header;
        let (parsed_header, _) = GrmtoolsSectionParser::new(&lex_src, false)
            .parse()
            .map_err(|es| {
                es.iter()
                    .map(|e| e.to_string())
                    .collect::<Vec<_>>()
                    .join("\n")
            })?;
        header.merge_from(parsed_header)?;
        header.mark_used(&"lexerkind".to_string());
        let lexerkind = match self.lexerkind {
            Some(lexerkind) => lexerkind,
            None => {
                if let Some(HeaderValue(_, lk_val)) = header.get("lexerkind") {
                    LexerKind::try_from(lk_val)?
                } else {
                    LexerKind::LRNonStreamingLexer
                }
            }
        };
        let line_cache = NewlineCache::from_str(&lex_src).unwrap();
        #[cfg(test)]
        if let Some(inspect_lexerkind_cb) = self.inspect_lexerkind_cb {
            inspect_lexerkind_cb(lexerkind)?
        }
        let (lexerdef, lex_flags): (LRNonStreamingLexerDef<LexerTypesT>, LexFlags) = match lexerkind
        {
            LexerKind::LRNonStreamingLexer => {
                let lex_flags = LexFlags::try_from(&mut header)?;
                let lexerdef =
                    LRNonStreamingLexerDef::<LexerTypesT>::new_with_options(&lex_src, lex_flags)
                        .map_err(|errs| {
                            errs.iter()
                                .map(|e| {
                                    if let Some((line, column)) = line_cache
                                        .byte_to_line_num_and_col_num(
                                            &lex_src,
                                            e.spans().first().unwrap().start(),
                                        )
                                    {
                                        format!("{} at line {line} column {column}", e)
                                    } else {
                                        format!("{}", e)
                                    }
                                })
                                .collect::<Vec<_>>()
                                .join("\n")
                        })?;
                let lex_flags = lexerdef.lex_flags().cloned();
                (lexerdef, lex_flags.unwrap())
            }
        };

        if let Some(ref lrcfg) = self.lrpar_config {
            let mut lexerdef = lexerdef.clone();
            let mut ctp = CTParserBuilder::<LexerTypesT>::new().inspect_rt(Box::new(
                move |yacc_header, rtpb, rule_ids_map, grm_path| {
                    let owned_map = rule_ids_map
                        .iter()
                        .map(|(x, y)| (&**x, *y))
                        .collect::<HashMap<_, _>>();
                    lexerdef.set_rule_ids(&owned_map);
                    yacc_header.mark_used(&"test_files".to_string());
                    let test_glob = yacc_header.get("test_files");
                    match test_glob {
                        Some(HeaderValue(_, Value::Setting(Setting::String(test_files, _)))) => {
                            let path_joined = grm_path.parent().unwrap().join(test_files);
                            for path in
                                glob(&path_joined.to_string_lossy()).map_err(|e| e.to_string())?
                            {
                                let path = path?;
                                let input = fs::read_to_string(&path)?;
                                let l: LRNonStreamingLexer<LexerTypesT> = lexerdef.lexer(&input);
                                for e in rtpb.parse_noaction(&l) {
                                    Err(format!("parsing {}: {}", path.display(), e))?
                                }
                            }
                            Ok(())
                        }
                        Some(_) => Err("Invalid value for setting 'test_files'".into()),
                        None => Ok(()),
                    }
                },
            ));
            ctp = lrcfg(ctp);
            let map = ctp.build()?;
            self.rule_ids_map = Some(map.token_map().to_owned());
        }

        let mut lexerdef = Box::new(lexerdef);
        let unused_header_values = header.unused();
        if !unused_header_values.is_empty() {
            return Err(
                format!("Unused header values: {}", unused_header_values.join(", ")).into(),
            );
        }

        let (missing_from_lexer, missing_from_parser) = match self.rule_ids_map {
            Some(ref rim) => {
                // Convert from HashMap<String, _> to HashMap<&str, _>
                let owned_map = rim
                    .iter()
                    .map(|(x, y)| (&**x, *y))
                    .collect::<HashMap<_, _>>();
                let (x, y) = lexerdef.set_rule_ids(&owned_map);
                (
                    x.map(|a| a.iter().map(|&b| b.to_string()).collect::<HashSet<_>>()),
                    y.map(|a| a.iter().map(|&b| b.to_string()).collect::<HashSet<_>>()),
                )
            }
            None => (None, None),
        };

        let mut has_unallowed_missing = false;
        if !self.allow_missing_terms_in_lexer {
            if let Some(ref mfl) = missing_from_lexer {
                eprintln!("Error: the following tokens are used in the grammar but are not defined in the lexer:");
                for n in mfl {
                    eprintln!("    {}", n);
                }
                has_unallowed_missing = true;
            }
        }
        if !self.allow_missing_tokens_in_parser {
            if let Some(ref mfp) = missing_from_parser {
                eprintln!("Error: the following tokens are defined in the lexer but not used in the grammar:");
                for n in mfp {
                    eprintln!("    {}", n);
                }
                has_unallowed_missing = true;
            }
        }
        if has_unallowed_missing {
            fs::remove_file(outp).ok();
            panic!();
        }

        let mod_name = match self.mod_name {
            Some(s) => s.to_owned(),
            None => {
                // The user hasn't specified a module name, so we create one automatically: we
                // strip off all of the filename's extensions (a file can have more than one
                // extension, e.g. `a.b.l`, so we potentially have to strip more than once) and
                // then add `_l` to the end.
                let mut stem = lexerp.to_str().unwrap();
                loop {
                    let new_stem = Path::new(stem).file_stem().unwrap().to_str().unwrap();
                    if stem == new_stem {
                        break;
                    }
                    stem = new_stem;
                }
                format!("{}_l", stem)
            }
        };
        let mod_name = format_ident!("{}", mod_name);
        let mut lexerdef_func_impl = {
            let LexFlags {
                allow_wholeline_comments,
                dot_matches_new_line,
                multi_line,
                octal,
                posix_escapes,
                case_insensitive,
                unicode,
                swap_greed,
                ignore_whitespace,
                size_limit,
                dfa_size_limit,
                nest_limit,
            } = lex_flags;
            let allow_wholeline_comments = QuoteOption(allow_wholeline_comments);
            let dot_matches_new_line = QuoteOption(dot_matches_new_line);
            let multi_line = QuoteOption(multi_line);
            let octal = QuoteOption(octal);
            let posix_escapes = QuoteOption(posix_escapes);
            let case_insensitive = QuoteOption(case_insensitive);
            let unicode = QuoteOption(unicode);
            let swap_greed = QuoteOption(swap_greed);
            let ignore_whitespace = QuoteOption(ignore_whitespace);
            let size_limit = QuoteOption(size_limit);
            let dfa_size_limit = QuoteOption(dfa_size_limit);
            let nest_limit = QuoteOption(nest_limit);

            // Code gen for the lexerdef() `lex_flags` variable.
            quote! {
                let mut lex_flags = ::lrlex::DEFAULT_LEX_FLAGS;
                lex_flags.allow_wholeline_comments = #allow_wholeline_comments.or(::lrlex::DEFAULT_LEX_FLAGS.allow_wholeline_comments);
                lex_flags.dot_matches_new_line = #dot_matches_new_line.or(::lrlex::DEFAULT_LEX_FLAGS.dot_matches_new_line);
                lex_flags.multi_line = #multi_line.or(::lrlex::DEFAULT_LEX_FLAGS.multi_line);
                lex_flags.octal = #octal.or(::lrlex::DEFAULT_LEX_FLAGS.octal);
                lex_flags.posix_escapes = #posix_escapes.or(::lrlex::DEFAULT_LEX_FLAGS.posix_escapes);
                lex_flags.case_insensitive = #case_insensitive.or(::lrlex::DEFAULT_LEX_FLAGS.case_insensitive);
                lex_flags.unicode = #unicode.or(::lrlex::DEFAULT_LEX_FLAGS.unicode);
                lex_flags.swap_greed = #swap_greed.or(::lrlex::DEFAULT_LEX_FLAGS.swap_greed);
                lex_flags.ignore_whitespace = #ignore_whitespace.or(::lrlex::DEFAULT_LEX_FLAGS.ignore_whitespace);
                lex_flags.size_limit = #size_limit.or(::lrlex::DEFAULT_LEX_FLAGS.size_limit);
                lex_flags.dfa_size_limit = #dfa_size_limit.or(::lrlex::DEFAULT_LEX_FLAGS.dfa_size_limit);
                lex_flags.nest_limit = #nest_limit.or(::lrlex::DEFAULT_LEX_FLAGS.nest_limit);
                let lex_flags = lex_flags;
            }
        };
        {
            let start_states = lexerdef.iter_start_states();
            let rules = lexerdef.iter_rules().map(|r| {
                    let tok_id = QuoteOption(r.tok_id);
                    let n = QuoteOption(r.name().map(QuoteToString));
                    let target_state =
                        QuoteOption(r.target_state().map(|(x, y)| QuoteTuple((x, y))));
                    let n_span = r.name_span();
                    let regex = QuoteToString(&r.re_str);
                    let start_states = r.start_states();
                    // Code gen to construct a rule.
                    //
                    // We cannot `impl ToTokens for Rule` because `Rule` does not store
                    // `lex_flags`, so we reference the local `lex_flags` variable bound earlier.
                    quote! {
                        Rule::new(::lrlex::unstable_api::InternalPublicApi, #tok_id, #n, #n_span, #regex.to_string(),
                                vec![#(#start_states),*], #target_state, &lex_flags).unwrap()
                    }
                });
            // Code gen for `lexerdef()`s rules and the stack of `start_states`.
            lexerdef_func_impl.append_all(quote! {
                let start_states: Vec<StartState> = vec![#(#start_states),*];
                let rules = vec![#(#rules),*];
            });
        }
        let lexerdef_ty = match lexerkind {
            LexerKind::LRNonStreamingLexer => {
                quote!(::lrlex::LRNonStreamingLexerDef)
            }
        };
        // Code gen for the lexerdef() return value referencing variables bound earlier.
        lexerdef_func_impl.append_all(quote! {
            #lexerdef_ty::from_rules(start_states, rules)
        });

        let mut token_consts = TokenStream::new();
        if let Some(rim) = self.rule_ids_map {
            for (name, id) in rim {
                if RE_TOKEN_ID.is_match(&name) {
                    let tok_ident = format_ident!("N_{}", name.to_ascii_uppercase());
                    let storaget =
                        str::parse::<TokenStream>(type_name::<LexerTypesT::StorageT>()).unwrap();
                    // Code gen for the constant token values.
                    let tok_const = quote! {
                        #[allow(dead_code)]
                        pub const #tok_ident: #storaget = #id;
                    };
                    token_consts.extend(tok_const)
                }
            }
        }
        let token_consts = token_consts.into_iter();
        let out_tokens = {
            let lexerdef_param = str::parse::<TokenStream>(type_name::<LexerTypesT>()).unwrap();
            let mod_vis = self.visibility;
            // Code gen for the generated module.
            quote! {
                #mod_vis mod #mod_name {
                    use ::lrlex::{LexerDef, Rule, StartState};
                    #[allow(dead_code)]
                    pub fn lexerdef() -> #lexerdef_ty<#lexerdef_param> {
                        #lexerdef_func_impl
                    }

                    #(#token_consts)*
                }
            }
        };
        // Try and run a code formatter on the generated code.
        let unformatted = out_tokens.to_string();
        let outs = syn::parse_str(&unformatted)
            .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
            .unwrap_or(unformatted);
        // If the file we're about to write out already exists with the same contents, then we
        // don't overwrite it (since that will force a recompile of the file, and relinking of the
        // binary etc).
        if let Ok(curs) = read_to_string(outp) {
            if curs == outs {
                return Ok(CTLexer {
                    missing_from_lexer,
                    missing_from_parser,
                });
            }
        }
        let mut f = File::create(outp)?;
        f.write_all(outs.as_bytes())?;
        Ok(CTLexer {
            missing_from_lexer,
            missing_from_parser,
        })
    }

    /// Given the filename `a/b.l` as input, statically compile the file `src/a/b.l` into a Rust
    /// module which can then be imported using `lrlex_mod!("a/b.l")`. This is a convenience
    /// function around [`process_file`](struct.CTLexerBuilder.html#method.process_file) which makes
    /// it easier to compile `.l` files stored in a project's `src/` directory: please see
    /// [`process_file`](#method.process_file) for additional constraints and information about the
    /// generated files.
    #[deprecated(
        since = "0.11.0",
        note = "Please use lexer_in_src_dir() and build() instead"
    )]
    #[allow(deprecated)]
    pub fn process_file_in_src(
        self,
        srcp: &str,
    ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>> {
        let mut inp = current_dir()?;
        inp.push("src");
        inp.push(srcp);
        let mut outp = PathBuf::new();
        outp.push(var("OUT_DIR").unwrap());
        outp.push(Path::new(srcp).parent().unwrap().to_str().unwrap());
        create_dir_all(&outp)?;
        let mut leaf = Path::new(srcp)
            .file_name()
            .unwrap()
            .to_str()
            .unwrap()
            .to_owned();
        write!(leaf, ".{}", RUST_FILE_EXT).ok();
        outp.push(leaf);
        self.process_file(inp, outp)
    }

    /// Statically compile the `.l` file `inp` into Rust, placing the output into the file `outp`.
    /// The latter defines a module as follows:
    ///
    /// ```text
    ///    mod modname {
    ///      pub fn lexerdef() -> LexerDef<LexerTypesT::StorageT> { ... }
    ///
    ///      ...
    ///    }
    /// ```
    ///
    /// where:
    ///  * `modname` is either:
    ///    * the module name specified by [`mod_name`](#method.mod_name)
    ///    * or, if no module name was explicitly specified, then for the file `/a/b/c.l` the
    ///      module name is `c_l` (i.e. the file's leaf name, minus its extension, with a suffix
    ///      of `_l`).
    #[deprecated(
        since = "0.11.0",
        note = "Please use lexer_in_src_dir() and build() instead"
    )]
    pub fn process_file<P, Q>(
        mut self,
        inp: P,
        outp: Q,
    ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>>
    where
        P: AsRef<Path>,
        Q: AsRef<Path>,
    {
        self.lexer_path = Some(inp.as_ref().to_owned());
        self.output_path = Some(outp.as_ref().to_owned());
        let cl = self.build()?;
        Ok((
            cl.missing_from_lexer().map(|x| x.to_owned()),
            cl.missing_from_parser().map(|x| x.to_owned()),
        ))
    }

    /// If passed false, tokens used in the grammar but not defined in the lexer will cause a
    /// panic at lexer generation time. Defaults to false.
    pub fn allow_missing_terms_in_lexer(mut self, allow: bool) -> Self {
        self.allow_missing_terms_in_lexer = allow;
        self
    }

    /// If passed false, tokens defined in the lexer but not used in the grammar will cause a
    /// panic at lexer generation time. Defaults to true (since lexers sometimes define tokens such
    /// as reserved words, which are intentionally not in the grammar).
    pub fn allow_missing_tokens_in_parser(mut self, allow: bool) -> Self {
        self.allow_missing_tokens_in_parser = allow;
        self
    }

    /// Enables `// comment` style parsing according to `flag`.
    /// When enabled, comments can appear at the beginning of a line,
    /// and regular expressions containing the `/` character must escape it as `\/`.
    ///
    /// The default value is `false`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
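    ///
    /// # Examples
    ///
    /// ```text
    /// // A sketch of a lex file fragment with this flag enabled; the rule's
    /// // `/` characters are escaped as `\/` so it is not read as a comment.
    /// // this whole line is a comment
    /// \/\/[^\n]* "LINE_COMMENT"
    /// ```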
    pub fn allow_wholeline_comments(mut self, flag: bool) -> Self {
        let key = "allow_wholeline_comments".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is `true`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn dot_matches_new_line(mut self, flag: bool) -> Self {
        let key = "dot_matches_new_line".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is `true`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn multi_line(mut self, flag: bool) -> Self {
        let key = "multi_line".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

    /// Enables POSIX lex compatible escape sequences according to `flag`.
    /// The default value is `false`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn posix_escapes(mut self, flag: bool) -> Self {
        let key = "posix_escapes".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// The default value is `true`.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn octal(mut self, flag: bool) -> Self {
        let key = "octal".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// Default value is specified by regex.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn swap_greed(mut self, flag: bool) -> Self {
        let key = "swap_greed".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// Default value is specified by regex.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn ignore_whitespace(mut self, flag: bool) -> Self {
        let key = "ignore_whitespace".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// Default value is specified by regex.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn unicode(mut self, flag: bool) -> Self {
        let key = "unicode".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// Default value is specified by regex.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn case_insensitive(mut self, flag: bool) -> Self {
        let key = "case_insensitive".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Flag(flag, Location::Other("CTLexerBuilder".to_string())),
            ),
        );
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// Default value is specified by regex.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn size_limit(mut self, sz: usize) -> Self {
        let key = "size_limit".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Setting(Setting::Num(
                    sz as u64,
                    Location::Other("CTLexerBuilder".to_string()),
                )),
            ),
        );
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// Default value is specified by regex.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn dfa_size_limit(mut self, sz: usize) -> Self {
        let key = "dfa_size_limit".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Setting(Setting::Num(
                    sz as u64,
                    Location::Other("CTLexerBuilder".to_string()),
                )),
            ),
        );
        self
    }

    /// Sets the `regex::RegexBuilder` option of the same name.
    /// Default value is specified by regex.
    ///
    /// Setting this flag will override the same flag within a `%grmtools` section.
    pub fn nest_limit(mut self, lim: u32) -> Self {
        let key = "nest_limit".to_string();
        self.header.insert(
            key,
            HeaderValue(
                Location::Other("CTLexerBuilder".to_string()),
                Value::Setting(Setting::Num(
                    lim as u64,
                    Location::Other("CTLexerBuilder".to_string()),
                )),
            ),
        );
        self
    }

    #[cfg(test)]
    pub fn inspect_lexerkind(
        mut self,
        cb: Box<dyn Fn(LexerKind) -> Result<(), Box<dyn Error>>>,
    ) -> Self {
        self.inspect_lexerkind_cb = Some(cb);
        self
    }
}

/// An interface to the result of [CTLexerBuilder::build()].
pub struct CTLexer {
    missing_from_lexer: Option<HashSet<String>>,
    missing_from_parser: Option<HashSet<String>>,
}

impl CTLexer {
    fn missing_from_lexer(&self) -> Option<&HashSet<String>> {
        self.missing_from_lexer.as_ref()
    }

    fn missing_from_parser(&self) -> Option<&HashSet<String>> {
        self.missing_from_parser.as_ref()
    }
}

/// Create a Rust module named `mod_name` that can be imported with
/// [`lrlex_mod!(mod_name)`](crate::lrlex_mod). The module contains one `const` `StorageT` per
/// token in `token_map`, with the token prefixed by `T_`. For example with `StorageT` `u8`,
/// `mod_name` `x`, and `token_map` `HashMap{"ID": 0, "INT": 1}` the generated module will look
/// roughly as follows:
///
/// ```rust,ignore
/// mod x {
///   pub const T_ID: u8 = 0;
///   pub const T_INT: u8 = 1;
/// }
/// ```
///
/// You can optionally remap names (for example, because the parser's token names do not lead to
/// valid Rust identifiers) by specifying the `rename_map` `HashMap`. For example, if `token_map`
/// is `HashMap{"+": 0, "ID": 1}` and `rename_map` is `HashMap{"+": "PLUS"}` then the generated
/// module will look roughly as follows:
///
/// ```rust,ignore
/// mod x {
///   pub const T_PLUS: u8 = 0;
///   pub const T_ID: u8 = 1;
/// }
/// ```
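///
/// # Examples
///
/// A sketch of a `build.rs` call (with hypothetical token IDs):
///
/// ```rust,ignore
/// let token_map: HashMap<String, u8> =
///     [("ID".to_string(), 0), ("INT".to_string(), 1)].into();
/// ct_token_map("x", &token_map, None)?;
/// ```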
pub fn ct_token_map<StorageT: Display>(
    mod_name: &str,
    token_map: &HashMap<String, StorageT>,
    rename_map: Option<&HashMap<&str, &str>>,
) -> Result<(), Box<dyn Error>> {
    // Record the time that this version of lrlex was built. If the source code changes and rustc
    // forces a recompile, the timestamp will change, causing anything which depends on this
    // build of lrlex to be recompiled too.
    let mut outs = String::new();
    let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
    write!(
        outs,
        "// lrlex build time: {}\n\nmod {} {{\n",
        quote!(#timestamp),
        mod_name
    )
    .ok();
    outs.push_str(
        &token_map
            .iter()
            .map(|(k, v)| {
                let k = match rename_map {
                    Some(rmap) => *rmap.get(k.as_str()).unwrap_or(&k.as_str()),
                    _ => k,
                };
                format!(
                    "    #[allow(dead_code)] pub const T_{}: {} = {};",
                    k,
                    type_name::<StorageT>(),
                    v
                )
            })
            .collect::<Vec<_>>()
            .join("\n"),
    );
    outs.push_str("\n}");

    let mut outp = PathBuf::from(var("OUT_DIR")?);
    outp.push(mod_name);
    outp.set_extension("rs");

    // If the file we're about to write out already exists with the same contents, then we
    // don't overwrite it (since that will force a recompile of the file, and relinking of the
    // binary etc).
    if let Ok(curs) = read_to_string(&outp) {
        if curs == outs {
            return Ok(());
        }
    }

    let mut f = File::create(outp)?;
    f.write_all(outs.as_bytes())?;
    Ok(())
}

#[cfg(test)]
mod test {
    use std::fs::File;
    use std::io::Write;

    use super::{CTLexerBuilder, LexerKind};
    #[test]
    fn test_grmtools_section_lexerkind() {
        let lexerkinds = [
            "LRNonStreamingLexer",
            "lrnonstreaminglexer",
            "LexerKind::lrnonstreaminglexer",
            "lexerkind::LRNonStreamingLexer",
        ];
        for (i, kind) in lexerkinds.iter().enumerate() {
            let lex_src = format!(
                "
%grmtools{{lexerkind: {}}}
%%
. ;
",
                kind
            );
            let lex_path = format!(
                "{}/test_grmtools_section_lexerkind_{}.l",
                env!("OUT_DIR"),
                i
            );
            let mut l_file = File::create(lex_path.clone()).unwrap();
            l_file.write_all(lex_src.as_bytes()).unwrap();
            CTLexerBuilder::new()
                .output_path(format!("{}.rs", lex_path.clone()))
                .lexer_path(lex_path.clone())
                .inspect_lexerkind(Box::new(move |lexerkind| {
                    assert!(matches!(lexerkind, LexerKind::LRNonStreamingLexer));
                    Ok(())
                }))
                .build()
                .unwrap();
        }
    }
}