Merge pull request #4480 from DimitrisJim/lexer_opt

Improve lexer performance by matching early on ascii identifiers.
2026-06-02 19:39:49 +09:00 · 2023-02-01 15:29:16 +02:00
parent 5fb03b6891 5025113da0
commit adc23253e4
1 changed files with 38 additions and 33 deletions
--- a/compiler/parser/src/lexer.rs
+++ b/compiler/parser/src/lexer.rs
@@ -18,8 +18,8 @@ use unic_ucd_ident::{is_xid_continue, is_xid_start};

 #[derive(Clone, Copy, PartialEq, Debug, Default)]
 struct IndentationLevel {
-    tabs: usize,
-    spaces: usize,
+    tabs: u32,
+    spaces: u32,
 }

 impl IndentationLevel {
@@ -225,7 +225,8 @@ where
            at_begin_of_line: true,
            nesting: 0,
            indentations: Indentations::default(),
-            pending: Vec::new(),
+            // Usually we have less than 5 tokens pending.
+            pending: Vec::with_capacity(5),
            location: start,
            window: CharWindow::new(input),
        };
@@ -257,13 +258,13 @@ where
        };

        let start_pos = self.get_pos();
-        let mut name = String::new();
+        let mut name = String::with_capacity(8);
        while self.is_identifier_continuation() {
            name.push(self.next_char().unwrap());
        }
        let end_pos = self.get_pos();

-        if let Some(tok) = KEYWORDS.get(name.as_str()) {
+        if let Some(tok) = KEYWORDS.get(&name) {
            Ok((start_pos, tok.clone(), end_pos))
        } else {
            Ok((start_pos, Tok::Name { name }, end_pos))
@@ -464,7 +465,7 @@ where
            self.next_char();
        }
        let quote_char = self.next_char().unwrap();
-        let mut string_content = String::new();
+        let mut string_content = String::with_capacity(5);

        // If the next two characters are also the quote character, then we have a triple-quoted
        // string; consume those two characters and ensure that we require a triple-quote to close
@@ -534,12 +535,15 @@ where
    }

    fn is_identifier_start(&self, c: char) -> bool {
-        c == '_' || is_xid_start(c)
+        match c {
+            'a'..='z' | 'A'..='Z' | '_' => true,
+            _ => is_xid_start(c),
+        }
    }

    fn is_identifier_continuation(&self) -> bool {
        match self.window[0] {
-            Some('_' | '0'..='9') => true,
+            Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') => true,
            Some(c) => is_xid_continue(c),
            _ => false,
        }
@@ -564,8 +568,8 @@ where
    /// Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
    fn eat_indentation(&mut self) -> Result<IndentationLevel, LexicalError> {
        // Determine indentation:
-        let mut spaces: usize = 0;
-        let mut tabs: usize = 0;
+        let mut spaces: u32 = 0;
+        let mut tabs: u32 = 0;
        loop {
            match self.window[0] {
                Some(' ') => {
@@ -686,21 +690,9 @@ where
    fn consume_normal(&mut self) -> Result<(), LexicalError> {
        // Check if we have some character:
        if let Some(c) = self.window[0] {
-            // First check identifier:
            if self.is_identifier_start(c) {
                let identifier = self.lex_identifier()?;
                self.emit(identifier);
-            } else if is_emoji_presentation(c) {
-                let tok_start = self.get_pos();
-                self.next_char();
-                let tok_end = self.get_pos();
-                self.emit((
-                    tok_start,
-                    Tok::Name {
-                        name: c.to_string(),
-                    },
-                    tok_end,
-                ));
            } else {
                self.consume_character(c)?;
            }
@@ -1047,10 +1039,7 @@ where
                }
            }
            ',' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                let tok_end = self.get_pos();
-                self.emit((tok_start, Tok::Comma, tok_end));
+                self.eat_single_char(Tok::Comma);
            }
            '.' => {
                if let Some('0'..='9') = self.window[1] {
@@ -1109,13 +1098,25 @@ where
                    });
                }
            }
-
            _ => {
-                let c = self.next_char();
-                return Err(LexicalError {
-                    error: LexicalErrorType::UnrecognizedToken { tok: c.unwrap() },
-                    location: self.get_pos(),
-                });
+                if is_emoji_presentation(c) {
+                    let tok_start = self.get_pos();
+                    self.next_char();
+                    let tok_end = self.get_pos();
+                    self.emit((
+                        tok_start,
+                        Tok::Name {
+                            name: c.to_string(),
+                        },
+                        tok_end,
+                    ));
+                } else {
+                    let c = self.next_char();
+                    return Err(LexicalError {
+                        error: LexicalErrorType::UnrecognizedToken { tok: c.unwrap() },
+                        location: self.get_pos(),
+                    });
+                }
            } // Ignore all the rest..
        }

@@ -1124,7 +1125,11 @@ where

    fn eat_single_char(&mut self, ty: Tok) {
        let tok_start = self.get_pos();
-        self.next_char().unwrap();
+        self.next_char().unwrap_or_else(|| unsafe {
+            // SAFETY: eat_single_char has been called only after a character has been read
+            // from the window, so the window is guaranteed to be non-empty.
+            std::hint::unreachable_unchecked()
+        });
        let tok_end = self.get_pos();
        self.emit((tok_start, ty, tok_end));
    }