Lexer: Add skip whitespace fastpath (#7184)

2025-07-24 13:33:50 +00:00 · 2023-09-06 16:14:01 +02:00 · 2023-09-06 16:14:01 +02:00 · 171b66cb43
commit 171b66cb43
parent fa0b6f4813
1 changed files with 83 additions and 37 deletions
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@@ -580,6 +580,82 @@ impl<'source> Lexer<'source> {
            }
        }

+        if self.state.is_after_newline() {
+            if let Some(indentation) = self.eat_indentation()? {
+                return Ok(indentation);
+            }
+        } else {
+            self.skip_whitespace()?;
+        }
+
+        self.cursor.start_token();
+        if let Some(c) = self.cursor.bump() {
+            if c.is_ascii() {
+                self.consume_ascii_character(c)
+            } else if is_unicode_identifier_start(c) {
+                let identifier = self.lex_identifier(c)?;
+                self.state = State::Other;
+
+                Ok((identifier, self.token_range()))
+            } else if is_emoji_presentation(c) {
+                self.state = State::Other;
+
+                Ok((
+                    Tok::Name {
+                        name: c.to_string(),
+                    },
+                    self.token_range(),
+                ))
+            } else {
+                Err(LexicalError {
+                    error: LexicalErrorType::UnrecognizedToken { tok: c },
+                    location: self.token_start(),
+                })
+            }
+        } else {
+            // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line,
+            // empty the dedent stack, and finally, return the EndOfFile token.
+            self.consume_end()
+        }
+    }
+
+    /// Advances the cursor past spaces, tabs, form feeds and backslash line continuations.
+    ///
+    /// Fails with `Eof` when a backslash ends the input, and with `LineContinuationError`
+    /// when a backslash is followed by anything other than a line break.
+    fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
+        loop {
+            match self.cursor.first() {
+                ' ' | '\t' | '\x0C' => {
+                    self.cursor.bump();
+                }
+                '\\' => {
+                    self.cursor.bump();
+                    // A continuation may end in `\r`, `\r\n` or `\n`.
+                    if self.cursor.eat_char('\r') {
+                        self.cursor.eat_char('\n');
+                        continue;
+                    }
+                    if self.cursor.is_eof() {
+                        return Err(LexicalError {
+                            error: LexicalErrorType::Eof,
+                            location: self.token_start(),
+                        });
+                    }
+                    if !self.cursor.eat_char('\n') {
+                        return Err(LexicalError {
+                            error: LexicalErrorType::LineContinuationError,
+                            location: self.token_start(),
+                        });
+                    }
+                }
+                // Anything else is left untouched for the caller.
+                _ => return Ok(()),
+            }
+        }
+    }
+
+    fn eat_indentation(&mut self) -> Result<Option<Spanned>, LexicalError> {
        let mut indentation = Indentation::root();
        self.cursor.start_token();

@@ -619,48 +695,18 @@ impl<'source> Lexer<'source> {
            }
        }

-        if self.state.is_after_newline() {
-            // Handle indentation if this is a new, not all empty, logical line
-            if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
-                self.state = State::NonEmptyLogicalLine;
+        // Handle indentation if this is a new, not all empty, logical line
+        if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
+            self.state = State::NonEmptyLogicalLine;

-                if let Some(spanned) = self.handle_indentation(indentation)? {
-                    // Set to false so that we don't handle indentation on the next call.
+            if let Some(spanned) = self.handle_indentation(indentation)? {
+                // Set to false so that we don't handle indentation on the next call.

-                    return Ok(spanned);
-                }
+                return Ok(Some(spanned));
            }
        }

-        self.cursor.start_token();
-        if let Some(c) = self.cursor.bump() {
-            if c.is_ascii() {
-                self.consume_ascii_character(c)
-            } else if is_unicode_identifier_start(c) {
-                let identifier = self.lex_identifier(c)?;
-                self.state = State::Other;
-
-                Ok((identifier, self.token_range()))
-            } else if is_emoji_presentation(c) {
-                self.state = State::Other;
-
-                Ok((
-                    Tok::Name {
-                        name: c.to_string(),
-                    },
-                    self.token_range(),
-                ))
-            } else {
-                Err(LexicalError {
-                    error: LexicalErrorType::UnrecognizedToken { tok: c },
-                    location: self.token_start(),
-                })
-            }
-        } else {
-            // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line,
-            // empty the dedent stack, and finally, return the EndOfFile token.
-            self.consume_end()
-        }
+        Ok(None)
    }

    fn handle_indentation(