mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-19 01:50:38 +00:00
Lexer should consider BOM for the start offset (#11732)
## Summary This PR fixes a bug where the lexer didn't take the BOM into account for the start offset. Fixes: #11731. ## Test Plan Add multiple test cases that involve a BOM character in the source for the lexer, and verify the snapshots.
This commit is contained in:
parent
3b19df04d7
commit
2567e14b7a
4 changed files with 117 additions and 15 deletions
|
@ -30,6 +30,8 @@ mod cursor;
|
||||||
mod fstring;
|
mod fstring;
|
||||||
mod indentation;
|
mod indentation;
|
||||||
|
|
||||||
|
const BOM: char = '\u{feff}';
|
||||||
|
|
||||||
/// A lexer for Python source code.
|
/// A lexer for Python source code.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct Lexer<'src> {
|
pub struct Lexer<'src> {
|
||||||
|
@ -100,11 +102,10 @@ impl<'src> Lexer<'src> {
|
||||||
errors: Vec::new(),
|
errors: Vec::new(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
|
if start_offset == TextSize::new(0) {
|
||||||
// spell-checker:ignore feff
|
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
|
||||||
lexer.cursor.eat_char('\u{feff}');
|
lexer.cursor.eat_char(BOM);
|
||||||
|
} else {
|
||||||
if start_offset > TextSize::new(0) {
|
|
||||||
lexer.cursor.skip_bytes(start_offset.to_usize());
|
lexer.cursor.skip_bytes(start_offset.to_usize());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1922,8 +1923,8 @@ mod tests {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex(source: &str, mode: Mode) -> LexerOutput {
|
fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
|
||||||
let mut lexer = Lexer::new(source, mode, TextSize::default());
|
let mut lexer = Lexer::new(source, mode, start_offset);
|
||||||
let mut tokens = Vec::new();
|
let mut tokens = Vec::new();
|
||||||
loop {
|
loop {
|
||||||
let kind = lexer.next_token();
|
let kind = lexer.next_token();
|
||||||
|
@ -1943,8 +1944,8 @@ mod tests {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex_valid(source: &str, mode: Mode) -> LexerOutput {
|
fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
|
||||||
let output = lex(source, mode);
|
let output = lex(source, mode, start_offset);
|
||||||
|
|
||||||
if !output.errors.is_empty() {
|
if !output.errors.is_empty() {
|
||||||
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
|
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
|
||||||
|
@ -1959,7 +1960,7 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
|
fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
|
||||||
let output = lex(source, mode);
|
let output = lex(source, mode, TextSize::default());
|
||||||
|
|
||||||
assert!(
|
assert!(
|
||||||
!output.errors.is_empty(),
|
!output.errors.is_empty(),
|
||||||
|
@ -1970,11 +1971,35 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex_source(source: &str) -> LexerOutput {
|
fn lex_source(source: &str) -> LexerOutput {
|
||||||
lex_valid(source, Mode::Module)
|
lex_valid(source, Mode::Module, TextSize::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
|
||||||
|
lex_valid(source, Mode::Module, start_offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex_jupyter_source(source: &str) -> LexerOutput {
|
fn lex_jupyter_source(source: &str) -> LexerOutput {
|
||||||
lex_valid(source, Mode::Ipython)
|
lex_valid(source, Mode::Ipython, TextSize::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bom() {
|
||||||
|
let source = "\u{feff}x = 1";
|
||||||
|
assert_snapshot!(lex_source(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bom_with_offset() {
|
||||||
|
let source = "\u{feff}x + y + z";
|
||||||
|
assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bom_with_offset_edge() {
|
||||||
|
// BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z)
|
||||||
|
// doesn't panic. Refer https://github.com/astral-sh/ruff/issues/11731
|
||||||
|
let source = "\u{feff}x + y + z";
|
||||||
|
assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
|
fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
|
||||||
|
@ -2118,7 +2143,7 @@ foo = ,func
|
||||||
def f(arg=%timeit a = b):
|
def f(arg=%timeit a = b):
|
||||||
pass"
|
pass"
|
||||||
.trim();
|
.trim();
|
||||||
let output = lex(source, Mode::Ipython);
|
let output = lex(source, Mode::Ipython, TextSize::default());
|
||||||
assert!(output.errors.is_empty());
|
assert!(output.errors.is_empty());
|
||||||
assert_no_ipython_escape_command(&output.tokens);
|
assert_no_ipython_escape_command(&output.tokens);
|
||||||
}
|
}
|
||||||
|
@ -2351,7 +2376,7 @@ if first:
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_tokens_only(source: &str) -> Vec<TokenKind> {
|
fn get_tokens_only(source: &str) -> Vec<TokenKind> {
|
||||||
let output = lex(source, Mode::Module);
|
let output = lex(source, Mode::Module, TextSize::default());
|
||||||
assert!(output.errors.is_empty());
|
assert!(output.errors.is_empty());
|
||||||
output.tokens.into_iter().map(|token| token.kind).collect()
|
output.tokens.into_iter().map(|token| token.kind).collect()
|
||||||
}
|
}
|
||||||
|
@ -2593,7 +2618,7 @@ f"{(lambda x:{x})}"
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex_fstring_error(source: &str) -> FStringErrorType {
|
fn lex_fstring_error(source: &str) -> FStringErrorType {
|
||||||
let output = lex(source, Mode::Module);
|
let output = lex(source, Mode::Module, TextSize::default());
|
||||||
match output
|
match output
|
||||||
.errors
|
.errors
|
||||||
.into_iter()
|
.into_iter()
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
---
|
||||||
|
source: crates/ruff_python_parser/src/lexer.rs
|
||||||
|
expression: lex_source(source)
|
||||||
|
---
|
||||||
|
## Tokens
|
||||||
|
```
|
||||||
|
[
|
||||||
|
(
|
||||||
|
Name(
|
||||||
|
"x",
|
||||||
|
),
|
||||||
|
3..4,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Equal,
|
||||||
|
5..6,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Int(
|
||||||
|
1,
|
||||||
|
),
|
||||||
|
7..8,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Newline,
|
||||||
|
8..8,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
```
|
|
@ -0,0 +1,29 @@
|
||||||
|
---
|
||||||
|
source: crates/ruff_python_parser/src/lexer.rs
|
||||||
|
expression: "lex_source_with_offset(source, TextSize::new(7))"
|
||||||
|
---
|
||||||
|
## Tokens
|
||||||
|
```
|
||||||
|
[
|
||||||
|
(
|
||||||
|
Name(
|
||||||
|
"y",
|
||||||
|
),
|
||||||
|
7..8,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Plus,
|
||||||
|
9..10,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Name(
|
||||||
|
"z",
|
||||||
|
),
|
||||||
|
11..12,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Newline,
|
||||||
|
12..12,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
```
|
|
@ -0,0 +1,19 @@
|
||||||
|
---
|
||||||
|
source: crates/ruff_python_parser/src/lexer.rs
|
||||||
|
expression: "lex_source_with_offset(source, TextSize::new(11))"
|
||||||
|
---
|
||||||
|
## Tokens
|
||||||
|
```
|
||||||
|
[
|
||||||
|
(
|
||||||
|
Name(
|
||||||
|
"z",
|
||||||
|
),
|
||||||
|
11..12,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Newline,
|
||||||
|
12..12,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
```
|
Loading…
Add table
Add a link
Reference in a new issue