Mirror of https://github.com/astral-sh/ruff.git (synced 2025-07-24 05:26:23 +00:00)
# Lexer should consider BOM for the start offset (#11732)
## Summary

This PR fixes a bug where the lexer didn't take the BOM into account when computing the start offset.

fixes: #11731

## Test Plan

Add multiple test cases that involve a BOM character in the source for the lexer, and verify the snapshots.
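To make the bug concrete: the BOM is a single `char` but three bytes in UTF-8, so eating it and then skipping `start_offset` further bytes overshoots by three whenever the offset is non-zero. A minimal, self-contained sketch of the broken vs. fixed positioning (plain Rust, not Ruff's actual `Cursor` API):

```rust
fn main() {
    let source = "\u{feff}x + y + z"; // 3 BOM bytes + 9 bytes of code = 12 bytes
    assert_eq!('\u{feff}'.len_utf8(), 3);

    let start_offset = 11; // byte offset of `z`, measured from the start of `source`

    // Old behavior: eat the BOM, then skip `start_offset` more bytes.
    // 3 + 11 = 14 lands past the end of the 12-byte source -> panic (#11731).
    assert!('\u{feff}'.len_utf8() + start_offset > source.len());

    // Fixed behavior: `start_offset` is absolute, so skipping it from byte 0
    // already steps over the BOM; the BOM is only eaten when the offset is 0.
    assert_eq!(&source[start_offset..start_offset + 1], "z");
}
```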
parent 3b19df04d7
commit 2567e14b7a

4 changed files with 117 additions and 15 deletions
crates/ruff_python_parser/src/lexer.rs:

```diff
@@ -30,6 +30,8 @@ mod cursor;
 mod fstring;
 mod indentation;
 
+const BOM: char = '\u{feff}';
+
 /// A lexer for Python source code.
 #[derive(Debug)]
 pub struct Lexer<'src> {
```
```diff
@@ -100,11 +102,10 @@ impl<'src> Lexer<'src> {
             errors: Vec::new(),
         };
 
-        // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
-        // spell-checker:ignore feff
-        lexer.cursor.eat_char('\u{feff}');
-
-        if start_offset > TextSize::new(0) {
+        if start_offset == TextSize::new(0) {
+            // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
+            lexer.cursor.eat_char(BOM);
+        } else {
             lexer.cursor.skip_bytes(start_offset.to_usize());
         }
 
```
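Reading the fix: `start_offset` is an absolute byte offset into `source`, so a non-zero offset already covers any leading BOM, and the BOM only needs to be eaten explicitly when lexing from byte 0. A minimal sketch of that decision as a standalone function (the `initial_position` helper is hypothetical, not Ruff's `Cursor` API):

```rust
const BOM: char = '\u{feff}';

/// Return the byte index at which lexing should begin, mirroring the fixed
/// logic above: skip the caller's absolute offset as-is, and only step over
/// a leading BOM when starting from the very beginning of the source.
fn initial_position(source: &str, start_offset: usize) -> usize {
    if start_offset == 0 {
        if source.starts_with(BOM) {
            BOM.len_utf8() // the BOM is 3 bytes in UTF-8
        } else {
            0
        }
    } else {
        start_offset
    }
}

fn main() {
    let source = "\u{feff}x + y + z";
    assert_eq!(initial_position(source, 0), 3); // skip the BOM, land on `x`
    assert_eq!(initial_position(source, 11), 11); // land on `z`, no double skip
}
```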
The test helpers thread the new `start_offset` parameter through:

```diff
@@ -1922,8 +1923,8 @@ mod tests {
         }
     }
 
-    fn lex(source: &str, mode: Mode) -> LexerOutput {
-        let mut lexer = Lexer::new(source, mode, TextSize::default());
+    fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
+        let mut lexer = Lexer::new(source, mode, start_offset);
         let mut tokens = Vec::new();
         loop {
             let kind = lexer.next_token();
@@ -1943,8 +1944,8 @@ mod tests {
         }
     }
 
-    fn lex_valid(source: &str, mode: Mode) -> LexerOutput {
-        let output = lex(source, mode);
+    fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
+        let output = lex(source, mode, start_offset);
 
         if !output.errors.is_empty() {
             let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
@@ -1959,7 +1960,7 @@ mod tests {
     }
 
     fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
-        let output = lex(source, mode);
+        let output = lex(source, mode, TextSize::default());
 
         assert!(
            !output.errors.is_empty(),
```
New tests:

```diff
@@ -1970,11 +1971,35 @@ mod tests {
     }
 
     fn lex_source(source: &str) -> LexerOutput {
-        lex_valid(source, Mode::Module)
+        lex_valid(source, Mode::Module, TextSize::default())
+    }
+
+    fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
+        lex_valid(source, Mode::Module, start_offset)
     }
 
     fn lex_jupyter_source(source: &str) -> LexerOutput {
-        lex_valid(source, Mode::Ipython)
+        lex_valid(source, Mode::Ipython, TextSize::default())
     }
 
+    #[test]
+    fn bom() {
+        let source = "\u{feff}x = 1";
+        assert_snapshot!(lex_source(source));
+    }
+
+    #[test]
+    fn bom_with_offset() {
+        let source = "\u{feff}x + y + z";
+        assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
+    }
+
+    #[test]
+    fn bom_with_offset_edge() {
+        // BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z)
+        // doesn't panic. Refer https://github.com/astral-sh/ruff/issues/11731
+        let source = "\u{feff}x + y + z";
+        assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
+    }
+
     fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
```
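The offsets fed to `lex_source_with_offset` above are plain byte positions in the test source: the BOM occupies bytes 0..3, so `y` starts at 7 and `z` at 11. A quick standalone check of that arithmetic (independent of the lexer):

```rust
fn main() {
    let source = "\u{feff}x + y + z";
    // Byte layout: BOM = 0..3, "x" = 3..4, " + " = 4..7, "y" = 7..8,
    // " + " = 8..11, "z" = 11..12.
    assert_eq!(source.find('y'), Some(7)); // offset used by `bom_with_offset`
    assert_eq!(source.find('z'), Some(11)); // offset used by `bom_with_offset_edge`
}
```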
Remaining call sites pass a zero offset:

```diff
@@ -2118,7 +2143,7 @@ foo = ,func
 def f(arg=%timeit a = b):
     pass"
         .trim();
-        let output = lex(source, Mode::Ipython);
+        let output = lex(source, Mode::Ipython, TextSize::default());
         assert!(output.errors.is_empty());
         assert_no_ipython_escape_command(&output.tokens);
     }
@@ -2351,7 +2376,7 @@ if first:
     }
 
     fn get_tokens_only(source: &str) -> Vec<TokenKind> {
-        let output = lex(source, Mode::Module);
+        let output = lex(source, Mode::Module, TextSize::default());
         assert!(output.errors.is_empty());
         output.tokens.into_iter().map(|token| token.kind).collect()
     }
@@ -2593,7 +2618,7 @@ f"{(lambda x:{x})}"
     }
 
     fn lex_fstring_error(source: &str) -> FStringErrorType {
-        let output = lex(source, Mode::Module);
+        let output = lex(source, Mode::Module, TextSize::default());
         match output
             .errors
             .into_iter()
```
New snapshot (for the `bom` test):

````diff
@@ -0,0 +1,29 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+expression: lex_source(source)
+---
+## Tokens
+```
+[
+    (
+        Name(
+            "x",
+        ),
+        3..4,
+    ),
+    (
+        Equal,
+        5..6,
+    ),
+    (
+        Int(
+            1,
+        ),
+        7..8,
+    ),
+    (
+        Newline,
+        8..8,
+    ),
+]
+```
````
New snapshot (for the `bom_with_offset` test):

````diff
@@ -0,0 +1,29 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+expression: "lex_source_with_offset(source, TextSize::new(7))"
+---
+## Tokens
+```
+[
+    (
+        Name(
+            "y",
+        ),
+        7..8,
+    ),
+    (
+        Plus,
+        9..10,
+    ),
+    (
+        Name(
+            "z",
+        ),
+        11..12,
+    ),
+    (
+        Newline,
+        12..12,
+    ),
+]
+```
````
New snapshot (for the `bom_with_offset_edge` test):

````diff
@@ -0,0 +1,19 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+expression: "lex_source_with_offset(source, TextSize::new(11))"
+---
+## Tokens
+```
+[
+    (
+        Name(
+            "z",
+        ),
+        11..12,
+    ),
+    (
+        Newline,
+        12..12,
+    ),
+]
+```
````