Mirror of https://github.com/astral-sh/ruff.git (synced 2025-07-24 05:26:23 +00:00)
# Lexer should consider BOM for the start offset (#11732)
## Summary

This PR fixes a bug where the lexer didn't take the BOM into account when computing the start offset.

fixes: #11731

## Test Plan

Add multiple test cases that involve a BOM character in the source for the lexer, and verify the snapshots.
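To make the bug concrete: the BOM is a single `char` but three bytes in UTF-8, so eating it and then skipping `start_offset` further bytes overshoots by three whenever the offset is non-zero. A minimal, self-contained sketch of the broken vs. fixed positioning (plain Rust, not Ruff's actual `Cursor` API):

```rust
fn main() {
    let source = "\u{feff}x + y + z"; // 3 BOM bytes + 9 bytes of code = 12 bytes
    assert_eq!('\u{feff}'.len_utf8(), 3);

    let start_offset = 11; // byte offset of `z`, measured from the start of `source`

    // Old behavior: eat the BOM, then skip `start_offset` more bytes.
    // 3 + 11 = 14 lands past the end of the 12-byte source -> panic (#11731).
    assert!('\u{feff}'.len_utf8() + start_offset > source.len());

    // Fixed behavior: `start_offset` is absolute, so skipping it from byte 0
    // already steps over the BOM; the BOM is only eaten when the offset is 0.
    assert_eq!(&source[start_offset..start_offset + 1], "z");
}
```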
parent 3b19df04d7
commit 2567e14b7a

4 changed files with 117 additions and 15 deletions
crates/ruff_python_parser/src/lexer.rs:

```diff
@@ -30,6 +30,8 @@ mod cursor;
 mod fstring;
 mod indentation;
 
+const BOM: char = '\u{feff}';
+
 /// A lexer for Python source code.
 #[derive(Debug)]
 pub struct Lexer<'src> {
```
```diff
@@ -100,11 +102,10 @@ impl<'src> Lexer<'src> {
             errors: Vec::new(),
         };
 
-        // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
-        // spell-checker:ignore feff
-        lexer.cursor.eat_char('\u{feff}');
-
-        if start_offset > TextSize::new(0) {
+        if start_offset == TextSize::new(0) {
+            // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
+            lexer.cursor.eat_char(BOM);
+        } else {
             lexer.cursor.skip_bytes(start_offset.to_usize());
         }
 
```
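Reading the fix: `start_offset` is an absolute byte offset into `source`, so a non-zero offset already covers any leading BOM, and the BOM only needs to be eaten explicitly when lexing from byte 0. A minimal sketch of that decision as a standalone function (the `initial_position` helper is hypothetical, not Ruff's `Cursor` API):

```rust
const BOM: char = '\u{feff}';

/// Return the byte index at which lexing should begin, mirroring the fixed
/// logic above: skip the caller's absolute offset as-is, and only step over
/// a leading BOM when starting from the very beginning of the source.
fn initial_position(source: &str, start_offset: usize) -> usize {
    if start_offset == 0 {
        if source.starts_with(BOM) {
            BOM.len_utf8() // the BOM is 3 bytes in UTF-8
        } else {
            0
        }
    } else {
        start_offset
    }
}

fn main() {
    let source = "\u{feff}x + y + z";
    assert_eq!(initial_position(source, 0), 3); // skip the BOM, land on `x`
    assert_eq!(initial_position(source, 11), 11); // land on `z`, no double skip
}
```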
The test helpers thread the new `start_offset` parameter through:

```diff
@@ -1922,8 +1923,8 @@ mod tests {
         }
     }
 
-    fn lex(source: &str, mode: Mode) -> LexerOutput {
-        let mut lexer = Lexer::new(source, mode, TextSize::default());
+    fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
+        let mut lexer = Lexer::new(source, mode, start_offset);
         let mut tokens = Vec::new();
         loop {
             let kind = lexer.next_token();
@@ -1943,8 +1944,8 @@ mod tests {
         }
     }
 
-    fn lex_valid(source: &str, mode: Mode) -> LexerOutput {
-        let output = lex(source, mode);
+    fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
+        let output = lex(source, mode, start_offset);
 
         if !output.errors.is_empty() {
             let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
@@ -1959,7 +1960,7 @@ mod tests {
     }
 
     fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
-        let output = lex(source, mode);
+        let output = lex(source, mode, TextSize::default());
 
         assert!(
            !output.errors.is_empty(),
```
New tests:

```diff
@@ -1970,11 +1971,35 @@ mod tests {
     }
 
     fn lex_source(source: &str) -> LexerOutput {
-        lex_valid(source, Mode::Module)
+        lex_valid(source, Mode::Module, TextSize::default())
+    }
+
+    fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
+        lex_valid(source, Mode::Module, start_offset)
     }
 
     fn lex_jupyter_source(source: &str) -> LexerOutput {
-        lex_valid(source, Mode::Ipython)
+        lex_valid(source, Mode::Ipython, TextSize::default())
     }
 
+    #[test]
+    fn bom() {
+        let source = "\u{feff}x = 1";
+        assert_snapshot!(lex_source(source));
+    }
+
+    #[test]
+    fn bom_with_offset() {
+        let source = "\u{feff}x + y + z";
+        assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
+    }
+
+    #[test]
+    fn bom_with_offset_edge() {
+        // BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z)
+        // doesn't panic. Refer https://github.com/astral-sh/ruff/issues/11731
+        let source = "\u{feff}x + y + z";
+        assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
+    }
+
     fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
```
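The offsets fed to `lex_source_with_offset` above are plain byte positions in the test source: the BOM occupies bytes 0..3, so `y` starts at 7 and `z` at 11. A quick standalone check of that arithmetic (independent of the lexer):

```rust
fn main() {
    let source = "\u{feff}x + y + z";
    // Byte layout: BOM = 0..3, "x" = 3..4, " + " = 4..7, "y" = 7..8,
    // " + " = 8..11, "z" = 11..12.
    assert_eq!(source.find('y'), Some(7)); // offset used by `bom_with_offset`
    assert_eq!(source.find('z'), Some(11)); // offset used by `bom_with_offset_edge`
}
```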
Remaining call sites pass a zero offset:

```diff
@@ -2118,7 +2143,7 @@ foo = ,func
 def f(arg=%timeit a = b):
     pass"
         .trim();
-        let output = lex(source, Mode::Ipython);
+        let output = lex(source, Mode::Ipython, TextSize::default());
         assert!(output.errors.is_empty());
         assert_no_ipython_escape_command(&output.tokens);
     }
@@ -2351,7 +2376,7 @@ if first:
     }
 
     fn get_tokens_only(source: &str) -> Vec<TokenKind> {
-        let output = lex(source, Mode::Module);
+        let output = lex(source, Mode::Module, TextSize::default());
         assert!(output.errors.is_empty());
         output.tokens.into_iter().map(|token| token.kind).collect()
     }
@@ -2593,7 +2618,7 @@ f"{(lambda x:{x})}"
     }
 
     fn lex_fstring_error(source: &str) -> FStringErrorType {
-        let output = lex(source, Mode::Module);
+        let output = lex(source, Mode::Module, TextSize::default());
         match output
             .errors
             .into_iter()
```
New snapshot (for the `bom` test):

````diff
@@ -0,0 +1,29 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+expression: lex_source(source)
+---
+## Tokens
+```
+[
+    (
+        Name(
+            "x",
+        ),
+        3..4,
+    ),
+    (
+        Equal,
+        5..6,
+    ),
+    (
+        Int(
+            1,
+        ),
+        7..8,
+    ),
+    (
+        Newline,
+        8..8,
+    ),
+]
+```
````
New snapshot (for the `bom_with_offset` test):

````diff
@@ -0,0 +1,29 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+expression: "lex_source_with_offset(source, TextSize::new(7))"
+---
+## Tokens
+```
+[
+    (
+        Name(
+            "y",
+        ),
+        7..8,
+    ),
+    (
+        Plus,
+        9..10,
+    ),
+    (
+        Name(
+            "z",
+        ),
+        11..12,
+    ),
+    (
+        Newline,
+        12..12,
+    ),
+]
+```
````
New snapshot (for the `bom_with_offset_edge` test):

````diff
@@ -0,0 +1,19 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+expression: "lex_source_with_offset(source, TextSize::new(11))"
+---
+## Tokens
+```
+[
+    (
+        Name(
+            "z",
+        ),
+        11..12,
+    ),
+    (
+        Newline,
+        12..12,
+    ),
+]
+```
````