mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-19 01:50:38 +00:00
Lexer should consider BOM for the start offset (#11732)
## Summary This PR fixes a bug where the lexer didn't take the BOM into account for the start offset. Fixes: #11731. ## Test Plan Add multiple test cases that involve a BOM character in the source for the lexer, and verify the snapshots.
This commit is contained in:
parent
3b19df04d7
commit
2567e14b7a
4 changed files with 117 additions and 15 deletions
|
@ -30,6 +30,8 @@ mod cursor;
|
||||||
mod fstring;
|
mod fstring;
|
||||||
mod indentation;
|
mod indentation;
|
||||||
|
|
||||||
|
const BOM: char = '\u{feff}';
|
||||||
|
|
||||||
/// A lexer for Python source code.
|
/// A lexer for Python source code.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct Lexer<'src> {
|
pub struct Lexer<'src> {
|
||||||
|
@ -100,11 +102,10 @@ impl<'src> Lexer<'src> {
|
||||||
errors: Vec::new(),
|
errors: Vec::new(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
|
if start_offset == TextSize::new(0) {
|
||||||
// spell-checker:ignore feff
|
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
|
||||||
lexer.cursor.eat_char('\u{feff}');
|
lexer.cursor.eat_char(BOM);
|
||||||
|
} else {
|
||||||
if start_offset > TextSize::new(0) {
|
|
||||||
lexer.cursor.skip_bytes(start_offset.to_usize());
|
lexer.cursor.skip_bytes(start_offset.to_usize());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1922,8 +1923,8 @@ mod tests {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex(source: &str, mode: Mode) -> LexerOutput {
|
fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
|
||||||
let mut lexer = Lexer::new(source, mode, TextSize::default());
|
let mut lexer = Lexer::new(source, mode, start_offset);
|
||||||
let mut tokens = Vec::new();
|
let mut tokens = Vec::new();
|
||||||
loop {
|
loop {
|
||||||
let kind = lexer.next_token();
|
let kind = lexer.next_token();
|
||||||
|
@ -1943,8 +1944,8 @@ mod tests {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex_valid(source: &str, mode: Mode) -> LexerOutput {
|
fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
|
||||||
let output = lex(source, mode);
|
let output = lex(source, mode, start_offset);
|
||||||
|
|
||||||
if !output.errors.is_empty() {
|
if !output.errors.is_empty() {
|
||||||
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
|
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
|
||||||
|
@ -1959,7 +1960,7 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
|
fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
|
||||||
let output = lex(source, mode);
|
let output = lex(source, mode, TextSize::default());
|
||||||
|
|
||||||
assert!(
|
assert!(
|
||||||
!output.errors.is_empty(),
|
!output.errors.is_empty(),
|
||||||
|
@ -1970,11 +1971,35 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex_source(source: &str) -> LexerOutput {
|
fn lex_source(source: &str) -> LexerOutput {
|
||||||
lex_valid(source, Mode::Module)
|
lex_valid(source, Mode::Module, TextSize::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
|
||||||
|
lex_valid(source, Mode::Module, start_offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex_jupyter_source(source: &str) -> LexerOutput {
|
fn lex_jupyter_source(source: &str) -> LexerOutput {
|
||||||
lex_valid(source, Mode::Ipython)
|
lex_valid(source, Mode::Ipython, TextSize::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bom() {
|
||||||
|
let source = "\u{feff}x = 1";
|
||||||
|
assert_snapshot!(lex_source(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bom_with_offset() {
|
||||||
|
let source = "\u{feff}x + y + z";
|
||||||
|
assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bom_with_offset_edge() {
|
||||||
|
// BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z)
|
||||||
|
// doesn't panic. Refer https://github.com/astral-sh/ruff/issues/11731
|
||||||
|
let source = "\u{feff}x + y + z";
|
||||||
|
assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
|
fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
|
||||||
|
@ -2118,7 +2143,7 @@ foo = ,func
|
||||||
def f(arg=%timeit a = b):
|
def f(arg=%timeit a = b):
|
||||||
pass"
|
pass"
|
||||||
.trim();
|
.trim();
|
||||||
let output = lex(source, Mode::Ipython);
|
let output = lex(source, Mode::Ipython, TextSize::default());
|
||||||
assert!(output.errors.is_empty());
|
assert!(output.errors.is_empty());
|
||||||
assert_no_ipython_escape_command(&output.tokens);
|
assert_no_ipython_escape_command(&output.tokens);
|
||||||
}
|
}
|
||||||
|
@ -2351,7 +2376,7 @@ if first:
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_tokens_only(source: &str) -> Vec<TokenKind> {
|
fn get_tokens_only(source: &str) -> Vec<TokenKind> {
|
||||||
let output = lex(source, Mode::Module);
|
let output = lex(source, Mode::Module, TextSize::default());
|
||||||
assert!(output.errors.is_empty());
|
assert!(output.errors.is_empty());
|
||||||
output.tokens.into_iter().map(|token| token.kind).collect()
|
output.tokens.into_iter().map(|token| token.kind).collect()
|
||||||
}
|
}
|
||||||
|
@ -2593,7 +2618,7 @@ f"{(lambda x:{x})}"
|
||||||
}
|
}
|
||||||
|
|
||||||
fn lex_fstring_error(source: &str) -> FStringErrorType {
|
fn lex_fstring_error(source: &str) -> FStringErrorType {
|
||||||
let output = lex(source, Mode::Module);
|
let output = lex(source, Mode::Module, TextSize::default());
|
||||||
match output
|
match output
|
||||||
.errors
|
.errors
|
||||||
.into_iter()
|
.into_iter()
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
---
|
||||||
|
source: crates/ruff_python_parser/src/lexer.rs
|
||||||
|
expression: lex_source(source)
|
||||||
|
---
|
||||||
|
## Tokens
|
||||||
|
```
|
||||||
|
[
|
||||||
|
(
|
||||||
|
Name(
|
||||||
|
"x",
|
||||||
|
),
|
||||||
|
3..4,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Equal,
|
||||||
|
5..6,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Int(
|
||||||
|
1,
|
||||||
|
),
|
||||||
|
7..8,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Newline,
|
||||||
|
8..8,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
```
|
|
@ -0,0 +1,29 @@
|
||||||
|
---
|
||||||
|
source: crates/ruff_python_parser/src/lexer.rs
|
||||||
|
expression: "lex_source_with_offset(source, TextSize::new(7))"
|
||||||
|
---
|
||||||
|
## Tokens
|
||||||
|
```
|
||||||
|
[
|
||||||
|
(
|
||||||
|
Name(
|
||||||
|
"y",
|
||||||
|
),
|
||||||
|
7..8,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Plus,
|
||||||
|
9..10,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Name(
|
||||||
|
"z",
|
||||||
|
),
|
||||||
|
11..12,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Newline,
|
||||||
|
12..12,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
```
|
|
@ -0,0 +1,19 @@
|
||||||
|
---
|
||||||
|
source: crates/ruff_python_parser/src/lexer.rs
|
||||||
|
expression: "lex_source_with_offset(source, TextSize::new(11))"
|
||||||
|
---
|
||||||
|
## Tokens
|
||||||
|
```
|
||||||
|
[
|
||||||
|
(
|
||||||
|
Name(
|
||||||
|
"z",
|
||||||
|
),
|
||||||
|
11..12,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Newline,
|
||||||
|
12..12,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
```
|
Loading…
Add table
Add a link
Reference in a new issue