Fix SimpleTokenizer's backward lexing of # (#5878)

This commit is contained in:
Micha Reiser 2023-07-20 11:54:18 +02:00 committed by GitHub
parent 8c5f8a8aef
commit 76e9ce6dc0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 1863 additions and 52 deletions

1
Cargo.lock generated
View file

@ -2201,6 +2201,7 @@ dependencies = [
"insta",
"memchr",
"ruff_text_size",
"smallvec",
"unic-ucd-ident",
]

View file

@ -95,8 +95,11 @@ pub(crate) fn remove_exception_handler_assignment(
locator: &Locator,
) -> Result<Edit> {
// Lex backwards, to the token just before the `as`.
let mut tokenizer =
SimpleTokenizer::up_to(bound_exception.range.start(), locator.contents()).skip_trivia();
let mut tokenizer = SimpleTokenizer::up_to_without_back_comment(
bound_exception.range.start(),
locator.contents(),
)
.skip_trivia();
// Eat the `as` token.
let preceding = tokenizer

View file

@ -8,8 +8,7 @@ use ruff_python_ast::node::{AnyNodeRef, AstNode};
use ruff_python_ast::source_code::Locator;
use ruff_python_ast::whitespace;
use ruff_python_trivia::{
first_non_trivia_token_rev, PythonWhitespace, SimpleToken, SimpleTokenKind, SimpleTokenizer,
UniversalNewlines,
PythonWhitespace, SimpleToken, SimpleTokenKind, SimpleTokenizer, UniversalNewlines,
};
use crate::comments::visitor::{CommentPlacement, DecoratedComment};
@ -1059,7 +1058,9 @@ fn handle_slice_comments<'a>(
// Check for `foo[ # comment`, but only if they are on the same line
let after_lbracket = matches!(
first_non_trivia_token_rev(comment.slice().start(), locator.contents()),
SimpleTokenizer::up_to_without_back_comment(comment.slice().start(), locator.contents())
.skip_trivia()
.next_back(),
Some(SimpleToken {
kind: SimpleTokenKind::LBracket,
..

View file

@ -16,6 +16,7 @@ license = { workspace = true }
ruff_text_size = { workspace = true }
memchr = { workspace = true }
smallvec = { workspace = true }
unic-ucd-ident = "0.9.0"
[dev-dependencies]

View file

@ -0,0 +1,78 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokenize_reverse()
---
[
SimpleToken {
kind: Comment,
range: 17..43,
},
SimpleToken {
kind: Whitespace,
range: 16..17,
},
SimpleToken {
kind: Other,
range: 15..16,
},
SimpleToken {
kind: Bogus,
range: 14..15,
},
SimpleToken {
kind: Bogus,
range: 13..14,
},
SimpleToken {
kind: Bogus,
range: 12..13,
},
SimpleToken {
kind: Bogus,
range: 11..12,
},
SimpleToken {
kind: Bogus,
range: 10..11,
},
SimpleToken {
kind: Bogus,
range: 9..10,
},
SimpleToken {
kind: Bogus,
range: 8..9,
},
SimpleToken {
kind: Bogus,
range: 7..8,
},
SimpleToken {
kind: Bogus,
range: 6..7,
},
SimpleToken {
kind: Bogus,
range: 5..6,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
SimpleToken {
kind: Bogus,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 2..3,
},
SimpleToken {
kind: Bogus,
range: 1..2,
},
SimpleToken {
kind: Bogus,
range: 0..1,
},
]

View file

@ -0,0 +1,94 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokenize_reverse()
---
[
SimpleToken {
kind: Comment,
range: 21..51,
},
SimpleToken {
kind: Whitespace,
range: 20..21,
},
SimpleToken {
kind: Other,
range: 19..20,
},
SimpleToken {
kind: Bogus,
range: 18..19,
},
SimpleToken {
kind: Bogus,
range: 17..18,
},
SimpleToken {
kind: Bogus,
range: 16..17,
},
SimpleToken {
kind: Bogus,
range: 15..16,
},
SimpleToken {
kind: Bogus,
range: 14..15,
},
SimpleToken {
kind: Bogus,
range: 13..14,
},
SimpleToken {
kind: Bogus,
range: 12..13,
},
SimpleToken {
kind: Bogus,
range: 11..12,
},
SimpleToken {
kind: Bogus,
range: 10..11,
},
SimpleToken {
kind: Bogus,
range: 9..10,
},
SimpleToken {
kind: Bogus,
range: 8..9,
},
SimpleToken {
kind: Bogus,
range: 7..8,
},
SimpleToken {
kind: Bogus,
range: 6..7,
},
SimpleToken {
kind: Bogus,
range: 5..6,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
SimpleToken {
kind: Bogus,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 2..3,
},
SimpleToken {
kind: Bogus,
range: 1..2,
},
SimpleToken {
kind: Bogus,
range: 0..1,
},
]

View file

@ -0,0 +1,314 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokenize_reverse()
---
[
SimpleToken {
kind: Other,
range: 76..77,
},
SimpleToken {
kind: Bogus,
range: 75..76,
},
SimpleToken {
kind: Bogus,
range: 74..75,
},
SimpleToken {
kind: Bogus,
range: 73..74,
},
SimpleToken {
kind: Bogus,
range: 72..73,
},
SimpleToken {
kind: Bogus,
range: 71..72,
},
SimpleToken {
kind: Bogus,
range: 70..71,
},
SimpleToken {
kind: Bogus,
range: 69..70,
},
SimpleToken {
kind: Bogus,
range: 68..69,
},
SimpleToken {
kind: Bogus,
range: 67..68,
},
SimpleToken {
kind: Bogus,
range: 66..67,
},
SimpleToken {
kind: Bogus,
range: 65..66,
},
SimpleToken {
kind: Bogus,
range: 64..65,
},
SimpleToken {
kind: Bogus,
range: 63..64,
},
SimpleToken {
kind: Bogus,
range: 62..63,
},
SimpleToken {
kind: Bogus,
range: 61..62,
},
SimpleToken {
kind: Bogus,
range: 60..61,
},
SimpleToken {
kind: Bogus,
range: 59..60,
},
SimpleToken {
kind: Bogus,
range: 58..59,
},
SimpleToken {
kind: Bogus,
range: 57..58,
},
SimpleToken {
kind: Bogus,
range: 56..57,
},
SimpleToken {
kind: Bogus,
range: 55..56,
},
SimpleToken {
kind: Bogus,
range: 54..55,
},
SimpleToken {
kind: Bogus,
range: 53..54,
},
SimpleToken {
kind: Bogus,
range: 52..53,
},
SimpleToken {
kind: Bogus,
range: 51..52,
},
SimpleToken {
kind: Bogus,
range: 50..51,
},
SimpleToken {
kind: Bogus,
range: 49..50,
},
SimpleToken {
kind: Bogus,
range: 48..49,
},
SimpleToken {
kind: Bogus,
range: 47..48,
},
SimpleToken {
kind: Bogus,
range: 46..47,
},
SimpleToken {
kind: Bogus,
range: 45..46,
},
SimpleToken {
kind: Bogus,
range: 44..45,
},
SimpleToken {
kind: Bogus,
range: 43..44,
},
SimpleToken {
kind: Bogus,
range: 42..43,
},
SimpleToken {
kind: Bogus,
range: 41..42,
},
SimpleToken {
kind: Bogus,
range: 40..41,
},
SimpleToken {
kind: Bogus,
range: 39..40,
},
SimpleToken {
kind: Bogus,
range: 38..39,
},
SimpleToken {
kind: Bogus,
range: 37..38,
},
SimpleToken {
kind: Bogus,
range: 36..37,
},
SimpleToken {
kind: Bogus,
range: 35..36,
},
SimpleToken {
kind: Bogus,
range: 34..35,
},
SimpleToken {
kind: Bogus,
range: 33..34,
},
SimpleToken {
kind: Bogus,
range: 32..33,
},
SimpleToken {
kind: Bogus,
range: 31..32,
},
SimpleToken {
kind: Bogus,
range: 30..31,
},
SimpleToken {
kind: Bogus,
range: 29..30,
},
SimpleToken {
kind: Bogus,
range: 28..29,
},
SimpleToken {
kind: Bogus,
range: 27..28,
},
SimpleToken {
kind: Bogus,
range: 26..27,
},
SimpleToken {
kind: Bogus,
range: 25..26,
},
SimpleToken {
kind: Bogus,
range: 24..25,
},
SimpleToken {
kind: Bogus,
range: 23..24,
},
SimpleToken {
kind: Bogus,
range: 22..23,
},
SimpleToken {
kind: Bogus,
range: 21..22,
},
SimpleToken {
kind: Bogus,
range: 20..21,
},
SimpleToken {
kind: Bogus,
range: 19..20,
},
SimpleToken {
kind: Bogus,
range: 18..19,
},
SimpleToken {
kind: Bogus,
range: 17..18,
},
SimpleToken {
kind: Bogus,
range: 16..17,
},
SimpleToken {
kind: Bogus,
range: 15..16,
},
SimpleToken {
kind: Bogus,
range: 14..15,
},
SimpleToken {
kind: Bogus,
range: 13..14,
},
SimpleToken {
kind: Bogus,
range: 12..13,
},
SimpleToken {
kind: Bogus,
range: 11..12,
},
SimpleToken {
kind: Bogus,
range: 10..11,
},
SimpleToken {
kind: Bogus,
range: 9..10,
},
SimpleToken {
kind: Bogus,
range: 8..9,
},
SimpleToken {
kind: Bogus,
range: 7..8,
},
SimpleToken {
kind: Bogus,
range: 6..7,
},
SimpleToken {
kind: Bogus,
range: 5..6,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
SimpleToken {
kind: Bogus,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 2..3,
},
SimpleToken {
kind: Bogus,
range: 1..2,
},
SimpleToken {
kind: Bogus,
range: 0..1,
},
]

View file

@ -0,0 +1,322 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokenize_reverse()
---
[
SimpleToken {
kind: Other,
range: 78..79,
},
SimpleToken {
kind: Bogus,
range: 77..78,
},
SimpleToken {
kind: Bogus,
range: 76..77,
},
SimpleToken {
kind: Bogus,
range: 75..76,
},
SimpleToken {
kind: Bogus,
range: 74..75,
},
SimpleToken {
kind: Bogus,
range: 73..74,
},
SimpleToken {
kind: Bogus,
range: 72..73,
},
SimpleToken {
kind: Bogus,
range: 71..72,
},
SimpleToken {
kind: Bogus,
range: 70..71,
},
SimpleToken {
kind: Bogus,
range: 69..70,
},
SimpleToken {
kind: Bogus,
range: 68..69,
},
SimpleToken {
kind: Bogus,
range: 67..68,
},
SimpleToken {
kind: Bogus,
range: 66..67,
},
SimpleToken {
kind: Bogus,
range: 65..66,
},
SimpleToken {
kind: Bogus,
range: 64..65,
},
SimpleToken {
kind: Bogus,
range: 63..64,
},
SimpleToken {
kind: Bogus,
range: 62..63,
},
SimpleToken {
kind: Bogus,
range: 61..62,
},
SimpleToken {
kind: Bogus,
range: 60..61,
},
SimpleToken {
kind: Bogus,
range: 59..60,
},
SimpleToken {
kind: Bogus,
range: 58..59,
},
SimpleToken {
kind: Bogus,
range: 57..58,
},
SimpleToken {
kind: Bogus,
range: 56..57,
},
SimpleToken {
kind: Bogus,
range: 55..56,
},
SimpleToken {
kind: Bogus,
range: 54..55,
},
SimpleToken {
kind: Bogus,
range: 53..54,
},
SimpleToken {
kind: Bogus,
range: 52..53,
},
SimpleToken {
kind: Bogus,
range: 51..52,
},
SimpleToken {
kind: Bogus,
range: 50..51,
},
SimpleToken {
kind: Bogus,
range: 49..50,
},
SimpleToken {
kind: Bogus,
range: 48..49,
},
SimpleToken {
kind: Bogus,
range: 47..48,
},
SimpleToken {
kind: Bogus,
range: 46..47,
},
SimpleToken {
kind: Bogus,
range: 45..46,
},
SimpleToken {
kind: Bogus,
range: 44..45,
},
SimpleToken {
kind: Bogus,
range: 43..44,
},
SimpleToken {
kind: Bogus,
range: 42..43,
},
SimpleToken {
kind: Bogus,
range: 41..42,
},
SimpleToken {
kind: Bogus,
range: 40..41,
},
SimpleToken {
kind: Bogus,
range: 39..40,
},
SimpleToken {
kind: Bogus,
range: 38..39,
},
SimpleToken {
kind: Bogus,
range: 37..38,
},
SimpleToken {
kind: Bogus,
range: 36..37,
},
SimpleToken {
kind: Bogus,
range: 35..36,
},
SimpleToken {
kind: Bogus,
range: 34..35,
},
SimpleToken {
kind: Bogus,
range: 33..34,
},
SimpleToken {
kind: Bogus,
range: 32..33,
},
SimpleToken {
kind: Bogus,
range: 31..32,
},
SimpleToken {
kind: Bogus,
range: 30..31,
},
SimpleToken {
kind: Bogus,
range: 29..30,
},
SimpleToken {
kind: Bogus,
range: 28..29,
},
SimpleToken {
kind: Bogus,
range: 27..28,
},
SimpleToken {
kind: Bogus,
range: 26..27,
},
SimpleToken {
kind: Bogus,
range: 25..26,
},
SimpleToken {
kind: Bogus,
range: 24..25,
},
SimpleToken {
kind: Bogus,
range: 23..24,
},
SimpleToken {
kind: Bogus,
range: 22..23,
},
SimpleToken {
kind: Bogus,
range: 21..22,
},
SimpleToken {
kind: Bogus,
range: 20..21,
},
SimpleToken {
kind: Bogus,
range: 19..20,
},
SimpleToken {
kind: Bogus,
range: 18..19,
},
SimpleToken {
kind: Bogus,
range: 17..18,
},
SimpleToken {
kind: Bogus,
range: 16..17,
},
SimpleToken {
kind: Bogus,
range: 15..16,
},
SimpleToken {
kind: Bogus,
range: 14..15,
},
SimpleToken {
kind: Bogus,
range: 13..14,
},
SimpleToken {
kind: Bogus,
range: 12..13,
},
SimpleToken {
kind: Bogus,
range: 11..12,
},
SimpleToken {
kind: Bogus,
range: 10..11,
},
SimpleToken {
kind: Bogus,
range: 9..10,
},
SimpleToken {
kind: Bogus,
range: 8..9,
},
SimpleToken {
kind: Bogus,
range: 7..8,
},
SimpleToken {
kind: Bogus,
range: 6..7,
},
SimpleToken {
kind: Bogus,
range: 5..6,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
SimpleToken {
kind: Bogus,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 2..3,
},
SimpleToken {
kind: Bogus,
range: 1..2,
},
SimpleToken {
kind: Bogus,
range: 0..1,
},
]

View file

@ -0,0 +1,222 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokenize_reverse()
---
[
SimpleToken {
kind: Comment,
range: 53..72,
},
SimpleToken {
kind: Whitespace,
range: 52..53,
},
SimpleToken {
kind: Other,
range: 51..52,
},
SimpleToken {
kind: Bogus,
range: 50..51,
},
SimpleToken {
kind: Bogus,
range: 49..50,
},
SimpleToken {
kind: Bogus,
range: 48..49,
},
SimpleToken {
kind: Bogus,
range: 47..48,
},
SimpleToken {
kind: Bogus,
range: 46..47,
},
SimpleToken {
kind: Bogus,
range: 45..46,
},
SimpleToken {
kind: Bogus,
range: 44..45,
},
SimpleToken {
kind: Bogus,
range: 43..44,
},
SimpleToken {
kind: Bogus,
range: 42..43,
},
SimpleToken {
kind: Bogus,
range: 41..42,
},
SimpleToken {
kind: Bogus,
range: 40..41,
},
SimpleToken {
kind: Bogus,
range: 39..40,
},
SimpleToken {
kind: Bogus,
range: 38..39,
},
SimpleToken {
kind: Bogus,
range: 37..38,
},
SimpleToken {
kind: Bogus,
range: 36..37,
},
SimpleToken {
kind: Bogus,
range: 35..36,
},
SimpleToken {
kind: Bogus,
range: 34..35,
},
SimpleToken {
kind: Bogus,
range: 33..34,
},
SimpleToken {
kind: Bogus,
range: 32..33,
},
SimpleToken {
kind: Bogus,
range: 31..32,
},
SimpleToken {
kind: Bogus,
range: 30..31,
},
SimpleToken {
kind: Bogus,
range: 29..30,
},
SimpleToken {
kind: Bogus,
range: 28..29,
},
SimpleToken {
kind: Bogus,
range: 27..28,
},
SimpleToken {
kind: Bogus,
range: 26..27,
},
SimpleToken {
kind: Bogus,
range: 25..26,
},
SimpleToken {
kind: Bogus,
range: 24..25,
},
SimpleToken {
kind: Bogus,
range: 23..24,
},
SimpleToken {
kind: Bogus,
range: 22..23,
},
SimpleToken {
kind: Bogus,
range: 21..22,
},
SimpleToken {
kind: Bogus,
range: 20..21,
},
SimpleToken {
kind: Bogus,
range: 19..20,
},
SimpleToken {
kind: Bogus,
range: 18..19,
},
SimpleToken {
kind: Bogus,
range: 17..18,
},
SimpleToken {
kind: Bogus,
range: 16..17,
},
SimpleToken {
kind: Bogus,
range: 15..16,
},
SimpleToken {
kind: Bogus,
range: 14..15,
},
SimpleToken {
kind: Bogus,
range: 13..14,
},
SimpleToken {
kind: Bogus,
range: 12..13,
},
SimpleToken {
kind: Bogus,
range: 11..12,
},
SimpleToken {
kind: Bogus,
range: 10..11,
},
SimpleToken {
kind: Bogus,
range: 9..10,
},
SimpleToken {
kind: Bogus,
range: 8..9,
},
SimpleToken {
kind: Bogus,
range: 7..8,
},
SimpleToken {
kind: Bogus,
range: 6..7,
},
SimpleToken {
kind: Bogus,
range: 5..6,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
SimpleToken {
kind: Bogus,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 2..3,
},
SimpleToken {
kind: Bogus,
range: 1..2,
},
SimpleToken {
kind: Bogus,
range: 0..1,
},
]

View file

@ -0,0 +1,66 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokenize_reverse()
---
[
SimpleToken {
kind: Comment,
range: 14..27,
},
SimpleToken {
kind: Whitespace,
range: 13..14,
},
SimpleToken {
kind: Other,
range: 12..13,
},
SimpleToken {
kind: Bogus,
range: 11..12,
},
SimpleToken {
kind: Bogus,
range: 10..11,
},
SimpleToken {
kind: Bogus,
range: 9..10,
},
SimpleToken {
kind: Bogus,
range: 8..9,
},
SimpleToken {
kind: Bogus,
range: 7..8,
},
SimpleToken {
kind: Bogus,
range: 6..7,
},
SimpleToken {
kind: Bogus,
range: 5..6,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
SimpleToken {
kind: Bogus,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 2..3,
},
SimpleToken {
kind: Bogus,
range: 1..2,
},
SimpleToken {
kind: Bogus,
range: 0..1,
},
]

View file

@ -0,0 +1,150 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokenize_reverse()
---
[
SimpleToken {
kind: Comment,
range: 35..54,
},
SimpleToken {
kind: Whitespace,
range: 34..35,
},
SimpleToken {
kind: Other,
range: 33..34,
},
SimpleToken {
kind: Bogus,
range: 32..33,
},
SimpleToken {
kind: Bogus,
range: 31..32,
},
SimpleToken {
kind: Bogus,
range: 30..31,
},
SimpleToken {
kind: Bogus,
range: 29..30,
},
SimpleToken {
kind: Bogus,
range: 28..29,
},
SimpleToken {
kind: Bogus,
range: 27..28,
},
SimpleToken {
kind: Bogus,
range: 26..27,
},
SimpleToken {
kind: Bogus,
range: 25..26,
},
SimpleToken {
kind: Bogus,
range: 24..25,
},
SimpleToken {
kind: Bogus,
range: 23..24,
},
SimpleToken {
kind: Bogus,
range: 22..23,
},
SimpleToken {
kind: Bogus,
range: 21..22,
},
SimpleToken {
kind: Bogus,
range: 20..21,
},
SimpleToken {
kind: Bogus,
range: 19..20,
},
SimpleToken {
kind: Bogus,
range: 18..19,
},
SimpleToken {
kind: Bogus,
range: 17..18,
},
SimpleToken {
kind: Bogus,
range: 16..17,
},
SimpleToken {
kind: Bogus,
range: 15..16,
},
SimpleToken {
kind: Bogus,
range: 14..15,
},
SimpleToken {
kind: Bogus,
range: 13..14,
},
SimpleToken {
kind: Bogus,
range: 12..13,
},
SimpleToken {
kind: Bogus,
range: 11..12,
},
SimpleToken {
kind: Bogus,
range: 10..11,
},
SimpleToken {
kind: Bogus,
range: 9..10,
},
SimpleToken {
kind: Bogus,
range: 8..9,
},
SimpleToken {
kind: Bogus,
range: 7..8,
},
SimpleToken {
kind: Bogus,
range: 6..7,
},
SimpleToken {
kind: Bogus,
range: 5..6,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
SimpleToken {
kind: Bogus,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 2..3,
},
SimpleToken {
kind: Bogus,
range: 1..2,
},
SimpleToken {
kind: Bogus,
range: 0..1,
},
]

View file

@ -0,0 +1,326 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokenize_reverse()
---
[
SimpleToken {
kind: Other,
range: 79..80,
},
SimpleToken {
kind: Bogus,
range: 78..79,
},
SimpleToken {
kind: Bogus,
range: 77..78,
},
SimpleToken {
kind: Bogus,
range: 76..77,
},
SimpleToken {
kind: Bogus,
range: 75..76,
},
SimpleToken {
kind: Bogus,
range: 74..75,
},
SimpleToken {
kind: Bogus,
range: 73..74,
},
SimpleToken {
kind: Bogus,
range: 72..73,
},
SimpleToken {
kind: Bogus,
range: 71..72,
},
SimpleToken {
kind: Bogus,
range: 70..71,
},
SimpleToken {
kind: Bogus,
range: 69..70,
},
SimpleToken {
kind: Bogus,
range: 68..69,
},
SimpleToken {
kind: Bogus,
range: 67..68,
},
SimpleToken {
kind: Bogus,
range: 66..67,
},
SimpleToken {
kind: Bogus,
range: 65..66,
},
SimpleToken {
kind: Bogus,
range: 64..65,
},
SimpleToken {
kind: Bogus,
range: 63..64,
},
SimpleToken {
kind: Bogus,
range: 62..63,
},
SimpleToken {
kind: Bogus,
range: 61..62,
},
SimpleToken {
kind: Bogus,
range: 60..61,
},
SimpleToken {
kind: Bogus,
range: 59..60,
},
SimpleToken {
kind: Bogus,
range: 58..59,
},
SimpleToken {
kind: Bogus,
range: 57..58,
},
SimpleToken {
kind: Bogus,
range: 56..57,
},
SimpleToken {
kind: Bogus,
range: 55..56,
},
SimpleToken {
kind: Bogus,
range: 54..55,
},
SimpleToken {
kind: Bogus,
range: 53..54,
},
SimpleToken {
kind: Bogus,
range: 52..53,
},
SimpleToken {
kind: Bogus,
range: 51..52,
},
SimpleToken {
kind: Bogus,
range: 50..51,
},
SimpleToken {
kind: Bogus,
range: 49..50,
},
SimpleToken {
kind: Bogus,
range: 48..49,
},
SimpleToken {
kind: Bogus,
range: 47..48,
},
SimpleToken {
kind: Bogus,
range: 46..47,
},
SimpleToken {
kind: Bogus,
range: 45..46,
},
SimpleToken {
kind: Bogus,
range: 44..45,
},
SimpleToken {
kind: Bogus,
range: 43..44,
},
SimpleToken {
kind: Bogus,
range: 42..43,
},
SimpleToken {
kind: Bogus,
range: 41..42,
},
SimpleToken {
kind: Bogus,
range: 40..41,
},
SimpleToken {
kind: Bogus,
range: 39..40,
},
SimpleToken {
kind: Bogus,
range: 38..39,
},
SimpleToken {
kind: Bogus,
range: 37..38,
},
SimpleToken {
kind: Bogus,
range: 36..37,
},
SimpleToken {
kind: Bogus,
range: 35..36,
},
SimpleToken {
kind: Bogus,
range: 34..35,
},
SimpleToken {
kind: Bogus,
range: 33..34,
},
SimpleToken {
kind: Bogus,
range: 32..33,
},
SimpleToken {
kind: Bogus,
range: 31..32,
},
SimpleToken {
kind: Bogus,
range: 30..31,
},
SimpleToken {
kind: Bogus,
range: 29..30,
},
SimpleToken {
kind: Bogus,
range: 28..29,
},
SimpleToken {
kind: Bogus,
range: 27..28,
},
SimpleToken {
kind: Bogus,
range: 26..27,
},
SimpleToken {
kind: Bogus,
range: 25..26,
},
SimpleToken {
kind: Bogus,
range: 24..25,
},
SimpleToken {
kind: Bogus,
range: 23..24,
},
SimpleToken {
kind: Bogus,
range: 22..23,
},
SimpleToken {
kind: Bogus,
range: 21..22,
},
SimpleToken {
kind: Bogus,
range: 20..21,
},
SimpleToken {
kind: Bogus,
range: 19..20,
},
SimpleToken {
kind: Bogus,
range: 18..19,
},
SimpleToken {
kind: Bogus,
range: 17..18,
},
SimpleToken {
kind: Bogus,
range: 16..17,
},
SimpleToken {
kind: Bogus,
range: 15..16,
},
SimpleToken {
kind: Bogus,
range: 14..15,
},
SimpleToken {
kind: Bogus,
range: 13..14,
},
SimpleToken {
kind: Bogus,
range: 12..13,
},
SimpleToken {
kind: Bogus,
range: 11..12,
},
SimpleToken {
kind: Bogus,
range: 10..11,
},
SimpleToken {
kind: Bogus,
range: 9..10,
},
SimpleToken {
kind: Bogus,
range: 8..9,
},
SimpleToken {
kind: Bogus,
range: 7..8,
},
SimpleToken {
kind: Bogus,
range: 6..7,
},
SimpleToken {
kind: Bogus,
range: 5..6,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
SimpleToken {
kind: Bogus,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 2..3,
},
SimpleToken {
kind: Bogus,
range: 1..2,
},
SimpleToken {
kind: Bogus,
range: 0..1,
},
]

View file

@ -1,4 +1,4 @@
use memchr::memrchr3_iter;
use memchr::{memchr2, memchr3, memrchr3_iter};
use ruff_text_size::{TextLen, TextRange, TextSize};
use unic_ucd_ident::{is_xid_continue, is_xid_start};
@ -18,30 +18,22 @@ pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToke
.next()
}
/// Returns the first non-trivia token right before `offset` or `None` if at the start of the file
/// or all preceding tokens are trivia tokens.
///
/// ## Notes
///
/// Prefer [`first_non_trivia_token`] whenever possible because reverse lookup is expensive because of comments.
pub fn first_non_trivia_token_rev(offset: TextSize, code: &str) -> Option<SimpleToken> {
SimpleTokenizer::up_to(offset, code)
.skip_trivia()
.next_back()
}
/// Returns the number of newlines between `offset` and the first non whitespace character in the source code.
pub fn lines_before(offset: TextSize, code: &str) -> u32 {
let tokens = SimpleTokenizer::up_to(offset, code);
let mut newlines = 0u32;
let mut cursor = Cursor::new(&code[TextRange::up_to(offset)]);
for token in tokens.rev() {
match token.kind() {
SimpleTokenKind::Newline => {
let mut newlines = 0u32;
while let Some(c) = cursor.bump_back() {
match c {
'\n' => {
cursor.eat_char_back('\r');
newlines += 1;
}
SimpleTokenKind::Whitespace => {
// ignore
'\r' => {
newlines += 1;
}
c if is_python_whitespace(c) => {
continue;
}
_ => {
break;
@ -54,16 +46,20 @@ pub fn lines_before(offset: TextSize, code: &str) -> u32 {
/// Counts the empty lines between `offset` and the first non-whitespace character.
pub fn lines_after(offset: TextSize, code: &str) -> u32 {
let tokens = SimpleTokenizer::starts_at(offset, code);
let mut newlines = 0u32;
let mut cursor = Cursor::new(&code[offset.to_usize()..]);
for token in tokens {
match token.kind() {
SimpleTokenKind::Newline => {
let mut newlines = 0u32;
while let Some(c) = cursor.bump() {
match c {
'\n' => {
newlines += 1;
}
SimpleTokenKind::Whitespace => {
// ignore
'\r' => {
cursor.eat_char('\n');
newlines += 1;
}
c if is_python_whitespace(c) => {
continue;
}
_ => {
break;
@ -278,6 +274,8 @@ impl<'a> SimpleTokenizer<'a> {
}
/// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`.
///
/// Consider using [`SimpleTokenizer::up_to_without_back_comment`] if you intend to lex
/// backwards: it avoids paying the cost of detecting whether the token at `offset` is
/// preceded by a comment when the caller already knows it is not.
pub fn up_to(offset: TextSize, source: &'a str) -> Self {
    Self::new(source, TextRange::up_to(offset))
}
@ -423,45 +421,40 @@ impl<'a> SimpleTokenizer<'a> {
// For all other tokens, test if the character isn't part of a comment.
c => {
// Skip the test whether there's a preceding comment if it has been performed before.
let comment_offset = if self.back_line_has_no_comment {
let comment_length = if self.back_line_has_no_comment {
None
} else {
let bytes = self.cursor.chars().as_str().as_bytes();
let mut line_start = 0;
let mut last_comment_offset = None;
let mut potential_comment_starts: smallvec::SmallVec<[TextSize; 2]> =
smallvec::SmallVec::new();
// Find the start of the line, or any potential comments.
for index in memrchr3_iter(b'\n', b'\r', b'#', bytes) {
if bytes[index] == b'#' {
// Potentially a comment, but not guaranteed
last_comment_offset = Some(index);
// SAFETY: Safe, because ruff only supports files up to 4GB
potential_comment_starts.push(TextSize::try_from(index).unwrap());
} else {
line_start = index + 1;
break;
}
}
// Verify if this is indeed a comment. Doing this only when we've found a comment is significantly
// faster because comments are rare.
last_comment_offset.filter(|last_comment_offset| {
let before_comment =
&self.cursor.chars().as_str()[line_start..*last_comment_offset];
before_comment.chars().all(|c| {
is_python_whitespace(c)
|| SimpleTokenKind::from_non_trivia_char(c)
!= SimpleTokenKind::Other
})
})
// No comments
if potential_comment_starts.is_empty() {
None
} else {
// The line contains at least one `#` token. The `#` can indicate the start of a
// comment, meaning the current token is commented out, or it is a regular `#` inside of a string.
self.comment_from_hash_positions(&potential_comment_starts)
}
};
// From here on it is guaranteed that this line has no other comment.
self.back_line_has_no_comment = true;
if let Some(comment_offset) = comment_offset {
let comment_length = self.cursor.chars().as_str().len() - comment_offset;
if let Some(comment_length) = comment_length {
// It is a comment, bump all tokens
for _ in 0..comment_length {
for _ in 0..usize::from(comment_length) {
self.cursor.bump_back().unwrap();
}
@ -519,6 +512,141 @@ impl<'a> SimpleTokenizer<'a> {
/// Consumes the tokenizer, returning an iterator over the remaining tokens with all
/// trivia tokens (as classified by `SimpleTokenKind::is_trivia`) filtered out.
/// The iterator is double-ended, so it supports both forward and backward lexing.
pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + DoubleEndedIterator + 'a {
    self.filter(|t| !t.kind().is_trivia())
}
/// Given the position of `#` tokens on a line, test if any `#` is the start of a comment and, if so, return the
/// length of the comment.
///
/// The challenge is that `#` tokens can also appear inside of strings:
///
/// ```python
/// ' #not a comment'
/// ```
///
/// This looks innocent but is the `'` really the start of the new string or could it be a closing delimiter
/// of a previously started string:
///
/// ```python
/// ' a string\
/// ` # a comment '
/// ```
///
/// The only way to reliably tell whether the `#` is a comment when the comment contains a quote char is
/// to forward lex all strings and comments and test if there's any unclosed string literal. If so, then
/// the hash cannot be a comment.
fn comment_from_hash_positions(&self, hash_positions: &[TextSize]) -> Option<TextSize> {
    // Iterate over the `#` positions from the start to the end of the line.
    // This is necessary to correctly support `a # comment # comment`.
    // (The caller collects the positions with a reverse `memrchr3` search, so the slice
    // is in descending order and `.rev()` yields them front-to-back.)
    for possible_start in hash_positions.iter().rev() {
        let comment_bytes =
            self.source[TextRange::new(*possible_start, self.back_offset)].as_bytes();

        // Test if the comment contains any quotes. If so, then it's possible that the `#` token isn't
        // the start of a comment, but instead part of a string:
        // ```python
        // a + 'a string # not a comment'
        // a + '''a string
        // # not a comment'''
        // ```
        match memchr2(b'\'', b'"', comment_bytes) {
            // Most comments don't contain quotes, and most strings don't contain comments.
            // For these it's safe to assume that they are comments.
            None => return Some(self.cursor.chars().as_str().text_len() - possible_start),
            // Now it gets complicated... There's no good way to know whether this is a string or not.
            // It is necessary to lex all strings and comments from the start to know if it is one or the other.
            Some(_) => {
                if find_unterminated_string_kind(
                    &self.cursor.chars().as_str()[TextRange::up_to(*possible_start)],
                )
                .is_none()
                {
                    // There's no unterminated string at the comment's start position. This *must*
                    // be a comment.
                    return Some(self.cursor.chars().as_str().text_len() - possible_start);
                }

                // This is a hash inside of a string: `'test # not a comment'` continue with the next potential comment on the line.
            }
        }
    }

    // No `#` on the line turned out to be a real comment.
    None
}
}
/// Forward-lexes `input`, skipping over comments and complete string literals, and
/// returns the kind of the first string literal that is still open (unterminated)
/// when `input` ends, or `None` if every string closes.
///
/// Used when lexing backwards to decide whether a `#` starts a real comment: if an
/// unterminated string is open at the `#`'s position, the `#` is string content.
fn find_unterminated_string_kind(input: &str) -> Option<StringKind> {
    let mut rest = input;

    while let Some(comment_or_string_start) = memchr3(b'#', b'\'', b'\"', rest.as_bytes()) {
        let c = rest.as_bytes()[comment_or_string_start];
        let after = &rest[comment_or_string_start + 1..];

        if c == b'#' {
            // A comment: everything up to the end of the line (or of the input) is inert.
            let comment_end = memchr2(b'\n', b'\r', after.as_bytes()).unwrap_or(after.len());
            rest = &after[comment_end..];
        } else {
            let mut cursor = Cursor::new(after);
            let quote_kind = if c == b'\'' {
                QuoteKind::Single
            } else {
                QuoteKind::Double
            };

            let string_kind = if cursor.eat_char(quote_kind.as_char()) {
                // `''` or `""`
                if cursor.eat_char(quote_kind.as_char()) {
                    // `'''` or `"""`
                    StringKind::Triple(quote_kind)
                } else {
                    // Empty string literal, nothing more to lex for this string.
                    // Advance past the consumed quotes before continuing: restarting the
                    // search on the unchanged `rest` would re-find the same opening quote
                    // and loop forever.
                    rest = cursor.chars().as_str();
                    continue;
                }
            } else {
                StringKind::Single(quote_kind)
            };

            if !is_string_terminated(string_kind, &mut cursor) {
                return Some(string_kind);
            }

            rest = cursor.chars().as_str();
        }
    }

    None
}
/// Consumes characters from `cursor` — positioned just past the opening quote(s) of a
/// string literal of the given `kind` — and reports whether the string is closed before
/// the input runs out.
fn is_string_terminated(kind: StringKind, cursor: &mut Cursor) -> bool {
    let quote_char = kind.quote_kind().as_char();

    while let Some(c) = cursor.bump() {
        match c {
            '\n' | '\r' if kind.is_single() => {
                // Reached the end of the line without a closing quote, this is an unterminated string literal.
                return false;
            }
            '\\' => {
                // Skip over escaped quotes that match this string's quotes or double escaped backslashes
                if cursor.eat_char(quote_char) || cursor.eat_char('\\') {
                    continue;
                }
                // Eat over line continuation (`\<newline>`, handling `\r`, `\n`, and `\r\n`)
                cursor.eat_char('\r');
                cursor.eat_char('\n');
            }
            c if c == quote_char => {
                // A single-quoted string closes on one quote; a triple-quoted string
                // needs two more matching quotes to close.
                if kind.is_single() || (cursor.eat_char(quote_char) && cursor.eat_char(quote_char))
                {
                    return true;
                }
            }
            _ => {
                // continue
            }
        }
    }

    // Reached end without a closing quote
    false
}
impl Iterator for SimpleTokenizer<'_> {
@ -547,6 +675,45 @@ impl DoubleEndedIterator for SimpleTokenizer<'_> {
}
}
/// The shape of a Python string literal: single- or triple-quoted, with either
/// quote character.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum StringKind {
    /// `'...'` or `"..."`
    Single(QuoteKind),
    /// `'''...'''` or `"""..."""`
    Triple(QuoteKind),
}

impl StringKind {
    /// The quote character family (`'` vs `"`) used by this string.
    const fn quote_kind(self) -> QuoteKind {
        match self {
            StringKind::Single(kind) | StringKind::Triple(kind) => kind,
        }
    }

    /// `true` for non-triple strings, which must terminate before the end of the line.
    const fn is_single(self) -> bool {
        matches!(self, StringKind::Single(_))
    }
}
/// Which quote character delimits a string literal.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum QuoteKind {
    /// `'`
    Single,
    /// `"`
    Double,
}

impl QuoteKind {
    /// Returns the delimiter as a `char`.
    const fn as_char(self) -> char {
        if matches!(self, QuoteKind::Single) {
            '\''
        } else {
            '"'
        }
    }
}
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
@ -708,6 +875,72 @@ mod tests {
assert_debug_snapshot!("Reverse", test_case.tokenize_reverse());
}
#[test]
fn single_quoted_multiline_string_containing_comment() {
    // A backslash line-continuation keeps the single-quoted string open across the
    // newline, so the `#` on the second line is string content, not a comment.
    let test_case = tokenize(
        r#"'This string contains a hash looking like a comment\
# This is not a comment'"#,
    );

    assert_debug_snapshot!(test_case.tokenize_reverse());
}

#[test]
fn single_quoted_multiline_string_implicit_concatenation() {
    // Implicit concatenation: the closing `'` is followed by a second string.
    // The `#` still sits inside the first (continued) string.
    let test_case = tokenize(
        r#"'This string contains a hash looking like a comment\
# This is' "not_a_comment""#,
    );

    assert_debug_snapshot!(test_case.tokenize_reverse());
}

#[test]
fn triple_quoted_multiline_string_containing_comment() {
    // Triple-quoted strings span lines without continuations; the `#` is string content.
    let test_case = tokenize(
        r#"'''This string contains a hash looking like a comment
# This is not a comment'''"#,
    );

    assert_debug_snapshot!(test_case.tokenize_reverse());
}

#[test]
fn comment_containing_triple_quoted_string() {
    // Quotes after the `#` are comment text, not a new string.
    let test_case = tokenize("'''leading string''' # a comment '''not a string'''");

    assert_debug_snapshot!(test_case.tokenize_reverse());
}

#[test]
fn comment_containing_single_quoted_string() {
    // Same as above for single quotes: the trailing quote pair is comment text.
    let test_case = tokenize("'leading string' # a comment 'not a string'");

    assert_debug_snapshot!(test_case.tokenize_reverse());
}

#[test]
fn string_followed_by_multiple_comments() {
    // Multiple `#` on one line: only the last `#` (outside the string) starts the comment.
    let test_case =
        tokenize(r#"'a string # containing a hash " # and another hash ' # finally a comment"#);

    assert_debug_snapshot!(test_case.tokenize_reverse());
}

#[test]
fn string_with_escaped_quote() {
    // The `\'` does not close the string, so the first `#` is string content.
    let test_case = tokenize(r#"'a string \' # containing a hash ' # finally a comment"#);

    assert_debug_snapshot!(test_case.tokenize_reverse());
}

#[test]
fn string_with_double_escaped_backslash() {
    // `\\` is an escaped backslash, so the following `'` does close the string
    // and the `#` starts a real comment.
    let test_case = tokenize(r#"'a string \\' # a comment '"#);

    assert_debug_snapshot!(test_case.tokenize_reverse());
}
#[test]
fn lines_before_empty_string() {
assert_eq!(lines_before(TextSize::new(0), ""), 0);