mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-10 13:48:35 +00:00

## Summary This is part of the preparation for detecting syntax errors in the parser from https://github.com/astral-sh/ruff/pull/16090/. As suggested in [this comment](https://github.com/astral-sh/ruff/pull/16090/#discussion_r1953084509), I started working on a `ParseOptions` struct that could be stored in the parser. For this initial refactor, I only made it hold the existing `Mode` option, but for syntax errors, we will also need it to have a `PythonVersion`. For that use case, I'm picturing something like a `ParseOptions::with_python_version` method, so you can extend the current calls to something like ```rust ParseOptions::from(mode).with_python_version(settings.target_version) ``` But I thought it was worth adding `ParseOptions` alone without changing any other behavior first. Most of the diff is just updating call sites taking `Mode` to take `ParseOptions::from(Mode)` or those taking `PySourceType`s to take `ParseOptions::from(PySourceType)`. The interesting changes are in the new `parser/options.rs` file and smaller parts of `parser/mod.rs` and `ruff_python_parser/src/lib.rs`. ## Test Plan Existing tests, this should not change any behavior.
408 lines
9.6 KiB
Rust
408 lines
9.6 KiB
Rust
use insta::assert_debug_snapshot;
|
|
|
|
use ruff_python_parser::{parse_unchecked, Mode, ParseOptions};
|
|
use ruff_python_trivia::{lines_after, lines_before, CommentRanges, SimpleToken, SimpleTokenizer};
|
|
use ruff_python_trivia::{BackwardsTokenizer, SimpleTokenKind};
|
|
use ruff_text_size::{TextLen, TextRange, TextSize};
|
|
|
|
struct TokenizationTestCase {
|
|
source: &'static str,
|
|
range: TextRange,
|
|
tokens: Vec<SimpleToken>,
|
|
}
|
|
|
|
impl TokenizationTestCase {
|
|
fn assert_reverse_tokenization(&self) {
|
|
let mut backwards = self.tokenize_reverse();
|
|
|
|
// Re-reverse to get the tokens in forward order.
|
|
backwards.reverse();
|
|
|
|
assert_eq!(&backwards, &self.tokens);
|
|
}
|
|
|
|
fn tokenize_reverse(&self) -> Vec<SimpleToken> {
|
|
let parsed = parse_unchecked(self.source, ParseOptions::from(Mode::Module));
|
|
let comment_ranges = CommentRanges::from(parsed.tokens());
|
|
BackwardsTokenizer::new(self.source, self.range, &comment_ranges).collect()
|
|
}
|
|
|
|
fn tokens(&self) -> &[SimpleToken] {
|
|
&self.tokens
|
|
}
|
|
}
|
|
|
|
fn tokenize_range(source: &'static str, range: TextRange) -> TokenizationTestCase {
|
|
let tokens: Vec<_> = SimpleTokenizer::new(source, range).collect();
|
|
|
|
TokenizationTestCase {
|
|
source,
|
|
range,
|
|
tokens,
|
|
}
|
|
}
|
|
|
|
fn tokenize(source: &'static str) -> TokenizationTestCase {
|
|
tokenize_range(source, TextRange::new(TextSize::new(0), source.text_len()))
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_trivia() {
|
|
let source = "# comment\n # comment";
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_parentheses() {
|
|
let source = "([{}])";
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_comma() {
|
|
let source = ",,,,";
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_eq() {
|
|
// Should tokenize as `==`, then `=`, regardless of whether we're lexing forwards or
|
|
// backwards.
|
|
let source = "===";
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_not_eq() {
|
|
// Should tokenize as `!=`, then `=`, regardless of whether we're lexing forwards or
|
|
// backwards.
|
|
let source = "!==";
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_continuation() {
|
|
let source = "( \\\n )";
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_operators() {
|
|
let source = "-> *= ( -= ) ~ // ** **= ^ ^= | |=";
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_invalid_operators() {
|
|
let source = "-> $=";
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
|
|
// note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
|
|
}
|
|
|
|
#[test]
|
|
fn tricky_unicode() {
|
|
let source = "មុ";
|
|
|
|
let test_case = tokenize(source);
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn identifier_ending_in_non_start_char() {
|
|
let source = "i5";
|
|
|
|
let test_case = tokenize(source);
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn string_with_kind() {
|
|
let source = "f'foo'";
|
|
|
|
let test_case = tokenize(source);
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
|
|
// note: not reversible: [other, bogus] vs [bogus, other]
|
|
}
|
|
|
|
#[test]
|
|
fn string_with_byte_kind() {
|
|
let source = "BR'foo'";
|
|
|
|
let test_case = tokenize(source);
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
|
|
// note: not reversible: [other, bogus] vs [bogus, other]
|
|
}
|
|
|
|
#[test]
|
|
fn string_with_invalid_kind() {
|
|
let source = "abc'foo'";
|
|
|
|
let test_case = tokenize(source);
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
|
|
// note: not reversible: [other, bogus] vs [bogus, other]
|
|
}
|
|
|
|
#[test]
|
|
fn identifier_starting_with_string_kind() {
|
|
let source = "foo bar";
|
|
|
|
let test_case = tokenize(source);
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn ignore_word_with_only_id_continuing_chars() {
|
|
let source = "555";
|
|
|
|
let test_case = tokenize(source);
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
|
|
// note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_multichar() {
|
|
let source = "if in else match";
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_substring() {
|
|
let source = "('some string') # comment";
|
|
|
|
let test_case = tokenize_range(source, TextRange::new(TextSize::new(14), source.text_len()));
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_slash() {
|
|
let source = r" # trailing positional comment
|
|
# Positional arguments only after here
|
|
,/";
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
test_case.assert_reverse_tokenization();
|
|
}
|
|
|
|
#[test]
|
|
fn tokenize_bogus() {
|
|
let source = r#"# leading comment
|
|
"a string"
|
|
a = (10)"#;
|
|
|
|
let test_case = tokenize(source);
|
|
|
|
assert_debug_snapshot!(test_case.tokens());
|
|
assert_debug_snapshot!("Reverse", test_case.tokenize_reverse());
|
|
}
|
|
|
|
#[test]
|
|
fn single_quoted_multiline_string_containing_comment() {
|
|
let test_case = tokenize(
|
|
r"'This string contains a hash looking like a comment\
|
|
# This is not a comment'",
|
|
);
|
|
|
|
assert_debug_snapshot!(test_case.tokenize_reverse());
|
|
}
|
|
|
|
#[test]
|
|
fn single_quoted_multiline_string_implicit_concatenation() {
|
|
let test_case = tokenize(
|
|
r#"'This string contains a hash looking like a comment\
|
|
# This is' "not_a_comment""#,
|
|
);
|
|
|
|
assert_debug_snapshot!(test_case.tokenize_reverse());
|
|
}
|
|
|
|
#[test]
|
|
fn triple_quoted_multiline_string_containing_comment() {
|
|
let test_case = tokenize(
|
|
r"'''This string contains a hash looking like a comment
|
|
# This is not a comment'''",
|
|
);
|
|
|
|
assert_debug_snapshot!(test_case.tokenize_reverse());
|
|
}
|
|
|
|
#[test]
|
|
fn comment_containing_triple_quoted_string() {
|
|
let test_case = tokenize("'''leading string''' # a comment '''not a string'''");
|
|
|
|
assert_debug_snapshot!(test_case.tokenize_reverse());
|
|
}
|
|
|
|
#[test]
|
|
fn comment_containing_single_quoted_string() {
|
|
let test_case = tokenize("'leading string' # a comment 'not a string'");
|
|
|
|
assert_debug_snapshot!(test_case.tokenize_reverse());
|
|
}
|
|
|
|
#[test]
|
|
fn string_followed_by_multiple_comments() {
|
|
let test_case =
|
|
tokenize(r#"'a string # containing a hash " # and another hash ' # finally a comment"#);
|
|
|
|
assert_debug_snapshot!(test_case.tokenize_reverse());
|
|
}
|
|
|
|
#[test]
|
|
fn string_with_escaped_quote() {
|
|
let test_case = tokenize(r"'a string \' # containing a hash ' # finally a comment");
|
|
|
|
assert_debug_snapshot!(test_case.tokenize_reverse());
|
|
}
|
|
|
|
#[test]
|
|
fn string_with_double_escaped_backslash() {
|
|
let test_case = tokenize(r"'a string \\' # a comment '");
|
|
|
|
assert_debug_snapshot!(test_case.tokenize_reverse());
|
|
}
|
|
|
|
#[test]
|
|
fn empty_string_literal() {
|
|
let test_case = tokenize(r"'' # a comment '");
|
|
|
|
assert_debug_snapshot!(test_case.tokenize_reverse());
|
|
}
|
|
|
|
#[test]
|
|
fn lines_before_empty_string() {
|
|
assert_eq!(lines_before(TextSize::new(0), ""), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_before_in_the_middle_of_a_line() {
|
|
assert_eq!(lines_before(TextSize::new(4), "a = 20"), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_before_on_a_new_line() {
|
|
assert_eq!(lines_before(TextSize::new(7), "a = 20\nb = 10"), 1);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_before_multiple_leading_newlines() {
|
|
assert_eq!(lines_before(TextSize::new(9), "a = 20\n\r\nb = 10"), 2);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_before_with_comment_offset() {
|
|
assert_eq!(lines_before(TextSize::new(8), "a = 20\n# a comment"), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_before_with_trailing_comment() {
|
|
assert_eq!(
|
|
lines_before(TextSize::new(22), "a = 20 # some comment\nb = 10"),
|
|
1
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_before_with_comment_only_line() {
|
|
assert_eq!(
|
|
lines_before(TextSize::new(22), "a = 20\n# some comment\nb = 10"),
|
|
1
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_after_empty_string() {
|
|
assert_eq!(lines_after(TextSize::new(0), ""), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_after_in_the_middle_of_a_line() {
|
|
assert_eq!(lines_after(TextSize::new(4), "a = 20"), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_after_before_a_new_line() {
|
|
assert_eq!(lines_after(TextSize::new(6), "a = 20\nb = 10"), 1);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_after_multiple_newlines() {
|
|
assert_eq!(lines_after(TextSize::new(6), "a = 20\n\r\nb = 10"), 2);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_after_before_comment_offset() {
|
|
assert_eq!(lines_after(TextSize::new(7), "a = 20 # a comment\n"), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn lines_after_with_comment_only_line() {
|
|
assert_eq!(
|
|
lines_after(TextSize::new(6), "a = 20\n# some comment\nb = 10"),
|
|
1
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_previous_token_simple() {
|
|
let cases = &["x = (", "x = ( ", "x = (\n"];
|
|
for source in cases {
|
|
let token = BackwardsTokenizer::up_to(source.text_len(), source, &[])
|
|
.skip_trivia()
|
|
.next()
|
|
.unwrap();
|
|
assert_eq!(
|
|
token,
|
|
SimpleToken {
|
|
kind: SimpleTokenKind::LParen,
|
|
range: TextRange::new(TextSize::new(4), TextSize::new(5)),
|
|
}
|
|
);
|
|
}
|
|
}
|