From 2cbe1733c8c4d4d884fba33c4804bce94b31cdd2 Mon Sep 17 00:00:00 2001
From: konsti
Date: Sat, 16 Sep 2023 05:21:45 +0200
Subject: [PATCH] Use CommentRanges in backwards lexing (#7360)

## Summary

The tokenizer was split into a forward and a backwards tokenizer. The
backwards tokenizer uses the same method names as the forward one (e.g.
`next_token`) and receives the comment ranges that we already built, so it
can skip comments while lexing backwards. (Two usage sketches of the new
API follow the diff.)

---------

Co-authored-by: Micha Reiser
---
 Cargo.lock | 4 +-
 crates/ruff/src/checkers/noqa.rs | 3 +-
 crates/ruff/src/noqa.rs | 29 +-
 .../flake8_pytest_style/rules/parametrize.rs | 16 +-
 .../flake8_simplify/rules/ast_bool_op.rs | 22 +-
 .../rules/flake8_simplify/rules/ast_ifexp.rs | 9 +-
 .../flake8_simplify/rules/key_in_dict.rs | 18 +-
 .../pandas_vet/rules/inplace_argument.rs | 14 +-
 crates/ruff/src/rules/pycodestyle/helpers.rs | 20 +-
 .../pycodestyle/rules/literal_comparisons.rs | 1 +
 .../src/rules/pycodestyle/rules/not_tests.rs | 2 +
 crates/ruff/src/rules/pyflakes/fixes.rs | 19 +-
 .../rules/pyflakes/rules/unused_variable.rs | 1 +
 .../pyupgrade/rules/yield_in_for_loop.rs | 1 +
 crates/ruff_python_ast/src/parenthesize.rs | 10 +-
 crates/ruff_python_ast/tests/parenthesize.rs | 57 +-
 .../src/comments/debug.rs | 4 +-
 .../ruff_python_formatter/src/comments/mod.rs | 31 +-
 .../src/comments/placement.rs | 18 +-
 .../src/comments/visitor.rs | 10 +-
 .../src/expression/binary_like.rs | 48 +-
 .../src/expression/expr_attribute.rs | 15 +-
 .../src/expression/expr_call.rs | 13 +-
 .../src/expression/expr_if_exp.rs | 6 +-
 .../src/expression/expr_subscript.rs | 13 +-
 .../src/expression/expr_unary_op.rs | 12 +-
 .../src/expression/mod.rs | 41 +-
 .../src/expression/parentheses.rs | 20 +-
 crates/ruff_python_formatter/src/lib.rs | 5 +-
 .../ruff_python_formatter/src/pattern/mod.rs | 24 +-
 .../src/statement/suite.rs | 8 +-
 .../ruff_python_index/src/comment_ranges.rs | 70 +-
 crates/ruff_python_index/src/indexer.rs | 7 +-
 crates/ruff_python_index/src/lib.rs | 2 +-
 crates/ruff_python_trivia/Cargo.toml | 3 +-
 .../ruff_python_trivia/src/comment_ranges.rs | 71 ++
 crates/ruff_python_trivia/src/cursor.rs | 2 +-
 crates/ruff_python_trivia/src/lib.rs | 2 +
 crates/ruff_python_trivia/src/tokenizer.rs | 712 +++++++-----
 crates/ruff_wasm/Cargo.toml | 1 +
 crates/ruff_wasm/src/lib.rs | 8 +-
 41 files changed, 744 insertions(+), 628 deletions(-)
 create mode 100644 crates/ruff_python_trivia/src/comment_ranges.rs

diff --git a/Cargo.lock b/Cargo.lock
index 488aaa2df9..02ead9b8cf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2430,12 +2430,11 @@ name = "ruff_python_trivia"
 version = "0.0.0"
 dependencies = [
 "insta",
- "memchr",
+ "itertools",
 "ruff_python_ast",
 "ruff_python_parser",
 "ruff_source_file",
 "ruff_text_size",
- "smallvec",
 "unicode-ident",
 ]
@@ -2492,6 +2491,7 @@ dependencies = [
 "ruff_python_formatter",
 "ruff_python_index",
 "ruff_python_parser",
+ "ruff_python_trivia",
 "ruff_source_file",
 "ruff_text_size",
 "ruff_workspace",
diff --git a/crates/ruff/src/checkers/noqa.rs b/crates/ruff/src/checkers/noqa.rs
index a6995bd4ee..0e323ab032 100644
--- a/crates/ruff/src/checkers/noqa.rs
+++ b/crates/ruff/src/checkers/noqa.rs
@@ -6,6 +6,7 @@ use itertools::Itertools;
 use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
 use ruff_diagnostics::{Diagnostic, Edit, Fix};
+use ruff_python_trivia::CommentRanges;
 use ruff_source_file::Locator;
 use crate::noqa;
@@ -19,7 +20,7 @@ pub(crate) fn check_noqa(
 diagnostics: &mut Vec<Diagnostic>,
 path: &Path,
 locator: &Locator,
- comment_ranges: &[TextRange],
+ comment_ranges:
&CommentRanges, noqa_line_for: &NoqaMapping, analyze_directives: bool, settings: &Settings, diff --git a/crates/ruff/src/noqa.rs b/crates/ruff/src/noqa.rs index c905f7bd5a..43fc2f5b97 100644 --- a/crates/ruff/src/noqa.rs +++ b/crates/ruff/src/noqa.rs @@ -11,7 +11,7 @@ use log::warn; use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; use ruff_diagnostics::Diagnostic; -use ruff_python_trivia::indentation_at_offset; +use ruff_python_trivia::{indentation_at_offset, CommentRanges}; use ruff_source_file::{LineEnding, Locator}; use crate::codes::NoqaCode; @@ -234,7 +234,7 @@ impl FileExemption { /// globally ignored within the file. pub(crate) fn try_extract( contents: &str, - comment_ranges: &[TextRange], + comment_ranges: &CommentRanges, path: &Path, locator: &Locator, ) -> Option { @@ -457,7 +457,7 @@ pub(crate) fn add_noqa( path: &Path, diagnostics: &[Diagnostic], locator: &Locator, - commented_lines: &[TextRange], + comment_ranges: &CommentRanges, noqa_line_for: &NoqaMapping, line_ending: LineEnding, ) -> Result { @@ -465,7 +465,7 @@ pub(crate) fn add_noqa( path, diagnostics, locator, - commented_lines, + comment_ranges, noqa_line_for, line_ending, ); @@ -477,7 +477,7 @@ fn add_noqa_inner( path: &Path, diagnostics: &[Diagnostic], locator: &Locator, - commented_ranges: &[TextRange], + comment_ranges: &CommentRanges, noqa_line_for: &NoqaMapping, line_ending: LineEnding, ) -> (usize, String) { @@ -487,8 +487,8 @@ fn add_noqa_inner( // Whether the file is exempted from all checks. // Codes that are globally exempted (within the current file). - let exemption = FileExemption::try_extract(locator.contents(), commented_ranges, path, locator); - let directives = NoqaDirectives::from_commented_ranges(commented_ranges, path, locator); + let exemption = FileExemption::try_extract(locator.contents(), comment_ranges, path, locator); + let directives = NoqaDirectives::from_commented_ranges(comment_ranges, path, locator); // Mark any non-ignored diagnostics. 
for diagnostic in diagnostics { @@ -658,7 +658,7 @@ pub(crate) struct NoqaDirectives<'a> { impl<'a> NoqaDirectives<'a> { pub(crate) fn from_commented_ranges( - comment_ranges: &[TextRange], + comment_ranges: &CommentRanges, path: &Path, locator: &'a Locator<'a>, ) -> Self { @@ -800,6 +800,7 @@ mod tests { use ruff_text_size::{TextRange, TextSize}; use ruff_diagnostics::Diagnostic; + use ruff_python_trivia::CommentRanges; use ruff_source_file::{LineEnding, Locator}; use crate::noqa::{add_noqa_inner, Directive, NoqaMapping, ParsedFileExemption}; @@ -997,7 +998,7 @@ mod tests { path, &[], &Locator::new(contents), - &[], + &CommentRanges::default(), &noqa_line_for, LineEnding::Lf, ); @@ -1017,7 +1018,7 @@ mod tests { path, &diagnostics, &Locator::new(contents), - &[], + &CommentRanges::default(), &noqa_line_for, LineEnding::Lf, ); @@ -1038,11 +1039,13 @@ mod tests { ]; let contents = "x = 1 # noqa: E741\n"; let noqa_line_for = NoqaMapping::default(); + let comment_ranges = + CommentRanges::new(vec![TextRange::new(TextSize::from(7), TextSize::from(19))]); let (count, output) = add_noqa_inner( path, &diagnostics, &Locator::new(contents), - &[TextRange::new(TextSize::from(7), TextSize::from(19))], + &comment_ranges, &noqa_line_for, LineEnding::Lf, ); @@ -1063,11 +1066,13 @@ mod tests { ]; let contents = "x = 1 # noqa"; let noqa_line_for = NoqaMapping::default(); + let comment_ranges = + CommentRanges::new(vec![TextRange::new(TextSize::from(7), TextSize::from(13))]); let (count, output) = add_noqa_inner( path, &diagnostics, &Locator::new(contents), - &[TextRange::new(TextSize::from(7), TextSize::from(13))], + &comment_ranges, &noqa_line_for, LineEnding::Lf, ); diff --git a/crates/ruff/src/rules/flake8_pytest_style/rules/parametrize.rs b/crates/ruff/src/rules/flake8_pytest_style/rules/parametrize.rs index fbe674dc1c..293c4b8f08 100644 --- a/crates/ruff/src/rules/flake8_pytest_style/rules/parametrize.rs +++ b/crates/ruff/src/rules/flake8_pytest_style/rules/parametrize.rs @@ -9,6 +9,7 @@ use ruff_python_ast::node::AstNode; use ruff_python_ast::parenthesize::parenthesized_range; use ruff_python_ast::{self as ast, Arguments, Constant, Decorator, Expr, ExprContext}; use ruff_python_codegen::Generator; +use ruff_python_trivia::CommentRanges; use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer}; use ruff_text_size::{Ranged, TextRange, TextSize}; @@ -298,12 +299,17 @@ fn elts_to_csv(elts: &[Expr], generator: Generator) -> Option { fn get_parametrize_name_range( decorator: &Decorator, expr: &Expr, + comment_ranges: &CommentRanges, source: &str, ) -> Option { - decorator - .expression - .as_call_expr() - .and_then(|call| parenthesized_range(expr.into(), call.arguments.as_any_node_ref(), source)) + decorator.expression.as_call_expr().and_then(|call| { + parenthesized_range( + expr.into(), + call.arguments.as_any_node_ref(), + comment_ranges, + source, + ) + }) } /// PT006 @@ -322,6 +328,7 @@ fn check_names(checker: &mut Checker, decorator: &Decorator, expr: &Expr) { let name_range = get_parametrize_name_range( decorator, expr, + checker.indexer().comment_ranges(), checker.locator().contents(), ) .unwrap_or(expr.range()); @@ -356,6 +363,7 @@ fn check_names(checker: &mut Checker, decorator: &Decorator, expr: &Expr) { let name_range = get_parametrize_name_range( decorator, expr, + checker.indexer().comment_ranges(), checker.locator().contents(), ) .unwrap_or(expr.range()); diff --git a/crates/ruff/src/rules/flake8_simplify/rules/ast_bool_op.rs b/crates/ruff/src/rules/flake8_simplify/rules/ast_bool_op.rs 
index ea895e5123..b60b5de122 100644 --- a/crates/ruff/src/rules/flake8_simplify/rules/ast_bool_op.rs +++ b/crates/ruff/src/rules/flake8_simplify/rules/ast_bool_op.rs @@ -773,9 +773,14 @@ fn is_short_circuit( edit = Some(get_short_circuit_edit( value, TextRange::new( - parenthesized_range(furthest.into(), expr.into(), checker.locator().contents()) - .unwrap_or(furthest.range()) - .start(), + parenthesized_range( + furthest.into(), + expr.into(), + checker.indexer().comment_ranges(), + checker.locator().contents(), + ) + .unwrap_or(furthest.range()) + .start(), expr.end(), ), short_circuit_truthiness, @@ -796,9 +801,14 @@ fn is_short_circuit( edit = Some(get_short_circuit_edit( next_value, TextRange::new( - parenthesized_range(furthest.into(), expr.into(), checker.locator().contents()) - .unwrap_or(furthest.range()) - .start(), + parenthesized_range( + furthest.into(), + expr.into(), + checker.indexer().comment_ranges(), + checker.locator().contents(), + ) + .unwrap_or(furthest.range()) + .start(), expr.end(), ), short_circuit_truthiness, diff --git a/crates/ruff/src/rules/flake8_simplify/rules/ast_ifexp.rs b/crates/ruff/src/rules/flake8_simplify/rules/ast_ifexp.rs index 1dc4919308..f5d890f61b 100644 --- a/crates/ruff/src/rules/flake8_simplify/rules/ast_ifexp.rs +++ b/crates/ruff/src/rules/flake8_simplify/rules/ast_ifexp.rs @@ -163,8 +163,13 @@ pub(crate) fn if_expr_with_true_false( checker .locator() .slice( - parenthesized_range(test.into(), expr.into(), checker.locator().contents()) - .unwrap_or(test.range()), + parenthesized_range( + test.into(), + expr.into(), + checker.indexer().comment_ranges(), + checker.locator().contents(), + ) + .unwrap_or(test.range()), ) .to_string(), expr.range(), diff --git a/crates/ruff/src/rules/flake8_simplify/rules/key_in_dict.rs b/crates/ruff/src/rules/flake8_simplify/rules/key_in_dict.rs index 9fe20266f2..bddbeffc84 100644 --- a/crates/ruff/src/rules/flake8_simplify/rules/key_in_dict.rs +++ b/crates/ruff/src/rules/flake8_simplify/rules/key_in_dict.rs @@ -88,10 +88,20 @@ fn key_in_dict( } // Extract the exact range of the left and right expressions. 
- let left_range = parenthesized_range(left.into(), parent, checker.locator().contents()) - .unwrap_or(left.range()); - let right_range = parenthesized_range(right.into(), parent, checker.locator().contents()) - .unwrap_or(right.range()); + let left_range = parenthesized_range( + left.into(), + parent, + checker.indexer().comment_ranges(), + checker.locator().contents(), + ) + .unwrap_or(left.range()); + let right_range = parenthesized_range( + right.into(), + parent, + checker.indexer().comment_ranges(), + checker.locator().contents(), + ) + .unwrap_or(right.range()); let mut diagnostic = Diagnostic::new( InDictKeys { diff --git a/crates/ruff/src/rules/pandas_vet/rules/inplace_argument.rs b/crates/ruff/src/rules/pandas_vet/rules/inplace_argument.rs index 0c88965082..417b769591 100644 --- a/crates/ruff/src/rules/pandas_vet/rules/inplace_argument.rs +++ b/crates/ruff/src/rules/pandas_vet/rules/inplace_argument.rs @@ -3,6 +3,7 @@ use ruff_macros::{derive_message_formats, violation}; use ruff_python_ast::helpers::is_const_true; use ruff_python_ast::parenthesize::parenthesized_range; use ruff_python_ast::{self as ast, Keyword, Stmt}; +use ruff_python_trivia::CommentRanges; use ruff_source_file::Locator; use ruff_text_size::Ranged; @@ -85,6 +86,7 @@ pub(crate) fn inplace_argument(checker: &mut Checker, call: &ast::ExprCall) { call, keyword, statement, + checker.indexer().comment_ranges(), checker.locator(), ) { diagnostic.set_fix(fix); @@ -107,15 +109,21 @@ fn convert_inplace_argument_to_assignment( call: &ast::ExprCall, keyword: &Keyword, statement: &Stmt, + comment_ranges: &CommentRanges, locator: &Locator, ) -> Option { // Add the assignment. let attr = call.func.as_attribute_expr()?; let insert_assignment = Edit::insertion( format!("{name} = ", name = locator.slice(attr.value.range())), - parenthesized_range(call.into(), statement.into(), locator.contents()) - .unwrap_or(call.range()) - .start(), + parenthesized_range( + call.into(), + statement.into(), + comment_ranges, + locator.contents(), + ) + .unwrap_or(call.range()) + .start(), ); // Remove the `inplace` argument. diff --git a/crates/ruff/src/rules/pycodestyle/helpers.rs b/crates/ruff/src/rules/pycodestyle/helpers.rs index ce8bd311a3..9b01107260 100644 --- a/crates/ruff/src/rules/pycodestyle/helpers.rs +++ b/crates/ruff/src/rules/pycodestyle/helpers.rs @@ -3,6 +3,7 @@ use unicode_width::UnicodeWidthStr; use ruff_python_ast::node::AnyNodeRef; use ruff_python_ast::parenthesize::parenthesized_range; use ruff_python_ast::{CmpOp, Expr}; +use ruff_python_trivia::CommentRanges; use ruff_source_file::{Line, Locator}; use ruff_text_size::{Ranged, TextLen, TextRange}; @@ -17,6 +18,7 @@ pub(super) fn generate_comparison( ops: &[CmpOp], comparators: &[Expr], parent: AnyNodeRef, + comment_ranges: &CommentRanges, locator: &Locator, ) -> String { let start = left.start(); @@ -24,9 +26,12 @@ pub(super) fn generate_comparison( let mut contents = String::with_capacity(usize::from(end - start)); // Add the left side of the comparison. - contents.push_str(locator.slice( - parenthesized_range(left.into(), parent, locator.contents()).unwrap_or(left.range()), - )); + contents.push_str( + locator.slice( + parenthesized_range(left.into(), parent, comment_ranges, locator.contents()) + .unwrap_or(left.range()), + ), + ); for (op, comparator) in ops.iter().zip(comparators) { // Add the operator. @@ -46,8 +51,13 @@ pub(super) fn generate_comparison( // Add the right side of the comparison. 
contents.push_str(
 locator.slice(
- parenthesized_range(comparator.into(), parent, locator.contents())
- .unwrap_or(comparator.range()),
+ parenthesized_range(
+ comparator.into(),
+ parent,
+ comment_ranges,
+ locator.contents(),
+ )
+ .unwrap_or(comparator.range()),
 ),
 );
 }
diff --git a/crates/ruff/src/rules/pycodestyle/rules/literal_comparisons.rs b/crates/ruff/src/rules/pycodestyle/rules/literal_comparisons.rs
index 20536c4125..2abf9faf65 100644
--- a/crates/ruff/src/rules/pycodestyle/rules/literal_comparisons.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/literal_comparisons.rs
@@ -283,6 +283,7 @@ pub(crate) fn literal_comparisons(checker: &mut Checker, compare: &ast::ExprComp
 &ops,
 &compare.comparators,
 compare.into(),
+ checker.indexer().comment_ranges(),
 checker.locator(),
 );
 for diagnostic in &mut diagnostics {
diff --git a/crates/ruff/src/rules/pycodestyle/rules/not_tests.rs b/crates/ruff/src/rules/pycodestyle/rules/not_tests.rs
index 4eff0c9670..024b685ca2 100644
--- a/crates/ruff/src/rules/pycodestyle/rules/not_tests.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/not_tests.rs
@@ -100,6 +100,7 @@ pub(crate) fn not_tests(checker: &mut Checker, unary_op: &ast::ExprUnaryOp) {
 &[CmpOp::NotIn],
 comparators,
 unary_op.into(),
+ checker.indexer().comment_ranges(),
 checker.locator(),
 ),
 unary_op.range(),
@@ -118,6 +119,7 @@ pub(crate) fn not_tests(checker: &mut Checker, unary_op: &ast::ExprUnaryOp) {
 &[CmpOp::IsNot],
 comparators,
 unary_op.into(),
+ checker.indexer().comment_ranges(),
 checker.locator(),
 ),
 unary_op.range(),
diff --git a/crates/ruff/src/rules/pyflakes/fixes.rs b/crates/ruff/src/rules/pyflakes/fixes.rs
index ae6006d8d7..7f0675de9e 100644
--- a/crates/ruff/src/rules/pyflakes/fixes.rs
+++ b/crates/ruff/src/rules/pyflakes/fixes.rs
@@ -4,7 +4,7 @@ use ruff_diagnostics::Edit;
 use ruff_python_ast as ast;
 use ruff_python_codegen::Stylist;
 use ruff_python_semantic::Binding;
-use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer};
+use ruff_python_trivia::{BackwardsTokenizer, SimpleTokenKind, SimpleTokenizer};
 use ruff_source_file::Locator;
 use ruff_text_size::Ranged;
@@ -91,20 +91,27 @@ pub(crate) fn remove_exception_handler_assignment(
 bound_exception: &Binding,
 locator: &Locator,
 ) -> Result<Edit> {
- // Lex backwards, to the token just before the `as`.
+ // Find the position just after the exception name. This is a late pass, so we only have the
+ // binding and can no longer look up its parent in the AST.
+ // ```
+ // except ZeroDivisionError as err:
+ //                              ^^^ This is the bound_exception range
+ //                          ^^^^ lex this range
+ //                          ^ preceding_end (we want to remove from here)
+ // ```
+ // There can't be any comments in that range.
 let mut tokenizer =
- SimpleTokenizer::up_to_without_back_comment(bound_exception.start(), locator.contents())
- .skip_trivia();
+ BackwardsTokenizer::up_to(bound_exception.start(), locator.contents(), &[]).skip_trivia();
 // Eat the `as` token.
 let preceding = tokenizer
- .next_back()
+ .next()
 .context("expected the exception name to be preceded by `as`")?;
 debug_assert!(matches!(preceding.kind, SimpleTokenKind::As));
 // Lex to the end of the preceding token, which should be the exception value.
 let preceding = tokenizer
- .next_back()
+ .next()
 .context("expected the exception name to be preceded by a token")?;
 // Lex forwards, to the `:` token.
diff --git a/crates/ruff/src/rules/pyflakes/rules/unused_variable.rs b/crates/ruff/src/rules/pyflakes/rules/unused_variable.rs index 89eebff824..64aaa16e75 100644 --- a/crates/ruff/src/rules/pyflakes/rules/unused_variable.rs +++ b/crates/ruff/src/rules/pyflakes/rules/unused_variable.rs @@ -224,6 +224,7 @@ fn remove_unused_variable(binding: &Binding, checker: &Checker) -> Option { let start = parenthesized_range( target.into(), statement.into(), + checker.indexer().comment_ranges(), checker.locator().contents(), ) .unwrap_or(target.range()) diff --git a/crates/ruff/src/rules/pyupgrade/rules/yield_in_for_loop.rs b/crates/ruff/src/rules/pyupgrade/rules/yield_in_for_loop.rs index fb97cd42fd..c9558966e5 100644 --- a/crates/ruff/src/rules/pyupgrade/rules/yield_in_for_loop.rs +++ b/crates/ruff/src/rules/pyupgrade/rules/yield_in_for_loop.rs @@ -108,6 +108,7 @@ pub(crate) fn yield_in_for_loop(checker: &mut Checker, stmt_for: &ast::StmtFor) parenthesized_range( iter.as_ref().into(), stmt_for.into(), + checker.indexer().comment_ranges(), checker.locator().contents(), ) .unwrap_or(iter.range()), diff --git a/crates/ruff_python_ast/src/parenthesize.rs b/crates/ruff_python_ast/src/parenthesize.rs index f86f09028b..8c6b441a98 100644 --- a/crates/ruff_python_ast/src/parenthesize.rs +++ b/crates/ruff_python_ast/src/parenthesize.rs @@ -1,5 +1,5 @@ -use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer}; -use ruff_text_size::{Ranged, TextRange, TextSize}; +use ruff_python_trivia::{BackwardsTokenizer, CommentRanges, SimpleTokenKind, SimpleTokenizer}; +use ruff_text_size::{Ranged, TextLen, TextRange}; use crate::node::AnyNodeRef; use crate::ExpressionRef; @@ -9,6 +9,7 @@ use crate::ExpressionRef; pub fn parenthesized_range( expr: ExpressionRef, parent: AnyNodeRef, + comment_ranges: &CommentRanges, source: &str, ) -> Option { // If the parent is a node that brings its own parentheses, exclude the closing parenthesis @@ -23,7 +24,7 @@ pub fn parenthesized_range( // - `Tuple`: The elements of a tuple. The only risk is a single-element tuple (e.g., `(x,)`), // which must have a trailing comma anyway. let exclusive_parent_end = if parent.is_arguments() { - parent.end() - TextSize::new(1) + parent.end() - ")".text_len() } else { parent.end() }; @@ -33,9 +34,8 @@ pub fn parenthesized_range( .skip_trivia() .take_while(|token| token.kind == SimpleTokenKind::RParen); - let left_tokenizer = SimpleTokenizer::up_to_without_back_comment(expr.start(), source) + let left_tokenizer = BackwardsTokenizer::up_to(expr.start(), source, comment_ranges) .skip_trivia() - .rev() .take_while(|token| token.kind == SimpleTokenKind::LParen); // Zip closing parenthesis with opening parenthesis. 
The order is intentional, as testing for diff --git a/crates/ruff_python_ast/tests/parenthesize.rs b/crates/ruff_python_ast/tests/parenthesize.rs index 9713e7b7dd..69a68f7f82 100644 --- a/crates/ruff_python_ast/tests/parenthesize.rs +++ b/crates/ruff_python_ast/tests/parenthesize.rs @@ -1,5 +1,6 @@ use ruff_python_ast::parenthesize::parenthesized_range; use ruff_python_parser::parse_expression; +use ruff_python_trivia::CommentRanges; use ruff_text_size::TextRange; #[test] @@ -10,7 +11,12 @@ fn test_parenthesized_name() { let bin_op = expr.as_bin_op_expr().unwrap(); let name = bin_op.left.as_ref(); - let parenthesized = parenthesized_range(name.into(), bin_op.into(), source_code); + let parenthesized = parenthesized_range( + name.into(), + bin_op.into(), + &CommentRanges::default(), + source_code, + ); assert_eq!(parenthesized, Some(TextRange::new(0.into(), 3.into()))); } @@ -22,7 +28,12 @@ fn test_non_parenthesized_name() { let bin_op = expr.as_bin_op_expr().unwrap(); let name = bin_op.left.as_ref(); - let parenthesized = parenthesized_range(name.into(), bin_op.into(), source_code); + let parenthesized = parenthesized_range( + name.into(), + bin_op.into(), + &CommentRanges::default(), + source_code, + ); assert_eq!(parenthesized, None); } @@ -35,7 +46,12 @@ fn test_parenthesized_argument() { let arguments = &call.arguments; let argument = arguments.args.first().unwrap(); - let parenthesized = parenthesized_range(argument.into(), arguments.into(), source_code); + let parenthesized = parenthesized_range( + argument.into(), + arguments.into(), + &CommentRanges::default(), + source_code, + ); assert_eq!(parenthesized, Some(TextRange::new(2.into(), 5.into()))); } @@ -48,7 +64,12 @@ fn test_non_parenthesized_argument() { let arguments = &call.arguments; let argument = arguments.args.first().unwrap(); - let parenthesized = parenthesized_range(argument.into(), arguments.into(), source_code); + let parenthesized = parenthesized_range( + argument.into(), + arguments.into(), + &CommentRanges::default(), + source_code, + ); assert_eq!(parenthesized, None); } @@ -60,7 +81,12 @@ fn test_parenthesized_tuple_member() { let tuple = expr.as_tuple_expr().unwrap(); let member = tuple.elts.last().unwrap(); - let parenthesized = parenthesized_range(member.into(), tuple.into(), source_code); + let parenthesized = parenthesized_range( + member.into(), + tuple.into(), + &CommentRanges::default(), + source_code, + ); assert_eq!(parenthesized, Some(TextRange::new(4.into(), 7.into()))); } @@ -72,7 +98,12 @@ fn test_non_parenthesized_tuple_member() { let tuple = expr.as_tuple_expr().unwrap(); let member = tuple.elts.last().unwrap(); - let parenthesized = parenthesized_range(member.into(), tuple.into(), source_code); + let parenthesized = parenthesized_range( + member.into(), + tuple.into(), + &CommentRanges::default(), + source_code, + ); assert_eq!(parenthesized, None); } @@ -84,7 +115,12 @@ fn test_twice_parenthesized_name() { let bin_op = expr.as_bin_op_expr().unwrap(); let name = bin_op.left.as_ref(); - let parenthesized = parenthesized_range(name.into(), bin_op.into(), source_code); + let parenthesized = parenthesized_range( + name.into(), + bin_op.into(), + &CommentRanges::default(), + source_code, + ); assert_eq!(parenthesized, Some(TextRange::new(0.into(), 5.into()))); } @@ -97,6 +133,11 @@ fn test_twice_parenthesized_argument() { let arguments = &call.arguments; let argument = arguments.args.first().unwrap(); - let parenthesized = parenthesized_range(argument.into(), arguments.into(), source_code); + let 
parenthesized = parenthesized_range( + argument.into(), + arguments.into(), + &CommentRanges::default(), + source_code, + ); assert_eq!(parenthesized, Some(TextRange::new(2.into(), 11.into()))); } diff --git a/crates/ruff_python_formatter/src/comments/debug.rs b/crates/ruff_python_formatter/src/comments/debug.rs index baaf405f11..0d9057b85a 100644 --- a/crates/ruff_python_formatter/src/comments/debug.rs +++ b/crates/ruff_python_formatter/src/comments/debug.rs @@ -182,6 +182,7 @@ mod tests { use ruff_formatter::SourceCode; use ruff_python_ast::node::AnyNode; use ruff_python_ast::{StmtBreak, StmtContinue}; + use ruff_python_trivia::CommentRanges; use ruff_text_size::{TextRange, TextSize}; use crate::comments::map::MultiMap; @@ -231,7 +232,8 @@ break; ), ); - let comments = Comments::new(comments_map); + let comment_ranges = CommentRanges::default(); + let comments = Comments::new(comments_map, &comment_ranges); assert_debug_snapshot!(comments.debug(source_code)); } diff --git a/crates/ruff_python_formatter/src/comments/mod.rs b/crates/ruff_python_formatter/src/comments/mod.rs index 4d6eac168c..3e99fc0942 100644 --- a/crates/ruff_python_formatter/src/comments/mod.rs +++ b/crates/ruff_python_formatter/src/comments/mod.rs @@ -99,8 +99,7 @@ use ruff_formatter::{SourceCode, SourceCodeSlice}; use ruff_python_ast::node::AnyNodeRef; use ruff_python_ast::visitor::preorder::{PreorderVisitor, TraversalSignal}; use ruff_python_ast::Mod; -use ruff_python_index::CommentRanges; -use ruff_python_trivia::PythonWhitespace; +use ruff_python_trivia::{CommentRanges, PythonWhitespace}; use ruff_source_file::Locator; use ruff_text_size::{Ranged, TextRange}; @@ -281,7 +280,7 @@ type CommentsMap<'a> = MultiMap, SourceComment>; /// The comments of a syntax tree stored by node. /// /// Cloning `comments` is cheap as it only involves bumping a reference counter. -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] pub(crate) struct Comments<'a> { /// The implementation uses an [Rc] so that [Comments] has a lifetime independent from the [crate::Formatter]. /// Independent lifetimes are necessary to support the use case where a (formattable object)[crate::Format] @@ -306,15 +305,31 @@ pub(crate) struct Comments<'a> { /// } /// ``` data: Rc>, + /// We need those for backwards lexing + comment_ranges: &'a CommentRanges, } impl<'a> Comments<'a> { - fn new(comments: CommentsMap<'a>) -> Self { + fn new(comments: CommentsMap<'a>, comment_ranges: &'a CommentRanges) -> Self { Self { data: Rc::new(CommentsData { comments }), + comment_ranges, } } + /// Effectively a [`Default`] implementation that works around the lifetimes for tests + #[cfg(test)] + pub(crate) fn from_ranges(comment_ranges: &'a CommentRanges) -> Self { + Self { + data: Rc::new(CommentsData::default()), + comment_ranges, + } + } + + pub(crate) fn ranges(&self) -> &'a CommentRanges { + self.comment_ranges + } + /// Extracts the comments from the AST. pub(crate) fn from_ast( root: &'a Mod, @@ -324,12 +339,13 @@ impl<'a> Comments<'a> { let map = if comment_ranges.is_empty() { CommentsMap::new() } else { - let mut builder = CommentsMapBuilder::new(Locator::new(source_code.as_str())); + let mut builder = + CommentsMapBuilder::new(Locator::new(source_code.as_str()), comment_ranges); CommentsVisitor::new(source_code, comment_ranges, &mut builder).visit(root); builder.finish() }; - Self::new(map) + Self::new(map, comment_ranges) } /// Returns `true` if the given `node` has any comments. 
@@ -528,9 +544,10 @@ mod tests { use ruff_formatter::SourceCode; use ruff_python_ast::Mod; - use ruff_python_index::{CommentRanges, CommentRangesBuilder}; + use ruff_python_index::CommentRangesBuilder; use ruff_python_parser::lexer::lex; use ruff_python_parser::{parse_tokens, Mode}; + use ruff_python_trivia::CommentRanges; use crate::comments::Comments; diff --git a/crates/ruff_python_formatter/src/comments/placement.rs b/crates/ruff_python_formatter/src/comments/placement.rs index 6557a4fb9b..1d4ebcce99 100644 --- a/crates/ruff_python_formatter/src/comments/placement.rs +++ b/crates/ruff_python_formatter/src/comments/placement.rs @@ -4,7 +4,8 @@ use ruff_python_ast::node::AnyNodeRef; use ruff_python_ast::whitespace::indentation; use ruff_python_ast::{self as ast, Comprehension, Expr, MatchCase, Parameters}; use ruff_python_trivia::{ - find_only_token_in_range, indentation_at_offset, SimpleToken, SimpleTokenKind, SimpleTokenizer, + find_only_token_in_range, indentation_at_offset, BackwardsTokenizer, CommentRanges, + SimpleToken, SimpleTokenKind, SimpleTokenizer, }; use ruff_source_file::Locator; use ruff_text_size::{Ranged, TextLen, TextRange}; @@ -20,12 +21,13 @@ use crate::pattern::pattern_match_sequence::SequenceType; /// Manually attach comments to nodes that the default placement gets wrong. pub(super) fn place_comment<'a>( comment: DecoratedComment<'a>, + comment_ranges: &CommentRanges, locator: &Locator, ) -> CommentPlacement<'a> { handle_parenthesized_comment(comment, locator) .or_else(|comment| handle_end_of_line_comment_around_body(comment, locator)) .or_else(|comment| handle_own_line_comment_around_body(comment, locator)) - .or_else(|comment| handle_enclosed_comment(comment, locator)) + .or_else(|comment| handle_enclosed_comment(comment, comment_ranges, locator)) } /// Handle parenthesized comments. A parenthesized comment is a comment that appears within a @@ -172,6 +174,7 @@ fn handle_parenthesized_comment<'a>( /// Handle a comment that is enclosed by a node. 
fn handle_enclosed_comment<'a>( comment: DecoratedComment<'a>, + comment_ranges: &CommentRanges, locator: &Locator, ) -> CommentPlacement<'a> { match comment.enclosing_node() { @@ -213,13 +216,15 @@ fn handle_enclosed_comment<'a>( AnyNodeRef::ExprDict(_) => handle_dict_unpacking_comment(comment, locator) .or_else(|comment| handle_bracketed_end_of_line_comment(comment, locator)), AnyNodeRef::ExprIfExp(expr_if) => handle_expr_if_comment(comment, expr_if, locator), - AnyNodeRef::ExprSlice(expr_slice) => handle_slice_comments(comment, expr_slice, locator), + AnyNodeRef::ExprSlice(expr_slice) => { + handle_slice_comments(comment, expr_slice, comment_ranges, locator) + } AnyNodeRef::ExprStarred(starred) => { handle_trailing_expression_starred_star_end_of_line_comment(comment, starred, locator) } AnyNodeRef::ExprSubscript(expr_subscript) => { if let Expr::Slice(expr_slice) = expr_subscript.slice.as_ref() { - handle_slice_comments(comment, expr_slice, locator) + handle_slice_comments(comment, expr_slice, comment_ranges, locator) } else { CommentPlacement::Default(comment) } @@ -958,6 +963,7 @@ fn handle_module_level_own_line_comment_before_class_or_function_comment<'a>( fn handle_slice_comments<'a>( comment: DecoratedComment<'a>, expr_slice: &'a ast::ExprSlice, + comment_ranges: &CommentRanges, locator: &Locator, ) -> CommentPlacement<'a> { let ast::ExprSlice { @@ -969,9 +975,9 @@ fn handle_slice_comments<'a>( // Check for `foo[ # comment`, but only if they are on the same line let after_lbracket = matches!( - SimpleTokenizer::up_to_without_back_comment(comment.start(), locator.contents()) + BackwardsTokenizer::up_to(comment.start(), locator.contents(), comment_ranges) .skip_trivia() - .next_back(), + .next(), Some(SimpleToken { kind: SimpleTokenKind::LBracket, .. diff --git a/crates/ruff_python_formatter/src/comments/visitor.rs b/crates/ruff_python_formatter/src/comments/visitor.rs index c291e635e9..91819acc39 100644 --- a/crates/ruff_python_formatter/src/comments/visitor.rs +++ b/crates/ruff_python_formatter/src/comments/visitor.rs @@ -8,8 +8,7 @@ use ruff_python_ast::{Mod, Stmt}; // pre-order. #[allow(clippy::wildcard_imports)] use ruff_python_ast::visitor::preorder::*; -use ruff_python_index::CommentRanges; -use ruff_python_trivia::is_python_whitespace; +use ruff_python_trivia::{is_python_whitespace, CommentRanges}; use ruff_source_file::Locator; use ruff_text_size::{Ranged, TextRange, TextSize}; @@ -536,12 +535,14 @@ impl<'a> PushComment<'a> for CommentsVecBuilder<'a> { /// [`CommentsMap`]. 
pub(super) struct CommentsMapBuilder<'a> { comments: CommentsMap<'a>, + /// We need those for backwards lexing + comment_ranges: &'a CommentRanges, locator: Locator<'a>, } impl<'a> PushComment<'a> for CommentsMapBuilder<'a> { fn push_comment(&mut self, placement: DecoratedComment<'a>) { - let placement = place_comment(placement, &self.locator); + let placement = place_comment(placement, self.comment_ranges, &self.locator); match placement { CommentPlacement::Leading { node, comment } => { self.push_leading_comment(node, comment); @@ -603,9 +604,10 @@ impl<'a> PushComment<'a> for CommentsMapBuilder<'a> { } impl<'a> CommentsMapBuilder<'a> { - pub(crate) fn new(locator: Locator<'a>) -> Self { + pub(crate) fn new(locator: Locator<'a>, comment_ranges: &'a CommentRanges) -> Self { Self { comments: CommentsMap::default(), + comment_ranges, locator, } } diff --git a/crates/ruff_python_formatter/src/expression/binary_like.rs b/crates/ruff_python_formatter/src/expression/binary_like.rs index 55e55c8037..8e4ab0f93b 100644 --- a/crates/ruff_python_formatter/src/expression/binary_like.rs +++ b/crates/ruff_python_formatter/src/expression/binary_like.rs @@ -8,6 +8,7 @@ use ruff_python_ast::{ Constant, Expr, ExprAttribute, ExprBinOp, ExprBoolOp, ExprCompare, ExprConstant, ExprUnaryOp, UnaryOp, }; +use ruff_python_trivia::CommentRanges; use ruff_python_trivia::{SimpleToken, SimpleTokenKind, SimpleTokenizer}; use ruff_text_size::{Ranged, TextRange}; @@ -179,7 +180,13 @@ impl<'a> BinaryLike<'a> { ) { let expression = operand.expression(); match expression { - Expr::BinOp(binary) if !is_expression_parenthesized(expression.into(), source) => { + Expr::BinOp(binary) + if !is_expression_parenthesized( + expression.into(), + comments.ranges(), + source, + ) => + { let leading_comments = operand .leading_binary_comments() .unwrap_or_else(|| comments.leading(binary)); @@ -198,7 +205,11 @@ impl<'a> BinaryLike<'a> { ); } Expr::Compare(compare) - if !is_expression_parenthesized(expression.into(), source) => + if !is_expression_parenthesized( + expression.into(), + comments.ranges(), + source, + ) => { let leading_comments = operand .leading_binary_comments() @@ -218,7 +229,11 @@ impl<'a> BinaryLike<'a> { ); } Expr::BoolOp(bool_op) - if !is_expression_parenthesized(expression.into(), source) => + if !is_expression_parenthesized( + expression.into(), + comments.ranges(), + source, + ) => { let leading_comments = operand .leading_binary_comments() @@ -282,7 +297,11 @@ impl Format> for BinaryLike<'_> { AnyString::from_expression(operand.expression()) .filter(|string| { string.is_implicit_concatenated() - && !is_expression_parenthesized(string.into(), source) + && !is_expression_parenthesized( + string.into(), + comments.ranges(), + source, + ) }) .map(|string| (index, string, operand)) }) @@ -430,6 +449,7 @@ impl Format> for BinaryLike<'_> { if (right_operand_has_leading_comments && !is_expression_parenthesized( right_operand.expression().into(), + f.context().comments().ranges(), f.context().source(), )) || right_operator.has_trailing_comments() @@ -466,11 +486,16 @@ impl Format> for BinaryLike<'_> { } } -fn is_simple_power_expression(left: &Expr, right: &Expr, source: &str) -> bool { +fn is_simple_power_expression( + left: &Expr, + right: &Expr, + comment_range: &CommentRanges, + source: &str, +) -> bool { is_simple_power_operand(left) && is_simple_power_operand(right) - && !is_expression_parenthesized(left.into(), source) - && !is_expression_parenthesized(right.into(), source) + && 
!is_expression_parenthesized(left.into(), comment_range, source) + && !is_expression_parenthesized(right.into(), comment_range, source) } /// Return `true` if an [`Expr`] adheres to [Black's definition](https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#line-breaks-binary-operators) @@ -664,6 +689,7 @@ impl Format> for FlatBinaryExpressionSlice<'_> { && is_simple_power_expression( left.last_operand().expression(), right.first_operand().expression(), + f.context().comments().ranges(), f.context().source(), ); @@ -806,7 +832,7 @@ impl<'a> Operand<'a> { } => !leading_comments.is_empty(), Operand::Middle { expression } | Operand::Right { expression, .. } => { let leading = comments.leading(*expression); - if is_expression_parenthesized((*expression).into(), source) { + if is_expression_parenthesized((*expression).into(), comments.ranges(), source) { leading.iter().any(|comment| { !comment.is_formatted() && matches!( @@ -853,7 +879,11 @@ impl Format> for Operand<'_> { fn fmt(&self, f: &mut Formatter>) -> FormatResult<()> { let expression = self.expression(); - return if is_expression_parenthesized(expression.into(), f.context().source()) { + return if is_expression_parenthesized( + expression.into(), + f.context().comments().ranges(), + f.context().source(), + ) { let comments = f.context().comments().clone(); let expression_comments = comments.leading_dangling_trailing(expression); diff --git a/crates/ruff_python_formatter/src/expression/expr_attribute.rs b/crates/ruff_python_formatter/src/expression/expr_attribute.rs index 37abc02b0a..c9cc463bea 100644 --- a/crates/ruff_python_formatter/src/expression/expr_attribute.rs +++ b/crates/ruff_python_formatter/src/expression/expr_attribute.rs @@ -45,7 +45,7 @@ impl FormatNodeRule for FormatExprAttribute { value: Constant::Int(_) | Constant::Float(_), .. }) - ) || is_expression_parenthesized(value.into(), f.context().source()); + ) || is_expression_parenthesized(value.into(), f.context().comments().ranges(), f.context().source()); if call_chain_layout == CallChainLayout::Fluent { if parenthesize_value { @@ -142,15 +142,22 @@ impl NeedsParentheses for ExprAttribute { context: &PyFormatContext, ) -> OptionalParentheses { // Checks if there are any own line comments in an attribute chain (a.b.c). 
- if CallChainLayout::from_expression(self.into(), context.source()) - == CallChainLayout::Fluent + if CallChainLayout::from_expression( + self.into(), + context.comments().ranges(), + context.source(), + ) == CallChainLayout::Fluent { OptionalParentheses::Multiline } else if context.comments().has_dangling(self) { OptionalParentheses::Always } else if self.value.is_name_expr() { OptionalParentheses::BestFit - } else if is_expression_parenthesized(self.value.as_ref().into(), context.source()) { + } else if is_expression_parenthesized( + self.value.as_ref().into(), + context.comments().ranges(), + context.source(), + ) { OptionalParentheses::Never } else { self.value.needs_parentheses(self.into(), context) diff --git a/crates/ruff_python_formatter/src/expression/expr_call.rs b/crates/ruff_python_formatter/src/expression/expr_call.rs index 6df21ba328..2fe26d8d34 100644 --- a/crates/ruff_python_formatter/src/expression/expr_call.rs +++ b/crates/ruff_python_formatter/src/expression/expr_call.rs @@ -82,13 +82,20 @@ impl NeedsParentheses for ExprCall { _parent: AnyNodeRef, context: &PyFormatContext, ) -> OptionalParentheses { - if CallChainLayout::from_expression(self.into(), context.source()) - == CallChainLayout::Fluent + if CallChainLayout::from_expression( + self.into(), + context.comments().ranges(), + context.source(), + ) == CallChainLayout::Fluent { OptionalParentheses::Multiline } else if context.comments().has_dangling(self) { OptionalParentheses::Always - } else if is_expression_parenthesized(self.func.as_ref().into(), context.source()) { + } else if is_expression_parenthesized( + self.func.as_ref().into(), + context.comments().ranges(), + context.source(), + ) { OptionalParentheses::Never } else { self.func.needs_parentheses(self.into(), context) diff --git a/crates/ruff_python_formatter/src/expression/expr_if_exp.rs b/crates/ruff_python_formatter/src/expression/expr_if_exp.rs index 1b22fb568c..a1b57b3bf3 100644 --- a/crates/ruff_python_formatter/src/expression/expr_if_exp.rs +++ b/crates/ruff_python_formatter/src/expression/expr_if_exp.rs @@ -101,7 +101,11 @@ impl Format> for FormatOrElse<'_> { fn fmt(&self, f: &mut Formatter>) -> FormatResult<()> { match self.orelse { Expr::IfExp(expr) - if !is_expression_parenthesized(expr.into(), f.context().source()) => + if !is_expression_parenthesized( + expr.into(), + f.context().comments().ranges(), + f.context().source(), + ) => { write!(f, [expr.format().with_options(ExprIfExpLayout::Nested)]) } diff --git a/crates/ruff_python_formatter/src/expression/expr_subscript.rs b/crates/ruff_python_formatter/src/expression/expr_subscript.rs index b4c9dc7959..ecd4a2abc1 100644 --- a/crates/ruff_python_formatter/src/expression/expr_subscript.rs +++ b/crates/ruff_python_formatter/src/expression/expr_subscript.rs @@ -89,11 +89,18 @@ impl NeedsParentheses for ExprSubscript { context: &PyFormatContext, ) -> OptionalParentheses { { - if CallChainLayout::from_expression(self.into(), context.source()) - == CallChainLayout::Fluent + if CallChainLayout::from_expression( + self.into(), + context.comments().ranges(), + context.source(), + ) == CallChainLayout::Fluent { OptionalParentheses::Multiline - } else if is_expression_parenthesized(self.value.as_ref().into(), context.source()) { + } else if is_expression_parenthesized( + self.value.as_ref().into(), + context.comments().ranges(), + context.source(), + ) { OptionalParentheses::Never } else { match self.value.needs_parentheses(self.into(), context) { diff --git 
a/crates/ruff_python_formatter/src/expression/expr_unary_op.rs b/crates/ruff_python_formatter/src/expression/expr_unary_op.rs index 3405fa3e5c..1b58d03571 100644 --- a/crates/ruff_python_formatter/src/expression/expr_unary_op.rs +++ b/crates/ruff_python_formatter/src/expression/expr_unary_op.rs @@ -46,7 +46,11 @@ impl FormatNodeRule for FormatExprUnaryOp { // a) // ``` if comments.has_leading(operand.as_ref()) - && !is_expression_parenthesized(operand.as_ref().into(), f.context().source()) + && !is_expression_parenthesized( + operand.as_ref().into(), + f.context().comments().ranges(), + f.context().source(), + ) { hard_line_break().fmt(f)?; } else if op.is_not() { @@ -72,7 +76,11 @@ impl NeedsParentheses for ExprUnaryOp { context: &PyFormatContext, ) -> OptionalParentheses { // We preserve the parentheses of the operand. It should not be necessary to break this expression. - if is_expression_parenthesized(self.operand.as_ref().into(), context.source()) { + if is_expression_parenthesized( + self.operand.as_ref().into(), + context.comments().ranges(), + context.source(), + ) { OptionalParentheses::Never } else { OptionalParentheses::Multiline diff --git a/crates/ruff_python_formatter/src/expression/mod.rs b/crates/ruff_python_formatter/src/expression/mod.rs index ba2b979fe5..dd20097a7e 100644 --- a/crates/ruff_python_formatter/src/expression/mod.rs +++ b/crates/ruff_python_formatter/src/expression/mod.rs @@ -9,6 +9,7 @@ use ruff_python_ast as ast; use ruff_python_ast::node::AnyNodeRef; use ruff_python_ast::visitor::preorder::{walk_expr, PreorderVisitor}; use ruff_python_ast::{Constant, Expr, ExpressionRef, Operator}; +use ruff_python_trivia::CommentRanges; use crate::builders::parenthesize_if_expands; use crate::comments::leading_comments; @@ -103,9 +104,11 @@ impl FormatRule> for FormatExpr { }); let parenthesize = match parentheses { - Parentheses::Preserve => { - is_expression_parenthesized(expression.into(), f.context().source()) - } + Parentheses::Preserve => is_expression_parenthesized( + expression.into(), + f.context().comments().ranges(), + f.context().source(), + ), Parentheses::Always => true, // Fluent style means we already have parentheses Parentheses::Never => false, @@ -186,7 +189,11 @@ impl Format> for MaybeParenthesizeExpression<'_> { let comments = f.context().comments(); let preserve_parentheses = parenthesize.is_optional() - && is_expression_parenthesized((*expression).into(), f.context().source()); + && is_expression_parenthesized( + (*expression).into(), + f.context().comments().ranges(), + f.context().source(), + ); let has_comments = comments.has_leading(*expression) || comments.has_trailing_own_line(*expression); @@ -581,7 +588,11 @@ impl<'input> PreorderVisitor<'input> for CanOmitOptionalParenthesesVisitor<'inpu self.last = Some(expr); // Rule only applies for non-parenthesized expressions. 
- if is_expression_parenthesized(expr.into(), self.context.source()) { + if is_expression_parenthesized( + expr.into(), + self.context.comments().ranges(), + self.context.source(), + ) { self.any_parenthesized_expressions = true; } else { self.visit_subexpression(expr); @@ -635,7 +646,11 @@ pub enum CallChainLayout { } impl CallChainLayout { - pub(crate) fn from_expression(mut expr: ExpressionRef, source: &str) -> Self { + pub(crate) fn from_expression( + mut expr: ExpressionRef, + comment_ranges: &CommentRanges, + source: &str, + ) -> Self { let mut attributes_after_parentheses = 0; loop { match expr { @@ -646,7 +661,7 @@ impl CallChainLayout { // data[:100].T // ^^^^^^^^^^ value // ``` - if is_expression_parenthesized(value.into(), source) { + if is_expression_parenthesized(value.into(), comment_ranges, source) { // `(a).b`. We preserve these parentheses so don't recurse attributes_after_parentheses += 1; break; @@ -674,7 +689,7 @@ impl CallChainLayout { // f2 = (a).w().t(1,) // ^ expr // ``` - if is_expression_parenthesized(expr, source) { + if is_expression_parenthesized(expr, comment_ranges, source) { attributes_after_parentheses += 1; } @@ -683,7 +698,7 @@ impl CallChainLayout { } // We preserve these parentheses so don't recurse - if is_expression_parenthesized(expr, source) { + if is_expression_parenthesized(expr, comment_ranges, source) { break; } } @@ -704,7 +719,11 @@ impl CallChainLayout { match self { CallChainLayout::Default => { if f.context().node_level().is_parenthesized() { - CallChainLayout::from_expression(item.into(), f.context().source()) + CallChainLayout::from_expression( + item.into(), + f.context().comments().ranges(), + f.context().source(), + ) } else { CallChainLayout::NonFluent } @@ -745,7 +764,7 @@ fn has_parentheses(expr: &Expr, context: &PyFormatContext) -> Option bool { +pub(crate) fn is_expression_parenthesized( + expr: ExpressionRef, + comment_ranges: &CommentRanges, + contents: &str, +) -> bool { // First test if there's a closing parentheses because it tends to be cheaper. if matches!( first_non_trivia_token(expr.end(), contents), @@ -109,11 +116,10 @@ pub(crate) fn is_expression_parenthesized(expr: ExpressionRef, contents: &str) - .. }) ) { - let mut tokenizer = - SimpleTokenizer::up_to_without_back_comment(expr.start(), contents).skip_trivia(); - matches!( - tokenizer.next_back(), + BackwardsTokenizer::up_to(expr.start(), contents, comment_ranges) + .skip_trivia() + .next(), Some(SimpleToken { kind: SimpleTokenKind::LParen, .. 
@@ -418,6 +424,7 @@ impl Format> for FormatEmptyParenthesized<'_> { mod tests { use ruff_python_ast::ExpressionRef; use ruff_python_parser::parse_expression; + use ruff_python_trivia::CommentRanges; use crate::expression::parentheses::is_expression_parenthesized; @@ -427,6 +434,7 @@ mod tests { let expr = parse_expression(expression, "").unwrap(); assert!(!is_expression_parenthesized( ExpressionRef::from(&expr), + &CommentRanges::default(), expression )); } diff --git a/crates/ruff_python_formatter/src/lib.rs b/crates/ruff_python_formatter/src/lib.rs index 4eb54f1b34..031b93510b 100644 --- a/crates/ruff_python_formatter/src/lib.rs +++ b/crates/ruff_python_formatter/src/lib.rs @@ -5,9 +5,10 @@ use ruff_formatter::prelude::*; use ruff_formatter::{format, FormatError, Formatted, PrintError, Printed, SourceCode}; use ruff_python_ast::node::AstNode; use ruff_python_ast::Mod; -use ruff_python_index::{CommentRanges, CommentRangesBuilder}; +use ruff_python_index::CommentRangesBuilder; use ruff_python_parser::lexer::{lex, LexicalError}; use ruff_python_parser::{parse_tokens, Mode, ParseError}; +use ruff_python_trivia::CommentRanges; use ruff_source_file::Locator; use crate::comments::{ @@ -120,7 +121,7 @@ impl From for FormatModuleError { } } -#[tracing::instrument(level=Level::TRACE, skip_all)] +#[tracing::instrument(level = Level::TRACE, skip_all)] pub fn format_module( contents: &str, options: PyFormatOptions, diff --git a/crates/ruff_python_formatter/src/pattern/mod.rs b/crates/ruff_python_formatter/src/pattern/mod.rs index d0f44e258a..50d1f99fb2 100644 --- a/crates/ruff_python_formatter/src/pattern/mod.rs +++ b/crates/ruff_python_formatter/src/pattern/mod.rs @@ -1,7 +1,10 @@ use ruff_formatter::{FormatOwnedWithRule, FormatRefWithRule, FormatRule, FormatRuleWithOptions}; use ruff_python_ast::node::AnyNodeRef; use ruff_python_ast::Pattern; -use ruff_python_trivia::{first_non_trivia_token, SimpleToken, SimpleTokenKind, SimpleTokenizer}; +use ruff_python_trivia::CommentRanges; +use ruff_python_trivia::{ + first_non_trivia_token, BackwardsTokenizer, SimpleToken, SimpleTokenKind, +}; use ruff_text_size::Ranged; use crate::expression::parentheses::{ @@ -48,7 +51,11 @@ impl FormatRule> for FormatPattern { }); let parenthesize = match self.parentheses { - Parentheses::Preserve => is_pattern_parenthesized(pattern, f.context().source()), + Parentheses::Preserve => is_pattern_parenthesized( + pattern, + f.context().comments().ranges(), + f.context().source(), + ), Parentheses::Always => true, Parentheses::Never => false, }; @@ -98,7 +105,11 @@ impl<'ast> IntoFormat> for Pattern { } } -fn is_pattern_parenthesized(pattern: &Pattern, contents: &str) -> bool { +fn is_pattern_parenthesized( + pattern: &Pattern, + comment_ranges: &CommentRanges, + contents: &str, +) -> bool { // First test if there's a closing parentheses because it tends to be cheaper. if matches!( first_non_trivia_token(pattern.end(), contents), @@ -107,11 +118,10 @@ fn is_pattern_parenthesized(pattern: &Pattern, contents: &str) -> bool { .. }) ) { - let mut tokenizer = - SimpleTokenizer::up_to_without_back_comment(pattern.start(), contents).skip_trivia(); - matches!( - tokenizer.next_back(), + BackwardsTokenizer::up_to(pattern.start(), contents, comment_ranges) + .skip_trivia() + .next(), Some(SimpleToken { kind: SimpleTokenKind::LParen, .. 
diff --git a/crates/ruff_python_formatter/src/statement/suite.rs b/crates/ruff_python_formatter/src/statement/suite.rs index 8ae33eebb3..0faf69a9d2 100644 --- a/crates/ruff_python_formatter/src/statement/suite.rs +++ b/crates/ruff_python_formatter/src/statement/suite.rs @@ -561,6 +561,7 @@ impl Format> for SuiteChildStatement<'_> { mod tests { use ruff_formatter::format; use ruff_python_parser::parse_suite; + use ruff_python_trivia::CommentRanges; use crate::comments::Comments; use crate::prelude::*; @@ -591,7 +592,12 @@ def trailing_func(): let statements = parse_suite(source, "test.py").unwrap(); - let context = PyFormatContext::new(PyFormatOptions::default(), source, Comments::default()); + let comment_ranges = CommentRanges::default(); + let context = PyFormatContext::new( + PyFormatOptions::default(), + source, + Comments::from_ranges(&comment_ranges), + ); let test_formatter = format_with(|f: &mut PyFormatter| statements.format().with_options(level).fmt(f)); diff --git a/crates/ruff_python_index/src/comment_ranges.rs b/crates/ruff_python_index/src/comment_ranges.rs index 9376d91e02..602446a934 100644 --- a/crates/ruff_python_index/src/comment_ranges.rs +++ b/crates/ruff_python_index/src/comment_ranges.rs @@ -1,70 +1,8 @@ -use itertools::Itertools; -use std::fmt::{Debug, Formatter}; -use std::ops::Deref; +use std::fmt::Debug; use ruff_python_parser::Tok; -use ruff_text_size::{Ranged, TextRange}; - -/// Stores the ranges of comments sorted by [`TextRange::start`] in increasing order. No two ranges are overlapping. -#[derive(Clone)] -pub struct CommentRanges { - raw: Vec, -} - -impl CommentRanges { - /// Returns `true` if the given range includes a comment. - pub fn intersects(&self, target: TextRange) -> bool { - self.raw - .binary_search_by(|range| { - if target.contains_range(*range) { - std::cmp::Ordering::Equal - } else if range.end() < target.start() { - std::cmp::Ordering::Less - } else { - std::cmp::Ordering::Greater - } - }) - .is_ok() - } - - /// Returns the comments who are within the range - pub fn comments_in_range(&self, range: TextRange) -> &[TextRange] { - let start = self - .raw - .partition_point(|comment| comment.start() < range.start()); - // We expect there are few comments, so switching to find should be faster - match self.raw[start..] - .iter() - .find_position(|comment| comment.end() > range.end()) - { - Some((in_range, _element)) => &self.raw[start..start + in_range], - None => &self.raw[start..], - } - } -} - -impl Deref for CommentRanges { - type Target = [TextRange]; - - fn deref(&self) -> &Self::Target { - self.raw.as_slice() - } -} - -impl Debug for CommentRanges { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_tuple("CommentRanges").field(&self.raw).finish() - } -} - -impl<'a> IntoIterator for &'a CommentRanges { - type IntoIter = std::slice::Iter<'a, TextRange>; - type Item = &'a TextRange; - - fn into_iter(self) -> Self::IntoIter { - self.raw.iter() - } -} +use ruff_python_trivia::CommentRanges; +use ruff_text_size::TextRange; #[derive(Debug, Clone, Default)] pub struct CommentRangesBuilder { @@ -79,6 +17,6 @@ impl CommentRangesBuilder { } pub fn finish(self) -> CommentRanges { - CommentRanges { raw: self.ranges } + CommentRanges::new(self.ranges) } } diff --git a/crates/ruff_python_index/src/indexer.rs b/crates/ruff_python_index/src/indexer.rs index aec33f087a..11273f8131 100644 --- a/crates/ruff_python_index/src/indexer.rs +++ b/crates/ruff_python_index/src/indexer.rs @@ -1,15 +1,16 @@ //! 
Struct used to index source code, to enable efficient lookup of tokens that
//! are omitted from the AST (e.g., commented lines).
+use crate::CommentRangesBuilder;
 use ruff_python_ast::Stmt;
 use ruff_python_parser::lexer::LexResult;
 use ruff_python_parser::{StringKind, Tok};
-use ruff_python_trivia::{has_leading_content, has_trailing_content, is_python_whitespace};
+use ruff_python_trivia::{
+ has_leading_content, has_trailing_content, is_python_whitespace, CommentRanges,
+};
 use ruff_source_file::Locator;
 use ruff_text_size::{Ranged, TextRange, TextSize};
-use super::comment_ranges::{CommentRanges, CommentRangesBuilder};
-
 pub struct Indexer {
 comment_ranges: CommentRanges,
diff --git a/crates/ruff_python_index/src/lib.rs b/crates/ruff_python_index/src/lib.rs
index a6ed9363ad..2e585ca5df 100644
--- a/crates/ruff_python_index/src/lib.rs
+++ b/crates/ruff_python_index/src/lib.rs
@@ -1,5 +1,5 @@
 mod comment_ranges;
 mod indexer;
-pub use comment_ranges::{CommentRanges, CommentRangesBuilder};
+pub use comment_ranges::CommentRangesBuilder;
 pub use indexer::Indexer;
diff --git a/crates/ruff_python_trivia/Cargo.toml b/crates/ruff_python_trivia/Cargo.toml
index 8ae3fad931..e77cf41bac 100644
--- a/crates/ruff_python_trivia/Cargo.toml
+++ b/crates/ruff_python_trivia/Cargo.toml
@@ -16,8 +16,7 @@ license = { workspace = true }
 ruff_text_size = { path = "../ruff_text_size" }
 ruff_source_file = { path = "../ruff_source_file" }
-memchr = { workspace = true }
-smallvec = { workspace = true }
+itertools = { workspace = true }
 unicode-ident = { workspace = true }
 [dev-dependencies]
diff --git a/crates/ruff_python_trivia/src/comment_ranges.rs b/crates/ruff_python_trivia/src/comment_ranges.rs
new file mode 100644
index 0000000000..a5b7b7ed50
--- /dev/null
+++ b/crates/ruff_python_trivia/src/comment_ranges.rs
@@ -0,0 +1,71 @@
+use std::fmt::{Debug, Formatter};
+use std::ops::Deref;
+
+use itertools::Itertools;
+
+use ruff_text_size::{Ranged, TextRange};
+
+/// Stores the ranges of comments sorted by [`TextRange::start`] in increasing order. No two ranges are overlapping.
+#[derive(Clone, Default)]
+pub struct CommentRanges {
+ raw: Vec,
+}
+
+impl CommentRanges {
+ pub fn new(ranges: Vec<TextRange>) -> Self {
+ Self { raw: ranges }
+ }
+
+ /// Returns `true` if the given range includes a comment.
+ pub fn intersects(&self, target: TextRange) -> bool {
+ self.raw
+ .binary_search_by(|range| {
+ if target.contains_range(*range) {
+ std::cmp::Ordering::Equal
+ } else if range.end() < target.start() {
+ std::cmp::Ordering::Less
+ } else {
+ std::cmp::Ordering::Greater
+ }
+ })
+ .is_ok()
+ }
+
+ /// Returns the comments that are within the range
+ pub fn comments_in_range(&self, range: TextRange) -> &[TextRange] {
+ let start = self
+ .raw
+ .partition_point(|comment| comment.start() < range.start());
+ // We expect there to be few comments, so switching to a linear `find` should be faster
+ match self.raw[start..]
+ .iter() + .find_position(|comment| comment.end() > range.end()) + { + Some((in_range, _element)) => &self.raw[start..start + in_range], + None => &self.raw[start..], + } + } +} + +impl Deref for CommentRanges { + type Target = [TextRange]; + + fn deref(&self) -> &Self::Target { + self.raw.as_slice() + } +} + +impl Debug for CommentRanges { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("CommentRanges").field(&self.raw).finish() + } +} + +impl<'a> IntoIterator for &'a CommentRanges { + type Item = &'a TextRange; + type IntoIter = std::slice::Iter<'a, TextRange>; + + fn into_iter(self) -> Self::IntoIter { + self.raw.iter() + } +} diff --git a/crates/ruff_python_trivia/src/cursor.rs b/crates/ruff_python_trivia/src/cursor.rs index 336720269c..e046fa92ba 100644 --- a/crates/ruff_python_trivia/src/cursor.rs +++ b/crates/ruff_python_trivia/src/cursor.rs @@ -44,7 +44,7 @@ impl<'a> Cursor<'a> { self.chars.clone().next_back().unwrap_or(EOF_CHAR) } - // SAFETY: THe `source.text_len` call in `new` would panic if the string length is larger than a `u32`. + // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`. #[allow(clippy::cast_possible_truncation)] pub fn text_len(&self) -> TextSize { TextSize::new(self.chars.as_str().len() as u32) diff --git a/crates/ruff_python_trivia/src/lib.rs b/crates/ruff_python_trivia/src/lib.rs index bef83ac26a..9c9bb8158c 100644 --- a/crates/ruff_python_trivia/src/lib.rs +++ b/crates/ruff_python_trivia/src/lib.rs @@ -1,8 +1,10 @@ +mod comment_ranges; mod cursor; pub mod textwrap; mod tokenizer; mod whitespace; +pub use comment_ranges::CommentRanges; pub use cursor::*; pub use tokenizer::*; pub use whitespace::*; diff --git a/crates/ruff_python_trivia/src/tokenizer.rs b/crates/ruff_python_trivia/src/tokenizer.rs index d7d0645f3f..2072eae936 100644 --- a/crates/ruff_python_trivia/src/tokenizer.rs +++ b/crates/ruff_python_trivia/src/tokenizer.rs @@ -1,4 +1,3 @@ -use memchr::{memchr2, memchr3, memrchr3_iter}; use unicode_ident::{is_xid_continue, is_xid_start}; use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; @@ -121,6 +120,47 @@ fn is_identifier_continuation(c: char) -> bool { } } +fn to_keyword_or_other(source: &str) -> SimpleTokenKind { + match source { + "and" => SimpleTokenKind::And, + "as" => SimpleTokenKind::As, + "assert" => SimpleTokenKind::Assert, + "async" => SimpleTokenKind::Async, + "await" => SimpleTokenKind::Await, + "break" => SimpleTokenKind::Break, + "class" => SimpleTokenKind::Class, + "continue" => SimpleTokenKind::Continue, + "def" => SimpleTokenKind::Def, + "del" => SimpleTokenKind::Del, + "elif" => SimpleTokenKind::Elif, + "else" => SimpleTokenKind::Else, + "except" => SimpleTokenKind::Except, + "finally" => SimpleTokenKind::Finally, + "for" => SimpleTokenKind::For, + "from" => SimpleTokenKind::From, + "global" => SimpleTokenKind::Global, + "if" => SimpleTokenKind::If, + "import" => SimpleTokenKind::Import, + "in" => SimpleTokenKind::In, + "is" => SimpleTokenKind::Is, + "lambda" => SimpleTokenKind::Lambda, + "nonlocal" => SimpleTokenKind::Nonlocal, + "not" => SimpleTokenKind::Not, + "or" => SimpleTokenKind::Or, + "pass" => SimpleTokenKind::Pass, + "raise" => SimpleTokenKind::Raise, + "return" => SimpleTokenKind::Return, + "try" => SimpleTokenKind::Try, + "while" => SimpleTokenKind::While, + "match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if 
it should be handled as an identifier or keyword.
+        "type" => SimpleTokenKind::Type, // Type is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
+        "case" => SimpleTokenKind::Case,
+        "with" => SimpleTokenKind::With,
+        "yield" => SimpleTokenKind::Yield,
+        _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+    }
+}
+
 #[derive(Clone, Debug, Eq, PartialEq, Hash)]
 pub struct SimpleToken {
     pub kind: SimpleTokenKind,
@@ -421,17 +461,15 @@ impl SimpleTokenKind {
     }
 }
 
-/// Simple zero allocation tokenizer for tokenizing trivia (and some tokens).
+/// Simple zero allocation tokenizer handling most tokens.
 ///
 /// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
 ///
-/// The tokenizer doesn't guarantee any correctness after it returned a [`SimpleTokenKind::Other`]. That's why it
-/// will return [`SimpleTokenKind::Bogus`] for every character after until it reaches the end of the file.
+/// In case it finds something it can't parse, the tokenizer will return a
+/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
 pub struct SimpleTokenizer<'a> {
     offset: TextSize,
-    back_offset: TextSize,
-    /// `true` when it is known that the current `back` line has no comment for sure.
-    back_line_has_no_comment: bool,
     bogus: bool,
     source: &'a str,
     cursor: Cursor<'a>,
@@ -441,8 +479,6 @@ impl<'a> SimpleTokenizer<'a> {
     pub fn new(source: &'a str, range: TextRange) -> Self {
         Self {
             offset: range.start(),
-            back_offset: range.end(),
-            back_line_has_no_comment: false,
             bogus: false,
             source,
             cursor: Cursor::new(&source[range]),
@@ -454,64 +490,6 @@ impl<'a> SimpleTokenizer<'a> {
         Self::new(source, range)
     }
 
-    /// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`.
-    ///
-    /// Consider using [`SimpleTokenizer::up_to_without_back_comment`] if intend to lex backwards.
-    pub fn up_to(offset: TextSize, source: &'a str) -> Self {
-        Self::new(source, TextRange::up_to(offset))
-    }
-
-    /// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`, and informs
-    /// the lexer that the line at `offset` contains no comments. This can significantly speed up backwards lexing
-    /// because the lexer doesn't need to scan for comments. 
- pub fn up_to_without_back_comment(offset: TextSize, source: &'a str) -> Self { - let mut tokenizer = Self::up_to(offset, source); - tokenizer.back_line_has_no_comment = true; - tokenizer - } - - fn to_keyword_or_other(&self, range: TextRange) -> SimpleTokenKind { - let source = &self.source[range]; - match source { - "and" => SimpleTokenKind::And, - "as" => SimpleTokenKind::As, - "assert" => SimpleTokenKind::Assert, - "async" => SimpleTokenKind::Async, - "await" => SimpleTokenKind::Await, - "break" => SimpleTokenKind::Break, - "class" => SimpleTokenKind::Class, - "continue" => SimpleTokenKind::Continue, - "def" => SimpleTokenKind::Def, - "del" => SimpleTokenKind::Del, - "elif" => SimpleTokenKind::Elif, - "else" => SimpleTokenKind::Else, - "except" => SimpleTokenKind::Except, - "finally" => SimpleTokenKind::Finally, - "for" => SimpleTokenKind::For, - "from" => SimpleTokenKind::From, - "global" => SimpleTokenKind::Global, - "if" => SimpleTokenKind::If, - "import" => SimpleTokenKind::Import, - "in" => SimpleTokenKind::In, - "is" => SimpleTokenKind::Is, - "lambda" => SimpleTokenKind::Lambda, - "nonlocal" => SimpleTokenKind::Nonlocal, - "not" => SimpleTokenKind::Not, - "or" => SimpleTokenKind::Or, - "pass" => SimpleTokenKind::Pass, - "raise" => SimpleTokenKind::Raise, - "return" => SimpleTokenKind::Return, - "try" => SimpleTokenKind::Try, - "while" => SimpleTokenKind::While, - "match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword. - "type" => SimpleTokenKind::Type, // Type is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword. - "case" => SimpleTokenKind::Case, - "with" => SimpleTokenKind::With, - "yield" => SimpleTokenKind::Yield, - _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals - } - } - fn next_token(&mut self) -> SimpleToken { self.cursor.start_token(); @@ -523,6 +501,7 @@ impl<'a> SimpleTokenizer<'a> { }; if self.bogus { + // Emit a single final bogus token let token = SimpleToken { kind: SimpleTokenKind::Bogus, range: TextRange::at(self.offset, first.text_len()), @@ -532,14 +511,29 @@ impl<'a> SimpleTokenizer<'a> { return token; } - let kind = match first { + let kind = self.next_token_inner(first); + + let token_len = self.cursor.token_len(); + + let token = SimpleToken { + kind, + range: TextRange::at(self.offset, token_len), + }; + + self.offset += token_len; + + token + } + + fn next_token_inner(&mut self, first: char) -> SimpleTokenKind { + match first { // Keywords and identifiers c if is_identifier_start(c) => { self.cursor.eat_while(is_identifier_continuation); let token_len = self.cursor.token_len(); let range = TextRange::at(self.offset, token_len); - let kind = self.to_keyword_or_other(range); + let kind = to_keyword_or_other(&self.source[range]); if kind == SimpleTokenKind::Other { self.bogus = true; @@ -717,24 +711,102 @@ impl<'a> SimpleTokenizer<'a> { self.bogus = true; SimpleTokenKind::Other } - }; - - let token_len = self.cursor.token_len(); - - let token = SimpleToken { - kind, - range: TextRange::at(self.offset, token_len), - }; - - self.offset += token_len; - - token + } } - /// Returns the next token from the back. 
Prefer iterating forwards. Iterating backwards is significantly more expensive
-    /// because it needs to check if the line has any comments when encountering any non-trivia token.
-    pub fn next_token_back(&mut self) -> SimpleToken {
+    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
+        self.filter(|t| !t.kind().is_trivia())
+    }
+}
+
+impl Iterator for SimpleTokenizer<'_> {
+    type Item = SimpleToken;
+
+    fn next(&mut self) -> Option<SimpleToken> {
+        let token = self.next_token();
+
+        if token.kind == SimpleTokenKind::EndOfFile {
+            None
+        } else {
+            Some(token)
+        }
+    }
+}
+
+/// Simple zero allocation backwards tokenizer for finding preceding tokens.
+///
+/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
+/// It will fail when reaching a string.
+///
+/// In case it finds something it can't parse, the tokenizer will return a
+/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
+pub struct BackwardsTokenizer<'a> {
+    offset: TextSize,
+    back_offset: TextSize,
+    /// Remember if we have checked for comments
+    after_newline: bool,
+    /// Not `&CommentRanges` to avoid a circular dependency
+    comment_ranges: &'a [TextRange],
+    /// The index of the previous line-ending comment
+    previous_comment_idx: Option<usize>,
+    bogus: bool,
+    source: &'a str,
+    cursor: Cursor<'a>,
+}
+
+impl<'a> BackwardsTokenizer<'a> {
+    pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
+        Self {
+            offset: range.start(),
+            back_offset: range.end(),
+            // We could start tokenizing at a comment
+            after_newline: true,
+            comment_ranges: comment_range,
+            previous_comment_idx: None,
+            bogus: false,
+            source,
+            cursor: Cursor::new(&source[range]),
+        }
+    }
+
+    pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
+        Self::new(source, TextRange::up_to(offset), comment_range)
+    }
+
+    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
+        self.filter(|t| !t.kind().is_trivia())
+    }
+
+    pub fn next_token(&mut self) -> SimpleToken {
         self.cursor.start_token();
+        self.back_offset = self.cursor.text_len() + self.offset;
+
+        if self.after_newline {
+            // This comment ended a line with a higher line number, not the current one
+            let previous_comment_idx = self.previous_comment_idx.unwrap_or_else(|| {
+                self.comment_ranges
+                    .partition_point(|comment| comment.end() <= self.back_offset)
+            });
+            // If `previous_comment_idx == 0`, we're in a comment-free region
+            if previous_comment_idx > 0 {
+                let comment = self.comment_ranges[previous_comment_idx - 1];
+                if comment.end() == self.back_offset {
+                    // Skip the comment without iterating over the chars manually
+                    self.cursor =
+                        Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
+                    debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
+                    self.after_newline = false;
+                    self.previous_comment_idx = Some(previous_comment_idx - 1);
+                    return SimpleToken {
+                        kind: SimpleTokenKind::Comment,
+                        range: comment.range(),
+                    };
+                }
+                // At least memoize the binary search
+                self.previous_comment_idx = Some(previous_comment_idx);
+            }
+            self.after_newline = false;
+        }
 
         let Some(last) = self.cursor.bump_back() else {
             return SimpleToken {
@@ -762,322 +834,132 @@ impl<'a> SimpleTokenizer<'a> {
         }
 
             '\r' => {
-                self.back_line_has_no_comment = false;
+                self.after_newline = true;
                 SimpleTokenKind::Newline
             }
 
             '\n' => {
-                self.back_line_has_no_comment = false;
                 self.cursor.eat_char_back('\r');
+                self.after_newline = true;
                 SimpleTokenKind::Newline
            }
-
-            // Empty comment 
(could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for) - '#' => SimpleTokenKind::Comment, - - // For all other tokens, test if the character isn't part of a comment. - c => { - // Skip the test whether there's a preceding comment if it has been performed before. - let comment_length = if self.back_line_has_no_comment { - None - } else { - let bytes = self.cursor.chars().as_str().as_bytes(); - let mut potential_comment_starts: smallvec::SmallVec<[TextSize; 2]> = - smallvec::SmallVec::new(); - - // Find the start of the line, or any potential comments. - for index in memrchr3_iter(b'\n', b'\r', b'#', bytes) { - if bytes[index] == b'#' { - // Potentially a comment, but not guaranteed - // SAFETY: Safe, because ruff only supports files up to 4GB - potential_comment_starts.push(TextSize::try_from(index).unwrap()); - } else { - break; - } - } - - // No comments - if potential_comment_starts.is_empty() { - None - } else { - // The line contains at least one `#` token. The `#` can indicate the start of a - // comment, meaning the current token is commented out, or it is a regular `#` inside of a string. - self.comment_from_hash_positions(&potential_comment_starts) - } - }; - - // From here on it is guaranteed that this line has no other comment. - self.back_line_has_no_comment = true; - - if let Some(comment_length) = comment_length { - // It is a comment, bump all tokens - for _ in 0..usize::from(comment_length) { - self.cursor.bump_back().unwrap(); - } - - SimpleTokenKind::Comment - } else { - match c { - // Keywords and identifiers - c if is_identifier_continuation(c) => { - // if we only have identifier continuations but no start (e.g. 555) we - // don't want to consume the chars, so in that case, we want to rewind the - // cursor to here - let savepoint = self.cursor.clone(); - self.cursor.eat_back_while(is_identifier_continuation); - - let token_len = self.cursor.token_len(); - let range = TextRange::at(self.back_offset - token_len, token_len); - - if self.source[range] - .chars() - .next() - .is_some_and(is_identifier_start) - { - self.to_keyword_or_other(range) - } else { - self.cursor = savepoint; - self.bogus = true; - SimpleTokenKind::Other - } - } - - // Non-trivia tokens that are unambiguous when lexing backwards. - // In other words: these are characters that _don't_ appear at the - // end of a multi-character token (like `!=`). - '\\' => SimpleTokenKind::Continuation, - ':' => SimpleTokenKind::Colon, - '~' => SimpleTokenKind::Tilde, - '%' => SimpleTokenKind::Percent, - '|' => SimpleTokenKind::Vbar, - ',' => SimpleTokenKind::Comma, - ';' => SimpleTokenKind::Semi, - '(' => SimpleTokenKind::LParen, - ')' => SimpleTokenKind::RParen, - '[' => SimpleTokenKind::LBracket, - ']' => SimpleTokenKind::RBracket, - '{' => SimpleTokenKind::LBrace, - '}' => SimpleTokenKind::RBrace, - '&' => SimpleTokenKind::Ampersand, - '^' => SimpleTokenKind::Circumflex, - '+' => SimpleTokenKind::Plus, - '-' => SimpleTokenKind::Minus, - - // Non-trivia tokens that _are_ ambiguous when lexing backwards. - // In other words: these are characters that _might_ mark the end - // of a multi-character token (like `!=` or `->` or `//` or `**`). - '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => { - // This could be a single-token token, like `+` in `x + y`, or a - // multi-character token, like `+=` in `x += y`. 
It could also be a sequence
-                        // of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
-                        // important that we produce the same token stream when lexing backwards as
-                        // we do when lexing forwards. So, identify the range of the sequence, lex
-                        // forwards, and return the last token.
-                        let mut cursor = self.cursor.clone();
-                        cursor.eat_back_while(|c| {
-                            matches!(
-                                c,
-                                ':' | '~'
-                                    | '%'
-                                    | '|'
-                                    | '&'
-                                    | '^'
-                                    | '+'
-                                    | '-'
-                                    | '='
-                                    | '*'
-                                    | '/'
-                                    | '@'
-                                    | '!'
-                                    | '<'
-                                    | '>'
-                                    | '.'
-                            )
-                        });
-
-                        let token_len = cursor.token_len();
-                        let range = TextRange::at(self.back_offset - token_len, token_len);
-
-                        let forward_lexer = Self::new(self.source, range);
-                        if let Some(token) = forward_lexer.last() {
-                            // If the token spans multiple characters, bump the cursor. Note,
-                            // though, that we already bumped the cursor to past the last character
-                            // in the token at the very start of `next_token_back`.
-                            for _ in self.source[token.range].chars().rev().skip(1) {
-                                self.cursor.bump_back().unwrap();
-                            }
-                            token.kind()
-                        } else {
-                            self.bogus = true;
-                            SimpleTokenKind::Other
-                        }
-                    }
-
-                    _ => {
-                        self.bogus = true;
-                        SimpleTokenKind::Other
-                    }
-                }
-            }
-        }
+            _ => self.next_token_inner(last),
         };
 
         let token_len = self.cursor.token_len();
         let start = self.back_offset - token_len;
-
-        let token = SimpleToken {
+        SimpleToken {
             kind,
             range: TextRange::at(start, token_len),
-        };
-
-        self.back_offset = start;
-
-        token
+        }
     }
 
-    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + DoubleEndedIterator + 'a {
-        self.filter(|t| !t.kind().is_trivia())
-    }
-
-    /// Given the position of `#` tokens on a line, test if any `#` is the start of a comment and, if so, return the
-    /// length of the comment.
-    ///
-    /// The challenge is that `#` tokens can also appear inside of strings:
-    ///
-    /// ```python
-    /// ' #not a comment'
-    /// ```
-    ///
-    /// This looks innocent but is the `'` really the start of the new string or could it be a closing delimiter
-    /// of a previously started string:
-    ///
-    /// ```python
-    /// ' a string\
-    /// ` # a comment '
-    /// ```
-    ///
-    /// The only way to reliability tell whether the `#` is a comment when the comment contains a quote char is
-    /// to forward lex all strings and comments and test if there's any unclosed string literal. If so, then
-    /// the hash cannot be a comment.
-    fn comment_from_hash_positions(&self, hash_positions: &[TextSize]) -> Option<TextSize> {
-        // Iterate over the `#` positions from the start to the end of the line.
-        // This is necessary to correctly support `a # comment # comment`.
-        for possible_start in hash_positions.iter().rev() {
-            let comment_bytes =
-                self.source[TextRange::new(*possible_start, self.back_offset)].as_bytes();
-
-            // Test if the comment contains any quotes. If so, then it's possible that the `#` token isn't
-            // the start of a comment, but instead part of a string:
-            // ```python
-            // a + 'a string # not a comment'
-            // a + '''a string
-            // # not a comment'''
-            // ```
-            match memchr2(b'\'', b'"', comment_bytes) {
-                // Most comments don't contain quotes, and most strings don't contain comments.
-                // For these it's safe to assume that they are comments.
-                None => return Some(self.cursor.chars().as_str().text_len() - possible_start),
-                // Now it gets complicated... There's no good way to know whether this is a string or not.
-                // It is necessary to lex all strings and comments from the start to know if it is one or the other. 
-                Some(_) => {
-                    if find_unterminated_string_kind(
-                        &self.cursor.chars().as_str()[TextRange::up_to(*possible_start)],
-                    )
-                    .is_none()
-                    {
-                        // There's no unterminated string at the comment's start position. This *must*
-                        // be a comment.
-                        return Some(self.cursor.chars().as_str().text_len() - possible_start);
-                    }
-
-                    // This is a hash inside of a string: `'test # not a comment'` continue with the next potential comment on the line.
-                }
-            }
-        }
-
-        None
-    }
-}
-
-fn find_unterminated_string_kind(input: &str) -> Option<StringKind> {
-    let mut rest = input;
-
-    while let Some(comment_or_string_start) = memchr3(b'#', b'\'', b'\"', rest.as_bytes()) {
-        let c = rest.as_bytes()[comment_or_string_start] as char;
-        let after = &rest[comment_or_string_start + 1..];
-
-        if c == '#' {
-            let comment_end = memchr2(b'\n', b'\r', after.as_bytes()).unwrap_or(after.len());
-            rest = &after[comment_end..];
-        } else {
-            let mut cursor = Cursor::new(after);
-            let quote_kind = if c == '\'' {
-                QuoteKind::Single
-            } else {
-                QuoteKind::Double
-            };
-
-            let string_kind = if cursor.eat_char(quote_kind.as_char()) {
-                // `''` or `""`
-                if cursor.eat_char(quote_kind.as_char()) {
-                    // `'''` or `"""`
-                    StringKind::Triple(quote_kind)
-                } else {
-                    // empty string literal, nothing more to lex
-                    rest = cursor.chars().as_str();
-                    continue;
-                }
-            } else {
-                StringKind::Single(quote_kind)
-            };
-
-            if !is_string_terminated(string_kind, &mut cursor) {
-                return Some(string_kind);
-            }
-
-            rest = cursor.chars().as_str();
-        }
-    }
-
-    None
-}
+    /// Helper to parse the previous token once we have skipped all whitespace
+    fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
+        match last {
+            // Keywords and identifiers
+            c if is_identifier_continuation(c) => {
+                // if we only have identifier continuations but no start (e.g. 555) we
+                // don't want to consume the chars, so in that case, we want to rewind the
+                // cursor to here
+                let savepoint = self.cursor.clone();
+                self.cursor.eat_back_while(is_identifier_continuation);
 
-fn is_string_terminated(kind: StringKind, cursor: &mut Cursor) -> bool {
-    let quote_char = kind.quote_kind().as_char();
+                let token_len = self.cursor.token_len();
+                let range = TextRange::at(self.back_offset - token_len, token_len);
 
-    while let Some(c) = cursor.bump() {
-        match c {
-            '\n' | '\r' if kind.is_single() => {
-                // Reached the end of the line without a closing quote, this is an unterminated string literal.
-                return false;
-            }
-            '\\' => {
-                // Skip over escaped quotes that match this strings quotes or double escaped backslashes
-                if cursor.eat_char(quote_char) || cursor.eat_char('\\') {
-                    continue;
-                }
-                // Eat over line continuation
-                cursor.eat_char('\r');
-                cursor.eat_char('\n');
-            }
-            c if c == quote_char => {
-                if kind.is_single() || (cursor.eat_char(quote_char) && cursor.eat_char(quote_char))
+                if self.source[range]
+                    .chars()
+                    .next()
+                    .is_some_and(is_identifier_start)
                 {
-                    return true;
+                    to_keyword_or_other(&self.source[range])
+                } else {
+                    self.cursor = savepoint;
+                    self.bogus = true;
+                    SimpleTokenKind::Other
+                }
+            }
+
+            // Non-trivia tokens that are unambiguous when lexing backwards.
+            // In other words: these are characters that _don't_ appear at the
+            // end of a multi-character token (like `!=`). 
+            '\\' => SimpleTokenKind::Continuation,
+            ':' => SimpleTokenKind::Colon,
+            '~' => SimpleTokenKind::Tilde,
+            '%' => SimpleTokenKind::Percent,
+            '|' => SimpleTokenKind::Vbar,
+            ',' => SimpleTokenKind::Comma,
+            ';' => SimpleTokenKind::Semi,
+            '(' => SimpleTokenKind::LParen,
+            ')' => SimpleTokenKind::RParen,
+            '[' => SimpleTokenKind::LBracket,
+            ']' => SimpleTokenKind::RBracket,
+            '{' => SimpleTokenKind::LBrace,
+            '}' => SimpleTokenKind::RBrace,
+            '&' => SimpleTokenKind::Ampersand,
+            '^' => SimpleTokenKind::Circumflex,
+            '+' => SimpleTokenKind::Plus,
+            '-' => SimpleTokenKind::Minus,
+
+            // Non-trivia tokens that _are_ ambiguous when lexing backwards.
+            // In other words: these are characters that _might_ mark the end
+            // of a multi-character token (like `!=` or `->` or `//` or `**`).
+            '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
+                // This could be a single-token token, like `+` in `x + y`, or a
+                // multi-character token, like `+=` in `x += y`. It could also be a sequence
+                // of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
+                // important that we produce the same token stream when lexing backwards as
+                // we do when lexing forwards. So, identify the range of the sequence, lex
+                // forwards, and return the last token.
+                let mut cursor = self.cursor.clone();
+                cursor.eat_back_while(|c| {
+                    matches!(
+                        c,
+                        ':' | '~'
+                            | '%'
+                            | '|'
+                            | '&'
+                            | '^'
+                            | '+'
+                            | '-'
+                            | '='
+                            | '*'
+                            | '/'
+                            | '@'
+                            | '!'
+                            | '<'
+                            | '>'
+                            | '.'
+                    )
+                });
+
+                let token_len = cursor.token_len();
+                let range = TextRange::at(self.back_offset - token_len, token_len);
+
+                let forward_lexer = SimpleTokenizer::new(self.source, range);
+                if let Some(token) = forward_lexer.last() {
+                    // If the token spans multiple characters, bump the cursor. Note,
+                    // though, that we already bumped the cursor to past the last character
+                    // in the token at the very start of `next_token`.
+                    for _ in self.source[token.range].chars().rev().skip(1) {
+                        self.cursor.bump_back().unwrap();
+                    }
+                    token.kind()
+                } else {
+                    self.bogus = true;
+                    SimpleTokenKind::Other
+                }
+            }
 
             _ => {
-                // continue
+                self.bogus = true;
+                SimpleTokenKind::Other
             }
         }
     }
-
-    // Reached end without a closing quote
-    false
 }
 
-impl Iterator for SimpleTokenizer<'_> {
+impl Iterator for BackwardsTokenizer<'_> {
     type Item = SimpleToken;
 
     fn next(&mut self) -> Option<SimpleToken> {
@@ -1091,64 +973,16 @@ impl Iterator for SimpleTokenizer<'_> {
     }
 }
 
-impl DoubleEndedIterator for SimpleTokenizer<'_> {
-    fn next_back(&mut self) -> Option<SimpleToken> {
-        let token = self.next_token_back();
-
-        if token.kind == SimpleTokenKind::EndOfFile {
-            None
-        } else {
-            Some(token)
-        }
-    }
-}
-
-#[derive(Copy, Clone, Eq, PartialEq, Debug)]
-enum StringKind {
-    /// `'...'` or `"..."`
-    Single(QuoteKind),
-    /// `'''...'''` or `"""..."""`
-    Triple(QuoteKind),
-}
-
-impl StringKind {
-    const fn quote_kind(self) -> QuoteKind {
-        match self {
-            StringKind::Single(kind) => kind,
-            StringKind::Triple(kind) => kind,
-        }
-    }
-
-    const fn is_single(self) -> bool {
-        matches!(self, StringKind::Single(_))
-    }
-}
-
-#[derive(Copy, Clone, Eq, PartialEq, Debug)]
-enum QuoteKind {
-    /// `'`
-    Single,
-
-    /// `"`
-    Double,
-}
-
-impl QuoteKind {
-    const fn as_char(self) -> char {
-        match self {
-            QuoteKind::Single => '\'',
-            QuoteKind::Double => '"',
-        }
-    }
-}
 
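Note on the comment lookup in `next_token` above: because the comment ranges are pre-sorted, a `partition_point` binary search finds the only comment that could end at the current back offset, and `previous_comment_idx` memoizes the result across calls. A minimal sketch of that lookup, outside the diff; `comment_ending_at` is a hypothetical name, not part of the patch:

```rust
use ruff_text_size::{TextRange, TextSize};

/// Returns the comment that ends exactly at `back_offset`, if any.
/// Assumes `comment_ranges` is sorted and non-overlapping, as the
/// `CommentRanges` built by the indexer guarantees.
fn comment_ending_at(comment_ranges: &[TextRange], back_offset: TextSize) -> Option<TextRange> {
    // Number of comments that end at or before `back_offset` (binary search).
    let idx = comment_ranges.partition_point(|comment| comment.end() <= back_offset);
    // Only the last of those candidates can end exactly at `back_offset`.
    idx.checked_sub(1)
        .map(|i| comment_ranges[i])
        .filter(|comment| comment.end() == back_offset)
}
```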
 #[cfg(test)]
 mod tests {
     use insta::assert_debug_snapshot;
 
+    use ruff_python_parser::lexer::lex;
+    use ruff_python_parser::{Mode, Tok};
     use ruff_text_size::{TextLen, TextRange, TextSize};
 
     use crate::tokenizer::{lines_after, lines_before, SimpleToken, SimpleTokenizer};
+    use crate::{BackwardsTokenizer, SimpleTokenKind};
 
     struct TokenizationTestCase {
         source: &'static str,
@@ -1167,9 +1001,17 @@ mod tests {
     }
 
     fn tokenize_reverse(&self) -> Vec<SimpleToken> {
-        SimpleTokenizer::new(self.source, self.range)
-            .rev()
-            .collect()
+        let comment_ranges: Vec<_> = lex(self.source, Mode::Module)
+            .filter_map(|result| {
+                let (token, range) = result.expect("Input to be a valid python program.");
+                if matches!(token, Tok::Comment(_)) {
+                    Some(range)
+                } else {
+                    None
+                }
+            })
+            .collect();
+        BackwardsTokenizer::new(self.source, self.range, &comment_ranges).collect()
     }
 
     fn tokens(&self) -> &[SimpleToken] {
@@ -1495,4 +1337,22 @@ mod tests {
             1
         );
     }
+
+    #[test]
+    fn test_previous_token_simple() {
+        let cases = &["x = (", "x = ( ", "x = (\n"];
+        for source in cases {
+            let token = BackwardsTokenizer::up_to(source.text_len(), source, &[])
+                .skip_trivia()
+                .next()
+                .unwrap();
+            assert_eq!(
+                token,
+                SimpleToken {
+                    kind: SimpleTokenKind::LParen,
+                    range: TextRange::new(TextSize::new(4), TextSize::new(5)),
+                }
+            );
+        }
+    }
 }
diff --git a/crates/ruff_wasm/Cargo.toml b/crates/ruff_wasm/Cargo.toml
index 4e368dec5f..3ecc3ad702 100644
--- a/crates/ruff_wasm/Cargo.toml
+++ b/crates/ruff_wasm/Cargo.toml
@@ -28,6 +28,7 @@ ruff_python_index = { path = "../ruff_python_index" }
 ruff_python_parser = { path = "../ruff_python_parser" }
 ruff_source_file = { path = "../ruff_source_file" }
 ruff_text_size = { path = "../ruff_text_size" }
+ruff_python_trivia = { path = "../ruff_python_trivia" }
 ruff_workspace = { path = "../ruff_workspace" }
 
 console_error_panic_hook = { version = "0.1.7", optional = true }
diff --git a/crates/ruff_wasm/src/lib.rs b/crates/ruff_wasm/src/lib.rs
index 24d2359325..2ab51cf77a 100644
--- a/crates/ruff_wasm/src/lib.rs
+++ b/crates/ruff_wasm/src/lib.rs
@@ -16,10 +16,10 @@ use ruff_formatter::{FormatResult, Formatted, LineWidth};
 use ruff_python_ast::{Mod, PySourceType};
 use ruff_python_codegen::Stylist;
 use ruff_python_formatter::{format_node, pretty_comments, PyFormatContext, PyFormatOptions};
-use ruff_python_index::{CommentRanges, CommentRangesBuilder, Indexer};
+use ruff_python_index::{CommentRangesBuilder, Indexer};
 use ruff_python_parser::lexer::LexResult;
-use ruff_python_parser::AsMode;
-use ruff_python_parser::{parse_tokens, Mode};
+use ruff_python_parser::{parse_tokens, AsMode, Mode};
+use ruff_python_trivia::CommentRanges;
 use ruff_source_file::{Locator, SourceLocation};
 use ruff_text_size::Ranged;
 use ruff_workspace::configuration::Configuration;
@@ -116,7 +116,7 @@ impl Workspace {
         Ok(Workspace { settings })
     }
 
-    #[wasm_bindgen(js_name=defaultSettings)]
+    #[wasm_bindgen(js_name = defaultSettings)]
     pub fn default_settings() -> Result {
         serde_wasm_bindgen::to_value(&Options {
             // Propagate defaults.
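For reference, a usage sketch of the new API, mirroring the test helper above; `last_non_trivia_token` is a hypothetical function, not part of the patch:

```rust
use ruff_python_parser::lexer::lex;
use ruff_python_parser::{Mode, Tok};
use ruff_python_trivia::{BackwardsTokenizer, SimpleToken};
use ruff_text_size::{TextLen, TextRange};

/// Finds the last non-trivia token in `source`, skipping whitespace,
/// newlines, and (via the precomputed ranges) comments.
fn last_non_trivia_token(source: &str) -> Option<SimpleToken> {
    // A real caller (e.g. the linter) would reuse the comment ranges it already built.
    let comment_ranges: Vec<TextRange> = lex(source, Mode::Module)
        .filter_map(|result| {
            let (token, range) = result.expect("source should be valid Python");
            matches!(token, Tok::Comment(_)).then_some(range)
        })
        .collect();

    BackwardsTokenizer::up_to(source.text_len(), source, &comment_ranges)
        .skip_trivia()
        .next()
}
```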