Introduce PythonWhitespace to confine trim operations to Python whitespace (#4994)

## Summary We use `.trim()` and friends in a bunch of places, to strip whitespace from source code. However, not all Unicode whitespace characters are considered "whitespace" in Python, which only supports the standard space, tab, and form-feed characters. This PR audits our usages of `.trim()`, `.trim_start()`, `.trim_end()`, and `char::is_whitespace`, and replaces them as appropriate with a new `.trim_whitespace()` analogues, powered by a `PythonWhitespace` trait. In general, the only place that should continue to use `.trim()` is content within docstrings, which don't need to adhere to Python's semantic definitions of whitespace. Closes #4991.
2025-11-14 18:01:43 +00:00 · 2023-06-09 21:44:50 -04:00 · 2023-06-09 21:44:50 -04:00 · f401050878
commit f401050878
parent c1ac50093c
14 changed files with 64 additions and 32 deletions
--- a/crates/ruff/src/autofix/edits.rs
+++ b/crates/ruff/src/autofix/edits.rs
@ -7,7 +7,7 @@ use rustpython_parser::{lexer, Mode, Tok};
 use ruff_diagnostics::Edit;
 use ruff_python_ast::helpers;
 use ruff_python_ast::source_code::{Indexer, Locator, Stylist};
-use ruff_python_whitespace::NewlineWithTrailingNewline;
+use ruff_python_whitespace::{is_python_whitespace, NewlineWithTrailingNewline, PythonWhitespace};
 use crate::autofix::codemods;
@ -242,7 +242,7 @@ fn trailing_semicolon(stmt: &Stmt, locator: &Locator) -> Option<TextSize> {
    let contents = locator.after(stmt.end());
    for line in NewlineWithTrailingNewline::from(contents) {
-        let trimmed = line.trim_start();
+        let trimmed = line.trim_whitespace_start();
        if trimmed.starts_with(';') {
            let colon_offset = line.text_len() - trimmed.text_len();
@ -262,7 +262,7 @@ fn next_stmt_break(semicolon: TextSize, locator: &Locator) -> TextSize {
    let contents = &locator.contents()[usize::from(start_location)..];
    for line in NewlineWithTrailingNewline::from(contents) {
-        let trimmed = line.trim();
+        let trimmed = line.trim_whitespace();
        // Skip past any continuations.
        if trimmed.starts_with('\\') {
            continue;
@ -276,7 +276,7 @@ fn next_stmt_break(semicolon: TextSize, locator: &Locator) -> TextSize {
            } else {
                // Otherwise, find the start of the next statement. (Or, anything that isn't
                // whitespace.)
-                let relative_offset = line.find(|c: char| !c.is_whitespace()).unwrap();
+                let relative_offset = line.find(|c: char| !is_python_whitespace(c)).unwrap();
                line.start() + TextSize::try_from(relative_offset).unwrap()
            };
    }
--- a/crates/ruff/src/importer/insertion.rs
+++ b/crates/ruff/src/importer/insertion.rs
@ -8,7 +8,7 @@ use rustpython_parser::{lexer, Mode, Tok};
 use ruff_diagnostics::Edit;
 use ruff_python_ast::helpers::is_docstring_stmt;
 use ruff_python_ast::source_code::{Locator, Stylist};
-use ruff_python_whitespace::UniversalNewlineIterator;
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator};
 use ruff_textwrap::indent;
 #[derive(Debug, Clone, PartialEq, Eq)]
@ -69,7 +69,7 @@ impl<'a> Insertion<'a> {
        // Skip over commented lines.
        for line in UniversalNewlineIterator::with_offset(locator.after(location), location) {
-            if line.trim_start().starts_with('#') {
+            if line.trim_whitespace_start().starts_with('#') {
                location = line.full_end();
            } else {
                break;
--- a/crates/ruff/src/noqa.rs
+++ b/crates/ruff/src/noqa.rs
@ -12,7 +12,7 @@ use ruff_text_size::{TextLen, TextRange, TextSize};
 use ruff_diagnostics::Diagnostic;
 use ruff_python_ast::source_code::Locator;
-use ruff_python_whitespace::LineEnding;
+use ruff_python_whitespace::{LineEnding, PythonWhitespace};
 use crate::codes::NoqaCode;
 use crate::registry::{AsRule, Rule, RuleSet};
@ -87,7 +87,7 @@ enum ParsedExemption<'a> {
 /// Return a [`ParsedExemption`] for a given comment line.
 fn parse_file_exemption(line: &str) -> ParsedExemption {
-    let line = line.trim_start();
+    let line = line.trim_whitespace_start();
    if line.starts_with("# flake8: noqa")
        || line.starts_with("# flake8: NOQA")
--- a/crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs
+++ b/crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs
@ -3,6 +3,7 @@ use rustpython_parser::ast::{self, Constant, Decorator, Expr, Keyword};
 use ruff_python_ast::call_path::{collect_call_path, CallPath};
 use ruff_python_ast::helpers::map_callable;
 use ruff_python_semantic::model::SemanticModel;
 use ruff_python_whitespace::PythonWhitespace;
 pub(super) fn get_mark_decorators(
    decorators: &[Decorator],
@ -81,7 +82,7 @@ pub(super) fn split_names(names: &str) -> Vec<&str> {
    names
        .split(',')
        .filter_map(|s| {
-            let trimmed = s.trim();
+            let trimmed = s.trim_whitespace();
            if trimmed.is_empty() {
                None
            } else {
--- a/crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs
+++ b/crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs
@ -11,6 +11,7 @@ use crate::autofix::codemods::CodegenStylist;
 use ruff_diagnostics::Edit;
 use ruff_python_ast::source_code::{Locator, Stylist};
 use ruff_python_ast::whitespace;
 use ruff_python_whitespace::PythonWhitespace;
 use crate::cst::matchers::{match_function_def, match_if, match_indented_block, match_statement};
@ -44,7 +45,7 @@ pub(crate) fn fix_nested_if_statements(
    let contents = locator.lines(stmt.range());
    // Handle `elif` blocks differently; detect them upfront.
-    let is_elif = contents.trim_start().starts_with("elif");
+    let is_elif = contents.trim_whitespace_start().starts_with("elif");
    // If this is an `elif`, we have to remove the `elif` keyword for now. (We'll
    // restore the `el` later on.)
--- a/crates/ruff/src/rules/isort/helpers.rs
+++ b/crates/ruff/src/rules/isort/helpers.rs
@ -2,7 +2,7 @@ use rustpython_parser::ast::{Ranged, Stmt};
 use rustpython_parser::{lexer, Mode, Tok};
 use ruff_python_ast::source_code::Locator;
-use ruff_python_whitespace::UniversalNewlines;
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines};
 use crate::rules::isort::types::TrailingComma;
@ -63,7 +63,7 @@ pub(super) fn has_comment_break(stmt: &Stmt, locator: &Locator) -> bool {
    //   def f(): pass
    let mut seen_blank = false;
    for line in locator.up_to(stmt.start()).universal_newlines().rev() {
-        let line = line.trim();
+        let line = line.trim_whitespace();
        if seen_blank {
            if line.starts_with('#') {
                return true;
--- a/crates/ruff/src/rules/isort/rules/organize_imports.rs
+++ b/crates/ruff/src/rules/isort/rules/organize_imports.rs
@ -10,7 +10,7 @@ use ruff_python_ast::helpers::{
    followed_by_multi_statement_line, preceded_by_multi_statement_line, trailing_lines_end,
 };
 use ruff_python_ast::source_code::{Indexer, Locator, Stylist};
-use ruff_python_whitespace::{leading_indentation, UniversalNewlines};
+use ruff_python_whitespace::{leading_indentation, PythonWhitespace, UniversalNewlines};
 use ruff_textwrap::indent;
 use crate::line_width::LineWidth;
@ -72,7 +72,9 @@ fn matches_ignoring_indentation(val1: &str, val2: &str) -> bool {
    val1.universal_newlines()
        .zip_longest(val2.universal_newlines())
        .all(|pair| match pair {
-            EitherOrBoth::Both(line1, line2) => line1.trim_start() == line2.trim_start(),
+            EitherOrBoth::Both(line1, line2) => {
                line1.trim_whitespace_start() == line2.trim_whitespace_start()
            }
            _ => false,
        })
 }
--- a/crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs
@ -25,6 +25,7 @@ pub(crate) use missing_whitespace_around_operator::{
    MissingWhitespaceAroundBitwiseOrShiftOperator, MissingWhitespaceAroundModuloOperator,
    MissingWhitespaceAroundOperator,
 };
 use ruff_python_whitespace::is_python_whitespace;
 pub(crate) use space_around_operator::{
    space_around_operator, MultipleSpacesAfterOperator, MultipleSpacesBeforeOperator,
    TabAfterOperator, TabBeforeOperator,
@ -378,7 +379,7 @@ impl Whitespace {
                len += c.text_len();
            } else if matches!(c, '\n' | '\r') {
                break;
-            } else if c.is_whitespace() {
+            } else if is_python_whitespace(c) {
                count += 1;
                len += c.text_len();
            } else {
@ -409,7 +410,7 @@ impl Whitespace {
            } else if matches!(c, '\n' | '\r') {
                // Indent
                return (Self::None, TextSize::default());
-            } else if c.is_whitespace() {
+            } else if is_python_whitespace(c) {
                count += 1;
                len += c.text_len();
            } else {
--- a/crates/ruff/src/rules/pycodestyle/rules/logical_lines/whitespace_before_comment.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/logical_lines/whitespace_before_comment.rs
@ -4,6 +4,7 @@ use ruff_diagnostics::Violation;
 use ruff_macros::{derive_message_formats, violation};
 use ruff_python_ast::source_code::Locator;
 use ruff_python_ast::token_kind::TokenKind;
 use ruff_python_whitespace::PythonWhitespace;
 use crate::checkers::logical_lines::LogicalLinesContext;
 use crate::rules::pycodestyle::rules::logical_lines::LogicalLine;
@ -156,7 +157,7 @@ pub(crate) fn whitespace_before_comment(
            ));
            let token_text = locator.slice(range);
-            let is_inline_comment = !line_text.trim().is_empty();
+            let is_inline_comment = !line_text.trim_whitespace().is_empty();
            if is_inline_comment {
                if range.start() - prev_end < "  ".text_len() {
                    context.push(
--- a/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_class.rs
+++ b/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_class.rs
@ -4,7 +4,7 @@ use rustpython_parser::ast::Ranged;
 use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, violation};
 use ruff_python_semantic::definition::{Definition, Member, MemberKind};
-use ruff_python_whitespace::{UniversalNewlineIterator, UniversalNewlines};
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator, UniversalNewlines};
 use crate::checkers::ast::Checker;
 use crate::docstrings::Docstring;
@ -133,10 +133,9 @@ pub(crate) fn blank_before_after_class(checker: &mut Checker, docstring: &Docstr
            .locator
            .slice(TextRange::new(docstring.end(), stmt.end()));
-        let all_blank_after = after
+        let all_blank_after = after.universal_newlines().skip(1).all(|line| {
-            .universal_newlines()
+            line.trim_whitespace().is_empty() || line.trim_whitespace_start().starts_with('#')
-            .skip(1)
+        });
            .all(|line| line.trim().is_empty() || line.trim_start().starts_with('#'));
        if all_blank_after {
            return;
        }
--- a/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_function.rs
+++ b/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_function.rs
@ -6,7 +6,7 @@ use rustpython_parser::ast::Ranged;
 use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, violation};
 use ruff_python_semantic::definition::{Definition, Member, MemberKind};
-use ruff_python_whitespace::{UniversalNewlineIterator, UniversalNewlines};
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator, UniversalNewlines};
 use crate::checkers::ast::Checker;
 use crate::docstrings::Docstring;
@ -102,10 +102,9 @@ pub(crate) fn blank_before_after_function(checker: &mut Checker, docstring: &Doc
            .slice(TextRange::new(docstring.end(), stmt.end()));
        // If the docstring is only followed by blank and commented lines, abort.
-        let all_blank_after = after
+        let all_blank_after = after.universal_newlines().skip(1).all(|line| {
-            .universal_newlines()
+            line.trim_whitespace().is_empty() || line.trim_whitespace_start().starts_with('#')
-            .skip(1)
+        });
            .all(|line| line.trim().is_empty() || line.trim_start().starts_with('#'));
        if all_blank_after {
            return;
        }
@ -129,7 +128,7 @@ pub(crate) fn blank_before_after_function(checker: &mut Checker, docstring: &Doc
        // Avoid violations for blank lines followed by inner functions or classes.
        if blank_lines_after == 1
            && lines
-                .find(|line| !line.trim_start().starts_with('#'))
+                .find(|line| !line.trim_whitespace_start().starts_with('#'))
                .map_or(false, |line| INNER_FUNCTION_OR_CLASS_REGEX.is_match(&line))
        {
            return;
--- a/crates/ruff_python_ast/src/helpers.rs
+++ b/crates/ruff_python_ast/src/helpers.rs
@ -4,7 +4,7 @@ use std::path::Path;
 use itertools::Itertools;
 use log::error;
 use num_traits::Zero;
-use ruff_python_whitespace::UniversalNewlineIterator;
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator};
 use ruff_text_size::{TextRange, TextSize};
 use rustc_hash::{FxHashMap, FxHashSet};
 use rustpython_parser::ast::{
@ -1031,7 +1031,7 @@ pub fn trailing_lines_end(stmt: &Stmt, locator: &Locator) -> TextSize {
    let rest = &locator.contents()[usize::from(line_end)..];
    UniversalNewlineIterator::with_offset(rest, line_end)
-        .take_while(|line| line.trim().is_empty())
+        .take_while(|line| line.trim_whitespace().is_empty())
        .last()
        .map_or(line_end, |l| l.full_end())
 }
--- a/crates/ruff_python_formatter/src/comments/placement.rs
+++ b/crates/ruff_python_formatter/src/comments/placement.rs
@ -4,7 +4,7 @@ use crate::trivia::{SimpleTokenizer, TokenKind};
 use ruff_python_ast::node::AnyNodeRef;
 use ruff_python_ast::source_code::Locator;
 use ruff_python_ast::whitespace;
-use ruff_python_whitespace::UniversalNewlines;
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines};
 use ruff_text_size::{TextRange, TextSize};
 use rustpython_parser::ast::Ranged;
 use std::cmp::Ordering;
@ -986,7 +986,7 @@ fn max_empty_lines(contents: &str) -> usize {
    let mut max_empty_lines = 0;
    for line in contents.universal_newlines().skip(1) {
-        if line.trim().is_empty() {
+        if line.trim_whitespace().is_empty() {
            empty_lines += 1;
        } else {
            max_empty_lines = max_empty_lines.max(empty_lines);
--- a/crates/ruff_python_whitespace/src/whitespace.rs
+++ b/crates/ruff_python_whitespace/src/whitespace.rs
@ -13,3 +13,31 @@ pub fn leading_indentation(line: &str) -> &str {
    line.find(|char: char| !is_python_whitespace(char))
        .map_or(line, |index| &line[..index])
 }
 pub trait PythonWhitespace {
    /// Like `str::trim()`, but only removes whitespace characters that Python considers
    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
    fn trim_whitespace(&self) -> &Self;
    /// Like `str::trim_start()`, but only removes whitespace characters that Python considers
    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
    fn trim_whitespace_start(&self) -> &Self;
    /// Like `str::trim_end()`, but only removes whitespace characters that Python considers
    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
    fn trim_whitespace_end(&self) -> &Self;
 }
 impl PythonWhitespace for str {
    fn trim_whitespace(&self) -> &Self {
        self.trim_matches(is_python_whitespace)
    }
    fn trim_whitespace_start(&self) -> &Self {
        self.trim_start_matches(is_python_whitespace)
    }
    fn trim_whitespace_end(&self) -> &Self {
        self.trim_end_matches(is_python_whitespace)
    }
 }