Introduce PythonWhitespace to confine trim operations to Python whitespace (#4994)

## Summary

We use `.trim()` and friends in a bunch of places, to strip whitespace
from source code. However, not all Unicode whitespace characters are
considered "whitespace" in Python, which only supports the standard
space, tab, and form-feed characters.

This PR audits our usages of `.trim()`, `.trim_start()`, `.trim_end()`,
and `char::is_whitespace`, and replaces them as appropriate with a new
`.trim_whitespace()` analogues, powered by a `PythonWhitespace` trait.

In general, the only place that should continue to use `.trim()` is
content within docstrings, which don't need to adhere to Python's
semantic definitions of whitespace.

Closes #4991.
This commit is contained in:
Charlie Marsh 2023-06-09 21:44:50 -04:00 committed by GitHub
parent c1ac50093c
commit f401050878
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 64 additions and 32 deletions

View file

@ -7,7 +7,7 @@ use rustpython_parser::{lexer, Mode, Tok};
use ruff_diagnostics::Edit; use ruff_diagnostics::Edit;
use ruff_python_ast::helpers; use ruff_python_ast::helpers;
use ruff_python_ast::source_code::{Indexer, Locator, Stylist}; use ruff_python_ast::source_code::{Indexer, Locator, Stylist};
use ruff_python_whitespace::NewlineWithTrailingNewline; use ruff_python_whitespace::{is_python_whitespace, NewlineWithTrailingNewline, PythonWhitespace};
use crate::autofix::codemods; use crate::autofix::codemods;
@ -242,7 +242,7 @@ fn trailing_semicolon(stmt: &Stmt, locator: &Locator) -> Option<TextSize> {
let contents = locator.after(stmt.end()); let contents = locator.after(stmt.end());
for line in NewlineWithTrailingNewline::from(contents) { for line in NewlineWithTrailingNewline::from(contents) {
let trimmed = line.trim_start(); let trimmed = line.trim_whitespace_start();
if trimmed.starts_with(';') { if trimmed.starts_with(';') {
let colon_offset = line.text_len() - trimmed.text_len(); let colon_offset = line.text_len() - trimmed.text_len();
@ -262,7 +262,7 @@ fn next_stmt_break(semicolon: TextSize, locator: &Locator) -> TextSize {
let contents = &locator.contents()[usize::from(start_location)..]; let contents = &locator.contents()[usize::from(start_location)..];
for line in NewlineWithTrailingNewline::from(contents) { for line in NewlineWithTrailingNewline::from(contents) {
let trimmed = line.trim(); let trimmed = line.trim_whitespace();
// Skip past any continuations. // Skip past any continuations.
if trimmed.starts_with('\\') { if trimmed.starts_with('\\') {
continue; continue;
@ -276,7 +276,7 @@ fn next_stmt_break(semicolon: TextSize, locator: &Locator) -> TextSize {
} else { } else {
// Otherwise, find the start of the next statement. (Or, anything that isn't // Otherwise, find the start of the next statement. (Or, anything that isn't
// whitespace.) // whitespace.)
let relative_offset = line.find(|c: char| !c.is_whitespace()).unwrap(); let relative_offset = line.find(|c: char| !is_python_whitespace(c)).unwrap();
line.start() + TextSize::try_from(relative_offset).unwrap() line.start() + TextSize::try_from(relative_offset).unwrap()
}; };
} }

View file

@ -8,7 +8,7 @@ use rustpython_parser::{lexer, Mode, Tok};
use ruff_diagnostics::Edit; use ruff_diagnostics::Edit;
use ruff_python_ast::helpers::is_docstring_stmt; use ruff_python_ast::helpers::is_docstring_stmt;
use ruff_python_ast::source_code::{Locator, Stylist}; use ruff_python_ast::source_code::{Locator, Stylist};
use ruff_python_whitespace::UniversalNewlineIterator; use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator};
use ruff_textwrap::indent; use ruff_textwrap::indent;
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
@ -69,7 +69,7 @@ impl<'a> Insertion<'a> {
// Skip over commented lines. // Skip over commented lines.
for line in UniversalNewlineIterator::with_offset(locator.after(location), location) { for line in UniversalNewlineIterator::with_offset(locator.after(location), location) {
if line.trim_start().starts_with('#') { if line.trim_whitespace_start().starts_with('#') {
location = line.full_end(); location = line.full_end();
} else { } else {
break; break;

View file

@ -12,7 +12,7 @@ use ruff_text_size::{TextLen, TextRange, TextSize};
use ruff_diagnostics::Diagnostic; use ruff_diagnostics::Diagnostic;
use ruff_python_ast::source_code::Locator; use ruff_python_ast::source_code::Locator;
use ruff_python_whitespace::LineEnding; use ruff_python_whitespace::{LineEnding, PythonWhitespace};
use crate::codes::NoqaCode; use crate::codes::NoqaCode;
use crate::registry::{AsRule, Rule, RuleSet}; use crate::registry::{AsRule, Rule, RuleSet};
@ -87,7 +87,7 @@ enum ParsedExemption<'a> {
/// Return a [`ParsedExemption`] for a given comment line. /// Return a [`ParsedExemption`] for a given comment line.
fn parse_file_exemption(line: &str) -> ParsedExemption { fn parse_file_exemption(line: &str) -> ParsedExemption {
let line = line.trim_start(); let line = line.trim_whitespace_start();
if line.starts_with("# flake8: noqa") if line.starts_with("# flake8: noqa")
|| line.starts_with("# flake8: NOQA") || line.starts_with("# flake8: NOQA")

View file

@ -3,6 +3,7 @@ use rustpython_parser::ast::{self, Constant, Decorator, Expr, Keyword};
use ruff_python_ast::call_path::{collect_call_path, CallPath}; use ruff_python_ast::call_path::{collect_call_path, CallPath};
use ruff_python_ast::helpers::map_callable; use ruff_python_ast::helpers::map_callable;
use ruff_python_semantic::model::SemanticModel; use ruff_python_semantic::model::SemanticModel;
use ruff_python_whitespace::PythonWhitespace;
pub(super) fn get_mark_decorators( pub(super) fn get_mark_decorators(
decorators: &[Decorator], decorators: &[Decorator],
@ -81,7 +82,7 @@ pub(super) fn split_names(names: &str) -> Vec<&str> {
names names
.split(',') .split(',')
.filter_map(|s| { .filter_map(|s| {
let trimmed = s.trim(); let trimmed = s.trim_whitespace();
if trimmed.is_empty() { if trimmed.is_empty() {
None None
} else { } else {

View file

@ -11,6 +11,7 @@ use crate::autofix::codemods::CodegenStylist;
use ruff_diagnostics::Edit; use ruff_diagnostics::Edit;
use ruff_python_ast::source_code::{Locator, Stylist}; use ruff_python_ast::source_code::{Locator, Stylist};
use ruff_python_ast::whitespace; use ruff_python_ast::whitespace;
use ruff_python_whitespace::PythonWhitespace;
use crate::cst::matchers::{match_function_def, match_if, match_indented_block, match_statement}; use crate::cst::matchers::{match_function_def, match_if, match_indented_block, match_statement};
@ -44,7 +45,7 @@ pub(crate) fn fix_nested_if_statements(
let contents = locator.lines(stmt.range()); let contents = locator.lines(stmt.range());
// Handle `elif` blocks differently; detect them upfront. // Handle `elif` blocks differently; detect them upfront.
let is_elif = contents.trim_start().starts_with("elif"); let is_elif = contents.trim_whitespace_start().starts_with("elif");
// If this is an `elif`, we have to remove the `elif` keyword for now. (We'll // If this is an `elif`, we have to remove the `elif` keyword for now. (We'll
// restore the `el` later on.) // restore the `el` later on.)

View file

@ -2,7 +2,7 @@ use rustpython_parser::ast::{Ranged, Stmt};
use rustpython_parser::{lexer, Mode, Tok}; use rustpython_parser::{lexer, Mode, Tok};
use ruff_python_ast::source_code::Locator; use ruff_python_ast::source_code::Locator;
use ruff_python_whitespace::UniversalNewlines; use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines};
use crate::rules::isort::types::TrailingComma; use crate::rules::isort::types::TrailingComma;
@ -63,7 +63,7 @@ pub(super) fn has_comment_break(stmt: &Stmt, locator: &Locator) -> bool {
// def f(): pass // def f(): pass
let mut seen_blank = false; let mut seen_blank = false;
for line in locator.up_to(stmt.start()).universal_newlines().rev() { for line in locator.up_to(stmt.start()).universal_newlines().rev() {
let line = line.trim(); let line = line.trim_whitespace();
if seen_blank { if seen_blank {
if line.starts_with('#') { if line.starts_with('#') {
return true; return true;

View file

@ -10,7 +10,7 @@ use ruff_python_ast::helpers::{
followed_by_multi_statement_line, preceded_by_multi_statement_line, trailing_lines_end, followed_by_multi_statement_line, preceded_by_multi_statement_line, trailing_lines_end,
}; };
use ruff_python_ast::source_code::{Indexer, Locator, Stylist}; use ruff_python_ast::source_code::{Indexer, Locator, Stylist};
use ruff_python_whitespace::{leading_indentation, UniversalNewlines}; use ruff_python_whitespace::{leading_indentation, PythonWhitespace, UniversalNewlines};
use ruff_textwrap::indent; use ruff_textwrap::indent;
use crate::line_width::LineWidth; use crate::line_width::LineWidth;
@ -72,7 +72,9 @@ fn matches_ignoring_indentation(val1: &str, val2: &str) -> bool {
val1.universal_newlines() val1.universal_newlines()
.zip_longest(val2.universal_newlines()) .zip_longest(val2.universal_newlines())
.all(|pair| match pair { .all(|pair| match pair {
EitherOrBoth::Both(line1, line2) => line1.trim_start() == line2.trim_start(), EitherOrBoth::Both(line1, line2) => {
line1.trim_whitespace_start() == line2.trim_whitespace_start()
}
_ => false, _ => false,
}) })
} }

View file

@ -25,6 +25,7 @@ pub(crate) use missing_whitespace_around_operator::{
MissingWhitespaceAroundBitwiseOrShiftOperator, MissingWhitespaceAroundModuloOperator, MissingWhitespaceAroundBitwiseOrShiftOperator, MissingWhitespaceAroundModuloOperator,
MissingWhitespaceAroundOperator, MissingWhitespaceAroundOperator,
}; };
use ruff_python_whitespace::is_python_whitespace;
pub(crate) use space_around_operator::{ pub(crate) use space_around_operator::{
space_around_operator, MultipleSpacesAfterOperator, MultipleSpacesBeforeOperator, space_around_operator, MultipleSpacesAfterOperator, MultipleSpacesBeforeOperator,
TabAfterOperator, TabBeforeOperator, TabAfterOperator, TabBeforeOperator,
@ -378,7 +379,7 @@ impl Whitespace {
len += c.text_len(); len += c.text_len();
} else if matches!(c, '\n' | '\r') { } else if matches!(c, '\n' | '\r') {
break; break;
} else if c.is_whitespace() { } else if is_python_whitespace(c) {
count += 1; count += 1;
len += c.text_len(); len += c.text_len();
} else { } else {
@ -409,7 +410,7 @@ impl Whitespace {
} else if matches!(c, '\n' | '\r') { } else if matches!(c, '\n' | '\r') {
// Indent // Indent
return (Self::None, TextSize::default()); return (Self::None, TextSize::default());
} else if c.is_whitespace() { } else if is_python_whitespace(c) {
count += 1; count += 1;
len += c.text_len(); len += c.text_len();
} else { } else {

View file

@ -4,6 +4,7 @@ use ruff_diagnostics::Violation;
use ruff_macros::{derive_message_formats, violation}; use ruff_macros::{derive_message_formats, violation};
use ruff_python_ast::source_code::Locator; use ruff_python_ast::source_code::Locator;
use ruff_python_ast::token_kind::TokenKind; use ruff_python_ast::token_kind::TokenKind;
use ruff_python_whitespace::PythonWhitespace;
use crate::checkers::logical_lines::LogicalLinesContext; use crate::checkers::logical_lines::LogicalLinesContext;
use crate::rules::pycodestyle::rules::logical_lines::LogicalLine; use crate::rules::pycodestyle::rules::logical_lines::LogicalLine;
@ -156,7 +157,7 @@ pub(crate) fn whitespace_before_comment(
)); ));
let token_text = locator.slice(range); let token_text = locator.slice(range);
let is_inline_comment = !line_text.trim().is_empty(); let is_inline_comment = !line_text.trim_whitespace().is_empty();
if is_inline_comment { if is_inline_comment {
if range.start() - prev_end < " ".text_len() { if range.start() - prev_end < " ".text_len() {
context.push( context.push(

View file

@ -4,7 +4,7 @@ use rustpython_parser::ast::Ranged;
use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix}; use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
use ruff_macros::{derive_message_formats, violation}; use ruff_macros::{derive_message_formats, violation};
use ruff_python_semantic::definition::{Definition, Member, MemberKind}; use ruff_python_semantic::definition::{Definition, Member, MemberKind};
use ruff_python_whitespace::{UniversalNewlineIterator, UniversalNewlines}; use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator, UniversalNewlines};
use crate::checkers::ast::Checker; use crate::checkers::ast::Checker;
use crate::docstrings::Docstring; use crate::docstrings::Docstring;
@ -133,10 +133,9 @@ pub(crate) fn blank_before_after_class(checker: &mut Checker, docstring: &Docstr
.locator .locator
.slice(TextRange::new(docstring.end(), stmt.end())); .slice(TextRange::new(docstring.end(), stmt.end()));
let all_blank_after = after let all_blank_after = after.universal_newlines().skip(1).all(|line| {
.universal_newlines() line.trim_whitespace().is_empty() || line.trim_whitespace_start().starts_with('#')
.skip(1) });
.all(|line| line.trim().is_empty() || line.trim_start().starts_with('#'));
if all_blank_after { if all_blank_after {
return; return;
} }

View file

@ -6,7 +6,7 @@ use rustpython_parser::ast::Ranged;
use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix}; use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
use ruff_macros::{derive_message_formats, violation}; use ruff_macros::{derive_message_formats, violation};
use ruff_python_semantic::definition::{Definition, Member, MemberKind}; use ruff_python_semantic::definition::{Definition, Member, MemberKind};
use ruff_python_whitespace::{UniversalNewlineIterator, UniversalNewlines}; use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator, UniversalNewlines};
use crate::checkers::ast::Checker; use crate::checkers::ast::Checker;
use crate::docstrings::Docstring; use crate::docstrings::Docstring;
@ -102,10 +102,9 @@ pub(crate) fn blank_before_after_function(checker: &mut Checker, docstring: &Doc
.slice(TextRange::new(docstring.end(), stmt.end())); .slice(TextRange::new(docstring.end(), stmt.end()));
// If the docstring is only followed by blank and commented lines, abort. // If the docstring is only followed by blank and commented lines, abort.
let all_blank_after = after let all_blank_after = after.universal_newlines().skip(1).all(|line| {
.universal_newlines() line.trim_whitespace().is_empty() || line.trim_whitespace_start().starts_with('#')
.skip(1) });
.all(|line| line.trim().is_empty() || line.trim_start().starts_with('#'));
if all_blank_after { if all_blank_after {
return; return;
} }
@ -129,7 +128,7 @@ pub(crate) fn blank_before_after_function(checker: &mut Checker, docstring: &Doc
// Avoid violations for blank lines followed by inner functions or classes. // Avoid violations for blank lines followed by inner functions or classes.
if blank_lines_after == 1 if blank_lines_after == 1
&& lines && lines
.find(|line| !line.trim_start().starts_with('#')) .find(|line| !line.trim_whitespace_start().starts_with('#'))
.map_or(false, |line| INNER_FUNCTION_OR_CLASS_REGEX.is_match(&line)) .map_or(false, |line| INNER_FUNCTION_OR_CLASS_REGEX.is_match(&line))
{ {
return; return;

View file

@ -4,7 +4,7 @@ use std::path::Path;
use itertools::Itertools; use itertools::Itertools;
use log::error; use log::error;
use num_traits::Zero; use num_traits::Zero;
use ruff_python_whitespace::UniversalNewlineIterator; use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator};
use ruff_text_size::{TextRange, TextSize}; use ruff_text_size::{TextRange, TextSize};
use rustc_hash::{FxHashMap, FxHashSet}; use rustc_hash::{FxHashMap, FxHashSet};
use rustpython_parser::ast::{ use rustpython_parser::ast::{
@ -1031,7 +1031,7 @@ pub fn trailing_lines_end(stmt: &Stmt, locator: &Locator) -> TextSize {
let rest = &locator.contents()[usize::from(line_end)..]; let rest = &locator.contents()[usize::from(line_end)..];
UniversalNewlineIterator::with_offset(rest, line_end) UniversalNewlineIterator::with_offset(rest, line_end)
.take_while(|line| line.trim().is_empty()) .take_while(|line| line.trim_whitespace().is_empty())
.last() .last()
.map_or(line_end, |l| l.full_end()) .map_or(line_end, |l| l.full_end())
} }

View file

@ -4,7 +4,7 @@ use crate::trivia::{SimpleTokenizer, TokenKind};
use ruff_python_ast::node::AnyNodeRef; use ruff_python_ast::node::AnyNodeRef;
use ruff_python_ast::source_code::Locator; use ruff_python_ast::source_code::Locator;
use ruff_python_ast::whitespace; use ruff_python_ast::whitespace;
use ruff_python_whitespace::UniversalNewlines; use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines};
use ruff_text_size::{TextRange, TextSize}; use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::ast::Ranged; use rustpython_parser::ast::Ranged;
use std::cmp::Ordering; use std::cmp::Ordering;
@ -986,7 +986,7 @@ fn max_empty_lines(contents: &str) -> usize {
let mut max_empty_lines = 0; let mut max_empty_lines = 0;
for line in contents.universal_newlines().skip(1) { for line in contents.universal_newlines().skip(1) {
if line.trim().is_empty() { if line.trim_whitespace().is_empty() {
empty_lines += 1; empty_lines += 1;
} else { } else {
max_empty_lines = max_empty_lines.max(empty_lines); max_empty_lines = max_empty_lines.max(empty_lines);

View file

@ -13,3 +13,31 @@ pub fn leading_indentation(line: &str) -> &str {
line.find(|char: char| !is_python_whitespace(char)) line.find(|char: char| !is_python_whitespace(char))
.map_or(line, |index| &line[..index]) .map_or(line, |index| &line[..index])
} }
pub trait PythonWhitespace {
/// Like `str::trim()`, but only removes whitespace characters that Python considers
/// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
fn trim_whitespace(&self) -> &Self;
/// Like `str::trim_start()`, but only removes whitespace characters that Python considers
/// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
fn trim_whitespace_start(&self) -> &Self;
/// Like `str::trim_end()`, but only removes whitespace characters that Python considers
/// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
fn trim_whitespace_end(&self) -> &Self;
}
impl PythonWhitespace for str {
fn trim_whitespace(&self) -> &Self {
self.trim_matches(is_python_whitespace)
}
fn trim_whitespace_start(&self) -> &Self {
self.trim_start_matches(is_python_whitespace)
}
fn trim_whitespace_end(&self) -> &Self {
self.trim_end_matches(is_python_whitespace)
}
}