[pyupgrade] Do not report when a UTF-8 comment is followed by a non-UTF-8 one (UP009) (#14728)

## Summary

Resolves #14704.

## Test Plan

`cargo nextest run` and `cargo insta test`.

---------

Co-authored-by: Micha Reiser <micha@reiser.io>
This commit is contained in:
InSync 2024-12-11 17:30:41 +07:00 committed by GitHub
parent a55722e740
commit c8d505c8ea
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 273 additions and 49 deletions

View file

@ -0,0 +1,3 @@
print('No coding coments here')
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-

View file

@ -0,0 +1,3 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# -*- coding: ascii -*-

View file

@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-

View file

@ -0,0 +1,2 @@
# -*- coding: ascii -*-
# -*- coding: latin -*-

View file

@ -0,0 +1,2 @@
# -*- coding: ascii -*-
# -*- coding: utf-8 -*-

View file

@ -0,0 +1,3 @@
# -*- coding: utf8 -*-
print("the following is not a coding comment")
# -*- coding: ascii -*-

View file

@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
# -*- coding: ascii -*-

View file

@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-

View file

@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# -*- coding: ascii -*-

View file

@ -73,12 +73,7 @@ pub(crate) fn check_tokens(
}
if settings.rules.enabled(Rule::UTF8EncodingDeclaration) {
pyupgrade::rules::unnecessary_coding_comment(
&mut diagnostics,
locator,
indexer,
comment_ranges,
);
pyupgrade::rules::unnecessary_coding_comment(&mut diagnostics, locator, comment_ranges);
}
if settings.rules.enabled(Rule::TabIndentation) {

View file

@ -77,6 +77,18 @@ mod tests {
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_8.py"))]
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_9.py"))]
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_10.py"))]
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_other_other.py"))]
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_other_utf8.py"))]
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_other.py"))]
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_utf8.py"))]
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_utf8_other.py"))]
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_code_other.py"))]
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_code_utf8_utf8.py"))]
#[test_case(
Rule::UTF8EncodingDeclaration,
Path::new("UP009_hashbang_utf8_other.py")
)]
#[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_many_empty_lines.py"))]
#[test_case(Rule::UnicodeKindPrefix, Path::new("UP025.py"))]
#[test_case(Rule::UnnecessaryBuiltinImport, Path::new("UP029.py"))]
#[test_case(Rule::UnnecessaryClassParentheses, Path::new("UP039.py"))]

View file

@ -1,13 +1,13 @@
use std::iter::FusedIterator;
use std::sync::LazyLock;
use regex::Regex;
use ruff_diagnostics::{AlwaysFixableViolation, Diagnostic, Edit, Fix};
use ruff_macros::{derive_message_formats, ViolationMetadata};
use ruff_python_index::Indexer;
use ruff_python_trivia::CommentRanges;
use ruff_source_file::LineRanges;
use ruff_text_size::{Ranged, TextRange};
use ruff_text_size::{Ranged, TextRange, TextSize};
use crate::Locator;
@ -46,57 +46,134 @@ impl AlwaysFixableViolation for UTF8EncodingDeclaration {
// Regex from PEP263.
static CODING_COMMENT_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^[ \t\f]*#.*?coding[:=][ \t]*utf-?8").unwrap());
LazyLock::new(|| Regex::new(r"^[ \t\f]*#.*?coding[:=][ \t]*(?<name>[-_.a-zA-Z0-9]+)").unwrap());
#[derive(Debug)]
enum CodingComment {
/// A UTF-8 encoding declaration.
UTF8(CodingCommentRange),
/// A declaration for any non utf8 encoding
OtherEncoding,
/// Any other comment
NoEncoding,
}
#[derive(Debug)]
struct CodingCommentRange {
comment: TextRange,
line: TextRange,
}
/// UP009
pub(crate) fn unnecessary_coding_comment(
diagnostics: &mut Vec<Diagnostic>,
locator: &Locator,
indexer: &Indexer,
comment_ranges: &CommentRanges,
) {
// The coding comment must be on one of the first two lines. Since each comment spans at least
// one line, we only need to check the first two comments at most.
for comment_range in comment_ranges.iter().take(2) {
// If leading content is not whitespace then it's not a valid coding comment e.g.
// ```
// print(x) # coding=utf8
// ```
let line_range = locator.full_line_range(comment_range.start());
if !locator
.slice(TextRange::new(line_range.start(), comment_range.start()))
.trim()
.is_empty()
{
continue;
}
let mut iter = CodingCommentIterator::new(locator, comment_ranges)
.skip_while(|comment| matches!(comment, CodingComment::NoEncoding));
// If the line is after a continuation then it's not a valid coding comment e.g.
// ```
// x = 1 \
// # coding=utf8
// x = 2
// ```
if indexer
.preceded_by_continuations(line_range.start(), locator.contents())
.is_some()
{
continue;
}
let Some(CodingComment::UTF8(range)) = iter.next() else {
return;
};
if CODING_COMMENT_REGEX.is_match(locator.slice(line_range)) {
#[allow(deprecated)]
let index = locator.compute_line_index(line_range.start());
if index.to_zero_indexed() > 1 {
continue;
}
let line_index = locator.count_lines(TextRange::up_to(range.comment.start()));
let mut diagnostic = Diagnostic::new(UTF8EncodingDeclaration, *comment_range);
diagnostic.set_fix(Fix::safe_edit(Edit::deletion(
line_range.start(),
line_range.end(),
)));
diagnostics.push(diagnostic);
// Comment must be on the first or second line
if line_index > 1 {
return;
}
// ```python
// # -*- coding: utf-8 -*-
// # -*- coding: latin-1 -*-
// ```
// or
// ```python
// # -*- coding: utf-8 -*-
// # comment
// # -*- coding: latin-1 -*-
// ```
if matches!(
(iter.next(), iter.next()),
(Some(CodingComment::OtherEncoding), _)
| (
Some(CodingComment::NoEncoding),
Some(CodingComment::OtherEncoding)
)
) {
return;
}
let fix = Fix::safe_edit(Edit::range_deletion(range.line));
let diagnostic = Diagnostic::new(UTF8EncodingDeclaration, range.comment);
diagnostics.push(diagnostic.with_fix(fix));
}
struct CodingCommentIterator<'a> {
/// End offset of the last comment trivia or `None` if there
/// was any non-comment comment since the iterator started (e.g. a print statement)
last_trivia_end: Option<TextSize>,
locator: &'a Locator<'a>,
comments: std::slice::Iter<'a, TextRange>,
}
impl<'a> CodingCommentIterator<'a> {
fn new(locator: &'a Locator<'a>, comments: &'a CommentRanges) -> Self {
Self {
last_trivia_end: Some(locator.bom_start_offset()),
locator,
comments: comments.iter(),
}
}
}
impl Iterator for CodingCommentIterator<'_> {
type Item = CodingComment;
fn next(&mut self) -> Option<Self::Item> {
let comment = self.comments.next()?;
let line_range = self.locator.full_line_range(comment.start());
// If leading content is not whitespace then it's not a valid coding comment e.g.
// ```py
// print(x) # coding=utf8
// ```
// or
// ```python
// print(test)
// # -*- coding: utf-8 -*-
// ```
let last_trivia_end = self.last_trivia_end.take()?;
let before_hash_sign = self
.locator
.slice(TextRange::new(last_trivia_end, comment.start()));
if !before_hash_sign.trim().is_empty() {
return None;
}
self.last_trivia_end = Some(comment.end());
let result = if let Some(parts_of_interest) =
CODING_COMMENT_REGEX.captures(self.locator.slice(line_range))
{
let coding_name = parts_of_interest.name("name").unwrap();
match coding_name.as_str() {
"utf8" | "utf-8" => CodingComment::UTF8(CodingCommentRange {
comment: comment.range(),
line: line_range,
}),
_ => CodingComment::OtherEncoding,
}
} else {
CodingComment::NoEncoding
};
Some(result)
}
}
impl FusedIterator for CodingCommentIterator<'_> {}

View file

@ -0,0 +1,5 @@
---
source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
snapshot_kind: text
---

View file

@ -0,0 +1,5 @@
---
source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
snapshot_kind: text
---

View file

@ -0,0 +1,5 @@
---
source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
snapshot_kind: text
---

View file

@ -0,0 +1,5 @@
---
source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
snapshot_kind: text
---

View file

@ -0,0 +1,5 @@
---
source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
snapshot_kind: text
---

View file

@ -0,0 +1,17 @@
---
source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
snapshot_kind: text
---
UP009_utf8_code_other.py:1:1: UP009 [*] UTF-8 encoding declaration is unnecessary
|
1 | # -*- coding: utf8 -*-
| ^^^^^^^^^^^^^^^^^^^^^^ UP009
2 | print("the following is not a coding comment")
3 | # -*- coding: ascii -*-
|
= help: Remove unnecessary coding comment
Safe fix
1 |-# -*- coding: utf8 -*-
2 1 | print("the following is not a coding comment")
3 2 | # -*- coding: ascii -*-

View file

@ -0,0 +1,5 @@
---
source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
snapshot_kind: text
---

View file

@ -0,0 +1,15 @@
---
source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
snapshot_kind: text
---
UP009_utf8_utf8.py:1:1: UP009 [*] UTF-8 encoding declaration is unnecessary
|
1 | # -*- coding: utf-8 -*-
| ^^^^^^^^^^^^^^^^^^^^^^^ UP009
2 | # -*- coding: utf-8 -*-
|
= help: Remove unnecessary coding comment
Safe fix
1 1 | # -*- coding: utf-8 -*-
2 |-# -*- coding: utf-8 -*-

View file

@ -0,0 +1,17 @@
---
source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
snapshot_kind: text
---
UP009_utf8_utf8_other.py:1:1: UP009 [*] UTF-8 encoding declaration is unnecessary
|
1 | # -*- coding: utf-8 -*-
| ^^^^^^^^^^^^^^^^^^^^^^^ UP009
2 | # -*- coding: utf-8 -*-
3 | # -*- coding: ascii -*-
|
= help: Remove unnecessary coding comment
Safe fix
1 1 | # -*- coding: utf-8 -*-
2 |-# -*- coding: utf-8 -*-
3 2 | # -*- coding: ascii -*-

View file

@ -295,6 +295,44 @@ pub trait LineRanges {
/// ## Panics
/// If the start or end of `range` is out of bounds.
fn full_lines_str(&self, range: TextRange) -> &str;
/// The number of lines `range` spans.
///
/// ## Examples
///
/// ```
/// # use ruff_text_size::{Ranged, TextRange};
/// # use ruff_source_file::LineRanges;
///
/// assert_eq!("a\nb".count_lines(TextRange::up_to(1.into())), 0);
/// assert_eq!("a\nb\r\nc".count_lines(TextRange::up_to(3.into())), 1, "Up to the end of the second line");
/// assert_eq!("a\nb\r\nc".count_lines(TextRange::up_to(4.into())), 2, "In between the line break characters");
/// assert_eq!("a\nb\r\nc".count_lines(TextRange::up_to(5.into())), 2);
/// assert_eq!("Single line".count_lines(TextRange::up_to(13.into())), 0);
/// assert_eq!("out\nof\nbounds end".count_lines(TextRange::up_to(55.into())), 2);
/// ```
fn count_lines(&self, range: TextRange) -> u32 {
let mut count = 0;
let mut line_end = self.line_end(range.start());
loop {
let next_line_start = self.full_line_end(line_end);
// Reached the end of the string
if next_line_start == line_end {
break count;
}
// Range ends at the line boundary
if line_end >= range.end() {
break count;
}
count += 1;
line_end = self.line_end(next_line_start);
}
}
}
impl LineRanges for str {