[pyupgrade] Do not report when a UTF-8 comment is followed by a non-UTF-8 one (UP009) (#14728)

## Summary Resolves #14704. ## Test Plan `cargo nextest run` and `cargo insta test`. --------- Co-authored-by: Micha Reiser <micha@reiser.io>
2025-09-26 20:09:22 +00:00 · 2024-12-11 17:30:41 +07:00 · 2024-12-11 17:30:41 +07:00 · c8d505c8ea
commit c8d505c8ea
parent a55722e740
22 changed files with 273 additions and 49 deletions
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_code_utf8_utf8.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_code_utf8_utf8.py
@ -0,0 +1,3 @@
 print('No coding coments here')
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_hashbang_utf8_other.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_hashbang_utf8_other.py
@ -0,0 +1,3 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 # -*- coding: ascii -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_many_empty_lines.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_many_empty_lines.py
@ -0,0 +1,3 @@
 # -*- coding: utf-8 -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_other_other.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_other_other.py
@ -0,0 +1,2 @@
 # -*- coding: ascii -*-
 # -*- coding: latin -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_other_utf8.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_other_utf8.py
@ -0,0 +1,2 @@
 # -*- coding: ascii -*-
 # -*- coding: utf-8 -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_code_other.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_code_other.py
@ -0,0 +1,3 @@
 # -*- coding: utf8 -*-
 print("the following is not a coding comment")
 # -*- coding: ascii -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_other.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_other.py
@ -0,0 +1,2 @@
 # -*- coding: utf-8 -*-
 # -*- coding: ascii -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_utf8.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_utf8.py
@ -0,0 +1,2 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_utf8_other.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_utf8_other.py
@ -0,0 +1,3 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 # -*- coding: ascii -*-
--- a/crates/ruff_linter/src/checkers/tokens.rs
+++ b/crates/ruff_linter/src/checkers/tokens.rs
@ -73,12 +73,7 @@ pub(crate) fn check_tokens(
    }
    if settings.rules.enabled(Rule::UTF8EncodingDeclaration) {
-        pyupgrade::rules::unnecessary_coding_comment(
+        pyupgrade::rules::unnecessary_coding_comment(&mut diagnostics, locator, comment_ranges);
            &mut diagnostics,
            locator,
            indexer,
            comment_ranges,
        );
    }
    if settings.rules.enabled(Rule::TabIndentation) {
--- a/crates/ruff_linter/src/rules/pyupgrade/mod.rs
+++ b/crates/ruff_linter/src/rules/pyupgrade/mod.rs
@ -77,6 +77,18 @@ mod tests {
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_8.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_9.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_10.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_other_other.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_other_utf8.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_other.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_utf8.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_utf8_other.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_code_other.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_code_utf8_utf8.py"))]
    #[test_case(
        Rule::UTF8EncodingDeclaration,
        Path::new("UP009_hashbang_utf8_other.py")
    )]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_many_empty_lines.py"))]
    #[test_case(Rule::UnicodeKindPrefix, Path::new("UP025.py"))]
    #[test_case(Rule::UnnecessaryBuiltinImport, Path::new("UP029.py"))]
    #[test_case(Rule::UnnecessaryClassParentheses, Path::new("UP039.py"))]
--- a/crates/ruff_linter/src/rules/pyupgrade/rules/unnecessary_coding_comment.rs
+++ b/crates/ruff_linter/src/rules/pyupgrade/rules/unnecessary_coding_comment.rs
@ -1,13 +1,13 @@
 use std::iter::FusedIterator;
 use std::sync::LazyLock;
 use regex::Regex;
 use ruff_diagnostics::{AlwaysFixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, ViolationMetadata};
 use ruff_python_index::Indexer;
 use ruff_python_trivia::CommentRanges;
 use ruff_source_file::LineRanges;
-use ruff_text_size::{Ranged, TextRange};
+use ruff_text_size::{Ranged, TextRange, TextSize};
 use crate::Locator;
@ -46,57 +46,134 @@ impl AlwaysFixableViolation for UTF8EncodingDeclaration {
 // Regex from PEP263.
 static CODING_COMMENT_REGEX: LazyLock<Regex> =
-    LazyLock::new(|| Regex::new(r"^[ \t\f]*#.*?coding[:=][ \t]*utf-?8").unwrap());
+    LazyLock::new(|| Regex::new(r"^[ \t\f]*#.*?coding[:=][ \t]*(?<name>[-_.a-zA-Z0-9]+)").unwrap());
 #[derive(Debug)]
 enum CodingComment {
    /// A UTF-8 encoding declaration.
    UTF8(CodingCommentRange),
    /// A declaration for any non utf8 encoding
    OtherEncoding,
    /// Any other comment
    NoEncoding,
 }
 #[derive(Debug)]
 struct CodingCommentRange {
    comment: TextRange,
    line: TextRange,
 }
 /// UP009
 pub(crate) fn unnecessary_coding_comment(
    diagnostics: &mut Vec<Diagnostic>,
    locator: &Locator,
    indexer: &Indexer,
    comment_ranges: &CommentRanges,
 ) {
-    // The coding comment must be on one of the first two lines. Since each comment spans at least
+    let mut iter = CodingCommentIterator::new(locator, comment_ranges)
-    // one line, we only need to check the first two comments at most.
+        .skip_while(|comment| matches!(comment, CodingComment::NoEncoding));
    for comment_range in comment_ranges.iter().take(2) {
        // If leading content is not whitespace then it's not a valid coding comment e.g.
        // ```
        // print(x) # coding=utf8
        // ```
        let line_range = locator.full_line_range(comment_range.start());
        if !locator
            .slice(TextRange::new(line_range.start(), comment_range.start()))
            .trim()
            .is_empty()
        {
            continue;
        }
-        // If the line is after a continuation then it's not a valid coding comment e.g.
+    let Some(CodingComment::UTF8(range)) = iter.next() else {
-        // ```
+        return;
-        // x = 1 \
+    };
        //    # coding=utf8
        // x = 2
        // ```
        if indexer
            .preceded_by_continuations(line_range.start(), locator.contents())
            .is_some()
        {
            continue;
        }
-        if CODING_COMMENT_REGEX.is_match(locator.slice(line_range)) {
+    let line_index = locator.count_lines(TextRange::up_to(range.comment.start()));
            #[allow(deprecated)]
            let index = locator.compute_line_index(line_range.start());
            if index.to_zero_indexed() > 1 {
                continue;
            }
-            let mut diagnostic = Diagnostic::new(UTF8EncodingDeclaration, *comment_range);
+    // Comment must be on the first or second line
-            diagnostic.set_fix(Fix::safe_edit(Edit::deletion(
+    if line_index > 1 {
-                line_range.start(),
+        return;
-                line_range.end(),
+    }
-            )));
+
-            diagnostics.push(diagnostic);
+    // ```python
    // # -*- coding: utf-8 -*-
    // # -*- coding: latin-1 -*-
    // ```
    // or
    // ```python
    // # -*- coding: utf-8 -*-
    // # comment
    // # -*- coding: latin-1 -*-
    // ```
    if matches!(
        (iter.next(), iter.next()),
        (Some(CodingComment::OtherEncoding), _)
            | (
                Some(CodingComment::NoEncoding),
                Some(CodingComment::OtherEncoding)
            )
    ) {
        return;
    }
    let fix = Fix::safe_edit(Edit::range_deletion(range.line));
    let diagnostic = Diagnostic::new(UTF8EncodingDeclaration, range.comment);
    diagnostics.push(diagnostic.with_fix(fix));
 }
 struct CodingCommentIterator<'a> {
    /// End offset of the last comment trivia or `None` if there
    /// was any non-comment comment since the iterator started (e.g. a print statement)
    last_trivia_end: Option<TextSize>,
    locator: &'a Locator<'a>,
    comments: std::slice::Iter<'a, TextRange>,
 }
 impl<'a> CodingCommentIterator<'a> {
    fn new(locator: &'a Locator<'a>, comments: &'a CommentRanges) -> Self {
        Self {
            last_trivia_end: Some(locator.bom_start_offset()),
            locator,
            comments: comments.iter(),
        }
    }
 }
 impl Iterator for CodingCommentIterator<'_> {
    type Item = CodingComment;
    fn next(&mut self) -> Option<Self::Item> {
        let comment = self.comments.next()?;
        let line_range = self.locator.full_line_range(comment.start());
        // If leading content is not whitespace then it's not a valid coding comment e.g.
        // ```py
        // print(x) # coding=utf8
        // ```
        // or
        // ```python
        // print(test)
        // # -*- coding: utf-8 -*-
        // ```
        let last_trivia_end = self.last_trivia_end.take()?;
        let before_hash_sign = self
            .locator
            .slice(TextRange::new(last_trivia_end, comment.start()));
        if !before_hash_sign.trim().is_empty() {
            return None;
        }
        self.last_trivia_end = Some(comment.end());
        let result = if let Some(parts_of_interest) =
            CODING_COMMENT_REGEX.captures(self.locator.slice(line_range))
        {
            let coding_name = parts_of_interest.name("name").unwrap();
            match coding_name.as_str() {
                "utf8" | "utf-8" => CodingComment::UTF8(CodingCommentRange {
                    comment: comment.range(),
                    line: line_range,
                }),
                _ => CodingComment::OtherEncoding,
            }
        } else {
            CodingComment::NoEncoding
        };
        Some(result)
    }
 }
 impl FusedIterator for CodingCommentIterator<'_> {}
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_code_utf8_utf8.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_code_utf8_utf8.py.snap
@ -0,0 +1,5 @@
 ---
 source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
 snapshot_kind: text
 ---
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_hashbang_utf8_other.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_hashbang_utf8_other.py.snap
@ -0,0 +1,5 @@
 ---
 source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
 snapshot_kind: text
 ---
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_many_empty_lines.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_many_empty_lines.py.snap
@ -0,0 +1,5 @@
 ---
 source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
 snapshot_kind: text
 ---
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_other_other.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_other_other.py.snap
@ -0,0 +1,5 @@
 ---
 source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
 snapshot_kind: text
 ---
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_other_utf8.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_other_utf8.py.snap
@ -0,0 +1,5 @@
 ---
 source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
 snapshot_kind: text
 ---
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_code_other.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_code_other.py.snap
@ -0,0 +1,17 @@
 ---
 source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
 snapshot_kind: text
 ---
 UP009_utf8_code_other.py:1:1: UP009 [*] UTF-8 encoding declaration is unnecessary
  |
 1 | # -*- coding: utf8 -*-
  | ^^^^^^^^^^^^^^^^^^^^^^ UP009
 2 | print("the following is not a coding comment")
 3 | # -*- coding: ascii -*-
  |
  = help: Remove unnecessary coding comment
 ℹ Safe fix
 1   |-# -*- coding: utf8 -*-
 2 1 | print("the following is not a coding comment")
 3 2 | # -*- coding: ascii -*-
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_other.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_other.py.snap
@ -0,0 +1,5 @@
 ---
 source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
 snapshot_kind: text
 ---
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_utf8.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_utf8.py.snap
@ -0,0 +1,15 @@
 ---
 source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
 snapshot_kind: text
 ---
 UP009_utf8_utf8.py:1:1: UP009 [*] UTF-8 encoding declaration is unnecessary
  |
 1 | # -*- coding: utf-8 -*-
  | ^^^^^^^^^^^^^^^^^^^^^^^ UP009
 2 | # -*- coding: utf-8 -*-
  |
  = help: Remove unnecessary coding comment
 ℹ Safe fix
 1 1 | # -*- coding: utf-8 -*-
 2   |-# -*- coding: utf-8 -*-
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_utf8_other.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_utf8_other.py.snap
@ -0,0 +1,17 @@
 ---
 source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
 snapshot_kind: text
 ---
 UP009_utf8_utf8_other.py:1:1: UP009 [*] UTF-8 encoding declaration is unnecessary
  |
 1 | # -*- coding: utf-8 -*-
  | ^^^^^^^^^^^^^^^^^^^^^^^ UP009
 2 | # -*- coding: utf-8 -*-
 3 | # -*- coding: ascii -*-
  |
  = help: Remove unnecessary coding comment
 ℹ Safe fix
 1 1 | # -*- coding: utf-8 -*-
 2   |-# -*- coding: utf-8 -*-
 3 2 | # -*- coding: ascii -*-
--- a/crates/ruff_source_file/src/line_ranges.rs
+++ b/crates/ruff_source_file/src/line_ranges.rs
@ -295,6 +295,44 @@ pub trait LineRanges {
    /// ## Panics
    /// If the start or end of `range` is out of bounds.
    fn full_lines_str(&self, range: TextRange) -> &str;
    /// The number of lines `range` spans.
    ///
    /// ## Examples
    ///
    /// ```
    /// # use ruff_text_size::{Ranged, TextRange};
    /// # use ruff_source_file::LineRanges;
    ///
    /// assert_eq!("a\nb".count_lines(TextRange::up_to(1.into())), 0);
    /// assert_eq!("a\nb\r\nc".count_lines(TextRange::up_to(3.into())), 1, "Up to the end of the second line");
    /// assert_eq!("a\nb\r\nc".count_lines(TextRange::up_to(4.into())), 2, "In between the line break characters");
    /// assert_eq!("a\nb\r\nc".count_lines(TextRange::up_to(5.into())), 2);
    /// assert_eq!("Single line".count_lines(TextRange::up_to(13.into())), 0);
    /// assert_eq!("out\nof\nbounds end".count_lines(TextRange::up_to(55.into())), 2);
    /// ```
    fn count_lines(&self, range: TextRange) -> u32 {
        let mut count = 0;
        let mut line_end = self.line_end(range.start());
        loop {
            let next_line_start = self.full_line_end(line_end);
            // Reached the end of the string
            if next_line_start == line_end {
                break count;
            }
            // Range ends at the line boundary
            if line_end >= range.end() {
                break count;
            }
            count += 1;
            line_end = self.line_end(next_line_start);
        }
    }
 }
 impl LineRanges for str {
		`@ -0,0 +1,2 @@`
							`# -- coding: ascii --`
							`# -- coding: latin --`
		`@ -0,0 +1,2 @@`
							`# -- coding: ascii --`
							`# -- coding: utf-8 --`