[pyupgrade] Do not report when a UTF-8 comment is followed by a non-UTF-8 one (UP009) (#14728)

## Summary Resolves #14704. ## Test Plan `cargo nextest run` and `cargo insta test`. --------- Co-authored-by: Micha Reiser <micha@reiser.io>
2025-07-24 13:34:40 +00:00 · 2024-12-11 17:30:41 +07:00 · 2024-12-11 17:30:41 +07:00 · c8d505c8ea
commit c8d505c8ea
parent a55722e740
22 changed files with 273 additions and 49 deletions
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_code_utf8_utf8.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_code_utf8_utf8.py
@ -0,0 +1,3 @@
+print('No coding coments here')
+# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_hashbang_utf8_other.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_hashbang_utf8_other.py
@ -0,0 +1,3 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# -*- coding: ascii -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_many_empty_lines.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_many_empty_lines.py
@ -0,0 +1,3 @@
+
+
+# -*- coding: utf-8 -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_other_other.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_other_other.py
@ -0,0 +1,2 @@
+# -*- coding: ascii -*-
+# -*- coding: latin -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_other_utf8.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_other_utf8.py
@ -0,0 +1,2 @@
+# -*- coding: ascii -*-
+# -*- coding: utf-8 -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_code_other.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_code_other.py
@ -0,0 +1,3 @@
+# -*- coding: utf8 -*-
+print("the following is not a coding comment")
+# -*- coding: ascii -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_other.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_other.py
@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+# -*- coding: ascii -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_utf8.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_utf8.py
@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
--- a/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_utf8_other.py
+++ b/crates/ruff_linter/resources/test/fixtures/pyupgrade/UP009_utf8_utf8_other.py
@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
+# -*- coding: ascii -*-
--- a/crates/ruff_linter/src/checkers/tokens.rs
+++ b/crates/ruff_linter/src/checkers/tokens.rs
@ -73,12 +73,7 @@ pub(crate) fn check_tokens(
    }

    if settings.rules.enabled(Rule::UTF8EncodingDeclaration) {
-        pyupgrade::rules::unnecessary_coding_comment(
-            &mut diagnostics,
-            locator,
-            indexer,
-            comment_ranges,
-        );
+        pyupgrade::rules::unnecessary_coding_comment(&mut diagnostics, locator, comment_ranges);
    }

    if settings.rules.enabled(Rule::TabIndentation) {
--- a/crates/ruff_linter/src/rules/pyupgrade/mod.rs
+++ b/crates/ruff_linter/src/rules/pyupgrade/mod.rs
@ -77,6 +77,18 @@ mod tests {
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_8.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_9.py"))]
    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_10.py"))]
+    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_other_other.py"))]
+    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_other_utf8.py"))]
+    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_other.py"))]
+    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_utf8.py"))]
+    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_utf8_other.py"))]
+    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_utf8_code_other.py"))]
+    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_code_utf8_utf8.py"))]
+    #[test_case(
+        Rule::UTF8EncodingDeclaration,
+        Path::new("UP009_hashbang_utf8_other.py")
+    )]
+    #[test_case(Rule::UTF8EncodingDeclaration, Path::new("UP009_many_empty_lines.py"))]
    #[test_case(Rule::UnicodeKindPrefix, Path::new("UP025.py"))]
    #[test_case(Rule::UnnecessaryBuiltinImport, Path::new("UP029.py"))]
    #[test_case(Rule::UnnecessaryClassParentheses, Path::new("UP039.py"))]
--- a/crates/ruff_linter/src/rules/pyupgrade/rules/unnecessary_coding_comment.rs
+++ b/crates/ruff_linter/src/rules/pyupgrade/rules/unnecessary_coding_comment.rs
@ -1,13 +1,13 @@
+use std::iter::FusedIterator;
 use std::sync::LazyLock;

 use regex::Regex;

 use ruff_diagnostics::{AlwaysFixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, ViolationMetadata};
-use ruff_python_index::Indexer;
 use ruff_python_trivia::CommentRanges;
 use ruff_source_file::LineRanges;
-use ruff_text_size::{Ranged, TextRange};
+use ruff_text_size::{Ranged, TextRange, TextSize};

 use crate::Locator;

@ -46,57 +46,134 @@ impl AlwaysFixableViolation for UTF8EncodingDeclaration {

 // Regex from PEP263.
 static CODING_COMMENT_REGEX: LazyLock<Regex> =
-    LazyLock::new(|| Regex::new(r"^[ \t\f]*#.*?coding[:=][ \t]*utf-?8").unwrap());
+    LazyLock::new(|| Regex::new(r"^[ \t\f]*#.*?coding[:=][ \t]*(?<name>[-_.a-zA-Z0-9]+)").unwrap());
+
+#[derive(Debug)]
+enum CodingComment {
+    /// A UTF-8 encoding declaration.
+    UTF8(CodingCommentRange),
+    /// A declaration for any non utf8 encoding
+    OtherEncoding,
+    /// Any other comment
+    NoEncoding,
+}
+
+#[derive(Debug)]
+struct CodingCommentRange {
+    comment: TextRange,
+    line: TextRange,
+}

 /// UP009
 pub(crate) fn unnecessary_coding_comment(
    diagnostics: &mut Vec<Diagnostic>,
    locator: &Locator,
-    indexer: &Indexer,
    comment_ranges: &CommentRanges,
 ) {
-    // The coding comment must be on one of the first two lines. Since each comment spans at least
-    // one line, we only need to check the first two comments at most.
-    for comment_range in comment_ranges.iter().take(2) {
-        // If leading content is not whitespace then it's not a valid coding comment e.g.
-        // ```
-        // print(x) # coding=utf8
-        // ```
-        let line_range = locator.full_line_range(comment_range.start());
-        if !locator
-            .slice(TextRange::new(line_range.start(), comment_range.start()))
-            .trim()
-            .is_empty()
-        {
-            continue;
-        }
+    let mut iter = CodingCommentIterator::new(locator, comment_ranges)
+        .skip_while(|comment| matches!(comment, CodingComment::NoEncoding));

-        // If the line is after a continuation then it's not a valid coding comment e.g.
-        // ```
-        // x = 1 \
-        //    # coding=utf8
-        // x = 2
-        // ```
-        if indexer
-            .preceded_by_continuations(line_range.start(), locator.contents())
-            .is_some()
-        {
-            continue;
-        }
+    let Some(CodingComment::UTF8(range)) = iter.next() else {
+        return;
+    };

-        if CODING_COMMENT_REGEX.is_match(locator.slice(line_range)) {
-            #[allow(deprecated)]
-            let index = locator.compute_line_index(line_range.start());
-            if index.to_zero_indexed() > 1 {
-                continue;
-            }
+    let line_index = locator.count_lines(TextRange::up_to(range.comment.start()));

-            let mut diagnostic = Diagnostic::new(UTF8EncodingDeclaration, *comment_range);
-            diagnostic.set_fix(Fix::safe_edit(Edit::deletion(
-                line_range.start(),
-                line_range.end(),
-            )));
-            diagnostics.push(diagnostic);
+    // Comment must be on the first or second line
+    if line_index > 1 {
+        return;
+    }
+
+    // ```python
+    // # -*- coding: utf-8 -*-
+    // # -*- coding: latin-1 -*-
+    // ```
+    // or
+    // ```python
+    // # -*- coding: utf-8 -*-
+    // # comment
+    // # -*- coding: latin-1 -*-
+    // ```
+    if matches!(
+        (iter.next(), iter.next()),
+        (Some(CodingComment::OtherEncoding), _)
+            | (
+                Some(CodingComment::NoEncoding),
+                Some(CodingComment::OtherEncoding)
+            )
+    ) {
+        return;
+    }
+
+    let fix = Fix::safe_edit(Edit::range_deletion(range.line));
+    let diagnostic = Diagnostic::new(UTF8EncodingDeclaration, range.comment);
+
+    diagnostics.push(diagnostic.with_fix(fix));
+}
+
+struct CodingCommentIterator<'a> {
+    /// End offset of the last comment trivia or `None` if there
+    /// was any non-comment comment since the iterator started (e.g. a print statement)
+    last_trivia_end: Option<TextSize>,
+    locator: &'a Locator<'a>,
+    comments: std::slice::Iter<'a, TextRange>,
+}
+
+impl<'a> CodingCommentIterator<'a> {
+    fn new(locator: &'a Locator<'a>, comments: &'a CommentRanges) -> Self {
+        Self {
+            last_trivia_end: Some(locator.bom_start_offset()),
+            locator,
+            comments: comments.iter(),
        }
    }
 }
+
+impl Iterator for CodingCommentIterator<'_> {
+    type Item = CodingComment;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let comment = self.comments.next()?;
+        let line_range = self.locator.full_line_range(comment.start());
+
+        // If leading content is not whitespace then it's not a valid coding comment e.g.
+        // ```py
+        // print(x) # coding=utf8
+        // ```
+        // or
+        // ```python
+        // print(test)
+        // # -*- coding: utf-8 -*-
+        // ```
+        let last_trivia_end = self.last_trivia_end.take()?;
+        let before_hash_sign = self
+            .locator
+            .slice(TextRange::new(last_trivia_end, comment.start()));
+
+        if !before_hash_sign.trim().is_empty() {
+            return None;
+        }
+
+        self.last_trivia_end = Some(comment.end());
+
+        let result = if let Some(parts_of_interest) =
+            CODING_COMMENT_REGEX.captures(self.locator.slice(line_range))
+        {
+            let coding_name = parts_of_interest.name("name").unwrap();
+
+            match coding_name.as_str() {
+                "utf8" | "utf-8" => CodingComment::UTF8(CodingCommentRange {
+                    comment: comment.range(),
+                    line: line_range,
+                }),
+                _ => CodingComment::OtherEncoding,
+            }
+        } else {
+            CodingComment::NoEncoding
+        };
+
+        Some(result)
+    }
+}
+
+impl FusedIterator for CodingCommentIterator<'_> {}
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_code_utf8_utf8.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_code_utf8_utf8.py.snap
@ -0,0 +1,5 @@
+---
+source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
+snapshot_kind: text
+---
+
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_hashbang_utf8_other.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_hashbang_utf8_other.py.snap
@ -0,0 +1,5 @@
+---
+source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
+snapshot_kind: text
+---
+
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_many_empty_lines.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_many_empty_lines.py.snap
@ -0,0 +1,5 @@
+---
+source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
+snapshot_kind: text
+---
+
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_other_other.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_other_other.py.snap
@ -0,0 +1,5 @@
+---
+source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
+snapshot_kind: text
+---
+
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_other_utf8.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_other_utf8.py.snap
@ -0,0 +1,5 @@
+---
+source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
+snapshot_kind: text
+---
+
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_code_other.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_code_other.py.snap
@ -0,0 +1,17 @@
+---
+source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
+snapshot_kind: text
+---
+UP009_utf8_code_other.py:1:1: UP009 [*] UTF-8 encoding declaration is unnecessary
+  |
+1 | # -*- coding: utf8 -*-
+  | ^^^^^^^^^^^^^^^^^^^^^^ UP009
+2 | print("the following is not a coding comment")
+3 | # -*- coding: ascii -*-
+  |
+  = help: Remove unnecessary coding comment
+
+ℹ Safe fix
+1   |-# -*- coding: utf8 -*-
+2 1 | print("the following is not a coding comment")
+3 2 | # -*- coding: ascii -*-
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_other.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_other.py.snap
@ -0,0 +1,5 @@
+---
+source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
+snapshot_kind: text
+---
+
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_utf8.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_utf8.py.snap
@ -0,0 +1,15 @@
+---
+source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
+snapshot_kind: text
+---
+UP009_utf8_utf8.py:1:1: UP009 [*] UTF-8 encoding declaration is unnecessary
+  |
+1 | # -*- coding: utf-8 -*-
+  | ^^^^^^^^^^^^^^^^^^^^^^^ UP009
+2 | # -*- coding: utf-8 -*-
+  |
+  = help: Remove unnecessary coding comment
+
+ℹ Safe fix
+1 1 | # -*- coding: utf-8 -*-
+2   |-# -*- coding: utf-8 -*-
--- a/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_utf8_other.py.snap
+++ b/crates/ruff_linter/src/rules/pyupgrade/snapshots/ruff_linterrulespyupgradetestsUP009_utf8_utf8_other.py.snap
@ -0,0 +1,17 @@
+---
+source: crates/ruff_linter/src/rules/pyupgrade/mod.rs
+snapshot_kind: text
+---
+UP009_utf8_utf8_other.py:1:1: UP009 [*] UTF-8 encoding declaration is unnecessary
+  |
+1 | # -*- coding: utf-8 -*-
+  | ^^^^^^^^^^^^^^^^^^^^^^^ UP009
+2 | # -*- coding: utf-8 -*-
+3 | # -*- coding: ascii -*-
+  |
+  = help: Remove unnecessary coding comment
+
+ℹ Safe fix
+1 1 | # -*- coding: utf-8 -*-
+2   |-# -*- coding: utf-8 -*-
+3 2 | # -*- coding: ascii -*-
--- a/crates/ruff_source_file/src/line_ranges.rs
+++ b/crates/ruff_source_file/src/line_ranges.rs
@ -295,6 +295,44 @@ pub trait LineRanges {
    /// ## Panics
    /// If the start or end of `range` is out of bounds.
    fn full_lines_str(&self, range: TextRange) -> &str;
+
+    /// The number of lines `range` spans.
+    ///
+    /// ## Examples
+    ///
+    /// ```
+    /// # use ruff_text_size::{Ranged, TextRange};
+    /// # use ruff_source_file::LineRanges;
+    ///
+    /// assert_eq!("a\nb".count_lines(TextRange::up_to(1.into())), 0);
+    /// assert_eq!("a\nb\r\nc".count_lines(TextRange::up_to(3.into())), 1, "Up to the end of the second line");
+    /// assert_eq!("a\nb\r\nc".count_lines(TextRange::up_to(4.into())), 2, "In between the line break characters");
+    /// assert_eq!("a\nb\r\nc".count_lines(TextRange::up_to(5.into())), 2);
+    /// assert_eq!("Single line".count_lines(TextRange::up_to(13.into())), 0);
+    /// assert_eq!("out\nof\nbounds end".count_lines(TextRange::up_to(55.into())), 2);
+    /// ```
+    fn count_lines(&self, range: TextRange) -> u32 {
+        let mut count = 0;
+        let mut line_end = self.line_end(range.start());
+
+        loop {
+            let next_line_start = self.full_line_end(line_end);
+
+            // Reached the end of the string
+            if next_line_start == line_end {
+                break count;
+            }
+
+            // Range ends at the line boundary
+            if line_end >= range.end() {
+                break count;
+            }
+
+            count += 1;
+
+            line_end = self.line_end(next_line_start);
+        }
+    }
 }

 impl LineRanges for str {