[flake8-simplify] Fix incorrect fix for positive maxsplit without separator (SIM905) (#20056)

## Summary Resolves #20033 ## Test Plan unit tests added to the new split function, existing snapshot test updated. Co-authored-by: Brent Westbrook <brentrwestbrook@gmail.com>
2025-09-29 21:34:57 +00:00 · 2025-09-18 22:56:34 +02:00 · 2025-09-18 22:56:34 +02:00 · bc89d0394c
commit bc89d0394c
parent 706be0a6e7
4 changed files with 141 additions and 51 deletions
--- a/crates/ruff_linter/resources/test/fixtures/flake8_simplify/SIM905.py
+++ b/crates/ruff_linter/resources/test/fixtures/flake8_simplify/SIM905.py
@ -170,3 +170,4 @@ print("<\x1c\x1d\x1e\x1f".rsplit(maxsplit=0))
 # leading/trailing whitespace should not count towards maxsplit
 " a b c d ".split(maxsplit=2)  # ["a", "b", "c d "]
 " a b c d ".rsplit(maxsplit=2)  # [" a b", "c", "d"]
+"a  b".split(maxsplit=1)  # ["a", "b"]
--- a/crates/ruff_linter/src/rules/flake8_simplify/rules/split_static_string.rs
+++ b/crates/ruff_linter/src/rules/flake8_simplify/rules/split_static_string.rs
@ -58,8 +58,8 @@ enum Method {
 }

 impl Method {
-    fn is_split(self) -> bool {
-        matches!(self, Method::Split)
+    fn is_rsplit(self) -> bool {
+        matches!(self, Method::RSplit)
    }
 }

@ -201,65 +201,19 @@ fn split_default(
    method: Method,
    settings: &LinterSettings,
 ) -> Option<Expr> {
-    // From the Python documentation:
-    // > If sep is not specified or is None, a different splitting algorithm is applied: runs of
-    // > consecutive whitespace are regarded as a single separator, and the result will contain
-    // > no empty strings at the start or end if the string has leading or trailing whitespace.
-    // > Consequently, splitting an empty string or a string consisting of just whitespace with
-    // > a None separator returns [].
-    // https://docs.python.org/3/library/stdtypes.html#str.split
    let string_val = str_value.to_str();
    match max_split.cmp(&0) {
-        Ordering::Greater => {
-            if !is_maxsplit_without_separator_fix_enabled(settings) {
-                return None;
-            }
+        Ordering::Greater if !is_maxsplit_without_separator_fix_enabled(settings) => None,
+        Ordering::Greater | Ordering::Equal => {
            let Ok(max_split) = usize::try_from(max_split) else {
                return None;
            };
-            let list_items: Vec<&str> = if method.is_split() {
-                string_val
-                    .trim_start_matches(py_unicode_is_whitespace)
-                    .splitn(max_split + 1, py_unicode_is_whitespace)
-                    .filter(|s| !s.is_empty())
-                    .collect()
-            } else {
-                let mut items: Vec<&str> = string_val
-                    .trim_end_matches(py_unicode_is_whitespace)
-                    .rsplitn(max_split + 1, py_unicode_is_whitespace)
-                    .filter(|s| !s.is_empty())
-                    .collect();
-                items.reverse();
-                items
-            };
+            let list_items = split_whitespace_with_maxsplit(string_val, max_split, method);
            Some(construct_replacement(
                &list_items,
                str_value.first_literal_flags(),
            ))
        }
-        Ordering::Equal => {
-            // Behavior for maxsplit = 0 when sep is None:
-            // - If the string is empty or all whitespace, result is [].
-            // - Otherwise:
-            //   - " x ".split(maxsplit=0)  -> ['x ']
-            //   - " x ".rsplit(maxsplit=0) -> [' x']
-            //   - "".split(maxsplit=0) -> []
-            //   - " ".split(maxsplit=0) -> []
-            let processed_str = if method.is_split() {
-                string_val.trim_start_matches(py_unicode_is_whitespace)
-            } else {
-                string_val.trim_end_matches(py_unicode_is_whitespace)
-            };
-            let list_items: &[_] = if processed_str.is_empty() {
-                &[]
-            } else {
-                &[processed_str]
-            };
-            Some(construct_replacement(
-                list_items,
-                str_value.first_literal_flags(),
-            ))
-        }
        Ordering::Less => {
            let list_items: Vec<&str> = string_val
                .split(py_unicode_is_whitespace)
@ -367,3 +321,107 @@ const fn py_unicode_is_whitespace(ch: char) -> bool {
        | '\u{3000}'
    )
 }
+
+struct WhitespaceMaxSplitIterator<'a> {
+    remaining: &'a str,
+    max_split: usize,
+    splits: usize,
+    method: Method,
+}
+
+impl<'a> WhitespaceMaxSplitIterator<'a> {
+    fn new(s: &'a str, max_split: usize, method: Method) -> Self {
+        let remaining = match method {
+            Method::Split => s.trim_start_matches(py_unicode_is_whitespace),
+            Method::RSplit => s.trim_end_matches(py_unicode_is_whitespace),
+        };
+
+        Self {
+            remaining,
+            max_split,
+            splits: 0,
+            method,
+        }
+    }
+}
+
+impl<'a> Iterator for WhitespaceMaxSplitIterator<'a> {
+    type Item = &'a str;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.remaining.is_empty() {
+            return None;
+        }
+
+        if self.splits >= self.max_split {
+            let result = self.remaining;
+            self.remaining = "";
+            return Some(result);
+        }
+
+        self.splits += 1;
+        match self.method {
+            Method::Split => match self.remaining.split_once(py_unicode_is_whitespace) {
+                Some((s, remaining)) => {
+                    self.remaining = remaining.trim_start_matches(py_unicode_is_whitespace);
+                    Some(s)
+                }
+                None => Some(std::mem::take(&mut self.remaining)),
+            },
+            Method::RSplit => match self.remaining.rsplit_once(py_unicode_is_whitespace) {
+                Some((remaining, s)) => {
+                    self.remaining = remaining.trim_end_matches(py_unicode_is_whitespace);
+                    Some(s)
+                }
+                None => Some(std::mem::take(&mut self.remaining)),
+            },
+        }
+    }
+}
+
+// From the Python documentation:
+// > If sep is not specified or is None, a different splitting algorithm is applied: runs of
+// > consecutive whitespace are regarded as a single separator, and the result will contain
+// > no empty strings at the start or end if the string has leading or trailing whitespace.
+// > Consequently, splitting an empty string or a string consisting of just whitespace with
+// > a None separator returns [].
+// https://docs.python.org/3/library/stdtypes.html#str.split
+fn split_whitespace_with_maxsplit(s: &str, max_split: usize, method: Method) -> Vec<&str> {
+    let mut result: Vec<_> = WhitespaceMaxSplitIterator::new(s, max_split, method).collect();
+    if method.is_rsplit() {
+        result.reverse();
+    }
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{Method, split_whitespace_with_maxsplit};
+    use test_case::test_case;
+
+    #[test_case("  ", 1, &[])]
+    #[test_case("a  b", 1, &["a", "b"])]
+    #[test_case("a  b", 2, &["a", "b"])]
+    #[test_case(" a b c d ", 2, &["a", "b", "c d "])]
+    #[test_case("  a  b  c  ", 1, &["a", "b  c  "])]
+    #[test_case(" x ", 0, &["x "])]
+    #[test_case(" ", 0, &[])]
+    #[test_case("a\u{3000}b", 1, &["a", "b"])]
+    fn test_split_whitespace_with_maxsplit(s: &str, max_split: usize, expected: &[&str]) {
+        let parts = split_whitespace_with_maxsplit(s, max_split, Method::Split);
+        assert_eq!(parts, expected);
+    }
+
+    #[test_case("  ", 1, &[])]
+    #[test_case("a  b", 1, &["a", "b"])]
+    #[test_case("a  b", 2, &["a", "b"])]
+    #[test_case(" a b c d ", 2, &[" a b", "c", "d"])]
+    #[test_case("  a  b  c  ", 1, &["  a  b", "c"])]
+    #[test_case(" x ", 0, &[" x"])]
+    #[test_case(" ", 0, &[])]
+    #[test_case("a\u{3000}b", 1, &["a", "b"])]
+    fn test_rsplit_whitespace_with_maxsplit(s: &str, max_split: usize, expected: &[&str]) {
+        let parts = split_whitespace_with_maxsplit(s, max_split, Method::RSplit);
+        assert_eq!(parts, expected);
+    }
+}
--- a/crates/ruff_linter/src/rules/flake8_simplify/snapshots/ruff_linterrulesflake8_simplifytestsSIM905_SIM905.py.snap
+++ b/crates/ruff_linter/src/rules/flake8_simplify/snapshots/ruff_linterrulesflake8_simplifytestsSIM905_SIM905.py.snap
@ -1372,6 +1372,7 @@ SIM905 Consider using a list literal instead of `str.split`
 171 | " a b c d ".split(maxsplit=2)  # ["a", "b", "c d "]
    | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 172 | " a b c d ".rsplit(maxsplit=2)  # [" a b", "c", "d"]
+173 | "a  b".split(maxsplit=1)  # ["a", "b"]
    |
 help: Replace with list literal

@ -1382,5 +1383,16 @@ SIM905 Consider using a list literal instead of `str.rsplit`
 171 | " a b c d ".split(maxsplit=2)  # ["a", "b", "c d "]
 172 | " a b c d ".rsplit(maxsplit=2)  # [" a b", "c", "d"]
    | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+173 | "a  b".split(maxsplit=1)  # ["a", "b"]
+    |
+help: Replace with list literal
+
+SIM905 Consider using a list literal instead of `str.split`
+   --> SIM905.py:173:1
+    |
+171 | " a b c d ".split(maxsplit=2)  # ["a", "b", "c d "]
+172 | " a b c d ".rsplit(maxsplit=2)  # [" a b", "c", "d"]
+173 | "a  b".split(maxsplit=1)  # ["a", "b"]
+    | ^^^^^^^^^^^^^^^^^^^^^^^^
    |
 help: Replace with list literal
--- a/crates/ruff_linter/src/rules/flake8_simplify/snapshots/ruff_linterrulesflake8_simplifytestspreview__SIM905_SIM905.py.snap
+++ b/crates/ruff_linter/src/rules/flake8_simplify/snapshots/ruff_linterrulesflake8_simplifytestspreview__SIM905_SIM905.py.snap
@ -1420,6 +1420,7 @@ SIM905 [*] Consider using a list literal instead of `str.split`
 171 | " a b c d ".split(maxsplit=2)  # ["a", "b", "c d "]
    | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 172 | " a b c d ".rsplit(maxsplit=2)  # [" a b", "c", "d"]
+173 | "a  b".split(maxsplit=1)  # ["a", "b"]
    |
 help: Replace with list literal
 168 | print("<\x1c\x1d\x1e\x1f".rsplit(maxsplit=0))
@ -1428,6 +1429,7 @@ help: Replace with list literal
    - " a b c d ".split(maxsplit=2)  # ["a", "b", "c d "]
 171 + ["a", "b", "c d "]  # ["a", "b", "c d "]
 172 | " a b c d ".rsplit(maxsplit=2)  # [" a b", "c", "d"]
+173 | "a  b".split(maxsplit=1)  # ["a", "b"]

 SIM905 [*] Consider using a list literal instead of `str.rsplit`
   --> SIM905.py:172:1
@ -1436,6 +1438,7 @@ SIM905 [*] Consider using a list literal instead of `str.rsplit`
 171 | " a b c d ".split(maxsplit=2)  # ["a", "b", "c d "]
 172 | " a b c d ".rsplit(maxsplit=2)  # [" a b", "c", "d"]
    | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+173 | "a  b".split(maxsplit=1)  # ["a", "b"]
    |
 help: Replace with list literal
 169 | 
@ -1443,3 +1446,19 @@ help: Replace with list literal
 171 | " a b c d ".split(maxsplit=2)  # ["a", "b", "c d "]
    - " a b c d ".rsplit(maxsplit=2)  # [" a b", "c", "d"]
 172 + [" a b", "c", "d"]  # [" a b", "c", "d"]
+173 | "a  b".split(maxsplit=1)  # ["a", "b"]
+
+SIM905 [*] Consider using a list literal instead of `str.split`
+   --> SIM905.py:173:1
+    |
+171 | " a b c d ".split(maxsplit=2)  # ["a", "b", "c d "]
+172 | " a b c d ".rsplit(maxsplit=2)  # [" a b", "c", "d"]
+173 | "a  b".split(maxsplit=1)  # ["a", "b"]
+    | ^^^^^^^^^^^^^^^^^^^^^^^^
+    |
+help: Replace with list literal
+170 | # leading/trailing whitespace should not count towards maxsplit
+171 | " a b c d ".split(maxsplit=2)  # ["a", "b", "c d "]
+172 | " a b c d ".rsplit(maxsplit=2)  # [" a b", "c", "d"]
+    - "a  b".split(maxsplit=1)  # ["a", "b"]
+173 + ["a", "b"]  # ["a", "b"]