Move Python whitespace utilities into new ruff_python_whitespace crate (#4993)

## Summary `ruff_newlines` becomes `ruff_python_whitespace`, and includes the existing "universal newline" handlers alongside the Python whitespace-specific utilities.
2025-07-24 05:25:17 +00:00 · 2023-06-09 20:59:57 -04:00 · 2023-06-09 20:59:57 -04:00 · 1d756dc3a7
commit 1d756dc3a7
parent e86f12a1ec
57 changed files with 153 additions and 140 deletions
--- a/crates/ruff_python_whitespace/Cargo.toml
+++ b/crates/ruff_python_whitespace/Cargo.toml
@ -0,0 +1,13 @@
+[package]
+name = "ruff_python_whitespace"
+version = "0.0.0"
+publish = false
+edition = { workspace = true }
+rust-version = { workspace = true }
+
+[lib]
+
+[dependencies]
+ruff_text_size = { workspace = true }
+
+memchr = { workspace = true }
--- a/crates/ruff_python_whitespace/src/lib.rs
+++ b/crates/ruff_python_whitespace/src/lib.rs
@ -0,0 +1,5 @@
+mod newlines;
+mod whitespace;
+
+pub use newlines::*;
+pub use whitespace::*;
--- a/crates/ruff_python_whitespace/src/newlines.rs
+++ b/crates/ruff_python_whitespace/src/newlines.rs
@ -0,0 +1,453 @@
+use std::iter::FusedIterator;
+use std::ops::Deref;
+
+use memchr::{memchr2, memrchr2};
+use ruff_text_size::{TextLen, TextRange, TextSize};
+
+/// Extension trait for [`str`] that provides a [`UniversalNewlineIterator`].
+pub trait UniversalNewlines {
+    fn universal_newlines(&self) -> UniversalNewlineIterator<'_>;
+}
+
+impl UniversalNewlines for str {
+    fn universal_newlines(&self) -> UniversalNewlineIterator<'_> {
+        UniversalNewlineIterator::from(self)
+    }
+}
+
+/// Like [`str#lines`], but accommodates LF, CRLF, and CR line endings,
+/// the latter of which are not supported by [`str#lines`].
+///
+/// ## Examples
+///
+/// ```rust
+/// # use ruff_text_size::TextSize;
+/// # use ruff_python_whitespace::{Line, UniversalNewlineIterator};
+/// let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");
+///
+/// assert_eq!(lines.next_back(), Some(Line::new("bop", TextSize::from(14))));
+/// assert_eq!(lines.next(), Some(Line::new("foo\n", TextSize::from(0))));
+/// assert_eq!(lines.next_back(), Some(Line::new("baz\r", TextSize::from(10))));
+/// assert_eq!(lines.next(), Some(Line::new("bar\n", TextSize::from(4))));
+/// assert_eq!(lines.next_back(), Some(Line::new("\r\n", TextSize::from(8))));
+/// assert_eq!(lines.next(), None);
+/// ```
+pub struct UniversalNewlineIterator<'a> {
+    text: &'a str,
+    offset: TextSize,
+    offset_back: TextSize,
+}
+
+impl<'a> UniversalNewlineIterator<'a> {
+    pub fn with_offset(text: &'a str, offset: TextSize) -> UniversalNewlineIterator<'a> {
+        UniversalNewlineIterator {
+            text,
+            offset,
+            offset_back: offset + text.text_len(),
+        }
+    }
+
+    pub fn from(text: &'a str) -> UniversalNewlineIterator<'a> {
+        Self::with_offset(text, TextSize::default())
+    }
+}
+
+/// Finds the next newline character. Returns its position and the [`LineEnding`].
+#[inline]
+pub fn find_newline(text: &str) -> Option<(usize, LineEnding)> {
+    let bytes = text.as_bytes();
+    if let Some(position) = memchr2(b'\n', b'\r', bytes) {
+        // SAFETY: memchr guarantees to return valid positions
+        #[allow(unsafe_code)]
+        let newline_character = unsafe { *bytes.get_unchecked(position) };
+
+        let line_ending = match newline_character {
+            // Explicit branch for `\n` as this is the most likely path
+            b'\n' => LineEnding::Lf,
+            // '\r\n'
+            b'\r' if bytes.get(position.saturating_add(1)) == Some(&b'\n') => LineEnding::CrLf,
+            // '\r'
+            _ => LineEnding::Cr,
+        };
+
+        Some((position, line_ending))
+    } else {
+        None
+    }
+}
+
+impl<'a> Iterator for UniversalNewlineIterator<'a> {
+    type Item = Line<'a>;
+
+    #[inline]
+    fn next(&mut self) -> Option<Line<'a>> {
+        if self.text.is_empty() {
+            return None;
+        }
+
+        let line = if let Some((newline_position, line_ending)) = find_newline(self.text) {
+            let (text, remainder) = self.text.split_at(newline_position + line_ending.len());
+
+            let line = Line {
+                offset: self.offset,
+                text,
+            };
+
+            self.text = remainder;
+            self.offset += text.text_len();
+
+            line
+        }
+        // Last line
+        else {
+            Line {
+                offset: self.offset,
+                text: std::mem::take(&mut self.text),
+            }
+        };
+
+        Some(line)
+    }
+
+    fn last(mut self) -> Option<Self::Item> {
+        self.next_back()
+    }
+}
+
+impl DoubleEndedIterator for UniversalNewlineIterator<'_> {
+    #[inline]
+    fn next_back(&mut self) -> Option<Self::Item> {
+        if self.text.is_empty() {
+            return None;
+        }
+
+        let len = self.text.len();
+
+        // Trim any trailing newlines.
+        let haystack = match self.text.as_bytes()[len - 1] {
+            b'\n' if len > 1 && self.text.as_bytes()[len - 2] == b'\r' => &self.text[..len - 2],
+            b'\n' | b'\r' => &self.text[..len - 1],
+            _ => self.text,
+        };
+
+        // Find the end of the previous line. The previous line is the text up to, but not including
+        // the newline character.
+        let line = if let Some(line_end) = memrchr2(b'\n', b'\r', haystack.as_bytes()) {
+            // '\n' or '\r' or '\r\n'
+            let (remainder, line) = self.text.split_at(line_end + 1);
+            self.text = remainder;
+            self.offset_back -= line.text_len();
+
+            Line {
+                text: line,
+                offset: self.offset_back,
+            }
+        } else {
+            // Last line
+            let offset = self.offset_back - self.text.text_len();
+            Line {
+                text: std::mem::take(&mut self.text),
+                offset,
+            }
+        };
+
+        Some(line)
+    }
+}
+
+impl FusedIterator for UniversalNewlineIterator<'_> {}
+
+/// Like [`UniversalNewlineIterator`], but includes a trailing newline as an empty line.
+pub struct NewlineWithTrailingNewline<'a> {
+    trailing: Option<Line<'a>>,
+    underlying: UniversalNewlineIterator<'a>,
+}
+
+impl<'a> NewlineWithTrailingNewline<'a> {
+    pub fn from(input: &'a str) -> NewlineWithTrailingNewline<'a> {
+        Self::with_offset(input, TextSize::default())
+    }
+
+    pub fn with_offset(input: &'a str, offset: TextSize) -> Self {
+        NewlineWithTrailingNewline {
+            underlying: UniversalNewlineIterator::with_offset(input, offset),
+            trailing: if input.ends_with(['\r', '\n']) {
+                Some(Line {
+                    text: "",
+                    offset: offset + input.text_len(),
+                })
+            } else {
+                None
+            },
+        }
+    }
+}
+
+impl<'a> Iterator for NewlineWithTrailingNewline<'a> {
+    type Item = Line<'a>;
+
+    #[inline]
+    fn next(&mut self) -> Option<Line<'a>> {
+        self.underlying.next().or_else(|| self.trailing.take())
+    }
+}
+
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct Line<'a> {
+    text: &'a str,
+    offset: TextSize,
+}
+
+impl<'a> Line<'a> {
+    pub fn new(text: &'a str, offset: TextSize) -> Self {
+        Self { text, offset }
+    }
+
+    #[inline]
+    pub const fn start(&self) -> TextSize {
+        self.offset
+    }
+
+    /// Returns the byte offset where the line ends, including its terminating new line character.
+    #[inline]
+    pub fn full_end(&self) -> TextSize {
+        self.offset + self.full_text_len()
+    }
+
+    /// Returns the byte offset where the line ends, excluding its new line character
+    #[inline]
+    pub fn end(&self) -> TextSize {
+        self.offset + self.as_str().text_len()
+    }
+
+    /// Returns the range of the line, including its terminating new line character.
+    #[inline]
+    pub fn full_range(&self) -> TextRange {
+        TextRange::at(self.offset, self.text.text_len())
+    }
+
+    /// Returns the range of the line, excluding its terminating new line character
+    #[inline]
+    pub fn range(&self) -> TextRange {
+        TextRange::new(self.start(), self.end())
+    }
+
+    /// Returns the line's new line character, if any.
+    #[inline]
+    pub fn line_ending(&self) -> Option<LineEnding> {
+        let mut bytes = self.text.bytes().rev();
+        match bytes.next() {
+            Some(b'\n') => {
+                if bytes.next() == Some(b'\r') {
+                    Some(LineEnding::CrLf)
+                } else {
+                    Some(LineEnding::Lf)
+                }
+            }
+            Some(b'\r') => Some(LineEnding::Cr),
+            _ => None,
+        }
+    }
+
+    /// Returns the text of the line, excluding the terminating new line character.
+    #[inline]
+    pub fn as_str(&self) -> &'a str {
+        let newline_len = self
+            .line_ending()
+            .map_or(0, |line_ending| line_ending.len());
+        &self.text[..self.text.len() - newline_len]
+    }
+
+    /// Returns the line's text, including the terminating new line character.
+    #[inline]
+    pub fn as_full_str(&self) -> &'a str {
+        self.text
+    }
+
+    #[inline]
+    pub fn full_text_len(&self) -> TextSize {
+        self.text.text_len()
+    }
+}
+
+impl Deref for Line<'_> {
+    type Target = str;
+
+    fn deref(&self) -> &Self::Target {
+        self.as_str()
+    }
+}
+
+impl PartialEq<&str> for Line<'_> {
+    fn eq(&self, other: &&str) -> bool {
+        self.as_str() == *other
+    }
+}
+
+impl PartialEq<Line<'_>> for &str {
+    fn eq(&self, other: &Line<'_>) -> bool {
+        *self == other.as_str()
+    }
+}
+
+/// The line ending style used in Python source code.
+/// See <https://docs.python.org/3/reference/lexical_analysis.html#physical-lines>
+#[derive(Debug, PartialEq, Eq, Copy, Clone)]
+pub enum LineEnding {
+    Lf,
+    Cr,
+    CrLf,
+}
+
+impl Default for LineEnding {
+    fn default() -> Self {
+        if cfg!(windows) {
+            LineEnding::CrLf
+        } else {
+            LineEnding::Lf
+        }
+    }
+}
+
+impl LineEnding {
+    pub const fn as_str(&self) -> &'static str {
+        match self {
+            LineEnding::Lf => "\n",
+            LineEnding::CrLf => "\r\n",
+            LineEnding::Cr => "\r",
+        }
+    }
+
+    #[allow(clippy::len_without_is_empty)]
+    pub const fn len(&self) -> usize {
+        match self {
+            LineEnding::Lf | LineEnding::Cr => 1,
+            LineEnding::CrLf => 2,
+        }
+    }
+
+    pub const fn text_len(&self) -> TextSize {
+        match self {
+            LineEnding::Lf | LineEnding::Cr => TextSize::new(1),
+            LineEnding::CrLf => TextSize::new(2),
+        }
+    }
+}
+
+impl Deref for LineEnding {
+    type Target = str;
+
+    fn deref(&self) -> &Self::Target {
+        self.as_str()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use ruff_text_size::TextSize;
+
+    use super::{Line, UniversalNewlineIterator};
+
+    #[test]
+    fn universal_newlines_empty_str() {
+        let lines: Vec<_> = UniversalNewlineIterator::from("").collect();
+        assert_eq!(lines, Vec::<Line>::new());
+
+        let lines: Vec<_> = UniversalNewlineIterator::from("").rev().collect();
+        assert_eq!(lines, Vec::<Line>::new());
+    }
+
+    #[test]
+    fn universal_newlines_forward() {
+        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop").collect();
+        assert_eq!(
+            lines,
+            vec![
+                Line::new("foo\n", TextSize::from(0)),
+                Line::new("bar\n", TextSize::from(4)),
+                Line::new("\r\n", TextSize::from(8)),
+                Line::new("baz\r", TextSize::from(10)),
+                Line::new("bop", TextSize::from(14)),
+            ]
+        );
+
+        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop\n").collect();
+        assert_eq!(
+            lines,
+            vec![
+                Line::new("foo\n", TextSize::from(0)),
+                Line::new("bar\n", TextSize::from(4)),
+                Line::new("\r\n", TextSize::from(8)),
+                Line::new("baz\r", TextSize::from(10)),
+                Line::new("bop\n", TextSize::from(14)),
+            ]
+        );
+
+        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop\n\n").collect();
+        assert_eq!(
+            lines,
+            vec![
+                Line::new("foo\n", TextSize::from(0)),
+                Line::new("bar\n", TextSize::from(4)),
+                Line::new("\r\n", TextSize::from(8)),
+                Line::new("baz\r", TextSize::from(10)),
+                Line::new("bop\n", TextSize::from(14)),
+                Line::new("\n", TextSize::from(18)),
+            ]
+        );
+    }
+
+    #[test]
+    fn universal_newlines_backwards() {
+        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop")
+            .rev()
+            .collect();
+        assert_eq!(
+            lines,
+            vec![
+                Line::new("bop", TextSize::from(14)),
+                Line::new("baz\r", TextSize::from(10)),
+                Line::new("\r\n", TextSize::from(8)),
+                Line::new("bar\n", TextSize::from(4)),
+                Line::new("foo\n", TextSize::from(0)),
+            ]
+        );
+
+        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\nbaz\rbop\n")
+            .rev()
+            .map(|line| line.as_str())
+            .collect();
+
+        assert_eq!(
+            lines,
+            vec![
+                Line::new("bop\n", TextSize::from(13)),
+                Line::new("baz\r", TextSize::from(9)),
+                Line::new("\n", TextSize::from(8)),
+                Line::new("bar\n", TextSize::from(4)),
+                Line::new("foo\n", TextSize::from(0)),
+            ]
+        );
+    }
+
+    #[test]
+    fn universal_newlines_mixed() {
+        let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");
+
+        assert_eq!(
+            lines.next_back(),
+            Some(Line::new("bop", TextSize::from(14)))
+        );
+        assert_eq!(lines.next(), Some(Line::new("foo\n", TextSize::from(0))));
+        assert_eq!(
+            lines.next_back(),
+            Some(Line::new("baz\r", TextSize::from(10)))
+        );
+        assert_eq!(lines.next(), Some(Line::new("bar\n", TextSize::from(4))));
+        assert_eq!(
+            lines.next_back(),
+            Some(Line::new("\r\n", TextSize::from(8)))
+        );
+        assert_eq!(lines.next(), None);
+    }
+}
--- a/crates/ruff_python_whitespace/src/whitespace.rs
+++ b/crates/ruff_python_whitespace/src/whitespace.rs
@ -0,0 +1,15 @@
+/// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
+/// characters.
+pub const fn is_python_whitespace(c: char) -> bool {
+    matches!(
+        c,
+        // Space, tab, or form-feed
+        ' ' | '\t' | '\x0C'
+    )
+}
+
+/// Extract the leading indentation from a line.
+pub fn leading_indentation(line: &str) -> &str {
+    line.find(|char: char| !is_python_whitespace(char))
+        .map_or(line, |index| &line[..index])
+}