mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-19 01:51:30 +00:00
Implement an iterator for universal newlines (#3454)
# Summary

We need to support CR line endings (as opposed to LF and CRLF line endings, which are already supported). They're rare, but they do appear in Python code, and we tend to panic on any file that uses them.

Our `Locator` abstraction now supports CR line endings. However, Rust's `str::lines` implementation does _not_. This PR adds a `UniversalNewlineIterator` implementation that respects all of CR, LF, and CRLF line endings, and plugs it into most of the `.lines()` call sites.

As an alternative design, it could be nice if we could leverage `Locator` for this. We've already computed all of the line endings, so we could probably iterate much more efficiently.

# Test Plan

Largely relying on automated testing; however, I also ran this over some known failure cases, like #3404.
This commit is contained in:
parent
2a4d6ab3b2
commit
c2750a59ab
35 changed files with 325 additions and 126 deletions
|
@ -14,6 +14,7 @@ use rustpython_parser::{lexer, Mode, StringKind, Tok};
|
|||
use smallvec::{smallvec, SmallVec};
|
||||
|
||||
use crate::context::Context;
|
||||
use crate::newlines::StrExt;
|
||||
use crate::source_code::{Generator, Indexer, Locator, Stylist};
|
||||
use crate::types::{Binding, BindingKind, CallPath, Range};
|
||||
use crate::visitor;
|
||||
|
@ -1125,7 +1126,7 @@ pub fn end_of_statement(stmt: &Stmt, locator: &Locator) -> Location {
|
|||
}
|
||||
|
||||
// Otherwise, find the end of the last line that's "part of" the statement.
|
||||
for (lineno, line) in contents.lines().enumerate() {
|
||||
for (lineno, line) in contents.universal_newlines().enumerate() {
|
||||
if line.ends_with('\\') {
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -6,6 +6,7 @@ pub mod function_type;
|
|||
pub mod hashable;
|
||||
pub mod helpers;
|
||||
pub mod logging;
|
||||
pub mod newlines;
|
||||
pub mod operations;
|
||||
pub mod relocate;
|
||||
pub mod source_code;
|
||||
|
|
192
crates/ruff_python_ast/src/newlines.rs
Normal file
192
crates/ruff_python_ast/src/newlines.rs
Normal file
|
@ -0,0 +1,192 @@
|
|||
use std::iter::FusedIterator;
|
||||
|
||||
/// Extension trait for [`str`] that provides a [`UniversalNewlineIterator`].
pub trait StrExt {
    /// Return an iterator over the lines of `self`, treating LF (`\n`), CRLF (`\r\n`), and
    /// CR (`\r`) as line terminators.
    fn universal_newlines(&self) -> UniversalNewlineIterator<'_>;
}

impl StrExt for str {
    fn universal_newlines(&self) -> UniversalNewlineIterator<'_> {
        UniversalNewlineIterator::from(self)
    }
}

/// Like [`str::lines`], but accommodates LF, CRLF, and CR line endings,
/// the latter of which are not supported by [`str::lines`].
///
/// The yielded lines never include their terminator, and a single terminator at the very end
/// of the text does not produce an additional empty line (use [`NewlineWithTrailingNewline`]
/// when that empty line is wanted).
///
/// ## Examples
///
/// ```rust
/// use ruff_python_ast::newlines::UniversalNewlineIterator;
///
/// let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");
///
/// assert_eq!(lines.next_back(), Some("bop"));
/// assert_eq!(lines.next(), Some("foo"));
/// assert_eq!(lines.next_back(), Some("baz"));
/// assert_eq!(lines.next(), Some("bar"));
/// assert_eq!(lines.next_back(), Some(""));
/// assert_eq!(lines.next(), None);
/// ```
#[derive(Debug, Clone)]
pub struct UniversalNewlineIterator<'a> {
    /// The part of the input that has not been yielded yet. `next` consumes it from the front
    /// and `next_back` from the back.
    text: &'a str,
}

impl<'a> UniversalNewlineIterator<'a> {
    /// Create an iterator over the lines of `text`.
    pub fn from(text: &'a str) -> UniversalNewlineIterator<'a> {
        UniversalNewlineIterator { text }
    }
}

impl<'a> Iterator for UniversalNewlineIterator<'a> {
    type Item = &'a str;

    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        if self.text.is_empty() {
            return None;
        }

        let line = match self.text.find(['\n', '\r']) {
            // Non-last line
            Some(line_end) => {
                let (line, remainder) = self.text.split_at(line_end);

                // `remainder` starts with the terminator that `find` located, so it is
                // non-empty; skip one byte for `\n` / `\r` and two for `\r\n`.
                self.text = match remainder.as_bytes()[0] {
                    // Explicit branch for `\n` as this is the most likely path
                    b'\n' => &remainder[1..],
                    // '\r\n'
                    b'\r' if remainder.as_bytes().get(1) == Some(&b'\n') => &remainder[2..],
                    // '\r'
                    _ => &remainder[1..],
                };

                line
            }
            // Last line
            None => std::mem::take(&mut self.text),
        };

        Some(line)
    }

    /// The last line can be found directly from the back instead of walking every line.
    fn last(mut self) -> Option<Self::Item> {
        self.next_back()
    }
}

impl DoubleEndedIterator for UniversalNewlineIterator<'_> {
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.text.is_empty() {
            return None;
        }

        let len = self.text.len();

        // Trim the single terminator (if any) that ends the last line, so that it is not
        // mistaken for the separator in front of that line.
        self.text = match self.text.as_bytes()[len - 1] {
            b'\n' if len > 1 && self.text.as_bytes()[len - 2] == b'\r' => &self.text[..len - 2],
            b'\n' | b'\r' => &self.text[..len - 1],
            _ => self.text,
        };

        // Find the end of the previous line. The previous line is the text up to, but not
        // including the newline character.
        let line = match self.text.rfind(['\n', '\r']) {
            // '\n' or '\r' or '\r\n'
            Some(line_end) => {
                // The terminator stays at the end of `remainder`; the next call trims it.
                let (remainder, line) = self.text.split_at(line_end + 1);
                self.text = remainder;

                line
            }
            // Last line
            None => std::mem::take(&mut self.text),
        };

        Some(line)
    }
}

impl FusedIterator for UniversalNewlineIterator<'_> {}

/// Like [`UniversalNewlineIterator`], but includes a trailing newline as an empty line.
#[derive(Debug, Clone)]
pub struct NewlineWithTrailingNewline<'a> {
    /// The extra empty line to yield once `underlying` is exhausted, if the input ended with
    /// a line terminator.
    trailing: Option<&'a str>,
    underlying: UniversalNewlineIterator<'a>,
}

impl<'a> NewlineWithTrailingNewline<'a> {
    /// Create an iterator over the lines of `input` that yields one additional empty line
    /// when `input` ends with `\n` or `\r`.
    pub fn from(input: &'a str) -> NewlineWithTrailingNewline<'a> {
        NewlineWithTrailingNewline {
            underlying: UniversalNewlineIterator::from(input),
            trailing: if input.ends_with(['\r', '\n']) {
                Some("")
            } else {
                None
            },
        }
    }
}

impl<'a> Iterator for NewlineWithTrailingNewline<'a> {
    type Item = &'a str;

    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        self.underlying.next().or_else(|| self.trailing.take())
    }
}

// Once the underlying iterator is exhausted and the trailing line has been taken, `next`
// keeps returning `None`.
impl FusedIterator for NewlineWithTrailingNewline<'_> {}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::{NewlineWithTrailingNewline, UniversalNewlineIterator};

    /// Collect the lines of `text` from front to back.
    fn forward(text: &str) -> Vec<&str> {
        UniversalNewlineIterator::from(text).collect()
    }

    /// Collect the lines of `text` from back to front.
    fn backward(text: &str) -> Vec<&str> {
        UniversalNewlineIterator::from(text).rev().collect()
    }

    #[test]
    fn universal_newlines_empty_str() {
        assert_eq!(forward(""), Vec::<&str>::default());
        assert_eq!(backward(""), Vec::<&str>::default());
    }

    #[test]
    fn universal_newlines_forward() {
        assert_eq!(
            forward("foo\nbar\n\r\nbaz\rbop"),
            vec!["foo", "bar", "", "baz", "bop"]
        );

        // A single trailing terminator does not add an empty line...
        assert_eq!(
            forward("foo\nbar\n\r\nbaz\rbop\n"),
            vec!["foo", "bar", "", "baz", "bop"]
        );

        // ...but a second one does.
        assert_eq!(
            forward("foo\nbar\n\r\nbaz\rbop\n\n"),
            vec!["foo", "bar", "", "baz", "bop", ""]
        );
    }

    #[test]
    fn universal_newlines_backwards() {
        assert_eq!(
            backward("foo\nbar\n\r\nbaz\rbop"),
            vec!["bop", "baz", "", "bar", "foo"]
        );

        assert_eq!(
            backward("foo\nbar\n\nbaz\rbop\n"),
            vec!["bop", "baz", "", "bar", "foo"]
        );
    }

    #[test]
    fn universal_newlines_mixed() {
        let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");

        assert_eq!(lines.next_back(), Some("bop"));
        assert_eq!(lines.next(), Some("foo"));
        assert_eq!(lines.next_back(), Some("baz"));
        assert_eq!(lines.next(), Some("bar"));
        assert_eq!(lines.next_back(), Some(""));
        assert_eq!(lines.next(), None);
    }

    #[test]
    fn universal_newlines_trailing_carriage_return() {
        // A lone CR at the end of the text terminates the last line in both directions.
        assert_eq!(forward("foo\rbar\r"), vec!["foo", "bar"]);
        assert_eq!(backward("foo\rbar\r"), vec!["bar", "foo"]);
    }

    #[test]
    fn newline_with_trailing_newline() {
        let collect = |text| NewlineWithTrailingNewline::from(text).collect::<Vec<_>>();

        assert_eq!(collect(""), Vec::<&str>::default());
        assert_eq!(collect("foo"), vec!["foo"]);
        assert_eq!(collect("foo\nbar\n"), vec!["foo", "bar", ""]);
        assert_eq!(collect("foo\r"), vec!["foo", ""]);
        assert_eq!(collect("foo\r\n"), vec!["foo", ""]);
    }
}
|
|
@ -56,10 +56,18 @@ impl<'a> Locator<'a> {
|
|||
self.contents
|
||||
}
|
||||
|
||||
/// Return the number of lines in the source code.
|
||||
pub fn count_lines(&self) -> usize {
|
||||
let index = self.get_or_init_index();
|
||||
index.count_lines()
|
||||
}
|
||||
|
||||
/// Return the number of bytes in the source code.
///
/// This is the value of `self.contents.len()`, i.e. a byte length and not a line count
/// (see `count_lines` for the latter).
|
||||
pub const fn len(&self) -> usize {
|
||||
self.contents.len()
|
||||
}
|
||||
|
||||
/// Return `true` if the source code is empty.
///
/// Delegates to `self.contents.is_empty()`.
|
||||
pub const fn is_empty(&self) -> bool {
|
||||
self.contents.is_empty()
|
||||
}
|
||||
|
@ -83,6 +91,14 @@ impl Index {
|
|||
Index::Utf8(utf8) => utf8.byte_offset(location, contents),
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the number of lines in the source code.
|
||||
fn count_lines(&self) -> usize {
|
||||
match self {
|
||||
Index::Ascii(ascii) => ascii.line_start_byte_offsets.len(),
|
||||
Index::Utf8(utf8) => utf8.line_start_byte_offsets.len(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&str> for Index {
|
||||
|
|
|
@ -40,19 +40,18 @@ pub fn raw_contents(contents: &str) -> &str {
|
|||
|
||||
/// Return the leading quote for a string or byte literal (e.g., `"""`).
|
||||
pub fn leading_quote(content: &str) -> Option<&str> {
|
||||
if let Some(first_line) = content.lines().next() {
|
||||
for pattern in TRIPLE_QUOTE_STR_PREFIXES
|
||||
.iter()
|
||||
.chain(TRIPLE_QUOTE_BYTE_PREFIXES)
|
||||
.chain(SINGLE_QUOTE_STR_PREFIXES)
|
||||
.chain(SINGLE_QUOTE_BYTE_PREFIXES)
|
||||
{
|
||||
if first_line.starts_with(pattern) {
|
||||
return Some(pattern);
|
||||
TRIPLE_QUOTE_STR_PREFIXES
|
||||
.iter()
|
||||
.chain(TRIPLE_QUOTE_BYTE_PREFIXES)
|
||||
.chain(SINGLE_QUOTE_STR_PREFIXES)
|
||||
.chain(SINGLE_QUOTE_BYTE_PREFIXES)
|
||||
.find_map(|pattern| {
|
||||
if content.starts_with(pattern) {
|
||||
Some(*pattern)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
}
|
||||
|
||||
/// Return the trailing quote string for a string or byte literal (e.g., `"""`).
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
use std::str::Lines;
|
||||
|
||||
use rustpython_parser::ast::{Located, Location};
|
||||
|
||||
use crate::source_code::Locator;
|
||||
|
@ -39,38 +37,3 @@ pub fn clean(indentation: &str) -> String {
|
|||
.map(|char| if char.is_whitespace() { char } else { ' ' })
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Line iterator built on [`str::lines`] that additionally yields one empty line at the end
/// when the input is terminated by `\n`.
pub struct LinesWithTrailingNewline<'a> {
    /// The pending extra empty line; `None` once yielded or when the input has no final `\n`.
    trailing: Option<&'a str>,
    underlying: Lines<'a>,
}

impl<'a> LinesWithTrailingNewline<'a> {
    /// Create the iterator for `input`.
    pub fn from(input: &'a str) -> LinesWithTrailingNewline<'a> {
        let trailing = input.ends_with('\n').then_some("");
        let underlying = input.lines();
        Self {
            trailing,
            underlying,
        }
    }
}

impl<'a> Iterator for LinesWithTrailingNewline<'a> {
    type Item = &'a str;

    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        // Regular lines first; afterwards hand out the trailing empty line exactly once.
        self.underlying.next().or_else(|| self.trailing.take())
    }
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue