Remove parser dependency from ruff-python-ast (#6096)

This commit is contained in:
Micha Reiser 2023-07-26 17:47:22 +02:00 committed by GitHub
parent 99127243f4
commit 2cf00fee96
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
658 changed files with 1714 additions and 1546 deletions

View file

@ -1,9 +1,8 @@
mod cursor;
mod newlines;
pub mod textwrap;
mod tokenizer;
mod whitespace;
pub use cursor::*;
pub use newlines::*;
pub use tokenizer::*;
pub use whitespace::*;

View file

@ -1,453 +0,0 @@
use std::iter::FusedIterator;
use std::ops::Deref;
use memchr::{memchr2, memrchr2};
use ruff_text_size::{TextLen, TextRange, TextSize};
/// Extension trait for [`str`] that provides a [`UniversalNewlineIterator`].
pub trait UniversalNewlines {
    /// Returns a [`UniversalNewlineIterator`] over the lines of `self`.
    fn universal_newlines(&self) -> UniversalNewlineIterator<'_>;
}
impl UniversalNewlines for str {
fn universal_newlines(&self) -> UniversalNewlineIterator<'_> {
UniversalNewlineIterator::from(self)
}
}
/// Like [`str#lines`], but accommodates LF, CRLF, and CR line endings,
/// the latter of which are not supported by [`str#lines`].
///
/// Each yielded [`Line`] includes its line terminator (if any). The iterator is double-ended,
/// so lines can be taken from the front and the back in any order.
///
/// ## Examples
///
/// ```rust
/// # use ruff_text_size::TextSize;
/// # use ruff_python_trivia::{Line, UniversalNewlineIterator};
/// let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");
///
/// assert_eq!(lines.next_back(), Some(Line::new("bop", TextSize::from(14))));
/// assert_eq!(lines.next(), Some(Line::new("foo\n", TextSize::from(0))));
/// assert_eq!(lines.next_back(), Some(Line::new("baz\r", TextSize::from(10))));
/// assert_eq!(lines.next(), Some(Line::new("bar\n", TextSize::from(4))));
/// assert_eq!(lines.next_back(), Some(Line::new("\r\n", TextSize::from(8))));
/// assert_eq!(lines.next(), None);
/// ```
pub struct UniversalNewlineIterator<'a> {
    /// The text that has not been yielded yet; it shrinks from both ends.
    text: &'a str,
    /// Offset reported for the next line yielded from the front.
    offset: TextSize,
    /// Offset just past the remaining text; lowered by the length of each line yielded from
    /// the back.
    offset_back: TextSize,
}
impl<'a> UniversalNewlineIterator<'a> {
    /// Creates an iterator over the lines of `text` whose reported line offsets are shifted
    /// by `offset`, i.e. the first line starts at `offset`.
    pub fn with_offset(text: &'a str, offset: TextSize) -> UniversalNewlineIterator<'a> {
        let offset_back = offset + text.text_len();
        Self {
            text,
            offset,
            offset_back,
        }
    }

    /// Creates an iterator over the lines of `text`, using the default [`TextSize`] as the
    /// offset of the first line.
    pub fn from(text: &'a str) -> UniversalNewlineIterator<'a> {
        Self::with_offset(text, TextSize::default())
    }
}
/// Finds the first newline character in `text`.
///
/// Returns the byte position of the newline together with its [`LineEnding`], or `None` when
/// `text` contains neither `\n` nor `\r`. For a `\r\n` sequence the position is that of the
/// `\r`.
#[inline]
pub fn find_newline(text: &str) -> Option<(usize, LineEnding)> {
    let bytes = text.as_bytes();
    let position = memchr2(b'\n', b'\r', bytes)?;

    // SAFETY: memchr guarantees to return valid positions
    #[allow(unsafe_code)]
    let newline_character = unsafe { *bytes.get_unchecked(position) };

    let line_ending = if newline_character == b'\n' {
        // Tested first because `\n` is the most likely line ending.
        LineEnding::Lf
    } else if newline_character == b'\r'
        && bytes.get(position.saturating_add(1)) == Some(&b'\n')
    {
        // `\r\n`
        LineEnding::CrLf
    } else {
        // A lone `\r`
        LineEnding::Cr
    };

    Some((position, line_ending))
}
impl<'a> Iterator for UniversalNewlineIterator<'a> {
    type Item = Line<'a>;

    /// Yields the next line from the front, including its line terminator (if any).
    #[inline]
    fn next(&mut self) -> Option<Line<'a>> {
        if self.text.is_empty() {
            return None;
        }

        // The yielded line always starts where the remaining text starts.
        let start = self.offset;

        let text = match find_newline(self.text) {
            Some((newline_position, line_ending)) => {
                let (current, rest) = self.text.split_at(newline_position + line_ending.len());
                self.text = rest;
                self.offset += current.text_len();
                current
            }
            // No newline left: whatever remains is the final line.
            None => std::mem::take(&mut self.text),
        };

        Some(Line {
            text,
            offset: start,
        })
    }

    /// Returns the last line by taking it from the back instead of walking every line.
    fn last(mut self) -> Option<Self::Item> {
        self.next_back()
    }
}
impl DoubleEndedIterator for UniversalNewlineIterator<'_> {
    /// Yields the next line from the back, including its line terminator (if any).
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.text.is_empty() {
            return None;
        }
        let len = self.text.len();
        // Exclude the terminator that ends the remaining text (if any) from the search below:
        // it belongs to the line being yielded and must not be mistaken for the terminator
        // of the line before it.
        let haystack = match self.text.as_bytes()[len - 1] {
            b'\n' if len > 1 && self.text.as_bytes()[len - 2] == b'\r' => &self.text[..len - 2],
            b'\n' | b'\r' => &self.text[..len - 1],
            _ => self.text,
        };
        // Search backwards for the terminator of the preceding line. The line to yield
        // starts right after it.
        let line = if let Some(line_end) = memrchr2(b'\n', b'\r', haystack.as_bytes()) {
            // `line_end` is the last byte of the preceding line's terminator
            // ('\n', '\r', or the '\n' of "\r\n").
            let (remainder, line) = self.text.split_at(line_end + 1);
            self.text = remainder;
            self.offset_back -= line.text_len();
            Line {
                text: line,
                offset: self.offset_back,
            }
        } else {
            // No earlier terminator: everything that remains is a single line, and the
            // iterator is exhausted afterwards.
            let offset = self.offset_back - self.text.text_len();
            Line {
                text: std::mem::take(&mut self.text),
                offset,
            }
        };
        Some(line)
    }
}
// Once the remaining text is empty, both `next` and `next_back` keep returning `None`.
impl FusedIterator for UniversalNewlineIterator<'_> {}
/// Like [`UniversalNewlineIterator`], but includes a trailing newline as an empty line.
pub struct NewlineWithTrailingNewline<'a> {
    /// The extra empty line to yield after `underlying` is exhausted; `None` when the input
    /// does not end in a newline character or once the line has been yielded.
    trailing: Option<Line<'a>>,
    /// Iterator producing the regular lines of the input.
    underlying: UniversalNewlineIterator<'a>,
}
impl<'a> NewlineWithTrailingNewline<'a> {
    /// Creates an iterator over the lines of `input`, using the default [`TextSize`] as the
    /// offset of the first line.
    pub fn from(input: &'a str) -> NewlineWithTrailingNewline<'a> {
        Self::with_offset(input, TextSize::default())
    }

    /// Creates an iterator over the lines of `input` whose reported offsets are shifted by
    /// `offset`.
    ///
    /// When `input` ends with `\r` or `\n`, an additional empty line positioned at the very
    /// end of the input is yielded last.
    pub fn with_offset(input: &'a str, offset: TextSize) -> Self {
        let trailing = input.ends_with(['\r', '\n']).then(|| Line {
            text: "",
            offset: offset + input.text_len(),
        });
        let underlying = UniversalNewlineIterator::with_offset(input, offset);

        Self {
            trailing,
            underlying,
        }
    }
}
impl<'a> Iterator for NewlineWithTrailingNewline<'a> {
    type Item = Line<'a>;

    /// Yields the regular lines first, then the empty trailing line (at most once).
    #[inline]
    fn next(&mut self) -> Option<Line<'a>> {
        if let Some(line) = self.underlying.next() {
            return Some(line);
        }
        self.trailing.take()
    }
}
/// A single line of text together with the offset at which it starts.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Line<'a> {
    /// The full text of the line, including its line terminator (if any).
    text: &'a str,
    /// The offset at which the line starts.
    offset: TextSize,
}
impl<'a> Line<'a> {
pub fn new(text: &'a str, offset: TextSize) -> Self {
Self { text, offset }
}
#[inline]
pub const fn start(&self) -> TextSize {
self.offset
}
/// Returns the byte offset where the line ends, including its terminating new line character.
#[inline]
pub fn full_end(&self) -> TextSize {
self.offset + self.full_text_len()
}
/// Returns the byte offset where the line ends, excluding its new line character
#[inline]
pub fn end(&self) -> TextSize {
self.offset + self.as_str().text_len()
}
/// Returns the range of the line, including its terminating new line character.
#[inline]
pub fn full_range(&self) -> TextRange {
TextRange::at(self.offset, self.text.text_len())
}
/// Returns the range of the line, excluding its terminating new line character
#[inline]
pub fn range(&self) -> TextRange {
TextRange::new(self.start(), self.end())
}
/// Returns the line's new line character, if any.
#[inline]
pub fn line_ending(&self) -> Option<LineEnding> {
let mut bytes = self.text.bytes().rev();
match bytes.next() {
Some(b'\n') => {
if bytes.next() == Some(b'\r') {
Some(LineEnding::CrLf)
} else {
Some(LineEnding::Lf)
}
}
Some(b'\r') => Some(LineEnding::Cr),
_ => None,
}
}
/// Returns the text of the line, excluding the terminating new line character.
#[inline]
pub fn as_str(&self) -> &'a str {
let newline_len = self
.line_ending()
.map_or(0, |line_ending| line_ending.len());
&self.text[..self.text.len() - newline_len]
}
/// Returns the line's text, including the terminating new line character.
#[inline]
pub fn as_full_str(&self) -> &'a str {
self.text
}
#[inline]
pub fn full_text_len(&self) -> TextSize {
self.text.text_len()
}
}
impl Deref for Line<'_> {
    type Target = str;

    /// Dereferences to the line's content *without* its line terminator (see
    /// [`Line::as_str`]), so `str` methods called on a `Line` never see the newline.
    fn deref(&self) -> &Self::Target {
        self.as_str()
    }
}
impl PartialEq<&str> for Line<'_> {
    /// Compares the line's content, without its line terminator, against `other`. The
    /// line's offset takes no part in the comparison.
    fn eq(&self, other: &&str) -> bool {
        self.as_str() == *other
    }
}
impl PartialEq<Line<'_>> for &str {
    /// Compares `self` against the line's content without its line terminator. The line's
    /// offset takes no part in the comparison.
    fn eq(&self, other: &Line<'_>) -> bool {
        *self == other.as_str()
    }
}
/// The line ending style used in Python source code.
/// See <https://docs.python.org/3/reference/lexical_analysis.html#physical-lines>
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum LineEnding {
    /// Line feed: `\n`
    Lf,
    /// Carriage return: `\r`
    Cr,
    /// Carriage return followed by line feed: `\r\n`
    CrLf,
}
impl Default for LineEnding {
    /// Returns `CrLf` when compiled for Windows and `Lf` for every other target.
    fn default() -> Self {
        if cfg!(windows) {
            LineEnding::CrLf
        } else {
            LineEnding::Lf
        }
    }
}
impl LineEnding {
    /// Returns the characters that make up this line ending.
    pub const fn as_str(&self) -> &'static str {
        match *self {
            Self::Lf => "\n",
            Self::Cr => "\r",
            Self::CrLf => "\r\n",
        }
    }

    /// Returns the length of this line ending in bytes.
    // A line ending is one or two bytes long, never empty, so `is_empty` would be pointless.
    #[allow(clippy::len_without_is_empty)]
    pub const fn len(&self) -> usize {
        self.as_str().len()
    }

    /// Returns the length of this line ending as a [`TextSize`].
    pub const fn text_len(&self) -> TextSize {
        match *self {
            Self::CrLf => TextSize::new(2),
            Self::Lf | Self::Cr => TextSize::new(1),
        }
    }
}
impl Deref for LineEnding {
    type Target = str;

    /// Dereferences to the line ending's characters (see [`LineEnding::as_str`]), so a
    /// `&LineEnding` can be used wherever a `&str` is expected.
    fn deref(&self) -> &Self::Target {
        self.as_str()
    }
}
#[cfg(test)]
mod tests {
    use ruff_text_size::TextSize;

    use super::{Line, UniversalNewlineIterator};

    /// An empty input yields no lines in either direction.
    #[test]
    fn universal_newlines_empty_str() {
        let lines: Vec<_> = UniversalNewlineIterator::from("").collect();
        assert_eq!(lines, Vec::<Line>::new());

        let lines: Vec<_> = UniversalNewlineIterator::from("").rev().collect();
        assert_eq!(lines, Vec::<Line>::new());
    }

    /// Forward iteration over mixed LF / CRLF / CR input, with and without trailing newlines.
    #[test]
    fn universal_newlines_forward() {
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop", TextSize::from(14)),
            ]
        );

        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop\n").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop\n", TextSize::from(14)),
            ]
        );

        let lines: Vec<_> =
            UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop\n\n").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop\n", TextSize::from(14)),
                Line::new("\n", TextSize::from(18)),
            ]
        );
    }

    /// Backward iteration yields the same lines, with the same offsets, in reverse order.
    #[test]
    fn universal_newlines_backwards() {
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop")
            .rev()
            .collect();
        assert_eq!(
            lines,
            vec![
                Line::new("bop", TextSize::from(14)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("foo\n", TextSize::from(0)),
            ]
        );

        // Compare whole `Line` values (not just their text) so that the offsets computed by
        // `next_back` for an input ending in a newline are verified as well.
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\nbaz\rbop\n")
            .rev()
            .collect();
        assert_eq!(
            lines,
            vec![
                Line::new("bop\n", TextSize::from(13)),
                Line::new("baz\r", TextSize::from(9)),
                Line::new("\n", TextSize::from(8)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("foo\n", TextSize::from(0)),
            ]
        );
    }

    /// Alternating `next` and `next_back` meet in the middle without losing or repeating lines.
    #[test]
    fn universal_newlines_mixed() {
        let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");

        assert_eq!(
            lines.next_back(),
            Some(Line::new("bop", TextSize::from(14)))
        );
        assert_eq!(lines.next(), Some(Line::new("foo\n", TextSize::from(0))));
        assert_eq!(
            lines.next_back(),
            Some(Line::new("baz\r", TextSize::from(10)))
        );
        assert_eq!(lines.next(), Some(Line::new("bar\n", TextSize::from(4))));
        assert_eq!(
            lines.next_back(),
            Some(Line::new("\r\n", TextSize::from(8)))
        );
        assert_eq!(lines.next(), None);
    }
}

View file

@ -0,0 +1,346 @@
//! Functions related to adding and removing indentation from lines of
//! text.
use std::borrow::Cow;
use std::cmp;
use crate::PythonWhitespace;
use ruff_source_file::newlines::UniversalNewlines;
/// Prepends `prefix` to every line of `text`.
///
/// Lines that contain only whitespace receive the prefix with its trailing whitespace
/// removed, so blank lines never gain trailing whitespace. The text's own leading and
/// trailing whitespace and its line endings are kept as they are. With an empty `prefix`,
/// `text` is returned borrowed and unchanged.
///
/// # Examples
///
/// ```
/// # use ruff_python_trivia::textwrap::indent;
///
/// assert_eq!(indent("First line.\nSecond line.\n", "  "),
///            "  First line.\n  Second line.\n");
/// ```
///
/// A prefix made up of whitespace only leaves empty lines empty:
///
/// ```
/// # use ruff_python_trivia::textwrap::indent;
///
/// assert_eq!(indent("First line.\n\n\nSecond line.\n", "  "),
///            "  First line.\n\n\n  Second line.\n");
/// ```
///
/// A prefix that ends in whitespace, such as a comment marker followed by a space, is
/// trimmed on empty lines, so `"\n\n"` becomes `"\n#\n"` rather than `"\n# \n"`:
///
/// ```
/// # use ruff_python_trivia::textwrap::indent;
///
/// assert_eq!(indent("foo = 123\n\nprint(foo)\n", "# "),
///            "# foo = 123\n#\n# print(foo)\n");
/// ```
///
/// Whitespace that is part of the text itself is preserved:
///
/// ```
/// # use ruff_python_trivia::textwrap::indent;
///
/// assert_eq!(indent(" \t Foo ", "->"), "-> \t Foo ");
/// ```
pub fn indent<'a>(text: &'a str, prefix: &str) -> Cow<'a, str> {
    if prefix.is_empty() {
        return Cow::Borrowed(text);
    }

    // Prefix used for whitespace-only lines, so that they do not end in trailing whitespace.
    let blank_line_prefix = prefix.trim_whitespace_end();

    let mut indented = String::with_capacity(text.len() + prefix.len());
    for line in text.universal_newlines() {
        let is_blank = line.trim_whitespace().is_empty();
        indented.push_str(if is_blank { blank_line_prefix } else { prefix });
        indented.push_str(line.as_full_str());
    }

    Cow::Owned(indented)
}
/// Removes common leading whitespace from each line.
///
/// This function will look at each non-empty line and determine the
/// maximum amount of whitespace that can be removed from all lines:
///
/// ```
/// # use ruff_python_trivia::textwrap::dedent;
///
/// assert_eq!(dedent("
/// 1st line
/// 2nd line
/// 3rd line
/// "), "
/// 1st line
/// 2nd line
/// 3rd line
/// ");
/// ```
pub fn dedent(text: &str) -> Cow<'_, str> {
// Find the minimum amount of leading whitespace on each line.
let prefix_len = text
.universal_newlines()
.fold(usize::MAX, |prefix_len, line| {
let leading_whitespace_len = line.len() - line.trim_whitespace_start().len();
if leading_whitespace_len == line.len() {
// Skip empty lines.
prefix_len
} else {
cmp::min(prefix_len, leading_whitespace_len)
}
});
// If there is no common prefix, no need to dedent.
if prefix_len == usize::MAX {
return Cow::Borrowed(text);
}
// Remove the common prefix from each line.
let mut result = String::with_capacity(text.len());
for line in text.universal_newlines() {
if line.trim_whitespace().is_empty() {
if let Some(line_ending) = line.line_ending() {
result.push_str(&line_ending);
}
} else {
result.push_str(&line.as_full_str()[prefix_len..]);
}
}
Cow::Owned(result)
}
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE: the amount of whitespace inside the string fixtures below is significant. Each
    // expected value is the input with exactly the prefix added (`indent`) or exactly the
    // common indentation removed (`dedent`).

    #[test]
    fn indent_empty() {
        assert_eq!(indent("\n", " "), "\n");
    }

    #[test]
    #[rustfmt::skip]
    fn indent_nonempty() {
        let text = [
            "  foo\n",
            "bar\n",
            "  baz\n",
        ].join("");
        let expected = [
            "//   foo\n",
            "// bar\n",
            "//   baz\n",
        ].join("");
        assert_eq!(indent(&text, "// "), expected);
    }

    #[test]
    #[rustfmt::skip]
    fn indent_empty_line() {
        let text = [
            "  foo",
            "bar",
            "",
            "  baz",
        ].join("\n");
        // The empty line receives the prefix without its trailing space.
        let expected = [
            "//   foo",
            "// bar",
            "//",
            "//   baz",
        ].join("\n");
        assert_eq!(indent(&text, "// "), expected);
    }

    #[test]
    #[rustfmt::skip]
    fn indent_mixed_newlines() {
        let text = [
            "  foo\r\n",
            "bar\n",
            "  baz\r",
        ].join("");
        let expected = [
            "//   foo\r\n",
            "// bar\n",
            "//   baz\r",
        ].join("");
        assert_eq!(indent(&text, "// "), expected);
    }

    #[test]
    fn dedent_empty() {
        assert_eq!(dedent(""), "");
    }

    #[test]
    #[rustfmt::skip]
    fn dedent_multi_line() {
        let x = [
            "    foo",
            "  bar",
            "    baz",
        ].join("\n");
        let y = [
            "  foo",
            "bar",
            "  baz"
        ].join("\n");
        assert_eq!(dedent(&x), y);
    }

    #[test]
    #[rustfmt::skip]
    fn dedent_empty_line() {
        let x = [
            "    foo",
            "  bar",
            "   ",
            "    baz"
        ].join("\n");
        let y = [
            "  foo",
            "bar",
            "",
            "  baz"
        ].join("\n");
        assert_eq!(dedent(&x), y);
    }

    #[test]
    #[rustfmt::skip]
    fn dedent_blank_line() {
        let x = [
            "      foo",
            "",
            "        bar",
            "          foo",
            "          bar",
            "          baz",
        ].join("\n");
        let y = [
            "foo",
            "",
            "  bar",
            "    foo",
            "    bar",
            "    baz",
        ].join("\n");
        assert_eq!(dedent(&x), y);
    }

    #[test]
    #[rustfmt::skip]
    fn dedent_whitespace_line() {
        let x = [
            "      foo",
            " ",
            "        bar",
            "          foo",
            "          bar",
            "          baz",
        ].join("\n");
        let y = [
            "foo",
            "",
            "  bar",
            "    foo",
            "    bar",
            "    baz",
        ].join("\n");
        assert_eq!(dedent(&x), y);
    }

    #[test]
    #[rustfmt::skip]
    fn dedent_mixed_whitespace() {
        // The common prefix is measured by length only: one byte (the tab / the first space)
        // is removed from each line.
        let x = [
            "\tfoo",
            "  bar",
        ].join("\n");
        let y = [
            "foo",
            " bar",
        ].join("\n");
        assert_eq!(dedent(&x), y);
    }

    #[test]
    #[rustfmt::skip]
    fn dedent_tabbed_whitespace() {
        let x = [
            "\t\tfoo",
            "\t\t\tbar",
        ].join("\n");
        let y = [
            "foo",
            "\tbar",
        ].join("\n");
        assert_eq!(dedent(&x), y);
    }

    #[test]
    #[rustfmt::skip]
    fn dedent_mixed_tabbed_whitespace() {
        let x = [
            "\t \tfoo",
            "\t \t\tbar",
        ].join("\n");
        let y = [
            "foo",
            "\tbar",
        ].join("\n");
        assert_eq!(dedent(&x), y);
    }

    #[test]
    #[rustfmt::skip]
    fn dedent_preserve_no_terminating_newline() {
        let x = [
            "  foo",
            "    bar",
        ].join("\n");
        let y = [
            "foo",
            "  bar",
        ].join("\n");
        assert_eq!(dedent(&x), y);
    }

    #[test]
    #[rustfmt::skip]
    fn dedent_mixed_newlines() {
        let x = [
            "    foo\r\n",
            "  bar\n",
            "    baz\r",
        ].join("");
        let y = [
            "  foo\r\n",
            "bar\n",
            "  baz\r"
        ].join("");
        assert_eq!(dedent(&x), y);
    }

    #[test]
    fn dedent_non_python_whitespace() {
        // NOTE(review): going by the test name, the run of blank characters at the start of
        // the first line is meant to be whitespace that Python does not treat as indentation.
        // Confirm the characters survived copy/paste. The literal is left untouched here.
        let text = r#"        C = int(f.rea1,0],[-1,0,1]],
[[-1,-1,1],[1,1,-1],[0,-1,0]],
[[-1,-1,-1],[1,1,0],[1,0,1]]
]"#;
        assert_eq!(dedent(text), text);
    }
}

View file

@ -189,6 +189,9 @@ pub enum SimpleTokenKind {
/// `if`
If,
/// `elif`
Elif,
/// `in`
In,
@ -295,6 +298,7 @@ impl<'a> SimpleTokenizer<'a> {
"as" => SimpleTokenKind::As,
"async" => SimpleTokenKind::Async,
"else" => SimpleTokenKind::Else,
"elif" => SimpleTokenKind::Elif,
"if" => SimpleTokenKind::If,
"in" => SimpleTokenKind::In,
"match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.

View file

@ -1,3 +1,41 @@
use ruff_source_file::Locator;
use ruff_text_size::{TextRange, TextSize};
/// Returns the text between the start of the line containing `offset` and `offset` itself,
/// provided that it consists of Python whitespace only.
///
/// Returns `None` when any other character precedes `offset` on its line.
pub fn indentation_at_offset<'a>(locator: &'a Locator, offset: TextSize) -> Option<&'a str> {
    let leading_range = TextRange::new(locator.line_start(offset), offset);
    let candidate = &locator.contents()[leading_range];

    if !candidate.chars().all(is_python_whitespace) {
        return None;
    }
    Some(candidate)
}
/// Return `true` if the node starting the given [`TextSize`] has leading content.
pub fn has_leading_content(offset: TextSize, locator: &Locator) -> bool {
let line_start = locator.line_start(offset);
let leading = &locator.contents()[TextRange::new(line_start, offset)];
leading.chars().any(|char| !is_python_whitespace(char))
}
/// Returns `true` if something other than Python whitespace or a comment follows `offset`
/// on its line.
pub fn has_trailing_content(offset: TextSize, locator: &Locator) -> bool {
    let rest_of_line = TextRange::new(offset, locator.line_end(offset));

    for c in locator.contents()[rest_of_line].chars() {
        match c {
            // A comment starts here: whatever follows does not count as content.
            '#' => return false,
            c if !is_python_whitespace(c) => return true,
            _ => {}
        }
    }

    false
}
/// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
/// characters.
pub const fn is_python_whitespace(c: char) -> bool {
@ -41,3 +79,50 @@ impl PythonWhitespace for str {
self.trim_end_matches(is_python_whitespace)
}
}
#[cfg(test)]
mod tests {
    use crate::has_trailing_content;
    use ruff_source_file::Locator;
    use rustpython_ast::{Ranged, Suite};
    use rustpython_parser::{Parse, ParseError};

    /// Parses `contents` and reports whether the first statement is followed by trailing
    /// content on its line.
    fn first_statement_has_trailing_content(contents: &str) -> Result<bool, ParseError> {
        let program = Suite::parse(contents, "<filename>")?;
        let stmt = program.first().unwrap();
        let locator = Locator::new(contents);
        Ok(has_trailing_content(stmt.end(), &locator))
    }

    #[test]
    fn trailing_content() -> Result<(), ParseError> {
        // Nothing after the statement.
        assert!(!first_statement_has_trailing_content("x = 1")?);

        // A second statement on the same line.
        assert!(first_statement_has_trailing_content("x = 1; y = 2")?);

        // Only whitespace after the statement.
        assert!(!first_statement_has_trailing_content("x = 1 ")?);

        // Only a comment after the statement.
        assert!(!first_statement_has_trailing_content("x = 1 # Comment")?);

        // The next statement is on its own line.
        let contents = r#"
x = 1
y = 2
"#
        .trim();
        assert!(!first_statement_has_trailing_content(contents)?);

        Ok(())
    }
}