mirror of https://github.com/astral-sh/ruff.git (synced 2025-08-31 23:57:35 +00:00)

Rename ruff_python_whitespace to ruff_python_trivia (#5886)
## Summary

This crate now contains utilities for dealing with trivia more broadly: whitespace, newlines, "simple" trivia lexing, etc., so I've renamed it to reflect its increased responsibilities. To avoid conflicts, I've also renamed `Token` and `TokenKind` to `SimpleToken` and `SimpleTokenKind`.
parent: a75a6de577
commit: 5f3da9955a

86 changed files with 360 additions and 353 deletions
22 crates/ruff_python_trivia/Cargo.toml Normal file

@@ -0,0 +1,22 @@
[package]
name = "ruff_python_trivia"
version = "0.0.0"
publish = false
authors = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
homepage = { workspace = true }
documentation = { workspace = true }
repository = { workspace = true }
license = { workspace = true }

[lib]

[dependencies]
ruff_text_size = { workspace = true }

memchr = { workspace = true }
unic-ucd-ident = "0.9.0"

[dev-dependencies]
insta = { workspace = true }
103 crates/ruff_python_trivia/src/cursor.rs Normal file

@@ -0,0 +1,103 @@
use std::str::Chars;

use ruff_text_size::{TextLen, TextSize};

pub const EOF_CHAR: char = '\0';

/// A [`Cursor`] over a string.
#[derive(Debug, Clone)]
pub struct Cursor<'a> {
    chars: Chars<'a>,
    source_length: TextSize,
}

impl<'a> Cursor<'a> {
    pub fn new(source: &'a str) -> Self {
        Self {
            source_length: source.text_len(),
            chars: source.chars(),
        }
    }

    /// Return the remaining input as a string slice.
    pub fn chars(&self) -> Chars<'a> {
        self.chars.clone()
    }

    /// Peeks the next character from the input stream without consuming it.
    /// Returns [`EOF_CHAR`] if the cursor is at the end of the file.
    pub fn first(&self) -> char {
        self.chars.clone().next().unwrap_or(EOF_CHAR)
    }

    /// Peeks the last character from the input stream without consuming it.
    /// Returns [`EOF_CHAR`] if the cursor is at the start of the file.
    pub fn last(&self) -> char {
        self.chars.clone().next_back().unwrap_or(EOF_CHAR)
    }

    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
    #[allow(clippy::cast_possible_truncation)]
    pub fn text_len(&self) -> TextSize {
        TextSize::new(self.chars.as_str().len() as u32)
    }

    pub fn token_len(&self) -> TextSize {
        self.source_length - self.text_len()
    }

    pub fn start_token(&mut self) {
        self.source_length = self.text_len();
    }

    /// Returns `true` if the cursor is at the end of the file.
    pub fn is_eof(&self) -> bool {
        self.chars.as_str().is_empty()
    }

    /// Consumes the next character.
    pub fn bump(&mut self) -> Option<char> {
        self.chars.next()
    }

    /// Consumes the next character from the back.
    pub fn bump_back(&mut self) -> Option<char> {
        self.chars.next_back()
    }

    pub fn eat_char(&mut self, c: char) -> bool {
        if self.first() == c {
            self.bump();
            true
        } else {
            false
        }
    }

    pub fn eat_char_back(&mut self, c: char) -> bool {
        if self.last() == c {
            self.bump_back();
            true
        } else {
            false
        }
    }

    /// Eats symbols while the predicate returns `true` or until the end of file is reached.
    pub fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
        // We tried writing an optimized version of this for e.g. line comments, but
        // LLVM can inline all of this and compile it down to fast iteration over bytes.
        while predicate(self.first()) && !self.is_eof() {
            self.bump();
        }
    }

    /// Eats symbols from the back while the predicate returns `true` or until the beginning of the file is reached.
    pub fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
        // We tried writing an optimized version of this for e.g. line comments, but
        // LLVM can inline all of this and compile it down to fast iteration over bytes.
        while predicate(self.last()) && !self.is_eof() {
            self.bump_back();
        }
    }
}
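To make the `start_token`/`token_len` bookkeeping concrete, here is a minimal usage sketch (illustrative only, not part of this diff); it assumes `Cursor` is re-exported from the crate root, as `lib.rs` below does:

```rust
use ruff_python_trivia::Cursor;
use ruff_text_size::TextSize;

fn main() {
    let mut cursor = Cursor::new("   foo");

    // Mark the start of a token, then consume the leading whitespace.
    cursor.start_token();
    cursor.eat_while(|c| c == ' ');

    // `token_len` is the number of bytes consumed since `start_token`.
    assert_eq!(cursor.token_len(), TextSize::new(3));
    // `first` peeks without consuming: the next character is still `f`.
    assert_eq!(cursor.first(), 'f');
}
```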
9 crates/ruff_python_trivia/src/lib.rs Normal file

@@ -0,0 +1,9 @@
mod cursor;
mod newlines;
mod tokenizer;
mod whitespace;

pub use cursor::*;
pub use newlines::*;
pub use tokenizer::*;
pub use whitespace::*;
453 crates/ruff_python_trivia/src/newlines.rs Normal file

@@ -0,0 +1,453 @@
use std::iter::FusedIterator;
use std::ops::Deref;

use memchr::{memchr2, memrchr2};
use ruff_text_size::{TextLen, TextRange, TextSize};

/// Extension trait for [`str`] that provides a [`UniversalNewlineIterator`].
pub trait UniversalNewlines {
    fn universal_newlines(&self) -> UniversalNewlineIterator<'_>;
}

impl UniversalNewlines for str {
    fn universal_newlines(&self) -> UniversalNewlineIterator<'_> {
        UniversalNewlineIterator::from(self)
    }
}

/// Like [`str#lines`], but accommodates LF, CRLF, and CR line endings,
/// the latter of which is not supported by [`str#lines`].
///
/// ## Examples
///
/// ```rust
/// # use ruff_text_size::TextSize;
/// # use ruff_python_trivia::{Line, UniversalNewlineIterator;
/// let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");
///
/// assert_eq!(lines.next_back(), Some(Line::new("bop", TextSize::from(14))));
/// assert_eq!(lines.next(), Some(Line::new("foo\n", TextSize::from(0))));
/// assert_eq!(lines.next_back(), Some(Line::new("baz\r", TextSize::from(10))));
/// assert_eq!(lines.next(), Some(Line::new("bar\n", TextSize::from(4))));
/// assert_eq!(lines.next_back(), Some(Line::new("\r\n", TextSize::from(8))));
/// assert_eq!(lines.next(), None);
/// ```
pub struct UniversalNewlineIterator<'a> {
    text: &'a str,
    offset: TextSize,
    offset_back: TextSize,
}

impl<'a> UniversalNewlineIterator<'a> {
    pub fn with_offset(text: &'a str, offset: TextSize) -> UniversalNewlineIterator<'a> {
        UniversalNewlineIterator {
            text,
            offset,
            offset_back: offset + text.text_len(),
        }
    }

    pub fn from(text: &'a str) -> UniversalNewlineIterator<'a> {
        Self::with_offset(text, TextSize::default())
    }
}

/// Finds the next newline character. Returns its position and the [`LineEnding`].
#[inline]
pub fn find_newline(text: &str) -> Option<(usize, LineEnding)> {
    let bytes = text.as_bytes();
    if let Some(position) = memchr2(b'\n', b'\r', bytes) {
        // SAFETY: memchr is guaranteed to return valid positions.
        #[allow(unsafe_code)]
        let newline_character = unsafe { *bytes.get_unchecked(position) };

        let line_ending = match newline_character {
            // Explicit branch for `\n` as this is the most likely path
            b'\n' => LineEnding::Lf,
            // '\r\n'
            b'\r' if bytes.get(position.saturating_add(1)) == Some(&b'\n') => LineEnding::CrLf,
            // '\r'
            _ => LineEnding::Cr,
        };

        Some((position, line_ending))
    } else {
        None
    }
}

impl<'a> Iterator for UniversalNewlineIterator<'a> {
    type Item = Line<'a>;

    #[inline]
    fn next(&mut self) -> Option<Line<'a>> {
        if self.text.is_empty() {
            return None;
        }

        let line = if let Some((newline_position, line_ending)) = find_newline(self.text) {
            let (text, remainder) = self.text.split_at(newline_position + line_ending.len());

            let line = Line {
                offset: self.offset,
                text,
            };

            self.text = remainder;
            self.offset += text.text_len();

            line
        }
        // Last line
        else {
            Line {
                offset: self.offset,
                text: std::mem::take(&mut self.text),
            }
        };

        Some(line)
    }

    fn last(mut self) -> Option<Self::Item> {
        self.next_back()
    }
}

impl DoubleEndedIterator for UniversalNewlineIterator<'_> {
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.text.is_empty() {
            return None;
        }

        let len = self.text.len();

        // Trim any trailing newlines.
        let haystack = match self.text.as_bytes()[len - 1] {
            b'\n' if len > 1 && self.text.as_bytes()[len - 2] == b'\r' => &self.text[..len - 2],
            b'\n' | b'\r' => &self.text[..len - 1],
            _ => self.text,
        };

        // Find the end of the previous line. The previous line is the text up to, but not including,
        // the newline character.
        let line = if let Some(line_end) = memrchr2(b'\n', b'\r', haystack.as_bytes()) {
            // '\n' or '\r' or '\r\n'
            let (remainder, line) = self.text.split_at(line_end + 1);
            self.text = remainder;
            self.offset_back -= line.text_len();

            Line {
                text: line,
                offset: self.offset_back,
            }
        } else {
            // Last line
            let offset = self.offset_back - self.text.text_len();
            Line {
                text: std::mem::take(&mut self.text),
                offset,
            }
        };

        Some(line)
    }
}

impl FusedIterator for UniversalNewlineIterator<'_> {}

/// Like [`UniversalNewlineIterator`], but includes a trailing newline as an empty line.
pub struct NewlineWithTrailingNewline<'a> {
    trailing: Option<Line<'a>>,
    underlying: UniversalNewlineIterator<'a>,
}

impl<'a> NewlineWithTrailingNewline<'a> {
    pub fn from(input: &'a str) -> NewlineWithTrailingNewline<'a> {
        Self::with_offset(input, TextSize::default())
    }

    pub fn with_offset(input: &'a str, offset: TextSize) -> Self {
        NewlineWithTrailingNewline {
            underlying: UniversalNewlineIterator::with_offset(input, offset),
            trailing: if input.ends_with(['\r', '\n']) {
                Some(Line {
                    text: "",
                    offset: offset + input.text_len(),
                })
            } else {
                None
            },
        }
    }
}

impl<'a> Iterator for NewlineWithTrailingNewline<'a> {
    type Item = Line<'a>;

    #[inline]
    fn next(&mut self) -> Option<Line<'a>> {
        self.underlying.next().or_else(|| self.trailing.take())
    }
}

#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Line<'a> {
    text: &'a str,
    offset: TextSize,
}

impl<'a> Line<'a> {
    pub fn new(text: &'a str, offset: TextSize) -> Self {
        Self { text, offset }
    }

    #[inline]
    pub const fn start(&self) -> TextSize {
        self.offset
    }

    /// Returns the byte offset where the line ends, including its terminating new line character.
    #[inline]
    pub fn full_end(&self) -> TextSize {
        self.offset + self.full_text_len()
    }

    /// Returns the byte offset where the line ends, excluding its new line character.
    #[inline]
    pub fn end(&self) -> TextSize {
        self.offset + self.as_str().text_len()
    }

    /// Returns the range of the line, including its terminating new line character.
    #[inline]
    pub fn full_range(&self) -> TextRange {
        TextRange::at(self.offset, self.text.text_len())
    }

    /// Returns the range of the line, excluding its terminating new line character.
    #[inline]
    pub fn range(&self) -> TextRange {
        TextRange::new(self.start(), self.end())
    }

    /// Returns the line's new line character, if any.
    #[inline]
    pub fn line_ending(&self) -> Option<LineEnding> {
        let mut bytes = self.text.bytes().rev();
        match bytes.next() {
            Some(b'\n') => {
                if bytes.next() == Some(b'\r') {
                    Some(LineEnding::CrLf)
                } else {
                    Some(LineEnding::Lf)
                }
            }
            Some(b'\r') => Some(LineEnding::Cr),
            _ => None,
        }
    }

    /// Returns the text of the line, excluding the terminating new line character.
    #[inline]
    pub fn as_str(&self) -> &'a str {
        let newline_len = self
            .line_ending()
            .map_or(0, |line_ending| line_ending.len());
        &self.text[..self.text.len() - newline_len]
    }

    /// Returns the line's text, including the terminating new line character.
    #[inline]
    pub fn as_full_str(&self) -> &'a str {
        self.text
    }

    #[inline]
    pub fn full_text_len(&self) -> TextSize {
        self.text.text_len()
    }
}

impl Deref for Line<'_> {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        self.as_str()
    }
}

impl PartialEq<&str> for Line<'_> {
    fn eq(&self, other: &&str) -> bool {
        self.as_str() == *other
    }
}

impl PartialEq<Line<'_>> for &str {
    fn eq(&self, other: &Line<'_>) -> bool {
        *self == other.as_str()
    }
}

/// The line ending style used in Python source code.
/// See <https://docs.python.org/3/reference/lexical_analysis.html#physical-lines>
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum LineEnding {
    Lf,
    Cr,
    CrLf,
}

impl Default for LineEnding {
    fn default() -> Self {
        if cfg!(windows) {
            LineEnding::CrLf
        } else {
            LineEnding::Lf
        }
    }
}

impl LineEnding {
    pub const fn as_str(&self) -> &'static str {
        match self {
            LineEnding::Lf => "\n",
            LineEnding::CrLf => "\r\n",
            LineEnding::Cr => "\r",
        }
    }

    #[allow(clippy::len_without_is_empty)]
    pub const fn len(&self) -> usize {
        match self {
            LineEnding::Lf | LineEnding::Cr => 1,
            LineEnding::CrLf => 2,
        }
    }

    pub const fn text_len(&self) -> TextSize {
        match self {
            LineEnding::Lf | LineEnding::Cr => TextSize::new(1),
            LineEnding::CrLf => TextSize::new(2),
        }
    }
}

impl Deref for LineEnding {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        self.as_str()
    }
}

#[cfg(test)]
mod tests {
    use ruff_text_size::TextSize;

    use super::{Line, UniversalNewlineIterator};

    #[test]
    fn universal_newlines_empty_str() {
        let lines: Vec<_> = UniversalNewlineIterator::from("").collect();
        assert_eq!(lines, Vec::<Line>::new());

        let lines: Vec<_> = UniversalNewlineIterator::from("").rev().collect();
        assert_eq!(lines, Vec::<Line>::new());
    }

    #[test]
    fn universal_newlines_forward() {
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop", TextSize::from(14)),
            ]
        );

        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop\n").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop\n", TextSize::from(14)),
            ]
        );

        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop\n\n").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop\n", TextSize::from(14)),
                Line::new("\n", TextSize::from(18)),
            ]
        );
    }

    #[test]
    fn universal_newlines_backwards() {
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop")
            .rev()
            .collect();
        assert_eq!(
            lines,
            vec![
                Line::new("bop", TextSize::from(14)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("foo\n", TextSize::from(0)),
            ]
        );

        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\nbaz\rbop\n")
            .rev()
            .map(|line| line.as_str())
            .collect();

        assert_eq!(
            lines,
            vec![
                Line::new("bop\n", TextSize::from(13)),
                Line::new("baz\r", TextSize::from(9)),
                Line::new("\n", TextSize::from(8)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("foo\n", TextSize::from(0)),
            ]
        );
    }

    #[test]
    fn universal_newlines_mixed() {
        let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");

        assert_eq!(
            lines.next_back(),
            Some(Line::new("bop", TextSize::from(14)))
        );
        assert_eq!(lines.next(), Some(Line::new("foo\n", TextSize::from(0))));
        assert_eq!(
            lines.next_back(),
            Some(Line::new("baz\r", TextSize::from(10)))
        );
        assert_eq!(lines.next(), Some(Line::new("bar\n", TextSize::from(4))));
        assert_eq!(
            lines.next_back(),
            Some(Line::new("\r\n", TextSize::from(8)))
        );
        assert_eq!(lines.next(), None);
    }
}
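A small usage sketch (illustrative only, not part of this diff) showing how the `UniversalNewlines` extension trait and `LineEnding` fit together, using only the APIs defined above:

```rust
use ruff_python_trivia::{LineEnding, UniversalNewlines};

fn main() {
    // `universal_newlines` splits on LF, CR, and CRLF alike.
    let source = "a = 1\r\nb = 2\rc = 3\n";
    let lines: Vec<&str> = source
        .universal_newlines()
        .map(|line| line.as_str()) // `as_str` strips the terminator
        .collect();
    assert_eq!(lines, ["a = 1", "b = 2", "c = 3"]);

    // Each line remembers its own terminator; a CRLF terminator is two bytes long.
    let first = source.universal_newlines().next().unwrap();
    assert_eq!(first.line_ending(), Some(LineEnding::CrLf));
    assert_eq!(LineEnding::CrLf.len(), 2);
}
```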
@@ -0,0 +1,218 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokenize_reverse()
---
[
    SimpleToken { kind: RParen, range: 52..53 },
    SimpleToken { kind: Other, range: 51..52 },
    SimpleToken { kind: Bogus, range: 50..51 },
    SimpleToken { kind: Bogus, range: 49..50 },
    SimpleToken { kind: Bogus, range: 48..49 },
    SimpleToken { kind: Bogus, range: 47..48 },
    SimpleToken { kind: Bogus, range: 46..47 },
    SimpleToken { kind: Bogus, range: 45..46 },
    SimpleToken { kind: Bogus, range: 44..45 },
    SimpleToken { kind: Bogus, range: 43..44 },
    SimpleToken { kind: Bogus, range: 42..43 },
    SimpleToken { kind: Bogus, range: 41..42 },
    SimpleToken { kind: Bogus, range: 40..41 },
    SimpleToken { kind: Bogus, range: 39..40 },
    SimpleToken { kind: Bogus, range: 38..39 },
    SimpleToken { kind: Bogus, range: 37..38 },
    SimpleToken { kind: Bogus, range: 36..37 },
    SimpleToken { kind: Bogus, range: 35..36 },
    SimpleToken { kind: Bogus, range: 34..35 },
    SimpleToken { kind: Bogus, range: 33..34 },
    SimpleToken { kind: Bogus, range: 32..33 },
    SimpleToken { kind: Bogus, range: 31..32 },
    SimpleToken { kind: Bogus, range: 30..31 },
    SimpleToken { kind: Bogus, range: 29..30 },
    SimpleToken { kind: Bogus, range: 28..29 },
    SimpleToken { kind: Bogus, range: 27..28 },
    SimpleToken { kind: Bogus, range: 26..27 },
    SimpleToken { kind: Bogus, range: 25..26 },
    SimpleToken { kind: Bogus, range: 24..25 },
    SimpleToken { kind: Bogus, range: 23..24 },
    SimpleToken { kind: Bogus, range: 22..23 },
    SimpleToken { kind: Bogus, range: 21..22 },
    SimpleToken { kind: Bogus, range: 20..21 },
    SimpleToken { kind: Bogus, range: 19..20 },
    SimpleToken { kind: Bogus, range: 18..19 },
    SimpleToken { kind: Bogus, range: 17..18 },
    SimpleToken { kind: Bogus, range: 16..17 },
    SimpleToken { kind: Bogus, range: 15..16 },
    SimpleToken { kind: Bogus, range: 14..15 },
    SimpleToken { kind: Bogus, range: 13..14 },
    SimpleToken { kind: Bogus, range: 12..13 },
    SimpleToken { kind: Bogus, range: 11..12 },
    SimpleToken { kind: Bogus, range: 10..11 },
    SimpleToken { kind: Bogus, range: 9..10 },
    SimpleToken { kind: Bogus, range: 8..9 },
    SimpleToken { kind: Bogus, range: 7..8 },
    SimpleToken { kind: Bogus, range: 6..7 },
    SimpleToken { kind: Bogus, range: 5..6 },
    SimpleToken { kind: Bogus, range: 4..5 },
    SimpleToken { kind: Bogus, range: 3..4 },
    SimpleToken { kind: Bogus, range: 2..3 },
    SimpleToken { kind: Bogus, range: 1..2 },
    SimpleToken { kind: Bogus, range: 0..1 },
]

@@ -0,0 +1,10 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: Other, range: 0..2 },
]

@@ -0,0 +1,18 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: Other, range: 0..1 },
    SimpleToken { kind: Bogus, range: 1..2 },
    SimpleToken { kind: Bogus, range: 2..3 },
]

@@ -0,0 +1,126 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: Comment, range: 0..17 },
    SimpleToken { kind: Newline, range: 17..18 },
    SimpleToken { kind: Whitespace, range: 18..26 },
    SimpleToken { kind: Other, range: 26..27 },
    SimpleToken { kind: Bogus, range: 27..28 },
    SimpleToken { kind: Bogus, range: 28..29 },
    SimpleToken { kind: Bogus, range: 29..30 },
    SimpleToken { kind: Bogus, range: 30..31 },
    SimpleToken { kind: Bogus, range: 31..32 },
    SimpleToken { kind: Bogus, range: 32..33 },
    SimpleToken { kind: Bogus, range: 33..34 },
    SimpleToken { kind: Bogus, range: 34..35 },
    SimpleToken { kind: Bogus, range: 35..36 },
    SimpleToken { kind: Bogus, range: 36..37 },
    SimpleToken { kind: Bogus, range: 37..38 },
    SimpleToken { kind: Bogus, range: 38..39 },
    SimpleToken { kind: Bogus, range: 39..40 },
    SimpleToken { kind: Bogus, range: 40..41 },
    SimpleToken { kind: Bogus, range: 41..42 },
    SimpleToken { kind: Bogus, range: 42..43 },
    SimpleToken { kind: Bogus, range: 43..44 },
    SimpleToken { kind: Bogus, range: 44..45 },
    SimpleToken { kind: Bogus, range: 45..46 },
    SimpleToken { kind: Bogus, range: 46..47 },
    SimpleToken { kind: Bogus, range: 47..48 },
    SimpleToken { kind: Bogus, range: 48..49 },
    SimpleToken { kind: Bogus, range: 49..50 },
    SimpleToken { kind: Bogus, range: 50..51 },
    SimpleToken { kind: Bogus, range: 51..52 },
    SimpleToken { kind: Bogus, range: 52..53 },
]

@@ -0,0 +1,22 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: Comma, range: 0..1 },
    SimpleToken { kind: Comma, range: 1..2 },
    SimpleToken { kind: Comma, range: 2..3 },
    SimpleToken { kind: Comma, range: 3..4 },
]

@@ -0,0 +1,30 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: LParen, range: 0..1 },
    SimpleToken { kind: Whitespace, range: 1..2 },
    SimpleToken { kind: Continuation, range: 2..3 },
    SimpleToken { kind: Newline, range: 3..4 },
    SimpleToken { kind: Whitespace, range: 4..5 },
    SimpleToken { kind: RParen, range: 5..6 },
]

@@ -0,0 +1,34 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: If, range: 0..2 },
    SimpleToken { kind: Whitespace, range: 2..3 },
    SimpleToken { kind: In, range: 3..5 },
    SimpleToken { kind: Whitespace, range: 5..6 },
    SimpleToken { kind: Else, range: 6..10 },
    SimpleToken { kind: Whitespace, range: 10..11 },
    SimpleToken { kind: Match, range: 11..16 },
]

@@ -0,0 +1,30 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: LParen, range: 0..1 },
    SimpleToken { kind: LBracket, range: 1..2 },
    SimpleToken { kind: LBrace, range: 2..3 },
    SimpleToken { kind: RBrace, range: 3..4 },
    SimpleToken { kind: RBracket, range: 4..5 },
    SimpleToken { kind: RParen, range: 5..6 },
]

@@ -0,0 +1,42 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: Whitespace, range: 0..1 },
    SimpleToken { kind: Comment, range: 1..30 },
    SimpleToken { kind: Newline, range: 30..31 },
    SimpleToken { kind: Whitespace, range: 31..39 },
    SimpleToken { kind: Comment, range: 39..77 },
    SimpleToken { kind: Newline, range: 77..78 },
    SimpleToken { kind: Whitespace, range: 78..86 },
    SimpleToken { kind: Comma, range: 86..87 },
    SimpleToken { kind: Slash, range: 87..88 },
]

@@ -0,0 +1,18 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: RParen, range: 14..15 },
    SimpleToken { kind: Whitespace, range: 15..16 },
    SimpleToken { kind: Comment, range: 16..25 },
]

@@ -0,0 +1,22 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: Comment, range: 0..9 },
    SimpleToken { kind: Newline, range: 9..10 },
    SimpleToken { kind: Whitespace, range: 10..14 },
    SimpleToken { kind: Comment, range: 14..23 },
]

@@ -0,0 +1,10 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
    SimpleToken { kind: Other, range: 0..6 },
]
784 crates/ruff_python_trivia/src/tokenizer.rs Normal file

@@ -0,0 +1,784 @@
use memchr::memrchr3_iter;
use ruff_text_size::{TextLen, TextRange, TextSize};
use unic_ucd_ident::{is_xid_continue, is_xid_start};

use crate::{is_python_whitespace, Cursor};

/// Searches for the first non-trivia character after `offset`.
///
/// The search skips over any whitespace and comments.
///
/// Returns `Some` with the first non-trivia token if the range contains any non-trivia character;
/// the token's range gives the absolute offset of the character.
///
/// Returns `None` if the range is empty or only contains trivia (whitespace or comments).
pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToken> {
    SimpleTokenizer::starts_at(offset, code)
        .skip_trivia()
        .next()
}

/// Returns the first non-trivia token right before `offset`, or `None` if at the start of the file
/// or all preceding tokens are trivia tokens.
///
/// ## Notes
///
/// Prefer [`first_non_trivia_token`] whenever possible; reverse lookups are expensive because of comments.
pub fn first_non_trivia_token_rev(offset: TextSize, code: &str) -> Option<SimpleToken> {
    SimpleTokenizer::up_to(offset, code)
        .skip_trivia()
        .next_back()
}

/// Returns the number of newlines between `offset` and the first non-whitespace character in the source code.
pub fn lines_before(offset: TextSize, code: &str) -> u32 {
    let tokens = SimpleTokenizer::up_to(offset, code);
    let mut newlines = 0u32;

    for token in tokens.rev() {
        match token.kind() {
            SimpleTokenKind::Newline => {
                newlines += 1;
            }
            SimpleTokenKind::Whitespace => {
                // ignore
            }
            _ => {
                break;
            }
        }
    }

    newlines
}

/// Counts the empty lines between `offset` and the first non-whitespace character.
pub fn lines_after(offset: TextSize, code: &str) -> u32 {
    let tokens = SimpleTokenizer::starts_at(offset, code);
    let mut newlines = 0u32;

    for token in tokens {
        match token.kind() {
            SimpleTokenKind::Newline => {
                newlines += 1;
            }
            SimpleTokenKind::Whitespace => {
                // ignore
            }
            _ => {
                break;
            }
        }
    }

    newlines
}

/// Returns the position after skipping any trailing trivia up to, but not including, the newline character.
pub fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
    let tokenizer = SimpleTokenizer::starts_at(offset, code);

    for token in tokenizer {
        match token.kind() {
            SimpleTokenKind::Whitespace
            | SimpleTokenKind::Comment
            | SimpleTokenKind::Continuation => {
                // No op
            }
            _ => {
                return token.start();
            }
        }
    }

    offset
}

fn is_identifier_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
}

// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
    if c.is_ascii() {
        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
    } else {
        is_xid_continue(c)
    }
}

fn is_non_ascii_identifier_start(c: char) -> bool {
    is_xid_start(c)
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
    pub kind: SimpleTokenKind,
    pub range: TextRange,
}

impl SimpleToken {
    pub const fn kind(&self) -> SimpleTokenKind {
        self.kind
    }

    #[allow(unused)]
    pub const fn range(&self) -> TextRange {
        self.range
    }

    pub const fn start(&self) -> TextSize {
        self.range.start()
    }

    pub const fn end(&self) -> TextSize {
        self.range.end()
    }
}

#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum SimpleTokenKind {
    /// A comment, not including the trailing new line.
    Comment,

    /// Sequence of ' ' or '\t'
    Whitespace,

    /// Start or end of the file
    EndOfFile,

    /// `\\`
    Continuation,

    /// `\n` or `\r` or `\r\n`
    Newline,

    /// `(`
    LParen,

    /// `)`
    RParen,

    /// `{`
    LBrace,

    /// `}`
    RBrace,

    /// `[`
    LBracket,

    /// `]`
    RBracket,

    /// `,`
    Comma,

    /// `:`
    Colon,

    /// '/'
    Slash,

    /// '*'
    Star,

    /// `.`
    Dot,

    /// `else`
    Else,

    /// `if`
    If,

    /// `in`
    In,

    /// `as`
    As,

    /// `match`
    Match,

    /// `with`
    With,

    /// `async`
    Async,

    /// Any other non-trivia token.
    Other,

    /// Returned for each character after [`SimpleTokenKind::Other`] has been returned once.
    Bogus,
}

impl SimpleTokenKind {
    const fn from_non_trivia_char(c: char) -> SimpleTokenKind {
        match c {
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,
            ',' => SimpleTokenKind::Comma,
            ':' => SimpleTokenKind::Colon,
            '/' => SimpleTokenKind::Slash,
            '*' => SimpleTokenKind::Star,
            '.' => SimpleTokenKind::Dot,
            _ => SimpleTokenKind::Other,
        }
    }

    const fn is_trivia(self) -> bool {
        matches!(
            self,
            SimpleTokenKind::Whitespace
                | SimpleTokenKind::Newline
                | SimpleTokenKind::Comment
                | SimpleTokenKind::Continuation
        )
    }
}

/// Simple zero-allocation tokenizer for tokenizing trivia (and some tokens).
///
/// The tokenizer must start at an offset that is trivia (e.g., not inside a multiline string).
///
/// The tokenizer doesn't guarantee any correctness after it has returned a [`SimpleTokenKind::Other`]. That's why it
/// will return [`SimpleTokenKind::Bogus`] for every character after that until it reaches the end of the file.
pub struct SimpleTokenizer<'a> {
    offset: TextSize,
    back_offset: TextSize,
    /// `true` when it is known for certain that the current `back` line has no comment.
    back_line_has_no_comment: bool,
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
}

impl<'a> SimpleTokenizer<'a> {
    pub fn new(source: &'a str, range: TextRange) -> Self {
        Self {
            offset: range.start(),
            back_offset: range.end(),
            back_line_has_no_comment: false,
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    pub fn starts_at(offset: TextSize, source: &'a str) -> Self {
        let range = TextRange::new(offset, source.text_len());
        Self::new(source, range)
    }

    /// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`.
    pub fn up_to(offset: TextSize, source: &'a str) -> Self {
        Self::new(source, TextRange::up_to(offset))
    }

    /// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`, and informs
    /// the lexer that the line at `offset` contains no comments. This can significantly speed up backwards lexing
    /// because the lexer doesn't need to scan for comments.
    pub fn up_to_without_back_comment(offset: TextSize, source: &'a str) -> Self {
        let mut tokenizer = Self::up_to(offset, source);
        tokenizer.back_line_has_no_comment = true;
        tokenizer
    }

    fn to_keyword_or_other(&self, range: TextRange) -> SimpleTokenKind {
        let source = &self.source[range];
        match source {
            "as" => SimpleTokenKind::As,
            "async" => SimpleTokenKind::Async,
            "else" => SimpleTokenKind::Else,
            "if" => SimpleTokenKind::If,
            "in" => SimpleTokenKind::In,
            // `match` is a soft keyword that depends on the context, but we can always lex it as a
            // keyword and leave it to the caller (parser) to decide whether it should be handled as
            // an identifier or keyword.
            "match" => SimpleTokenKind::Match,
            "with" => SimpleTokenKind::With,
            // ...
            // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now:
            // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
            _ => SimpleTokenKind::Other,
        }
    }

    fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();

        let Some(first) = self.cursor.bump() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.offset),
            };
        };

        if self.bogus {
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::at(self.offset, first.text_len()),
            };

            self.offset += first.text_len();
            return token;
        }

        let kind = match first {
            ' ' | '\t' => {
                self.cursor.eat_while(|c| matches!(c, ' ' | '\t'));
                SimpleTokenKind::Whitespace
            }

            '\n' => SimpleTokenKind::Newline,

            '\r' => {
                self.cursor.eat_char('\n');
                SimpleTokenKind::Newline
            }

            '#' => {
                self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
                SimpleTokenKind::Comment
            }

            '\\' => SimpleTokenKind::Continuation,

            c => {
                let kind = if is_identifier_start(c) {
                    self.cursor.eat_while(is_identifier_continuation);
                    let token_len = self.cursor.token_len();

                    let range = TextRange::at(self.offset, token_len);
                    self.to_keyword_or_other(range)
                } else {
                    SimpleTokenKind::from_non_trivia_char(c)
                };

                if kind == SimpleTokenKind::Other {
                    self.bogus = true;
                }
                kind
            }
        };

        let token_len = self.cursor.token_len();

        let token = SimpleToken {
            kind,
            range: TextRange::at(self.offset, token_len),
        };

        self.offset += token_len;

        token
    }

    /// Returns the next token from the back. Prefer iterating forwards; iterating backwards is significantly more expensive
    /// because it needs to check if the line has any comments when encountering any non-trivia token.
    pub fn next_token_back(&mut self) -> SimpleToken {
        self.cursor.start_token();

        let Some(last) = self.cursor.bump_back() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.back_offset),
            };
        };

        if self.bogus {
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::at(self.back_offset - last.text_len(), last.text_len()),
            };

            self.back_offset -= last.text_len();
            return token;
        }

        let kind = match last {
            // This may not be 100% correct because it will lex out trailing whitespace from a comment
            // as whitespace rather than being part of the token. This shouldn't matter for what we use the lexer for.
            ' ' | '\t' => {
                self.cursor.eat_back_while(|c| matches!(c, ' ' | '\t'));
                SimpleTokenKind::Whitespace
            }

            '\r' => {
                self.back_line_has_no_comment = false;
                SimpleTokenKind::Newline
            }

            '\n' => {
                self.back_line_has_no_comment = false;
                self.cursor.eat_char_back('\r');
                SimpleTokenKind::Newline
            }

            // Empty comment (could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for)
            '#' => SimpleTokenKind::Comment,

            // For all other tokens, test if the character isn't part of a comment.
            c => {
                // Skip the comment test if it has already been performed for this line.
                let comment_offset = if self.back_line_has_no_comment {
                    None
                } else {
                    let bytes = self.cursor.chars().as_str().as_bytes();
                    let mut line_start = 0;
                    let mut last_comment_offset = None;

                    // Find the start of the line, or any potential comments.
                    for index in memrchr3_iter(b'\n', b'\r', b'#', bytes) {
                        if bytes[index] == b'#' {
                            // Potentially a comment, but not guaranteed
                            last_comment_offset = Some(index);
                        } else {
                            line_start = index + 1;
                            break;
                        }
                    }

                    // Verify if this is indeed a comment. Doing this only when we've found a comment is significantly
                    // faster because comments are rare.
                    last_comment_offset.filter(|last_comment_offset| {
                        let before_comment =
                            &self.cursor.chars().as_str()[line_start..*last_comment_offset];

                        before_comment.chars().all(|c| {
                            is_python_whitespace(c)
                                || SimpleTokenKind::from_non_trivia_char(c)
                                    != SimpleTokenKind::Other
                        })
                    })
                };

                // From here on it is guaranteed that this line has no other comment.
                self.back_line_has_no_comment = true;

                if let Some(comment_offset) = comment_offset {
                    let comment_length = self.cursor.chars().as_str().len() - comment_offset;
                    // It is a comment; bump past all of its characters.
                    for _ in 0..comment_length {
                        self.cursor.bump_back().unwrap();
                    }

                    SimpleTokenKind::Comment
                } else if c == '\\' {
                    SimpleTokenKind::Continuation
                } else {
                    let kind = if is_identifier_continuation(c) {
                        // If we only have identifier continuations but no start (e.g. 555) we
                        // don't want to consume the chars, so in that case we want to rewind the
                        // cursor to here.
                        let savepoint = self.cursor.clone();
                        self.cursor.eat_back_while(is_identifier_continuation);

                        let token_len = self.cursor.token_len();
                        let range = TextRange::at(self.back_offset - token_len, token_len);

                        if self.source[range]
                            .chars()
                            .next()
                            .is_some_and(is_identifier_start)
                        {
                            self.to_keyword_or_other(range)
                        } else {
                            self.cursor = savepoint;
                            SimpleTokenKind::Other
                        }
                    } else {
                        SimpleTokenKind::from_non_trivia_char(c)
                    };

                    if kind == SimpleTokenKind::Other {
                        self.bogus = true;
                    }

                    kind
                }
            }
        };

        let token_len = self.cursor.token_len();

        let start = self.back_offset - token_len;

        let token = SimpleToken {
            kind,
            range: TextRange::at(start, token_len),
        };

        self.back_offset = start;

        token
    }

    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + DoubleEndedIterator + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }
}

impl Iterator for SimpleTokenizer<'_> {
    type Item = SimpleToken;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();

        if token.kind == SimpleTokenKind::EndOfFile {
            None
        } else {
            Some(token)
        }
    }
}

impl DoubleEndedIterator for SimpleTokenizer<'_> {
    fn next_back(&mut self) -> Option<Self::Item> {
        let token = self.next_token_back();

        if token.kind == SimpleTokenKind::EndOfFile {
            None
        } else {
            Some(token)
        }
    }
}

#[cfg(test)]
mod tests {
    use insta::assert_debug_snapshot;
    use ruff_text_size::{TextLen, TextRange, TextSize};

    use crate::tokenizer::{lines_after, lines_before, SimpleToken, SimpleTokenizer};

    struct TokenizationTestCase {
        source: &'static str,
        range: TextRange,
        tokens: Vec<SimpleToken>,
    }

    impl TokenizationTestCase {
        fn assert_reverse_tokenization(&self) {
            let mut backwards = self.tokenize_reverse();

            // Re-reverse to get the tokens in forward order.
            backwards.reverse();

            assert_eq!(&backwards, &self.tokens);
        }

        fn tokenize_reverse(&self) -> Vec<SimpleToken> {
            SimpleTokenizer::new(self.source, self.range)
                .rev()
                .collect()
        }

        fn tokens(&self) -> &[SimpleToken] {
            &self.tokens
        }
    }

    fn tokenize_range(source: &'static str, range: TextRange) -> TokenizationTestCase {
        let tokens: Vec<_> = SimpleTokenizer::new(source, range).collect();

        TokenizationTestCase {
            source,
            range,
            tokens,
        }
    }

    fn tokenize(source: &'static str) -> TokenizationTestCase {
        tokenize_range(source, TextRange::new(TextSize::new(0), source.text_len()))
    }

    #[test]
    fn tokenize_trivia() {
        let source = "# comment\n    # comment";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_parentheses() {
        let source = "([{}])";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_comma() {
        let source = ",,,,";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_continuation() {
        let source = "( \\\n )";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tricky_unicode() {
        let source = "មុ";

        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn identifier_ending_in_non_start_char() {
        let source = "i5";

        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn ignore_word_with_only_id_continuing_chars() {
        let source = "555";

        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());

        // note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
    }

    #[test]
    fn tokenize_multichar() {
        let source = "if in else match";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_substring() {
        let source = "('some string') # comment";

        let test_case =
            tokenize_range(source, TextRange::new(TextSize::new(14), source.text_len()));

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_slash() {
        let source = r#" # trailing positional comment
        # Positional arguments only after here
        ,/"#;

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_bogus() {
        let source = r#"# leading comment
        "a string"
        a = (10)"#;

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        assert_debug_snapshot!("Reverse", test_case.tokenize_reverse());
    }

    #[test]
    fn lines_before_empty_string() {
        assert_eq!(lines_before(TextSize::new(0), ""), 0);
    }

    #[test]
    fn lines_before_in_the_middle_of_a_line() {
        assert_eq!(lines_before(TextSize::new(4), "a = 20"), 0);
    }

    #[test]
    fn lines_before_on_a_new_line() {
        assert_eq!(lines_before(TextSize::new(7), "a = 20\nb = 10"), 1);
    }

    #[test]
    fn lines_before_multiple_leading_newlines() {
        assert_eq!(lines_before(TextSize::new(9), "a = 20\n\r\nb = 10"), 2);
    }

    #[test]
    fn lines_before_with_comment_offset() {
        assert_eq!(lines_before(TextSize::new(8), "a = 20\n# a comment"), 0);
    }

    #[test]
    fn lines_before_with_trailing_comment() {
        assert_eq!(
            lines_before(TextSize::new(22), "a = 20 # some comment\nb = 10"),
            1
        );
    }

    #[test]
    fn lines_before_with_comment_only_line() {
        assert_eq!(
            lines_before(TextSize::new(22), "a = 20\n# some comment\nb = 10"),
            1
        );
    }

    #[test]
    fn lines_after_empty_string() {
        assert_eq!(lines_after(TextSize::new(0), ""), 0);
    }

    #[test]
    fn lines_after_in_the_middle_of_a_line() {
        assert_eq!(lines_after(TextSize::new(4), "a = 20"), 0);
    }

    #[test]
    fn lines_after_before_a_new_line() {
        assert_eq!(lines_after(TextSize::new(6), "a = 20\nb = 10"), 1);
    }

    #[test]
    fn lines_after_multiple_newlines() {
        assert_eq!(lines_after(TextSize::new(6), "a = 20\n\r\nb = 10"), 2);
    }

    #[test]
    fn lines_after_before_comment_offset() {
        assert_eq!(lines_after(TextSize::new(7), "a = 20 # a comment\n"), 0);
    }

    #[test]
    fn lines_after_with_comment_only_line() {
        assert_eq!(
            lines_after(TextSize::new(6), "a = 20\n# some comment\nb = 10"),
            1
        );
    }
}
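For orientation, here is a small usage sketch (illustrative only, not part of this diff) that exercises the public API defined above; the `( \\\n )` input mirrors the continuation test case:

```rust
use ruff_python_trivia::{lines_after, SimpleTokenKind, SimpleTokenizer};
use ruff_text_size::TextSize;

fn main() {
    // `(`, a space, a backslash continuation, a newline, a space, `)`.
    let source = "( \\\n )";

    // `skip_trivia` drops whitespace, newlines, comments, and continuations,
    // leaving only the two parentheses.
    let kinds: Vec<SimpleTokenKind> = SimpleTokenizer::starts_at(TextSize::new(0), source)
        .skip_trivia()
        .map(|token| token.kind())
        .collect();
    assert_eq!(kinds, [SimpleTokenKind::LParen, SimpleTokenKind::RParen]);

    // Two newlines separate the end of `x` from the next non-whitespace character.
    assert_eq!(lines_after(TextSize::new(1), "x\n\ny"), 2);
}
```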
43 crates/ruff_python_trivia/src/whitespace.rs Normal file

@@ -0,0 +1,43 @@
/// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
/// characters.
pub const fn is_python_whitespace(c: char) -> bool {
    matches!(
        c,
        // Space, tab, or form-feed
        ' ' | '\t' | '\x0C'
    )
}

/// Extract the leading indentation from a line.
pub fn leading_indentation(line: &str) -> &str {
    line.find(|char: char| !is_python_whitespace(char))
        .map_or(line, |index| &line[..index])
}

pub trait PythonWhitespace {
    /// Like `str::trim()`, but only removes whitespace characters that Python considers
    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
    fn trim_whitespace(&self) -> &Self;

    /// Like `str::trim_start()`, but only removes whitespace characters that Python considers
    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
    fn trim_whitespace_start(&self) -> &Self;

    /// Like `str::trim_end()`, but only removes whitespace characters that Python considers
    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
    fn trim_whitespace_end(&self) -> &Self;
}

impl PythonWhitespace for str {
    fn trim_whitespace(&self) -> &Self {
        self.trim_matches(is_python_whitespace)
    }

    fn trim_whitespace_start(&self) -> &Self {
        self.trim_start_matches(is_python_whitespace)
    }

    fn trim_whitespace_end(&self) -> &Self {
        self.trim_end_matches(is_python_whitespace)
    }
}
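A quick illustrative sketch (not part of this diff) of how Python whitespace differs from `str::trim`, using only the functions defined above:

```rust
use ruff_python_trivia::{leading_indentation, PythonWhitespace};

fn main() {
    // Python treats only space, tab, and form-feed as inter-token whitespace,
    // so a trailing newline survives `trim_whitespace` but not `str::trim`.
    let text = "\t x = 1 \n";
    assert_eq!(text.trim_whitespace(), "x = 1 \n");
    assert_eq!(text.trim(), "x = 1");

    // `leading_indentation` returns the run of whitespace at the start of a line.
    assert_eq!(leading_indentation("    body"), "    ");
}
```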