Pull in RustPython parser (#6099)

2025-09-27 04:19:18 +00:00 · 2023-07-27 11:29:11 +02:00 · 2023-07-27 11:29:11 +02:00 · 40f54375cb
commit 40f54375cb
parent 86539c1fc5
779 changed files with 108400 additions and 2078 deletions
--- a/crates/ruff_python_parser/src/lexer/cursor.rs
+++ b/crates/ruff_python_parser/src/lexer/cursor.rs
@ -0,0 +1,107 @@
+use ruff_text_size::{TextLen, TextSize};
+use std::str::Chars;
+
+/// Sentinel character returned by the cursor's peek methods when no character is left to read.
+pub(crate) const EOF_CHAR: char = '\0';
+
+/// A character cursor over the source text passed to `Cursor::new`.
+#[derive(Clone, Debug)]
+pub(super) struct Cursor<'a> {
+    /// Iterator over the characters that have not been consumed yet.
+    chars: Chars<'a>,
+    /// Length of the text that was still unconsumed when the current token started.
+    /// Initialized to the length of the whole source and reset by `start_token`.
+    source_length: TextSize,
+    /// The character most recently consumed by `bump`; [`EOF_CHAR`] until one has been consumed.
+    /// Only present in builds with debug assertions enabled.
+    #[cfg(debug_assertions)]
+    prev_char: char,
+}
+
+impl<'a> Cursor<'a> {
+    /// Creates a cursor positioned at the start of `source`.
+    pub(crate) fn new(source: &'a str) -> Self {
+        let source_length = source.text_len();
+
+        Self {
+            chars: source.chars(),
+            source_length,
+            #[cfg(debug_assertions)]
+            prev_char: EOF_CHAR,
+        }
+    }
+
+    /// Returns the most recently consumed character, or [`EOF_CHAR`] if nothing has been
+    /// consumed yet. Useful for debug assertions.
+    #[cfg(debug_assertions)]
+    pub(super) const fn previous(&self) -> char {
+        self.prev_char
+    }
+
+    /// Peeks the next character from the input stream without consuming it.
+    /// Returns [`EOF_CHAR`] if the cursor is at the end of the file.
+    pub(super) fn first(&self) -> char {
+        match self.chars.clone().next() {
+            Some(next) => next,
+            None => EOF_CHAR,
+        }
+    }
+
+    /// Peeks the character after the next one without consuming anything.
+    /// Returns [`EOF_CHAR`] if fewer than two characters remain.
+    pub(super) fn second(&self) -> char {
+        self.chars.clone().nth(1).unwrap_or(EOF_CHAR)
+    }
+
+    /// Returns the text that has not been consumed yet.
+    pub(super) fn rest(&self) -> &'a str {
+        self.chars.as_str()
+    }
+
+    /// Returns the length of the text that has not been consumed yet.
+    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than
+    // a `u32`, so the length of the remaining text cannot be truncated by the cast.
+    #[allow(clippy::cast_possible_truncation)]
+    pub(super) fn text_len(&self) -> TextSize {
+        let remaining = self.chars.as_str().len();
+        TextSize::new(remaining as u32)
+    }
+
+    /// Returns the length of the text consumed since the last call to `start_token`
+    /// (or since the cursor was created).
+    pub(super) fn token_len(&self) -> TextSize {
+        let remaining = self.text_len();
+        self.source_length - remaining
+    }
+
+    /// Marks the current position as the start of a new token, resetting `token_len` to zero.
+    pub(super) fn start_token(&mut self) {
+        let remaining = self.text_len();
+        self.source_length = remaining;
+    }
+
+    /// Returns `true` if every character has been consumed.
+    pub(super) fn is_eof(&self) -> bool {
+        self.rest().is_empty()
+    }
+
+    /// Consumes the next character and returns it, or returns `None` at the end of the file.
+    pub(super) fn bump(&mut self) -> Option<char> {
+        let consumed = self.chars.next();
+
+        #[cfg(debug_assertions)]
+        {
+            // Remember the character for `previous`; left untouched at the end of the file.
+            if let Some(c) = consumed {
+                self.prev_char = c;
+            }
+        }
+
+        consumed
+    }
+
+    /// Consumes the next character if it equals `c` and reports whether it did.
+    pub(super) fn eat_char(&mut self, c: char) -> bool {
+        let matched = self.first() == c;
+        if matched {
+            self.bump();
+        }
+        matched
+    }
+
+    /// Consumes and returns the next character if `predicate` accepts it.
+    ///
+    /// Returns `None` without consuming anything if the predicate rejects the character or the
+    /// cursor is at the end of the file. The predicate is called with [`EOF_CHAR`] in that case.
+    pub(super) fn eat_if<F>(&mut self, mut predicate: F) -> Option<char>
+    where
+        F: FnMut(char) -> bool,
+    {
+        // The predicate runs before the end-of-file check, exactly once per call.
+        let accepted = predicate(self.first());
+        if !accepted || self.is_eof() {
+            return None;
+        }
+
+        self.bump()
+    }
+
+    /// Eats symbols while predicate returns true or until the end of file is reached.
+    pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        // A hand-optimized version of this (e.g. for line comments) was tried, but
+        // LLVM can inline all of this and compile it down to fast iteration over bytes.
+        loop {
+            let keep_going = predicate(self.first()) && !self.is_eof();
+            if !keep_going {
+                break;
+            }
+            self.bump();
+        }
+    }
+}
--- a/crates/ruff_python_parser/src/lexer/indentation.rs
+++ b/crates/ruff_python_parser/src/lexer/indentation.rs
@ -0,0 +1,126 @@
+use static_assertions::assert_eq_size;
+use std::cmp::Ordering;
+use std::fmt::Debug;
+
+/// The column index of an indentation.
+///
+/// A space increments the column by one. A tab advances the column to the next multiple of the
+/// tab size: with a tab size of 2 that is two columns from an even column, but just one from an
+/// odd column.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)]
+pub(super) struct Column(u32);
+
+impl Column {
+    /// Wraps a raw column index.
+    pub(super) const fn new(column: u32) -> Self {
+        Self(column)
+    }
+}
+
+/// The number of characters in an indentation. Each character (space or tab) accounts for 1.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)]
+pub(super) struct Character(u32);
+
+impl Character {
+    /// Wraps a raw character count.
+    pub(super) const fn new(characters: u32) -> Self {
+        Self(characters)
+    }
+}
+
+/// The [Indentation](https://docs.python.org/3/reference/lexical_analysis.html#indentation) of a logical line.
+///
+/// The indentation is tracked both as a column index and as a character count.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Default)]
+pub(super) struct Indentation {
+    /// Column reached by the indentation; a tab advances it to the next multiple of `TAB_SIZE`.
+    column: Column,
+    /// Number of indentation characters; each space or tab counts as one.
+    character: Character,
+}
+
+impl Indentation {
+    /// Distance in columns between two consecutive tab stops.
+    const TAB_SIZE: u32 = 2;
+
+    /// Returns the zero indentation: column 0 and no characters.
+    pub(super) const fn root() -> Self {
+        Self {
+            column: Column(0),
+            character: Character(0),
+        }
+    }
+
+    /// Builds an indentation from explicit parts. Only available to tests.
+    #[cfg(test)]
+    pub(super) const fn new(column: Column, character: Character) -> Self {
+        Indentation { column, character }
+    }
+
+    /// Returns this indentation extended by one space: one more character, one more column.
+    #[must_use]
+    pub(super) fn add_space(self) -> Self {
+        let Self {
+            column: Column(column),
+            character: Character(characters),
+        } = self;
+
+        Self {
+            character: Character(characters + 1),
+            column: Column(column + 1),
+        }
+    }
+
+    /// Returns this indentation extended by one tab: one more character, and the column moved
+    /// to the next tab stop.
+    #[must_use]
+    pub(super) fn add_tab(self) -> Self {
+        let character = Character(self.character.0 + 1);
+
+        // The next tab stop is the smallest multiple of `TAB_SIZE` strictly greater than the
+        // current column:
+        // * a full `TAB_SIZE` away if `column` already is a multiple of `TAB_SIZE`
+        // * `column` rounded up to the next multiple of `TAB_SIZE` otherwise.
+        // https://github.com/python/cpython/blob/2cf99026d6320f38937257da1ab014fc873a11a6/Parser/tokenizer.c#L1818
+        let next_tab_stop = (self.column.0 / Self::TAB_SIZE + 1) * Self::TAB_SIZE;
+
+        Self {
+            character,
+            column: Column(next_tab_stop),
+        }
+    }
+
+    /// Compares two indentations by column and by character count.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`UnexpectedIndentation`] if the column ordering and the character ordering
+    /// disagree, i.e. the two indentations cannot be ordered consistently.
+    pub(super) fn try_compare(self, other: Indentation) -> Result<Ordering, UnexpectedIndentation> {
+        match (
+            self.column.cmp(&other.column),
+            self.character.cmp(&other.character),
+        ) {
+            (by_column, by_character) if by_column == by_character => Ok(by_column),
+            _ => Err(UnexpectedIndentation),
+        }
+    }
+}
+
+/// Error returned by `Indentation::try_compare` when the column ordering and the character
+/// ordering of two indentations disagree.
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub(super) struct UnexpectedIndentation;
+
+/// The indentations stack is used to keep track of the current indentation level.
+/// See [Indentation](https://docs.python.org/3/reference/lexical_analysis.html#indentation).
+#[derive(Debug, Clone, Default)]
+pub(super) struct Indentations {
+    /// Pushed indentation levels; the current level is the last element. An empty stack stands
+    /// for the root indentation (see `current`).
+    stack: Vec<Indentation>,
+}
+
+impl Indentations {
+    /// Makes `indent` the new current indentation level.
+    ///
+    /// Debug builds assert that the current level compares as strictly less than `indent`.
+    pub(super) fn push(&mut self, indent: Indentation) {
+        debug_assert_eq!(self.current().try_compare(indent), Ok(Ordering::Less));
+
+        self.stack.push(indent);
+    }
+
+    /// Removes the current indentation level and returns it, or `None` if no level was pushed.
+    pub(super) fn pop(&mut self) -> Option<Indentation> {
+        self.stack.pop()
+    }
+
+    /// Returns the current indentation level, falling back to the root indentation when no
+    /// level has been pushed.
+    pub(super) fn current(&self) -> &Indentation {
+        static ROOT: Indentation = Indentation::root();
+
+        match self.stack.last() {
+            Some(top) => top,
+            None => &ROOT,
+        }
+    }
+}
+
+// Static check that `Indentation` (two `u32` newtypes) has the same size as a `u64`.
+assert_eq_size!(Indentation, u64);
+
+#[cfg(test)]
+mod tests {
+    use super::{Character, Column, Indentation, UnexpectedIndentation};
+    use std::cmp::Ordering;
+
+    /// Indentations whose column and character orderings agree compare successfully.
+    #[test]
+    fn indentation_try_compare() {
+        let tab = Indentation::new(Column::new(8), Character::new(1));
+
+        assert_eq!(tab.try_compare(tab), Ok(Ordering::Equal));
+
+        let two_tabs = Indentation::new(Column::new(16), Character::new(2));
+        assert_eq!(two_tabs.try_compare(tab), Ok(Ordering::Greater));
+        assert_eq!(tab.try_compare(two_tabs), Ok(Ordering::Less));
+    }
+
+    /// Indentations whose column and character orderings disagree are rejected.
+    #[test]
+    fn indentation_try_compare_inconsistent() {
+        // `wide` reaches the larger column with fewer characters than `narrow`.
+        let wide = Indentation::new(Column::new(8), Character::new(1));
+        let narrow = Indentation::new(Column::new(4), Character::new(4));
+
+        assert_eq!(wide.try_compare(narrow), Err(UnexpectedIndentation));
+        assert_eq!(narrow.try_compare(wide), Err(UnexpectedIndentation));
+    }
+
+    /// Spaces advance by one column; tabs advance to the next tab stop.
+    #[test]
+    fn indentation_add_space_and_tab() {
+        let space = Indentation::root().add_space();
+        assert_eq!(space, Indentation::new(Column::new(1), Character::new(1)));
+
+        // From an odd column a tab only advances to the next multiple of the tab size.
+        let space_tab = space.add_tab();
+        assert_eq!(space_tab, Indentation::new(Column::new(2), Character::new(2)));
+
+        // From an even column a tab advances by the full tab size.
+        let tab = Indentation::root().add_tab();
+        assert_eq!(tab, Indentation::new(Column::new(2), Character::new(1)));
+
+        let two_tabs = tab.add_tab();
+        assert_eq!(two_tabs, Indentation::new(Column::new(4), Character::new(2)));
+    }
+}