Create a ruff_python_ast crate (#3370)

This PR productionizes @MichaReiser's suggestion in https://github.com/charliermarsh/ruff/issues/1820#issuecomment-1440204423, by creating a separate crate for the `ast` module (`ruff_python_ast`). This will enable us to further split up the `ruff` crate, as we'll be able to create (e.g.) separate sub-linter crates that have access to these common AST utilities. This was mostly a straightforward copy (with adjustments to module imports), as the few dependencies that _did_ require modifications were handled in #3366, #3367, and #3368.
2025-09-27 20:42:10 +00:00 · 2023-03-07 10:18:40 -05:00 · 2023-03-07 10:18:40 -05:00 · bad6bdda1f
commit bad6bdda1f
parent a5d302fcbf
405 changed files with 1336 additions and 988 deletions
--- a/crates/ruff_python_ast/src/source_code/generator.rs
+++ b/crates/ruff_python_ast/src/source_code/generator.rs
--- a/crates/ruff_python_ast/src/source_code/indexer.rs
+++ b/crates/ruff_python_ast/src/source_code/indexer.rs
@ -0,0 +1,117 @@
+//! Struct used to index source code, to enable efficient lookup of tokens that
+//! are omitted from the AST (e.g., commented lines).
+
+use rustpython_parser::ast::Location;
+use rustpython_parser::lexer::LexResult;
+use rustpython_parser::Tok;
+
+/// Row numbers of source lines that carry information the AST omits.
+///
+/// Instances are built from a lexed token stream through `From<&[LexResult]>`.
+pub struct Indexer {
+    /// Start row of every comment token, in token order.
+    commented_lines: Vec<usize>,
+    /// Rows whose statement carries on to a following row.
+    continuation_lines: Vec<usize>,
+}
+
+impl Indexer {
+    /// Rows on which a comment token starts.
+    pub fn commented_lines(&self) -> &[usize] {
+        self.commented_lines.as_slice()
+    }
+
+    /// Rows that are continued on a following row.
+    pub fn continuation_lines(&self) -> &[usize] {
+        self.continuation_lines.as_slice()
+    }
+}
+
+impl From<&[LexResult]> for Indexer {
+    /// Build the index from lexer output, ignoring any lex errors in the stream.
+    ///
+    /// A comment token contributes its start row to the commented lines. Whenever a token
+    /// starts on a later row than the previous token ended on, and that previous token is not
+    /// a newline or a comment, every row in between is recorded as a continuation line.
+    fn from(lxr: &[LexResult]) -> Self {
+        let mut commented_lines = Vec::new();
+        let mut continuation_lines = Vec::new();
+
+        // Kind and end position of the token seen on the previous iteration.
+        let mut previous: Option<(&Tok, &Location)> = None;
+
+        for (start, tok, end) in lxr.iter().flatten() {
+            if let Tok::Comment(..) = tok {
+                commented_lines.push(start.row());
+            }
+
+            match previous {
+                // Nothing seen yet, or the previous token already terminated its line.
+                None | Some((Tok::Newline | Tok::NonLogicalNewline | Tok::Comment(..), _)) => {}
+                // Any rows skipped between the two tokens were continued.
+                Some((_, prev_end)) => continuation_lines.extend(prev_end.row()..start.row()),
+            }
+
+            previous = Some((tok, end));
+        }
+
+        Self {
+            commented_lines,
+            continuation_lines,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use rustpython_parser::lexer::LexResult;
+    use rustpython_parser::{lexer, Mode};
+
+    use crate::source_code::Indexer;
+
+    // Lexes several snippets and checks which rows end up in `continuation_lines`.
+    // The expected values below are 1-based row numbers.
+    #[test]
+    fn continuation() {
+        // A single one-line statement has no continuation.
+        let contents = r#"x = 1"#;
+        let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
+        let indexer: Indexer = lxr.as_slice().into();
+        assert_eq!(indexer.continuation_lines(), Vec::<usize>::new().as_slice());
+
+        // Comments and blank lines between statements are not continuations.
+        let contents = r#"
+# Hello, world!
+
+x = 1
+
+y = 2
+"#
+        .trim();
+        let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
+        let indexer: Indexer = lxr.as_slice().into();
+        assert_eq!(indexer.continuation_lines(), Vec::<usize>::new().as_slice());
+
+        // Backslash continuations at top level, inside a block, and inside parentheses.
+        let contents = r#"
+x = \
+    1
+
+if True:
+    z = \
+        \
+        2
+
+(
+    "abc" # Foo
+    "def" \
+    "ghi"
+)
+"#
+        .trim();
+        let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
+        let indexer: Indexer = lxr.as_slice().into();
+        assert_eq!(indexer.continuation_lines(), [1, 5, 6, 11]);
+
+        // Semicolon-separated statements: only the rows ending in a backslash are recorded.
+        let contents = r#"
+x = 1; import sys
+import os
+
+if True:
+    x = 1; import sys
+    import os
+
+if True:
+    x = 1; \
+        import os
+
+x = 1; \
+import os
+"#
+        .trim();
+        let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
+        let indexer: Indexer = lxr.as_slice().into();
+        assert_eq!(indexer.continuation_lines(), [9, 12]);
+    }
+}
--- a/crates/ruff_python_ast/src/source_code/locator.rs
+++ b/crates/ruff_python_ast/src/source_code/locator.rs
@ -0,0 +1,256 @@
+//! Struct used to efficiently slice source code at (row, column) Locations.
+
+use once_cell::unsync::OnceCell;
+use rustpython_parser::ast::Location;
+
+use crate::types::Range;
+
+/// Slices borrowed source code by (row, column) [`Location`].
+pub struct Locator<'a> {
+    /// The source text being sliced.
+    contents: &'a str,
+    /// Offset index over `contents`, filled in the first time it is needed.
+    index: OnceCell<Index>,
+}
+
+/// Byte-offset lookup table for a piece of source code.
+pub enum Index {
+    /// For ASCII-only source: the byte offset at which each line starts.
+    Ascii(Vec<usize>),
+    /// For any other source: one entry per row, holding the byte offset of each character
+    /// in that row.
+    Utf8(Vec<Vec<usize>>),
+}
+
+/// Collect the byte offset at which every line of ASCII source code begins.
+///
+/// The first line always starts at offset zero; every `\n` byte opens a new line right
+/// after it, so a trailing newline yields a final entry equal to `contents.len()`.
+fn index_ascii(contents: &str) -> Vec<usize> {
+    let mut line_starts = Vec::with_capacity(48);
+    line_starts.push(0);
+    line_starts.extend(
+        contents
+            .bytes()
+            .enumerate()
+            .filter(|&(_, byte)| byte == b'\n')
+            .map(|(offset, _)| offset + 1),
+    );
+    line_starts
+}
+
+/// Collect, row by row, the byte offset of every character in UTF-8 source code.
+///
+/// Rows are split on `\n`. The newline character itself gets an entry in its row, except
+/// when it directly follows a `\r`, in which case only the `\r` keeps one. A byte order
+/// mark seen before any other character is skipped and gets no entry.
+fn index_utf8(contents: &str) -> Vec<Vec<usize>> {
+    let mut rows = Vec::with_capacity(48);
+    let mut row = Vec::with_capacity(48);
+    let mut last_char = '\0';
+
+    for (offset, current) in contents.char_indices() {
+        // Skip BOM.
+        if last_char == '\0' && current == '\u{feff}' {
+            continue;
+        }
+
+        row.push(offset);
+        if current == '\n' {
+            // For `\r\n`, the pair counts as a single column, owned by the `\r`.
+            if last_char == '\r' {
+                row.pop();
+            }
+            rows.push(std::mem::replace(&mut row, Vec::with_capacity(48)));
+        }
+        last_char = current;
+    }
+
+    // The text after the final newline (possibly empty) forms the last row.
+    rows.push(row);
+    rows
+}
+
+/// Build the byte-offset [`Index`] for a piece of source code.
+///
+/// ASCII-only input gets the compact per-line index; anything else gets the per-character
+/// index.
+pub fn index(contents: &str) -> Index {
+    if !contents.is_ascii() {
+        return Index::Utf8(index_utf8(contents));
+    }
+    Index::Ascii(index_ascii(contents))
+}
+
+/// Resolve a [`Location`] to a byte offset in ASCII source code.
+///
+/// `index` holds the starting byte offset of each line and `location` uses 1-based rows.
+/// A location at column zero of the row just past the index, or one on the last indexed
+/// row that would land at or beyond the end of `contents`, resolves to `contents.len()`.
+fn truncate_ascii(location: Location, index: &[usize], contents: &str) -> usize {
+    let row = location.row() - 1;
+    let column = location.column();
+
+    // One-past-the-end: the start of the row following the last indexed line.
+    if row == index.len() && column == 0 {
+        return contents.len();
+    }
+
+    // On the last indexed line, clamp to the end of the source.
+    if index.len().checked_sub(1) == Some(row) && index[row] + column >= contents.len() {
+        return contents.len();
+    }
+
+    index[row] + column
+}
+
+/// Resolve a [`Location`] to a byte offset in UTF-8 source code.
+///
+/// `index` holds, per row, the byte offset of each character; `location` uses 1-based rows
+/// and columns that count characters. A location at column zero of the row just past the
+/// index, or one right after the last character of the last row, resolves to
+/// `contents.len()`.
+fn truncate_utf8(location: Location, index: &[Vec<usize>], contents: &str) -> usize {
+    let row = location.row() - 1;
+    let column = location.column();
+
+    // One-past-the-end: the start of the row following the last indexed row.
+    let past_last_row = row == index.len() && column == 0;
+    // Otherwise, the position directly behind the final character of the source.
+    let at_end_of_source =
+        past_last_row || (row == index.len() - 1 && column == index[row].len());
+
+    if at_end_of_source {
+        contents.len()
+    } else {
+        index[row][column]
+    }
+}
+
+/// Resolve a [`Location`] to a byte offset in source code, using whichever kind of
+/// [`Index`] was built for it.
+fn truncate(location: Location, index: &Index, contents: &str) -> usize {
+    match index {
+        Index::Utf8(char_offsets) => truncate_utf8(location, char_offsets, contents),
+        Index::Ascii(line_starts) => truncate_ascii(location, line_starts, contents),
+    }
+}
+
+impl<'a> Locator<'a> {
+    /// Wrap `contents` without indexing it; the index is built on first use.
+    pub const fn new(contents: &'a str) -> Self {
+        Self {
+            index: OnceCell::new(),
+            contents,
+        }
+    }
+
+    /// Return the offset index, computing it on first access.
+    fn get_or_init_index(&self) -> &Index {
+        let source = self.contents;
+        self.index.get_or_init(|| index(source))
+    }
+
+    /// Resolve `location` to a byte offset into the source.
+    fn byte_offset(&self, location: Location) -> usize {
+        truncate(location, self.get_or_init_index(), self.contents)
+    }
+
+    /// Take the source code up to the given [`Location`].
+    pub fn take(&self, location: Location) -> &'a str {
+        let end = self.byte_offset(location);
+        &self.contents[..end]
+    }
+
+    /// Take the source code after the given [`Location`].
+    pub fn skip(&self, location: Location) -> &'a str {
+        let start = self.byte_offset(location);
+        &self.contents[start..]
+    }
+
+    /// Take the source code between the given [`Range`].
+    pub fn slice(&self, range: Range) -> &'a str {
+        let start = self.byte_offset(range.location);
+        let end = self.byte_offset(range.end_location);
+        &self.contents[start..end]
+    }
+
+    /// Length of the source code, in bytes.
+    pub const fn len(&self) -> usize {
+        self.contents.len()
+    }
+
+    /// Whether the source code is empty.
+    pub const fn is_empty(&self) -> bool {
+        self.contents.is_empty()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use rustpython_parser::ast::Location;
+
+    use super::{index_ascii, index_utf8, truncate_ascii, truncate_utf8};
+
+    // Line starts are byte offsets; only `\n` opens a new line.
+    #[test]
+    fn ascii_index() {
+        let contents = "";
+        let index = index_ascii(contents);
+        assert_eq!(index, [0]);
+
+        let contents = "x = 1";
+        let index = index_ascii(contents);
+        assert_eq!(index, [0]);
+
+        let contents = "x = 1\n";
+        let index = index_ascii(contents);
+        assert_eq!(index, [0, 6]);
+
+        // The `\r` of a `\r\n` pair belongs to the line it terminates.
+        let contents = "x = 1\r\n";
+        let index = index_ascii(contents);
+        assert_eq!(index, [0, 7]);
+
+        let contents = "x = 1\ny = 2\nz = x + y\n";
+        let index = index_ascii(contents);
+        assert_eq!(index, [0, 6, 12, 22]);
+    }
+
+    // Locations use 1-based rows and 0-based columns.
+    #[test]
+    fn ascii_truncate() {
+        let contents = "x = 1\ny = 2";
+        let index = index_ascii(contents);
+
+        // First row.
+        let loc = truncate_ascii(Location::new(1, 0), &index, contents);
+        assert_eq!(loc, 0);
+
+        // Second row.
+        let loc = truncate_ascii(Location::new(2, 0), &index, contents);
+        assert_eq!(loc, 6);
+
+        // One-past-the-end.
+        let loc = truncate_ascii(Location::new(3, 0), &index, contents);
+        assert_eq!(loc, 11);
+    }
+
+    // Each row lists the byte offset of every character it contains.
+    #[test]
+    fn utf8_index() {
+        let contents = "";
+        let index = index_utf8(contents);
+        assert_eq!(index.len(), 1);
+        assert_eq!(index[0], Vec::<usize>::new());
+
+        let contents = "x = 1";
+        let index = index_utf8(contents);
+        assert_eq!(index.len(), 1);
+        assert_eq!(index[0], [0, 1, 2, 3, 4]);
+
+        let contents = "x = 1\n";
+        let index = index_utf8(contents);
+        assert_eq!(index.len(), 2);
+        assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
+        assert_eq!(index[1], Vec::<usize>::new());
+
+        // A `\r\n` pair contributes a single entry to its row.
+        let contents = "x = 1\r\n";
+        let index = index_utf8(contents);
+        assert_eq!(index.len(), 2);
+        assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
+        assert_eq!(index[1], Vec::<usize>::new());
+
+        let contents = "x = 1\ny = 2\nz = x + y\n";
+        let index = index_utf8(contents);
+        assert_eq!(index.len(), 4);
+        assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
+        assert_eq!(index[1], [6, 7, 8, 9, 10, 11]);
+        assert_eq!(index[2], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]);
+        assert_eq!(index[3], Vec::<usize>::new());
+
+        // The three-byte character at offset 2 pushes the following newline to offset 5.
+        let contents = "# \u{4e9c}\nclass Foo:\n    \"\"\".\"\"\"";
+        let index = index_utf8(contents);
+        assert_eq!(index.len(), 3);
+        assert_eq!(index[0], [0, 1, 2, 5]);
+        assert_eq!(index[1], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+        assert_eq!(index[2], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]);
+    }
+
+    // Columns count characters, while the returned offsets count bytes.
+    #[test]
+    fn utf8_truncate() {
+        let contents = "x = '☃'\ny = 2";
+        let index = index_utf8(contents);
+
+        // First row.
+        let loc = truncate_utf8(Location::new(1, 0), &index, contents);
+        assert_eq!(loc, 0);
+
+        let loc = truncate_utf8(Location::new(1, 5), &index, contents);
+        assert_eq!(loc, 5);
+        assert_eq!(&contents[loc..], "☃'\ny = 2");
+
+        let loc = truncate_utf8(Location::new(1, 6), &index, contents);
+        assert_eq!(loc, 8);
+        assert_eq!(&contents[loc..], "'\ny = 2");
+
+        // Second row.
+        let loc = truncate_utf8(Location::new(2, 0), &index, contents);
+        assert_eq!(loc, 10);
+
+        // One-past-the-end.
+        let loc = truncate_utf8(Location::new(3, 0), &index, contents);
+        assert_eq!(loc, 15);
+    }
+}
--- a/crates/ruff_python_ast/src/source_code/mod.rs
+++ b/crates/ruff_python_ast/src/source_code/mod.rs
@ -0,0 +1,21 @@
+mod generator;
+mod indexer;
+mod locator;
+mod stylist;
+
+pub use generator::Generator;
+pub use indexer::Indexer;
+pub use locator::Locator;
+use rustpython_parser as parser;
+use rustpython_parser::ParseError;
+pub use stylist::{LineEnding, Stylist};
+
+/// Parse `code` and generate source code back from the resulting AST.
+///
+/// The generator is configured from a [`Stylist`] built over the original code.
+///
+/// # Errors
+///
+/// Returns the [`ParseError`] if `code` cannot be parsed; `source_path` is handed to the
+/// parser along with the code.
+pub fn round_trip(code: &str, source_path: &str) -> Result<String, ParseError> {
+    let suite = parser::parse_program(code, source_path)?;
+
+    let locator = Locator::new(code);
+    let stylist = Stylist::from_contents(code, &locator);
+
+    let mut unparser: Generator = (&stylist).into();
+    unparser.unparse_suite(&suite);
+    Ok(unparser.generate())
+}
--- a/crates/ruff_python_ast/src/source_code/stylist.rs
+++ b/crates/ruff_python_ast/src/source_code/stylist.rs
@ -0,0 +1,319 @@
+//! Detect code style from Python source code.
+
+use std::fmt;
+use std::ops::Deref;
+
+use once_cell::unsync::OnceCell;
+use rustpython_parser::ast::Location;
+use rustpython_parser::{lexer, Mode, Tok};
+
+use crate::source_code::Locator;
+use ruff_rustpython::vendor;
+
+use crate::strings::leading_quote;
+use crate::types::Range;
+
+/// Lazily detects the code style (indentation, quotes, line endings) of a source file.
+///
+/// Each property is detected at most once, the first time it is requested, and falls back
+/// to its `Default` when nothing can be detected.
+pub struct Stylist<'a> {
+    contents: &'a str,
+    locator: &'a Locator<'a>,
+    indentation: OnceCell<Indentation>,
+    quote: OnceCell<Quote>,
+    line_ending: OnceCell<LineEnding>,
+}
+
+impl<'a> Stylist<'a> {
+    /// Create a stylist over `contents`; no detection runs until a property is requested.
+    pub fn from_contents(contents: &'a str, locator: &'a Locator<'a>) -> Self {
+        Self {
+            contents,
+            locator,
+            indentation: OnceCell::new(),
+            quote: OnceCell::new(),
+            line_ending: OnceCell::new(),
+        }
+    }
+
+    /// The indentation used by the source, or the default if none is detected.
+    pub fn indentation(&'a self) -> &'a Indentation {
+        let detect = || detect_indentation(self.contents, self.locator).unwrap_or_default();
+        self.indentation.get_or_init(detect)
+    }
+
+    /// The quote style used by the source, or the default if none is detected.
+    pub fn quote(&'a self) -> &'a Quote {
+        let detect = || detect_quote(self.contents, self.locator).unwrap_or_default();
+        self.quote.get_or_init(detect)
+    }
+
+    /// The line ending used by the source, or the default if none is detected.
+    pub fn line_ending(&'a self) -> &'a LineEnding {
+        let detect = || detect_line_ending(self.contents).unwrap_or_default();
+        self.line_ending.get_or_init(detect)
+    }
+}
+
+/// The quotation style used in Python source code. Defaults to double quotes.
+#[derive(Debug, Default, PartialEq, Eq)]
+pub enum Quote {
+    Single,
+    #[default]
+    Double,
+}
+
+impl From<&Quote> for char {
+    /// The quote character for this style.
+    fn from(val: &Quote) -> Self {
+        match val {
+            Quote::Double => '"',
+            Quote::Single => '\'',
+        }
+    }
+}
+
+impl From<Quote> for char {
+    /// The quote character for this style.
+    fn from(val: Quote) -> Self {
+        char::from(&val)
+    }
+}
+
+impl From<&Quote> for vendor::str::Quote {
+    /// Map onto the equivalent variant of the vendored quote type.
+    fn from(val: &Quote) -> Self {
+        match val {
+            Quote::Double => Self::Double,
+            Quote::Single => Self::Single,
+        }
+    }
+}
+
+impl fmt::Display for Quote {
+    /// Write the bare quote character.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let symbol = match self {
+            Quote::Double => "\"",
+            Quote::Single => "'",
+        };
+        f.write_str(symbol)
+    }
+}
+
+/// The indentation style used in Python source code: the whitespace making up one level.
+#[derive(Debug, PartialEq, Eq)]
+pub struct Indentation(String);
+
+impl Indentation {
+    /// Wrap the given whitespace as an indentation style.
+    pub const fn new(indentation: String) -> Self {
+        Self(indentation)
+    }
+
+    /// The indentation whitespace as a string slice.
+    pub fn as_str(&self) -> &str {
+        &self.0
+    }
+
+    /// The first character of the indentation.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the indentation is empty.
+    pub fn as_char(&self) -> char {
+        self.as_str().chars().next().unwrap()
+    }
+}
+
+impl Default for Indentation {
+    /// Four spaces.
+    fn default() -> Self {
+        Self::new(String::from("    "))
+    }
+}
+
+impl Deref for Indentation {
+    type Target = str;
+
+    fn deref(&self) -> &str {
+        &self.0
+    }
+}
+
+/// The line ending style used in Python source code.
+/// See <https://docs.python.org/3/reference/lexical_analysis.html#physical-lines>
+#[derive(Debug, PartialEq, Eq)]
+pub enum LineEnding {
+    Lf,
+    Cr,
+    CrLf,
+}
+
+impl Default for LineEnding {
+    /// `\r\n` when compiled for Windows, `\n` everywhere else.
+    fn default() -> Self {
+        if cfg!(windows) {
+            Self::CrLf
+        } else {
+            Self::Lf
+        }
+    }
+}
+
+impl LineEnding {
+    /// The characters that make up this line ending.
+    pub const fn as_str(&self) -> &'static str {
+        match self {
+            Self::Lf => "\n",
+            Self::Cr => "\r",
+            Self::CrLf => "\r\n",
+        }
+    }
+}
+
+impl Deref for LineEnding {
+    type Target = str;
+
+    fn deref(&self) -> &str {
+        self.as_str()
+    }
+}
+
+/// Detect the indentation style of the given contents.
+///
+/// Lexes `contents` and, at the first indent token, returns the text between the start
+/// of that token's row and the token's end. Returns `None` if no indent token is found.
+fn detect_indentation(contents: &str, locator: &Locator) -> Option<Indentation> {
+    lexer::lex(contents, Mode::Module)
+        .flatten()
+        .find(|(_, tok, _)| matches!(tok, Tok::Indent { .. }))
+        .map(|(_, _, end)| {
+            let line_start = Location::new(end.row(), 0);
+            let whitespace = locator.slice(Range::new(line_start, end));
+            Indentation(whitespace.to_string())
+        })
+}
+
+/// Detect the quotation style of the given contents.
+///
+/// Lexes `contents` and inspects the leading quote of each string token in turn. Strings
+/// opened with triple double quotes are skipped; the first remaining string decides the
+/// style. Returns `None` if no string decides it.
+fn detect_quote(contents: &str, locator: &Locator) -> Option<Quote> {
+    for (start, tok, end) in lexer::lex(contents, Mode::Module).flatten() {
+        if !matches!(tok, Tok::String { .. }) {
+            continue;
+        }
+
+        let literal = locator.slice(Range::new(start, end));
+        let opener = match leading_quote(literal) {
+            Some(opener) => opener,
+            None => continue,
+        };
+
+        // Triple double quotes say nothing about the preferred style; keep looking.
+        if opener.contains("\"\"\"") {
+            continue;
+        }
+        if opener.contains('\'') {
+            return Some(Quote::Single);
+        }
+        if opener.contains('"') {
+            return Some(Quote::Double);
+        }
+        unreachable!("Expected string to start with a valid quote prefix")
+    }
+    None
+}
+
+/// Detect the line ending style of the given contents.
+///
+/// The first `\n` decides the style: it is reported as [`LineEnding::CrLf`] when it is
+/// directly preceded by `\r`, and as [`LineEnding::Lf`] otherwise. Contents without any
+/// `\n` but with a `\r` are reported as [`LineEnding::Cr`]. Returns `None` when the
+/// contents hold no line ending at all.
+fn detect_line_ending(contents: &str) -> Option<LineEnding> {
+    if let Some(position) = contents.find('\n') {
+        // `find` yields a byte offset, so inspect the text right before that offset instead
+        // of counting characters: the two disagree as soon as a multi-byte character
+        // precedes the newline.
+        if contents[..position].ends_with('\r') {
+            Some(LineEnding::CrLf)
+        } else {
+            Some(LineEnding::Lf)
+        }
+    } else if contents.contains('\r') {
+        Some(LineEnding::Cr)
+    } else {
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::source_code::stylist::{
+        detect_indentation, detect_line_ending, detect_quote, Indentation, LineEnding, Quote,
+    };
+    use crate::source_code::Locator;
+
+    // Indentation is taken from the first indented block, if there is one.
+    #[test]
+    fn indentation() {
+        // No block, so nothing to detect.
+        let contents = r#"x = 1"#;
+        let locator = Locator::new(contents);
+        assert_eq!(detect_indentation(contents, &locator), None);
+
+        // Two spaces.
+        let contents = r#"
+if True:
+  pass
+"#;
+        let locator = Locator::new(contents);
+        assert_eq!(
+            detect_indentation(contents, &locator),
+            Some(Indentation("  ".to_string()))
+        );
+
+        // Four spaces.
+        let contents = r#"
+if True:
+    pass
+"#;
+        let locator = Locator::new(contents);
+        assert_eq!(
+            detect_indentation(contents, &locator),
+            Some(Indentation("    ".to_string()))
+        );
+
+        // A single tab.
+        let contents = r#"
+if True:
+	pass
+"#;
+        let locator = Locator::new(contents);
+        assert_eq!(
+            detect_indentation(contents, &locator),
+            Some(Indentation("\t".to_string()))
+        );
+
+        // TODO(charlie): Should non-significant whitespace be detected?
+        let contents = r#"
+x = (
+  1,
+  2,
+  3,
+)
+"#;
+        let locator = Locator::new(contents);
+        assert_eq!(detect_indentation(contents, &locator), None);
+    }
+
+    // The first string that is not triple-double-quoted decides the quote style.
+    #[test]
+    fn quote() {
+        // No string at all.
+        let contents = r#"x = 1"#;
+        let locator = Locator::new(contents);
+        assert_eq!(detect_quote(contents, &locator), None);
+
+        let contents = r#"x = '1'"#;
+        let locator = Locator::new(contents);
+        assert_eq!(detect_quote(contents, &locator), Some(Quote::Single));
+
+        let contents = r#"x = "1""#;
+        let locator = Locator::new(contents);
+        assert_eq!(detect_quote(contents, &locator), Some(Quote::Double));
+
+        // A single quote inside the string body does not affect detection.
+        let contents = r#"s = "It's done.""#;
+        let locator = Locator::new(contents);
+        assert_eq!(detect_quote(contents, &locator), Some(Quote::Double));
+
+        // No style if only double quoted docstring (will take default Double)
+        let contents = r#"
+def f():
+    """Docstring."""
+    pass
+"#;
+        let locator = Locator::new(contents);
+        assert_eq!(detect_quote(contents, &locator), None);
+
+        // Detect from string literal appearing after docstring
+        let contents = r#"
+"""Module docstring."""
+
+a = 'v'
+"#;
+        let locator = Locator::new(contents);
+        assert_eq!(detect_quote(contents, &locator), Some(Quote::Single));
+    }
+
+    // One case per outcome: none, `\n`, `\r`, and `\r\n`.
+    #[test]
+    fn line_ending() {
+        let contents = "x = 1";
+        assert_eq!(detect_line_ending(contents), None);
+
+        let contents = "x = 1\n";
+        assert_eq!(detect_line_ending(contents), Some(LineEnding::Lf));
+
+        let contents = "x = 1\r";
+        assert_eq!(detect_line_ending(contents), Some(LineEnding::Cr));
+
+        let contents = "x = 1\r\n";
+        assert_eq!(detect_line_ending(contents), Some(LineEnding::CrLf));
+    }
+}