Pull in RustPython parser (#6099)

Micha Reiser 2023-07-27 11:29:11 +02:00 committed by GitHub
parent 86539c1fc5
commit 40f54375cb
779 changed files with 108400 additions and 2078 deletions

@@ -1,11 +1,134 @@
use rustpython_ast::text_size::TextSize;
use rustpython_ast::{CmpOp, Expr, Mod, ModModule, Ranged, Suite};
use rustpython_parser as parser;
use rustpython_parser::lexer::LexResult;
use rustpython_parser::text_size::TextRange;
use rustpython_parser::{lexer, Mode, ParseError, Tok};
//! This crate can be used to parse Python source code into an Abstract
//! Syntax Tree.
//!
//! ## Overview:
//!
//! The process by which source code is parsed into an AST can be broken down
//! into two general stages: [lexical analysis] and [parsing].
//!
//! During lexical analysis, the source code is converted into a stream of lexical
//! tokens that represent the smallest meaningful units of the language. For example,
//! the source code `print("Hello world")` would _roughly_ be converted into the following
//! stream of tokens:
//!
//! ```text
//! Name("print"), LeftParen, String("Hello world"), RightParen
//! ```
//!
//! These tokens are then consumed by the `ruff_python_parser`, which matches them against a set of
//! grammar rules to verify that the source code is syntactically valid and to construct
//! an AST that represents the source code.
//!
//! During parsing, the `ruff_python_parser` consumes the tokens generated by the lexer and constructs
//! a tree representation of the source code. The tree is made up of nodes that represent
//! the different syntactic constructs of the language. If the source code is syntactically
//! invalid, parsing fails and an error is returned. After a successful parse, the AST can
//! be used to perform further analysis on the source code. Continuing with the example
//! above, the AST generated by the `ruff_python_parser` would _roughly_ look something like this:
//!
//! ```text
//! node: Expr {
//!     value: {
//!         node: Call {
//!             func: {
//!                 node: Name {
//!                     id: "print",
//!                     ctx: Load,
//!                 },
//!             },
//!             args: [
//!                 node: Constant {
//!                     value: Str("Hello World"),
//!                     kind: None,
//!                 },
//!             ],
//!             keywords: [],
//!         },
//!     },
//! },
//!```
//!
//! Note: The Tokens/ASTs shown above are not the exact tokens/ASTs generated by the `ruff_python_parser`.
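//!
//! As a rough sketch (using the `Parse` trait shown in the examples further below,
//! and relying on `Suite` being a `Vec` of statements), the snippet above can be
//! parsed and inspected like so:
//!
//! ```
//! use ruff_python_ast as ast;
//! use ruff_python_parser::Parse;
//!
//! // Parse the example into a suite of statements; the single statement is
//! // the `print(...)` expression statement.
//! let suite = ast::Suite::parse(r#"print("Hello world")"#, "<embedded>").unwrap();
//! assert_eq!(suite.len(), 1);
//! ```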
//!
//! ## Source code layout:
//!
//! The functionality of this crate is split into several modules:
//!
//! - token: This module contains the definition of the tokens that are generated by the lexer.
//! - [lexer]: This module contains the lexer and is responsible for generating the tokens.
//! - parser: This module contains an interface to the parser and is responsible for generating the AST.
//!   - Functions and strings have special parsing requirements that are handled in additional files.
//! - [Mode]: An enum that defines the different modes in which the `ruff_python_parser` can run.
//!
//! # Examples
//!
//! For example, to get a stream of tokens from a given string, one could do this:
//!
//! ```
//! use ruff_python_parser::{lexer::lex, Mode};
//!
//! let python_source = r#"
//! def is_odd(i):
//!     return bool(i & 1)
//! "#;
//! let mut tokens = lex(python_source, Mode::Module);
//! assert!(tokens.all(|t| t.is_ok()));
//! ```
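//!
//! Each item yielded by the lexer is a `LexResult`, i.e. a `Result` wrapping a
//! `(Tok, TextRange)` pair, so individual tokens and their source ranges can be
//! inspected directly. A rough sketch (the output shown in the comment is
//! illustrative, not exact):
//!
//! ```
//! use ruff_python_parser::{lexer::lex, Mode};
//!
//! for token in lex("x = 1", Mode::Module) {
//!     // Prints each token kind together with its byte range in the source,
//!     // e.g. something like `Name { name: "x" }` at `0..1`.
//!     let (tok, range) = token.unwrap();
//!     println!("{tok:?} at {range:?}");
//! }
//! ```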
//!
//! These tokens can be directly fed into the `ruff_python_parser` to generate an AST:
//!
//! ```
//! use ruff_python_parser::{lexer::lex, Mode, parse_tokens};
//!
//! let python_source = r#"
//! def is_odd(i):
//!     return bool(i & 1)
//! "#;
//! let tokens = lex(python_source, Mode::Module);
//! let ast = parse_tokens(tokens, Mode::Module, "<embedded>");
//!
//! assert!(ast.is_ok());
//! ```
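//!
//! A rough sketch of how the result might be consumed, assuming that with
//! `Mode::Module` the parsed result is a `Mod::Module` (re-exported from
//! `ruff_python_ast`) whose `body` holds the top-level statements:
//!
//! ```
//! use ruff_python_ast::Mod;
//! use ruff_python_parser::{lexer::lex, parse_tokens, Mode};
//!
//! let python_source = r#"
//! def is_odd(i):
//!     return bool(i & 1)
//! "#;
//! let tokens = lex(python_source, Mode::Module);
//! let ast = parse_tokens(tokens, Mode::Module, "<embedded>").unwrap();
//!
//! // The module body should contain the single `def is_odd` statement.
//! if let Mod::Module(module) = ast {
//!     assert_eq!(module.body.len(), 1);
//! }
//! ```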
//!
//! Alternatively, you can use the `Parse` trait to parse a string directly into an AST node
//! without specifying a mode or tokenizing the source beforehand:
//!
//! ```
//! use ruff_python_parser::Parse;
//! use ruff_python_ast as ast;
//!
//! let python_source = r#"
//! def is_odd(i):
//!     return bool(i & 1)
//! "#;
//! let ast = ast::Suite::parse(python_source, "<embedded>");
//!
//! assert!(ast.is_ok());
//! ```
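//!
//! The `Parse` trait is not limited to `Suite`; as a rough sketch (assuming it is
//! also implemented for individual nodes such as `Expr`, which the crate's tests
//! suggest), a single expression can be parsed in the same way:
//!
//! ```
//! use ruff_python_ast as ast;
//! use ruff_python_parser::Parse;
//!
//! let expr = ast::Expr::parse("1 + 2", "<embedded>");
//! assert!(expr.is_ok());
//! ```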
//!
//! [lexical analysis]: https://en.wikipedia.org/wiki/Lexical_analysis
//! [parsing]: https://en.wikipedia.org/wiki/Parsing
//! [lexer]: crate::lexer
pub mod token_kind;
use crate::lexer::LexResult;
pub use parse::Parse;
pub use parser::{parse, parse_starts_at, parse_tokens, ParseError, ParseErrorType};
#[allow(deprecated)]
pub use parser::{parse_expression, parse_expression_starts_at, parse_program};
use ruff_python_ast::{CmpOp, Expr, Mod, ModModule, Ranged, Suite};
use ruff_text_size::{TextRange, TextSize};
pub use string::FStringErrorType;
pub use token::{StringKind, Tok, TokenKind};
mod function;
// Skip flattening the lexer to distinguish it from the full parser.
mod context;
pub mod lexer;
mod parse;
mod parser;
mod soft_keywords;
mod string;
mod token;
pub mod typing;
/// Collect tokens up to and including the first error.
@@ -141,15 +264,97 @@ impl LocatedCmpOp {
}
}
/// The different modes by which a source file can be parsed; the mode argument
/// specifies in what way the code must be parsed.
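///
/// A `Mode` can also be constructed from a string via `std::str::FromStr`, a rough
/// sketch of the mapping implemented below (unrecognized names produce a
/// `ModeParseError`):
///
/// ```
/// use std::str::FromStr;
/// use ruff_python_parser::Mode;
///
/// // "exec" and "single" both map to `Mode::Module`; unknown names are rejected.
/// assert_eq!(Mode::from_str("eval").unwrap(), Mode::Expression);
/// assert!(Mode::from_str("unknown").is_err());
/// ```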
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub enum Mode {
    /// The code consists of a sequence of statements.
    Module,
    /// The code consists of a sequence of interactive statements.
    Interactive,
    /// The code consists of a single expression.
    Expression,
    /// The code consists of a sequence of statements which are part of a
    /// Jupyter Notebook and thus could include escape commands scoped to
    /// a single line.
    ///
    /// ## Limitations:
    ///
    /// For [Dynamic object information], the escape characters (`?`, `??`)
    /// must be used before an object. For example, `?foo` will be recognized,
    /// but `foo?` will not.
    ///
    /// ## Supported escape commands:
    ///
    /// - [Magic command system] which is limited to [line magics] and can start
    ///   with `?` or `??`.
    /// - [Dynamic object information] which can start with `?` or `??`.
    /// - [System shell access] which can start with `!` or `!!`.
    /// - [Automatic parentheses and quotes] which can start with `/`, `;`, or `,`.
    ///
    /// [Magic command system]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#magic-command-system
    /// [line magics]: https://ipython.readthedocs.io/en/stable/interactive/magics.html#line-magics
    /// [Dynamic object information]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#dynamic-object-information
    /// [System shell access]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#system-shell-access
    /// [Automatic parentheses and quotes]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#automatic-parentheses-and-quotes
    Jupyter,
}
impl std::str::FromStr for Mode {
    type Err = ModeParseError;
    fn from_str(s: &str) -> Result<Self, ModeParseError> {
        match s {
            "exec" | "single" => Ok(Mode::Module),
            "eval" => Ok(Mode::Expression),
            "jupyter" => Ok(Mode::Jupyter),
            _ => Err(ModeParseError),
        }
    }
}
/// Returned when a given mode is not valid.
#[derive(Debug)]
pub struct ModeParseError;
impl std::fmt::Display for ModeParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, r#"mode must be "exec", "eval", "jupyter", or "single""#)
    }
}
#[rustfmt::skip]
#[allow(unreachable_pub)]
#[allow(clippy::type_complexity)]
#[allow(clippy::extra_unused_lifetimes)]
#[allow(clippy::needless_lifetimes)]
#[allow(clippy::unused_self)]
#[allow(clippy::cast_sign_loss)]
#[allow(clippy::default_trait_access)]
#[allow(clippy::let_unit_value)]
#[allow(clippy::just_underscores_and_digits)]
#[allow(clippy::no_effect_underscore_binding)]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[allow(clippy::option_option)]
#[allow(clippy::unnecessary_wraps)]
#[allow(clippy::uninlined_format_args)]
#[allow(clippy::cloned_instead_of_copied)]
mod python {
    #[cfg(feature = "lalrpop")]
    include!(concat!(env!("OUT_DIR"), "/src/python.rs"));
    #[cfg(not(feature = "lalrpop"))]
    include!("python.rs");
}
#[cfg(test)]
mod tests {
    use crate::Parse;
    use crate::{first_colon_range, locate_cmp_ops, LocatedCmpOp};
    use anyhow::Result;
    use ruff_text_size::TextSize;
    use rustpython_ast::text_size::{TextLen, TextRange};
    use rustpython_ast::CmpOp;
    use rustpython_ast::Expr;
    use rustpython_parser::Parse;
    use ruff_python_ast::CmpOp;
    use ruff_python_ast::Expr;
    use ruff_text_size::{TextLen, TextRange, TextSize};

    #[test]
    fn extract_first_colon_range() {