Use Jupyter mode while parsing Notebook files (#5552)

## Summary

Enable the new `Mode::Jupyter` for the tokenizer/parser so that Jupyter line magic tokens are parsed. The individual calls to the lexer (i.e., `lex_starts_at`) made by various rules should take the context of the source code into account: is this content from a Jupyter Notebook? Thus, a new field `source_type` (of type `PySourceType`) is added to `Checker`, and it is passed as an argument to the relevant functions. It is then used to determine the `Mode` for the lexer.

## Test Plan

Add new test cases to make sure that magic statements are taken into account when generating the diagnostic and autofix:

* For `I001`, if there's a magic statement in between two import blocks, the blocks should be sorted independently.

fixes: #6090
2025-07-18 02:25:09 +00:00 · 2023-08-05 06:02:07 +05:30 · 2023-08-05 06:02:07 +05:30 · 32fa05765a
commit 32fa05765a
parent d788957ec4
52 changed files with 652 additions and 196 deletions
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@ -114,7 +114,7 @@ pub use parser::{
    parse, parse_expression, parse_expression_starts_at, parse_program, parse_starts_at,
    parse_suite, parse_tokens, ParseError, ParseErrorType,
 };
-use ruff_python_ast::{CmpOp, Expr, Mod, Ranged, Suite};
+use ruff_python_ast::{CmpOp, Expr, Mod, PySourceType, Ranged, Suite};
 use ruff_text_size::{TextRange, TextSize};
 pub use string::FStringErrorType;
 pub use token::{StringKind, Tok, TokenKind};
@ -130,9 +130,9 @@ mod token;
 pub mod typing;

 /// Collect tokens up to and including the first error.
-pub fn tokenize(contents: &str) -> Vec<LexResult> {
+pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
    let mut tokens: Vec<LexResult> = vec![];
-    for tok in lexer::lex(contents, Mode::Module) {
+    for tok in lexer::lex(contents, mode) {
        let is_err = tok.is_err();
        tokens.push(tok);
        if is_err {
@ -146,17 +146,32 @@ pub fn tokenize(contents: &str) -> Vec<LexResult> {
 pub fn parse_program_tokens(
    lxr: Vec<LexResult>,
    source_path: &str,
+    is_jupyter_notebook: bool,
 ) -> anyhow::Result<Suite, ParseError> {
-    match parse_tokens(lxr, Mode::Module, source_path)? {
+    let mode = if is_jupyter_notebook {
+        Mode::Jupyter
+    } else {
+        Mode::Module
+    };
+    match parse_tokens(lxr, mode, source_path)? {
        Mod::Module(m) => Ok(m.body),
        Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
    }
 }

 /// Return the `Range` of the first `Tok::Colon` token in a `Range`.
-pub fn first_colon_range(range: TextRange, source: &str) -> Option<TextRange> {
+pub fn first_colon_range(
+    range: TextRange,
+    source: &str,
+    is_jupyter_notebook: bool,
+) -> Option<TextRange> {
    let contents = &source[range];
-    let range = lexer::lex_starts_at(contents, Mode::Module, range.start())
+    let mode = if is_jupyter_notebook {
+        Mode::Jupyter
+    } else {
+        Mode::Module
+    };
+    let range = lexer::lex_starts_at(contents, mode, range.start())
        .flatten()
        .find(|(tok, _)| tok.is_colon())
        .map(|(_, range)| range);
@ -308,6 +323,19 @@ impl std::str::FromStr for Mode {
    }
 }

+pub trait AsMode {
+    fn as_mode(&self) -> Mode;
+}
+
+impl AsMode for PySourceType {
+    fn as_mode(&self) -> Mode {
+        match self {
+            PySourceType::Python | PySourceType::Stub => Mode::Module,
+            PySourceType::Jupyter => Mode::Jupyter,
+        }
+    }
+}
+
 /// Returned when a given mode is not valid.
 #[derive(Debug)]
 pub struct ModeParseError;
@ -357,6 +385,7 @@ mod tests {
        let range = first_colon_range(
            TextRange::new(TextSize::from(0), contents.text_len()),
            contents,
+            false,
        )
        .unwrap();
        assert_eq!(&contents[range], ":");