Add full-lexer feature (#36)

2025-08-25 12:55:24 +00:00 · 2023-05-16 02:21:34 +09:00 · 2023-05-16 02:21:34 +09:00 · 27e3873dc2
commit 27e3873dc2
parent dd4cc25227
7 changed files with 1216 additions and 1188 deletions
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@ -13,6 +13,7 @@ default = ["location"]
 location = ["rustpython-ast/location", "rustpython-parser-core/location"]
 serde = ["dep:serde", "rustpython-parser-core/serde"]
 all-nodes-with-ranges = ["rustpython-ast/all-nodes-with-ranges"]
 full-lexer = []
 [build-dependencies]
 anyhow = { workspace = true }
--- a/parser/src/lexer.rs
+++ b/parser/src/lexer.rs
@ -450,6 +450,7 @@ where
    }
    /// Lex a single comment.
    #[cfg(feature = "full-lexer")]
    fn lex_comment(&mut self) -> LexResult {
        let start_pos = self.get_pos();
        let mut value = String::new();
@ -465,6 +466,20 @@ where
        }
    }
    /// Skip over a comment without producing a token.
    ///
    /// Only compiled when the `full-lexer` feature is disabled. Characters are
    /// consumed one at a time until the look-ahead slot `window[0]` holds
    /// `'\n'`, `'\r'`, or `None`; that terminator itself is left unconsumed.
    #[cfg(not(feature = "full-lexer"))]
    fn lex_comment(&mut self) {
        // Keep advancing while the next character is part of the comment,
        // i.e. anything other than a line break or the end of input.
        while !matches!(self.window[0], Some('\n' | '\r') | None) {
            // `window[0]` was just seen to be `Some(_)`; presumably `next_char`
            // yields that character, so the unwrap is not expected to panic.
            self.next_char().unwrap();
        }
    }
    /// Lex a string literal.
    fn lex_string(&mut self, kind: StringKind) -> LexResult {
        let start_pos = self.get_pos();
@ -611,8 +626,9 @@ where
                    tabs += 1;
                }
                Some('#') => {
-                    let comment = self.lex_comment()?;
+                    let _comment = self.lex_comment();
-                    self.emit(comment);
+                    #[cfg(feature = "full-lexer")]
                    self.emit(_comment?);
                    spaces = 0;
                    tabs = 0;
                }
@ -753,8 +769,9 @@ where
                self.emit(number);
            }
            '#' => {
-                let comment = self.lex_comment()?;
+                let _comment = self.lex_comment();
-                self.emit(comment);
+                #[cfg(feature = "full-lexer")]
                self.emit(_comment?);
            }
            '"' | '\'' => {
                let string = self.lex_string(StringKind::String)?;
@ -1101,6 +1118,7 @@ where
                    self.at_begin_of_line = true;
                    self.emit((Tok::Newline, TextRange::new(tok_start, tok_end)));
                } else {
                    #[cfg(feature = "full-lexer")]
                    self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end)));
                }
            }
@ -1408,6 +1426,7 @@ mod tests {
        ($($name:ident: $eol:expr,)*) => {
            $(
            #[test]
            #[cfg(feature = "full-lexer")]
            fn $name() {
                let source = format!(r"99232  # {}", $eol);
                let tokens = lex_source(&source);
@ -1428,6 +1447,7 @@ mod tests {
        ($($name:ident: $eol:expr,)*) => {
            $(
            #[test]
            #[cfg(feature = "full-lexer")]
            fn $name() {
                let source = format!("123  # Foo{}456", $eol);
                let tokens = lex_source(&source);
@ -1607,6 +1627,7 @@ mod tests {
        ($($name:ident: $eol:expr,)*) => {
        $(
            #[test]
            #[cfg(feature = "full-lexer")]
            fn $name() {
                let source = r"x = [
@ -1669,6 +1690,7 @@ mod tests {
    }
    #[test]
    #[cfg(feature = "full-lexer")]
    fn test_non_logical_newline_in_string_continuation() {
        let source = r"(
    'a'
@ -1698,6 +1720,7 @@ mod tests {
    }
    #[test]
    #[cfg(feature = "full-lexer")]
    fn test_logical_newline_line_comment() {
        let source = "#Hello\n#World";
        let tokens = lex_source(source);
--- a/parser/src/parser.rs
+++ b/parser/src/parser.rs
@ -190,9 +190,10 @@ pub fn parse_tokens(
    source_path: &str,
 ) -> Result<ast::Mod, ParseError> {
    let marker_token = (Tok::start_marker(mode), Default::default());
-    let lexer = iter::once(Ok(marker_token))
+    let lexer = iter::once(Ok(marker_token)).chain(lxr);
-        .chain(lxr)
+    #[cfg(feature = "full-lexer")]
-        .filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
+    let lexer =
        lexer.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
    python::TopParser::new()
        .parse(
            lexer
--- a/parser/src/python.lalrpop
+++ b/parser/src/python.lalrpop
@ -1743,6 +1743,6 @@ extern {
        name => token::Tok::Name { name: <String> },
        "\n" => token::Tok::Newline,
        ";" => token::Tok::Semi,
-        "#" => token::Tok::Comment(_),
+        // "#" => token::Tok::Comment(_),
    }
 }
--- a/parser/src/python.rs
+++ b/parser/src/python.rs
--- a/parser/src/soft_keywords.rs
+++ b/parser/src/soft_keywords.rs
@ -86,18 +86,19 @@ where
        self.start_of_line = next.as_ref().map_or(false, |lex_result| {
            lex_result.as_ref().map_or(false, |(tok, _)| {
                #[cfg(feature = "full-lexer")]
                if matches!(tok, Tok::NonLogicalNewline | Tok::Comment { .. }) {
-                    self.start_of_line
+                    return self.start_of_line;
                } else {
                    matches!(
                        tok,
                        Tok::StartModule
                            | Tok::StartInteractive
                            | Tok::Newline
                            | Tok::Indent
                            | Tok::Dedent
                    )
                }
                matches!(
                    tok,
                    Tok::StartModule
                        | Tok::StartInteractive
                        | Tok::Newline
                        | Tok::Indent
                        | Tok::Dedent
                )
            })
        });
--- a/parser/src/token.rs
+++ b/parser/src/token.rs
@ -43,11 +43,13 @@ pub enum Tok {
        triple_quoted: bool,
    },
    /// Token value for a comment. These are filtered out of the token stream prior to parsing.
    #[cfg(feature = "full-lexer")]
    Comment(String),
    /// Token value for a newline.
    Newline,
    /// Token value for a newline that is not a logical line break. These are filtered out of
    /// the token stream prior to parsing.
    #[cfg(feature = "full-lexer")]
    NonLogicalNewline,
    /// Token value for an indent.
    Indent,
@ -223,6 +225,7 @@ impl fmt::Display for Tok {
                write!(f, "{kind}{quotes}{value}{quotes}")
            }
            Newline => f.write_str("Newline"),
            #[cfg(feature = "full-lexer")]
            NonLogicalNewline => f.write_str("NonLogicalNewline"),
            Indent => f.write_str("Indent"),
            Dedent => f.write_str("Dedent"),
@ -236,6 +239,7 @@ impl fmt::Display for Tok {
            Rsqb => f.write_str("']'"),
            Colon => f.write_str("':'"),
            Comma => f.write_str("','"),
            #[cfg(feature = "full-lexer")]
            Comment(value) => f.write_str(value),
            Semi => f.write_str("';'"),
            Plus => f.write_str("'+'"),