Add NonLogicalNewline token

This token is completely ignored by the parser, but it's useful for other users of the lexer, such as the Ruff linter. For example, the token is helpful for a "trailing comma" lint. The same idea exists in Python's `tokenize` module: there is a NEWLINE token (logical newline) and an NL token (non-logical newline). Fixes #4385.
2025-08-26 05:14:40 +00:00 · 2023-01-12 14:53:55 +02:00 · 2023-01-12 14:53:55 +02:00 · 674eeec29c
commit 674eeec29c
parent 4f1e7c6291
3 changed files with 84 additions and 3 deletions
--- a/parser/src/lexer.rs
+++ b/parser/src/lexer.rs
@ -1075,10 +1075,13 @@ where
                self.next_char();
                let tok_end = self.get_pos();

-                // Depending on the nesting level, we emit newline or not:
+                // Depending on the nesting level, we emit a logical or
+                // non-logical newline:
                if self.nesting == 0 {
                    self.at_begin_of_line = true;
                    self.emit((tok_start, Tok::Newline, tok_end));
+                } else {
+                    self.emit((tok_start, Tok::NonLogicalNewline, tok_end));
                }
            }
            ' ' | '\t' | '\x0C' => {
@ -1464,7 +1467,16 @@ mod tests {
        $(
            #[test]
            fn $name() {
-                let source = format!("x = [{}    1,2{}]{}", $eol, $eol, $eol);
+                let source = r"x = [
+
+    1,2
+,(3,
+4,
+), {
+5,
+6,\
+7}]
+".replace("\n", $eol);
                let tokens = lex_source(&source);
                assert_eq!(
                    tokens,
@ -1474,9 +1486,32 @@ mod tests {
                        },
                        Tok::Equal,
                        Tok::Lsqb,
+                        Tok::NonLogicalNewline,
+                        Tok::NonLogicalNewline,
                        Tok::Int { value: BigInt::from(1) },
                        Tok::Comma,
                        Tok::Int { value: BigInt::from(2) },
+                        Tok::NonLogicalNewline,
+                        Tok::Comma,
+                        Tok::Lpar,
+                        Tok::Int { value: BigInt::from(3) },
+                        Tok::Comma,
+                        Tok::NonLogicalNewline,
+                        Tok::Int { value: BigInt::from(4) },
+                        Tok::Comma,
+                        Tok::NonLogicalNewline,
+                        Tok::Rpar,
+                        Tok::Comma,
+                        Tok::Lbrace,
+                        Tok::NonLogicalNewline,
+                        Tok::Int { value: BigInt::from(5) },
+                        Tok::Comma,
+                        Tok::NonLogicalNewline,
+                        Tok::Int { value: BigInt::from(6) },
+                        Tok::Comma,
+                        // Continuation here - no NonLogicalNewline.
+                        Tok::Int { value: BigInt::from(7) },
+                        Tok::Rbrace,
                        Tok::Rsqb,
                        Tok::Newline,
                    ]
@ -1492,6 +1527,50 @@ mod tests {
        test_newline_in_brackets_unix_eol: UNIX_EOL,
    }

+    #[test]
+    fn test_non_logical_newline_in_string_continuation() {
+        let source = r"(
+    'a'
+    'b'
+
+    'c' \
+    'd'
+)";
+        let tokens = lex_source(source);
+        assert_eq!(
+            tokens,
+            vec![
+                Tok::Lpar,
+                Tok::NonLogicalNewline,
+                stok("a"),
+                Tok::NonLogicalNewline,
+                stok("b"),
+                Tok::NonLogicalNewline,
+                Tok::NonLogicalNewline,
+                stok("c"),
+                stok("d"),
+                Tok::NonLogicalNewline,
+                Tok::Rpar,
+                Tok::Newline,
+            ]
+        );
+    }
+
+    #[test]
+    fn test_logical_newline_line_comment() {
+        let source = "#Hello\n#World";
+        let tokens = lex_source(source);
+        assert_eq!(
+            tokens,
+            vec![
+                Tok::Comment("#Hello".to_owned()),
+                // tokenize.py does put an NL here...
+                Tok::Comment("#World".to_owned()),
+                // ... and here, but doesn't seem very useful.
+            ]
+        );
+    }
+
    #[test]
    fn test_operators() {
        let source = "//////=/ /";
--- a/parser/src/parser.rs
+++ b/parser/src/parser.rs
@ -96,7 +96,7 @@ pub fn parse_located(
    let marker_token = (Default::default(), mode.to_marker(), Default::default());
    let tokenizer = iter::once(Ok(marker_token))
        .chain(lxr)
-        .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. }));
+        .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));

    python::TopParser::new()
        .parse(tokenizer)
--- a/parser/src/token.rs
+++ b/parser/src/token.rs
@ -25,6 +25,7 @@ pub enum Tok {
        triple_quoted: bool,
    },
    Newline,
+    NonLogicalNewline,
    Indent,
    Dedent,
    StartModule,
@ -136,6 +137,7 @@ impl fmt::Display for Tok {
                write!(f, "{kind}{quotes}{value}{quotes}")
            }
            Newline => f.write_str("Newline"),
+            NonLogicalNewline => f.write_str("NonLogicalNewline"),
            Indent => f.write_str("Indent"),
            Dedent => f.write_str("Dedent"),
            StartModule => f.write_str("StartProgram"),