From 674eeec29c6e0b4462ccc4932f57a392dd29af7d Mon Sep 17 00:00:00 2001
From: Ran Benita
Date: Thu, 12 Jan 2023 14:53:55 +0200
Subject: [PATCH] Add NonLogicalNewline token

This token is completely ignored by the parser, but it's useful for
other users of the lexer, such as the Ruff linter. For example, the
token is helpful for a "trailing comma" lint.

The same idea exists in Python's `tokenize` module - there is a NEWLINE
token (logical newline), and a NL token (non-logical newline).

Fixes #4385.
---
 parser/src/lexer.rs  | 83 ++++++++++++++++++++++++++++++++++++++++++--
 parser/src/parser.rs |  2 +-
 parser/src/token.rs  |  2 ++
 3 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
index cb38503..1c124f6 100644
--- a/parser/src/lexer.rs
+++ b/parser/src/lexer.rs
@@ -1075,10 +1075,13 @@ where
                 self.next_char();
                 let tok_end = self.get_pos();

-                // Depending on the nesting level, we emit newline or not:
+                // Depending on the nesting level, we emit a logical or
+                // non-logical newline:
                 if self.nesting == 0 {
                     self.at_begin_of_line = true;
                     self.emit((tok_start, Tok::Newline, tok_end));
+                } else {
+                    self.emit((tok_start, Tok::NonLogicalNewline, tok_end));
                 }
             }
             ' ' | '\t' | '\x0C' => {
@@ -1464,7 +1467,16 @@ mod tests {
         $(
         #[test]
         fn $name() {
-            let source = format!("x = [{} 1,2{}]{}", $eol, $eol, $eol);
+            let source = r"x = [
+
+    1,2
+,(3,
+4,
+), {
+5,
+6,\
+7}]
+".replace("\n", $eol);
             let tokens = lex_source(&source);
             assert_eq!(
                 tokens,
@@ -1474,9 +1486,32 @@
                 },
                 Tok::Equal,
                 Tok::Lsqb,
+                Tok::NonLogicalNewline,
+                Tok::NonLogicalNewline,
                 Tok::Int { value: BigInt::from(1) },
                 Tok::Comma,
                 Tok::Int { value: BigInt::from(2) },
+                Tok::NonLogicalNewline,
+                Tok::Comma,
+                Tok::Lpar,
+                Tok::Int { value: BigInt::from(3) },
+                Tok::Comma,
+                Tok::NonLogicalNewline,
+                Tok::Int { value: BigInt::from(4) },
+                Tok::Comma,
+                Tok::NonLogicalNewline,
+                Tok::Rpar,
+                Tok::Comma,
+                Tok::Lbrace,
+                Tok::NonLogicalNewline,
+                Tok::Int { value: BigInt::from(5) },
+                Tok::Comma,
+                Tok::NonLogicalNewline,
+                Tok::Int { value: BigInt::from(6) },
+                Tok::Comma,
+                // Continuation here - no NonLogicalNewline.
+                Tok::Int { value: BigInt::from(7) },
+                Tok::Rbrace,
                 Tok::Rsqb,
                 Tok::Newline,
             ]
@@ -1492,6 +1527,50 @@
         test_newline_in_brackets_unix_eol: UNIX_EOL,
     }

+    #[test]
+    fn test_non_logical_newline_in_string_continuation() {
+        let source = r"(
+    'a'
+    'b'
+
+    'c' \
+    'd'
+)";
+        let tokens = lex_source(source);
+        assert_eq!(
+            tokens,
+            vec![
+                Tok::Lpar,
+                Tok::NonLogicalNewline,
+                stok("a"),
+                Tok::NonLogicalNewline,
+                stok("b"),
+                Tok::NonLogicalNewline,
+                Tok::NonLogicalNewline,
+                stok("c"),
+                stok("d"),
+                Tok::NonLogicalNewline,
+                Tok::Rpar,
+                Tok::Newline,
+            ]
+        );
+    }
+
+    #[test]
+    fn test_logical_newline_line_comment() {
+        let source = "#Hello\n#World";
+        let tokens = lex_source(source);
+        assert_eq!(
+            tokens,
+            vec![
+                Tok::Comment("#Hello".to_owned()),
+                // tokenize.py does put an NL here...
+                Tok::Comment("#World".to_owned()),
+                // ... and here, but doesn't seem very useful.
+            ]
+        );
+    }
+
     #[test]
     fn test_operators() {
         let source = "//////=/ /";
diff --git a/parser/src/parser.rs b/parser/src/parser.rs
index 4277d30..4d9b52b 100644
--- a/parser/src/parser.rs
+++ b/parser/src/parser.rs
@@ -96,7 +96,7 @@ pub fn parse_located(
     let marker_token = (Default::default(), mode.to_marker(), Default::default());
     let tokenizer = iter::once(Ok(marker_token))
         .chain(lxr)
-        .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. }));
+        .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));

     python::TopParser::new()
         .parse(tokenizer)
diff --git a/parser/src/token.rs b/parser/src/token.rs
index 72aa89d..b51b2f4 100644
--- a/parser/src/token.rs
+++ b/parser/src/token.rs
@@ -25,6 +25,7 @@ pub enum Tok {
         triple_quoted: bool,
     },
     Newline,
+    NonLogicalNewline,
     Indent,
     Dedent,
     StartModule,
@@ -136,6 +137,7 @@ impl fmt::Display for Tok {
                 write!(f, "{kind}{quotes}{value}{quotes}")
             }
             Newline => f.write_str("Newline"),
+            NonLogicalNewline => f.write_str("NonLogicalNewline"),
             Indent => f.write_str("Indent"),
             Dedent => f.write_str("Dedent"),
             StartModule => f.write_str("StartProgram"),
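
Note (commentary appended to the patch, not part of it): a minimal
sketch of the "trailing comma" lint mentioned in the commit message, to
show how a lexer consumer can use the new token. It assumes the crate's
public `lexer::make_tokenizer` entry point, which yields
`Result<(Location, Tok, Location), LexicalError>` items; the helper
name and the heuristic itself are hypothetical, illustrative only:

    use rustpython_parser::lexer;
    use rustpython_parser::token::Tok;

    /// Does some bracketed, multi-line collection close on its own line
    /// without a trailing comma on the last element? Flags "[1,\n2\n]"
    /// but not "[1,\n2,\n]" or the single-line "[1, 2]".
    fn has_missing_trailing_comma(source: &str) -> bool {
        let mut prev: Option<Tok> = None; // token before `tok`
        let mut prev2: Option<Tok> = None; // token before `prev`
        // Lexer errors are silently skipped via `flatten()` for brevity.
        for (_, tok, _) in lexer::make_tokenizer(source).flatten() {
            // A closer directly after a NonLogicalNewline means the last
            // element sat on its own line. If the token before that
            // newline was not a comma (and not an opener - an empty
            // literal - or another newline - a blank line), the trailing
            // comma is missing.
            if matches!(tok, Tok::Rpar | Tok::Rsqb | Tok::Rbrace)
                && matches!(prev, Some(Tok::NonLogicalNewline))
                && !matches!(
                    prev2,
                    Some(
                        Tok::Comma
                            | Tok::Lpar
                            | Tok::Lsqb
                            | Tok::Lbrace
                            | Tok::NonLogicalNewline
                    )
                )
            {
                return true;
            }
            prev2 = prev;
            prev = Some(tok);
        }
        false
    }

Before this patch the token stream carried no trace of newlines inside
brackets, so a purely token-based check like this was impossible.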
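Note: this mirrors CPython, where `python -m tokenize` reports NEWLINE
for the newline that ends a logical line and NL for all the others. A
quick way to see the same distinction with this lexer (same
`make_tokenizer` assumption as above):

    use rustpython_parser::lexer::make_tokenizer;

    fn main() {
        // The newlines after `[` and `1,` are inside brackets, so they
        // lex as NonLogicalNewline; only the final one is a Newline.
        let source = "x = [\n    1,\n]\n";
        for (_, tok, _) in make_tokenizer(source).flatten() {
            println!("{tok:?}");
        }
    }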