From 674eeec29c6e0b4462ccc4932f57a392dd29af7d Mon Sep 17 00:00:00 2001
From: Ran Benita
Date: Thu, 12 Jan 2023 14:53:55 +0200
Subject: [PATCH] Add NonLogicalNewline token

This token is completely ignored by the parser, but it's useful for
other users of the lexer, such as the Ruff linter. For example, the
token is helpful for a "trailing comma" lint.

The same idea exists in Python's `tokenize` module - there is a NEWLINE
token (logical newline), and a NL token (non-logical newline).

Fixes #4385.
---
 parser/src/lexer.rs  | 83 ++++++++++++++++++++++++++++++++++++++++++--
 parser/src/parser.rs |  2 +-
 parser/src/token.rs  |  2 ++
 3 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
index cb38503..1c124f6 100644
--- a/parser/src/lexer.rs
+++ b/parser/src/lexer.rs
@@ -1075,10 +1075,13 @@ where
                 self.next_char();
                 let tok_end = self.get_pos();

-                // Depending on the nesting level, we emit newline or not:
+                // Depending on the nesting level, we emit a logical or
+                // non-logical newline:
                 if self.nesting == 0 {
                     self.at_begin_of_line = true;
                     self.emit((tok_start, Tok::Newline, tok_end));
+                } else {
+                    self.emit((tok_start, Tok::NonLogicalNewline, tok_end));
                 }
             }
             ' ' | '\t' | '\x0C' => {
@@ -1464,7 +1467,16 @@ mod tests {
         $(
         #[test]
         fn $name() {
-            let source = format!("x = [{} 1,2{}]{}", $eol, $eol, $eol);
+            let source = r"x = [
+
+    1,2
+,(3,
+4,
+), {
+5,
+6,\
+7}]
+".replace("\n", $eol);
             let tokens = lex_source(&source);
             assert_eq!(
                 tokens,
@@ -1474,9 +1486,32 @@
                 },
                 Tok::Equal,
                 Tok::Lsqb,
+                Tok::NonLogicalNewline,
+                Tok::NonLogicalNewline,
                 Tok::Int { value: BigInt::from(1) },
                 Tok::Comma,
                 Tok::Int { value: BigInt::from(2) },
+                Tok::NonLogicalNewline,
+                Tok::Comma,
+                Tok::Lpar,
+                Tok::Int { value: BigInt::from(3) },
+                Tok::Comma,
+                Tok::NonLogicalNewline,
+                Tok::Int { value: BigInt::from(4) },
+                Tok::Comma,
+                Tok::NonLogicalNewline,
+                Tok::Rpar,
+                Tok::Comma,
+                Tok::Lbrace,
+                Tok::NonLogicalNewline,
+                Tok::Int { value: BigInt::from(5) },
+                Tok::Comma,
+                Tok::NonLogicalNewline,
+                Tok::Int { value: BigInt::from(6) },
+                Tok::Comma,
+                // Continuation here - no NonLogicalNewline.
+                Tok::Int { value: BigInt::from(7) },
+                Tok::Rbrace,
                 Tok::Rsqb,
                 Tok::Newline,
             ]
@@ -1492,6 +1527,50 @@
         test_newline_in_brackets_unix_eol: UNIX_EOL,
     }

+    #[test]
+    fn test_non_logical_newline_in_string_continuation() {
+        let source = r"(
+    'a'
+    'b'
+
+    'c' \
+    'd'
+)";
+        let tokens = lex_source(source);
+        assert_eq!(
+            tokens,
+            vec![
+                Tok::Lpar,
+                Tok::NonLogicalNewline,
+                stok("a"),
+                Tok::NonLogicalNewline,
+                stok("b"),
+                Tok::NonLogicalNewline,
+                Tok::NonLogicalNewline,
+                stok("c"),
+                stok("d"),
+                Tok::NonLogicalNewline,
+                Tok::Rpar,
+                Tok::Newline,
+            ]
+        );
+    }
+
+    #[test]
+    fn test_logical_newline_line_comment() {
+        let source = "#Hello\n#World";
+        let tokens = lex_source(source);
+        assert_eq!(
+            tokens,
+            vec![
+                Tok::Comment("#Hello".to_owned()),
+                // tokenize.py does put an NL here...
+                Tok::Comment("#World".to_owned()),
+                // ... and here, but doesn't seem very useful.
+            ]
+        );
+    }
+
     #[test]
     fn test_operators() {
         let source = "//////=/ /";
diff --git a/parser/src/parser.rs b/parser/src/parser.rs
index 4277d30..4d9b52b 100644
--- a/parser/src/parser.rs
+++ b/parser/src/parser.rs
@@ -96,7 +96,7 @@ pub fn parse_located(
     let marker_token = (Default::default(), mode.to_marker(), Default::default());
     let tokenizer = iter::once(Ok(marker_token))
         .chain(lxr)
-        .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. }));
+        .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));

     python::TopParser::new()
         .parse(tokenizer)
diff --git a/parser/src/token.rs b/parser/src/token.rs
index 72aa89d..b51b2f4 100644
--- a/parser/src/token.rs
+++ b/parser/src/token.rs
@@ -25,6 +25,7 @@ pub enum Tok {
         triple_quoted: bool,
     },
     Newline,
+    NonLogicalNewline,
     Indent,
     Dedent,
     StartModule,
@@ -136,6 +137,7 @@ impl fmt::Display for Tok {
                 write!(f, "{kind}{quotes}{value}{quotes}")
             }
             Newline => f.write_str("Newline"),
+            NonLogicalNewline => f.write_str("NonLogicalNewline"),
             Indent => f.write_str("Indent"),
             Dedent => f.write_str("Dedent"),
             StartModule => f.write_str("StartProgram"),
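
Note (commentary appended to the patch, not part of it): a minimal
sketch of the "trailing comma" lint mentioned in the commit message, to
show how a lexer consumer can use the new token. It assumes the crate's
public `lexer::make_tokenizer` entry point, which yields
`Result<(Location, Tok, Location), LexicalError>` items; the helper
name and the heuristic itself are hypothetical, illustrative only:

    use rustpython_parser::lexer;
    use rustpython_parser::token::Tok;

    /// Does some bracketed, multi-line collection close on its own line
    /// without a trailing comma on the last element? Flags "[1,\n2\n]"
    /// but not "[1,\n2,\n]" or the single-line "[1, 2]".
    fn has_missing_trailing_comma(source: &str) -> bool {
        let mut prev: Option<Tok> = None; // token before `tok`
        let mut prev2: Option<Tok> = None; // token before `prev`
        // Lexer errors are silently skipped via `flatten()` for brevity.
        for (_, tok, _) in lexer::make_tokenizer(source).flatten() {
            // A closer directly after a NonLogicalNewline means the last
            // element sat on its own line. If the token before that
            // newline was not a comma (and not an opener - an empty
            // literal - or another newline - a blank line), the trailing
            // comma is missing.
            if matches!(tok, Tok::Rpar | Tok::Rsqb | Tok::Rbrace)
                && matches!(prev, Some(Tok::NonLogicalNewline))
                && !matches!(
                    prev2,
                    Some(
                        Tok::Comma
                            | Tok::Lpar
                            | Tok::Lsqb
                            | Tok::Lbrace
                            | Tok::NonLogicalNewline
                    )
                )
            {
                return true;
            }
            prev2 = prev;
            prev = Some(tok);
        }
        false
    }

Before this patch the token stream carried no trace of newlines inside
brackets, so a purely token-based check like this was impossible.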
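Note: this mirrors CPython, where `python -m tokenize` reports NEWLINE
for the newline that ends a logical line and NL for all the others. A
quick way to see the same distinction with this lexer (same
`make_tokenizer` assumption as above):

    use rustpython_parser::lexer::make_tokenizer;

    fn main() {
        // The newlines after `[` and `1,` are inside brackets, so they
        // lex as NonLogicalNewline; only the final one is a Newline.
        let source = "x = [\n    1,\n]\n";
        for (_, tok, _) in make_tokenizer(source).flatten() {
            println!("{tok:?}");
        }
    }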