diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index cb38503..1c124f6 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -1075,10 +1075,13 @@ where self.next_char(); let tok_end = self.get_pos(); - // Depending on the nesting level, we emit newline or not: + // Depending on the nesting level, we emit a logical or + // non-logical newline: if self.nesting == 0 { self.at_begin_of_line = true; self.emit((tok_start, Tok::Newline, tok_end)); + } else { + self.emit((tok_start, Tok::NonLogicalNewline, tok_end)); } } ' ' | '\t' | '\x0C' => { @@ -1464,7 +1467,16 @@ mod tests { $( #[test] fn $name() { - let source = format!("x = [{} 1,2{}]{}", $eol, $eol, $eol); + let source = r"x = [ + + 1,2 +,(3, +4, +), { +5, +6,\ +7}] +".replace("\n", $eol); let tokens = lex_source(&source); assert_eq!( tokens, @@ -1474,9 +1486,32 @@ mod tests { }, Tok::Equal, Tok::Lsqb, + Tok::NonLogicalNewline, + Tok::NonLogicalNewline, Tok::Int { value: BigInt::from(1) }, Tok::Comma, Tok::Int { value: BigInt::from(2) }, + Tok::NonLogicalNewline, + Tok::Comma, + Tok::Lpar, + Tok::Int { value: BigInt::from(3) }, + Tok::Comma, + Tok::NonLogicalNewline, + Tok::Int { value: BigInt::from(4) }, + Tok::Comma, + Tok::NonLogicalNewline, + Tok::Rpar, + Tok::Comma, + Tok::Lbrace, + Tok::NonLogicalNewline, + Tok::Int { value: BigInt::from(5) }, + Tok::Comma, + Tok::NonLogicalNewline, + Tok::Int { value: BigInt::from(6) }, + Tok::Comma, + // Continuation here - no NonLogicalNewline. + Tok::Int { value: BigInt::from(7) }, + Tok::Rbrace, Tok::Rsqb, Tok::Newline, ] @@ -1492,6 +1527,50 @@ mod tests { test_newline_in_brackets_unix_eol: UNIX_EOL, } + #[test] + fn test_non_logical_newline_in_string_continuation() { + let source = r"( + 'a' + 'b' + + 'c' \ + 'd' +)"; + let tokens = lex_source(source); + assert_eq!( + tokens, + vec![ + Tok::Lpar, + Tok::NonLogicalNewline, + stok("a"), + Tok::NonLogicalNewline, + stok("b"), + Tok::NonLogicalNewline, + Tok::NonLogicalNewline, + stok("c"), + stok("d"), + Tok::NonLogicalNewline, + Tok::Rpar, + Tok::Newline, + ] + ); + } + + #[test] + fn test_logical_newline_line_comment() { + let source = "#Hello\n#World"; + let tokens = lex_source(source); + assert_eq!( + tokens, + vec![ + Tok::Comment("#Hello".to_owned()), + // tokenize.py does put an NL here... + Tok::Comment("#World".to_owned()), + // ... and here, but doesn't seem very useful. + ] + ); + } + #[test] fn test_operators() { let source = "//////=/ /"; diff --git a/parser/src/parser.rs b/parser/src/parser.rs index 4277d30..4d9b52b 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -96,7 +96,7 @@ pub fn parse_located( let marker_token = (Default::default(), mode.to_marker(), Default::default()); let tokenizer = iter::once(Ok(marker_token)) .chain(lxr) - .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. })); + .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline)); python::TopParser::new() .parse(tokenizer) diff --git a/parser/src/token.rs b/parser/src/token.rs index 72aa89d..b51b2f4 100644 --- a/parser/src/token.rs +++ b/parser/src/token.rs @@ -25,6 +25,7 @@ pub enum Tok { triple_quoted: bool, }, Newline, + NonLogicalNewline, Indent, Dedent, StartModule, @@ -136,6 +137,7 @@ impl fmt::Display for Tok { write!(f, "{kind}{quotes}{value}{quotes}") } Newline => f.write_str("Newline"), + NonLogicalNewline => f.write_str("NonLogicalNewline"), Indent => f.write_str("Indent"), Dedent => f.write_str("Dedent"), StartModule => f.write_str("StartProgram"),