Add NonLogicalNewline token

This token is completely ignored by the parser, but it's useful for
other users of the lexer, such as the Ruff linter. For example, the
token is helpful for a "trailing comma" lint.

The same idea exists in Python's `tokenize` module — there is a NEWLINE
token (logical newline), and an NL token (non-logical newline).

Fixes #4385.
This commit is contained in:
Ran Benita 2023-01-12 14:53:55 +02:00
parent 4f1e7c6291
commit 674eeec29c
3 changed files with 84 additions and 3 deletions

View file

@ -1075,10 +1075,13 @@ where
self.next_char();
let tok_end = self.get_pos();
// Depending on the nesting level, we emit newline or not:
// Depending on the nesting level, we emit a logical or
// non-logical newline:
if self.nesting == 0 {
self.at_begin_of_line = true;
self.emit((tok_start, Tok::Newline, tok_end));
} else {
self.emit((tok_start, Tok::NonLogicalNewline, tok_end));
}
}
' ' | '\t' | '\x0C' => {
@ -1464,7 +1467,16 @@ mod tests {
$(
#[test]
fn $name() {
let source = format!("x = [{} 1,2{}]{}", $eol, $eol, $eol);
let source = r"x = [
1,2
,(3,
4,
), {
5,
6,\
7}]
".replace("\n", $eol);
let tokens = lex_source(&source);
assert_eq!(
tokens,
@ -1474,9 +1486,32 @@ mod tests {
},
Tok::Equal,
Tok::Lsqb,
Tok::NonLogicalNewline,
Tok::NonLogicalNewline,
Tok::Int { value: BigInt::from(1) },
Tok::Comma,
Tok::Int { value: BigInt::from(2) },
Tok::NonLogicalNewline,
Tok::Comma,
Tok::Lpar,
Tok::Int { value: BigInt::from(3) },
Tok::Comma,
Tok::NonLogicalNewline,
Tok::Int { value: BigInt::from(4) },
Tok::Comma,
Tok::NonLogicalNewline,
Tok::Rpar,
Tok::Comma,
Tok::Lbrace,
Tok::NonLogicalNewline,
Tok::Int { value: BigInt::from(5) },
Tok::Comma,
Tok::NonLogicalNewline,
Tok::Int { value: BigInt::from(6) },
Tok::Comma,
// Continuation here - no NonLogicalNewline.
Tok::Int { value: BigInt::from(7) },
Tok::Rbrace,
Tok::Rsqb,
Tok::Newline,
]
@ -1492,6 +1527,50 @@ mod tests {
test_newline_in_brackets_unix_eol: UNIX_EOL,
}
#[test]
fn test_non_logical_newline_in_string_continuation() {
    // Adjacent string literals spread over several lines inside parentheses.
    // The expected stream below has one `NonLogicalNewline` per line break
    // within the parentheses, except after the backslash continuation that
    // follows 'c', where none is expected.
    //
    // The blank line between 'b' and 'c' is significant: it is what yields
    // the two consecutive `NonLogicalNewline` tokens in the expected stream.
    let source = r"(
'a'
'b'

'c' \
'd'
)";
    let tokens = lex_source(source);
    assert_eq!(
        tokens,
        vec![
            Tok::Lpar,
            Tok::NonLogicalNewline,
            stok("a"),
            Tok::NonLogicalNewline,
            stok("b"),
            // Line break after 'b', then the blank line.
            Tok::NonLogicalNewline,
            Tok::NonLogicalNewline,
            stok("c"),
            // Backslash continuation here - no NonLogicalNewline.
            stok("d"),
            Tok::NonLogicalNewline,
            Tok::Rpar,
            Tok::Newline,
        ]
    );
}
#[test]
fn test_logical_newline_line_comment() {
    // Two comment lines separated by a single line break. The expected
    // stream holds only the two `Comment` tokens, with no newline token of
    // either kind between or after them.
    let tokens = lex_source("#Hello\n#World");
    let expected = vec![
        Tok::Comment(String::from("#Hello")),
        // tokenize.py does put an NL here...
        Tok::Comment(String::from("#World")),
        // ... and here, but doesn't seem very useful.
    ];
    assert_eq!(tokens, expected);
}
#[test]
fn test_operators() {
let source = "//////=/ /";

View file

@ -96,7 +96,7 @@ pub fn parse_located(
let marker_token = (Default::default(), mode.to_marker(), Default::default());
let tokenizer = iter::once(Ok(marker_token))
.chain(lxr)
.filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. }));
.filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
python::TopParser::new()
.parse(tokenizer)

View file

@ -25,6 +25,7 @@ pub enum Tok {
triple_quoted: bool,
},
Newline,
NonLogicalNewline,
Indent,
Dedent,
StartModule,
@ -136,6 +137,7 @@ impl fmt::Display for Tok {
write!(f, "{kind}{quotes}{value}{quotes}")
}
Newline => f.write_str("Newline"),
NonLogicalNewline => f.write_str("NonLogicalNewline"),
Indent => f.write_str("Indent"),
Dedent => f.write_str("Dedent"),
StartModule => f.write_str("StartProgram"),