diff --git a/crates/ruff_python_formatter/src/comments/placement.rs b/crates/ruff_python_formatter/src/comments/placement.rs index cc8dc569b5..ec38633573 100644 --- a/crates/ruff_python_formatter/src/comments/placement.rs +++ b/crates/ruff_python_formatter/src/comments/placement.rs @@ -951,39 +951,11 @@ fn handle_dict_unpacking_comment<'a>( // if the remaining tokens from the previous node are exactly `**`, // re-assign the comment to the one that follows the stars - let mut count = 0u32; - - // we start from the preceding node but we skip its token - if let Some(token) = tokens.next() { - // The Keyword case - if token.kind == SimpleTokenKind::Star { - count += 1; - } else { - // The dict case - debug_assert!( - matches!( - token, - SimpleToken { - kind: SimpleTokenKind::LBrace - | SimpleTokenKind::Comma - | SimpleTokenKind::Colon, - .. - } - ), - "{token:?}", - ); - } + if tokens.any(|token| token.kind == SimpleTokenKind::DoubleStar) { + CommentPlacement::trailing(following, comment) + } else { + CommentPlacement::Default(comment) } - - for token in tokens { - debug_assert!(token.kind == SimpleTokenKind::Star, "Expected star token"); - count += 1; - } - if count == 2 { - return CommentPlacement::trailing(following, comment); - } - - CommentPlacement::Default(comment) } /// Own line comments coming after the node are always dangling comments diff --git a/crates/ruff_python_trivia/src/cursor.rs b/crates/ruff_python_trivia/src/cursor.rs index 43a750cb4f..336720269c 100644 --- a/crates/ruff_python_trivia/src/cursor.rs +++ b/crates/ruff_python_trivia/src/cursor.rs @@ -30,6 +30,14 @@ impl<'a> Cursor<'a> { self.chars.clone().next().unwrap_or(EOF_CHAR) } + /// Peeks the second character from the input stream without consuming it. + /// Returns [`EOF_CHAR`] if the position is past the end of the file. 
+ pub fn second(&self) -> char { + let mut chars = self.chars.clone(); + chars.next(); + chars.next().unwrap_or(EOF_CHAR) + } + /// Peeks the next character from the input stream without consuming it. /// Returns [`EOF_CHAR`] if the file is at the end of the file. pub fn last(&self) -> char { diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_characters.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_characters.snap deleted file mode 100644 index 0dcdad5d26..0000000000 --- a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_characters.snap +++ /dev/null @@ -1,46 +0,0 @@ ---- -source: crates/ruff_python_trivia/src/tokenizer.rs -expression: test_case.tokens() ---- -[ - SimpleToken { - kind: Minus, - range: 0..1, - }, - SimpleToken { - kind: Greater, - range: 1..2, - }, - SimpleToken { - kind: Whitespace, - range: 2..3, - }, - SimpleToken { - kind: Star, - range: 3..4, - }, - SimpleToken { - kind: Equals, - range: 4..5, - }, - SimpleToken { - kind: Whitespace, - range: 5..6, - }, - SimpleToken { - kind: LParen, - range: 6..7, - }, - SimpleToken { - kind: Tilde, - range: 7..8, - }, - SimpleToken { - kind: Equals, - range: 8..9, - }, - SimpleToken { - kind: RParen, - range: 9..10, - }, -] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_eq.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_eq.snap new file mode 100644 index 0000000000..26f9c5ae2c --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_eq.snap @@ -0,0 +1,14 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokens() +--- +[ + SimpleToken { + kind: EqEqual, + range: 0..2, + }, + SimpleToken { + kind: Equals, + range: 2..3, + }, +] diff --git 
a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_invalid_operators.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_invalid_operators.snap new file mode 100644 index 0000000000..cee59e0ba3 --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_invalid_operators.snap @@ -0,0 +1,22 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokens() +--- +[ + SimpleToken { + kind: RArrow, + range: 0..2, + }, + SimpleToken { + kind: Whitespace, + range: 2..3, + }, + SimpleToken { + kind: Other, + range: 3..4, + }, + SimpleToken { + kind: Bogus, + range: 4..5, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_not_eq.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_not_eq.snap new file mode 100644 index 0000000000..00a7d9c1fc --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_not_eq.snap @@ -0,0 +1,14 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokens() +--- +[ + SimpleToken { + kind: NotEqual, + range: 0..2, + }, + SimpleToken { + kind: Equals, + range: 2..3, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_operators.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_operators.snap new file mode 100644 index 0000000000..f0f92f80c0 --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_operators.snap @@ -0,0 +1,106 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokens() +--- +[ + SimpleToken { + kind: RArrow, + range: 0..2, + }, + SimpleToken { + kind: Whitespace, + range: 2..3, + }, + SimpleToken { + kind: StarEqual, + range: 3..5, 
+ }, + SimpleToken { + kind: Whitespace, + range: 5..6, + }, + SimpleToken { + kind: LParen, + range: 6..7, + }, + SimpleToken { + kind: Whitespace, + range: 7..8, + }, + SimpleToken { + kind: MinusEqual, + range: 8..10, + }, + SimpleToken { + kind: Whitespace, + range: 10..11, + }, + SimpleToken { + kind: RParen, + range: 11..12, + }, + SimpleToken { + kind: Whitespace, + range: 12..13, + }, + SimpleToken { + kind: Tilde, + range: 13..14, + }, + SimpleToken { + kind: Whitespace, + range: 14..15, + }, + SimpleToken { + kind: DoubleSlash, + range: 15..17, + }, + SimpleToken { + kind: Whitespace, + range: 17..18, + }, + SimpleToken { + kind: DoubleStar, + range: 18..20, + }, + SimpleToken { + kind: Whitespace, + range: 20..21, + }, + SimpleToken { + kind: DoubleStarEqual, + range: 21..24, + }, + SimpleToken { + kind: Whitespace, + range: 24..25, + }, + SimpleToken { + kind: Circumflex, + range: 25..26, + }, + SimpleToken { + kind: Whitespace, + range: 26..27, + }, + SimpleToken { + kind: CircumflexEqual, + range: 27..29, + }, + SimpleToken { + kind: Whitespace, + range: 29..30, + }, + SimpleToken { + kind: Vbar, + range: 30..31, + }, + SimpleToken { + kind: Whitespace, + range: 31..32, + }, + SimpleToken { + kind: VbarEqual, + range: 32..34, + }, +] diff --git a/crates/ruff_python_trivia/src/tokenizer.rs b/crates/ruff_python_trivia/src/tokenizer.rs index 6da3e5d779..d767e1d59e 100644 --- a/crates/ruff_python_trivia/src/tokenizer.rs +++ b/crates/ruff_python_trivia/src/tokenizer.rs @@ -1,7 +1,8 @@ use memchr::{memchr2, memchr3, memrchr3_iter}; -use ruff_text_size::{TextLen, TextRange, TextSize}; use unic_ucd_ident::{is_xid_continue, is_xid_start}; +use ruff_text_size::{TextLen, TextRange, TextSize}; + use crate::{is_python_whitespace, Cursor}; /// Searches for the first non-trivia character in `range`. 
@@ -213,6 +214,78 @@ pub enum SimpleTokenKind { /// `~` Tilde, + /// `==` + EqEqual, + + /// `!=` + NotEqual, + + /// `<=` + LessEqual, + + /// `>=` + GreaterEqual, + + /// `<<` + LeftShift, + + /// `>>` + RightShift, + + /// `**` + DoubleStar, + + /// `**=` + DoubleStarEqual, + + /// `+=` + PlusEqual, + + /// `-=` + MinusEqual, + + /// `*=` + StarEqual, + + /// `/=` + SlashEqual, + + /// `%=` + PercentEqual, + + /// `&=` + AmperEqual, + + /// `|=` + VbarEqual, + + /// `^=` + CircumflexEqual, + + /// `<<=` + LeftShiftEqual, + + /// `>>=` + RightShiftEqual, + + /// `//` + DoubleSlash, + + /// `//=` + DoubleSlashEqual, + + /// `:=` + ColonEqual, + + /// `...` + Ellipsis, + + /// `@=` + AtEqual, + + /// `->` + RArrow, + /// `and` And, @@ -326,35 +399,6 @@ pub enum SimpleTokenKind { } impl SimpleTokenKind { - const fn from_non_trivia_char(c: char) -> SimpleTokenKind { - match c { - '(' => SimpleTokenKind::LParen, - ')' => SimpleTokenKind::RParen, - '[' => SimpleTokenKind::LBracket, - ']' => SimpleTokenKind::RBracket, - '{' => SimpleTokenKind::LBrace, - '}' => SimpleTokenKind::RBrace, - ',' => SimpleTokenKind::Comma, - ':' => SimpleTokenKind::Colon, - ';' => SimpleTokenKind::Semi, - '/' => SimpleTokenKind::Slash, - '*' => SimpleTokenKind::Star, - '.' 
=> SimpleTokenKind::Dot, - '+' => SimpleTokenKind::Plus, - '-' => SimpleTokenKind::Minus, - '=' => SimpleTokenKind::Equals, - '>' => SimpleTokenKind::Greater, - '<' => SimpleTokenKind::Less, - '%' => SimpleTokenKind::Percent, - '&' => SimpleTokenKind::Ampersand, - '^' => SimpleTokenKind::Circumflex, - '|' => SimpleTokenKind::Vbar, - '@' => SimpleTokenKind::At, - '~' => SimpleTokenKind::Tilde, - _ => SimpleTokenKind::Other, - } - } - const fn is_trivia(self) -> bool { matches!( self, @@ -478,6 +522,20 @@ impl<'a> SimpleTokenizer<'a> { } let kind = match first { + // Keywords and identifiers + c if is_identifier_start(c) => { + self.cursor.eat_while(is_identifier_continuation); + let token_len = self.cursor.token_len(); + + let range = TextRange::at(self.offset, token_len); + let kind = self.to_keyword_or_other(range); + + if kind == SimpleTokenKind::Other { + self.bogus = true; + } + kind + } + ' ' | '\t' => { self.cursor.eat_while(|c| matches!(c, ' ' | '\t')); SimpleTokenKind::Whitespace @@ -497,21 +555,156 @@ impl<'a> SimpleTokenizer<'a> { '\\' => SimpleTokenKind::Continuation, - c => { - let kind = if is_identifier_start(c) { - self.cursor.eat_while(is_identifier_continuation); - let token_len = self.cursor.token_len(); - - let range = TextRange::at(self.offset, token_len); - self.to_keyword_or_other(range) + // Non-trivia, non-keyword tokens + '=' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::EqEqual } else { - SimpleTokenKind::from_non_trivia_char(c) - }; - - if kind == SimpleTokenKind::Other { - self.bogus = true; + SimpleTokenKind::Equals } - kind + } + '+' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::PlusEqual + } else { + SimpleTokenKind::Plus + } + } + '*' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::StarEqual + } else if self.cursor.eat_char('*') { + if self.cursor.eat_char('=') { + SimpleTokenKind::DoubleStarEqual + } else { + SimpleTokenKind::DoubleStar + } + } else { + SimpleTokenKind::Star + } + } + '/' => { + if 
self.cursor.eat_char('=') { + SimpleTokenKind::SlashEqual + } else if self.cursor.eat_char('/') { + if self.cursor.eat_char('=') { + SimpleTokenKind::DoubleSlashEqual + } else { + SimpleTokenKind::DoubleSlash + } + } else { + SimpleTokenKind::Slash + } + } + '%' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::PercentEqual + } else { + SimpleTokenKind::Percent + } + } + '|' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::VbarEqual + } else { + SimpleTokenKind::Vbar + } + } + '^' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::CircumflexEqual + } else { + SimpleTokenKind::Circumflex + } + } + '&' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::AmperEqual + } else { + SimpleTokenKind::Ampersand + } + } + '-' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::MinusEqual + } else if self.cursor.eat_char('>') { + SimpleTokenKind::RArrow + } else { + SimpleTokenKind::Minus + } + } + '@' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::AtEqual + } else { + SimpleTokenKind::At + } + } + '!' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::NotEqual + } else { + self.bogus = true; + SimpleTokenKind::Other + } + } + '~' => SimpleTokenKind::Tilde, + ':' => { + if self.cursor.eat_char('=') { + SimpleTokenKind::ColonEqual + } else { + SimpleTokenKind::Colon + } + } + ';' => SimpleTokenKind::Semi, + '<' => { + if self.cursor.eat_char('<') { + if self.cursor.eat_char('=') { + SimpleTokenKind::LeftShiftEqual + } else { + SimpleTokenKind::LeftShift + } + } else if self.cursor.eat_char('=') { + SimpleTokenKind::LessEqual + } else { + SimpleTokenKind::Less + } + } + '>' => { + if self.cursor.eat_char('>') { + if self.cursor.eat_char('=') { + SimpleTokenKind::RightShiftEqual + } else { + SimpleTokenKind::RightShift + } + } else if self.cursor.eat_char('=') { + SimpleTokenKind::GreaterEqual + } else { + SimpleTokenKind::Greater + } + } + ',' => SimpleTokenKind::Comma, + '.' => { + if self.cursor.first() == '.' 
&& self.cursor.second() == '.' { + self.cursor.bump(); + self.cursor.bump(); + SimpleTokenKind::Ellipsis + } else { + SimpleTokenKind::Dot + } + } + + // Bracket tokens + '(' => SimpleTokenKind::LParen, + ')' => SimpleTokenKind::RParen, + '[' => SimpleTokenKind::LBracket, + ']' => SimpleTokenKind::RBracket, + '{' => SimpleTokenKind::LBrace, + '}' => SimpleTokenKind::RBrace, + + _ => { + self.bogus = true; + SimpleTokenKind::Other } }; @@ -612,38 +805,108 @@ impl<'a> SimpleTokenizer<'a> { } SimpleTokenKind::Comment - } else if c == '\\' { - SimpleTokenKind::Continuation } else { - let kind = if is_identifier_continuation(c) { - // if we only have identifier continuations but no start (e.g. 555) we - // don't want to consume the chars, so in that case, we want to rewind the - // cursor to here - let savepoint = self.cursor.clone(); - self.cursor.eat_back_while(is_identifier_continuation); + match c { + // Keywords and identifiers + c if is_identifier_continuation(c) => { + // if we only have identifier continuations but no start (e.g. 555) we + // don't want to consume the chars, so in that case, we want to rewind the + // cursor to here + let savepoint = self.cursor.clone(); + self.cursor.eat_back_while(is_identifier_continuation); - let token_len = self.cursor.token_len(); - let range = TextRange::at(self.back_offset - token_len, token_len); + let token_len = self.cursor.token_len(); + let range = TextRange::at(self.back_offset - token_len, token_len); - if self.source[range] - .chars() - .next() - .is_some_and(is_identifier_start) - { - self.to_keyword_or_other(range) - } else { - self.cursor = savepoint; + if self.source[range] + .chars() + .next() + .is_some_and(is_identifier_start) + { + self.to_keyword_or_other(range) + } else { + self.cursor = savepoint; + self.bogus = true; + SimpleTokenKind::Other + } + } + + // Non-trivia tokens that are unambiguous when lexing backwards. 
+ // In other words: these are characters that _don't_ appear at the + // end of a multi-character token (like `!=`). + '\\' => SimpleTokenKind::Continuation, + ':' => SimpleTokenKind::Colon, + '~' => SimpleTokenKind::Tilde, + '%' => SimpleTokenKind::Percent, + '|' => SimpleTokenKind::Vbar, + ',' => SimpleTokenKind::Comma, + ';' => SimpleTokenKind::Semi, + '(' => SimpleTokenKind::LParen, + ')' => SimpleTokenKind::RParen, + '[' => SimpleTokenKind::LBracket, + ']' => SimpleTokenKind::RBracket, + '{' => SimpleTokenKind::LBrace, + '}' => SimpleTokenKind::RBrace, + '&' => SimpleTokenKind::Ampersand, + '^' => SimpleTokenKind::Circumflex, + '+' => SimpleTokenKind::Plus, + '-' => SimpleTokenKind::Minus, + + // Non-trivia tokens that _are_ ambiguous when lexing backwards. + // In other words: these are characters that _might_ mark the end + // of a multi-character token (like `!=` or `->` or `//` or `**`). + '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => { + // This could be a single-character token, like `=` in `x = y`, or a + // multi-character token, like `+=` in `x += y`. It could also be a sequence + // of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's + // important that we produce the same token stream when lexing backwards as + // we do when lexing forwards. So, identify the range of the sequence, lex + // forwards, and return the last token. + let mut cursor = self.cursor.clone(); + cursor.eat_back_while(|c| { + matches!( + c, + ':' | '~' + | '%' + | '|' + | '&' + | '^' + | '+' + | '-' + | '=' + | '*' + | '/' + | '@' + | '!' + | '<' + | '>' + | '.' + ) + }); + + let token_len = cursor.token_len(); + let range = TextRange::at(self.back_offset - token_len, token_len); + + let forward_lexer = Self::new(self.source, range); + if let Some(token) = forward_lexer.last() { + // If the token spans multiple characters, bump the cursor. 
Note, + // though, that we already bumped the cursor to past the last character + // in the token at the very start of `next_token_back`. + for _ in self.source[token.range].chars().rev().skip(1) { + self.cursor.bump_back().unwrap(); + } + token.kind() + } else { + self.bogus = true; + SimpleTokenKind::Other + } + } + + _ => { + self.bogus = true; SimpleTokenKind::Other } - } else { - SimpleTokenKind::from_non_trivia_char(c) - }; - - if kind == SimpleTokenKind::Other { - self.bogus = true; } - - kind } } }; @@ -871,6 +1134,7 @@ impl QuoteKind { #[cfg(test)] mod tests { use insta::assert_debug_snapshot; + use ruff_text_size::{TextLen, TextRange, TextSize}; use crate::tokenizer::{lines_after, lines_before, SimpleToken, SimpleTokenizer}; @@ -946,6 +1210,30 @@ mod tests { test_case.assert_reverse_tokenization(); } + #[test] + fn tokenize_eq() { + // Should tokenize as `==`, then `=`, regardless of whether we're lexing forwards or + // backwards. + let source = "==="; + + let test_case = tokenize(source); + + assert_debug_snapshot!(test_case.tokens()); + test_case.assert_reverse_tokenization(); + } + + #[test] + fn tokenize_not_eq() { + // Should tokenize as `!=`, then `=`, regardless of whether we're lexing forwards or + // backwards. 
+ let source = "!=="; + + let test_case = tokenize(source); + + assert_debug_snapshot!(test_case.tokens()); + test_case.assert_reverse_tokenization(); + } + #[test] fn tokenize_continuation() { let source = "( \\\n )"; @@ -957,8 +1245,8 @@ mod tests { } #[test] - fn tokenize_characters() { - let source = "-> *= (~=)"; + fn tokenize_operators() { + let source = "-> *= ( -= ) ~ // ** **= ^ ^= | |="; let test_case = tokenize(source); @@ -966,6 +1254,17 @@ mod tests { test_case.assert_reverse_tokenization(); } + #[test] + fn tokenize_invalid_operators() { + let source = "-> $="; + + let test_case = tokenize(source); + + assert_debug_snapshot!(test_case.tokens()); + + // note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other] + } + #[test] fn tricky_unicode() { let source = "មុ";