use unicode_ident::{is_xid_continue, is_xid_start};

use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};

use crate::{is_python_whitespace, Cursor};

/// Searches for the first non-trivia character after `offset`.
///
/// The search skips over any whitespace and comments.
///
/// Returns `Some` if the source code after `offset` contains any non-trivia character.
///
/// Returns `None` if the text after `offset` is empty or only contains trivia (whitespace or
/// comments).
pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToken> {
    SimpleTokenizer::starts_at(offset, code)
        .skip_trivia()
        .next()
}

/// Returns the only non-trivia, non-closing parenthesis token in `range`.
///
/// Includes debug assertions that the range only contains that single token.
pub fn find_only_token_in_range(
    range: TextRange,
    token_kind: SimpleTokenKind,
    code: &str,
) -> SimpleToken {
    let mut tokens = SimpleTokenizer::new(code, range)
        .skip_trivia()
        .skip_while(|token| token.kind == SimpleTokenKind::RParen);
    let token = tokens.next().expect("Expected a token");
    debug_assert_eq!(token.kind(), token_kind);
    let mut tokens = tokens.skip_while(|token| token.kind == SimpleTokenKind::LParen);
    #[allow(clippy::debug_assert_with_mut_call)]
    {
        debug_assert_eq!(tokens.next(), None);
    }
    token
}

/// Returns the number of newlines between `offset` and the first non-whitespace character
/// preceding it in the source code.
pub fn lines_before(offset: TextSize, code: &str) -> u32 {
    let mut cursor = Cursor::new(&code[TextRange::up_to(offset)]);

    let mut newlines = 0u32;
    while let Some(c) = cursor.bump_back() {
        match c {
            '\n' => {
                cursor.eat_char_back('\r');
                newlines += 1;
            }
            '\r' => {
                newlines += 1;
            }
            c if is_python_whitespace(c) => {
                continue;
            }
            _ => {
                break;
            }
        }
    }

    newlines
}

/// Counts the newlines between `offset` and the first non-whitespace character following it.
pub fn lines_after(offset: TextSize, code: &str) -> u32 {
    let mut cursor = Cursor::new(&code[offset.to_usize()..]);

    let mut newlines = 0u32;
    while let Some(c) = cursor.bump() {
        match c {
            '\n' => {
                newlines += 1;
            }
            '\r' => {
                cursor.eat_char('\n');
                newlines += 1;
            }
            c if is_python_whitespace(c) => {
                continue;
            }
            _ => {
                break;
            }
        }
    }

    newlines
}

/// Counts the empty lines after `offset`, ignoring any trailing trivia: end-of-line comments,
/// own-line comments, and any intermediary newlines.
pub fn lines_after_ignoring_trivia(offset: TextSize, code: &str) -> u32 {
    let mut newlines = 0u32;
    for token in SimpleTokenizer::starts_at(offset, code) {
        match token.kind() {
            SimpleTokenKind::Newline => {
                newlines += 1;
            }
            SimpleTokenKind::Whitespace => {}
            // If we see a comment, reset the newlines counter.
            SimpleTokenKind::Comment => {
                newlines = 0;
            }
            // As soon as we see a non-trivia token, we're done.
            _ => {
                break;
            }
        }
    }
    newlines
}
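// A minimal usage sketch, not part of the original test suite: it exercises
// `lines_before` and `lines_after` around a single blank line, assuming the
// `Cursor` semantics shown above. The module name is our own.
#[cfg(test)]
mod line_count_examples {
    use super::*;

    #[test]
    fn counts_blank_line_between_statements() {
        let code = "x = 1\n\ny = 2";
        // Offset 7 points at `y`; two newlines separate it from `1`.
        assert_eq!(lines_before(TextSize::new(7), code), 2);
        // Offset 5 points just past `1`; the same two newlines follow it.
        assert_eq!(lines_after(TextSize::new(5), code), 2);
    }
}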
/// Counts the empty lines after `offset`, ignoring any trailing trivia on the same line as
/// `offset`.
#[allow(clippy::cast_possible_truncation)]
pub fn lines_after_ignoring_end_of_line_trivia(offset: TextSize, code: &str) -> u32 {
    // SAFETY: We don't support files greater than 4GB, so casting to u32 is safe.
    SimpleTokenizer::starts_at(offset, code)
        .skip_while(|token| token.kind != SimpleTokenKind::Newline && token.kind.is_trivia())
        .take_while(|token| {
            token.kind == SimpleTokenKind::Newline || token.kind == SimpleTokenKind::Whitespace
        })
        .filter(|token| token.kind == SimpleTokenKind::Newline)
        .count() as u32
}

fn is_identifier_start(c: char) -> bool {
    if c.is_ascii() {
        c.is_ascii_alphabetic() || c == '_'
    } else {
        is_xid_start(c)
    }
}

// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
    // Arrange things such that ASCII codepoints never
    // result in the slower `is_xid_continue` getting called.
    if c.is_ascii() {
        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
    } else {
        is_xid_continue(c)
    }
}

fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
    match source {
        "and" => SimpleTokenKind::And,
        "as" => SimpleTokenKind::As,
        "assert" => SimpleTokenKind::Assert,
        "async" => SimpleTokenKind::Async,
        "await" => SimpleTokenKind::Await,
        "break" => SimpleTokenKind::Break,
        "class" => SimpleTokenKind::Class,
        "continue" => SimpleTokenKind::Continue,
        "def" => SimpleTokenKind::Def,
        "del" => SimpleTokenKind::Del,
        "elif" => SimpleTokenKind::Elif,
        "else" => SimpleTokenKind::Else,
        "except" => SimpleTokenKind::Except,
        "finally" => SimpleTokenKind::Finally,
        "for" => SimpleTokenKind::For,
        "from" => SimpleTokenKind::From,
        "global" => SimpleTokenKind::Global,
        "if" => SimpleTokenKind::If,
        "import" => SimpleTokenKind::Import,
        "in" => SimpleTokenKind::In,
        "is" => SimpleTokenKind::Is,
        "lambda" => SimpleTokenKind::Lambda,
        "nonlocal" => SimpleTokenKind::Nonlocal,
        "not" => SimpleTokenKind::Not,
        "or" => SimpleTokenKind::Or,
        "pass" => SimpleTokenKind::Pass,
        "raise" => SimpleTokenKind::Raise,
        "return" => SimpleTokenKind::Return,
        "try" => SimpleTokenKind::Try,
        "while" => SimpleTokenKind::While,
        // `match` and `type` are soft keywords that depend on the context, but we can always
        // lex them as keywords and leave it to the caller (parser) to decide whether they
        // should be handled as identifiers or keywords.
        "match" => SimpleTokenKind::Match,
        "type" => SimpleTokenKind::Type,
        "case" => SimpleTokenKind::Case,
        "with" => SimpleTokenKind::With,
        "yield" => SimpleTokenKind::Yield,
        // Potentially an identifier, but only if it isn't a string prefix. The caller
        // (`SimpleTokenizer`) is responsible for enforcing that constraint.
        _ => SimpleTokenKind::Name,
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
    pub kind: SimpleTokenKind,
    pub range: TextRange,
}

impl SimpleToken {
    pub const fn kind(&self) -> SimpleTokenKind {
        self.kind
    }
}

impl Ranged for SimpleToken {
    fn range(&self) -> TextRange {
        self.range
    }
}
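// A small sketch, not from the original suite, illustrating `to_keyword_or_other`:
// keywords (including the soft keywords `match` and `type`) map to their kinds,
// everything else falls back to `Name`.
#[cfg(test)]
mod keyword_mapping_examples {
    use super::*;

    #[test]
    fn maps_keywords_and_names() {
        assert_eq!(to_keyword_or_other("def"), SimpleTokenKind::Def);
        assert_eq!(to_keyword_or_other("match"), SimpleTokenKind::Match);
        assert_eq!(to_keyword_or_other("foo"), SimpleTokenKind::Name);
    }
}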
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum SimpleTokenKind {
    /// A comment, not including the trailing newline.
    Comment,
    /// Sequence of ' ' or '\t'
    Whitespace,
    /// Start or end of the file
    EndOfFile,
    /// `\\`
    Continuation,
    /// `\n` or `\r` or `\r\n`
    Newline,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `;`
    Semi,
    /// `/`
    Slash,
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `=`
    Equals,
    /// `>`
    Greater,
    /// `<`
    Less,
    /// `%`
    Percent,
    /// `&`
    Ampersand,
    /// `^`
    Circumflex,
    /// `|`
    Vbar,
    /// `@`
    At,
    /// `~`
    Tilde,
    /// `==`
    EqEqual,
    /// `!=`
    NotEqual,
    /// `<=`
    LessEqual,
    /// `>=`
    GreaterEqual,
    /// `<<`
    LeftShift,
    /// `>>`
    RightShift,
    /// `**`
    DoubleStar,
    /// `**=`
    DoubleStarEqual,
    /// `+=`
    PlusEqual,
    /// `-=`
    MinusEqual,
    /// `*=`
    StarEqual,
    /// `/=`
    SlashEqual,
    /// `%=`
    PercentEqual,
    /// `&=`
    AmperEqual,
    /// `|=`
    VbarEqual,
    /// `^=`
    CircumflexEqual,
    /// `<<=`
    LeftShiftEqual,
    /// `>>=`
    RightShiftEqual,
    /// `//`
    DoubleSlash,
    /// `//=`
    DoubleSlashEqual,
    /// `:=`
    ColonEqual,
    /// `...`
    Ellipsis,
    /// `@=`
    AtEqual,
    /// `->`
    RArrow,
    /// `and`
    And,
    /// `as`
    As,
    /// `assert`
    Assert,
    /// `async`
    Async,
    /// `await`
    Await,
    /// `break`
    Break,
    /// `class`
    Class,
    /// `continue`
    Continue,
    /// `def`
    Def,
    /// `del`
    Del,
    /// `elif`
    Elif,
    /// `else`
    Else,
    /// `except`
    Except,
    /// `finally`
    Finally,
    /// `for`
    For,
    /// `from`
    From,
    /// `global`
    Global,
    /// `if`
    If,
    /// `import`
    Import,
    /// `in`
    In,
    /// `is`
    Is,
    /// `lambda`
    Lambda,
    /// `nonlocal`
    Nonlocal,
    /// `not`
    Not,
    /// `or`
    Or,
    /// `pass`
    Pass,
    /// `raise`
    Raise,
    /// `return`
    Return,
    /// `try`
    Try,
    /// `while`
    While,
    /// `match`
    Match,
    /// `type`
    Type,
    /// `case`
    Case,
    /// `with`
    With,
    /// `yield`
    Yield,
    /// An identifier or keyword.
    Name,
    /// Any other non-trivia token.
    Other,
    /// Returned for each character after [`SimpleTokenKind::Other`] has been returned once.
    Bogus,
}

impl SimpleTokenKind {
    pub const fn is_trivia(self) -> bool {
        matches!(
            self,
            SimpleTokenKind::Whitespace
                | SimpleTokenKind::Newline
                | SimpleTokenKind::Comment
                | SimpleTokenKind::Continuation
        )
    }

    pub const fn is_comment(self) -> bool {
        matches!(self, SimpleTokenKind::Comment)
    }
}
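// A minimal sketch, not from the original suite, showing how `is_trivia`
// classifies token kinds: only whitespace, newlines, comments, and line
// continuations count as trivia.
#[cfg(test)]
mod trivia_kind_examples {
    use super::*;

    #[test]
    fn classifies_trivia_kinds() {
        assert!(SimpleTokenKind::Whitespace.is_trivia());
        assert!(SimpleTokenKind::Newline.is_trivia());
        assert!(SimpleTokenKind::Comment.is_trivia());
        assert!(SimpleTokenKind::Continuation.is_trivia());
        assert!(!SimpleTokenKind::Comma.is_trivia());
        assert!(!SimpleTokenKind::Name.is_trivia());
    }
}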
/// Simple zero-allocation tokenizer handling most tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct SimpleTokenizer<'a> {
    offset: TextSize,
    /// `true` once the tokenizer has encountered a token it can't parse; the next call then
    /// emits a single [`SimpleTokenKind::Bogus`] token covering the rest of the source.
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
}

impl<'a> SimpleTokenizer<'a> {
    pub fn new(source: &'a str, range: TextRange) -> Self {
        Self {
            offset: range.start(),
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    pub fn starts_at(offset: TextSize, source: &'a str) -> Self {
        let range = TextRange::new(offset, source.text_len());
        Self::new(source, range)
    }

    fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();

        let Some(first) = self.cursor.bump() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.offset),
            };
        };

        if self.bogus {
            // Emit a single final bogus token
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::new(self.offset, self.source.text_len()),
            };

            // Set the cursor to EOF
            self.cursor = Cursor::new("");
            self.offset = self.source.text_len();
            return token;
        }

        let kind = self.next_token_inner(first);

        let token_len = self.cursor.token_len();

        let token = SimpleToken {
            kind,
            range: TextRange::at(self.offset, token_len),
        };

        self.offset += token_len;

        token
    }

    fn next_token_inner(&mut self, first: char) -> SimpleTokenKind {
        match first {
            // Keywords and identifiers
            c if is_identifier_start(c) => {
                self.cursor.eat_while(is_identifier_continuation);
                let token_len = self.cursor.token_len();

                let range = TextRange::at(self.offset, token_len);
                let kind = to_keyword_or_other(&self.source[range]);

                // If the next character is a quote, we may be in a string prefix. For example:
                // `f"foo`.
                if kind == SimpleTokenKind::Name
                    && matches!(self.cursor.first(), '"' | '\'')
                    && matches!(
                        &self.source[range],
                        "B" | "BR" | "Br" | "F" | "FR" | "Fr" | "R" | "RB" | "RF" | "Rb" | "Rf"
                            | "U" | "b" | "bR" | "br" | "f" | "fR" | "fr" | "r" | "rB" | "rF"
                            | "rb" | "rf" | "u"
                    )
                {
                    self.bogus = true;
                    SimpleTokenKind::Other
                } else {
                    kind
                }
            }

            // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it
            // as whitespace.
            ' ' | '\t' | '\x0C' => {
                self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
                SimpleTokenKind::Whitespace
            }

            '\n' => SimpleTokenKind::Newline,

            '\r' => {
                self.cursor.eat_char('\n');
                SimpleTokenKind::Newline
            }

            '#' => {
                self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
                SimpleTokenKind::Comment
            }

            '\\' => SimpleTokenKind::Continuation,

            // Non-trivia, non-keyword tokens
            '=' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::EqEqual
                } else {
                    SimpleTokenKind::Equals
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::PlusEqual
                } else {
                    SimpleTokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::DoubleStarEqual
                    } else {
                        SimpleTokenKind::DoubleStar
                    }
                } else {
                    SimpleTokenKind::Star
                }
            }
            '/' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::DoubleSlashEqual
                    } else {
                        SimpleTokenKind::DoubleSlash
                    }
                } else {
                    SimpleTokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::PercentEqual
                } else {
                    SimpleTokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::VbarEqual
                } else {
                    SimpleTokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::CircumflexEqual
                } else {
                    SimpleTokenKind::Circumflex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::AmperEqual
                } else {
                    SimpleTokenKind::Ampersand
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    SimpleTokenKind::RArrow
                } else {
                    SimpleTokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::AtEqual
                } else {
                    SimpleTokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::NotEqual
                } else {
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            '~' => SimpleTokenKind::Tilde,
            ':' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::ColonEqual
                } else {
                    SimpleTokenKind::Colon
                }
            }
            ';' => SimpleTokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::LeftShiftEqual
                    } else {
                        SimpleTokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    SimpleTokenKind::LessEqual
                } else {
                    SimpleTokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::RightShiftEqual
                    } else {
                        SimpleTokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    SimpleTokenKind::GreaterEqual
                } else {
                    SimpleTokenKind::Greater
                }
            }
            ',' => SimpleTokenKind::Comma,
            '.' => {
                if self.cursor.first() == '.' && self.cursor.second() == '.' {
                    self.cursor.bump();
                    self.cursor.bump();
                    SimpleTokenKind::Ellipsis
                } else {
                    SimpleTokenKind::Dot
                }
            }

            // Bracket tokens
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,

            _ => {
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }

    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }
}

impl Iterator for SimpleTokenizer<'_> {
    type Item = SimpleToken;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();

        if token.kind == SimpleTokenKind::EndOfFile {
            None
        } else {
            Some(token)
        }
    }
}
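// A minimal sketch, not from the original suite, of driving `SimpleTokenizer`
// forwards: trivia (whitespace and comments) is filtered out by `skip_trivia`,
// leaving only the meaningful tokens.
#[cfg(test)]
mod forward_tokenizer_examples {
    use super::*;

    #[test]
    fn skips_whitespace_and_comments() {
        let code = "a + b  # trailing comment";
        let kinds: Vec<_> = SimpleTokenizer::starts_at(TextSize::new(0), code)
            .skip_trivia()
            .map(|token| token.kind())
            .collect();
        assert_eq!(
            kinds,
            [SimpleTokenKind::Name, SimpleTokenKind::Plus, SimpleTokenKind::Name]
        );
    }
}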
/// Simple zero-allocation backwards tokenizer for finding preceding tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
/// It will fail when reaching a string.
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct BackwardsTokenizer<'a> {
    offset: TextSize,
    back_offset: TextSize,
    /// Not `&CommentRanges` to avoid a circular dependency.
    comment_ranges: &'a [TextRange],
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
}

impl<'a> BackwardsTokenizer<'a> {
    pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
        Self {
            offset: range.start(),
            back_offset: range.end(),
            // Throw out any comments that follow the range.
            comment_ranges: &comment_range
                [..comment_range.partition_point(|comment| comment.start() <= range.end())],
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
        Self::new(source, TextRange::up_to(offset), comment_range)
    }

    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }

    pub fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();
        self.back_offset = self.cursor.text_len() + self.offset;

        let Some(last) = self.cursor.bump_back() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.back_offset),
            };
        };

        if self.bogus {
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::up_to(self.back_offset),
            };

            // Set the cursor to EOF
            self.cursor = Cursor::new("");
            self.back_offset = TextSize::new(0);
            return token;
        }

        if let Some(comment) = self
            .comment_ranges
            .last()
            .filter(|comment| comment.contains_inclusive(self.back_offset))
        {
            self.comment_ranges = &self.comment_ranges[..self.comment_ranges.len() - 1];

            // Skip the comment without iterating over the chars manually.
            self.cursor = Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
            debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());

            return SimpleToken {
                kind: SimpleTokenKind::Comment,
                range: comment.range(),
            };
        }

        let kind = match last {
            // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it
            // as whitespace. Note that this will lex out trailing whitespace from a comment as
            // whitespace rather than as part of the comment token, but this shouldn't matter for
            // our use case.
            ' ' | '\t' | '\x0C' => {
                self.cursor
                    .eat_back_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
                SimpleTokenKind::Whitespace
            }
            '\r' => SimpleTokenKind::Newline,
            '\n' => {
                self.cursor.eat_char_back('\r');
                SimpleTokenKind::Newline
            }
            _ => self.next_token_inner(last),
        };

        let token_len = self.cursor.token_len();
        let start = self.back_offset - token_len;
        SimpleToken {
            kind,
            range: TextRange::at(start, token_len),
        }
    }

    /// Helper to parse the previous token once we've skipped all whitespace.
    fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
        match last {
            // Keywords and identifiers
            c if is_identifier_continuation(c) => {
                // If we only have identifier continuations but no start (e.g. `555`), we don't
                // want to consume the chars, so in that case we rewind the cursor to here.
                let savepoint = self.cursor.clone();
                self.cursor.eat_back_while(is_identifier_continuation);
                let token_len = self.cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                if self.source[range]
                    .chars()
                    .next()
                    .is_some_and(is_identifier_start)
                {
                    to_keyword_or_other(&self.source[range])
                } else {
                    self.cursor = savepoint;
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }

            // Non-trivia tokens that are unambiguous when lexing backwards.
            // In other words: these are characters that _don't_ appear at the
            // end of a multi-character token (like `!=`).
            '\\' => SimpleTokenKind::Continuation,
            ':' => SimpleTokenKind::Colon,
            '~' => SimpleTokenKind::Tilde,
            '%' => SimpleTokenKind::Percent,
            '|' => SimpleTokenKind::Vbar,
            ',' => SimpleTokenKind::Comma,
            ';' => SimpleTokenKind::Semi,
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,
            '&' => SimpleTokenKind::Ampersand,
            '^' => SimpleTokenKind::Circumflex,
            '+' => SimpleTokenKind::Plus,
            '-' => SimpleTokenKind::Minus,

            // Non-trivia tokens that _are_ ambiguous when lexing backwards.
            // In other words: these are characters that _might_ mark the end
            // of a multi-character token (like `!=` or `->` or `//` or `**`).
            '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
                // This could be a single-character token, like `=` in `x = y`, or the last
                // character of a multi-character token, like `+=` in `x += y`. It could also be
                // a sequence of multi-character tokens, like `x ==== y`, which is invalid, _but_
                // it's important that we produce the same token stream when lexing backwards as
                // we do when lexing forwards. So, identify the range of the sequence, lex
                // forwards, and return the last token.
                let mut cursor = self.cursor.clone();
                cursor.eat_back_while(|c| {
                    matches!(
                        c,
                        ':' | '~' | '%' | '|' | '&' | '^' | '+' | '-' | '=' | '*' | '/' | '@'
                            | '!' | '<' | '>' | '.'
                    )
                });

                let token_len = cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                let forward_lexer = SimpleTokenizer::new(self.source, range);
                if let Some(token) = forward_lexer.last() {
                    // If the token spans multiple characters, bump the cursor. Note, though,
                    // that we already bumped the cursor past the last character in the token at
                    // the very start of `next_token`.
                    for _ in self.source[token.range].chars().rev().skip(1) {
                        self.cursor.bump_back().unwrap();
                    }
                    token.kind()
                } else {
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }

            _ => {
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }
}

impl Iterator for BackwardsTokenizer<'_> {
    type Item = SimpleToken;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();

        if token.kind == SimpleTokenKind::EndOfFile {
            None
        } else {
            Some(token)
        }
    }
}
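// A minimal sketch, not from the original suite, of the backwards tokenizer:
// starting at the end of the source it yields tokens in reverse order, and the
// ambiguous trailing `=` is resolved to `!=` by re-lexing forwards. The source
// contains no comments, so an empty comment-range slice is passed.
#[cfg(test)]
mod backwards_tokenizer_examples {
    use super::*;

    #[test]
    fn lexes_preceding_tokens_in_reverse() {
        let code = "a != b";
        let kinds: Vec<_> = BackwardsTokenizer::up_to(code.text_len(), code, &[])
            .skip_trivia()
            .map(|token| token.kind())
            .collect();
        // Reverse order: `b`, then `!=`, then `a`.
        assert_eq!(
            kinds,
            [SimpleTokenKind::Name, SimpleTokenKind::NotEqual, SimpleTokenKind::Name]
        );
    }
}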