ruff/crates/ruff_python_parser/src/lexer.rs
//! This module takes care of lexing Python source text.
//!
//! This means source code is scanned and translated into separate tokens. The rules
//! governing what is and is not a valid token are defined in the Python reference
//! guide section on [Lexical analysis].
//!
//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
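//!
//! A rough usage sketch of this module's API (marked `ignore`, so it is only illustrative
//! and not run as a doctest):
//!
//! ```ignore
//! let mut lexer = lex("x = 1\n", Mode::Module);
//! loop {
//!     let kind = lexer.next_token();
//!     if matches!(kind, TokenKind::EndOfFile) {
//!         break;
//!     }
//!     // `current_kind()` and `current_range()` describe the token that was just lexed.
//! }
//! // Any lexical errors encountered along the way are returned at the end.
//! let errors = lexer.finish();
//! ```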
use std::cmp::Ordering;
use std::str::FromStr;
use unicode_ident::{is_xid_continue, is_xid_start};
use unicode_normalization::UnicodeNormalization;
use ruff_python_ast::name::Name;
use ruff_python_ast::{Int, IpyEscapeKind, StringFlags};
use ruff_python_trivia::is_python_whitespace;
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::error::{FStringErrorType, LexicalError, LexicalErrorType};
use crate::lexer::cursor::{Cursor, EOF_CHAR};
use crate::lexer::fstring::{FStringContext, FStrings, FStringsCheckpoint};
use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
use crate::token::{TokenFlags, TokenKind, TokenValue};
use crate::Mode;
mod cursor;
mod fstring;
mod indentation;
const BOM: char = '\u{feff}';
/// A lexer for Python source code.
#[derive(Debug)]
pub struct Lexer<'src> {
/// Source code to be lexed.
source: &'src str,
/// A pointer to the current character of the source code which is being lexed.
cursor: Cursor<'src>,
/// The kind of the current token.
current_kind: TokenKind,
/// The range of the current token.
current_range: TextRange,
/// The value of the current token.
current_value: TokenValue,
/// Flags for the current token.
current_flags: TokenFlags,
/// Lexer state.
state: State,
/// Represents the current level of nesting in the lexer, indicating the depth of parentheses.
/// The lexer is within a parenthesized context if the value is greater than 0.
nesting: u32,
/// A stack of indentation representing the current indentation level.
indentations: Indentations,
pending_indentation: Option<Indentation>,
/// Lexer mode.
mode: Mode,
/// F-string contexts.
fstrings: FStrings,
/// Errors encountered while lexing.
errors: Vec<LexicalError>,
}
impl<'src> Lexer<'src> {
/// Create a new lexer for the given input source which starts at the given offset.
///
/// If the start offset is greater than 0, the cursor is moved ahead that many bytes.
/// This means that the input source should be the complete source code and not the
/// sliced version.
pub(crate) fn new(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
assert!(
u32::try_from(source.len()).is_ok(),
"Lexer only supports files with a size up to 4GB"
);
let (state, nesting) = if mode == Mode::ParenthesizedExpression {
(State::Other, 1)
} else {
(State::AfterNewline, 0)
};
let mut lexer = Lexer {
source,
cursor: Cursor::new(source),
state,
current_kind: TokenKind::EndOfFile,
current_range: TextRange::empty(start_offset),
current_value: TokenValue::None,
current_flags: TokenFlags::empty(),
nesting,
indentations: Indentations::default(),
pending_indentation: None,
mode,
fstrings: FStrings::default(),
errors: Vec::new(),
};
if start_offset == TextSize::new(0) {
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
lexer.cursor.eat_char(BOM);
} else {
lexer.cursor.skip_bytes(start_offset.to_usize());
}
lexer
}
/// Returns the kind of the current token.
pub(crate) fn current_kind(&self) -> TokenKind {
self.current_kind
}
/// Returns the range of the current token.
pub(crate) fn current_range(&self) -> TextRange {
self.current_range
}
/// Returns the flags for the current token.
pub(crate) fn current_flags(&self) -> TokenFlags {
self.current_flags
}
/// Takes the token value corresponding to the current token out of the lexer, replacing it
/// with the default value.
///
/// All subsequent calls to this method without moving the lexer will return the
/// default value, which is [`TokenValue::None`].
pub(crate) fn take_value(&mut self) -> TokenValue {
std::mem::take(&mut self.current_value)
}
/// Helper function to push the given error, update the current range to the error
/// location, and return the [`TokenKind::Unknown`] token.
fn push_error(&mut self, error: LexicalError) -> TokenKind {
self.current_range = error.location();
self.errors.push(error);
TokenKind::Unknown
}
/// Lex the next token.
pub fn next_token(&mut self) -> TokenKind {
self.cursor.start_token();
self.current_value = TokenValue::None;
self.current_flags = TokenFlags::empty();
self.current_kind = self.lex_token();
// For `Unknown` token, the `push_error` method updates the current range.
if !matches!(self.current_kind, TokenKind::Unknown) {
self.current_range = self.token_range();
}
self.current_kind
}
fn lex_token(&mut self) -> TokenKind {
if let Some(fstring) = self.fstrings.current() {
if !fstring.is_in_expression(self.nesting) {
if let Some(token) = self.lex_fstring_middle_or_end() {
if matches!(token, TokenKind::FStringEnd) {
self.fstrings.pop();
}
return token;
}
}
}
// Return dedent tokens until the current indentation level matches the indentation of the next token.
else if let Some(indentation) = self.pending_indentation.take() {
match self.indentations.current().try_compare(indentation) {
Ok(Ordering::Greater) => {
self.pending_indentation = Some(indentation);
if self.indentations.dedent_one(indentation).is_err() {
return self.push_error(LexicalError::new(
LexicalErrorType::IndentationError,
self.token_range(),
));
}
return TokenKind::Dedent;
}
Ok(_) => {}
Err(_) => {
return self.push_error(LexicalError::new(
LexicalErrorType::IndentationError,
self.token_range(),
));
}
}
}
if self.state.is_after_newline() {
if let Some(indentation) = self.eat_indentation() {
return indentation;
}
} else {
if let Err(error) = self.skip_whitespace() {
return self.push_error(error);
}
}
// The lexer might've skipped whitespaces, so update the start offset
self.cursor.start_token();
if let Some(c) = self.cursor.bump() {
if c.is_ascii() {
self.consume_ascii_character(c)
} else if is_unicode_identifier_start(c) {
let identifier = self.lex_identifier(c);
self.state = State::Other;
identifier
} else {
self.push_error(LexicalError::new(
LexicalErrorType::UnrecognizedToken { tok: c },
self.token_range(),
))
}
} else {
// Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line,
// empty the dedent stack, and finally, return the EndOfFile token.
self.consume_end()
}
}
fn eat_indentation(&mut self) -> Option<TokenKind> {
let mut indentation = Indentation::root();
loop {
match self.cursor.first() {
' ' => {
self.cursor.bump();
indentation = indentation.add_space();
}
'\t' => {
self.cursor.bump();
indentation = indentation.add_tab();
}
'\\' => {
self.cursor.bump();
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else if self.cursor.is_eof() {
return Some(self.push_error(LexicalError::new(
LexicalErrorType::Eof,
self.token_range(),
)));
} else if !self.cursor.eat_char('\n') {
return Some(self.push_error(LexicalError::new(
LexicalErrorType::LineContinuationError,
TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
)));
}
indentation = Indentation::root();
}
// Form feed
'\x0C' => {
self.cursor.bump();
indentation = Indentation::root();
}
_ => break,
}
}
// Handle indentation if this is a new, non-blank logical line
if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
self.state = State::NonEmptyLogicalLine;
// Update the state so that we don't handle indentation again on the next call.
return self.handle_indentation(indentation);
}
None
}
fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
let token = match self.indentations.current().try_compare(indentation) {
// Dedent
Ok(Ordering::Greater) => {
self.pending_indentation = Some(indentation);
if self.indentations.dedent_one(indentation).is_err() {
return Some(self.push_error(LexicalError::new(
LexicalErrorType::IndentationError,
self.token_range(),
)));
};
// The lexer might've eaten some whitespaces to calculate the `indentation`. For
// example:
//
// ```py
// if first:
// if second:
// pass
// foo
// # ^
// ```
//
// Here, the cursor is at `^` and the `indentation` contains the whitespaces before
// the `pass` token.
self.cursor.start_token();
Some(TokenKind::Dedent)
}
Ok(Ordering::Equal) => None,
// Indent
Ok(Ordering::Less) => {
self.indentations.indent(indentation);
Some(TokenKind::Indent)
}
Err(_) => {
return Some(self.push_error(LexicalError::new(
LexicalErrorType::IndentationError,
self.token_range(),
)));
}
};
token
}
fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
loop {
match self.cursor.first() {
' ' => {
self.cursor.bump();
}
'\t' => {
self.cursor.bump();
}
'\\' => {
self.cursor.bump();
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else if self.cursor.is_eof() {
return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
} else if !self.cursor.eat_char('\n') {
return Err(LexicalError::new(
LexicalErrorType::LineContinuationError,
TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
));
}
}
// Form feed
'\x0C' => {
self.cursor.bump();
}
_ => break,
}
}
Ok(())
}
// Dispatch based on the given character.
fn consume_ascii_character(&mut self, c: char) -> TokenKind {
let token = match c {
c if is_ascii_identifier_start(c) => self.lex_identifier(c),
'0'..='9' => self.lex_number(c),
'#' => return self.lex_comment(),
'\'' | '"' => self.lex_string(c),
'=' => {
if self.cursor.eat_char('=') {
TokenKind::EqEqual
} else {
self.state = State::AfterEqual;
return TokenKind::Equal;
}
}
'+' => {
if self.cursor.eat_char('=') {
TokenKind::PlusEqual
} else {
TokenKind::Plus
}
}
'*' => {
if self.cursor.eat_char('=') {
TokenKind::StarEqual
} else if self.cursor.eat_char('*') {
if self.cursor.eat_char('=') {
TokenKind::DoubleStarEqual
} else {
TokenKind::DoubleStar
}
} else {
TokenKind::Star
}
}
c @ ('%' | '!')
if self.mode == Mode::Ipython
&& self.state.is_after_equal()
&& self.nesting == 0 =>
{
// SAFETY: Safe because `c` has been matched against one of the possible escape command tokens
self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap())
}
c @ ('%' | '!' | '?' | '/' | ';' | ',')
if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
{
let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) {
self.cursor.bump();
kind
} else {
// SAFETY: Safe because `c` has been matched against one of the possible escape command tokens
IpyEscapeKind::try_from(c).unwrap()
};
self.lex_ipython_escape_command(kind)
}
'?' if self.mode == Mode::Ipython => TokenKind::Question,
'/' => {
if self.cursor.eat_char('=') {
TokenKind::SlashEqual
} else if self.cursor.eat_char('/') {
if self.cursor.eat_char('=') {
TokenKind::DoubleSlashEqual
} else {
TokenKind::DoubleSlash
}
} else {
TokenKind::Slash
}
}
'%' => {
if self.cursor.eat_char('=') {
TokenKind::PercentEqual
} else {
TokenKind::Percent
}
}
'|' => {
if self.cursor.eat_char('=') {
TokenKind::VbarEqual
} else {
TokenKind::Vbar
}
}
'^' => {
if self.cursor.eat_char('=') {
TokenKind::CircumflexEqual
} else {
TokenKind::CircumFlex
}
}
'&' => {
if self.cursor.eat_char('=') {
TokenKind::AmperEqual
} else {
TokenKind::Amper
}
}
'-' => {
if self.cursor.eat_char('=') {
TokenKind::MinusEqual
} else if self.cursor.eat_char('>') {
TokenKind::Rarrow
} else {
TokenKind::Minus
}
}
'@' => {
if self.cursor.eat_char('=') {
TokenKind::AtEqual
} else {
TokenKind::At
}
}
'!' => {
if self.cursor.eat_char('=') {
TokenKind::NotEqual
} else {
TokenKind::Exclamation
}
}
'~' => TokenKind::Tilde,
'(' => {
self.nesting += 1;
TokenKind::Lpar
}
')' => {
self.nesting = self.nesting.saturating_sub(1);
TokenKind::Rpar
}
'[' => {
self.nesting += 1;
TokenKind::Lsqb
}
']' => {
self.nesting = self.nesting.saturating_sub(1);
TokenKind::Rsqb
}
'{' => {
self.nesting += 1;
TokenKind::Lbrace
}
'}' => {
if let Some(fstring) = self.fstrings.current_mut() {
if fstring.nesting() == self.nesting {
return self.push_error(LexicalError::new(
LexicalErrorType::FStringError(FStringErrorType::SingleRbrace),
self.token_range(),
));
}
fstring.try_end_format_spec(self.nesting);
}
self.nesting = self.nesting.saturating_sub(1);
TokenKind::Rbrace
}
':' => {
if self
.fstrings
.current_mut()
.is_some_and(|fstring| fstring.try_start_format_spec(self.nesting))
{
TokenKind::Colon
} else if self.cursor.eat_char('=') {
TokenKind::ColonEqual
} else {
TokenKind::Colon
}
}
';' => TokenKind::Semi,
'<' => {
if self.cursor.eat_char('<') {
if self.cursor.eat_char('=') {
TokenKind::LeftShiftEqual
} else {
TokenKind::LeftShift
}
} else if self.cursor.eat_char('=') {
TokenKind::LessEqual
} else {
TokenKind::Less
}
}
'>' => {
if self.cursor.eat_char('>') {
if self.cursor.eat_char('=') {
TokenKind::RightShiftEqual
} else {
TokenKind::RightShift
}
} else if self.cursor.eat_char('=') {
TokenKind::GreaterEqual
} else {
TokenKind::Greater
}
}
',' => TokenKind::Comma,
'.' => {
if self.cursor.first().is_ascii_digit() {
self.lex_decimal_number('.')
} else if self.cursor.eat_char2('.', '.') {
TokenKind::Ellipsis
} else {
TokenKind::Dot
}
}
'\n' => {
return if self.nesting == 0 && !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
TokenKind::Newline
} else {
if let Some(fstring) = self.fstrings.current_mut() {
fstring.try_end_format_spec(self.nesting);
}
TokenKind::NonLogicalNewline
}
}
'\r' => {
self.cursor.eat_char('\n');
return if self.nesting == 0 && !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
TokenKind::Newline
} else {
if let Some(fstring) = self.fstrings.current_mut() {
fstring.try_end_format_spec(self.nesting);
}
TokenKind::NonLogicalNewline
};
}
_ => {
self.state = State::Other;
return self.push_error(LexicalError::new(
LexicalErrorType::UnrecognizedToken { tok: c },
self.token_range(),
));
}
};
self.state = State::Other;
token
}
/// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
fn lex_identifier(&mut self, first: char) -> TokenKind {
// Detect potential string like rb'' b'' f'' u'' r''
let quote = match (first, self.cursor.first()) {
(_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| {
self.cursor.bump();
quote
}),
(_, second) if is_quote(self.cursor.second()) => {
self.try_double_char_prefix([first, second]).then(|| {
self.cursor.bump();
// SAFETY: Safe because of the `is_quote` check in this match arm's guard
self.cursor.bump().unwrap()
})
}
_ => None,
};
if let Some(quote) = quote {
if self.current_flags.is_f_string() {
return self.lex_fstring_start(quote);
}
return self.lex_string(quote);
}
// Keep track of whether the identifier is ASCII-only or not.
//
// This is important because Python applies NFKC normalization to
// identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
// We need to therefore do the same in our lexer, but applying NFKC normalization
// unconditionally is extremely expensive. If we know an identifier is ASCII-only,
// (by far the most common case), we can skip NFKC normalization of the identifier.
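// For example, `𝒞` (U+1D49E) NFKC-normalizes to `C`, so `𝒞 = 500` and `C = 500`
// produce identical token streams (see `test_nfkc_normalization` below).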
let mut is_ascii = first.is_ascii();
self.cursor
.eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
let text = self.token_text();
if !is_ascii {
self.current_value = TokenValue::Name(text.nfkc().collect::<Name>());
return TokenKind::Name;
}
// Short circuit for names that are longer than any known keyword.
// It helps Rust predict that the `Name::new` call in the keyword match's default branch
// always produces a name that fits into a stack-allocated (inline) `Name`.
if text.len() > 8 {
self.current_value = TokenValue::Name(Name::new(text));
return TokenKind::Name;
}
match text {
"False" => TokenKind::False,
"None" => TokenKind::None,
"True" => TokenKind::True,
"and" => TokenKind::And,
"as" => TokenKind::As,
"assert" => TokenKind::Assert,
"async" => TokenKind::Async,
"await" => TokenKind::Await,
"break" => TokenKind::Break,
"case" => TokenKind::Case,
"class" => TokenKind::Class,
"continue" => TokenKind::Continue,
"def" => TokenKind::Def,
"del" => TokenKind::Del,
"elif" => TokenKind::Elif,
"else" => TokenKind::Else,
"except" => TokenKind::Except,
"finally" => TokenKind::Finally,
"for" => TokenKind::For,
"from" => TokenKind::From,
"global" => TokenKind::Global,
"if" => TokenKind::If,
"import" => TokenKind::Import,
"in" => TokenKind::In,
"is" => TokenKind::Is,
"lambda" => TokenKind::Lambda,
"match" => TokenKind::Match,
"nonlocal" => TokenKind::Nonlocal,
"not" => TokenKind::Not,
"or" => TokenKind::Or,
"pass" => TokenKind::Pass,
"raise" => TokenKind::Raise,
"return" => TokenKind::Return,
"try" => TokenKind::Try,
"type" => TokenKind::Type,
"while" => TokenKind::While,
"with" => TokenKind::With,
"yield" => TokenKind::Yield,
_ => {
self.current_value = TokenValue::Name(Name::new(text));
TokenKind::Name
}
}
}
/// Try lexing the single character string prefix, updating the token flags accordingly.
/// Returns `true` if it matches.
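/// For example, `f`/`F` sets [`TokenFlags::F_STRING`] and `b`/`B` sets
/// [`TokenFlags::BYTE_STRING`]; any other character means there is no string prefix.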
fn try_single_char_prefix(&mut self, first: char) -> bool {
match first {
'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE,
_ => return false,
}
true
}
/// Try lexing the double character string prefix, updating the token flags accordingly.
/// Returns `true` if it matches.
fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
match value {
['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE;
}
['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
}
['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
}
['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE;
}
_ => return false,
}
true
}
/// Lex an f-string start token.
fn lex_fstring_start(&mut self, quote: char) -> TokenKind {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), quote);
if quote == '"' {
self.current_flags |= TokenFlags::DOUBLE_QUOTES;
}
if self.cursor.eat_char2(quote, quote) {
self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
}
self.fstrings
.push(FStringContext::new(self.current_flags, self.nesting));
TokenKind::FStringStart
}
/// Lex an f-string middle or end token.
fn lex_fstring_middle_or_end(&mut self) -> Option<TokenKind> {
// SAFETY: Safe because the function is only called when `self.fstrings` is not empty.
let fstring = self.fstrings.current().unwrap();
// Check if we're at the end of the f-string.
if fstring.is_triple_quoted() {
let quote_char = fstring.quote_char();
if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
self.current_flags = fstring.flags();
return Some(TokenKind::FStringEnd);
}
} else if self.cursor.eat_char(fstring.quote_char()) {
self.current_flags = fstring.flags();
return Some(TokenKind::FStringEnd);
}
// We have to decode `{{` and `}}` into `{` and `}` respectively. As an
// optimization, we only allocate a new string if we find any escaped curly braces;
// otherwise this string remains empty and we use a source slice instead.
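// For example, in the literal `f"a {{ b"`, the emitted `FStringMiddle` token value is
// `a { b`, while the corresponding source slice is `a {{ b`.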
let mut normalized = String::new();
// Tracks the last offset of token value that has been written to `normalized`.
let mut last_offset = self.offset();
// This isn't going to change for the duration of the loop.
let in_format_spec = fstring.is_in_format_spec(self.nesting);
let mut in_named_unicode = false;
loop {
match self.cursor.first() {
// The condition is to differentiate between the `NUL` (`\0`) character
// in the source code and the one returned by `self.cursor.first()` when
// we reach the end of the source code.
EOF_CHAR if self.cursor.is_eof() => {
let error = if fstring.is_triple_quoted() {
FStringErrorType::UnterminatedTripleQuotedString
} else {
FStringErrorType::UnterminatedString
};
self.fstrings.pop();
return Some(self.push_error(LexicalError::new(
LexicalErrorType::FStringError(error),
self.token_range(),
)));
}
'\n' | '\r' if !fstring.is_triple_quoted() => {
// If we encounter a newline while we're in a format spec, then
// we stop here and let the lexer emit the newline token.
//
// Relevant discussion: https://github.com/python/cpython/issues/110259
if in_format_spec {
break;
}
self.fstrings.pop();
return Some(self.push_error(LexicalError::new(
LexicalErrorType::FStringError(FStringErrorType::UnterminatedString),
self.token_range(),
)));
}
'\\' => {
self.cursor.bump(); // '\'
if matches!(self.cursor.first(), '{' | '}') {
// Don't consume `{` or `}` as we want them to be emitted as tokens.
// They will be handled in the next iteration.
continue;
} else if !fstring.is_raw_string() {
if self.cursor.eat_char2('N', '{') {
in_named_unicode = true;
continue;
}
}
// Consume the escaped character.
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else {
self.cursor.bump();
}
}
quote @ ('\'' | '"') if quote == fstring.quote_char() => {
if let Some(triple_quotes) = fstring.triple_quotes() {
if self.cursor.rest().starts_with(triple_quotes) {
break;
}
self.cursor.bump();
} else {
break;
}
}
'{' => {
if self.cursor.second() == '{' && !in_format_spec {
self.cursor.bump();
normalized
.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
self.cursor.bump(); // Skip the second `{`
last_offset = self.offset();
} else {
break;
}
}
'}' => {
if in_named_unicode {
in_named_unicode = false;
self.cursor.bump();
} else if self.cursor.second() == '}' && !in_format_spec {
self.cursor.bump();
normalized
.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
self.cursor.bump(); // Skip the second `}`
last_offset = self.offset();
} else {
break;
}
}
_ => {
self.cursor.bump();
}
}
}
let range = self.token_range();
if range.is_empty() {
return None;
}
let value = if normalized.is_empty() {
self.source[range].to_string()
} else {
normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
normalized
};
self.current_value = TokenValue::FStringMiddle(value.into_boxed_str());
self.current_flags = fstring.flags();
Some(TokenKind::FStringMiddle)
}
/// Lex a string literal.
fn lex_string(&mut self, quote: char) -> TokenKind {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), quote);
if quote == '"' {
self.current_flags |= TokenFlags::DOUBLE_QUOTES;
}
// If the next two characters are also the quote character, then we have a triple-quoted
// string; consume those two characters and ensure that we require a triple-quote to close
if self.cursor.eat_char2(quote, quote) {
self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
}
let value_start = self.offset();
let quote_byte = u8::try_from(quote).expect("char that fits in u8");
let value_end = if self.current_flags.is_triple_quoted() {
// For triple-quoted strings, scan until we find the closing quote (ignoring escaped
// quotes) or the end of the file.
loop {
let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
self.cursor.skip_to_end();
return self.push_error(LexicalError::new(
LexicalErrorType::UnclosedStringError,
self.token_range(),
));
};
// Rare case: if there are an odd number of backslashes before the quote, then
// the quote is escaped and we should continue scanning.
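// For example, in `'''it\'s'''` the first quote candidate found by `memchr` is preceded
// by a single backslash, so it is escaped and scanning continues past it.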
let num_backslashes = self.cursor.rest().as_bytes()[..index]
.iter()
.rev()
.take_while(|&&c| c == b'\\')
.count();
// Advance the cursor past the quote and continue scanning.
self.cursor.skip_bytes(index + 1);
// If the character is escaped, continue scanning.
if num_backslashes % 2 == 1 {
continue;
}
// Otherwise, if it's followed by two more quotes, then we're done.
if self.cursor.eat_char2(quote, quote) {
break self.offset() - TextSize::new(3);
}
}
} else {
// For non-triple-quoted strings, scan until we find the closing quote, but end early
// if we encounter a newline or the end of the file.
loop {
let Some(index) =
memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
else {
self.cursor.skip_to_end();
return self.push_error(LexicalError::new(
LexicalErrorType::StringError,
self.token_range(),
));
};
// Rare case: if there are an odd number of backslashes before the quote, then
// the quote is escaped and we should continue scanning.
let num_backslashes = self.cursor.rest().as_bytes()[..index]
.iter()
.rev()
.take_while(|&&c| c == b'\\')
.count();
// Skip up to the current character.
self.cursor.skip_bytes(index);
// Lookahead because we want to bump only if it's a quote or being escaped.
let quote_or_newline = self.cursor.first();
// If the character is escaped, continue scanning.
if num_backslashes % 2 == 1 {
self.cursor.bump();
if quote_or_newline == '\r' {
self.cursor.eat_char('\n');
}
continue;
}
match quote_or_newline {
'\r' | '\n' => {
return self.push_error(LexicalError::new(
LexicalErrorType::UnclosedStringError,
self.token_range(),
));
}
ch if ch == quote => {
let value_end = self.offset();
self.cursor.bump();
break value_end;
}
_ => unreachable!("memchr3 returned an index that is not a quote or a newline"),
}
}
};
self.current_value = TokenValue::String(
self.source[TextRange::new(value_start, value_end)]
.to_string()
.into_boxed_str(),
);
TokenKind::String
}
/// Numeric lexing. The feast can start!
fn lex_number(&mut self, first: char) -> TokenKind {
if first == '0' {
if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
self.lex_number_radix(Radix::Hex)
} else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
self.lex_number_radix(Radix::Octal)
} else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
self.lex_number_radix(Radix::Binary)
} else {
self.lex_decimal_number(first)
}
} else {
self.lex_decimal_number(first)
}
}
/// Lex a hex/octal/decimal/binary number without a decimal point.
fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
#[cfg(debug_assertions)]
debug_assert!(matches!(
self.cursor.previous().to_ascii_lowercase(),
'x' | 'o' | 'b'
));
// Lex the portion of the token after the base prefix (e.g., `9D5` in `0x9D5`).
let mut number = LexedText::new(self.offset(), self.source);
self.radix_run(&mut number, radix);
// Extract the entire number, including the base prefix (e.g., `0x9D5`).
let token = &self.source[self.token_range()];
let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
Ok(int) => int,
Err(err) => {
return self.push_error(LexicalError::new(
LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
self.token_range(),
));
}
};
self.current_value = TokenValue::Int(value);
TokenKind::Int
}
/// Lex a normal number, that is, no octal, hex or binary number.
fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
#[cfg(debug_assertions)]
debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
let start_is_zero = first_digit_or_dot == '0';
let mut number = LexedText::new(self.token_start(), self.source);
if first_digit_or_dot != '.' {
number.push(first_digit_or_dot);
self.radix_run(&mut number, Radix::Decimal);
};
let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
number.push('.');
if self.cursor.eat_char('_') {
return self.push_error(LexicalError::new(
LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()),
TextRange::new(self.offset() - TextSize::new(1), self.offset()),
));
}
self.radix_run(&mut number, Radix::Decimal);
true
} else {
// Normal number:
false
};
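// A trailing exponent only counts when a digit follows it (optionally after a sign):
// `1e3` and `1E+3` lex as floats, while in `1e` the `e` is left to be lexed as a
// separate name token.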
let is_float = match self.cursor.rest().as_bytes() {
[b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
// 'e' | 'E'
number.push(self.cursor.bump().unwrap());
if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
number.push(sign);
}
self.radix_run(&mut number, Radix::Decimal);
true
}
_ => is_float,
};
if is_float {
// Improvement: Use `Cow` instead of pushing to value text
let Ok(value) = f64::from_str(number.as_str()) else {
return self.push_error(LexicalError::new(
LexicalErrorType::OtherError(
"Invalid decimal literal".to_string().into_boxed_str(),
),
self.token_range(),
));
};
// Parse trailing 'j':
if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
self.current_value = TokenValue::Complex {
real: 0.0,
imag: value,
};
TokenKind::Complex
} else {
self.current_value = TokenValue::Float(value);
TokenKind::Float
}
} else {
// Parse trailing 'j':
if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
let imag = f64::from_str(number.as_str()).unwrap();
self.current_value = TokenValue::Complex { real: 0.0, imag };
TokenKind::Complex
} else {
let value = match Int::from_str(number.as_str()) {
Ok(value) => {
if start_is_zero && value.as_u8() != Some(0) {
// Leading zeros in decimal integer literals are not permitted.
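// For example, `025` is rejected (see `test_invalid_leading_zero_small`), while
// `000` is accepted because its value is zero.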
return self.push_error(LexicalError::new(
LexicalErrorType::OtherError(
"Invalid decimal integer literal"
.to_string()
.into_boxed_str(),
),
self.token_range(),
));
}
value
}
Err(err) => {
return self.push_error(LexicalError::new(
LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
self.token_range(),
))
}
};
self.current_value = TokenValue::Int(value);
TokenKind::Int
}
}
}
/// Consume a sequence of digits with the given radix. The digits may be separated
/// by underscores, e.g. '`1_2_3_4`' == '1234'.
fn radix_run(&mut self, number: &mut LexedText, radix: Radix) {
loop {
if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
number.push(c);
}
// Number that contains `_` separators. Remove them from the parsed text.
else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
// Skip over `_`
self.cursor.bump();
number.skip_char();
} else {
break;
}
}
}
/// Lex a single comment.
fn lex_comment(&mut self) -> TokenKind {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), '#');
let bytes = self.cursor.rest().as_bytes();
let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len());
self.cursor.skip_bytes(offset);
TokenKind::Comment
}
/// Lex a single IPython escape command.
fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> TokenKind {
let mut value = String::new();
loop {
match self.cursor.first() {
'\\' => {
// Only skip the line continuation if it is followed by a newline;
// otherwise, it is a normal backslash that is part of the magic command:
//
// Skip this backslash
// v
// !pwd \
// && ls -a | sed 's/^/\\ /'
// ^^
// Don't skip these backslashes
if self.cursor.second() == '\r' {
self.cursor.bump();
self.cursor.bump();
self.cursor.eat_char('\n');
continue;
} else if self.cursor.second() == '\n' {
self.cursor.bump();
self.cursor.bump();
continue;
}
self.cursor.bump();
value.push('\\');
}
// Help end escape commands are those that end with 1 or 2 question marks.
// Here, we're only looking for the subset of help end escape commands that
// also have the escape token at the start of the line.
// On the other hand, we're not looking for help end escape commands that
// are strict in the sense that the escape token is only at the end. For example,
//
// * `%foo?` is recognized as a help end escape command but not as a strict one.
// * `foo?` is recognized as a strict help end escape command which is not
// lexed here but is identified at the parser level.
//
// Help end escape commands implemented in the IPython codebase using regex:
// https://github.com/ipython/ipython/blob/292e3a23459ca965b8c1bfe2c3707044c510209a/IPython/core/inputtransformer2.py#L454-L462
'?' => {
self.cursor.bump();
let mut question_count = 1u32;
while self.cursor.eat_char('?') {
question_count += 1;
}
// The original implementation in the IPython codebase is based on regex which
// means that it's strict in the sense that it won't recognize a help end escape:
// * If there's any whitespace before the escape token (e.g. `%foo ?`)
// * If there are more than 2 question mark tokens (e.g. `%foo???`)
// which is what we're doing here as well. In that case, we'll continue with
// the prefixed escape token.
//
// Now, the whitespace and empty value check also makes sure that an empty
// command (e.g. `%?` or `? ??`, no value after/between the escape tokens)
// is not recognized as a help end escape command. So, `%?` and `? ??` are
// `IpyEscapeKind::Magic` and `IpyEscapeKind::Help` because of the initial `%` and `??`
// tokens.
if question_count > 2
|| value.chars().last().is_none_or(is_python_whitespace)
|| !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
{
// Not a help end escape command, so continue with the lexing.
value.reserve(question_count as usize);
for _ in 0..question_count {
value.push('?');
}
continue;
}
if escape_kind.is_help() {
// If we've recognized this as a help end escape command, then
// any question mark token / whitespaces at the start are not
// considered as part of the value.
//
// For example, `??foo?` is recognized as `IpyEscapeKind::Help` and
// `value` is `foo` instead of `??foo`.
value = value.trim_start_matches([' ', '?']).to_string();
} else if escape_kind.is_magic() {
// Between `%` and `?` (at the end), the `?` takes priority
// over the `%` so `%foo?` is recognized as `IpyEscapeKind::Help`
// and `value` is `%foo` instead of `foo`. So, we need to
// insert the magic escape token at the start.
value.insert_str(0, escape_kind.as_str());
}
let kind = match question_count {
1 => IpyEscapeKind::Help,
2 => IpyEscapeKind::Help2,
_ => unreachable!("`question_count` is always 1 or 2"),
};
self.current_value = TokenValue::IpyEscapeCommand {
kind,
value: value.into_boxed_str(),
};
return TokenKind::IpyEscapeCommand;
}
'\n' | '\r' | EOF_CHAR => {
self.current_value = TokenValue::IpyEscapeCommand {
kind: escape_kind,
value: value.into_boxed_str(),
};
return TokenKind::IpyEscapeCommand;
}
c => {
self.cursor.bump();
value.push(c);
}
}
}
}
fn consume_end(&mut self) -> TokenKind {
// We reached end of file.
// First of all, we need all nestings to be finished.
// For Mode::ParenthesizedExpression we start with nesting level 1.
// So we check if we end with that level.
let init_nesting = u32::from(self.mode == Mode::ParenthesizedExpression);
if self.nesting > init_nesting {
// Reset the nesting to avoid going into infinite loop.
self.nesting = 0;
return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
}
// Next, insert a trailing newline, if required.
if !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
TokenKind::Newline
}
// Next, flush the indentation stack to zero.
else if self.indentations.dedent().is_some() {
TokenKind::Dedent
} else {
TokenKind::EndOfFile
}
}
/// Re-lex the [`NonLogicalNewline`] token at the given position in the context of a logical
/// line.
///
/// Returns a boolean indicating whether the lexer's position has changed. This could mean
/// that the new current token is different from the previous current token, but that is not
/// necessarily the case. If the return value is `true`, the caller is responsible for updating
/// its state accordingly.
///
/// This method is a no-op if the lexer isn't in a parenthesized context.
///
/// ## Explanation
///
/// The lexer emits two different kinds of newline token based on the context. If it's in a
/// parenthesized context, it'll emit a [`NonLogicalNewline`] token otherwise it'll emit a
/// regular [`Newline`] token. Based on the type of newline token, the lexer will consume and
/// emit the indentation tokens appropriately which affects the structure of the code.
///
/// For example:
/// ```py
/// if call(foo
/// def bar():
/// pass
/// ```
///
/// Here, the lexer emits a [`NonLogicalNewline`] token after `foo` which means that the lexer
/// doesn't emit an `Indent` token before the `def` keyword. This leads to an AST which
/// considers the function `bar` as part of the module block and the `if` block remains empty.
///
/// This method exists to facilitate the parser when it recovers from these kinds of scenarios, so that
/// the lexer can then re-lex a [`NonLogicalNewline`] token to a [`Newline`] token which in
/// turn helps the parser to build the correct AST.
///
/// In the above snippet, it would mean that this method would move the lexer back to the
/// newline character after the `foo` token and emit it as a [`Newline`] token instead of
/// [`NonLogicalNewline`]. This means that the next token emitted by the lexer would be an
/// `Indent` token.
///
/// There are cases where the lexer's position will change but the re-lexed token will remain
/// the same. This is to help the parser to add the error message at an appropriate location.
/// Consider the following example:
///
/// ```py
/// if call(foo, [a, b
/// def bar():
/// pass
/// ```
///
/// Here, the parser recovers from two unclosed parentheses. The inner unclosed `[` will call
/// into the re-lexing logic and reduce the nesting level from 2 to 1. And, the re-lexing logic
/// will move the lexer at the newline after `b` but still emit a [`NonLogicalNewline`] token.
/// Only after the parser recovers from the outer unclosed `(` does the re-lexing logic emit
/// the [`Newline`] token.
///
/// [`Newline`]: TokenKind::Newline
/// [`NonLogicalNewline`]: TokenKind::NonLogicalNewline
pub(crate) fn re_lex_logical_token(
&mut self,
non_logical_newline_start: Option<TextSize>,
) -> bool {
if self.nesting == 0 {
return false;
}
// Reduce the nesting level because the parser recovered from an error inside list parsing
// i.e., it recovered from an unclosed parenthesis (`(`, `[`, or `{`).
self.nesting -= 1;
// The lexer can't be moved back for a triple-quoted f-string because the newlines are
// part of the f-string itself, so there is no newline token to be emitted.
if self.current_flags.is_triple_quoted_fstring() {
return false;
}
let Some(new_position) = non_logical_newline_start else {
return false;
};
// Earlier we reduced the nesting level unconditionally. Now that we know the lexer's
// position is going to be moved back, the lexer needs to be put back into a
// parenthesized context if the current token is a closing parenthesis.
//
// ```py
// (a, [b,
// c
// )
// ```
//
// Here, the parser would request to re-lex the token when it's at `)` and can recover
// from an unclosed `[`. This method will move the lexer back to the newline character
// after `c` which means it goes back into parenthesized context.
if matches!(
self.current_kind,
TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
) {
self.nesting += 1;
}
self.cursor = Cursor::new(self.source);
self.cursor.skip_bytes(new_position.to_usize());
self.state = State::Other;
self.next_token();
true
}
#[inline]
fn token_range(&self) -> TextRange {
let end = self.offset();
let len = self.cursor.token_len();
TextRange::at(end - len, len)
}
#[inline]
fn token_text(&self) -> &'src str {
&self.source[self.token_range()]
}
/// Retrieves the current offset of the cursor within the source code.
// SAFETY: Lexer doesn't allow files larger than 4GB
#[allow(clippy::cast_possible_truncation)]
#[inline]
fn offset(&self) -> TextSize {
TextSize::new(self.source.len() as u32) - self.cursor.text_len()
}
#[inline]
fn token_start(&self) -> TextSize {
self.token_range().start()
}
/// Creates a checkpoint to which the lexer can later return using [`Self::rewind`].
pub(crate) fn checkpoint(&self) -> LexerCheckpoint {
LexerCheckpoint {
value: self.current_value.clone(),
current_kind: self.current_kind,
current_range: self.current_range,
current_flags: self.current_flags,
cursor_offset: self.offset(),
state: self.state,
nesting: self.nesting,
indentations_checkpoint: self.indentations.checkpoint(),
pending_indentation: self.pending_indentation,
fstrings_checkpoint: self.fstrings.checkpoint(),
errors_position: self.errors.len(),
}
}
/// Restore the lexer to the given checkpoint.
pub(crate) fn rewind(&mut self, checkpoint: LexerCheckpoint) {
let LexerCheckpoint {
value,
current_kind,
current_range,
current_flags,
cursor_offset,
state,
nesting,
indentations_checkpoint,
pending_indentation,
fstrings_checkpoint,
errors_position,
} = checkpoint;
let mut cursor = Cursor::new(self.source);
// We preserve the previous char using this method.
cursor.skip_bytes(cursor_offset.to_usize());
self.current_value = value;
self.current_kind = current_kind;
self.current_range = current_range;
self.current_flags = current_flags;
self.cursor = cursor;
self.state = state;
self.nesting = nesting;
self.indentations.rewind(indentations_checkpoint);
self.pending_indentation = pending_indentation;
self.fstrings.rewind(fstrings_checkpoint);
self.errors.truncate(errors_position);
}
pub fn finish(self) -> Vec<LexicalError> {
self.errors
}
}
pub(crate) struct LexerCheckpoint {
value: TokenValue,
current_kind: TokenKind,
current_range: TextRange,
current_flags: TokenFlags,
cursor_offset: TextSize,
state: State,
nesting: u32,
indentations_checkpoint: IndentationsCheckpoint,
pending_indentation: Option<Indentation>,
fstrings_checkpoint: FStringsCheckpoint,
errors_position: usize,
}
#[derive(Copy, Clone, Debug)]
enum State {
/// Lexer is right at the beginning of the file or after a `Newline` token.
AfterNewline,
/// The lexer is at the start of a new logical line but **after** the indentation
NonEmptyLogicalLine,
/// Lexer is right after an equal token
AfterEqual,
/// Inside of a logical line
Other,
}
impl State {
const fn is_after_newline(self) -> bool {
matches!(self, State::AfterNewline)
}
const fn is_new_logical_line(self) -> bool {
matches!(self, State::AfterNewline | State::NonEmptyLogicalLine)
}
const fn is_after_equal(self) -> bool {
matches!(self, State::AfterEqual)
}
}
#[derive(Copy, Clone, Debug)]
enum Radix {
Binary,
Octal,
Decimal,
Hex,
}
impl Radix {
const fn as_u32(self) -> u32 {
match self {
Radix::Binary => 2,
Radix::Octal => 8,
Radix::Decimal => 10,
Radix::Hex => 16,
}
}
const fn is_digit(self, c: char) -> bool {
match self {
Radix::Binary => matches!(c, '0'..='1'),
Radix::Octal => matches!(c, '0'..='7'),
Radix::Decimal => c.is_ascii_digit(),
Radix::Hex => c.is_ascii_hexdigit(),
}
}
}
const fn is_quote(c: char) -> bool {
matches!(c, '\'' | '"')
}
const fn is_ascii_identifier_start(c: char) -> bool {
matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
// Checks if the character c is a valid starting character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_unicode_identifier_start(c: char) -> bool {
is_xid_start(c)
}
/// Checks if the character c is a valid continuation character as described
/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
///
/// Additionally, this function also keeps track of whether or not the total
/// identifier is ASCII-only or not by mutably altering a reference to a
/// boolean value passed in.
fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
// Arrange things such that ASCII codepoints never
// result in the slower `is_xid_continue` getting called.
if c.is_ascii() {
matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
} else {
*identifier_is_ascii_only = false;
is_xid_continue(c)
}
}
enum LexedText<'a> {
Source { source: &'a str, range: TextRange },
Owned(String),
}
impl<'a> LexedText<'a> {
fn new(start: TextSize, source: &'a str) -> Self {
Self::Source {
range: TextRange::empty(start),
source,
}
}
fn push(&mut self, c: char) {
match self {
LexedText::Source { range, source } => {
*range = range.add_end(c.text_len());
debug_assert!(source[*range].ends_with(c));
}
LexedText::Owned(owned) => owned.push(c),
}
}
fn as_str<'b>(&'b self) -> &'b str
where
'b: 'a,
{
match self {
LexedText::Source { range, source } => &source[*range],
LexedText::Owned(owned) => owned,
}
}
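/// Marks the next source character as skipped (used when skipping the `_` digit
/// separators in `radix_run`). The digits can no longer be borrowed as one contiguous
/// source slice, so the text collected so far is copied into an owned buffer that
/// subsequent `push` calls append to.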
fn skip_char(&mut self) {
match self {
LexedText::Source { range, source } => {
*self = LexedText::Owned(source[*range].to_string());
}
LexedText::Owned(_) => {}
}
}
}
/// Create a new [`Lexer`] for the given source code and [`Mode`].
pub fn lex(source: &str, mode: Mode) -> Lexer {
Lexer::new(source, mode, TextSize::default())
}
#[cfg(test)]
mod tests {
use std::fmt::Write;
use insta::assert_snapshot;
use super::*;
const WINDOWS_EOL: &str = "\r\n";
const MAC_EOL: &str = "\r";
const UNIX_EOL: &str = "\n";
/// Same as [`Token`] except that this includes the [`TokenValue`] as well.
struct TestToken {
kind: TokenKind,
value: TokenValue,
range: TextRange,
flags: TokenFlags,
}
impl std::fmt::Debug for TestToken {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut tuple = f.debug_tuple("");
let mut tuple = if matches!(self.value, TokenValue::None) {
tuple.field(&self.kind)
} else {
tuple.field(&self.value)
};
tuple = tuple.field(&self.range);
if self.flags.is_empty() {
tuple.finish()
} else {
tuple.field(&self.flags).finish()
}
}
}
struct LexerOutput {
tokens: Vec<TestToken>,
errors: Vec<LexicalError>,
}
impl std::fmt::Display for LexerOutput {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "## Tokens")?;
writeln!(f, "```\n{:#?}\n```", self.tokens)?;
if !self.errors.is_empty() {
writeln!(f, "## Errors")?;
writeln!(f, "```\n{:#?}\n```", self.errors)?;
}
Ok(())
}
}
fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
let mut lexer = Lexer::new(source, mode, start_offset);
let mut tokens = Vec::new();
loop {
let kind = lexer.next_token();
if kind.is_eof() {
break;
}
tokens.push(TestToken {
kind,
value: lexer.take_value(),
range: lexer.current_range(),
flags: lexer.current_flags(),
});
}
LexerOutput {
tokens,
errors: lexer.finish(),
}
}
fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
let output = lex(source, mode, start_offset);
if !output.errors.is_empty() {
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
for error in &output.errors {
writeln!(&mut message, "{error:?}").unwrap();
}
writeln!(&mut message, "Source:\n{source}").unwrap();
panic!("{message}");
}
output
}
fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
let output = lex(source, mode, TextSize::default());
assert!(
!output.errors.is_empty(),
"Expected lexer to generate at least one error for the following source:\n{source}"
);
output
}
fn lex_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Module, TextSize::default())
}
fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
lex_valid(source, Mode::Module, start_offset)
}
fn lex_jupyter_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Ipython, TextSize::default())
}
#[test]
fn bom() {
let source = "\u{feff}x = 1";
assert_snapshot!(lex_source(source));
}
#[test]
fn bom_with_offset() {
let source = "\u{feff}x + y + z";
assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
}
#[test]
fn bom_with_offset_edge() {
// BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z)
// doesn't panic. Refer https://github.com/astral-sh/ruff/issues/11731
let source = "\u{feff}x + y + z";
assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
}
fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
let source = format!("%matplotlib \\{eol} --inline");
lex_jupyter_source(&source)
}
#[test]
fn test_ipython_escape_command_line_continuation_unix_eol() {
assert_snapshot!(ipython_escape_command_line_continuation_eol(UNIX_EOL));
}
#[test]
fn test_ipython_escape_command_line_continuation_mac_eol() {
assert_snapshot!(ipython_escape_command_line_continuation_eol(MAC_EOL));
}
#[test]
fn test_ipython_escape_command_line_continuation_windows_eol() {
assert_snapshot!(ipython_escape_command_line_continuation_eol(WINDOWS_EOL));
}
fn ipython_escape_command_line_continuation_with_eol_and_eof(eol: &str) -> LexerOutput {
let source = format!("%matplotlib \\{eol}");
lex_jupyter_source(&source)
}
#[test]
fn test_ipython_escape_command_line_continuation_with_unix_eol_and_eof() {
assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
UNIX_EOL
));
}
#[test]
fn test_ipython_escape_command_line_continuation_with_mac_eol_and_eof() {
assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
MAC_EOL
));
}
#[test]
fn test_ipython_escape_command_line_continuation_with_windows_eol_and_eof() {
assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
WINDOWS_EOL
));
}
#[test]
fn test_empty_ipython_escape_command() {
let source = "%\n%%\n!\n!!\n?\n??\n/\n,\n;";
assert_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_escape_command() {
let source = r"
?foo
??foo
%timeit a = b
%timeit a % 3
%matplotlib \
--inline
!pwd \
&& ls -a | sed 's/^/\\ /'
!!cd /Users/foo/Library/Application\ Support/
/foo 1 2
,foo 1 2
;foo 1 2
!ls
"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_help_end_escape_command() {
let source = r"
?foo?
?? foo?
?? foo ?
?foo??
??foo??
???foo?
???foo??
??foo???
???foo???
?? \
foo?
?? \
?
????
%foo?
%foo??
%%foo???
!pwd?"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_escape_command_indentation() {
let source = r"
if True:
%matplotlib \
--inline"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_escape_command_assignment() {
let source = r"
pwd = !pwd
foo = %timeit a = b
bar = %timeit a % 3
baz = %matplotlib \
inline"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
fn assert_no_ipython_escape_command(tokens: &[TestToken]) {
for token in tokens {
if matches!(token.kind, TokenKind::IpyEscapeCommand) {
panic!("Unexpected escape command token at {:?}", token.range)
}
}
}
#[test]
fn test_ipython_escape_command_not_an_assignment() {
let source = r"
# Other escape kinds are not valid here (can't test `foo = ?str` because '?' is not a valid token)
foo = /func
foo = ;func
foo = ,func
(foo == %timeit a = b)
(foo := %timeit a = b)
def f(arg=%timeit a = b):
pass"
.trim();
let output = lex(source, Mode::Ipython, TextSize::default());
assert!(output.errors.is_empty());
assert_no_ipython_escape_command(&output.tokens);
}
#[test]
fn test_numbers() {
let source =
"0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j 000 0x995DC9BBDF1939FA 0x995DC9BBDF1939FA995DC9BBDF1939FA";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_invalid_leading_zero_small() {
let source = "025";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
#[test]
fn test_invalid_leading_zero_big() {
let source =
"0252222222222222522222222222225222222222222252222222222222522222222222225222222222222";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
#[test]
fn test_line_comment_long() {
let source = "99232 # foo".to_string();
assert_snapshot!(lex_source(&source));
}
#[test]
fn test_line_comment_whitespace() {
let source = "99232 # ".to_string();
assert_snapshot!(lex_source(&source));
}
#[test]
fn test_line_comment_single_whitespace() {
let source = "99232 # ".to_string();
assert_snapshot!(lex_source(&source));
}
#[test]
fn test_line_comment_empty() {
let source = "99232 #".to_string();
assert_snapshot!(lex_source(&source));
}
fn comment_until_eol(eol: &str) -> LexerOutput {
let source = format!("123 # Foo{eol}456");
lex_source(&source)
}
#[test]
fn test_comment_until_unix_eol() {
assert_snapshot!(comment_until_eol(UNIX_EOL));
}
#[test]
fn test_comment_until_mac_eol() {
assert_snapshot!(comment_until_eol(MAC_EOL));
}
#[test]
fn test_comment_until_windows_eol() {
assert_snapshot!(comment_until_eol(WINDOWS_EOL));
}
#[test]
fn test_assignment() {
let source = r"a_variable = 99 + 2-0";
assert_snapshot!(lex_source(source));
}
fn indentation_with_eol(eol: &str) -> LexerOutput {
let source = format!("def foo():{eol} return 99{eol}{eol}");
lex_source(&source)
}
#[test]
fn test_indentation_with_unix_eol() {
assert_snapshot!(indentation_with_eol(UNIX_EOL));
}
#[test]
fn test_indentation_with_mac_eol() {
assert_snapshot!(indentation_with_eol(MAC_EOL));
}
#[test]
fn test_indentation_with_windows_eol() {
assert_snapshot!(indentation_with_eol(WINDOWS_EOL));
}
fn double_dedent_with_eol(eol: &str) -> LexerOutput {
let source = format!("def foo():{eol} if x:{eol}{eol} return 99{eol}{eol}");
lex_source(&source)
}
#[test]
fn test_double_dedent_with_unix_eol() {
assert_snapshot!(double_dedent_with_eol(UNIX_EOL));
}
#[test]
fn test_double_dedent_with_mac_eol() {
assert_snapshot!(double_dedent_with_eol(MAC_EOL));
}
#[test]
fn test_double_dedent_with_windows_eol() {
assert_snapshot!(double_dedent_with_eol(WINDOWS_EOL));
}
fn double_dedent_with_tabs_eol(eol: &str) -> LexerOutput {
let source = format!("def foo():{eol}\tif x:{eol}{eol}\t\t return 99{eol}{eol}");
lex_source(&source)
}
#[test]
fn test_double_dedent_with_tabs_unix_eol() {
assert_snapshot!(double_dedent_with_tabs_eol(UNIX_EOL));
}
#[test]
fn test_double_dedent_with_tabs_mac_eol() {
assert_snapshot!(double_dedent_with_tabs_eol(MAC_EOL));
}
#[test]
fn test_double_dedent_with_tabs_windows_eol() {
assert_snapshot!(double_dedent_with_tabs_eol(WINDOWS_EOL));
}
#[test]
fn dedent_after_whitespace() {
let source = "\
if first:
if second:
pass
foo
";
assert_snapshot!(lex_source(source));
}
fn newline_in_brackets_eol(eol: &str) -> LexerOutput {
let source = r"x = [
1,2
,(3,
4,
), {
5,
6,\
7}]
"
.replace('\n', eol);
lex_source(&source)
}
#[test]
fn test_newline_in_brackets_unix_eol() {
assert_snapshot!(newline_in_brackets_eol(UNIX_EOL));
}
#[test]
fn test_newline_in_brackets_mac_eol() {
assert_snapshot!(newline_in_brackets_eol(MAC_EOL));
}
#[test]
fn test_newline_in_brackets_windows_eol() {
assert_snapshot!(newline_in_brackets_eol(WINDOWS_EOL));
}
#[test]
fn test_non_logical_newline_in_string_continuation() {
let source = r"(
'a'
'b'
'c' \
'd'
)";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_logical_newline_line_comment() {
let source = "#Hello\n#World\n";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_operators() {
let source = "//////=/ /";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_string() {
let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#;
assert_snapshot!(lex_source(source));
}
fn string_continuation_with_eol(eol: &str) -> LexerOutput {
let source = format!("\"abc\\{eol}def\"");
lex_source(&source)
}
#[test]
fn test_string_continuation_with_unix_eol() {
assert_snapshot!(string_continuation_with_eol(UNIX_EOL));
}
#[test]
fn test_string_continuation_with_mac_eol() {
assert_snapshot!(string_continuation_with_eol(MAC_EOL));
}
#[test]
fn test_string_continuation_with_windows_eol() {
assert_snapshot!(string_continuation_with_eol(WINDOWS_EOL));
}
#[test]
fn test_escape_unicode_name() {
let source = r#""\N{EN SPACE}""#;
assert_snapshot!(lex_source(source));
}
fn get_tokens_only(source: &str) -> Vec<TokenKind> {
let output = lex(source, Mode::Module, TextSize::default());
assert!(output.errors.is_empty());
output.tokens.into_iter().map(|token| token.kind).collect()
}
#[test]
fn test_nfkc_normalization() {
let source1 = "𝒞 = 500";
let source2 = "C = 500";
assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
}
fn triple_quoted_eol(eol: &str) -> LexerOutput {
let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
lex_source(&source)
}
#[test]
fn test_triple_quoted_unix_eol() {
assert_snapshot!(triple_quoted_eol(UNIX_EOL));
}
#[test]
fn test_triple_quoted_mac_eol() {
assert_snapshot!(triple_quoted_eol(MAC_EOL));
}
#[test]
fn test_triple_quoted_windows_eol() {
assert_snapshot!(triple_quoted_eol(WINDOWS_EOL));
}
// This test case just makes sure that the lexer doesn't go into an
// infinite loop on invalid input.
#[test]
fn test_infinite_loop() {
let source = "[1";
lex_invalid(source, Mode::Module);
}
/// Emoji identifiers are a non-standard Python feature and are not supported by our lexer.
#[test]
fn test_emoji_identifier() {
let source = "🐦";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
#[test]
fn test_too_low_dedent() {
let source = "if True:
pass
pass";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
#[test]
fn test_empty_fstrings() {
let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_prefix() {
let source = r#"f"" F"" rf"" rF"" Rf"" RF"" fr"" Fr"" fR"" FR"""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring() {
let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_parentheses() {
let source = r#"f"{}" f"{{}}" f" {}" f"{{{}}}" f"{{{{}}}}" f" {} {{}} {{{}}} {{{{}}}} ""#;
assert_snapshot!(lex_source(source));
}
fn fstring_single_quote_escape_eol(eol: &str) -> LexerOutput {
let source = format!(r"f'text \{eol} more text'");
lex_source(&source)
}
#[test]
fn test_fstring_single_quote_escape_unix_eol() {
assert_snapshot!(fstring_single_quote_escape_eol(UNIX_EOL));
}
#[test]
fn test_fstring_single_quote_escape_mac_eol() {
assert_snapshot!(fstring_single_quote_escape_eol(MAC_EOL));
}
#[test]
fn test_fstring_single_quote_escape_windows_eol() {
assert_snapshot!(fstring_single_quote_escape_eol(WINDOWS_EOL));
}
#[test]
fn test_fstring_escape() {
let source = r#"f"\{x:\"\{x}} \"\"\
end""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_escape_braces() {
let source = r"f'\{foo}' f'\\{foo}' f'\{{foo}}' f'\\{{foo}}'";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_escape_raw() {
let source = r#"rf"\{x:\"\{x}} \"\"\
end""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_named_unicode() {
let source = r#"f"\N{BULLET} normal \Nope \N""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_named_unicode_raw() {
let source = r#"rf"\N{BULLET} normal""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_named_expression() {
let source = r#"f"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_format_spec() {
let source = r#"f"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_multiline_format_spec() {
// The last f-string is invalid syntactically but we should still lex it.
// Note that the `b` is a `Name` token and not a `FStringMiddle` token.
let source = r"f'''__{
x:d
}__'''
f'''__{
x:a
b
c
}__'''
f'__{
x:d
}__'
f'__{
x:a
b
}__'
";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_conversion() {
let source = r#"f"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_nested() {
let source = r#"f"foo {f"bar {x + f"{wow}"}"} baz" f'foo {f'bar'} some {f"another"}'"#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_expression_multiline() {
let source = r#"f"first {
x
*
y
} second""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_multiline() {
let source = r#"f"""
hello
world
""" f'''
world
hello
''' f"some {f"""multiline
allowed {x}"""} string""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_comments() {
let source = r#"f"""
# not a comment { # comment {
x
} # not a comment
""""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_ipy_escape_command() {
let source = r#"f"foo {!pwd} bar""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_lambda_expression() {
let source = r#"
f"{lambda x:{x}}"
f"{(lambda x:{x})}"
"#
.trim();
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_nul_char() {
let source = r"f'\0'";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_match_softkeyword_in_notebook() {
let source = r"match foo:
case bar:
pass";
assert_snapshot!(lex_jupyter_source(source));
}
fn lex_fstring_error(source: &str) -> FStringErrorType {
let output = lex(source, Mode::Module, TextSize::default());
match output
.errors
.into_iter()
.next()
.expect("lexer should give at least one error")
.into_error()
{
LexicalErrorType::FStringError(error) => error,
err => panic!("Expected FStringError: {err:?}"),
}
}
#[test]
fn test_fstring_error() {
use FStringErrorType::{SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString};
assert_eq!(lex_fstring_error("f'}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}}}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'foo}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r"f'\u007b}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{a:b}}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{3:}}>10}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r"f'\{foo}\}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r#"f""#), UnterminatedString);
assert_eq!(lex_fstring_error(r"f'"), UnterminatedString);
assert_eq!(lex_fstring_error(r#"f""""#), UnterminatedTripleQuotedString);
assert_eq!(lex_fstring_error(r"f'''"), UnterminatedTripleQuotedString);
assert_eq!(
lex_fstring_error(r#"f"""""#),
UnterminatedTripleQuotedString
);
assert_eq!(
lex_fstring_error(r#"f""""""#),
UnterminatedTripleQuotedString
);
}
}