ruff/crates/ruff_python_parser/src/lexer.rs
//! This module takes care of lexing Python source text.
//!
//! This means source code is scanned and translated into separate tokens. The rules
//! governing what is and is not a valid token are defined in the Python reference
//! guide section on [Lexical analysis].
//!
//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
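//!
//! A minimal sketch of driving the lexer by hand (illustrative only; [`Lexer::new`] is
//! `pub(crate)`, so this is not a runnable doctest outside the crate):
//!
//! ```ignore
//! let mut lexer = Lexer::new("1 + 2", Mode::Module, TextSize::default());
//! loop {
//!     let kind = lexer.next_token();
//!     if kind.is_eof() {
//!         break;
//!     }
//!     // For this input: `Int`, `Plus`, `Int`, and a trailing `Newline`.
//!     println!("{kind:?} at {:?}", lexer.current_range());
//! }
//! let errors: Vec<LexicalError> = lexer.finish();
//! ```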
use std::cmp::Ordering;
use std::str::FromStr;
use bitflags::bitflags;
use unicode_ident::{is_xid_continue, is_xid_start};
use unicode_normalization::UnicodeNormalization;
use ruff_python_ast::str::Quote;
use ruff_python_ast::str_prefix::{
AnyStringPrefix, ByteStringPrefix, FStringPrefix, StringLiteralPrefix,
};
use ruff_python_ast::{AnyStringFlags, Int, IpyEscapeKind, StringFlags};
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
use crate::error::FStringErrorType;
use crate::lexer::cursor::{Cursor, EOF_CHAR};
use crate::lexer::fstring::{FStringContext, FStrings, FStringsCheckpoint};
use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
use crate::{Mode, TokenKind};
mod cursor;
mod fstring;
mod indentation;
/// A lexer for Python source code.
#[derive(Debug)]
pub struct Lexer<'src> {
/// Source code to be lexed.
source: &'src str,
/// A pointer to the current character of the source code which is being lexed.
cursor: Cursor<'src>,
/// The kind of the current token.
current_kind: TokenKind,
/// The range of the current token.
current_range: TextRange,
/// The value of the current token.
current_value: TokenValue,
/// Flags for the current token.
current_flags: TokenFlags,
/// Lexer state.
state: State,
/// Represents the current level of nesting in the lexer, indicating the depth of parentheses.
/// The lexer is within a parenthesized context if the value is greater than 0.
nesting: u32,
/// A stack of indentation representing the current indentation level.
indentations: Indentations,
pending_indentation: Option<Indentation>,
/// Lexer mode.
mode: Mode,
/// F-string contexts.
fstrings: FStrings,
/// Errors encountered while lexing.
errors: Vec<LexicalError>,
}
impl<'src> Lexer<'src> {
/// Create a new lexer for the given input source which starts at the given offset.
///
/// If the start offset is greater than 0, the cursor is moved ahead that many bytes.
/// This means that the input source should be the complete source code and not the
/// sliced version.
pub(crate) fn new(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
assert!(
u32::try_from(source.len()).is_ok(),
"Lexer only supports files with a size up to 4GB"
);
let mut lexer = Lexer {
source,
cursor: Cursor::new(source),
state: State::AfterNewline,
current_kind: TokenKind::EndOfFile,
current_range: TextRange::empty(start_offset),
current_value: TokenValue::None,
current_flags: TokenFlags::empty(),
nesting: 0,
indentations: Indentations::default(),
pending_indentation: None,
mode,
fstrings: FStrings::default(),
errors: Vec::new(),
};
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
// spell-checker:ignore feff
lexer.cursor.eat_char('\u{feff}');
if start_offset > TextSize::new(0) {
lexer.cursor.skip_bytes(start_offset.to_usize());
}
lexer
}
/// Returns the kind of the current token.
pub(crate) fn current_kind(&self) -> TokenKind {
self.current_kind
}
/// Returns the range of the current token.
pub(crate) fn current_range(&self) -> TextRange {
self.current_range
}
/// Returns the flags for the current token.
pub(crate) fn current_flags(&self) -> TokenFlags {
self.current_flags
}
/// Takes the token value corresponding to the current token out of the lexer, replacing it
/// with the default value.
///
/// All subsequent calls to this method without advancing the lexer will return the
/// default value, which is [`TokenValue::None`].
pub(crate) fn take_value(&mut self) -> TokenValue {
std::mem::take(&mut self.current_value)
}
/// Lex the next token.
pub fn next_token(&mut self) -> TokenKind {
self.cursor.start_token();
self.current_value = TokenValue::None;
self.current_flags = TokenFlags::empty();
self.current_kind = self.lex_token();
self.current_range = self.token_range();
self.current_kind
}
fn lex_token(&mut self) -> TokenKind {
if let Some(fstring) = self.fstrings.current() {
if !fstring.is_in_expression(self.nesting) {
if let Some(token) = self.lex_fstring_middle_or_end() {
if matches!(token, TokenKind::FStringEnd) {
self.fstrings.pop();
}
return token;
}
}
}
// Return dedent tokens until the current indentation level matches the indentation of the next token.
else if let Some(indentation) = self.pending_indentation.take() {
match self.indentations.current().try_compare(indentation) {
Ok(Ordering::Greater) => {
self.pending_indentation = Some(indentation);
if self.indentations.dedent_one(indentation).is_err() {
return self.push_error(LexicalError::new(
LexicalErrorType::IndentationError,
self.token_range(),
));
}
return TokenKind::Dedent;
}
Ok(_) => {}
Err(_) => {
return self.push_error(LexicalError::new(
LexicalErrorType::IndentationError,
self.token_range(),
));
}
}
}
if self.state.is_after_newline() {
if let Some(indentation) = self.eat_indentation() {
return indentation;
}
} else {
if let Err(error) = self.skip_whitespace() {
return self.push_error(error);
}
}
// The lexer might've skipped whitespaces, so update the start offset
self.cursor.start_token();
if let Some(c) = self.cursor.bump() {
if c.is_ascii() {
self.consume_ascii_character(c)
} else if is_unicode_identifier_start(c) {
let identifier = self.lex_identifier(c);
self.state = State::Other;
identifier
} else {
self.push_error(LexicalError::new(
LexicalErrorType::UnrecognizedToken { tok: c },
self.token_range(),
))
}
} else {
// Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line,
// empty the dedent stack, and finally, return the EndOfFile token.
self.consume_end()
}
}
fn eat_indentation(&mut self) -> Option<TokenKind> {
let mut indentation = Indentation::root();
loop {
match self.cursor.first() {
' ' => {
self.cursor.bump();
indentation = indentation.add_space();
}
'\t' => {
self.cursor.bump();
indentation = indentation.add_tab();
}
'\\' => {
self.cursor.bump();
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else if self.cursor.is_eof() {
return Some(self.push_error(LexicalError::new(
LexicalErrorType::Eof,
self.token_range(),
)));
} else if !self.cursor.eat_char('\n') {
return Some(self.push_error(LexicalError::new(
LexicalErrorType::LineContinuationError,
self.token_range(),
)));
}
indentation = Indentation::root();
}
// Form feed
'\x0C' => {
self.cursor.bump();
indentation = Indentation::root();
}
_ => break,
}
}
// Handle indentation if this is a new, non-blank logical line
if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
self.state = State::NonEmptyLogicalLine;
// The state change above ensures that indentation isn't handled again on the next call.
return self.handle_indentation(indentation);
}
None
}
fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
let token = match self.indentations.current().try_compare(indentation) {
// Dedent
Ok(Ordering::Greater) => {
self.pending_indentation = Some(indentation);
if self.indentations.dedent_one(indentation).is_err() {
return Some(self.push_error(LexicalError::new(
LexicalErrorType::IndentationError,
self.token_range(),
)));
};
// The lexer might've eaten some whitespaces to calculate the `indentation`. For
// example:
//
// ```py
// if first:
// if second:
// pass
// foo
// # ^
// ```
//
// Here, the cursor is at `^` and the `indentation` contains the whitespaces before
// the `pass` token.
self.cursor.start_token();
Some(TokenKind::Dedent)
}
Ok(Ordering::Equal) => None,
// Indent
Ok(Ordering::Less) => {
self.indentations.indent(indentation);
Some(TokenKind::Indent)
}
Err(_) => {
return Some(self.push_error(LexicalError::new(
LexicalErrorType::IndentationError,
self.token_range(),
)));
}
};
token
}
fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
loop {
match self.cursor.first() {
' ' => {
self.cursor.bump();
}
'\t' => {
self.cursor.bump();
}
'\\' => {
self.cursor.bump();
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else if self.cursor.is_eof() {
return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
} else if !self.cursor.eat_char('\n') {
return Err(LexicalError::new(
LexicalErrorType::LineContinuationError,
self.token_range(),
));
}
}
// Form feed
'\x0C' => {
self.cursor.bump();
}
_ => break,
}
}
Ok(())
}
// Dispatch based on the given character.
fn consume_ascii_character(&mut self, c: char) -> TokenKind {
let token = match c {
c if is_ascii_identifier_start(c) => self.lex_identifier(c),
'0'..='9' => self.lex_number(c),
'#' => return self.lex_comment(),
'\'' | '"' => self.lex_string(c),
'=' => {
if self.cursor.eat_char('=') {
TokenKind::EqEqual
} else {
self.state = State::AfterEqual;
return TokenKind::Equal;
}
}
'+' => {
if self.cursor.eat_char('=') {
TokenKind::PlusEqual
} else {
TokenKind::Plus
}
}
'*' => {
if self.cursor.eat_char('=') {
TokenKind::StarEqual
} else if self.cursor.eat_char('*') {
if self.cursor.eat_char('=') {
TokenKind::DoubleStarEqual
} else {
TokenKind::DoubleStar
}
} else {
TokenKind::Star
}
}
c @ ('%' | '!')
if self.mode == Mode::Ipython
&& self.state.is_after_equal()
&& self.nesting == 0 =>
{
// SAFETY: Safe because `c` has been matched against one of the possible escape command tokens
self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap())
}
c @ ('%' | '!' | '?' | '/' | ';' | ',')
if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
{
let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) {
self.cursor.bump();
kind
} else {
// SAFETY: Safe because `c` has been matched against one of the possible escape command tokens
IpyEscapeKind::try_from(c).unwrap()
};
self.lex_ipython_escape_command(kind)
}
'?' if self.mode == Mode::Ipython => TokenKind::Question,
'/' => {
if self.cursor.eat_char('=') {
TokenKind::SlashEqual
} else if self.cursor.eat_char('/') {
if self.cursor.eat_char('=') {
TokenKind::DoubleSlashEqual
} else {
TokenKind::DoubleSlash
}
} else {
TokenKind::Slash
}
}
'%' => {
if self.cursor.eat_char('=') {
TokenKind::PercentEqual
} else {
TokenKind::Percent
}
}
'|' => {
if self.cursor.eat_char('=') {
TokenKind::VbarEqual
} else {
TokenKind::Vbar
}
}
'^' => {
if self.cursor.eat_char('=') {
TokenKind::CircumflexEqual
} else {
TokenKind::CircumFlex
}
}
'&' => {
if self.cursor.eat_char('=') {
TokenKind::AmperEqual
} else {
TokenKind::Amper
}
}
'-' => {
if self.cursor.eat_char('=') {
TokenKind::MinusEqual
} else if self.cursor.eat_char('>') {
TokenKind::Rarrow
} else {
TokenKind::Minus
}
}
'@' => {
if self.cursor.eat_char('=') {
TokenKind::AtEqual
} else {
TokenKind::At
}
}
'!' => {
if self.cursor.eat_char('=') {
TokenKind::NotEqual
} else {
TokenKind::Exclamation
}
}
'~' => TokenKind::Tilde,
'(' => {
self.nesting += 1;
TokenKind::Lpar
}
')' => {
self.nesting = self.nesting.saturating_sub(1);
TokenKind::Rpar
}
'[' => {
self.nesting += 1;
TokenKind::Lsqb
}
']' => {
self.nesting = self.nesting.saturating_sub(1);
TokenKind::Rsqb
}
'{' => {
self.nesting += 1;
TokenKind::Lbrace
}
'}' => {
if let Some(fstring) = self.fstrings.current_mut() {
if fstring.nesting() == self.nesting {
return self.push_error(LexicalError::new(
LexicalErrorType::FStringError(FStringErrorType::SingleRbrace),
self.token_range(),
));
}
fstring.try_end_format_spec(self.nesting);
}
self.nesting = self.nesting.saturating_sub(1);
TokenKind::Rbrace
}
':' => {
if self
.fstrings
.current_mut()
.is_some_and(|fstring| fstring.try_start_format_spec(self.nesting))
{
TokenKind::Colon
} else if self.cursor.eat_char('=') {
TokenKind::ColonEqual
} else {
TokenKind::Colon
}
}
';' => TokenKind::Semi,
'<' => {
if self.cursor.eat_char('<') {
if self.cursor.eat_char('=') {
TokenKind::LeftShiftEqual
} else {
TokenKind::LeftShift
}
} else if self.cursor.eat_char('=') {
TokenKind::LessEqual
} else {
TokenKind::Less
}
}
'>' => {
if self.cursor.eat_char('>') {
if self.cursor.eat_char('=') {
TokenKind::RightShiftEqual
} else {
TokenKind::RightShift
}
} else if self.cursor.eat_char('=') {
TokenKind::GreaterEqual
} else {
TokenKind::Greater
}
}
',' => TokenKind::Comma,
'.' => {
if self.cursor.first().is_ascii_digit() {
self.lex_decimal_number('.')
} else if self.cursor.eat_char2('.', '.') {
TokenKind::Ellipsis
} else {
TokenKind::Dot
}
}
'\n' => {
return if self.nesting == 0 && !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
TokenKind::Newline
} else {
if let Some(fstring) = self.fstrings.current_mut() {
fstring.try_end_format_spec(self.nesting);
}
TokenKind::NonLogicalNewline
}
}
'\r' => {
self.cursor.eat_char('\n');
return if self.nesting == 0 && !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
TokenKind::Newline
} else {
if let Some(fstring) = self.fstrings.current_mut() {
fstring.try_end_format_spec(self.nesting);
}
TokenKind::NonLogicalNewline
};
}
_ => {
self.state = State::Other;
return self.push_error(LexicalError::new(
LexicalErrorType::UnrecognizedToken { tok: c },
self.token_range(),
));
}
};
self.state = State::Other;
token
}
/// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
fn lex_identifier(&mut self, first: char) -> TokenKind {
// Detect potential string like rb'' b'' f'' u'' r''
let quote = match (first, self.cursor.first()) {
(_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| {
self.cursor.bump();
quote
}),
(_, second) if is_quote(self.cursor.second()) => {
self.try_double_char_prefix([first, second]).then(|| {
self.cursor.bump();
// SAFETY: Safe because of the `is_quote` check in this match arm's guard
self.cursor.bump().unwrap()
})
}
_ => None,
};
if let Some(quote) = quote {
if self.current_flags.is_f_string() {
return self.lex_fstring_start(quote);
}
return self.lex_string(quote);
}
// Keep track of whether the identifier is ASCII-only or not.
//
// This is important because Python applies NFKC normalization to
// identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
// We therefore need to do the same in our lexer, but applying NFKC normalization
// unconditionally is extremely expensive. If we know an identifier is ASCII-only,
// (by far the most common case), we can skip NFKC normalization of the identifier.
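// Illustrative example: `𝒞` (U+1D49E, MATHEMATICAL SCRIPT CAPITAL C) NFKC-normalizes to
// `C`, so `𝒞 = 500` lexes to the same `Name("C")` value as `C = 500` (see
// `test_nfkc_normalization` in the tests below).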
let mut is_ascii = first.is_ascii();
self.cursor
.eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
let text = self.token_text();
if !is_ascii {
self.current_value = TokenValue::Name(text.nfkc().collect::<String>().into_boxed_str());
return TokenKind::Name;
}
match text {
"False" => TokenKind::False,
"None" => TokenKind::None,
"True" => TokenKind::True,
"and" => TokenKind::And,
"as" => TokenKind::As,
"assert" => TokenKind::Assert,
"async" => TokenKind::Async,
"await" => TokenKind::Await,
"break" => TokenKind::Break,
"case" => TokenKind::Case,
"class" => TokenKind::Class,
"continue" => TokenKind::Continue,
"def" => TokenKind::Def,
"del" => TokenKind::Del,
"elif" => TokenKind::Elif,
"else" => TokenKind::Else,
"except" => TokenKind::Except,
"finally" => TokenKind::Finally,
"for" => TokenKind::For,
"from" => TokenKind::From,
"global" => TokenKind::Global,
"if" => TokenKind::If,
"import" => TokenKind::Import,
"in" => TokenKind::In,
"is" => TokenKind::Is,
"lambda" => TokenKind::Lambda,
"match" => TokenKind::Match,
"nonlocal" => TokenKind::Nonlocal,
"not" => TokenKind::Not,
"or" => TokenKind::Or,
"pass" => TokenKind::Pass,
"raise" => TokenKind::Raise,
"return" => TokenKind::Return,
"try" => TokenKind::Try,
"type" => TokenKind::Type,
"while" => TokenKind::While,
"with" => TokenKind::With,
"yield" => TokenKind::Yield,
_ => {
self.current_value = TokenValue::Name(text.to_string().into_boxed_str());
TokenKind::Name
}
}
}
/// Try lexing the single character string prefix, updating the token flags accordingly.
/// Returns `true` if it matches.
fn try_single_char_prefix(&mut self, first: char) -> bool {
match first {
'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE,
_ => return false,
}
true
}
/// Try lexing the double character string prefix, updating the token flags accordingly.
/// Returns `true` if it matches.
fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
match value {
['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE;
}
['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
}
['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
}
['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE;
}
_ => return false,
}
true
}
/// Lex an f-string start token.
fn lex_fstring_start(&mut self, quote: char) -> TokenKind {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), quote);
if quote == '"' {
self.current_flags |= TokenFlags::DOUBLE_QUOTES;
}
if self.cursor.eat_char2(quote, quote) {
self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
}
self.fstrings
.push(FStringContext::new(self.current_flags, self.nesting));
TokenKind::FStringStart
}
/// Lex an f-string middle or end token.
fn lex_fstring_middle_or_end(&mut self) -> Option<TokenKind> {
// SAFETY: Safe because the function is only called when `self.fstrings` is not empty.
let fstring = self.fstrings.current().unwrap();
// Check if we're at the end of the f-string.
if fstring.is_triple_quoted() {
let quote_char = fstring.quote_char();
if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
self.current_flags = fstring.flags();
return Some(TokenKind::FStringEnd);
}
} else if self.cursor.eat_char(fstring.quote_char()) {
self.current_flags = fstring.flags();
return Some(TokenKind::FStringEnd);
}
// We have to decode `{{` and `}}` into `{` and `}` respectively. As an
// optimization, we only allocate a new string if we find any escaped curly braces;
// otherwise, this string remains empty and we use a source slice instead.
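// For example (illustrative): in `f"a{{b}}c"`, the entire `a{{b}}c` run becomes a single
// `FStringMiddle` token whose normalized value is `a{b}c`.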
let mut normalized = String::new();
// Tracks the last offset of token value that has been written to `normalized`.
let mut last_offset = self.offset();
// This isn't going to change for the duration of the loop.
let in_format_spec = fstring.is_in_format_spec(self.nesting);
let mut in_named_unicode = false;
loop {
match self.cursor.first() {
// The condition is to differentiate between the `NUL` (`\0`) character
// in the source code and the one returned by `self.cursor.first()` when
// we reach the end of the source code.
EOF_CHAR if self.cursor.is_eof() => {
let error = if fstring.is_triple_quoted() {
FStringErrorType::UnterminatedTripleQuotedString
} else {
FStringErrorType::UnterminatedString
};
self.fstrings.pop();
return Some(self.push_error(LexicalError::new(
LexicalErrorType::FStringError(error),
self.token_range(),
)));
}
'\n' | '\r' if !fstring.is_triple_quoted() => {
// If we encounter a newline while we're in a format spec, then
// we stop here and let the lexer emit the newline token.
//
// Relevant discussion: https://github.com/python/cpython/issues/110259
if in_format_spec {
break;
}
self.fstrings.pop();
return Some(self.push_error(LexicalError::new(
LexicalErrorType::FStringError(FStringErrorType::UnterminatedString),
self.token_range(),
)));
}
'\\' => {
self.cursor.bump(); // '\'
if matches!(self.cursor.first(), '{' | '}') {
// Don't consume `{` or `}` as we want them to be emitted as tokens.
// They will be handled in the next iteration.
continue;
} else if !fstring.is_raw_string() {
if self.cursor.eat_char2('N', '{') {
in_named_unicode = true;
continue;
}
}
// Consume the escaped character.
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else {
self.cursor.bump();
}
}
quote @ ('\'' | '"') if quote == fstring.quote_char() => {
if let Some(triple_quotes) = fstring.triple_quotes() {
if self.cursor.rest().starts_with(triple_quotes) {
break;
}
self.cursor.bump();
} else {
break;
}
}
'{' => {
if self.cursor.second() == '{' && !in_format_spec {
self.cursor.bump();
normalized
.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
self.cursor.bump(); // Skip the second `{`
last_offset = self.offset();
} else {
break;
}
}
'}' => {
if in_named_unicode {
in_named_unicode = false;
self.cursor.bump();
} else if self.cursor.second() == '}' && !in_format_spec {
self.cursor.bump();
normalized
.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
self.cursor.bump(); // Skip the second `}`
last_offset = self.offset();
} else {
break;
}
}
_ => {
self.cursor.bump();
}
}
}
let range = self.token_range();
if range.is_empty() {
return None;
}
let value = if normalized.is_empty() {
self.source[range].to_string()
} else {
normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
normalized
};
self.current_value = TokenValue::FStringMiddle(value.into_boxed_str());
self.current_flags = fstring.flags();
Some(TokenKind::FStringMiddle)
}
/// Lex a string literal.
fn lex_string(&mut self, quote: char) -> TokenKind {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), quote);
if quote == '"' {
self.current_flags |= TokenFlags::DOUBLE_QUOTES;
}
// If the next two characters are also the quote character, then we have a triple-quoted
// string; consume those two characters and ensure that we require a triple-quote to close
if self.cursor.eat_char2(quote, quote) {
self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
}
let value_start = self.offset();
let quote_byte = u8::try_from(quote).expect("char that fits in u8");
let value_end = if self.current_flags.is_triple_quoted() {
// For triple-quoted strings, scan until we find the closing quote (ignoring escaped
// quotes) or the end of the file.
loop {
let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
self.cursor.skip_to_end();
return self.push_error(LexicalError::new(
LexicalErrorType::UnclosedStringError,
self.token_range(),
));
};
// Rare case: if there are an odd number of backslashes before the quote, then
// the quote is escaped and we should continue scanning.
let num_backslashes = self.cursor.rest().as_bytes()[..index]
.iter()
.rev()
.take_while(|&&c| c == b'\\')
.count();
// Advance the cursor past the quote and continue scanning.
self.cursor.skip_bytes(index + 1);
// If the character is escaped, continue scanning.
if num_backslashes % 2 == 1 {
continue;
}
// Otherwise, if it's followed by two more quotes, then we're done.
if self.cursor.eat_char2(quote, quote) {
break self.offset() - TextSize::new(3);
}
}
} else {
// For non-triple-quoted strings, scan until we find the closing quote, but end early
// if we encounter a newline or the end of the file.
loop {
let Some(index) =
memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
else {
self.cursor.skip_to_end();
return self.push_error(LexicalError::new(
LexicalErrorType::StringError,
self.token_range(),
));
};
// Rare case: if there are an odd number of backslashes before the quote, then
// the quote is escaped and we should continue scanning.
let num_backslashes = self.cursor.rest().as_bytes()[..index]
.iter()
.rev()
.take_while(|&&c| c == b'\\')
.count();
// Skip up to the current character.
self.cursor.skip_bytes(index);
let ch = self.cursor.bump();
// If the character is escaped, continue scanning.
if num_backslashes % 2 == 1 {
if ch == Some('\r') {
self.cursor.eat_char('\n');
}
continue;
}
match ch {
Some('\r' | '\n') => {
return self.push_error(LexicalError::new(
LexicalErrorType::UnclosedStringError,
self.token_range(),
));
}
Some(ch) if ch == quote => {
break self.offset() - TextSize::new(1);
}
_ => unreachable!("memchr3 returned an index that is not a quote or a newline"),
}
}
};
self.current_value = TokenValue::String(
self.source[TextRange::new(value_start, value_end)]
.to_string()
.into_boxed_str(),
);
TokenKind::String
}
/// Numeric lexing. The feast can start!
fn lex_number(&mut self, first: char) -> TokenKind {
if first == '0' {
if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
self.lex_number_radix(Radix::Hex)
} else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
self.lex_number_radix(Radix::Octal)
} else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
self.lex_number_radix(Radix::Binary)
} else {
self.lex_decimal_number(first)
}
} else {
self.lex_decimal_number(first)
}
}
/// Lex a hex/octal/decimal/binary number without a decimal point.
fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
#[cfg(debug_assertions)]
debug_assert!(matches!(
self.cursor.previous().to_ascii_lowercase(),
'x' | 'o' | 'b'
));
// Lex the portion of the token after the base prefix (e.g., `9D5` in `0x9D5`).
let mut number = LexedText::new(self.offset(), self.source);
self.radix_run(&mut number, radix);
// Extract the entire number, including the base prefix (e.g., `0x9D5`).
let token = &self.source[self.token_range()];
let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
Ok(int) => int,
Err(err) => {
return self.push_error(LexicalError::new(
LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
self.token_range(),
));
}
};
self.current_value = TokenValue::Int(value);
TokenKind::Int
}
/// Lex a decimal number, that is, a number without a hex, octal, or binary prefix.
fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
#[cfg(debug_assertions)]
debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
let start_is_zero = first_digit_or_dot == '0';
let mut number = LexedText::new(self.token_start(), self.source);
if first_digit_or_dot != '.' {
number.push(first_digit_or_dot);
self.radix_run(&mut number, Radix::Decimal);
};
let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
number.push('.');
if self.cursor.eat_char('_') {
return self.push_error(LexicalError::new(
LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()),
TextRange::new(self.offset() - TextSize::new(1), self.offset()),
));
}
self.radix_run(&mut number, Radix::Decimal);
true
} else {
// Normal number:
false
};
let is_float = match self.cursor.rest().as_bytes() {
[b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
// 'e' | 'E'
number.push(self.cursor.bump().unwrap());
if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
number.push(sign);
}
self.radix_run(&mut number, Radix::Decimal);
true
}
_ => is_float,
};
if is_float {
// Improvement: Use `Cow` instead of pushing to value text
let Ok(value) = f64::from_str(number.as_str()) else {
return self.push_error(LexicalError::new(
LexicalErrorType::OtherError(
"Invalid decimal literal".to_string().into_boxed_str(),
),
self.token_range(),
));
};
// Parse trailing 'j':
if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
self.current_value = TokenValue::Complex {
real: 0.0,
imag: value,
};
TokenKind::Complex
} else {
self.current_value = TokenValue::Float(value);
TokenKind::Float
}
} else {
// Parse trailing 'j':
if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
let imag = f64::from_str(number.as_str()).unwrap();
self.current_value = TokenValue::Complex { real: 0.0, imag };
TokenKind::Complex
} else {
let value = match Int::from_str(number.as_str()) {
Ok(value) => {
if start_is_zero && value.as_u8() != Some(0) {
// Leading zeros in decimal integer literals are not permitted.
return self.push_error(LexicalError::new(
LexicalErrorType::OtherError(
"Invalid decimal integer literal"
.to_string()
.into_boxed_str(),
),
self.token_range(),
));
}
value
}
Err(err) => {
return self.push_error(LexicalError::new(
LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
self.token_range(),
))
}
};
self.current_value = TokenValue::Int(value);
TokenKind::Int
}
}
}
/// Consume a sequence of digits with the given radix. The digits may be
/// separated by underscores, like this: '`1_2_3_4`' == '1234'.
fn radix_run(&mut self, number: &mut LexedText, radix: Radix) {
loop {
if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
number.push(c);
}
// Number that contains `_` separators. Remove them from the parsed text.
else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
// Skip over `_`
self.cursor.bump();
number.skip_char();
} else {
break;
}
}
}
/// Lex a single comment.
fn lex_comment(&mut self) -> TokenKind {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), '#');
let bytes = self.cursor.rest().as_bytes();
let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len());
self.cursor.skip_bytes(offset);
TokenKind::Comment
}
/// Lex a single IPython escape command.
fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> TokenKind {
let mut value = String::new();
loop {
match self.cursor.first() {
'\\' => {
// Only skip the line continuation if it is followed by a newline
// otherwise it is a normal backslash which is part of the magic command:
//
// Skip this backslash
// v
// !pwd \
// && ls -a | sed 's/^/\\ /'
// ^^
// Don't skip these backslashes
if self.cursor.second() == '\r' {
self.cursor.bump();
self.cursor.bump();
self.cursor.eat_char('\n');
continue;
} else if self.cursor.second() == '\n' {
self.cursor.bump();
self.cursor.bump();
continue;
}
self.cursor.bump();
value.push('\\');
}
// Help end escape commands are those that end with 1 or 2 question marks.
// Here, we're only looking for a subset of help end escape commands, namely
// the ones that have the escape token at the start of the line as well.
// On the other hand, we're not looking for help end escape commands that
// are strict in the sense that the escape token is only at the end. For example,
//
// * `%foo?` is recognized as a help end escape command but not as a strict one.
// * `foo?` is recognized as a strict help end escape command which is not
// lexed here but is identified at the parser level.
//
// Help end escape commands implemented in the IPython codebase using regex:
// https://github.com/ipython/ipython/blob/292e3a23459ca965b8c1bfe2c3707044c510209a/IPython/core/inputtransformer2.py#L454-L462
'?' => {
self.cursor.bump();
let mut question_count = 1u32;
while self.cursor.eat_char('?') {
question_count += 1;
}
// The original implementation in the IPython codebase is regex-based, which
// means it's strict in the sense that it won't recognize a help end escape:
// * If there's any whitespace before the escape token (e.g. `%foo ?`)
// * If there are more than 2 question mark tokens (e.g. `%foo???`)
// We follow the same rules here. In those cases, we continue lexing with
// the prefixed escape token.
//
// Now, the whitespace and empty value check also makes sure that an empty
// command (e.g. `%?` or `? ??`, no value after/between the escape tokens)
// is not recognized as a help end escape command. So, `%?` and `? ??` are
// `IpyEscapeKind::Magic` and `IpyEscapeKind::Help` because of the initial `%` and `??`
// tokens.
if question_count > 2
|| value.chars().last().map_or(true, is_python_whitespace)
|| !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
{
// Not a help end escape command, so continue with the lexing.
value.reserve(question_count as usize);
for _ in 0..question_count {
value.push('?');
}
continue;
}
if escape_kind.is_help() {
// If we've recognized this as a help end escape command, then
// any question mark tokens / whitespace at the start are not
// considered part of the value.
//
// For example, `??foo?` is recognized as `IpyEscapeKind::Help` and
// `value` is `foo` instead of `??foo`.
value = value.trim_start_matches([' ', '?']).to_string();
} else if escape_kind.is_magic() {
// Between `%` and `?` (at the end), the `?` takes priority
// over the `%` so `%foo?` is recognized as `IpyEscapeKind::Help`
// and `value` is `%foo` instead of `foo`. So, we need to
// insert the magic escape token at the start.
value.insert_str(0, escape_kind.as_str());
}
let kind = match question_count {
1 => IpyEscapeKind::Help,
2 => IpyEscapeKind::Help2,
_ => unreachable!("`question_count` is always 1 or 2"),
};
self.current_value = TokenValue::IpyEscapeCommand {
kind,
value: value.into_boxed_str(),
};
return TokenKind::IpyEscapeCommand;
}
'\n' | '\r' | EOF_CHAR => {
self.current_value = TokenValue::IpyEscapeCommand {
kind: escape_kind,
value: value.into_boxed_str(),
};
return TokenKind::IpyEscapeCommand;
}
c => {
self.cursor.bump();
value.push(c);
}
}
}
}
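/// Called once the cursor is at the end of the file.
///
/// Illustrative (an example based on the logic below, not from the original docs): for
/// `if x:\n    pass` with no trailing newline, repeated `next_token` calls end with
/// `Newline`, `Dedent`, `EndOfFile`.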
fn consume_end(&mut self) -> TokenKind {
// We reached the end of the file.
// First, all parenthesized nesting must have been closed.
if self.nesting > 0 {
// Reset the nesting to avoid going into an infinite loop.
self.nesting = 0;
return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
}
// Next, insert a trailing newline, if required.
if !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
TokenKind::Newline
}
// Next, flush the indentation stack to zero.
else if self.indentations.dedent().is_some() {
TokenKind::Dedent
} else {
TokenKind::EndOfFile
}
}
#[inline]
fn token_range(&self) -> TextRange {
let end = self.offset();
let len = self.cursor.token_len();
TextRange::at(end - len, len)
}
#[inline]
fn token_text(&self) -> &'src str {
&self.source[self.token_range()]
}
/// Retrieves the current offset of the cursor within the source code.
// SAFETY: Lexer doesn't allow files larger than 4GB
#[allow(clippy::cast_possible_truncation)]
#[inline]
fn offset(&self) -> TextSize {
TextSize::new(self.source.len() as u32) - self.cursor.text_len()
}
#[inline]
fn token_start(&self) -> TextSize {
self.token_range().start()
}
/// Helper function to push the given error and return the [`TokenKind::Unknown`] token.
fn push_error(&mut self, error: LexicalError) -> TokenKind {
self.errors.push(error);
TokenKind::Unknown
}
/// Creates a checkpoint to which the lexer can later return using [`Self::rewind`].
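///
/// A sketch of the speculative-lexing pattern this supports (crate-internal API, so
/// shown as an ignored example rather than a doctest):
///
/// ```ignore
/// let checkpoint = lexer.checkpoint();
/// if lexer.next_token() != TokenKind::Lpar {
///     // Undo the lookahead: the token, cursor, indentation, and error state are restored.
///     lexer.rewind(checkpoint);
/// }
/// ```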
pub(crate) fn checkpoint(&self) -> LexerCheckpoint<'src> {
LexerCheckpoint {
value: self.current_value.clone(),
current_kind: self.current_kind,
current_range: self.current_range,
current_flags: self.current_flags,
cursor: self.cursor.clone(),
state: self.state,
nesting: self.nesting,
indentations_checkpoint: self.indentations.checkpoint(),
pending_indentation: self.pending_indentation,
fstrings_checkpoint: self.fstrings.checkpoint(),
errors_position: self.errors.len(),
}
}
/// Restore the lexer to the given checkpoint.
pub(crate) fn rewind(&mut self, checkpoint: LexerCheckpoint<'src>) {
let LexerCheckpoint {
value,
current_kind,
current_range,
current_flags,
cursor,
state,
nesting,
indentations_checkpoint,
pending_indentation,
fstrings_checkpoint,
errors_position,
} = checkpoint;
self.current_value = value;
self.current_kind = current_kind;
self.current_range = current_range;
self.current_flags = current_flags;
self.cursor = cursor;
self.state = state;
self.nesting = nesting;
self.indentations.rewind(indentations_checkpoint);
self.pending_indentation = pending_indentation;
self.fstrings.rewind(fstrings_checkpoint);
self.errors.truncate(errors_position);
}
pub fn finish(self) -> Vec<LexicalError> {
self.errors
}
}
bitflags! {
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct TokenFlags: u8 {
/// The token is a string with double quotes (`"`).
const DOUBLE_QUOTES = 1 << 0;
/// The token is a triple-quoted string i.e., it starts and ends with three consecutive
/// quote characters (`"""` or `'''`).
const TRIPLE_QUOTED_STRING = 1 << 1;
/// The token is a unicode string i.e., prefixed with `u` or `U`
const UNICODE_STRING = 1 << 2;
/// The token is a byte string i.e., prefixed with `b` or `B`
const BYTE_STRING = 1 << 3;
/// The token is an f-string i.e., prefixed with `f` or `F`
const F_STRING = 1 << 4;
/// The token is a raw string and the prefix character is in lowercase.
const RAW_STRING_LOWERCASE = 1 << 5;
/// The token is a raw string and the prefix character is in uppercase.
const RAW_STRING_UPPERCASE = 1 << 6;
/// The token is a raw string i.e., prefixed with `r` or `R`
const RAW_STRING = Self::RAW_STRING_LOWERCASE.bits() | Self::RAW_STRING_UPPERCASE.bits();
}
}
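// Illustrative examples of how prefixes and quotes map onto these flags: the token for
// `Rb"raw bytes"` carries `BYTE_STRING | RAW_STRING_UPPERCASE | DOUBLE_QUOTES`, while the
// start token of `f'''…'''` carries `F_STRING | TRIPLE_QUOTED_STRING`.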
impl StringFlags for TokenFlags {
fn quote_style(self) -> Quote {
if self.intersects(TokenFlags::DOUBLE_QUOTES) {
Quote::Double
} else {
Quote::Single
}
}
fn is_triple_quoted(self) -> bool {
self.intersects(TokenFlags::TRIPLE_QUOTED_STRING)
}
fn prefix(self) -> AnyStringPrefix {
if self.intersects(TokenFlags::F_STRING) {
if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
AnyStringPrefix::Format(FStringPrefix::Raw { uppercase_r: false })
} else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
AnyStringPrefix::Format(FStringPrefix::Raw { uppercase_r: true })
} else {
AnyStringPrefix::Format(FStringPrefix::Regular)
}
} else if self.intersects(TokenFlags::BYTE_STRING) {
if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
AnyStringPrefix::Bytes(ByteStringPrefix::Raw { uppercase_r: false })
} else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
AnyStringPrefix::Bytes(ByteStringPrefix::Raw { uppercase_r: true })
} else {
AnyStringPrefix::Bytes(ByteStringPrefix::Regular)
}
} else if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false })
} else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: true })
} else if self.intersects(TokenFlags::UNICODE_STRING) {
AnyStringPrefix::Regular(StringLiteralPrefix::Unicode)
} else {
AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
}
}
}
impl TokenFlags {
/// Returns `true` if the token is an f-string.
const fn is_f_string(self) -> bool {
self.intersects(TokenFlags::F_STRING)
}
/// Returns `true` if the token is a raw string.
const fn is_raw_string(self) -> bool {
self.intersects(TokenFlags::RAW_STRING)
}
pub(crate) fn as_any_string_flags(self) -> AnyStringFlags {
AnyStringFlags::new(self.prefix(), self.quote_style(), self.is_triple_quoted())
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token {
/// The kind of the token.
kind: TokenKind,
/// The range of the token.
range: TextRange,
/// The set of flags describing this token.
flags: TokenFlags,
}
impl Token {
pub(crate) fn new(kind: TokenKind, range: TextRange, flags: TokenFlags) -> Token {
Self { kind, range, flags }
}
/// Returns the token kind.
#[inline]
pub const fn kind(&self) -> TokenKind {
self.kind
}
/// Returns the token as a tuple of (kind, range).
#[inline]
pub const fn as_tuple(&self) -> (TokenKind, TextRange) {
(self.kind, self.range)
}
/// Returns `true` if this is a trivia token.
#[inline]
pub const fn is_trivia(self) -> bool {
matches!(self.kind, TokenKind::Comment | TokenKind::NonLogicalNewline)
}
/// Returns `true` if this is any kind of string token.
const fn is_any_string(self) -> bool {
matches!(
self.kind,
TokenKind::String
| TokenKind::FStringStart
| TokenKind::FStringMiddle
| TokenKind::FStringEnd
)
}
/// Returns `true` if the current token is a triple-quoted string of any kind.
///
/// # Panics
///
/// If the current token isn't a string or any of the f-string tokens.
pub fn is_triple_quoted_string(self) -> bool {
assert!(self.is_any_string());
self.flags.is_triple_quoted()
}
/// Returns the [`Quote`] style for the current string token of any kind.
///
/// # Panics
///
/// If the current token isn't a string or any of the f-string tokens.
pub fn string_quote_style(self) -> Quote {
assert!(self.is_any_string());
self.flags.quote_style()
}
}
impl Ranged for Token {
fn range(&self) -> TextRange {
self.range
}
}
/// Represents an error that occurs during lexing and is
/// returned by the `parse_*` functions in the
/// [lexer] implementation.
///
/// [lexer]: crate::lexer
#[derive(Debug, Clone, PartialEq)]
pub struct LexicalError {
/// The type of error that occurred.
error: LexicalErrorType,
/// The location of the error.
location: TextRange,
}
impl LexicalError {
/// Creates a new `LexicalError` with the given error type and location.
pub fn new(error: LexicalErrorType, location: TextRange) -> Self {
Self { error, location }
}
pub fn error(&self) -> &LexicalErrorType {
&self.error
}
pub fn into_error(self) -> LexicalErrorType {
self.error
}
pub fn location(&self) -> TextRange {
self.location
}
}
impl std::ops::Deref for LexicalError {
type Target = LexicalErrorType;
fn deref(&self) -> &Self::Target {
self.error()
}
}
impl std::error::Error for LexicalError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
Some(self.error())
}
}
impl std::fmt::Display for LexicalError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(
f,
"{} at byte offset {}",
self.error(),
u32::from(self.location().start())
)
}
}
/// Represents the different types of errors that can occur during lexing.
#[derive(Debug, Clone, PartialEq)]
pub enum LexicalErrorType {
// TODO: Can probably be removed, the places it is used seem to be able
// to use the `UnicodeError` variant instead.
#[doc(hidden)]
StringError,
/// A string literal without the closing quote.
UnclosedStringError,
/// Decoding of a unicode escape sequence in a string literal failed.
UnicodeError,
/// Missing the `{` for unicode escape sequence.
MissingUnicodeLbrace,
/// Missing the `}` for unicode escape sequence.
MissingUnicodeRbrace,
/// The indentation is not consistent.
IndentationError,
/// An unrecognized token was encountered.
UnrecognizedToken { tok: char },
/// An f-string error containing the [`FStringErrorType`].
FStringError(FStringErrorType),
/// Invalid character encountered in a byte literal.
InvalidByteLiteral,
/// An unexpected character was encountered after a line continuation.
LineContinuationError,
/// An unexpected end of file was encountered.
Eof,
/// An unexpected error occurred.
OtherError(Box<str>),
}
impl std::error::Error for LexicalErrorType {}
impl std::fmt::Display for LexicalErrorType {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
LexicalErrorType::StringError => write!(f, "Got unexpected string"),
LexicalErrorType::FStringError(error) => write!(f, "f-string: {error}"),
LexicalErrorType::InvalidByteLiteral => {
write!(f, "bytes can only contain ASCII literal characters")
}
LexicalErrorType::UnicodeError => write!(f, "Got unexpected unicode"),
LexicalErrorType::IndentationError => {
write!(f, "unindent does not match any outer indentation level")
}
LexicalErrorType::UnrecognizedToken { tok } => {
write!(f, "Got unexpected token {tok}")
}
LexicalErrorType::LineContinuationError => {
write!(f, "unexpected character after line continuation character")
}
LexicalErrorType::Eof => write!(f, "unexpected EOF while parsing"),
LexicalErrorType::OtherError(msg) => write!(f, "{msg}"),
LexicalErrorType::UnclosedStringError => {
write!(f, "missing closing quote in string literal")
}
LexicalErrorType::MissingUnicodeLbrace => {
write!(f, "Missing `{{` in Unicode escape sequence")
}
LexicalErrorType::MissingUnicodeRbrace => {
write!(f, "Missing `}}` in Unicode escape sequence")
}
}
}
}
#[derive(Clone, Debug, Default)]
pub(crate) enum TokenValue {
#[default]
None,
/// Token value for a name, commonly known as an identifier.
///
/// Unicode names are NFKC-normalized by the lexer,
/// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers)
Name(Box<str>),
/// Token value for an integer.
Int(Int),
/// Token value for a floating point number.
Float(f64),
/// Token value for a complex number.
Complex {
/// The real part of the complex number.
real: f64,
/// The imaginary part of the complex number.
imag: f64,
},
/// Token value for a string.
String(Box<str>),
/// Token value that includes the portion of text inside the f-string that's not
/// part of the expression part and isn't an opening or closing brace.
FStringMiddle(Box<str>),
/// Token value for IPython escape commands. These are recognized by the lexer
/// only when the mode is [`Mode::Ipython`].
IpyEscapeCommand {
/// The magic command value.
value: Box<str>,
/// The kind of magic command.
kind: IpyEscapeKind,
},
}
pub(crate) struct LexerCheckpoint<'src> {
value: TokenValue,
current_kind: TokenKind,
current_range: TextRange,
current_flags: TokenFlags,
cursor: Cursor<'src>,
state: State,
nesting: u32,
indentations_checkpoint: IndentationsCheckpoint,
pending_indentation: Option<Indentation>,
fstrings_checkpoint: FStringsCheckpoint,
errors_position: usize,
}
#[derive(Copy, Clone, Debug)]
enum State {
/// Lexer is right at the beginning of the file or after a `Newline` token.
AfterNewline,
/// The lexer is at the start of a new logical line but **after** the indentation
NonEmptyLogicalLine,
/// Lexer is right after an equal token
AfterEqual,
/// Inside of a logical line
Other,
}
impl State {
const fn is_after_newline(self) -> bool {
matches!(self, State::AfterNewline)
}
const fn is_new_logical_line(self) -> bool {
matches!(self, State::AfterNewline | State::NonEmptyLogicalLine)
}
const fn is_after_equal(self) -> bool {
matches!(self, State::AfterEqual)
}
}
#[derive(Copy, Clone, Debug)]
enum Radix {
Binary,
Octal,
Decimal,
Hex,
}
impl Radix {
const fn as_u32(self) -> u32 {
match self {
Radix::Binary => 2,
Radix::Octal => 8,
Radix::Decimal => 10,
Radix::Hex => 16,
}
}
const fn is_digit(self, c: char) -> bool {
match self {
Radix::Binary => matches!(c, '0'..='1'),
Radix::Octal => matches!(c, '0'..='7'),
Radix::Decimal => c.is_ascii_digit(),
Radix::Hex => c.is_ascii_hexdigit(),
}
}
}
const fn is_quote(c: char) -> bool {
matches!(c, '\'' | '"')
}
const fn is_ascii_identifier_start(c: char) -> bool {
matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
// Checks if the character c is a valid starting character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_unicode_identifier_start(c: char) -> bool {
is_xid_start(c)
}
/// Checks if the character c is a valid continuation character as described
/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
///
/// Additionally, this function keeps track of whether the identifier as a whole is
/// ASCII-only by mutably updating the boolean reference passed in.
fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
// Arrange things such that ASCII codepoints never
// result in the slower `is_xid_continue` getting called.
if c.is_ascii() {
matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
} else {
*identifier_is_ascii_only = false;
is_xid_continue(c)
}
}
/// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
/// characters.
///
/// This is the same as `ruff_python_trivia::is_python_whitespace` and is copied
/// here to avoid a circular dependency as `ruff_python_trivia` has a dev-dependency
/// on `ruff_python_lexer`.
const fn is_python_whitespace(c: char) -> bool {
matches!(
c,
// Space, tab, or form-feed
' ' | '\t' | '\x0C'
)
}
enum LexedText<'a> {
Source { source: &'a str, range: TextRange },
Owned(String),
}
impl<'a> LexedText<'a> {
fn new(start: TextSize, source: &'a str) -> Self {
Self::Source {
range: TextRange::empty(start),
source,
}
}
fn push(&mut self, c: char) {
match self {
LexedText::Source { range, source } => {
*range = range.add_end(c.text_len());
debug_assert!(source[*range].ends_with(c));
}
LexedText::Owned(owned) => owned.push(c),
}
}
fn as_str<'b>(&'b self) -> &'b str
where
'b: 'a,
{
match self {
LexedText::Source { range, source } => &source[*range],
LexedText::Owned(owned) => owned,
}
}
fn skip_char(&mut self) {
match self {
LexedText::Source { range, source } => {
*self = LexedText::Owned(source[*range].to_string());
}
LexedText::Owned(_) => {}
}
}
}
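// Illustrative: while lexing `1_000`, the `LexedText` stays in the borrowed `Source`
// variant until the first `_` triggers `skip_char`, at which point it switches to
// `Owned("1")` and the remaining digits are pushed to yield `"1000"`.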
/// Create a new [`Lexer`] for the given source code and [`Mode`].
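///
/// A usage sketch of this convenience constructor (ignored example; whether these items
/// are re-exported from the crate root is not shown here):
///
/// ```ignore
/// let mut lexer = lex("def f(): pass", Mode::Module);
/// assert_eq!(lexer.next_token(), TokenKind::Def);
/// ```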
pub fn lex(source: &str, mode: Mode) -> Lexer {
Lexer::new(source, mode, TextSize::default())
}
#[cfg(test)]
mod tests {
use std::fmt::Write;
use insta::assert_snapshot;
use super::*;
const WINDOWS_EOL: &str = "\r\n";
const MAC_EOL: &str = "\r";
const UNIX_EOL: &str = "\n";
/// Same as [`Token`] except that this includes the [`TokenValue`] as well.
struct TestToken {
kind: TokenKind,
value: TokenValue,
range: TextRange,
flags: TokenFlags,
}
impl std::fmt::Debug for TestToken {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut tuple = f.debug_tuple("");
let mut tuple = if matches!(self.value, TokenValue::None) {
tuple.field(&self.kind)
} else {
tuple.field(&self.value)
};
tuple = tuple.field(&self.range);
if self.flags.is_empty() {
tuple.finish()
} else {
tuple.field(&self.flags).finish()
}
}
}
struct LexerOutput {
tokens: Vec<TestToken>,
errors: Vec<LexicalError>,
}
impl std::fmt::Display for LexerOutput {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "## Tokens")?;
writeln!(f, "```\n{:#?}\n```", self.tokens)?;
if !self.errors.is_empty() {
writeln!(f, "## Errors")?;
writeln!(f, "```\n{:#?}\n```", self.errors)?;
}
Ok(())
}
}
fn lex(source: &str, mode: Mode) -> LexerOutput {
let mut lexer = Lexer::new(source, mode, TextSize::default());
let mut tokens = Vec::new();
loop {
let kind = lexer.next_token();
if kind.is_eof() {
break;
}
tokens.push(TestToken {
kind,
value: lexer.take_value(),
range: lexer.current_range(),
flags: lexer.current_flags(),
});
}
LexerOutput {
tokens,
errors: lexer.finish(),
}
}
fn lex_valid(source: &str, mode: Mode) -> LexerOutput {
let output = lex(source, mode);
if !output.errors.is_empty() {
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
for error in &output.errors {
writeln!(&mut message, "{error:?}").unwrap();
}
writeln!(&mut message, "Source:\n{source}").unwrap();
panic!("{message}");
}
output
}
fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
let output = lex(source, mode);
assert!(
!output.errors.is_empty(),
"Expected lexer to generate at least one error for the following source:\n{source}"
);
output
}
fn lex_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Module)
}
fn lex_jupyter_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Ipython)
}
fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
let source = format!("%matplotlib \\{eol} --inline");
lex_jupyter_source(&source)
}
#[test]
fn test_ipython_escape_command_line_continuation_unix_eol() {
assert_snapshot!(ipython_escape_command_line_continuation_eol(UNIX_EOL));
}
#[test]
fn test_ipython_escape_command_line_continuation_mac_eol() {
assert_snapshot!(ipython_escape_command_line_continuation_eol(MAC_EOL));
}
#[test]
fn test_ipython_escape_command_line_continuation_windows_eol() {
assert_snapshot!(ipython_escape_command_line_continuation_eol(WINDOWS_EOL));
}
fn ipython_escape_command_line_continuation_with_eol_and_eof(eol: &str) -> LexerOutput {
let source = format!("%matplotlib \\{eol}");
lex_jupyter_source(&source)
}
#[test]
fn test_ipython_escape_command_line_continuation_with_unix_eol_and_eof() {
assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
UNIX_EOL
));
}
#[test]
fn test_ipython_escape_command_line_continuation_with_mac_eol_and_eof() {
assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
MAC_EOL
));
}
#[test]
fn test_ipython_escape_command_line_continuation_with_windows_eol_and_eof() {
assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
WINDOWS_EOL
));
}
#[test]
fn test_empty_ipython_escape_command() {
let source = "%\n%%\n!\n!!\n?\n??\n/\n,\n;";
assert_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_escape_command() {
let source = r"
?foo
??foo
%timeit a = b
%timeit a % 3
%matplotlib \
--inline
!pwd \
&& ls -a | sed 's/^/\\ /'
!!cd /Users/foo/Library/Application\ Support/
/foo 1 2
,foo 1 2
;foo 1 2
!ls
"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_help_end_escape_command() {
let source = r"
?foo?
?? foo?
?? foo ?
?foo??
??foo??
???foo?
???foo??
??foo???
???foo???
?? \
foo?
?? \
?
????
%foo?
%foo??
%%foo???
!pwd?"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_escape_command_indentation() {
let source = r"
if True:
%matplotlib \
--inline"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_escape_command_assignment() {
let source = r"
pwd = !pwd
foo = %timeit a = b
bar = %timeit a % 3
baz = %matplotlib \
inline"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
fn assert_no_ipython_escape_command(tokens: &[TestToken]) {
for token in tokens {
if matches!(token.kind, TokenKind::IpyEscapeCommand) {
panic!("Unexpected escape command token at {:?}", token.range)
}
}
}
#[test]
fn test_ipython_escape_command_not_an_assignment() {
let source = r"
# Other escape kinds are not valid here (can't test `foo = ?str` because '?' is not a valid token)
foo = /func
foo = ;func
foo = ,func
(foo == %timeit a = b)
(foo := %timeit a = b)
def f(arg=%timeit a = b):
pass"
.trim();
let output = lex(source, Mode::Ipython);
assert!(output.errors.is_empty());
assert_no_ipython_escape_command(&output.tokens);
}
#[test]
fn test_numbers() {
let source =
"0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j 000 0x995DC9BBDF1939FA 0x995DC9BBDF1939FA995DC9BBDF1939FA";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_invalid_leading_zero_small() {
let source = "025";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
#[test]
fn test_invalid_leading_zero_big() {
let source =
"0252222222222222522222222222225222222222222252222222222222522222222222225222222222222";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
#[test]
fn test_line_comment_long() {
let source = "99232 # foo".to_string();
assert_snapshot!(lex_source(&source));
}
#[test]
fn test_line_comment_whitespace() {
let source = "99232 # ".to_string();
assert_snapshot!(lex_source(&source));
}
#[test]
fn test_line_comment_single_whitespace() {
let source = "99232 # ".to_string();
assert_snapshot!(lex_source(&source));
}
#[test]
fn test_line_comment_empty() {
let source = "99232 #".to_string();
assert_snapshot!(lex_source(&source));
}
fn comment_until_eol(eol: &str) -> LexerOutput {
let source = format!("123 # Foo{eol}456");
lex_source(&source)
}
#[test]
fn test_comment_until_unix_eol() {
assert_snapshot!(comment_until_eol(UNIX_EOL));
}
#[test]
fn test_comment_until_mac_eol() {
assert_snapshot!(comment_until_eol(MAC_EOL));
}
#[test]
fn test_comment_until_windows_eol() {
assert_snapshot!(comment_until_eol(WINDOWS_EOL));
}
#[test]
fn test_assignment() {
let source = r"a_variable = 99 + 2-0";
assert_snapshot!(lex_source(source));
}
fn indentation_with_eol(eol: &str) -> LexerOutput {
let source = format!("def foo():{eol} return 99{eol}{eol}");
lex_source(&source)
}
#[test]
fn test_indentation_with_unix_eol() {
assert_snapshot!(indentation_with_eol(UNIX_EOL));
}
#[test]
fn test_indentation_with_mac_eol() {
assert_snapshot!(indentation_with_eol(MAC_EOL));
}
#[test]
fn test_indentation_with_windows_eol() {
assert_snapshot!(indentation_with_eol(WINDOWS_EOL));
}
fn double_dedent_with_eol(eol: &str) -> LexerOutput {
let source = format!("def foo():{eol} if x:{eol}{eol} return 99{eol}{eol}");
lex_source(&source)
}
#[test]
fn test_double_dedent_with_unix_eol() {
assert_snapshot!(double_dedent_with_eol(UNIX_EOL));
}
#[test]
fn test_double_dedent_with_mac_eol() {
assert_snapshot!(double_dedent_with_eol(MAC_EOL));
}
#[test]
fn test_double_dedent_with_windows_eol() {
assert_snapshot!(double_dedent_with_eol(WINDOWS_EOL));
}
fn double_dedent_with_tabs_eol(eol: &str) -> LexerOutput {
let source = format!("def foo():{eol}\tif x:{eol}{eol}\t\t return 99{eol}{eol}");
lex_source(&source)
}
#[test]
fn test_double_dedent_with_tabs_unix_eol() {
assert_snapshot!(double_dedent_with_tabs_eol(UNIX_EOL));
}
#[test]
fn test_double_dedent_with_tabs_mac_eol() {
assert_snapshot!(double_dedent_with_tabs_eol(MAC_EOL));
}
#[test]
fn test_double_dedent_with_tabs_windows_eol() {
assert_snapshot!(double_dedent_with_tabs_eol(WINDOWS_EOL));
}
#[test]
fn dedent_after_whitespace() {
let source = "\
if first:
if second:
pass
foo
";
assert_snapshot!(lex_source(source));
}
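/// Lexes a bracketed expression that spans multiple lines (including an escaped newline),
/// using the given end-of-line sequence.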
fn newline_in_brackets_eol(eol: &str) -> LexerOutput {
let source = r"x = [
1,2
,(3,
4,
), {
5,
6,\
7}]
"
.replace('\n', eol);
lex_source(&source)
}
#[test]
fn test_newline_in_brackets_unix_eol() {
assert_snapshot!(newline_in_brackets_eol(UNIX_EOL));
}
#[test]
fn test_newline_in_brackets_mac_eol() {
assert_snapshot!(newline_in_brackets_eol(MAC_EOL));
}
#[test]
fn test_newline_in_brackets_windows_eol() {
assert_snapshot!(newline_in_brackets_eol(WINDOWS_EOL));
}
#[test]
fn test_non_logical_newline_in_string_continuation() {
let source = r"(
'a'
'b'
'c' \
'd'
)";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_logical_newline_line_comment() {
let source = "#Hello\n#World\n";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_operators() {
let source = "//////=/ /";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_string() {
let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#;
assert_snapshot!(lex_source(source));
}
fn string_continuation_with_eol(eol: &str) -> LexerOutput {
let source = format!("\"abc\\{eol}def\"");
lex_source(&source)
}
#[test]
fn test_string_continuation_with_unix_eol() {
assert_snapshot!(string_continuation_with_eol(UNIX_EOL));
}
#[test]
fn test_string_continuation_with_mac_eol() {
assert_snapshot!(string_continuation_with_eol(MAC_EOL));
}
#[test]
fn test_string_continuation_with_windows_eol() {
assert_snapshot!(string_continuation_with_eol(WINDOWS_EOL));
}
#[test]
fn test_escape_unicode_name() {
let source = r#""\N{EN SPACE}""#;
assert_snapshot!(lex_source(source));
}
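/// Lexes the given source in `Mode::Module`, asserts that no errors occurred, and returns
/// only the token kinds.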
fn get_tokens_only(source: &str) -> Vec<TokenKind> {
let output = lex(source, Mode::Module);
assert!(output.errors.is_empty());
output.tokens.into_iter().map(|token| token.kind).collect()
}
#[test]
fn test_nfkc_normalization() {
let source1 = "𝒞 = 500";
let source2 = "C = 500";
assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
}
fn triple_quoted_eol(eol: &str) -> LexerOutput {
let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
lex_source(&source)
}
#[test]
fn test_triple_quoted_unix_eol() {
assert_snapshot!(triple_quoted_eol(UNIX_EOL));
}
#[test]
fn test_triple_quoted_mac_eol() {
assert_snapshot!(triple_quoted_eol(MAC_EOL));
}
#[test]
fn test_triple_quoted_windows_eol() {
assert_snapshot!(triple_quoted_eol(WINDOWS_EOL));
}
// This test case just makes sure that the lexer doesn't go into an
// infinite loop on invalid input.
#[test]
fn test_infinite_loop() {
let source = "[1";
lex_invalid(source, Mode::Module);
}
/// Emoji identifiers are a non-standard Python feature and are not supported by our lexer.
#[test]
fn test_emoji_identifier() {
let source = "🐦";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
#[test]
fn test_too_low_dedent() {
let source = "if True:
pass
pass";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
#[test]
fn test_empty_fstrings() {
let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_prefix() {
let source = r#"f"" F"" rf"" rF"" Rf"" RF"" fr"" Fr"" fR"" FR"""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring() {
let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_parentheses() {
let source = r#"f"{}" f"{{}}" f" {}" f"{{{}}}" f"{{{{}}}}" f" {} {{}} {{{}}} {{{{}}}} ""#;
assert_snapshot!(lex_source(source));
}
fn fstring_single_quote_escape_eol(eol: &str) -> LexerOutput {
let source = format!(r"f'text \{eol} more text'");
lex_source(&source)
}
#[test]
fn test_fstring_single_quote_escape_unix_eol() {
assert_snapshot!(fstring_single_quote_escape_eol(UNIX_EOL));
}
#[test]
fn test_fstring_single_quote_escape_mac_eol() {
assert_snapshot!(fstring_single_quote_escape_eol(MAC_EOL));
}
#[test]
fn test_fstring_single_quote_escape_windows_eol() {
assert_snapshot!(fstring_single_quote_escape_eol(WINDOWS_EOL));
}
#[test]
fn test_fstring_escape() {
let source = r#"f"\{x:\"\{x}} \"\"\
end""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_escape_braces() {
let source = r"f'\{foo}' f'\\{foo}' f'\{{foo}}' f'\\{{foo}}'";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_escape_raw() {
let source = r#"rf"\{x:\"\{x}} \"\"\
end""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_named_unicode() {
let source = r#"f"\N{BULLET} normal \Nope \N""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_named_unicode_raw() {
let source = r#"rf"\N{BULLET} normal""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_named_expression() {
let source = r#"f"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_format_spec() {
let source = r#"f"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_multiline_format_spec() {
// The last f-string is invalid syntactically but we should still lex it.
// Note that the `b` is a `Name` token and not a `FStringMiddle` token.
let source = r"f'''__{
x:d
}__'''
f'''__{
x:a
b
c
}__'''
f'__{
x:d
}__'
f'__{
x:a
b
}__'
";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_conversion() {
let source = r#"f"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_nested() {
let source = r#"f"foo {f"bar {x + f"{wow}"}"} baz" f'foo {f'bar'} some {f"another"}'"#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_expression_multiline() {
let source = r#"f"first {
x
*
y
} second""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_multiline() {
let source = r#"f"""
hello
world
""" f'''
world
hello
''' f"some {f"""multiline
allowed {x}"""} string""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_comments() {
let source = r#"f"""
# not a comment { # comment {
x
} # not a comment
""""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_ipy_escape_command() {
let source = r#"f"foo {!pwd} bar""#;
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_lambda_expression() {
let source = r#"
f"{lambda x:{x}}"
f"{(lambda x:{x})}"
"#
.trim();
assert_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_nul_char() {
let source = r"f'\0'";
assert_snapshot!(lex_source(source));
}
#[test]
fn test_match_softkeyword_in_notebook() {
let source = r"match foo:
case bar:
pass";
assert_snapshot!(lex_jupyter_source(source));
}
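/// Lexes the given source in `Mode::Module` and returns the `FStringErrorType` of the
/// first error, panicking if there is no error or if it isn't an f-string error.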
fn lex_fstring_error(source: &str) -> FStringErrorType {
let output = lex(source, Mode::Module);
match output
.errors
.into_iter()
.next()
.expect("lexer should give at least one error")
.into_error()
{
LexicalErrorType::FStringError(error) => error,
err => panic!("Expected FStringError: {err:?}"),
}
}
#[test]
fn test_fstring_error() {
use FStringErrorType::{SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString};
assert_eq!(lex_fstring_error("f'}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}}}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'foo}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r"f'\u007b}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{a:b}}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{3:}}>10}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r"f'\{foo}\}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r#"f""#), UnterminatedString);
assert_eq!(lex_fstring_error(r"f'"), UnterminatedString);
assert_eq!(lex_fstring_error(r#"f""""#), UnterminatedTripleQuotedString);
assert_eq!(lex_fstring_error(r"f'''"), UnterminatedTripleQuotedString);
assert_eq!(
lex_fstring_error(r#"f"""""#),
UnterminatedTripleQuotedString
);
assert_eq!(
lex_fstring_error(r#"f""""""#),
UnterminatedTripleQuotedString
);
}
}