mirror of https://github.com/astral-sh/ruff.git (synced 2025-08-03 18:28:56 +00:00)

## Summary

Given:

```python
F"{"ڤ
```

We try to locate the "unclosed left brace" error by subtracting the quote size from the lexer offset -- so we subtract 1 from the end of the source, which puts us in the middle of a Unicode character. I don't think we should try to adjust the offset in this way, since there can be content _after_ the quote. For example, with the advent of PEP 701, this string could reasonably be fixed as:

```python
F"{"ڤ"}"
```

Closes https://github.com/astral-sh/ruff/issues/9379.
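For context, here is a minimal sketch of the failure mode, assuming `ruff_text_size` re-exports the `text-size` crate's API: a `TextSize` is a byte offset, so subtracting one steps back a single byte, which can land inside a multi-byte character and make any subsequent slice of the source panic.

```rust
use ruff_text_size::TextSize;

fn main() {
    // `ڤ` (U+06A4) encodes as two bytes in UTF-8, so this source is
    // six bytes long and its last char boundary before the end is at 4.
    let source = "F\"{\"\u{06A4}";
    let end = TextSize::of(source);

    // Subtracting the quote size steps back one *byte*, not one character,
    // leaving an offset (5) that sits between the two bytes of `ڤ`...
    let adjusted = end - TextSize::new(1);

    // ...so slicing the source at that offset panics with a
    // "not a char boundary" error.
    let _ = &source[usize::from(adjusted)..];
}
```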
//! This module takes care of lexing Python source text.
//!
//! This means source code is scanned and translated into separate tokens. The rules
//! governing what is and is not a valid token are defined in the Python reference
//! guide section on [Lexical analysis].
//!
//! The primary function in this module is [`lex`], which takes a string slice
//! and returns an iterator over the tokens in the source code. The tokens are currently returned
//! as a `Result<Spanned, LexicalError>`, where [`Spanned`] is a tuple containing the
//! start and end [`TextSize`] and a [`Tok`] denoting the token.
//!
//! # Example
//!
//! ```
//! use ruff_python_parser::{lexer::lex, Tok, Mode, StringKind};
//!
//! let source = "x = 'RustPython'";
//! let tokens = lex(source, Mode::Module)
//!     .map(|tok| tok.expect("Failed to lex"))
//!     .collect::<Vec<_>>();
//!
//! for (token, range) in tokens {
//!     println!(
//!         "{token:?}@{range:?}",
//!     );
//! }
//! ```
//!
//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html

use std::iter::FusedIterator;
use std::{char, cmp::Ordering, str::FromStr};

use unicode_ident::{is_xid_continue, is_xid_start};

use ruff_python_ast::{Int, IpyEscapeKind};
use ruff_text_size::{TextLen, TextRange, TextSize};

use crate::lexer::cursor::{Cursor, EOF_CHAR};
use crate::lexer::fstring::{FStringContext, FStringContextFlags, FStrings};
use crate::lexer::indentation::{Indentation, Indentations};
use crate::{
    soft_keywords::SoftKeywordTransformer,
    string::FStringErrorType,
    token::{StringKind, Tok},
    Mode,
};

mod cursor;
mod fstring;
mod indentation;

/// A lexer for Python source code.
pub struct Lexer<'source> {
    // Contains the source code to be lexed.
    cursor: Cursor<'source>,
    source: &'source str,

    state: State,
    // The number of open parentheses, brackets, and braces.
    nesting: u32,
    // Indentation levels.
    indentations: Indentations,
    pending_indentation: Option<Indentation>,
    // Lexer mode.
    mode: Mode,
    // F-string contexts.
    fstrings: FStrings,
}

/// Contains a Token along with its `range`.
pub type Spanned = (Tok, TextRange);
/// The result of lexing a token.
pub type LexResult = Result<Spanned, LexicalError>;

/// Create a new lexer from a source string.
///
/// # Examples
///
/// ```
/// use ruff_python_parser::{Mode, lexer::lex};
///
/// let source = "def hello(): return 'world'";
/// let lexer = lex(source, Mode::Module);
///
/// for token in lexer {
///     println!("{:?}", token);
/// }
/// ```
#[inline]
pub fn lex(source: &str, mode: Mode) -> SoftKeywordTransformer<Lexer> {
    SoftKeywordTransformer::new(Lexer::new(source, mode), mode)
}

pub struct LexStartsAtIterator<I> {
    start_offset: TextSize,
    inner: I,
}

impl<I> Iterator for LexStartsAtIterator<I>
where
    I: Iterator<Item = LexResult>,
{
    type Item = LexResult;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let result = match self.inner.next()? {
            Ok((tok, range)) => Ok((tok, range + self.start_offset)),
            Err(error) => Err(LexicalError {
                location: error.location + self.start_offset,
                ..error
            }),
        };

        Some(result)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
}

impl<I> FusedIterator for LexStartsAtIterator<I> where I: Iterator<Item = LexResult> + FusedIterator {}
impl<I> ExactSizeIterator for LexStartsAtIterator<I> where
    I: Iterator<Item = LexResult> + ExactSizeIterator
{
}

/// Create a new lexer from a source string, starting at a given location.
/// You probably want to use [`lex`] instead.
pub fn lex_starts_at(
    source: &str,
    mode: Mode,
    start_offset: TextSize,
) -> LexStartsAtIterator<SoftKeywordTransformer<Lexer>> {
    LexStartsAtIterator {
        start_offset,
        inner: lex(source, mode),
    }
}

impl<'source> Lexer<'source> {
    /// Create a new lexer for the given input. You probably want to use [`lex`] instead.
    pub fn new(input: &'source str, mode: Mode) -> Self {
        assert!(
            u32::try_from(input.len()).is_ok(),
            "Lexer only supports files with a size up to 4GB"
        );

        let mut lxr = Lexer {
            state: State::AfterNewline,
            nesting: 0,
            indentations: Indentations::default(),
            pending_indentation: None,

            source: input,
            cursor: Cursor::new(input),
            mode,
            fstrings: FStrings::default(),
        };
        // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
        // spell-checker:ignore feff
        lxr.cursor.eat_char('\u{feff}');

        lxr
    }

    /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
    fn lex_identifier(&mut self, first: char) -> Result<Tok, LexicalError> {
        // Detect potential string like rb'' b'' f'' u'' r''
        match (first, self.cursor.first()) {
            ('f' | 'F', quote @ ('\'' | '"')) => {
                self.cursor.bump();
                return Ok(self.lex_fstring_start(quote, false));
            }
            ('r' | 'R', 'f' | 'F') | ('f' | 'F', 'r' | 'R') if is_quote(self.cursor.second()) => {
                self.cursor.bump();
                let quote = self.cursor.bump().unwrap();
                return Ok(self.lex_fstring_start(quote, true));
            }
            (_, quote @ ('\'' | '"')) => {
                if let Ok(string_kind) = StringKind::try_from(first) {
                    self.cursor.bump();
                    return self.lex_string(string_kind, quote);
                }
            }
            (_, second @ ('r' | 'R' | 'b' | 'B')) if is_quote(self.cursor.second()) => {
                self.cursor.bump();
                if let Ok(string_kind) = StringKind::try_from([first, second]) {
                    let quote = self.cursor.bump().unwrap();
                    return self.lex_string(string_kind, quote);
                }
            }
            _ => {}
        }

        self.cursor.eat_while(is_identifier_continuation);

        let text = self.token_text();

        let keyword = match text {
            "False" => Tok::False,
            "None" => Tok::None,
            "True" => Tok::True,
            "and" => Tok::And,
            "as" => Tok::As,
            "assert" => Tok::Assert,
            "async" => Tok::Async,
            "await" => Tok::Await,
            "break" => Tok::Break,
            "case" => Tok::Case,
            "class" => Tok::Class,
            "continue" => Tok::Continue,
            "def" => Tok::Def,
            "del" => Tok::Del,
            "elif" => Tok::Elif,
            "else" => Tok::Else,
            "except" => Tok::Except,
            "finally" => Tok::Finally,
            "for" => Tok::For,
            "from" => Tok::From,
            "global" => Tok::Global,
            "if" => Tok::If,
            "import" => Tok::Import,
            "in" => Tok::In,
            "is" => Tok::Is,
            "lambda" => Tok::Lambda,
            "match" => Tok::Match,
            "nonlocal" => Tok::Nonlocal,
            "not" => Tok::Not,
            "or" => Tok::Or,
            "pass" => Tok::Pass,
            "raise" => Tok::Raise,
            "return" => Tok::Return,
            "try" => Tok::Try,
            "type" => Tok::Type,
            "while" => Tok::While,
            "with" => Tok::With,
            "yield" => Tok::Yield,
            _ => {
                return Ok(Tok::Name {
                    name: text.to_string(),
                })
            }
        };

        Ok(keyword)
    }

    /// Numeric lexing. The feast can start!
    fn lex_number(&mut self, first: char) -> Result<Tok, LexicalError> {
        if first == '0' {
            if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
                self.lex_number_radix(Radix::Hex)
            } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
                self.lex_number_radix(Radix::Octal)
            } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
                self.lex_number_radix(Radix::Binary)
            } else {
                self.lex_decimal_number(first)
            }
        } else {
            self.lex_decimal_number(first)
        }
    }

    /// Lex a hex/octal/decimal/binary number without a decimal point.
    fn lex_number_radix(&mut self, radix: Radix) -> Result<Tok, LexicalError> {
        #[cfg(debug_assertions)]
        debug_assert!(matches!(
            self.cursor.previous().to_ascii_lowercase(),
            'x' | 'o' | 'b'
        ));

        // Lex the portion of the token after the base prefix (e.g., `9D5` in `0x9D5`).
        let mut number = LexedText::new(self.offset(), self.source);
        self.radix_run(&mut number, radix);

        // Extract the entire number, including the base prefix (e.g., `0x9D5`).
        let token = &self.source[self.token_range()];

        let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
            Ok(int) => int,
            Err(err) => {
                return Err(LexicalError {
                    error: LexicalErrorType::OtherError(format!("{err:?}")),
                    location: self.token_range().start(),
                });
            }
        };
        Ok(Tok::Int { value })
    }

    /// Lex a normal number, that is, not an octal, hex, or binary number.
    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> Result<Tok, LexicalError> {
        #[cfg(debug_assertions)]
        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
        let start_is_zero = first_digit_or_dot == '0';

        let mut number = LexedText::new(self.token_start(), self.source);
        if first_digit_or_dot != '.' {
            number.push(first_digit_or_dot);
            self.radix_run(&mut number, Radix::Decimal);
        };

        let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
            number.push('.');

            if self.cursor.eat_char('_') {
                return Err(LexicalError {
                    error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
                    location: self.offset() - TextSize::new(1),
                });
            }

            self.radix_run(&mut number, Radix::Decimal);
            true
        } else {
            // Normal number:
            false
        };

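        // Recognize an exponent only when the `e`/`E` is followed by an
        // optional sign and at least one digit; otherwise (e.g. `10e`) the
        // `e` is left for the next token. Peeking at the raw bytes keeps
        // this lookahead cheap.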
        let is_float = match self.cursor.rest().as_bytes() {
            [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
                // 'e' | 'E'
                number.push(self.cursor.bump().unwrap());

                if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
                    number.push(sign);
                }

                self.radix_run(&mut number, Radix::Decimal);

                true
            }
            _ => is_float,
        };

        if is_float {
            // Improvement: Use `Cow` instead of pushing to value text
            let value = f64::from_str(number.as_str()).map_err(|_| LexicalError {
                error: LexicalErrorType::OtherError("Invalid decimal literal".to_owned()),
                location: self.token_start(),
            })?;

            // Parse trailing 'j':
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                Ok(Tok::Complex {
                    real: 0.0,
                    imag: value,
                })
            } else {
                Ok(Tok::Float { value })
            }
        } else {
            // Parse trailing 'j':
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                let imag = f64::from_str(number.as_str()).unwrap();
                Ok(Tok::Complex { real: 0.0, imag })
            } else {
                let value = match Int::from_str(number.as_str()) {
                    Ok(value) => {
                        if start_is_zero && value.as_u8() != Some(0) {
                            // Leading zeros in decimal integer literals are not permitted.
                            return Err(LexicalError {
                                error: LexicalErrorType::OtherError("Invalid Token".to_owned()),
                                location: self.token_range().start(),
                            });
                        }
                        value
                    }
                    Err(err) => {
                        return Err(LexicalError {
                            error: LexicalErrorType::OtherError(format!("{err:?}")),
                            location: self.token_range().start(),
                        })
                    }
                };
                Ok(Tok::Int { value })
            }
        }
    }

    /// Consume a sequence of digits with the given radix. The digits can be
    /// decorated with underscores like this: '`1_2_3_4`' == '1234'.
    fn radix_run(&mut self, number: &mut LexedText, radix: Radix) {
        loop {
            if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
                number.push(c);
            }
            // Number that contains `_` separators. Remove them from the parsed text.
            else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
                // Skip over `_`
                self.cursor.bump();
                number.skip_char();
            } else {
                break;
            }
        }
    }

    /// Lex a single comment.
    fn lex_comment(&mut self) -> Tok {
        #[cfg(debug_assertions)]
        debug_assert_eq!(self.cursor.previous(), '#');

        let bytes = self.cursor.rest().as_bytes();
        let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len());
        self.cursor.skip_bytes(offset);

        Tok::Comment(self.token_text().to_string())
    }

    /// Lex a single IPython escape command.
    fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> Tok {
        let mut value = String::new();

        loop {
            match self.cursor.first() {
                '\\' => {
                    // Only skip the line continuation if it is followed by a newline,
                    // otherwise it is a normal backslash which is part of the magic command:
                    //
                    //        Skip this backslash
                    //        v
                    //   !pwd \
                    //     && ls -a | sed 's/^/\\ /'
                    //                         ^^
                    //                         Don't skip these backslashes
                    if self.cursor.second() == '\r' {
                        self.cursor.bump();
                        self.cursor.bump();
                        self.cursor.eat_char('\n');
                        continue;
                    } else if self.cursor.second() == '\n' {
                        self.cursor.bump();
                        self.cursor.bump();
                        continue;
                    }

                    self.cursor.bump();
                    value.push('\\');
                }
                // Help end escape commands are those that end with 1 or 2 question marks.
                // Here, we're only looking for a subset of help end escape commands which
                // are the ones that have the escape token at the start of the line as well.
                // On the other hand, we're not looking for help end escape commands that
                // are strict in the sense that the escape token is only at the end. For example,
                //
                //   * `%foo?` is recognized as a help end escape command but not as a strict one.
                //   * `foo?` is recognized as a strict help end escape command which is not
                //     lexed here but is identified at the parser level.
                //
                // Help end escape commands implemented in the IPython codebase using regex:
                // https://github.com/ipython/ipython/blob/292e3a23459ca965b8c1bfe2c3707044c510209a/IPython/core/inputtransformer2.py#L454-L462
                '?' => {
                    self.cursor.bump();
                    let mut question_count = 1u32;
                    while self.cursor.eat_char('?') {
                        question_count += 1;
                    }

                    // The original implementation in the IPython codebase is based on regex which
                    // means that it's strict in the sense that it won't recognize a help end escape:
                    //   * If there's any whitespace before the escape token (e.g. `%foo ?`)
                    //   * If there are more than 2 question mark tokens (e.g. `%foo???`)
                    // which is what we're doing here as well. In that case, we'll continue with
                    // the prefixed escape token.
                    //
                    // Now, the whitespace and empty value check also makes sure that an empty
                    // command (e.g. `%?` or `? ??`, no value after/between the escape tokens)
                    // is not recognized as a help end escape command. So, `%?` and `? ??` are
                    // `IpyEscapeKind::Magic` and `IpyEscapeKind::Help` because of the initial `%` and `??`
                    // tokens.
                    if question_count > 2
                        || value.chars().last().map_or(true, is_python_whitespace)
                        || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
                    {
                        // Not a help end escape command, so continue with the lexing.
                        value.reserve(question_count as usize);
                        for _ in 0..question_count {
                            value.push('?');
                        }
                        continue;
                    }

                    if escape_kind.is_help() {
                        // If we've recognized this as a help end escape command, then
                        // any question mark token / whitespaces at the start are not
                        // considered as part of the value.
                        //
                        // For example, `??foo?` is recognized as `IpyEscapeKind::Help` and
                        // `value` is `foo` instead of `??foo`.
                        value = value.trim_start_matches([' ', '?']).to_string();
                    } else if escape_kind.is_magic() {
                        // Between `%` and `?` (at the end), the `?` takes priority
                        // over the `%` so `%foo?` is recognized as `IpyEscapeKind::Help`
                        // and `value` is `%foo` instead of `foo`. So, we need to
                        // insert the magic escape token at the start.
                        value.insert_str(0, escape_kind.as_str());
                    }

                    let kind = match question_count {
                        1 => IpyEscapeKind::Help,
                        2 => IpyEscapeKind::Help2,
                        _ => unreachable!("`question_count` is always 1 or 2"),
                    };
                    return Tok::IpyEscapeCommand { kind, value };
                }
                '\n' | '\r' | EOF_CHAR => {
                    return Tok::IpyEscapeCommand {
                        kind: escape_kind,
                        value,
                    };
                }
                c => {
                    self.cursor.bump();
                    value.push(c);
                }
            }
        }
    }

    /// Lex an f-string start token.
    fn lex_fstring_start(&mut self, quote: char, is_raw_string: bool) -> Tok {
        #[cfg(debug_assertions)]
        debug_assert_eq!(self.cursor.previous(), quote);

        let mut flags = FStringContextFlags::empty();
        if quote == '"' {
            flags |= FStringContextFlags::DOUBLE;
        }
        if is_raw_string {
            flags |= FStringContextFlags::RAW;
        }
        if self.cursor.eat_char2(quote, quote) {
            flags |= FStringContextFlags::TRIPLE;
        }

        self.fstrings.push(FStringContext::new(flags, self.nesting));
        Tok::FStringStart
    }

    /// Lex an f-string middle or end token.
    fn lex_fstring_middle_or_end(&mut self) -> Result<Option<Tok>, LexicalError> {
        // SAFETY: Safe because the function is only called when `self.fstrings` is not empty.
        let fstring = self.fstrings.current().unwrap();
        self.cursor.start_token();

        // Check if we're at the end of the f-string.
        if fstring.is_triple_quoted() {
            let quote_char = fstring.quote_char();
            if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
                return Ok(Some(Tok::FStringEnd));
            }
        } else if self.cursor.eat_char(fstring.quote_char()) {
            return Ok(Some(Tok::FStringEnd));
        }

        // We have to decode `{{` and `}}` into `{` and `}` respectively. As an
        // optimization, we only allocate a new string if we find any escaped curly braces,
        // otherwise this string will remain empty and we'll use a source slice instead.
        let mut normalized = String::new();

        // Tracks the last offset of token value that has been written to `normalized`.
        let mut last_offset = self.offset();

        // This isn't going to change for the duration of the loop.
        let in_format_spec = fstring.is_in_format_spec(self.nesting);

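        // Set while a `\N{...}` named-unicode escape is open so that the `}`
        // closing the escape is not mistaken for the end of an expression.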
        let mut in_named_unicode = false;

        loop {
            match self.cursor.first() {
                // The condition is to differentiate between the `NUL` (`\0`) character
                // in the source code and the one returned by `self.cursor.first()` when
                // we reach the end of the source code.
                EOF_CHAR if self.cursor.is_eof() => {
                    let error = if fstring.is_triple_quoted() {
                        FStringErrorType::UnterminatedTripleQuotedString
                    } else {
                        FStringErrorType::UnterminatedString
                    };
                    return Err(LexicalError {
                        error: LexicalErrorType::FStringError(error),
                        location: self.offset(),
                    });
                }
                '\n' | '\r' if !fstring.is_triple_quoted() => {
                    // If we encounter a newline while we're in a format spec, then
                    // we stop here and let the lexer emit the newline token.
                    //
                    // Relevant discussion: https://github.com/python/cpython/issues/110259
                    if in_format_spec {
                        break;
                    }
                    return Err(LexicalError {
                        error: LexicalErrorType::FStringError(FStringErrorType::UnterminatedString),
                        location: self.offset(),
                    });
                }
                '\\' => {
                    self.cursor.bump(); // '\'
                    if matches!(self.cursor.first(), '{' | '}') {
                        // Don't consume `{` or `}` as we want them to be emitted as tokens.
                        // They will be handled in the next iteration.
                        continue;
                    } else if !fstring.is_raw_string() {
                        if self.cursor.eat_char2('N', '{') {
                            in_named_unicode = true;
                            continue;
                        }
                    }
                    // Consume the escaped character.
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else {
                        self.cursor.bump();
                    }
                }
                quote @ ('\'' | '"') if quote == fstring.quote_char() => {
                    if let Some(triple_quotes) = fstring.triple_quotes() {
                        if self.cursor.rest().starts_with(triple_quotes) {
                            break;
                        }
                        self.cursor.bump();
                    } else {
                        break;
                    }
                }
                '{' => {
                    if self.cursor.second() == '{' && !in_format_spec {
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump(); // Skip the second `{`
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                '}' => {
                    if in_named_unicode {
                        in_named_unicode = false;
                        self.cursor.bump();
                    } else if self.cursor.second() == '}' && !in_format_spec {
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump(); // Skip the second `}`
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                _ => {
                    self.cursor.bump();
                }
            }
        }
        let range = self.token_range();
        if range.is_empty() {
            return Ok(None);
        }

        let value = if normalized.is_empty() {
            self.source[range].to_string()
        } else {
            normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
            normalized
        };
        Ok(Some(Tok::FStringMiddle {
            value,
            is_raw: fstring.is_raw_string(),
        }))
    }

    /// Lex a string literal.
    fn lex_string(&mut self, kind: StringKind, quote: char) -> Result<Tok, LexicalError> {
        #[cfg(debug_assertions)]
        debug_assert_eq!(self.cursor.previous(), quote);

        // If the next two characters are also the quote character, then we have a triple-quoted
        // string; consume those two characters and ensure that we require a triple-quote to close
        let triple_quoted = self.cursor.eat_char2(quote, quote);

        let value_start = self.offset();

        let value_end = loop {
            match self.cursor.bump() {
                Some('\\') => {
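                    // A backslash escapes the next character; an escaped
                    // `\r\n` is consumed as a single escaped line break.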
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else {
                        self.cursor.bump();
                    }
                }
                Some('\r' | '\n') if !triple_quoted => {
                    if let Some(fstring) = self.fstrings.current() {
                        // When we are in an f-string, check whether the initial quote
                        // matches the f-string's quotes and, if it does, then this must
                        // be a missing '}' token, so raise the proper error.
                        if fstring.quote_char() == quote && !fstring.is_triple_quoted() {
                            return Err(LexicalError {
                                error: LexicalErrorType::FStringError(
                                    FStringErrorType::UnclosedLbrace,
                                ),
                                location: self.offset() - TextSize::new(1),
                            });
                        }
                    }
                    return Err(LexicalError {
                        error: LexicalErrorType::OtherError(
                            "EOL while scanning string literal".to_owned(),
                        ),
                        location: self.offset() - TextSize::new(1),
                    });
                }
                Some(c) if c == quote => {
                    if triple_quoted {
                        if self.cursor.eat_char2(quote, quote) {
                            break self.offset() - TextSize::new(3);
                        }
                    } else {
                        break self.offset() - TextSize::new(1);
                    }
                }

                Some(_) => {}
                None => {
                    if let Some(fstring) = self.fstrings.current() {
                        // When we are in an f-string, check whether the initial quote
                        // matches the f-string's quotes and, if it does, then this must
                        // be a missing '}' token, so raise the proper error.
                        if fstring.quote_char() == quote
                            && fstring.is_triple_quoted() == triple_quoted
                        {
                            return Err(LexicalError {
                                error: LexicalErrorType::FStringError(
                                    FStringErrorType::UnclosedLbrace,
                                ),
                                location: self.offset(),
                            });
                        }
                    }
                    return Err(LexicalError {
                        error: if triple_quoted {
                            LexicalErrorType::Eof
                        } else {
                            LexicalErrorType::StringError
                        },
                        location: self.offset(),
                    });
                }
            }
        };

        let tok = Tok::String {
            value: self.source[TextRange::new(value_start, value_end)].to_string(),
            kind,
            triple_quoted,
        };
        Ok(tok)
    }

    // This is the main entry point. Call this function to retrieve the next token.
    // This function is used by the iterator implementation.
    pub fn next_token(&mut self) -> LexResult {
        if let Some(fstring) = self.fstrings.current() {
            if !fstring.is_in_expression(self.nesting) {
                match self.lex_fstring_middle_or_end() {
                    Ok(Some(tok)) => {
                        if tok == Tok::FStringEnd {
                            self.fstrings.pop();
                        }
                        return Ok((tok, self.token_range()));
                    }
                    Err(e) => {
                        // This is to prevent an infinite loop in which the lexer
                        // continuously returns an error token because the f-string
                        // remains on the stack.
                        self.fstrings.pop();
                        return Err(e);
                    }
                    _ => {}
                }
            }
        }
        // Return dedent tokens until the current indentation level matches the indentation of the next token.
        else if let Some(indentation) = self.pending_indentation.take() {
            match self.indentations.current().try_compare(indentation) {
                Ok(Ordering::Greater) => {
                    self.pending_indentation = Some(indentation);
                    let offset = self.offset();
                    self.indentations.dedent_one(indentation).map_err(|_| {
                        LexicalError::new(LexicalErrorType::IndentationError, offset)
                    })?;
                    return Ok((Tok::Dedent, TextRange::empty(offset)));
                }
                Ok(_) => {}
                Err(_) => {
                    return Err(LexicalError::new(
                        LexicalErrorType::IndentationError,
                        self.offset(),
                    ));
                }
            }
        }

        if self.state.is_after_newline() {
            if let Some(indentation) = self.eat_indentation()? {
                return Ok(indentation);
            }
        } else {
            self.skip_whitespace()?;
        }

        self.cursor.start_token();
        if let Some(c) = self.cursor.bump() {
            if c.is_ascii() {
                self.consume_ascii_character(c)
            } else if is_unicode_identifier_start(c) {
                let identifier = self.lex_identifier(c)?;
                self.state = State::Other;

                Ok((identifier, self.token_range()))
            } else {
                Err(LexicalError {
                    error: LexicalErrorType::UnrecognizedToken { tok: c },
                    location: self.token_start(),
                })
            }
        } else {
            // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line,
            // empty the dedent stack, and finally, return the EndOfFile token.
            self.consume_end()
        }
    }

    fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
        loop {
            match self.cursor.first() {
                ' ' => {
                    self.cursor.bump();
                }
                '\t' => {
                    self.cursor.bump();
                }
                '\\' => {
                    self.cursor.bump();
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else if self.cursor.is_eof() {
                        return Err(LexicalError {
                            error: LexicalErrorType::Eof,
                            location: self.token_start(),
                        });
                    } else if !self.cursor.eat_char('\n') {
                        return Err(LexicalError {
                            error: LexicalErrorType::LineContinuationError,
                            location: self.token_start(),
                        });
                    }
                }
                // Form feed
                '\x0C' => {
                    self.cursor.bump();
                }
                _ => break,
            }
        }

        Ok(())
    }

    fn eat_indentation(&mut self) -> Result<Option<Spanned>, LexicalError> {
        let mut indentation = Indentation::root();
        self.cursor.start_token();

        loop {
            match self.cursor.first() {
                ' ' => {
                    self.cursor.bump();
                    indentation = indentation.add_space();
                }
                '\t' => {
                    self.cursor.bump();
                    indentation = indentation.add_tab();
                }
                '\\' => {
                    self.cursor.bump();
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else if self.cursor.is_eof() {
                        return Err(LexicalError {
                            error: LexicalErrorType::Eof,
                            location: self.token_start(),
                        });
                    } else if !self.cursor.eat_char('\n') {
                        return Err(LexicalError {
                            error: LexicalErrorType::LineContinuationError,
                            location: self.token_start(),
                        });
                    }
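                    // The continuation consumed a line break, so the whitespace
                    // gathered so far no longer counts; restart the measurement
                    // on the continued line.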
                    indentation = Indentation::root();
                }
                // Form feed
                '\x0C' => {
                    self.cursor.bump();
                    indentation = Indentation::root();
                }
                _ => break,
            }
        }

        // Handle indentation if this is a new, non-empty logical line.
        if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
            self.state = State::NonEmptyLogicalLine;

            if let Some(spanned) = self.handle_indentation(indentation)? {
                return Ok(Some(spanned));
            }
        }

        Ok(None)
    }

    fn handle_indentation(
        &mut self,
        indentation: Indentation,
    ) -> Result<Option<Spanned>, LexicalError> {
        let token = match self.indentations.current().try_compare(indentation) {
            // Dedent
            Ok(Ordering::Greater) => {
                self.pending_indentation = Some(indentation);

                self.indentations.dedent_one(indentation).map_err(|_| {
                    LexicalError::new(LexicalErrorType::IndentationError, self.offset())
                })?;

                Some((Tok::Dedent, TextRange::empty(self.offset())))
            }

            Ok(Ordering::Equal) => None,

            // Indent
            Ok(Ordering::Less) => {
                self.indentations.indent(indentation);
                Some((Tok::Indent, self.token_range()))
            }
            Err(_) => {
                return Err(LexicalError {
                    error: LexicalErrorType::IndentationError,
                    location: self.offset(),
                });
            }
        };

        Ok(token)
    }

    fn consume_end(&mut self) -> Result<Spanned, LexicalError> {
        // We reached end of file.
        // First of all, we need all nestings to be finished.
        if self.nesting > 0 {
            // Reset the nesting to avoid going into infinite loop.
            self.nesting = 0;
            return Err(LexicalError {
                error: LexicalErrorType::Eof,
                location: self.offset(),
            });
        }

        // Next, insert a trailing newline, if required.
        if !self.state.is_new_logical_line() {
            self.state = State::AfterNewline;
            Ok((Tok::Newline, TextRange::empty(self.offset())))
        }
        // Next, flush the indentation stack to zero.
        else if self.indentations.dedent().is_some() {
            Ok((Tok::Dedent, TextRange::empty(self.offset())))
        } else {
            Ok((Tok::EndOfFile, TextRange::empty(self.offset())))
        }
    }

    // Dispatch based on the given character.
    fn consume_ascii_character(&mut self, c: char) -> Result<Spanned, LexicalError> {
        let token = match c {
            c if is_ascii_identifier_start(c) => self.lex_identifier(c)?,
            '0'..='9' => self.lex_number(c)?,
            '#' => return Ok((self.lex_comment(), self.token_range())),
            '"' | '\'' => self.lex_string(StringKind::String, c)?,
            '=' => {
                if self.cursor.eat_char('=') {
                    Tok::EqEqual
                } else {
                    self.state = State::AfterEqual;
                    return Ok((Tok::Equal, self.token_range()));
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    Tok::PlusEqual
                } else {
                    Tok::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    Tok::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        Tok::DoubleStarEqual
                    } else {
                        Tok::DoubleStar
                    }
                } else {
                    Tok::Star
                }
            }

            c @ ('%' | '!')
                if self.mode == Mode::Ipython
                    && self.state.is_after_equal()
                    && self.nesting == 0 =>
            {
                // SAFETY: Safe because `c` has been matched against one of the possible escape command tokens
                self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap())
            }

            c @ ('%' | '!' | '?' | '/' | ';' | ',')
                if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
            {
                let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) {
                    self.cursor.bump();
                    kind
                } else {
                    // SAFETY: Safe because `c` has been matched against one of the possible escape command tokens
                    IpyEscapeKind::try_from(c).unwrap()
                };

                self.lex_ipython_escape_command(kind)
            }

            '?' if self.mode == Mode::Ipython => Tok::Question,

            '/' => {
                if self.cursor.eat_char('=') {
                    Tok::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        Tok::DoubleSlashEqual
                    } else {
                        Tok::DoubleSlash
                    }
                } else {
                    Tok::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    Tok::PercentEqual
                } else {
                    Tok::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    Tok::VbarEqual
                } else {
                    Tok::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    Tok::CircumflexEqual
                } else {
                    Tok::CircumFlex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    Tok::AmperEqual
                } else {
                    Tok::Amper
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    Tok::MinusEqual
                } else if self.cursor.eat_char('>') {
                    Tok::Rarrow
                } else {
                    Tok::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    Tok::AtEqual
                } else {
                    Tok::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    Tok::NotEqual
                } else {
                    Tok::Exclamation
                }
            }
            '~' => Tok::Tilde,
            '(' => {
                self.nesting += 1;
                Tok::Lpar
            }
            ')' => {
                self.nesting = self.nesting.saturating_sub(1);
                Tok::Rpar
            }
            '[' => {
                self.nesting += 1;
                Tok::Lsqb
            }
            ']' => {
                self.nesting = self.nesting.saturating_sub(1);
                Tok::Rsqb
            }
            '{' => {
                self.nesting += 1;
                Tok::Lbrace
            }
            '}' => {
                if let Some(fstring) = self.fstrings.current_mut() {
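                    // A `}` at the same nesting level as the f-string itself
                    // (e.g. `f"}"`) has no matching `{`, which is an error.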
                    if fstring.nesting() == self.nesting {
                        return Err(LexicalError {
                            error: LexicalErrorType::FStringError(FStringErrorType::SingleRbrace),
                            location: self.token_start(),
                        });
                    }
                    fstring.try_end_format_spec(self.nesting);
                }
                self.nesting = self.nesting.saturating_sub(1);
                Tok::Rbrace
            }
            ':' => {
                if self
                    .fstrings
                    .current_mut()
                    .is_some_and(|fstring| fstring.try_start_format_spec(self.nesting))
                {
                    Tok::Colon
                } else if self.cursor.eat_char('=') {
                    Tok::ColonEqual
                } else {
                    Tok::Colon
                }
            }
            ';' => Tok::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        Tok::LeftShiftEqual
                    } else {
                        Tok::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    Tok::LessEqual
                } else {
                    Tok::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        Tok::RightShiftEqual
                    } else {
                        Tok::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    Tok::GreaterEqual
                } else {
                    Tok::Greater
                }
            }
            ',' => Tok::Comma,
            '.' => {
                if self.cursor.first().is_ascii_digit() {
                    self.lex_decimal_number('.')?
                } else if self.cursor.eat_char2('.', '.') {
                    Tok::Ellipsis
                } else {
                    Tok::Dot
                }
            }
            '\n' => {
                return Ok((
                    if self.nesting == 0 && !self.state.is_new_logical_line() {
                        self.state = State::AfterNewline;
                        Tok::Newline
                    } else {
                        if let Some(fstring) = self.fstrings.current_mut() {
                            fstring.try_end_format_spec(self.nesting);
                        }
                        Tok::NonLogicalNewline
                    },
                    self.token_range(),
                ))
            }
            '\r' => {
                self.cursor.eat_char('\n');

                return Ok((
                    if self.nesting == 0 && !self.state.is_new_logical_line() {
                        self.state = State::AfterNewline;
                        Tok::Newline
                    } else {
                        if let Some(fstring) = self.fstrings.current_mut() {
                            fstring.try_end_format_spec(self.nesting);
                        }
                        Tok::NonLogicalNewline
                    },
                    self.token_range(),
                ));
            }

            _ => {
                self.state = State::Other;

                return Err(LexicalError {
                    error: LexicalErrorType::UnrecognizedToken { tok: c },
                    location: self.token_start(),
                });
            }
        };

        self.state = State::Other;

        Ok((token, self.token_range()))
    }

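    /// Returns the range of the token that is currently being lexed.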
    #[inline]
    fn token_range(&self) -> TextRange {
        let end = self.offset();
        let len = self.cursor.token_len();

        TextRange::at(end - len, len)
    }

    #[inline]
    fn token_text(&self) -> &'source str {
        &self.source[self.token_range()]
    }

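    /// Returns the current byte offset of the cursor within the source:
    /// the total source length minus the length of the unconsumed text.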
    // Lexer doesn't allow files larger than 4GB
    #[allow(clippy::cast_possible_truncation)]
    #[inline]
    fn offset(&self) -> TextSize {
        TextSize::new(self.source.len() as u32) - self.cursor.text_len()
    }

    #[inline]
    fn token_start(&self) -> TextSize {
        self.token_range().start()
    }
}

// Implement iterator pattern for Lexer.
// Calling the next element in the iterator will yield the next lexical
// token.
impl Iterator for Lexer<'_> {
    type Item = LexResult;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();

        match token {
            Ok((Tok::EndOfFile, _)) => None,
            r => Some(r),
        }
    }
}

impl FusedIterator for Lexer<'_> {}

/// Represents an error that occurs during lexing and is
/// returned by the `parse_*` functions in the iterator in the
/// [lexer] implementation.
///
/// [lexer]: crate::lexer
#[derive(Debug, Clone, PartialEq)]
pub struct LexicalError {
    /// The type of error that occurred.
    pub error: LexicalErrorType,
    /// The location of the error.
    pub location: TextSize,
}

impl LexicalError {
    /// Creates a new `LexicalError` with the given error type and location.
    pub fn new(error: LexicalErrorType, location: TextSize) -> Self {
        Self { error, location }
    }
}

impl std::ops::Deref for LexicalError {
    type Target = LexicalErrorType;

    fn deref(&self) -> &Self::Target {
        &self.error
    }
}

impl std::error::Error for LexicalError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        Some(&self.error)
    }
}

impl std::fmt::Display for LexicalError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(
            f,
            "{} at byte offset {}",
            &self.error,
            u32::from(self.location)
        )
    }
}

/// Represents the different types of errors that can occur during lexing.
#[derive(Debug, Clone, PartialEq)]
pub enum LexicalErrorType {
    // TODO: Can probably be removed, the places it is used seem to be able
    // to use the `UnicodeError` variant instead.
    #[doc(hidden)]
    StringError,
    // TODO: Should take a start/end position to report.
    /// Decoding of a unicode escape sequence in a string literal failed.
    UnicodeError,
    /// The nesting of brackets/braces/parentheses is not balanced.
    NestingError,
    /// The indentation is not consistent.
    IndentationError,
    /// Inconsistent use of tabs and spaces.
    TabError,
    /// Encountered a tab after a space.
    TabsAfterSpaces,
    /// A non-default argument follows a default argument.
    DefaultArgumentError,
    /// A duplicate argument was found in a function definition.
    DuplicateArgumentError(String),
    /// A positional argument follows a keyword argument.
    PositionalArgumentError,
    /// An iterable argument unpacking `*args` follows keyword argument unpacking `**kwargs`.
    UnpackedArgumentError,
    /// A keyword argument was repeated.
    DuplicateKeywordArgumentError(String),
    /// An unrecognized token was encountered.
    UnrecognizedToken { tok: char },
    /// An f-string error containing the [`FStringErrorType`].
    FStringError(FStringErrorType),
    /// An unexpected character was encountered after a line continuation.
    LineContinuationError,
    /// An unexpected end of file was encountered.
    Eof,
    /// Occurs when a syntactically invalid assignment was encountered.
    AssignmentError,
    /// An unexpected error occurred.
    OtherError(String),
}

impl std::error::Error for LexicalErrorType {}

impl std::fmt::Display for LexicalErrorType {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            LexicalErrorType::StringError => write!(f, "Got unexpected string"),
            LexicalErrorType::FStringError(error) => write!(f, "f-string: {error}"),
            LexicalErrorType::UnicodeError => write!(f, "Got unexpected unicode"),
            LexicalErrorType::NestingError => write!(f, "Got unexpected nesting"),
            LexicalErrorType::IndentationError => {
                write!(f, "unindent does not match any outer indentation level")
            }
            LexicalErrorType::TabError => {
                write!(f, "inconsistent use of tabs and spaces in indentation")
            }
            LexicalErrorType::TabsAfterSpaces => {
                write!(f, "Tabs not allowed as part of indentation after spaces")
            }
            LexicalErrorType::DefaultArgumentError => {
                write!(f, "non-default argument follows default argument")
            }
            LexicalErrorType::DuplicateArgumentError(arg_name) => {
                write!(f, "duplicate argument '{arg_name}' in function definition")
            }
            LexicalErrorType::DuplicateKeywordArgumentError(arg_name) => {
                write!(f, "keyword argument repeated: {arg_name}")
            }
            LexicalErrorType::PositionalArgumentError => {
                write!(f, "positional argument follows keyword argument")
            }
            LexicalErrorType::UnpackedArgumentError => {
                write!(
                    f,
                    "iterable argument unpacking follows keyword argument unpacking"
                )
            }
            LexicalErrorType::UnrecognizedToken { tok } => {
                write!(f, "Got unexpected token {tok}")
            }
            LexicalErrorType::LineContinuationError => {
                write!(f, "unexpected character after line continuation character")
            }
            LexicalErrorType::Eof => write!(f, "unexpected EOF while parsing"),
            LexicalErrorType::AssignmentError => write!(f, "invalid assignment target"),
            LexicalErrorType::OtherError(msg) => write!(f, "{msg}"),
        }
    }
}

#[derive(Copy, Clone, Debug)]
enum State {
    /// Lexer is right at the beginning of the file or after a `Newline` token.
    AfterNewline,

    /// The lexer is at the start of a new logical line but **after** the indentation.
    NonEmptyLogicalLine,

    /// Lexer is right after an equal token.
    AfterEqual,

    /// Inside of a logical line.
    Other,
}

impl State {
    const fn is_after_newline(self) -> bool {
        matches!(self, State::AfterNewline)
    }

    const fn is_new_logical_line(self) -> bool {
        matches!(self, State::AfterNewline | State::NonEmptyLogicalLine)
    }

    const fn is_after_equal(self) -> bool {
        matches!(self, State::AfterEqual)
    }
}

#[derive(Copy, Clone, Debug)]
enum Radix {
    Binary,
    Octal,
    Decimal,
    Hex,
}

impl Radix {
    const fn as_u32(self) -> u32 {
        match self {
            Radix::Binary => 2,
            Radix::Octal => 8,
            Radix::Decimal => 10,
            Radix::Hex => 16,
        }
    }

    const fn is_digit(self, c: char) -> bool {
        match self {
            Radix::Binary => matches!(c, '0'..='1'),
            Radix::Octal => matches!(c, '0'..='7'),
            Radix::Decimal => c.is_ascii_digit(),
            Radix::Hex => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
        }
    }
}

const fn is_quote(c: char) -> bool {
    matches!(c, '\'' | '"')
}

const fn is_ascii_identifier_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}

// Checks if the character c is a valid starting character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}

// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => true,
        c => is_xid_continue(c),
    }
}

/// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
/// characters.
///
/// This is the same as `ruff_python_trivia::is_python_whitespace` and is copied
/// here to avoid a circular dependency as `ruff_python_trivia` has a dev-dependency
/// on `ruff_python_lexer`.
const fn is_python_whitespace(c: char) -> bool {
    matches!(
        c,
        // Space, tab, or form-feed
        ' ' | '\t' | '\x0C'
    )
}

enum LexedText<'a> {
    Source { source: &'a str, range: TextRange },
    Owned(String),
}

impl<'a> LexedText<'a> {
    fn new(start: TextSize, source: &'a str) -> Self {
        Self::Source {
            range: TextRange::empty(start),
            source,
        }
    }

    fn push(&mut self, c: char) {
        match self {
            LexedText::Source { range, source } => {
                *range = range.add_end(c.text_len());
                debug_assert!(source[*range].ends_with(c));
            }
            LexedText::Owned(owned) => owned.push(c),
        }
    }

    fn as_str<'b>(&'b self) -> &'b str
    where
        'b: 'a,
    {
        match self {
            LexedText::Source { range, source } => &source[*range],
            LexedText::Owned(owned) => owned,
        }
    }

    fn skip_char(&mut self) {
        match self {
            LexedText::Source { range, source } => {
                *self = LexedText::Owned(source[*range].to_string());
            }
            LexedText::Owned(_) => {}
        }
    }
}

#[cfg(test)]
|
||
mod tests {
|
||
use insta::assert_debug_snapshot;
|
||
|
||
use super::*;
|
||
|
||
const WINDOWS_EOL: &str = "\r\n";
|
||
const MAC_EOL: &str = "\r";
|
||
const UNIX_EOL: &str = "\n";
|
||
|
||
fn lex_source_with_mode(source: &str, mode: Mode) -> Vec<Spanned> {
|
||
let lexer = lex(source, mode);
|
||
lexer.map(std::result::Result::unwrap).collect()
|
||
}
|
||
|
||
fn lex_source(source: &str) -> Vec<Spanned> {
|
||
lex_source_with_mode(source, Mode::Module)
|
||
}
|
||
|
||
fn lex_jupyter_source(source: &str) -> Vec<Spanned> {
|
||
lex_source_with_mode(source, Mode::Ipython)
|
||
}
|
||
|
||
fn ipython_escape_command_line_continuation_eol(eol: &str) -> Vec<Spanned> {
|
||
let source = format!("%matplotlib \\{eol} --inline");
|
||
lex_jupyter_source(&source)
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_escape_command_line_continuation_unix_eol() {
|
||
assert_debug_snapshot!(ipython_escape_command_line_continuation_eol(UNIX_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_escape_command_line_continuation_mac_eol() {
|
||
assert_debug_snapshot!(ipython_escape_command_line_continuation_eol(MAC_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_escape_command_line_continuation_windows_eol() {
|
||
assert_debug_snapshot!(ipython_escape_command_line_continuation_eol(WINDOWS_EOL));
|
||
}
|
||
|
||
fn ipython_escape_command_line_continuation_with_eol_and_eof(eol: &str) -> Vec<Spanned> {
|
||
let source = format!("%matplotlib \\{eol}");
|
||
lex_jupyter_source(&source)
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_escape_command_line_continuation_with_unix_eol_and_eof() {
|
||
assert_debug_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
|
||
UNIX_EOL
|
||
));
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_escape_command_line_continuation_with_mac_eol_and_eof() {
|
||
assert_debug_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
|
||
MAC_EOL
|
||
));
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_escape_command_line_continuation_with_windows_eol_and_eof() {
|
||
assert_debug_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
|
||
WINDOWS_EOL
|
||
));
|
||
}
|
||
|
||
#[test]
|
||
fn test_empty_ipython_escape_command() {
|
||
let source = "%\n%%\n!\n!!\n?\n??\n/\n,\n;";
|
||
assert_debug_snapshot!(lex_jupyter_source(source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_escape_command() {
|
||
let source = r"
|
||
?foo
|
||
??foo
|
||
%timeit a = b
|
||
%timeit a % 3
|
||
%matplotlib \
|
||
--inline
|
||
!pwd \
|
||
&& ls -a | sed 's/^/\\ /'
|
||
!!cd /Users/foo/Library/Application\ Support/
|
||
/foo 1 2
|
||
,foo 1 2
|
||
;foo 1 2
|
||
!ls
|
||
"
|
||
.trim();
|
||
assert_debug_snapshot!(lex_jupyter_source(source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_help_end_escape_command() {
|
||
let source = r"
|
||
?foo?
|
||
?? foo?
|
||
?? foo ?
|
||
?foo??
|
||
??foo??
|
||
???foo?
|
||
???foo??
|
||
??foo???
|
||
???foo???
|
||
?? \
|
||
foo?
|
||
?? \
|
||
?
|
||
????
|
||
%foo?
|
||
%foo??
|
||
%%foo???
|
||
!pwd?"
|
||
.trim();
|
||
assert_debug_snapshot!(lex_jupyter_source(source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_escape_command_indentation() {
|
||
let source = r"
|
||
if True:
|
||
%matplotlib \
|
||
--inline"
|
||
.trim();
|
||
assert_debug_snapshot!(lex_jupyter_source(source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_escape_command_assignment() {
|
||
let source = r"
|
||
pwd = !pwd
|
||
foo = %timeit a = b
|
||
bar = %timeit a % 3
|
||
baz = %matplotlib \
|
||
inline"
|
||
.trim();
|
||
assert_debug_snapshot!(lex_jupyter_source(source));
|
||
}
|
||
|
||
fn assert_no_ipython_escape_command(tokens: &[Spanned]) {
|
||
for (tok, _) in tokens {
|
||
if let Tok::IpyEscapeCommand { .. } = tok {
|
||
panic!("Unexpected escape command token: {tok:?}")
|
||
}
|
||
}
|
||
}
|
||
|
||
#[test]
|
||
fn test_ipython_escape_command_not_an_assignment() {
|
||
let source = r"
|
||
# Other escape kinds are not valid here (can't test `foo = ?str` because '?' is not a valid token)
|
||
foo = /func
|
||
foo = ;func
|
||
foo = ,func
|
||
|
||
(foo == %timeit a = b)
|
||
(foo := %timeit a = b)
|
||
def f(arg=%timeit a = b):
|
||
pass"
|
||
.trim();
|
||
let tokens = lex_jupyter_source(source);
|
||
assert_no_ipython_escape_command(&tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn test_numbers() {
|
||
let source =
|
||
"0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j 000 0x995DC9BBDF1939FA";
|
||
assert_debug_snapshot!(lex_source(source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_invalid_leading_zero_small() {
|
||
let source = "025";
|
||
|
||
let lexer = lex(source, Mode::Module);
|
||
let tokens = lexer.collect::<Result<Vec<_>, LexicalError>>();
|
||
assert_debug_snapshot!(tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn test_invalid_leading_zero_big() {
|
||
let source =
|
||
"0252222222222222522222222222225222222222222252222222222222522222222222225222222222222";
|
||
|
||
let lexer = lex(source, Mode::Module);
|
||
let tokens = lexer.collect::<Result<Vec<_>, LexicalError>>();
|
||
assert_debug_snapshot!(tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn test_line_comment_long() {
|
||
let source = "99232 # foo".to_string();
|
||
assert_debug_snapshot!(lex_source(&source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_line_comment_whitespace() {
|
||
let source = "99232 # ".to_string();
|
||
assert_debug_snapshot!(lex_source(&source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_line_comment_single_whitespace() {
|
||
let source = "99232 # ".to_string();
|
||
assert_debug_snapshot!(lex_source(&source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_line_comment_empty() {
|
||
let source = "99232 #".to_string();
|
||
assert_debug_snapshot!(lex_source(&source));
|
||
}
|
||
|
||
fn comment_until_eol(eol: &str) -> Vec<Spanned> {
|
||
let source = format!("123 # Foo{eol}456");
|
||
lex_source(&source)
|
||
}
|
||
|
||
#[test]
|
||
fn test_comment_until_unix_eol() {
|
||
assert_debug_snapshot!(comment_until_eol(UNIX_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_comment_until_mac_eol() {
|
||
assert_debug_snapshot!(comment_until_eol(MAC_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_comment_until_windows_eol() {
|
||
assert_debug_snapshot!(comment_until_eol(WINDOWS_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_assignment() {
|
||
let source = r"a_variable = 99 + 2-0";
|
||
assert_debug_snapshot!(lex_source(source));
|
||
}
|
||
|
||
fn indentation_with_eol(eol: &str) -> Vec<Spanned> {
|
||
let source = format!("def foo():{eol} return 99{eol}{eol}");
|
||
lex_source(&source)
|
||
}
|
||
|
||
#[test]
|
||
fn test_indentation_with_unix_eol() {
|
||
assert_debug_snapshot!(indentation_with_eol(UNIX_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_indentation_with_mac_eol() {
|
||
assert_debug_snapshot!(indentation_with_eol(MAC_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_indentation_with_windows_eol() {
|
||
assert_debug_snapshot!(indentation_with_eol(WINDOWS_EOL));
|
||
}
|
||
|
||
fn double_dedent_with_eol(eol: &str) -> Vec<Spanned> {
|
||
let source = format!("def foo():{eol} if x:{eol}{eol} return 99{eol}{eol}");
|
||
lex_source(&source)
|
||
}
|
||
|
||
#[test]
|
||
fn test_double_dedent_with_unix_eol() {
|
||
assert_debug_snapshot!(double_dedent_with_eol(UNIX_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_double_dedent_with_mac_eol() {
|
||
assert_debug_snapshot!(double_dedent_with_eol(MAC_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_double_dedent_with_windows_eol() {
|
||
assert_debug_snapshot!(double_dedent_with_eol(WINDOWS_EOL));
|
||
}
|
||
|
||
fn double_dedent_with_tabs_eol(eol: &str) -> Vec<Spanned> {
|
||
let source = format!("def foo():{eol}\tif x:{eol}{eol}\t\t return 99{eol}{eol}");
|
||
lex_source(&source)
|
||
}
|
||
|
||
#[test]
|
||
fn test_double_dedent_with_tabs_unix_eol() {
|
||
assert_debug_snapshot!(double_dedent_with_tabs_eol(UNIX_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_double_dedent_with_tabs_mac_eol() {
|
||
assert_debug_snapshot!(double_dedent_with_tabs_eol(MAC_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_double_dedent_with_tabs_windows_eol() {
|
||
assert_debug_snapshot!(double_dedent_with_tabs_eol(WINDOWS_EOL));
|
||
}
|
||
|
||
fn newline_in_brackets_eol(eol: &str) -> Vec<Spanned> {
|
||
let source = r"x = [
|
||
|
||
1,2
|
||
,(3,
|
||
4,
|
||
), {
|
||
5,
|
||
6,\
|
||
7}]
|
||
"
|
||
.replace('\n', eol);
|
||
lex_source(&source)
|
||
}
|
||
|
||
#[test]
|
||
fn test_newline_in_brackets_unix_eol() {
|
||
assert_debug_snapshot!(newline_in_brackets_eol(UNIX_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_newline_in_brackets_mac_eol() {
|
||
assert_debug_snapshot!(newline_in_brackets_eol(MAC_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_newline_in_brackets_windows_eol() {
|
||
assert_debug_snapshot!(newline_in_brackets_eol(WINDOWS_EOL));
|
||
}
|
||
|
||
#[test]
|
||
fn test_non_logical_newline_in_string_continuation() {
|
||
let source = r"(
|
||
'a'
|
||
'b'
|
||
|
||
'c' \
|
||
'd'
|
||
)";
|
||
assert_debug_snapshot!(lex_source(source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_logical_newline_line_comment() {
|
||
let source = "#Hello\n#World\n";
|
||
assert_debug_snapshot!(lex_source(source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_operators() {
|
||
let source = "//////=/ /";
|
||
assert_debug_snapshot!(lex_source(source));
|
||
}
|
||
|
||
#[test]
|
||
fn test_string() {
|
||
let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#;
|
||
assert_debug_snapshot!(lex_source(source));
|
||
}

    fn string_continuation_with_eol(eol: &str) -> Vec<Spanned> {
        let source = format!("\"abc\\{eol}def\"");
        lex_source(&source)
    }

    #[test]
    fn test_string_continuation_with_unix_eol() {
        assert_debug_snapshot!(string_continuation_with_eol(UNIX_EOL));
    }

    #[test]
    fn test_string_continuation_with_mac_eol() {
        assert_debug_snapshot!(string_continuation_with_eol(MAC_EOL));
    }

    #[test]
    fn test_string_continuation_with_windows_eol() {
        assert_debug_snapshot!(string_continuation_with_eol(WINDOWS_EOL));
    }

    #[test]
    fn test_escape_unicode_name() {
        let source = r#""\N{EN SPACE}""#;
        assert_debug_snapshot!(lex_source(source));
    }

    fn triple_quoted_eol(eol: &str) -> Vec<Spanned> {
        let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
        lex_source(&source)
    }

    #[test]
    fn test_triple_quoted_unix_eol() {
        assert_debug_snapshot!(triple_quoted_eol(UNIX_EOL));
    }

    #[test]
    fn test_triple_quoted_mac_eol() {
        assert_debug_snapshot!(triple_quoted_eol(MAC_EOL));
    }

    #[test]
    fn test_triple_quoted_windows_eol() {
        assert_debug_snapshot!(triple_quoted_eol(WINDOWS_EOL));
    }

    // This test case just makes sure that the lexer doesn't go into an
    // infinite loop on invalid input.
    #[test]
    fn test_infinite_loop() {
        let source = "[1";
        let _ = lex(source, Mode::Module).collect::<Vec<_>>();
    }
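
    // Editor's sketch (not part of the original suite): a few more unterminated
    // bracket inputs, checking the same termination property as
    // `test_infinite_loop`. No assertions are made on the token stream itself.
    #[test]
    fn test_no_infinite_loop_on_unbalanced_brackets() {
        for source in ["(1", "{1", "[[", "[1,"] {
            let _ = lex(source, Mode::Module).collect::<Vec<_>>();
        }
    }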

    /// Emoji identifiers are a non-standard Python feature and are not supported by our lexer.
    #[test]
    fn test_emoji_identifier() {
        let source = "🐦";

        let lexed: Vec<_> = lex(source, Mode::Module).collect();

        match lexed.as_slice() {
            [Err(error)] => {
                assert_eq!(
                    error.error,
                    LexicalErrorType::UnrecognizedToken { tok: '🐦' }
                );
            }
            result => panic!("Expected an error token but found {result:?}"),
        }
    }
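
    // Editor's sketch (assumption, not part of the original suite): an emoji is
    // not a valid identifier *continuation* either, so an otherwise valid name
    // followed by an emoji should still produce at least one lexical error.
    #[test]
    fn test_emoji_identifier_continuation() {
        assert!(lex("x🐦", Mode::Module).any(|result| result.is_err()));
    }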

    #[test]
    fn test_too_low_dedent() {
        let tokens: Vec<_> = lex(
            "if True:
    pass
  pass",
            Mode::Module,
        )
        .collect();
        assert_debug_snapshot!(tokens);
    }
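
    // Editor's sketch (not part of the original suite): the too-low dedent above
    // is an error, but lexing should still run to completion and produce a
    // non-empty token stream rather than bailing out early.
    #[test]
    fn test_too_low_dedent_still_terminates() {
        let tokens: Vec<_> = lex("if True:\n    pass\n  pass", Mode::Module).collect();
        assert!(!tokens.is_empty());
    }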

    #[test]
    fn test_empty_fstrings() {
        let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_prefix() {
        let source = r#"f"" F"" rf"" rF"" Rf"" RF"" fr"" Fr"" fR"" FR"""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring() {
        let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_parentheses() {
        let source = r#"f"{}" f"{{}}" f" {}" f"{{{}}}" f"{{{{}}}}" f" {} {{}} {{{}}} {{{{}}}} ""#;
        assert_debug_snapshot!(lex_source(source));
    }

    fn fstring_single_quote_escape_eol(eol: &str) -> Vec<Spanned> {
        let source = format!(r"f'text \{eol} more text'");
        lex_source(&source)
    }

    #[test]
    fn test_fstring_single_quote_escape_unix_eol() {
        assert_debug_snapshot!(fstring_single_quote_escape_eol(UNIX_EOL));
    }

    #[test]
    fn test_fstring_single_quote_escape_mac_eol() {
        assert_debug_snapshot!(fstring_single_quote_escape_eol(MAC_EOL));
    }

    #[test]
    fn test_fstring_single_quote_escape_windows_eol() {
        assert_debug_snapshot!(fstring_single_quote_escape_eol(WINDOWS_EOL));
    }

    #[test]
    fn test_fstring_escape() {
        let source = r#"f"\{x:\"\{x}} \"\"\
 end""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_escape_braces() {
        let source = r"f'\{foo}' f'\\{foo}' f'\{{foo}}' f'\\{{foo}}'";
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_escape_raw() {
        let source = r#"rf"\{x:\"\{x}} \"\"\
 end""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_named_unicode() {
        let source = r#"f"\N{BULLET} normal \Nope \N""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_named_unicode_raw() {
        let source = r#"rf"\N{BULLET} normal""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_named_expression() {
        let source = r#"f"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_format_spec() {
        let source = r#"f"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_multiline_format_spec() {
        // The last f-string is syntactically invalid, but we should still lex it.
        // Note that the `b` is a `Name` token and not a `FStringMiddle` token.
        let source = r"f'''__{
    x:d
}__'''
f'''__{
    x:a
        b
          c
}__'''
f'__{
    x:d
}__'
f'__{
    x:a
        b
}__'
";
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_conversion() {
        let source = r#"f"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_nested() {
        let source = r#"f"foo {f"bar {x + f"{wow}"}"} baz" f'foo {f'bar'} some {f"another"}'"#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_expression_multiline() {
        let source = r#"f"first {
    x
        *
            y
} second""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_multiline() {
        let source = r#"f"""
hello
    world
""" f'''
    world
hello
''' f"some {f"""multiline
allowed {x}"""} string""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_comments() {
        let source = r#"f"""
# not a comment { # comment {
    x
} # not a comment
""""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_ipy_escape_command() {
        let source = r#"f"foo {!pwd} bar""#;
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_lambda_expression() {
        let source = r#"
f"{lambda x:{x}}"
f"{(lambda x:{x})}"
"#
        .trim();
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_fstring_with_nul_char() {
        let source = r"f'\0'";
        assert_debug_snapshot!(lex_source(source));
    }

    #[test]
    fn test_match_softkeyword_in_notebook() {
        let source = r"match foo:
    case bar:
        pass";
        assert_debug_snapshot!(lex_jupyter_source(source));
    }

    fn lex_error(source: &str) -> LexicalError {
        match lex(source, Mode::Module).find_map(Result::err) {
            Some(err) => err,
            None => panic!("Expected at least one error"),
        }
    }

    fn lex_fstring_error(source: &str) -> FStringErrorType {
        match lex_error(source).error {
            LexicalErrorType::FStringError(error) => error,
            err => panic!("Expected FStringError: {err:?}"),
        }
    }

    #[test]
    fn test_fstring_error() {
        use FStringErrorType::{
            SingleRbrace, UnclosedLbrace, UnterminatedString, UnterminatedTripleQuotedString,
        };

        assert_eq!(lex_fstring_error("f'}'"), SingleRbrace);
        assert_eq!(lex_fstring_error("f'{{}'"), SingleRbrace);
        assert_eq!(lex_fstring_error("f'{{}}}'"), SingleRbrace);
        assert_eq!(lex_fstring_error("f'foo}'"), SingleRbrace);
        assert_eq!(lex_fstring_error(r"f'\u007b}'"), SingleRbrace);
        assert_eq!(lex_fstring_error("f'{a:b}}'"), SingleRbrace);
        assert_eq!(lex_fstring_error("f'{3:}}>10}'"), SingleRbrace);
        assert_eq!(lex_fstring_error(r"f'\{foo}\}'"), SingleRbrace);

        assert_eq!(lex_fstring_error("f'{'"), UnclosedLbrace);
        assert_eq!(lex_fstring_error("f'{foo!r'"), UnclosedLbrace);
        assert_eq!(lex_fstring_error("f'{foo='"), UnclosedLbrace);
        assert_eq!(
            lex_fstring_error(
                r#"f"{"
"#
            ),
            UnclosedLbrace
        );
        assert_eq!(lex_fstring_error(r#"f"""{""""#), UnclosedLbrace);

        assert_eq!(lex_fstring_error(r#"f""#), UnterminatedString);
        assert_eq!(lex_fstring_error(r"f'"), UnterminatedString);

        assert_eq!(lex_fstring_error(r#"f""""#), UnterminatedTripleQuotedString);
        assert_eq!(lex_fstring_error(r"f'''"), UnterminatedTripleQuotedString);
        assert_eq!(
            lex_fstring_error(r#"f"""""#),
            UnterminatedTripleQuotedString
        );
        assert_eq!(
            lex_fstring_error(r#"f""""""#),
            UnterminatedTripleQuotedString
        );
    }

    #[test]
    fn test_fstring_error_location() {
        assert_debug_snapshot!(lex_error("f'{'"), @r###"
        LexicalError {
            error: FStringError(
                UnclosedLbrace,
            ),
            location: 4,
        }
        "###);

        assert_debug_snapshot!(lex_error("f'{'α"), @r###"
        LexicalError {
            error: FStringError(
                UnclosedLbrace,
            ),
            location: 6,
        }
        "###);
    }
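
    // Editor's sketch (assumption, not part of the original suite): whatever
    // offset an error reports, it should land on a `char` boundary of the
    // source, even when the text ends in a multi-byte character, so callers
    // can slice the source at that location safely.
    #[test]
    fn test_fstring_error_location_is_char_boundary() {
        let source = "f'{'🐦";
        let error = lex_error(source);
        assert!(source.is_char_boundary(error.location.to_usize()));
    }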
}