ruff/crates/ruff_python_parser/src/lexer.rs
Charlie Marsh f0d43dafcf
Ignore trailing quotes for unclosed l-brace errors (#9388)
## Summary

Given:

```python
F"{"ڤ
```

We try to locate the "unclosed left brace" error by subtracting the
quote size from the lexer offset -- so we subtract 1 from the end of the
source, which puts us in the middle of a Unicode character. I don't
think we should try to adjust the offset in this way, since there can be
content _after_ the quote. For example, with the advent of PEP 701, this
string could reasonably be fixed as:

```python
F"{"ڤ"}"
```

Closes https://github.com/astral-sh/ruff/issues/9379.
2024-01-04 05:00:55 +00:00

//! This module takes care of lexing Python source text.
//!
//! This means source code is scanned and translated into separate tokens. The rules
//! governing what is and is not a valid token are defined in the Python reference
//! guide section on [Lexical analysis].
//!
//! The primary function in this module is [`lex`], which takes a string slice
//! and returns an iterator over the tokens in the source code. The tokens are currently returned
//! as a `Result<Spanned, LexicalError>`, where [`Spanned`] is a tuple containing the
//! [`Tok`] denoting the token and its [`TextRange`] in the source.
//!
//! # Example
//!
//! ```
//! use ruff_python_parser::{lexer::lex, Tok, Mode, StringKind};
//!
//! let source = "x = 'RustPython'";
//! let tokens = lex(source, Mode::Module)
//! .map(|tok| tok.expect("Failed to lex"))
//! .collect::<Vec<_>>();
//!
//! for (token, range) in tokens {
//! println!(
//! "{token:?}@{range:?}",
//! );
//! }
//! ```
//!
//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
use std::iter::FusedIterator;
use std::{char, cmp::Ordering, str::FromStr};
use unicode_ident::{is_xid_continue, is_xid_start};
use ruff_python_ast::{Int, IpyEscapeKind};
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::lexer::cursor::{Cursor, EOF_CHAR};
use crate::lexer::fstring::{FStringContext, FStringContextFlags, FStrings};
use crate::lexer::indentation::{Indentation, Indentations};
use crate::{
soft_keywords::SoftKeywordTransformer,
string::FStringErrorType,
token::{StringKind, Tok},
Mode,
};
mod cursor;
mod fstring;
mod indentation;
/// A lexer for Python source code.
pub struct Lexer<'source> {
// Contains the source code to be lexed.
cursor: Cursor<'source>,
source: &'source str,
state: State,
// The current nesting level, i.e., the number of open parentheses, brackets, and braces.
nesting: u32,
// Indentation levels.
indentations: Indentations,
pending_indentation: Option<Indentation>,
// Lexer mode.
mode: Mode,
// F-string contexts.
fstrings: FStrings,
}
/// Contains a Token along with its `range`.
pub type Spanned = (Tok, TextRange);
/// The result of lexing a token.
pub type LexResult = Result<Spanned, LexicalError>;
/// Create a new lexer from a source string.
///
/// # Examples
///
/// ```
/// use ruff_python_parser::{Mode, lexer::lex};
///
/// let source = "def hello(): return 'world'";
/// let lexer = lex(source, Mode::Module);
///
/// for token in lexer {
/// println!("{:?}", token);
/// }
/// ```
#[inline]
pub fn lex(source: &str, mode: Mode) -> SoftKeywordTransformer<Lexer> {
SoftKeywordTransformer::new(Lexer::new(source, mode), mode)
}
pub struct LexStartsAtIterator<I> {
start_offset: TextSize,
inner: I,
}
impl<I> Iterator for LexStartsAtIterator<I>
where
I: Iterator<Item = LexResult>,
{
type Item = LexResult;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
let result = match self.inner.next()? {
Ok((tok, range)) => Ok((tok, range + self.start_offset)),
Err(error) => Err(LexicalError {
location: error.location + self.start_offset,
..error
}),
};
Some(result)
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
impl<I> FusedIterator for LexStartsAtIterator<I> where I: Iterator<Item = LexResult> + FusedIterator {}
impl<I> ExactSizeIterator for LexStartsAtIterator<I> where
I: Iterator<Item = LexResult> + ExactSizeIterator
{
}
/// Create a new lexer from a source string, starting at a given location.
/// You probably want to use [`lex`] instead.
pub fn lex_starts_at(
source: &str,
mode: Mode,
start_offset: TextSize,
) -> LexStartsAtIterator<SoftKeywordTransformer<Lexer>> {
LexStartsAtIterator {
start_offset,
inner: lex(source, mode),
}
}
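// A minimal usage sketch (illustrative only; `_demo_lex_starts_at` is a
// hypothetical helper, not part of the public API): every token range is
// shifted by the given start offset.
#[allow(dead_code)]
fn _demo_lex_starts_at() {
    let source = "x = 1";
    // Pretend this snippet begins at byte offset 10 of a larger document.
    for result in lex_starts_at(source, Mode::Module, TextSize::new(10)) {
        let (tok, range) = result.expect("Failed to lex");
        // The first token (`x`) is reported at 10..11 rather than 0..1.
        println!("{tok:?}@{range:?}");
    }
}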
impl<'source> Lexer<'source> {
/// Create a new lexer for the given source string. You probably want to use
/// [`lex`] instead.
pub fn new(input: &'source str, mode: Mode) -> Self {
assert!(
u32::try_from(input.len()).is_ok(),
"Lexer only supports files with a size up to 4GB"
);
let mut lxr = Lexer {
state: State::AfterNewline,
nesting: 0,
indentations: Indentations::default(),
pending_indentation: None,
source: input,
cursor: Cursor::new(input),
mode,
fstrings: FStrings::default(),
};
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
// spell-checker:ignore feff
lxr.cursor.eat_char('\u{feff}');
lxr
}
/// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
fn lex_identifier(&mut self, first: char) -> Result<Tok, LexicalError> {
// Detect a potential string prefix like rb'' b'' f'' u'' r''
match (first, self.cursor.first()) {
('f' | 'F', quote @ ('\'' | '"')) => {
self.cursor.bump();
return Ok(self.lex_fstring_start(quote, false));
}
('r' | 'R', 'f' | 'F') | ('f' | 'F', 'r' | 'R') if is_quote(self.cursor.second()) => {
self.cursor.bump();
let quote = self.cursor.bump().unwrap();
return Ok(self.lex_fstring_start(quote, true));
}
(_, quote @ ('\'' | '"')) => {
if let Ok(string_kind) = StringKind::try_from(first) {
self.cursor.bump();
return self.lex_string(string_kind, quote);
}
}
(_, second @ ('r' | 'R' | 'b' | 'B')) if is_quote(self.cursor.second()) => {
self.cursor.bump();
if let Ok(string_kind) = StringKind::try_from([first, second]) {
let quote = self.cursor.bump().unwrap();
return self.lex_string(string_kind, quote);
}
}
_ => {}
}
self.cursor.eat_while(is_identifier_continuation);
let text = self.token_text();
let keyword = match text {
"False" => Tok::False,
"None" => Tok::None,
"True" => Tok::True,
"and" => Tok::And,
"as" => Tok::As,
"assert" => Tok::Assert,
"async" => Tok::Async,
"await" => Tok::Await,
"break" => Tok::Break,
"case" => Tok::Case,
"class" => Tok::Class,
"continue" => Tok::Continue,
"def" => Tok::Def,
"del" => Tok::Del,
"elif" => Tok::Elif,
"else" => Tok::Else,
"except" => Tok::Except,
"finally" => Tok::Finally,
"for" => Tok::For,
"from" => Tok::From,
"global" => Tok::Global,
"if" => Tok::If,
"import" => Tok::Import,
"in" => Tok::In,
"is" => Tok::Is,
"lambda" => Tok::Lambda,
"match" => Tok::Match,
"nonlocal" => Tok::Nonlocal,
"not" => Tok::Not,
"or" => Tok::Or,
"pass" => Tok::Pass,
"raise" => Tok::Raise,
"return" => Tok::Return,
"try" => Tok::Try,
"type" => Tok::Type,
"while" => Tok::While,
"with" => Tok::With,
"yield" => Tok::Yield,
_ => {
return Ok(Tok::Name {
name: text.to_string(),
})
}
};
Ok(keyword)
}
/// Numeric lexing. The feast can start!
fn lex_number(&mut self, first: char) -> Result<Tok, LexicalError> {
if first == '0' {
if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
self.lex_number_radix(Radix::Hex)
} else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
self.lex_number_radix(Radix::Octal)
} else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
self.lex_number_radix(Radix::Binary)
} else {
self.lex_decimal_number(first)
}
} else {
self.lex_decimal_number(first)
}
}
/// Lex a hex/octal/decimal/binary number without a decimal point.
fn lex_number_radix(&mut self, radix: Radix) -> Result<Tok, LexicalError> {
#[cfg(debug_assertions)]
debug_assert!(matches!(
self.cursor.previous().to_ascii_lowercase(),
'x' | 'o' | 'b'
));
// Lex the portion of the token after the base prefix (e.g., `9D5` in `0x9D5`).
let mut number = LexedText::new(self.offset(), self.source);
self.radix_run(&mut number, radix);
// Extract the entire number, including the base prefix (e.g., `0x9D5`).
let token = &self.source[self.token_range()];
let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
Ok(int) => int,
Err(err) => {
return Err(LexicalError {
error: LexicalErrorType::OtherError(format!("{err:?}")),
location: self.token_range().start(),
});
}
};
Ok(Tok::Int { value })
}
/// Lex a normal number, that is, one that is not an octal, hex, or binary number.
fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> Result<Tok, LexicalError> {
#[cfg(debug_assertions)]
debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
let start_is_zero = first_digit_or_dot == '0';
let mut number = LexedText::new(self.token_start(), self.source);
if first_digit_or_dot != '.' {
number.push(first_digit_or_dot);
self.radix_run(&mut number, Radix::Decimal);
};
let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
number.push('.');
if self.cursor.eat_char('_') {
return Err(LexicalError {
error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
location: self.offset() - TextSize::new(1),
});
}
self.radix_run(&mut number, Radix::Decimal);
true
} else {
// Normal number:
false
};
let is_float = match self.cursor.rest().as_bytes() {
[b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
// 'e' | 'E'
number.push(self.cursor.bump().unwrap());
if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
number.push(sign);
}
self.radix_run(&mut number, Radix::Decimal);
true
}
_ => is_float,
};
if is_float {
// Improvement: Use `Cow` instead of pushing to value text
let value = f64::from_str(number.as_str()).map_err(|_| LexicalError {
error: LexicalErrorType::OtherError("Invalid decimal literal".to_owned()),
location: self.token_start(),
})?;
// Parse trailing 'j':
if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
Ok(Tok::Complex {
real: 0.0,
imag: value,
})
} else {
Ok(Tok::Float { value })
}
} else {
// Parse trailing 'j':
if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
let imag = f64::from_str(number.as_str()).unwrap();
Ok(Tok::Complex { real: 0.0, imag })
} else {
let value = match Int::from_str(number.as_str()) {
Ok(value) => {
if start_is_zero && value.as_u8() != Some(0) {
// Leading zeros in decimal integer literals are not permitted.
return Err(LexicalError {
error: LexicalErrorType::OtherError("Invalid Token".to_owned()),
location: self.token_range().start(),
});
}
value
}
Err(err) => {
return Err(LexicalError {
error: LexicalErrorType::OtherError(format!("{err:?}")),
location: self.token_range().start(),
})
}
};
Ok(Tok::Int { value })
}
}
}
/// Consume a sequence of digits with the given radix. The digits may be
/// separated by underscores, which are skipped, so '`1_2_3_4`' is read
/// as '1234'.
fn radix_run(&mut self, number: &mut LexedText, radix: Radix) {
loop {
if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
number.push(c);
}
// Number that contains `_` separators. Remove them from the parsed text.
else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
// Skip over `_`
self.cursor.bump();
number.skip_char();
} else {
break;
}
}
}
/// Lex a single comment.
fn lex_comment(&mut self) -> Tok {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), '#');
let bytes = self.cursor.rest().as_bytes();
let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len());
self.cursor.skip_bytes(offset);
Tok::Comment(self.token_text().to_string())
}
/// Lex a single IPython escape command.
fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> Tok {
let mut value = String::new();
loop {
match self.cursor.first() {
'\\' => {
// Only skip the line continuation if it is followed by a newline;
// otherwise, it is a normal backslash that is part of the magic command:
//
// Skip this backslash
// v
// !pwd \
// && ls -a | sed 's/^/\\ /'
// ^^
// Don't skip these backslashes
if self.cursor.second() == '\r' {
self.cursor.bump();
self.cursor.bump();
self.cursor.eat_char('\n');
continue;
} else if self.cursor.second() == '\n' {
self.cursor.bump();
self.cursor.bump();
continue;
}
self.cursor.bump();
value.push('\\');
}
// Help end escape commands are those that end with 1 or 2 question marks.
// Here, we're only looking for the subset of help end escape commands
// that have the escape token at the start of the line as well.
// On the other hand, we're not looking for help end escape commands that
// are strict in the sense that the escape token is only at the end. For example,
//
// * `%foo?` is recognized as a help end escape command but not as a strict one.
// * `foo?` is recognized as a strict help end escape command which is not
// lexed here but is identified at the parser level.
//
// Help end escape commands implemented in the IPython codebase using regex:
// https://github.com/ipython/ipython/blob/292e3a23459ca965b8c1bfe2c3707044c510209a/IPython/core/inputtransformer2.py#L454-L462
'?' => {
self.cursor.bump();
let mut question_count = 1u32;
while self.cursor.eat_char('?') {
question_count += 1;
}
// The original implementation in the IPython codebase is regex-based, which
// means it's strict: it won't recognize a help end escape command
// * if there's any whitespace before the escape token (e.g. `%foo ?`), or
// * if there are more than 2 question mark tokens (e.g. `%foo???`).
// We apply the same rules here; in those cases, we continue with
// the prefixed escape token.
//
// Now, the whitespace and empty value check also makes sure that an empty
// command (e.g. `%?` or `? ??`, no value after/between the escape tokens)
// is not recognized as a help end escape command. So, `%?` and `? ??` are
// `IpyEscapeKind::Magic` and `IpyEscapeKind::Help` because of the initial `%` and `??`
// tokens.
if question_count > 2
|| value.chars().last().map_or(true, is_python_whitespace)
|| !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
{
// Not a help end escape command, so continue with the lexing.
value.reserve(question_count as usize);
for _ in 0..question_count {
value.push('?');
}
continue;
}
if escape_kind.is_help() {
// If we've recognized this as a help end escape command, then
// any question mark token / whitespaces at the start are not
// considered as part of the value.
//
// For example, `??foo?` is recognized as `IpyEscapeKind::Help` and
// `value` is `foo` instead of `??foo`.
value = value.trim_start_matches([' ', '?']).to_string();
} else if escape_kind.is_magic() {
// Between `%` and `?` (at the end), the `?` takes priority
// over the `%` so `%foo?` is recognized as `IpyEscapeKind::Help`
// and `value` is `%foo` instead of `foo`. So, we need to
// insert the magic escape token at the start.
value.insert_str(0, escape_kind.as_str());
}
let kind = match question_count {
1 => IpyEscapeKind::Help,
2 => IpyEscapeKind::Help2,
_ => unreachable!("`question_count` is always 1 or 2"),
};
return Tok::IpyEscapeCommand { kind, value };
}
'\n' | '\r' | EOF_CHAR => {
return Tok::IpyEscapeCommand {
kind: escape_kind,
value,
};
}
c => {
self.cursor.bump();
value.push(c);
}
}
}
}
/// Lex an f-string start token.
fn lex_fstring_start(&mut self, quote: char, is_raw_string: bool) -> Tok {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), quote);
let mut flags = FStringContextFlags::empty();
if quote == '"' {
flags |= FStringContextFlags::DOUBLE;
}
if is_raw_string {
flags |= FStringContextFlags::RAW;
}
if self.cursor.eat_char2(quote, quote) {
flags |= FStringContextFlags::TRIPLE;
}
self.fstrings.push(FStringContext::new(flags, self.nesting));
Tok::FStringStart
}
/// Lex an f-string middle or end token.
fn lex_fstring_middle_or_end(&mut self) -> Result<Option<Tok>, LexicalError> {
// SAFETY: Safe because the function is only called when `self.fstrings` is not empty.
let fstring = self.fstrings.current().unwrap();
self.cursor.start_token();
// Check if we're at the end of the f-string.
if fstring.is_triple_quoted() {
let quote_char = fstring.quote_char();
if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
return Ok(Some(Tok::FStringEnd));
}
} else if self.cursor.eat_char(fstring.quote_char()) {
return Ok(Some(Tok::FStringEnd));
}
// We have to decode `{{` and `}}` into `{` and `}` respectively. As an
// optimization, we only allocate a new string if we find any escaped curly
// braces; otherwise, this string remains empty and we use a source slice instead.
let mut normalized = String::new();
// Tracks the last offset of the token value that has been written to `normalized`.
let mut last_offset = self.offset();
// This isn't going to change for the duration of the loop.
let in_format_spec = fstring.is_in_format_spec(self.nesting);
let mut in_named_unicode = false;
loop {
match self.cursor.first() {
// The condition is to differentiate between the `NUL` (`\0`) character
// in the source code and the one returned by `self.cursor.first()` when
// we reach the end of the source code.
EOF_CHAR if self.cursor.is_eof() => {
let error = if fstring.is_triple_quoted() {
FStringErrorType::UnterminatedTripleQuotedString
} else {
FStringErrorType::UnterminatedString
};
return Err(LexicalError {
error: LexicalErrorType::FStringError(error),
location: self.offset(),
});
}
'\n' | '\r' if !fstring.is_triple_quoted() => {
// If we encounter a newline while we're in a format spec, then
// we stop here and let the lexer emit the newline token.
//
// Relevant discussion: https://github.com/python/cpython/issues/110259
if in_format_spec {
break;
}
return Err(LexicalError {
error: LexicalErrorType::FStringError(FStringErrorType::UnterminatedString),
location: self.offset(),
});
}
'\\' => {
self.cursor.bump(); // '\'
if matches!(self.cursor.first(), '{' | '}') {
// Don't consume `{` or `}` as we want them to be emitted as tokens.
// They will be handled in the next iteration.
continue;
} else if !fstring.is_raw_string() {
if self.cursor.eat_char2('N', '{') {
in_named_unicode = true;
continue;
}
}
// Consume the escaped character.
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else {
self.cursor.bump();
}
}
quote @ ('\'' | '"') if quote == fstring.quote_char() => {
if let Some(triple_quotes) = fstring.triple_quotes() {
if self.cursor.rest().starts_with(triple_quotes) {
break;
}
self.cursor.bump();
} else {
break;
}
}
'{' => {
if self.cursor.second() == '{' && !in_format_spec {
self.cursor.bump();
normalized
.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
self.cursor.bump(); // Skip the second `{`
last_offset = self.offset();
} else {
break;
}
}
'}' => {
if in_named_unicode {
in_named_unicode = false;
self.cursor.bump();
} else if self.cursor.second() == '}' && !in_format_spec {
self.cursor.bump();
normalized
.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
self.cursor.bump(); // Skip the second `}`
last_offset = self.offset();
} else {
break;
}
}
_ => {
self.cursor.bump();
}
}
}
let range = self.token_range();
if range.is_empty() {
return Ok(None);
}
let value = if normalized.is_empty() {
self.source[range].to_string()
} else {
normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
normalized
};
Ok(Some(Tok::FStringMiddle {
value,
is_raw: fstring.is_raw_string(),
}))
}
/// Lex a string literal.
fn lex_string(&mut self, kind: StringKind, quote: char) -> Result<Tok, LexicalError> {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), quote);
// If the next two characters are also the quote character, then we have a triple-quoted
// string; consume those two characters and ensure that we require a triple-quote to close
let triple_quoted = self.cursor.eat_char2(quote, quote);
let value_start = self.offset();
let value_end = loop {
match self.cursor.bump() {
Some('\\') => {
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else {
self.cursor.bump();
}
}
Some('\r' | '\n') if !triple_quoted => {
if let Some(fstring) = self.fstrings.current() {
// When we are in an f-string, check whether the initial quote
// matches the f-string's quote; if so, the f-string is missing
// a '}' token, so raise the proper error.
if fstring.quote_char() == quote && !fstring.is_triple_quoted() {
return Err(LexicalError {
error: LexicalErrorType::FStringError(
FStringErrorType::UnclosedLbrace,
),
location: self.offset() - TextSize::new(1),
});
}
}
return Err(LexicalError {
error: LexicalErrorType::OtherError(
"EOL while scanning string literal".to_owned(),
),
location: self.offset() - TextSize::new(1),
});
}
Some(c) if c == quote => {
if triple_quoted {
if self.cursor.eat_char2(quote, quote) {
break self.offset() - TextSize::new(3);
}
} else {
break self.offset() - TextSize::new(1);
}
}
Some(_) => {}
None => {
if let Some(fstring) = self.fstrings.current() {
// When we are in an f-string, check whether the initial quote
// matches the f-string's quote; if so, the f-string is missing
// a '}' token, so raise the proper error.
if fstring.quote_char() == quote
&& fstring.is_triple_quoted() == triple_quoted
{
return Err(LexicalError {
error: LexicalErrorType::FStringError(
FStringErrorType::UnclosedLbrace,
),
location: self.offset(),
});
}
}
return Err(LexicalError {
error: if triple_quoted {
LexicalErrorType::Eof
} else {
LexicalErrorType::StringError
},
location: self.offset(),
});
}
}
};
let tok = Tok::String {
value: self.source[TextRange::new(value_start, value_end)].to_string(),
kind,
triple_quoted,
};
Ok(tok)
}
// This is the main entry point. Call this function to retrieve the next token.
// This function is used by the iterator implementation.
pub fn next_token(&mut self) -> LexResult {
if let Some(fstring) = self.fstrings.current() {
if !fstring.is_in_expression(self.nesting) {
match self.lex_fstring_middle_or_end() {
Ok(Some(tok)) => {
if tok == Tok::FStringEnd {
self.fstrings.pop();
}
return Ok((tok, self.token_range()));
}
Err(e) => {
// This is to prevent an infinite loop in which the lexer
// continuously returns an error token because the f-string
// remains on the stack.
self.fstrings.pop();
return Err(e);
}
_ => {}
}
}
}
// Return dedent tokens until the current indentation level matches the indentation of the next token.
else if let Some(indentation) = self.pending_indentation.take() {
match self.indentations.current().try_compare(indentation) {
Ok(Ordering::Greater) => {
self.pending_indentation = Some(indentation);
let offset = self.offset();
self.indentations.dedent_one(indentation).map_err(|_| {
LexicalError::new(LexicalErrorType::IndentationError, offset)
})?;
return Ok((Tok::Dedent, TextRange::empty(offset)));
}
Ok(_) => {}
Err(_) => {
return Err(LexicalError::new(
LexicalErrorType::IndentationError,
self.offset(),
));
}
}
}
if self.state.is_after_newline() {
if let Some(indentation) = self.eat_indentation()? {
return Ok(indentation);
}
} else {
self.skip_whitespace()?;
}
self.cursor.start_token();
if let Some(c) = self.cursor.bump() {
if c.is_ascii() {
self.consume_ascii_character(c)
} else if is_unicode_identifier_start(c) {
let identifier = self.lex_identifier(c)?;
self.state = State::Other;
Ok((identifier, self.token_range()))
} else {
Err(LexicalError {
error: LexicalErrorType::UnrecognizedToken { tok: c },
location: self.token_start(),
})
}
} else {
// Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line,
// empty the dedent stack, and finally, return the EndOfFile token.
self.consume_end()
}
}
fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
loop {
match self.cursor.first() {
' ' => {
self.cursor.bump();
}
'\t' => {
self.cursor.bump();
}
'\\' => {
self.cursor.bump();
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else if self.cursor.is_eof() {
return Err(LexicalError {
error: LexicalErrorType::Eof,
location: self.token_start(),
});
} else if !self.cursor.eat_char('\n') {
return Err(LexicalError {
error: LexicalErrorType::LineContinuationError,
location: self.token_start(),
});
}
}
// Form feed
'\x0C' => {
self.cursor.bump();
}
_ => break,
}
}
Ok(())
}
fn eat_indentation(&mut self) -> Result<Option<Spanned>, LexicalError> {
let mut indentation = Indentation::root();
self.cursor.start_token();
loop {
match self.cursor.first() {
' ' => {
self.cursor.bump();
indentation = indentation.add_space();
}
'\t' => {
self.cursor.bump();
indentation = indentation.add_tab();
}
'\\' => {
self.cursor.bump();
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else if self.cursor.is_eof() {
return Err(LexicalError {
error: LexicalErrorType::Eof,
location: self.token_start(),
});
} else if !self.cursor.eat_char('\n') {
return Err(LexicalError {
error: LexicalErrorType::LineContinuationError,
location: self.token_start(),
});
}
indentation = Indentation::root();
}
// Form feed
'\x0C' => {
self.cursor.bump();
indentation = Indentation::root();
}
_ => break,
}
}
// Handle indentation if this is a new, non-blank logical line.
if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
self.state = State::NonEmptyLogicalLine;
if let Some(spanned) = self.handle_indentation(indentation)? {
// The state set above ensures we don't handle indentation again on the next call.
return Ok(Some(spanned));
}
}
Ok(None)
}
fn handle_indentation(
&mut self,
indentation: Indentation,
) -> Result<Option<Spanned>, LexicalError> {
let token = match self.indentations.current().try_compare(indentation) {
// Dedent
Ok(Ordering::Greater) => {
self.pending_indentation = Some(indentation);
self.indentations.dedent_one(indentation).map_err(|_| {
LexicalError::new(LexicalErrorType::IndentationError, self.offset())
})?;
Some((Tok::Dedent, TextRange::empty(self.offset())))
}
Ok(Ordering::Equal) => None,
// Indent
Ok(Ordering::Less) => {
self.indentations.indent(indentation);
Some((Tok::Indent, self.token_range()))
}
Err(_) => {
return Err(LexicalError {
error: LexicalErrorType::IndentationError,
location: self.offset(),
});
}
};
Ok(token)
}
fn consume_end(&mut self) -> Result<Spanned, LexicalError> {
// We reached the end of the file.
// First of all, we need all nestings to be finished.
if self.nesting > 0 {
// Reset the nesting to avoid going into infinite loop.
self.nesting = 0;
return Err(LexicalError {
error: LexicalErrorType::Eof,
location: self.offset(),
});
}
// Next, insert a trailing newline, if required.
if !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
Ok((Tok::Newline, TextRange::empty(self.offset())))
}
// Next, flush the indentation stack to zero.
else if self.indentations.dedent().is_some() {
Ok((Tok::Dedent, TextRange::empty(self.offset())))
} else {
Ok((Tok::EndOfFile, TextRange::empty(self.offset())))
}
}
// Dispatch based on the given character.
fn consume_ascii_character(&mut self, c: char) -> Result<Spanned, LexicalError> {
let token = match c {
c if is_ascii_identifier_start(c) => self.lex_identifier(c)?,
'0'..='9' => self.lex_number(c)?,
'#' => return Ok((self.lex_comment(), self.token_range())),
'"' | '\'' => self.lex_string(StringKind::String, c)?,
'=' => {
if self.cursor.eat_char('=') {
Tok::EqEqual
} else {
self.state = State::AfterEqual;
return Ok((Tok::Equal, self.token_range()));
}
}
'+' => {
if self.cursor.eat_char('=') {
Tok::PlusEqual
} else {
Tok::Plus
}
}
'*' => {
if self.cursor.eat_char('=') {
Tok::StarEqual
} else if self.cursor.eat_char('*') {
if self.cursor.eat_char('=') {
Tok::DoubleStarEqual
} else {
Tok::DoubleStar
}
} else {
Tok::Star
}
}
c @ ('%' | '!')
if self.mode == Mode::Ipython
&& self.state.is_after_equal()
&& self.nesting == 0 =>
{
// SAFETY: Safe because `c` has been matched against one of the possible escape command tokens
self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap())
}
c @ ('%' | '!' | '?' | '/' | ';' | ',')
if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
{
let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) {
self.cursor.bump();
kind
} else {
// SAFETY: Safe because `c` has been matched against one of the possible escape command tokens
IpyEscapeKind::try_from(c).unwrap()
};
self.lex_ipython_escape_command(kind)
}
'?' if self.mode == Mode::Ipython => Tok::Question,
'/' => {
if self.cursor.eat_char('=') {
Tok::SlashEqual
} else if self.cursor.eat_char('/') {
if self.cursor.eat_char('=') {
Tok::DoubleSlashEqual
} else {
Tok::DoubleSlash
}
} else {
Tok::Slash
}
}
'%' => {
if self.cursor.eat_char('=') {
Tok::PercentEqual
} else {
Tok::Percent
}
}
'|' => {
if self.cursor.eat_char('=') {
Tok::VbarEqual
} else {
Tok::Vbar
}
}
'^' => {
if self.cursor.eat_char('=') {
Tok::CircumflexEqual
} else {
Tok::CircumFlex
}
}
'&' => {
if self.cursor.eat_char('=') {
Tok::AmperEqual
} else {
Tok::Amper
}
}
'-' => {
if self.cursor.eat_char('=') {
Tok::MinusEqual
} else if self.cursor.eat_char('>') {
Tok::Rarrow
} else {
Tok::Minus
}
}
'@' => {
if self.cursor.eat_char('=') {
Tok::AtEqual
} else {
Tok::At
}
}
'!' => {
if self.cursor.eat_char('=') {
Tok::NotEqual
} else {
Tok::Exclamation
}
}
'~' => Tok::Tilde,
'(' => {
self.nesting += 1;
Tok::Lpar
}
')' => {
self.nesting = self.nesting.saturating_sub(1);
Tok::Rpar
}
'[' => {
self.nesting += 1;
Tok::Lsqb
}
']' => {
self.nesting = self.nesting.saturating_sub(1);
Tok::Rsqb
}
'{' => {
self.nesting += 1;
Tok::Lbrace
}
'}' => {
if let Some(fstring) = self.fstrings.current_mut() {
if fstring.nesting() == self.nesting {
return Err(LexicalError {
error: LexicalErrorType::FStringError(FStringErrorType::SingleRbrace),
location: self.token_start(),
});
}
fstring.try_end_format_spec(self.nesting);
}
self.nesting = self.nesting.saturating_sub(1);
Tok::Rbrace
}
':' => {
if self
.fstrings
.current_mut()
.is_some_and(|fstring| fstring.try_start_format_spec(self.nesting))
{
Tok::Colon
} else if self.cursor.eat_char('=') {
Tok::ColonEqual
} else {
Tok::Colon
}
}
';' => Tok::Semi,
'<' => {
if self.cursor.eat_char('<') {
if self.cursor.eat_char('=') {
Tok::LeftShiftEqual
} else {
Tok::LeftShift
}
} else if self.cursor.eat_char('=') {
Tok::LessEqual
} else {
Tok::Less
}
}
'>' => {
if self.cursor.eat_char('>') {
if self.cursor.eat_char('=') {
Tok::RightShiftEqual
} else {
Tok::RightShift
}
} else if self.cursor.eat_char('=') {
Tok::GreaterEqual
} else {
Tok::Greater
}
}
',' => Tok::Comma,
'.' => {
if self.cursor.first().is_ascii_digit() {
self.lex_decimal_number('.')?
} else if self.cursor.eat_char2('.', '.') {
Tok::Ellipsis
} else {
Tok::Dot
}
}
'\n' => {
return Ok((
if self.nesting == 0 && !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
Tok::Newline
} else {
if let Some(fstring) = self.fstrings.current_mut() {
fstring.try_end_format_spec(self.nesting);
}
Tok::NonLogicalNewline
},
self.token_range(),
))
}
'\r' => {
self.cursor.eat_char('\n');
return Ok((
if self.nesting == 0 && !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
Tok::Newline
} else {
if let Some(fstring) = self.fstrings.current_mut() {
fstring.try_end_format_spec(self.nesting);
}
Tok::NonLogicalNewline
},
self.token_range(),
));
}
_ => {
self.state = State::Other;
return Err(LexicalError {
error: LexicalErrorType::UnrecognizedToken { tok: c },
location: self.token_start(),
});
}
};
self.state = State::Other;
Ok((token, self.token_range()))
}
#[inline]
fn token_range(&self) -> TextRange {
let end = self.offset();
let len = self.cursor.token_len();
TextRange::at(end - len, len)
}
#[inline]
fn token_text(&self) -> &'source str {
&self.source[self.token_range()]
}
// Lexer doesn't allow files larger than 4GB
#[allow(clippy::cast_possible_truncation)]
#[inline]
fn offset(&self) -> TextSize {
TextSize::new(self.source.len() as u32) - self.cursor.text_len()
}
#[inline]
fn token_start(&self) -> TextSize {
self.token_range().start()
}
}
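// Sketch (illustrative only; `_demo_next_token` is a hypothetical helper):
// driving the lexer manually via `next_token` instead of the iterator. Unlike
// the iterator below, `Tok::EndOfFile` is eventually returned explicitly here.
#[allow(dead_code)]
fn _demo_next_token() {
    let mut lexer = Lexer::new("x", Mode::Module);
    let (tok, range) = lexer.next_token().expect("Failed to lex");
    assert!(matches!(tok, Tok::Name { .. }));
    assert_eq!(range, TextRange::new(TextSize::new(0), TextSize::new(1)));
}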
// Implement the iterator pattern for `Lexer`. Calling `next` on the iterator
// will yield the next lexical token.
impl Iterator for Lexer<'_> {
type Item = LexResult;
fn next(&mut self) -> Option<Self::Item> {
let token = self.next_token();
match token {
Ok((Tok::EndOfFile, _)) => None,
r => Some(r),
}
}
}
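// Sketch (assumed behavior, for illustration): because `next` maps
// `Tok::EndOfFile` to `None`, collecting the lexer yields the real tokens plus
// the synthesized trailing `Newline`, but never an explicit end-of-file token.
#[allow(dead_code)]
fn _demo_iteration_stops_at_eof() {
    // `pass` lexes to `Tok::Pass` followed by the trailing `Newline` that
    // `consume_end` synthesizes; `Tok::EndOfFile` itself is swallowed.
    assert_eq!(lex("pass", Mode::Module).count(), 2);
}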
impl FusedIterator for Lexer<'_> {}
/// Represents an error that occurs during lexing and is
/// returned by the `parse_*` functions in the iterator in the
/// [lexer] implementation.
///
/// [lexer]: crate::lexer
#[derive(Debug, Clone, PartialEq)]
pub struct LexicalError {
/// The type of error that occurred.
pub error: LexicalErrorType,
/// The location of the error.
pub location: TextSize,
}
impl LexicalError {
/// Creates a new `LexicalError` with the given error type and location.
pub fn new(error: LexicalErrorType, location: TextSize) -> Self {
Self { error, location }
}
}
impl std::ops::Deref for LexicalError {
type Target = LexicalErrorType;
fn deref(&self) -> &Self::Target {
&self.error
}
}
impl std::error::Error for LexicalError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
Some(&self.error)
}
}
impl std::fmt::Display for LexicalError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(
f,
"{} at byte offset {}",
&self.error,
u32::from(self.location)
)
}
}
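// A small sketch (illustrative, not original code) of the `Display` format
// produced by the implementation above; `_demo_lexical_error_display` is a
// hypothetical helper.
#[allow(dead_code)]
fn _demo_lexical_error_display() {
    let err = LexicalError::new(LexicalErrorType::Eof, TextSize::new(5));
    assert_eq!(err.to_string(), "unexpected EOF while parsing at byte offset 5");
}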
/// Represents the different types of errors that can occur during lexing.
#[derive(Debug, Clone, PartialEq)]
pub enum LexicalErrorType {
// TODO: Can probably be removed; the places where it is used seem to be
// able to use the `UnicodeError` variant instead.
#[doc(hidden)]
StringError,
// TODO: Should take a start/end position to report.
/// Decoding of a unicode escape sequence in a string literal failed.
UnicodeError,
/// The nesting of brackets/braces/parentheses is not balanced.
NestingError,
/// The indentation is not consistent.
IndentationError,
/// Inconsistent use of tabs and spaces.
TabError,
/// Encountered a tab after a space.
TabsAfterSpaces,
/// A non-default argument follows a default argument.
DefaultArgumentError,
/// A duplicate argument was found in a function definition.
DuplicateArgumentError(String),
/// A positional argument follows a keyword argument.
PositionalArgumentError,
/// An iterable argument unpacking `*args` follows keyword argument unpacking `**kwargs`.
UnpackedArgumentError,
/// A keyword argument was repeated.
DuplicateKeywordArgumentError(String),
/// An unrecognized token was encountered.
UnrecognizedToken { tok: char },
/// An f-string error containing the [`FStringErrorType`].
FStringError(FStringErrorType),
/// An unexpected character was encountered after a line continuation.
LineContinuationError,
/// An unexpected end of file was encountered.
Eof,
/// Occurs when a syntactically invalid assignment was encountered.
AssignmentError,
/// An unexpected error occurred.
OtherError(String),
}
impl std::error::Error for LexicalErrorType {}
impl std::fmt::Display for LexicalErrorType {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
LexicalErrorType::StringError => write!(f, "Got unexpected string"),
LexicalErrorType::FStringError(error) => write!(f, "f-string: {error}"),
LexicalErrorType::UnicodeError => write!(f, "Got unexpected unicode"),
LexicalErrorType::NestingError => write!(f, "Got unexpected nesting"),
LexicalErrorType::IndentationError => {
write!(f, "unindent does not match any outer indentation level")
}
LexicalErrorType::TabError => {
write!(f, "inconsistent use of tabs and spaces in indentation")
}
LexicalErrorType::TabsAfterSpaces => {
write!(f, "Tabs not allowed as part of indentation after spaces")
}
LexicalErrorType::DefaultArgumentError => {
write!(f, "non-default argument follows default argument")
}
LexicalErrorType::DuplicateArgumentError(arg_name) => {
write!(f, "duplicate argument '{arg_name}' in function definition")
}
LexicalErrorType::DuplicateKeywordArgumentError(arg_name) => {
write!(f, "keyword argument repeated: {arg_name}")
}
LexicalErrorType::PositionalArgumentError => {
write!(f, "positional argument follows keyword argument")
}
LexicalErrorType::UnpackedArgumentError => {
write!(
f,
"iterable argument unpacking follows keyword argument unpacking"
)
}
LexicalErrorType::UnrecognizedToken { tok } => {
write!(f, "Got unexpected token {tok}")
}
LexicalErrorType::LineContinuationError => {
write!(f, "unexpected character after line continuation character")
}
LexicalErrorType::Eof => write!(f, "unexpected EOF while parsing"),
LexicalErrorType::AssignmentError => write!(f, "invalid assignment target"),
LexicalErrorType::OtherError(msg) => write!(f, "{msg}"),
}
}
}
#[derive(Copy, Clone, Debug)]
enum State {
/// Lexer is right at the beginning of the file or after a `Newline` token.
AfterNewline,
/// The lexer is at the start of a new logical line but **after** the indentation.
NonEmptyLogicalLine,
/// Lexer is right after an equal token.
AfterEqual,
/// Inside of a logical line.
Other,
}
impl State {
const fn is_after_newline(self) -> bool {
matches!(self, State::AfterNewline)
}
const fn is_new_logical_line(self) -> bool {
matches!(self, State::AfterNewline | State::NonEmptyLogicalLine)
}
const fn is_after_equal(self) -> bool {
matches!(self, State::AfterEqual)
}
}
#[derive(Copy, Clone, Debug)]
enum Radix {
Binary,
Octal,
Decimal,
Hex,
}
impl Radix {
const fn as_u32(self) -> u32 {
match self {
Radix::Binary => 2,
Radix::Octal => 8,
Radix::Decimal => 10,
Radix::Hex => 16,
}
}
const fn is_digit(self, c: char) -> bool {
match self {
Radix::Binary => matches!(c, '0'..='1'),
Radix::Octal => matches!(c, '0'..='7'),
Radix::Decimal => c.is_ascii_digit(),
Radix::Hex => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
}
}
}
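// Sketch (illustrative only) of the digit classification above;
// `_demo_radix_digits` is a hypothetical helper.
#[allow(dead_code)]
fn _demo_radix_digits() {
    assert!(Radix::Hex.is_digit('F'));
    assert!(Radix::Binary.is_digit('1'));
    assert!(!Radix::Octal.is_digit('8'));
    assert_eq!(Radix::Octal.as_u32(), 8);
}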
const fn is_quote(c: char) -> bool {
matches!(c, '\'' | '"')
}
const fn is_ascii_identifier_start(c: char) -> bool {
matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
// Checks if the character c is a valid starting character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_unicode_identifier_start(c: char) -> bool {
is_xid_start(c)
}
// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
match c {
'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => true,
c => is_xid_continue(c),
}
}
/// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
/// characters.
///
/// This is the same as `ruff_python_trivia::is_python_whitespace` and is copied
/// here to avoid a circular dependency as `ruff_python_trivia` has a dev-dependency
/// on `ruff_python_lexer`.
const fn is_python_whitespace(c: char) -> bool {
matches!(
c,
// Space, tab, or form-feed
' ' | '\t' | '\x0C'
)
}
enum LexedText<'a> {
Source { source: &'a str, range: TextRange },
Owned(String),
}
impl<'a> LexedText<'a> {
fn new(start: TextSize, source: &'a str) -> Self {
Self::Source {
range: TextRange::empty(start),
source,
}
}
fn push(&mut self, c: char) {
match self {
LexedText::Source { range, source } => {
*range = range.add_end(c.text_len());
debug_assert!(source[*range].ends_with(c));
}
LexedText::Owned(owned) => owned.push(c),
}
}
fn as_str<'b>(&'b self) -> &'b str
where
'b: 'a,
{
match self {
LexedText::Source { range, source } => &source[*range],
LexedText::Owned(owned) => owned,
}
}
fn skip_char(&mut self) {
match self {
LexedText::Source { range, source } => {
*self = LexedText::Owned(source[*range].to_string());
}
LexedText::Owned(_) => {}
}
}
}
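// Sketch (illustrative only; `_demo_lexed_text` is a hypothetical helper):
// `LexedText` stays a zero-copy slice over the source until a character must
// be skipped (an `_` separator), at which point `skip_char` switches it to an
// owned buffer.
#[allow(dead_code)]
fn _demo_lexed_text() {
    let source = "12_3";
    let mut text = LexedText::new(TextSize::new(0), source);
    text.push('1');
    text.push('2');
    text.skip_char(); // becomes `Owned("12")`, dropping the `_`
    text.push('3');
    assert_eq!(text.as_str(), "123");
}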
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use super::*;
const WINDOWS_EOL: &str = "\r\n";
const MAC_EOL: &str = "\r";
const UNIX_EOL: &str = "\n";
fn lex_source_with_mode(source: &str, mode: Mode) -> Vec<Spanned> {
let lexer = lex(source, mode);
lexer.map(std::result::Result::unwrap).collect()
}
fn lex_source(source: &str) -> Vec<Spanned> {
lex_source_with_mode(source, Mode::Module)
}
fn lex_jupyter_source(source: &str) -> Vec<Spanned> {
lex_source_with_mode(source, Mode::Ipython)
}
fn ipython_escape_command_line_continuation_eol(eol: &str) -> Vec<Spanned> {
let source = format!("%matplotlib \\{eol} --inline");
lex_jupyter_source(&source)
}
#[test]
fn test_ipython_escape_command_line_continuation_unix_eol() {
assert_debug_snapshot!(ipython_escape_command_line_continuation_eol(UNIX_EOL));
}
#[test]
fn test_ipython_escape_command_line_continuation_mac_eol() {
assert_debug_snapshot!(ipython_escape_command_line_continuation_eol(MAC_EOL));
}
#[test]
fn test_ipython_escape_command_line_continuation_windows_eol() {
assert_debug_snapshot!(ipython_escape_command_line_continuation_eol(WINDOWS_EOL));
}
fn ipython_escape_command_line_continuation_with_eol_and_eof(eol: &str) -> Vec<Spanned> {
let source = format!("%matplotlib \\{eol}");
lex_jupyter_source(&source)
}
#[test]
fn test_ipython_escape_command_line_continuation_with_unix_eol_and_eof() {
assert_debug_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
UNIX_EOL
));
}
#[test]
fn test_ipython_escape_command_line_continuation_with_mac_eol_and_eof() {
assert_debug_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
MAC_EOL
));
}
#[test]
fn test_ipython_escape_command_line_continuation_with_windows_eol_and_eof() {
assert_debug_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
WINDOWS_EOL
));
}
#[test]
fn test_empty_ipython_escape_command() {
let source = "%\n%%\n!\n!!\n?\n??\n/\n,\n;";
assert_debug_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_escape_command() {
let source = r"
?foo
??foo
%timeit a = b
%timeit a % 3
%matplotlib \
--inline
!pwd \
&& ls -a | sed 's/^/\\ /'
!!cd /Users/foo/Library/Application\ Support/
/foo 1 2
,foo 1 2
;foo 1 2
!ls
"
.trim();
assert_debug_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_help_end_escape_command() {
let source = r"
?foo?
?? foo?
?? foo ?
?foo??
??foo??
???foo?
???foo??
??foo???
???foo???
?? \
foo?
?? \
?
????
%foo?
%foo??
%%foo???
!pwd?"
.trim();
assert_debug_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_escape_command_indentation() {
let source = r"
if True:
%matplotlib \
--inline"
.trim();
assert_debug_snapshot!(lex_jupyter_source(source));
}
#[test]
fn test_ipython_escape_command_assignment() {
let source = r"
pwd = !pwd
foo = %timeit a = b
bar = %timeit a % 3
baz = %matplotlib \
inline"
.trim();
assert_debug_snapshot!(lex_jupyter_source(source));
}
fn assert_no_ipython_escape_command(tokens: &[Spanned]) {
for (tok, _) in tokens {
if let Tok::IpyEscapeCommand { .. } = tok {
panic!("Unexpected escape command token: {tok:?}")
}
}
}
#[test]
fn test_ipython_escape_command_not_an_assignment() {
let source = r"
# Other escape kinds are not valid here (can't test `foo = ?str` because '?' is not a valid token)
foo = /func
foo = ;func
foo = ,func
(foo == %timeit a = b)
(foo := %timeit a = b)
def f(arg=%timeit a = b):
pass"
.trim();
let tokens = lex_jupyter_source(source);
assert_no_ipython_escape_command(&tokens);
}
#[test]
fn test_numbers() {
let source =
"0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j 000 0x995DC9BBDF1939FA";
assert_debug_snapshot!(lex_source(source));
}
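// An illustrative addition (not in the original suite): underscore separators
// are stripped by `radix_run`, so `1_2_3_4` lexes to a single `Int` token.
#[test]
fn test_number_underscore_separators() {
    let tokens = lex_source("1_2_3_4");
    assert!(matches!(tokens[0].0, Tok::Int { .. }));
    assert_eq!(tokens.len(), 2); // the integer plus the trailing `Newline`
}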
#[test]
fn test_invalid_leading_zero_small() {
let source = "025";
let lexer = lex(source, Mode::Module);
let tokens = lexer.collect::<Result<Vec<_>, LexicalError>>();
assert_debug_snapshot!(tokens);
}
#[test]
fn test_invalid_leading_zero_big() {
let source =
"0252222222222222522222222222225222222222222252222222222222522222222222225222222222222";
let lexer = lex(source, Mode::Module);
let tokens = lexer.collect::<Result<Vec<_>, LexicalError>>();
assert_debug_snapshot!(tokens);
}
#[test]
fn test_line_comment_long() {
let source = "99232 # foo".to_string();
assert_debug_snapshot!(lex_source(&source));
}
#[test]
fn test_line_comment_whitespace() {
let source = "99232 # ".to_string();
assert_debug_snapshot!(lex_source(&source));
}
#[test]
fn test_line_comment_single_whitespace() {
let source = "99232 # ".to_string();
assert_debug_snapshot!(lex_source(&source));
}
#[test]
fn test_line_comment_empty() {
let source = "99232 #".to_string();
assert_debug_snapshot!(lex_source(&source));
}
fn comment_until_eol(eol: &str) -> Vec<Spanned> {
let source = format!("123 # Foo{eol}456");
lex_source(&source)
}
#[test]
fn test_comment_until_unix_eol() {
assert_debug_snapshot!(comment_until_eol(UNIX_EOL));
}
#[test]
fn test_comment_until_mac_eol() {
assert_debug_snapshot!(comment_until_eol(MAC_EOL));
}
#[test]
fn test_comment_until_windows_eol() {
assert_debug_snapshot!(comment_until_eol(WINDOWS_EOL));
}
#[test]
fn test_assignment() {
let source = r"a_variable = 99 + 2-0";
assert_debug_snapshot!(lex_source(source));
}
fn indentation_with_eol(eol: &str) -> Vec<Spanned> {
let source = format!("def foo():{eol} return 99{eol}{eol}");
lex_source(&source)
}
#[test]
fn test_indentation_with_unix_eol() {
assert_debug_snapshot!(indentation_with_eol(UNIX_EOL));
}
#[test]
fn test_indentation_with_mac_eol() {
assert_debug_snapshot!(indentation_with_eol(MAC_EOL));
}
#[test]
fn test_indentation_with_windows_eol() {
assert_debug_snapshot!(indentation_with_eol(WINDOWS_EOL));
}
fn double_dedent_with_eol(eol: &str) -> Vec<Spanned> {
let source = format!("def foo():{eol} if x:{eol}{eol} return 99{eol}{eol}");
lex_source(&source)
}
#[test]
fn test_double_dedent_with_unix_eol() {
assert_debug_snapshot!(double_dedent_with_eol(UNIX_EOL));
}
#[test]
fn test_double_dedent_with_mac_eol() {
assert_debug_snapshot!(double_dedent_with_eol(MAC_EOL));
}
#[test]
fn test_double_dedent_with_windows_eol() {
assert_debug_snapshot!(double_dedent_with_eol(WINDOWS_EOL));
}
fn double_dedent_with_tabs_eol(eol: &str) -> Vec<Spanned> {
let source = format!("def foo():{eol}\tif x:{eol}{eol}\t\t return 99{eol}{eol}");
lex_source(&source)
}
#[test]
fn test_double_dedent_with_tabs_unix_eol() {
assert_debug_snapshot!(double_dedent_with_tabs_eol(UNIX_EOL));
}
#[test]
fn test_double_dedent_with_tabs_mac_eol() {
assert_debug_snapshot!(double_dedent_with_tabs_eol(MAC_EOL));
}
#[test]
fn test_double_dedent_with_tabs_windows_eol() {
assert_debug_snapshot!(double_dedent_with_tabs_eol(WINDOWS_EOL));
}
fn newline_in_brackets_eol(eol: &str) -> Vec<Spanned> {
let source = r"x = [
1,2
,(3,
4,
), {
5,
6,\
7}]
"
.replace('\n', eol);
lex_source(&source)
}
#[test]
fn test_newline_in_brackets_unix_eol() {
assert_debug_snapshot!(newline_in_brackets_eol(UNIX_EOL));
}
#[test]
fn test_newline_in_brackets_mac_eol() {
assert_debug_snapshot!(newline_in_brackets_eol(MAC_EOL));
}
#[test]
fn test_newline_in_brackets_windows_eol() {
assert_debug_snapshot!(newline_in_brackets_eol(WINDOWS_EOL));
}
#[test]
fn test_non_logical_newline_in_string_continuation() {
let source = r"(
'a'
'b'
'c' \
'd'
)";
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_logical_newline_line_comment() {
let source = "#Hello\n#World\n";
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_operators() {
let source = "//////=/ /";
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_string() {
let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#;
assert_debug_snapshot!(lex_source(source));
}
fn string_continuation_with_eol(eol: &str) -> Vec<Spanned> {
let source = format!("\"abc\\{eol}def\"");
lex_source(&source)
}
#[test]
fn test_string_continuation_with_unix_eol() {
assert_debug_snapshot!(string_continuation_with_eol(UNIX_EOL));
}
#[test]
fn test_string_continuation_with_mac_eol() {
assert_debug_snapshot!(string_continuation_with_eol(MAC_EOL));
}
#[test]
fn test_string_continuation_with_windows_eol() {
assert_debug_snapshot!(string_continuation_with_eol(WINDOWS_EOL));
}
#[test]
fn test_escape_unicode_name() {
let source = r#""\N{EN SPACE}""#;
assert_debug_snapshot!(lex_source(source));
}
fn triple_quoted_eol(eol: &str) -> Vec<Spanned> {
let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
lex_source(&source)
}
#[test]
fn test_triple_quoted_unix_eol() {
assert_debug_snapshot!(triple_quoted_eol(UNIX_EOL));
}
#[test]
fn test_triple_quoted_mac_eol() {
assert_debug_snapshot!(triple_quoted_eol(MAC_EOL));
}
#[test]
fn test_triple_quoted_windows_eol() {
assert_debug_snapshot!(triple_quoted_eol(WINDOWS_EOL));
}
// This test case just makes sure that the lexer doesn't go into an
// infinite loop on invalid input.
#[test]
fn test_infinite_loop() {
let source = "[1";
let _ = lex(source, Mode::Module).collect::<Vec<_>>();
}
/// Emoji identifiers are a non-standard Python feature and are not supported by our lexer.
#[test]
fn test_emoji_identifier() {
let source = "🐦";
let lexed: Vec<_> = lex(source, Mode::Module).collect();
match lexed.as_slice() {
[Err(error)] => {
assert_eq!(
error.error,
LexicalErrorType::UnrecognizedToken { tok: '🐦' }
);
}
result => panic!("Expected an error token but found {result:?}"),
}
}
#[test]
fn test_too_low_dedent() {
let tokens: Vec<_> = lex(
"if True:
pass
pass",
Mode::Module,
)
.collect();
assert_debug_snapshot!(tokens);
}
#[test]
fn test_empty_fstrings() {
let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_prefix() {
let source = r#"f"" F"" rf"" rF"" Rf"" RF"" fr"" Fr"" fR"" FR"""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring() {
let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#;
assert_debug_snapshot!(lex_source(source));
}
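// An illustrative addition (not in the original suite): `{{` / `}}` are
// decoded by `lex_fstring_middle_or_end`, so the `FStringMiddle` value for
// `f"{{x}}"` is `{x}`, not `{{x}}`.
#[test]
fn test_fstring_escaped_braces_value() {
    let tokens = lex_source(r#"f"{{x}}""#);
    assert!(tokens
        .iter()
        .any(|(tok, _)| matches!(tok, Tok::FStringMiddle { value, .. } if value == "{x}")));
}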
#[test]
fn test_fstring_parentheses() {
let source = r#"f"{}" f"{{}}" f" {}" f"{{{}}}" f"{{{{}}}}" f" {} {{}} {{{}}} {{{{}}}} ""#;
assert_debug_snapshot!(lex_source(source));
}
fn fstring_single_quote_escape_eol(eol: &str) -> Vec<Spanned> {
let source = format!(r"f'text \{eol} more text'");
lex_source(&source)
}
#[test]
fn test_fstring_single_quote_escape_unix_eol() {
assert_debug_snapshot!(fstring_single_quote_escape_eol(UNIX_EOL));
}
#[test]
fn test_fstring_single_quote_escape_mac_eol() {
assert_debug_snapshot!(fstring_single_quote_escape_eol(MAC_EOL));
}
#[test]
fn test_fstring_single_quote_escape_windows_eol() {
assert_debug_snapshot!(fstring_single_quote_escape_eol(WINDOWS_EOL));
}
#[test]
fn test_fstring_escape() {
let source = r#"f"\{x:\"\{x}} \"\"\
end""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_escape_braces() {
let source = r"f'\{foo}' f'\\{foo}' f'\{{foo}}' f'\\{{foo}}'";
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_escape_raw() {
let source = r#"rf"\{x:\"\{x}} \"\"\
end""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_named_unicode() {
let source = r#"f"\N{BULLET} normal \Nope \N""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_named_unicode_raw() {
let source = r#"rf"\N{BULLET} normal""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_named_expression() {
let source = r#"f"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_format_spec() {
let source = r#"f"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_multiline_format_spec() {
// The last f-string is invalid syntactically but we should still lex it.
// Note that the `b` is a `Name` token and not a `FStringMiddle` token.
let source = r"f'''__{
x:d
}__'''
f'''__{
x:a
b
c
}__'''
f'__{
x:d
}__'
f'__{
x:a
b
}__'
";
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_conversion() {
let source = r#"f"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_nested() {
let source = r#"f"foo {f"bar {x + f"{wow}"}"} baz" f'foo {f'bar'} some {f"another"}'"#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_expression_multiline() {
let source = r#"f"first {
x
*
y
} second""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_multiline() {
let source = r#"f"""
hello
world
""" f'''
world
hello
''' f"some {f"""multiline
allowed {x}"""} string""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_comments() {
let source = r#"f"""
# not a comment { # comment {
x
} # not a comment
""""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_ipy_escape_command() {
let source = r#"f"foo {!pwd} bar""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_lambda_expression() {
let source = r#"
f"{lambda x:{x}}"
f"{(lambda x:{x})}"
"#
.trim();
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_nul_char() {
let source = r"f'\0'";
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_match_softkeyword_in_notebook() {
let source = r"match foo:
case bar:
pass";
assert_debug_snapshot!(lex_jupyter_source(source));
}
fn lex_error(source: &str) -> LexicalError {
match lex(source, Mode::Module).find_map(Result::err) {
Some(err) => err,
_ => panic!("Expected at least one error"),
}
}
fn lex_fstring_error(source: &str) -> FStringErrorType {
match lex_error(source).error {
LexicalErrorType::FStringError(error) => error,
err => panic!("Expected FStringError: {err:?}"),
}
}
#[test]
fn test_fstring_error() {
use FStringErrorType::{
SingleRbrace, UnclosedLbrace, UnterminatedString, UnterminatedTripleQuotedString,
};
assert_eq!(lex_fstring_error("f'}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}}}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'foo}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r"f'\u007b}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{a:b}}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{3:}}>10}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r"f'\{foo}\}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{'"), UnclosedLbrace);
assert_eq!(lex_fstring_error("f'{foo!r'"), UnclosedLbrace);
assert_eq!(lex_fstring_error("f'{foo='"), UnclosedLbrace);
assert_eq!(
lex_fstring_error(
r#"f"{"
"#
),
UnclosedLbrace
);
assert_eq!(lex_fstring_error(r#"f"""{""""#), UnclosedLbrace);
assert_eq!(lex_fstring_error(r#"f""#), UnterminatedString);
assert_eq!(lex_fstring_error(r"f'"), UnterminatedString);
assert_eq!(lex_fstring_error(r#"f""""#), UnterminatedTripleQuotedString);
assert_eq!(lex_fstring_error(r"f'''"), UnterminatedTripleQuotedString);
assert_eq!(
lex_fstring_error(r#"f"""""#),
UnterminatedTripleQuotedString
);
assert_eq!(
lex_fstring_error(r#"f""""""#),
UnterminatedTripleQuotedString
);
}
#[test]
fn test_fstring_error_location() {
assert_debug_snapshot!(lex_error("f'{'"), @r###"
LexicalError {
error: FStringError(
UnclosedLbrace,
),
location: 4,
}
"###);
assert_debug_snapshot!(lex_error("f'{'α"), @r###"
LexicalError {
error: FStringError(
UnclosedLbrace,
),
location: 6,
}
"###);
}
}