//! This module takes care of lexing Python source text.
//!
//! This means source code is scanned and translated into separate tokens. The rules
//! governing what is and is not a valid token are defined in the Python reference
//! guide section on [Lexical analysis].
//!
//! The primary function in this module is [`lex`], which takes a string slice
//! and returns an iterator over the tokens in the source code. The tokens are currently returned
//! as a `Result<Spanned, LexicalError>`, where [`Spanned`] is a tuple containing the
//! start and end [`Location`] and a [`Tok`] denoting the token.
//!
//! # Example
//!
//! ```
//! use rustpython_parser::{lexer::lex, Tok, Mode, StringKind};
//!
//! let source = "x = 'RustPython'";
//! let tokens = lex(source, Mode::Module)
//!     .map(|tok| tok.expect("Failed to lex"))
//!     .collect::<Vec<_>>();
//!
//! for (start, token, end) in tokens {
//!     println!(
//!         "{0},{1}-{2},{3:<5} {token:?}",
//!         start.row(),
//!         start.column(),
//!         end.row(),
//!         end.column(),
//!     );
//! }
//! ```
//!
//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
use crate::{
    ast::Location,
    mode::Mode,
    soft_keywords::SoftKeywordTransformer,
    string::FStringErrorType,
    token::{StringKind, Tok},
};
use log::trace;
use num_bigint::BigInt;
use num_traits::{Num, Zero};
use std::{char, cmp::Ordering, ops::Index, slice::SliceIndex, str::FromStr};
use unic_emoji_char::is_emoji_presentation;
use unic_ucd_ident::{is_xid_continue, is_xid_start};

// Indentations are tracked by a stack of indentation levels. IndentationLevel keeps
// track of the number of tabs and spaces at the current level.
#[derive(Clone, Copy, PartialEq, Debug, Default)]
struct IndentationLevel {
    tabs: u32,
    spaces: u32,
}

impl IndentationLevel {
    fn compare_strict(
        &self,
        other: &IndentationLevel,
        location: Location,
    ) -> Result<Ordering, LexicalError> {
        // We only know for sure that we're smaller or bigger if tabs
        // and spaces both differ in the same direction. Otherwise we're
        // dependent on the size of tabs.
        match self.tabs.cmp(&other.tabs) {
            Ordering::Less => {
                if self.spaces <= other.spaces {
                    Ok(Ordering::Less)
                } else {
                    Err(LexicalError {
                        location,
                        error: LexicalErrorType::TabError,
                    })
                }
            }
            Ordering::Greater => {
                if self.spaces >= other.spaces {
                    Ok(Ordering::Greater)
                } else {
                    Err(LexicalError {
                        location,
                        error: LexicalErrorType::TabError,
                    })
                }
            }
            Ordering::Equal => Ok(self.spaces.cmp(&other.spaces)),
        }
    }
}

// The indentations stack is used to keep track of the current indentation level.
// Similar to the CPython implementation, the Indentations stack always has at
// least one level which is never popped. See Reference 2.1.8.
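// For example, lexing `if x:\n    pass` pushes (tabs: 0, spaces: 4) on top of
// the base level when the indented line starts, and pops it again at the
// matching dedent.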
#[derive(Debug)]
struct Indentations {
    indent_stack: Vec<IndentationLevel>,
}

impl Indentations {
    fn is_empty(&self) -> bool {
        self.indent_stack.len() == 1
    }

    fn push(&mut self, indent: IndentationLevel) {
        self.indent_stack.push(indent);
    }

    fn pop(&mut self) -> Option<IndentationLevel> {
        if self.is_empty() {
            return None;
        }
        self.indent_stack.pop()
    }

    fn current(&self) -> &IndentationLevel {
        self.indent_stack
            .last()
            .expect("Indentations must have at least one level")
    }
}

impl Default for Indentations {
    fn default() -> Self {
        Self {
            indent_stack: vec![IndentationLevel::default()],
        }
    }
}

// A CharWindow is a sliding window over an iterator of chars. It is used to
// allow for look-ahead when scanning tokens from the source code.
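// For example, a fresh `CharWindow<_, 3>` over "abc" holds [None, None, None];
// after three `slide` calls it holds [Some('a'), Some('b'), Some('c')], and a
// fourth call shifts it to [Some('b'), Some('c'), None].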
struct CharWindow<T: Iterator<Item = char>, const N: usize> {
    source: T,
    window: [Option<char>; N],
}

impl<T, const N: usize> CharWindow<T, N>
where
    T: Iterator<Item = char>,
{
    fn new(source: T) -> Self {
        Self {
            source,
            window: [None; N],
        }
    }

    fn slide(&mut self) -> Option<char> {
        self.window.rotate_left(1);
        let next = self.source.next();
        *self.window.last_mut().expect("never empty") = next;
        next
    }
}

impl<T, const N: usize, Idx> Index<Idx> for CharWindow<T, N>
where
    T: Iterator<Item = char>,
    Idx: SliceIndex<[Option<char>]>,
{
    type Output = Idx::Output;

    fn index(&self, index: Idx) -> &Self::Output {
        &self.window[index]
    }
}

/// A lexer for Python source code.
pub struct Lexer<T: Iterator<Item = char>> {
    // Contains the source code to be lexed.
    window: CharWindow<T, 3>,
    // Are we at the beginning of a line?
    at_begin_of_line: bool,
    // Number of currently open brackets, braces, and parentheses.
    nesting: usize,
    // Indentation levels.
    indentations: Indentations,
    // Pending list of tokens to be returned.
    pending: Vec<Spanned>,
    // The current location.
    location: Location,
}

// Generated in build.rs, in gen_phf().
/// A map of keywords to their tokens.
pub static KEYWORDS: phf::Map<&'static str, Tok> =
    include!(concat!(env!("OUT_DIR"), "/keywords.rs"));

/// Contains a Token along with its start and end location.
pub type Spanned = (Location, Tok, Location);
/// The result of lexing a token.
pub type LexResult = Result<Spanned, LexicalError>;

/// Create a new lexer from a source string.
///
/// # Examples
///
/// ```
/// use rustpython_parser::{Mode, lexer::lex};
///
/// let source = "def hello(): return 'world'";
/// let lexer = lex(source, Mode::Module);
///
/// for token in lexer {
///     println!("{:?}", token);
/// }
/// ```
#[inline]
pub fn lex(source: &str, mode: Mode) -> impl Iterator<Item = LexResult> + '_ {
    lex_located(source, mode, Location::default())
}

/// Create a new lexer from a source string, starting at a given location.
/// You probably want to use [`lex`] instead.
pub fn lex_located(
    source: &str,
    mode: Mode,
    start_location: Location,
) -> impl Iterator<Item = LexResult> + '_ {
    SoftKeywordTransformer::new(Lexer::new(source.chars(), start_location), mode)
}
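
// Illustrative usage sketch (not original documentation): `lex_located` is
// useful when the Python source is embedded at an offset inside a larger
// document, e.g. a doctest. Assuming a `Location` constructed for the
// embedding row (via the ast module's `Location::new(row, column)` helper),
// every span reported by the iterator is shifted by that starting position:
//
//     let tokens: Vec<_> = lex_located("x = 1", Mode::Module, start_location)
//         .map(|tok| tok.expect("Failed to lex"))
//         .collect();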

impl<T> Lexer<T>
where
    T: Iterator<Item = char>,
{
    /// Create a new lexer from T and a starting location. You probably want to use
    /// [`lex`] instead.
    pub fn new(input: T, start: Location) -> Self {
        let mut lxr = Lexer {
            at_begin_of_line: true,
            nesting: 0,
            indentations: Indentations::default(),
            // Usually we have fewer than 5 tokens pending.
            pending: Vec::with_capacity(5),
            location: start,
            window: CharWindow::new(input),
        };
        // Fill the window.
        lxr.window.slide();
        lxr.window.slide();
        lxr.window.slide();
        // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
        if let Some('\u{feff}') = lxr.window[0] {
            lxr.window.slide();
        }
        lxr
    }

    /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
    fn lex_identifier(&mut self) -> LexResult {
        // Detect potential string-like prefixes such as rb'' b'' f'' u'' r''.
        match self.window[..3] {
            [Some(c), Some('"' | '\''), ..] => {
                if let Ok(kind) = StringKind::try_from(c) {
                    return self.lex_string(kind);
                }
            }
            [Some(c1), Some(c2), Some('"' | '\'')] => {
                if let Ok(kind) = StringKind::try_from([c1, c2]) {
                    return self.lex_string(kind);
                }
            }
            _ => {}
        };

        let start_pos = self.get_pos();
        let mut name = String::with_capacity(8);
        while self.is_identifier_continuation() {
            name.push(self.next_char().unwrap());
        }
        let end_pos = self.get_pos();

        if let Some(tok) = KEYWORDS.get(&name) {
            Ok((start_pos, tok.clone(), end_pos))
        } else {
            Ok((start_pos, Tok::Name { name }, end_pos))
        }
    }

    /// Numeric lexing. The feast can start!
    fn lex_number(&mut self) -> LexResult {
        let start_pos = self.get_pos();
        match self.window[..2] {
            [Some('0'), Some('x' | 'X')] => {
                // Hex! (0xdeadbeef)
                self.next_char();
                self.next_char();
                self.lex_number_radix(start_pos, 16)
            }
            [Some('0'), Some('o' | 'O')] => {
                // Octal style! (0o377)
                self.next_char();
                self.next_char();
                self.lex_number_radix(start_pos, 8)
            }
            [Some('0'), Some('b' | 'B')] => {
                // Binary! (0b_1110_0101)
                self.next_char();
                self.next_char();
                self.lex_number_radix(start_pos, 2)
            }
            _ => self.lex_normal_number(),
        }
    }

    /// Lex a hex/octal/decimal/binary number without a decimal point.
    fn lex_number_radix(&mut self, start_pos: Location, radix: u32) -> LexResult {
        let value_text = self.radix_run(radix);
        let end_pos = self.get_pos();
        let value = BigInt::from_str_radix(&value_text, radix).map_err(|e| LexicalError {
            error: LexicalErrorType::OtherError(format!("{e:?}")),
            location: start_pos,
        })?;
        Ok((start_pos, Tok::Int { value }, end_pos))
    }

    /// Lex a normal number, that is, not a hex, octal, or binary number.
    fn lex_normal_number(&mut self) -> LexResult {
        let start_pos = self.get_pos();
        let start_is_zero = self.window[0] == Some('0');
        // Normal number:
        let mut value_text = self.radix_run(10);

        // If float:
        if self.window[0] == Some('.') || self.at_exponent() {
            // Take '.':
            if self.window[0] == Some('.') {
                if self.window[1] == Some('_') {
                    return Err(LexicalError {
                        error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
                        location: self.get_pos(),
                    });
                }
                value_text.push(self.next_char().unwrap());
                value_text.push_str(&self.radix_run(10));
            }

            // 1e6 for example:
            if let Some('e' | 'E') = self.window[0] {
                if self.window[1] == Some('_') {
                    return Err(LexicalError {
                        error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
                        location: self.get_pos(),
                    });
                }
                value_text.push(self.next_char().unwrap().to_ascii_lowercase());
                // Optional +/-
                if matches!(self.window[0], Some('-' | '+')) {
                    if self.window[1] == Some('_') {
                        return Err(LexicalError {
                            error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
                            location: self.get_pos(),
                        });
                    }
                    value_text.push(self.next_char().unwrap());
                }

                value_text.push_str(&self.radix_run(10));
            }

            let value = f64::from_str(&value_text).map_err(|_| LexicalError {
                error: LexicalErrorType::OtherError("Invalid decimal literal".to_owned()),
                location: self.get_pos(),
            })?;

            // Parse trailing 'j':
            if matches!(self.window[0], Some('j' | 'J')) {
                self.next_char();
                let end_pos = self.get_pos();
                Ok((
                    start_pos,
                    Tok::Complex {
                        real: 0.0,
                        imag: value,
                    },
                    end_pos,
                ))
            } else {
                let end_pos = self.get_pos();
                Ok((start_pos, Tok::Float { value }, end_pos))
            }
        } else {
            // Parse trailing 'j':
            if matches!(self.window[0], Some('j' | 'J')) {
                self.next_char();
                let end_pos = self.get_pos();
                let imag = f64::from_str(&value_text).unwrap();
                Ok((start_pos, Tok::Complex { real: 0.0, imag }, end_pos))
            } else {
                let end_pos = self.get_pos();
                let value = value_text.parse::<BigInt>().unwrap();
                if start_is_zero && !value.is_zero() {
                    // Leading zeros in decimal integer literals are not permitted.
                    return Err(LexicalError {
                        error: LexicalErrorType::OtherError("Invalid Token".to_owned()),
                        location: self.get_pos(),
                    });
                }
                Ok((start_pos, Tok::Int { value }, end_pos))
            }
        }
    }

    /// Consume a sequence of digits with the given radix. The digits can be
    /// decorated with underscores, like this: '1_2_3_4' == '1234'.
    fn radix_run(&mut self, radix: u32) -> String {
        let mut value_text = String::new();

        loop {
            if let Some(c) = self.take_number(radix) {
                value_text.push(c);
            } else if self.window[0] == Some('_')
                && Lexer::<T>::is_digit_of_radix(self.window[1], radix)
            {
                self.next_char();
            } else {
                break;
            }
        }
        value_text
    }

    /// Consume a single digit character of the given radix, if present.
    fn take_number(&mut self, radix: u32) -> Option<char> {
        let take_char = Lexer::<T>::is_digit_of_radix(self.window[0], radix);

        take_char.then(|| self.next_char().unwrap())
    }

    /// Test if a digit is of a certain radix.
    fn is_digit_of_radix(c: Option<char>, radix: u32) -> bool {
        match radix {
            2 => matches!(c, Some('0'..='1')),
            8 => matches!(c, Some('0'..='7')),
            10 => matches!(c, Some('0'..='9')),
            16 => matches!(c, Some('0'..='9') | Some('a'..='f') | Some('A'..='F')),
            other => unimplemented!("Radix not implemented: {}", other),
        }
    }

    /// Test if we face '[eE][-+]?[0-9]+'.
    fn at_exponent(&self) -> bool {
        match self.window[..2] {
            [Some('e' | 'E'), Some('+' | '-')] => matches!(self.window[2], Some('0'..='9')),
            [Some('e' | 'E'), Some('0'..='9')] => true,
            _ => false,
        }
    }

    /// Lex a single comment.
    fn lex_comment(&mut self) -> LexResult {
        let start_pos = self.get_pos();
        let mut value = String::new();
        loop {
            match self.window[0] {
                Some('\n' | '\r') | None => {
                    let end_pos = self.get_pos();
                    return Ok((start_pos, Tok::Comment(value), end_pos));
                }
                Some(_) => {}
            }
            value.push(self.next_char().unwrap());
        }
    }

    /// Lex a string literal.
    fn lex_string(&mut self, kind: StringKind) -> LexResult {
        let start_pos = self.get_pos();
        for _ in 0..kind.prefix_len() {
            self.next_char();
        }
        let quote_char = self.next_char().unwrap();
        let mut string_content = String::with_capacity(5);

        // If the next two characters are also the quote character, then we have a triple-quoted
        // string; consume those two characters and ensure that we require a triple-quote to close.
        let triple_quoted = if self.window[..2] == [Some(quote_char); 2] {
            self.next_char();
            self.next_char();
            true
        } else {
            false
        };

        loop {
            match self.next_char() {
                Some(c) => {
                    if c == '\\' {
                        if let Some(next_c) = self.next_char() {
                            string_content.push('\\');
                            string_content.push(next_c);
                            continue;
                        }
                    }
                    if c == '\n' && !triple_quoted {
                        return Err(LexicalError {
                            error: LexicalErrorType::OtherError(
                                "EOL while scanning string literal".to_owned(),
                            ),
                            location: self.get_pos(),
                        });
                    }

                    if c == quote_char {
                        if triple_quoted {
                            // Look ahead at the next two characters; if we have two more
                            // quote_chars, it's the end of the string; consume the remaining
                            // closing quotes and break the loop.
                            if self.window[..2] == [Some(quote_char); 2] {
                                self.next_char();
                                self.next_char();
                                break;
                            }
                        } else {
                            break;
                        }
                    }
                    string_content.push(c);
                }
                None => {
                    return Err(LexicalError {
                        error: if triple_quoted {
                            LexicalErrorType::Eof
                        } else {
                            LexicalErrorType::StringError
                        },
                        location: self.get_pos(),
                    });
                }
            }
        }
        let end_pos = self.get_pos();
        let tok = Tok::String {
            value: string_content,
            kind,
            triple_quoted,
        };
        Ok((start_pos, tok, end_pos))
    }

    // Checks if the character c is a valid starting character as described
    // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    fn is_identifier_start(&self, c: char) -> bool {
        match c {
            'a'..='z' | 'A'..='Z' | '_' => true,
            _ => is_xid_start(c),
        }
    }

    // Checks if the character c is a valid continuation character as described
    // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    fn is_identifier_continuation(&self) -> bool {
        match self.window[0] {
            Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') => true,
            Some(c) => is_xid_continue(c),
            _ => false,
        }
    }

    // This is the main entry point. Call this function to retrieve the next token.
    // This function is used by the iterator implementation.
    fn inner_next(&mut self) -> LexResult {
        // Top loop: keep on processing until we have something pending.
        while self.pending.is_empty() {
            // Detect indentation levels.
            if self.at_begin_of_line {
                self.handle_indentations()?;
            }

            self.consume_normal()?;
        }

        Ok(self.pending.remove(0))
    }

    // Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
    fn eat_indentation(&mut self) -> Result<IndentationLevel, LexicalError> {
        // Determine indentation:
        let mut spaces: u32 = 0;
        let mut tabs: u32 = 0;
        loop {
            match self.window[0] {
                Some(' ') => {
                    /*
                    if tabs != 0 {
                        // Don't allow spaces after tabs as part of indentation.
                        // This is technically stricter than python3 but spaces after
                        // tabs is even more insane than mixing spaces and tabs.
                        return Some(Err(LexicalError {
                            error: LexicalErrorType::OtherError("Spaces not allowed as part of indentation after tabs".to_owned()),
                            location: self.get_pos(),
                        }));
                    }
                    */
                    self.next_char();
                    spaces += 1;
                }
                Some('\t') => {
                    if spaces != 0 {
                        // Don't allow tabs after spaces as part of indentation.
                        // This is technically stricter than python3, but tabs after
                        // spaces are even more insane than mixing spaces and tabs.
                        return Err(LexicalError {
                            error: LexicalErrorType::TabsAfterSpaces,
                            location: self.get_pos(),
                        });
                    }
                    self.next_char();
                    tabs += 1;
                }
                Some('#') => {
                    let comment = self.lex_comment()?;
                    self.emit(comment);
                    spaces = 0;
                    tabs = 0;
                }
                Some('\x0C') => {
                    // Form feed character!
                    // Reset indentation for the Emacs user.
                    self.next_char();
                    spaces = 0;
                    tabs = 0;
                }
                Some('\n' | '\r') => {
                    // Empty line!
                    self.next_char();
                    spaces = 0;
                    tabs = 0;
                }
                None => {
                    spaces = 0;
                    tabs = 0;
                    break;
                }
                _ => {
                    self.at_begin_of_line = false;
                    break;
                }
            }
        }

        Ok(IndentationLevel { tabs, spaces })
    }

    // Push/pop indents/dedents based on the current indentation level.
    fn handle_indentations(&mut self) -> Result<(), LexicalError> {
        let indentation_level = self.eat_indentation()?;

        if self.nesting != 0 {
            return Ok(());
        }

        // Determine indent or dedent:
        let current_indentation = self.indentations.current();
        let ordering = indentation_level.compare_strict(current_indentation, self.get_pos())?;
        match ordering {
            Ordering::Equal => {
                // Same level; nothing to do.
            }
            Ordering::Greater => {
                // New indentation level:
                self.indentations.push(indentation_level);
                let tok_pos = self.get_pos();
                self.emit((tok_pos, Tok::Indent, tok_pos));
            }
            Ordering::Less => {
                // One or more dedentations.
                // Pop off levels until the matching indentation is found:

                loop {
                    let current_indentation = self.indentations.current();
                    let ordering =
                        indentation_level.compare_strict(current_indentation, self.get_pos())?;
                    match ordering {
                        Ordering::Less => {
                            self.indentations.pop();
                            let tok_pos = self.get_pos();
                            self.emit((tok_pos, Tok::Dedent, tok_pos));
                        }
                        Ordering::Equal => {
                            // We arrived at the proper level of indentation.
                            break;
                        }
                        Ordering::Greater => {
                            return Err(LexicalError {
                                error: LexicalErrorType::IndentationError,
                                location: self.get_pos(),
                            });
                        }
                    }
                }
            }
        }

        Ok(())
    }

    // Take a look at the next character, if any, and decide upon the next steps.
    fn consume_normal(&mut self) -> Result<(), LexicalError> {
        if let Some(c) = self.window[0] {
            // Identifiers are the most common case.
            if self.is_identifier_start(c) {
                let identifier = self.lex_identifier()?;
                self.emit(identifier);
            } else {
                self.consume_character(c)?;
            }
        } else {
            // We reached the end of the file.
            let tok_pos = self.get_pos();

            // First of all, we need all nestings to be finished.
            if self.nesting > 0 {
                return Err(LexicalError {
                    error: LexicalErrorType::Eof,
                    location: tok_pos,
                });
            }

            // Next, insert a trailing newline, if required.
            if !self.at_begin_of_line {
                self.at_begin_of_line = true;
                self.emit((tok_pos, Tok::Newline, tok_pos));
            }

            // Next, flush the indentation stack to zero.
            while !self.indentations.is_empty() {
                self.indentations.pop();
                self.emit((tok_pos, Tok::Dedent, tok_pos));
            }

            self.emit((tok_pos, Tok::EndOfFile, tok_pos));
        }

        Ok(())
    }

    // Dispatch based on the given character.
    fn consume_character(&mut self, c: char) -> Result<(), LexicalError> {
        match c {
            '0'..='9' => {
                let number = self.lex_number()?;
                self.emit(number);
            }
            '#' => {
                let comment = self.lex_comment()?;
                self.emit(comment);
            }
            '"' | '\'' => {
                let string = self.lex_string(StringKind::String)?;
                self.emit(string);
            }
            '=' => {
                let tok_start = self.get_pos();
                self.next_char();
                match self.window[0] {
                    Some('=') => {
                        self.next_char();
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::EqEqual, tok_end));
                    }
                    _ => {
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::Equal, tok_end));
                    }
                }
            }
            '+' => {
                let tok_start = self.get_pos();
                self.next_char();
                if let Some('=') = self.window[0] {
                    self.next_char();
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::PlusEqual, tok_end));
                } else {
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::Plus, tok_end));
                }
            }
            '*' => {
                let tok_start = self.get_pos();
                self.next_char();
                match self.window[0] {
                    Some('=') => {
                        self.next_char();
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::StarEqual, tok_end));
                    }
                    Some('*') => {
                        self.next_char();
                        match self.window[0] {
                            Some('=') => {
                                self.next_char();
                                let tok_end = self.get_pos();
                                self.emit((tok_start, Tok::DoubleStarEqual, tok_end));
                            }
                            _ => {
                                let tok_end = self.get_pos();
                                self.emit((tok_start, Tok::DoubleStar, tok_end));
                            }
                        }
                    }
                    _ => {
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::Star, tok_end));
                    }
                }
            }
            '/' => {
                let tok_start = self.get_pos();
                self.next_char();
                match self.window[0] {
                    Some('=') => {
                        self.next_char();
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::SlashEqual, tok_end));
                    }
                    Some('/') => {
                        self.next_char();
                        match self.window[0] {
                            Some('=') => {
                                self.next_char();
                                let tok_end = self.get_pos();
                                self.emit((tok_start, Tok::DoubleSlashEqual, tok_end));
                            }
                            _ => {
                                let tok_end = self.get_pos();
                                self.emit((tok_start, Tok::DoubleSlash, tok_end));
                            }
                        }
                    }
                    _ => {
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::Slash, tok_end));
                    }
                }
            }
            '%' => {
                let tok_start = self.get_pos();
                self.next_char();
                if let Some('=') = self.window[0] {
                    self.next_char();
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::PercentEqual, tok_end));
                } else {
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::Percent, tok_end));
                }
            }
            '|' => {
                let tok_start = self.get_pos();
                self.next_char();
                if let Some('=') = self.window[0] {
                    self.next_char();
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::VbarEqual, tok_end));
                } else {
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::Vbar, tok_end));
                }
            }
            '^' => {
                let tok_start = self.get_pos();
                self.next_char();
                if let Some('=') = self.window[0] {
                    self.next_char();
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::CircumflexEqual, tok_end));
                } else {
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::CircumFlex, tok_end));
                }
            }
            '&' => {
                let tok_start = self.get_pos();
                self.next_char();
                if let Some('=') = self.window[0] {
                    self.next_char();
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::AmperEqual, tok_end));
                } else {
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::Amper, tok_end));
                }
            }
            '-' => {
                let tok_start = self.get_pos();
                self.next_char();
                match self.window[0] {
                    Some('=') => {
                        self.next_char();
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::MinusEqual, tok_end));
                    }
                    Some('>') => {
                        self.next_char();
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::Rarrow, tok_end));
                    }
                    _ => {
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::Minus, tok_end));
                    }
                }
            }
            '@' => {
                let tok_start = self.get_pos();
                self.next_char();
                if let Some('=') = self.window[0] {
                    self.next_char();
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::AtEqual, tok_end));
                } else {
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::At, tok_end));
                }
            }
            '!' => {
                let tok_start = self.get_pos();
                self.next_char();
                if let Some('=') = self.window[0] {
                    self.next_char();
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::NotEqual, tok_end));
                } else {
                    return Err(LexicalError {
                        error: LexicalErrorType::UnrecognizedToken { tok: '!' },
                        location: tok_start,
                    });
                }
            }
            '~' => {
                self.eat_single_char(Tok::Tilde);
            }
            '(' => {
                self.eat_single_char(Tok::Lpar);
                self.nesting += 1;
            }
            ')' => {
                self.eat_single_char(Tok::Rpar);
                if self.nesting == 0 {
                    return Err(LexicalError {
                        error: LexicalErrorType::NestingError,
                        location: self.get_pos(),
                    });
                }
                self.nesting -= 1;
            }
            '[' => {
                self.eat_single_char(Tok::Lsqb);
                self.nesting += 1;
            }
            ']' => {
                self.eat_single_char(Tok::Rsqb);
                if self.nesting == 0 {
                    return Err(LexicalError {
                        error: LexicalErrorType::NestingError,
                        location: self.get_pos(),
                    });
                }
                self.nesting -= 1;
            }
            '{' => {
                self.eat_single_char(Tok::Lbrace);
                self.nesting += 1;
            }
            '}' => {
                self.eat_single_char(Tok::Rbrace);
                if self.nesting == 0 {
                    return Err(LexicalError {
                        error: LexicalErrorType::NestingError,
                        location: self.get_pos(),
                    });
                }
                self.nesting -= 1;
            }
            ':' => {
                let tok_start = self.get_pos();
                self.next_char();
                if let Some('=') = self.window[0] {
                    self.next_char();
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::ColonEqual, tok_end));
                } else {
                    let tok_end = self.get_pos();
                    self.emit((tok_start, Tok::Colon, tok_end));
                }
            }
            ';' => {
                self.eat_single_char(Tok::Semi);
            }
            '<' => {
                let tok_start = self.get_pos();
                self.next_char();
                match self.window[0] {
                    Some('<') => {
                        self.next_char();
                        match self.window[0] {
                            Some('=') => {
                                self.next_char();
                                let tok_end = self.get_pos();
                                self.emit((tok_start, Tok::LeftShiftEqual, tok_end));
                            }
                            _ => {
                                let tok_end = self.get_pos();
                                self.emit((tok_start, Tok::LeftShift, tok_end));
                            }
                        }
                    }
                    Some('=') => {
                        self.next_char();
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::LessEqual, tok_end));
                    }
                    _ => {
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::Less, tok_end));
                    }
                }
            }
            '>' => {
                let tok_start = self.get_pos();
                self.next_char();
                match self.window[0] {
                    Some('>') => {
                        self.next_char();
                        match self.window[0] {
                            Some('=') => {
                                self.next_char();
                                let tok_end = self.get_pos();
                                self.emit((tok_start, Tok::RightShiftEqual, tok_end));
                            }
                            _ => {
                                let tok_end = self.get_pos();
                                self.emit((tok_start, Tok::RightShift, tok_end));
                            }
                        }
                    }
                    Some('=') => {
                        self.next_char();
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::GreaterEqual, tok_end));
                    }
                    _ => {
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::Greater, tok_end));
                    }
                }
            }
            ',' => {
                self.eat_single_char(Tok::Comma);
            }
            '.' => {
                if let Some('0'..='9') = self.window[1] {
                    let number = self.lex_number()?;
                    self.emit(number);
                } else {
                    let tok_start = self.get_pos();
                    self.next_char();
                    if self.window[..2] == [Some('.'); 2] {
                        self.next_char();
                        self.next_char();
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::Ellipsis, tok_end));
                    } else {
                        let tok_end = self.get_pos();
                        self.emit((tok_start, Tok::Dot, tok_end));
                    }
                }
            }
            '\n' | '\r' => {
                let tok_start = self.get_pos();
                self.next_char();
                let tok_end = self.get_pos();

                // Depending on the nesting level, we emit a logical or
                // non-logical newline:
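                // For example, the newline inside `(1,\n 2)` is non-logical
                // because `nesting > 0` there, while the newline after `x = 1`
                // at the top level terminates a logical line.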
                if self.nesting == 0 {
                    self.at_begin_of_line = true;
                    self.emit((tok_start, Tok::Newline, tok_end));
                } else {
                    self.emit((tok_start, Tok::NonLogicalNewline, tok_end));
                }
            }
            ' ' | '\t' | '\x0C' => {
                // Skip whitespace.
                self.next_char();
                while let Some(' ' | '\t' | '\x0C') = self.window[0] {
                    self.next_char();
                }
            }
            '\\' => {
                self.next_char();
                match self.window[0] {
                    Some('\n' | '\r') => {
                        self.next_char();
                    }
                    _ => {
                        return Err(LexicalError {
                            error: LexicalErrorType::LineContinuationError,
                            location: self.get_pos(),
                        })
                    }
                }

                if self.window[0].is_none() {
                    return Err(LexicalError {
                        error: LexicalErrorType::Eof,
                        location: self.get_pos(),
                    });
                }
            }
            _ => {
                if is_emoji_presentation(c) {
                    let tok_start = self.get_pos();
                    self.next_char();
                    let tok_end = self.get_pos();
                    self.emit((
                        tok_start,
                        Tok::Name {
                            name: c.to_string(),
                        },
                        tok_end,
                    ));
                } else {
                    let c = self.next_char();
                    return Err(LexicalError {
                        error: LexicalErrorType::UnrecognizedToken { tok: c.unwrap() },
                        location: self.get_pos(),
                    });
                }
            }
        }

        Ok(())
    }

    // Used by single character tokens to advance the window and emit the correct token.
    fn eat_single_char(&mut self, ty: Tok) {
        let tok_start = self.get_pos();
        self.next_char().unwrap_or_else(|| unsafe {
            // SAFETY: eat_single_char has been called only after a character has been read
            // from the window, so the window is guaranteed to be non-empty.
            std::hint::unreachable_unchecked()
        });
        let tok_end = self.get_pos();
        self.emit((tok_start, ty, tok_end));
    }

    // Helper function to go to the next character coming up.
    fn next_char(&mut self) -> Option<char> {
        let mut c = self.window[0];
        self.window.slide();
        match c {
            Some('\n') => {
                self.location.newline();
            }
            Some('\r') => {
                if self.window[0] == Some('\n') {
                    self.window.slide();
                }
                self.location.newline();
                c = Some('\n');
            }
            _ => {
                self.location.go_right();
            }
        }
        c
    }
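
    // Note on line endings: a "\r\n" pair is consumed in a single `next_char`
    // call and a lone "\r" is rewritten to '\n', so Windows, old-Mac, and Unix
    // line endings all reach the rest of the lexer as a single '\n'.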

    // Helper function to retrieve the current position.
    fn get_pos(&self) -> Location {
        self.location
    }

    // Helper function to emit a lexed token to the queue of tokens.
    fn emit(&mut self, spanned: Spanned) {
        self.pending.push(spanned);
    }
}

// Implement the iterator pattern for Lexer.
// Calling `next` on the iterator will yield the next lexical token.
impl<T> Iterator for Lexer<T>
where
    T: Iterator<Item = char>,
{
    type Item = LexResult;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.inner_next();
        trace!(
            "Lex token {:?}, nesting={:?}, indent stack: {:?}",
            token,
            self.nesting,
            self.indentations,
        );

        match token {
            Ok((_, Tok::EndOfFile, _)) => None,
            r => Some(r),
        }
    }
}

/// Represents an error that occurs during lexing and is
/// returned by the `parse_*` functions in the iterator in the
/// [lexer] implementation.
///
/// [lexer]: crate::lexer
#[derive(Debug, PartialEq)]
pub struct LexicalError {
    /// The type of error that occurred.
    pub error: LexicalErrorType,
    /// The location of the error.
    pub location: Location,
}

impl LexicalError {
    /// Creates a new `LexicalError` with the given error type and location.
    pub fn new(error: LexicalErrorType, location: Location) -> Self {
        Self { error, location }
    }
}
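
// For example, `LexicalError::new(LexicalErrorType::Eof, Location::default())`
// renders its error type as "unexpected EOF while parsing" through the
// `Display` implementation for `LexicalErrorType` below.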

/// Represents the different types of errors that can occur during lexing.
#[derive(Debug, PartialEq)]
pub enum LexicalErrorType {
    // TODO: Can probably be removed, the places it is used seem to be able
    // to use the `UnicodeError` variant instead.
    #[doc(hidden)]
    StringError,
    // TODO: Should take a start/end position to report.
    /// Decoding of a unicode escape sequence in a string literal failed.
    UnicodeError,
    /// The nesting of brackets/braces/parentheses is not balanced.
    NestingError,
    /// The indentation is not consistent.
    IndentationError,
    /// Inconsistent use of tabs and spaces.
    TabError,
    /// Encountered a tab after a space.
    TabsAfterSpaces,
    /// A non-default argument follows a default argument.
    DefaultArgumentError,
    /// A duplicate argument was found in a function definition.
    DuplicateArgumentError(String),
    /// A positional argument follows a keyword argument.
    PositionalArgumentError,
    /// An iterable argument unpacking `*args` follows keyword argument unpacking `**kwargs`.
    UnpackedArgumentError,
    /// A keyword argument was repeated.
    DuplicateKeywordArgumentError(String),
    /// An unrecognized token was encountered.
    UnrecognizedToken { tok: char },
    /// An f-string error containing the [`FStringErrorType`].
    FStringError(FStringErrorType),
    /// An unexpected character was encountered after a line continuation.
    LineContinuationError,
    /// An unexpected end of file was encountered.
    Eof,
    /// An unexpected error occurred.
    OtherError(String),
}

impl std::fmt::Display for LexicalErrorType {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            LexicalErrorType::StringError => write!(f, "Got unexpected string"),
            LexicalErrorType::FStringError(error) => write!(f, "f-string: {error}"),
            LexicalErrorType::UnicodeError => write!(f, "Got unexpected unicode"),
            LexicalErrorType::NestingError => write!(f, "Got unexpected nesting"),
            LexicalErrorType::IndentationError => {
                write!(f, "unindent does not match any outer indentation level")
            }
            LexicalErrorType::TabError => {
                write!(f, "inconsistent use of tabs and spaces in indentation")
            }
            LexicalErrorType::TabsAfterSpaces => {
                write!(f, "Tabs not allowed as part of indentation after spaces")
            }
            LexicalErrorType::DefaultArgumentError => {
                write!(f, "non-default argument follows default argument")
            }
            LexicalErrorType::DuplicateArgumentError(arg_name) => {
                write!(f, "duplicate argument '{arg_name}' in function definition")
            }
            LexicalErrorType::DuplicateKeywordArgumentError(arg_name) => {
                write!(f, "keyword argument repeated: {arg_name}")
            }
            LexicalErrorType::PositionalArgumentError => {
                write!(f, "positional argument follows keyword argument")
            }
            LexicalErrorType::UnpackedArgumentError => {
                write!(
                    f,
                    "iterable argument unpacking follows keyword argument unpacking"
                )
            }
            LexicalErrorType::UnrecognizedToken { tok } => {
                write!(f, "Got unexpected token {tok}")
            }
            LexicalErrorType::LineContinuationError => {
                write!(f, "unexpected character after line continuation character")
            }
            LexicalErrorType::Eof => write!(f, "unexpected EOF while parsing"),
            LexicalErrorType::OtherError(msg) => write!(f, "{msg}"),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use num_bigint::BigInt;

    const WINDOWS_EOL: &str = "\r\n";
    const MAC_EOL: &str = "\r";
    const UNIX_EOL: &str = "\n";

    pub fn lex_source(source: &str) -> Vec<Tok> {
        let lexer = lex(source, Mode::Module);
        lexer.map(|x| x.unwrap().1).collect()
    }

    fn stok(s: &str) -> Tok {
        Tok::String {
            value: s.to_owned(),
            kind: StringKind::String,
            triple_quoted: false,
        }
    }

    fn raw_stok(s: &str) -> Tok {
        Tok::String {
            value: s.to_owned(),
            kind: StringKind::RawString,
            triple_quoted: false,
        }
    }

    #[test]
    fn test_numbers() {
        let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j";
        let tokens = lex_source(source);
        assert_eq!(
            tokens,
            vec![
                Tok::Int {
                    value: BigInt::from(47),
                },
                Tok::Int {
                    value: BigInt::from(10),
                },
                Tok::Int {
                    value: BigInt::from(13),
                },
                Tok::Int {
                    value: BigInt::from(0),
                },
                Tok::Int {
                    value: BigInt::from(123),
                },
                Tok::Int {
                    value: BigInt::from(1234567890),
                },
                Tok::Float { value: 0.2 },
                Tok::Float { value: 100.0 },
                Tok::Float { value: 2100.0 },
                Tok::Complex {
                    real: 0.0,
                    imag: 2.0,
                },
                Tok::Complex {
                    real: 0.0,
                    imag: 2.2,
                },
                Tok::Newline,
            ]
        );
    }
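
    // Added illustration (not part of the original suite): `radix_run` drops
    // an underscore only when it sits between digits of the active radix, so
    // these literals all lex to plain integers.
    #[test]
    fn test_numbers_with_underscores() {
        let source = "1_000 0b1_01 0x_ff";
        let tokens = lex_source(source);
        assert_eq!(
            tokens,
            vec![
                Tok::Int {
                    value: BigInt::from(1000),
                },
                Tok::Int {
                    value: BigInt::from(5),
                },
                Tok::Int {
                    value: BigInt::from(255),
                },
                Tok::Newline,
            ]
        );
    }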

    macro_rules! test_line_comment {
        ($($name:ident: $eol:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let source = format!(r"99232  # {}", $eol);
                    let tokens = lex_source(&source);
                    assert_eq!(tokens, vec![Tok::Int { value: BigInt::from(99232) }, Tok::Comment(format!("# {}", $eol)), Tok::Newline]);
                }
            )*
        }
    }

    test_line_comment! {
        test_line_comment_long: " foo",
        test_line_comment_whitespace: "  ",
        test_line_comment_single_whitespace: " ",
        test_line_comment_empty: "",
    }

    macro_rules! test_comment_until_eol {
        ($($name:ident: $eol:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let source = format!("123  # Foo{}456", $eol);
                    let tokens = lex_source(&source);
                    assert_eq!(
                        tokens,
                        vec![
                            Tok::Int { value: BigInt::from(123) },
                            Tok::Comment("# Foo".to_string()),
                            Tok::Newline,
                            Tok::Int { value: BigInt::from(456) },
                            Tok::Newline,
                        ]
                    )
                }
            )*
        }
    }

    test_comment_until_eol! {
        test_comment_until_windows_eol: WINDOWS_EOL,
        test_comment_until_mac_eol: MAC_EOL,
        test_comment_until_unix_eol: UNIX_EOL,
    }

    #[test]
    fn test_assignment() {
        let source = r"avariable = 99 + 2-0";
        let tokens = lex_source(source);
        assert_eq!(
            tokens,
            vec![
                Tok::Name {
                    name: String::from("avariable"),
                },
                Tok::Equal,
                Tok::Int {
                    value: BigInt::from(99),
                },
                Tok::Plus,
                Tok::Int {
                    value: BigInt::from(2),
                },
                Tok::Minus,
                Tok::Int {
                    value: BigInt::from(0),
                },
                Tok::Newline,
            ]
        );
    }

    macro_rules! test_indentation_with_eol {
        ($($name:ident: $eol:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let source = format!("def foo():{}    return 99{}{}", $eol, $eol, $eol);
                    let tokens = lex_source(&source);
                    assert_eq!(
                        tokens,
                        vec![
                            Tok::Def,
                            Tok::Name {
                                name: String::from("foo"),
                            },
                            Tok::Lpar,
                            Tok::Rpar,
                            Tok::Colon,
                            Tok::Newline,
                            Tok::Indent,
                            Tok::Return,
                            Tok::Int { value: BigInt::from(99) },
                            Tok::Newline,
                            Tok::Dedent,
                        ]
                    );
                }
            )*
        };
    }

    test_indentation_with_eol! {
        test_indentation_windows_eol: WINDOWS_EOL,
        test_indentation_mac_eol: MAC_EOL,
        test_indentation_unix_eol: UNIX_EOL,
    }

    macro_rules! test_double_dedent_with_eol {
        ($($name:ident: $eol:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let source = format!("def foo():{} if x:{}{}  return 99{}{}", $eol, $eol, $eol, $eol, $eol);
                    let tokens = lex_source(&source);
                    assert_eq!(
                        tokens,
                        vec![
                            Tok::Def,
                            Tok::Name {
                                name: String::from("foo"),
                            },
                            Tok::Lpar,
                            Tok::Rpar,
                            Tok::Colon,
                            Tok::Newline,
                            Tok::Indent,
                            Tok::If,
                            Tok::Name {
                                name: String::from("x"),
                            },
                            Tok::Colon,
                            Tok::Newline,
                            Tok::Indent,
                            Tok::Return,
                            Tok::Int { value: BigInt::from(99) },
                            Tok::Newline,
                            Tok::Dedent,
                            Tok::Dedent,
                        ]
                    );
                }
            )*
        }
    }

    macro_rules! test_double_dedent_with_tabs {
        ($($name:ident: $eol:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let source = format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol);
                    let tokens = lex_source(&source);
                    assert_eq!(
                        tokens,
                        vec![
                            Tok::Def,
                            Tok::Name {
                                name: String::from("foo"),
                            },
                            Tok::Lpar,
                            Tok::Rpar,
                            Tok::Colon,
                            Tok::Newline,
                            Tok::Indent,
                            Tok::If,
                            Tok::Name {
                                name: String::from("x"),
                            },
                            Tok::Colon,
                            Tok::Newline,
                            Tok::Indent,
                            Tok::Return,
                            Tok::Int { value: BigInt::from(99) },
                            Tok::Newline,
                            Tok::Dedent,
                            Tok::Dedent,
                        ]
                    );
                }
            )*
        }
    }

    test_double_dedent_with_eol! {
        test_double_dedent_windows_eol: WINDOWS_EOL,
        test_double_dedent_mac_eol: MAC_EOL,
        test_double_dedent_unix_eol: UNIX_EOL,
    }

    test_double_dedent_with_tabs! {
        test_double_dedent_tabs_windows_eol: WINDOWS_EOL,
        test_double_dedent_tabs_mac_eol: MAC_EOL,
        test_double_dedent_tabs_unix_eol: UNIX_EOL,
    }

    macro_rules! test_newline_in_brackets {
        ($($name:ident: $eol:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let source = r"x = [

    1,2
,(3,
4,
), {
5,
6,\
7}]
".replace("\n", $eol);
                    let tokens = lex_source(&source);
                    assert_eq!(
                        tokens,
                        vec![
                            Tok::Name {
                                name: String::from("x"),
                            },
                            Tok::Equal,
                            Tok::Lsqb,
                            Tok::NonLogicalNewline,
                            Tok::NonLogicalNewline,
                            Tok::Int { value: BigInt::from(1) },
                            Tok::Comma,
                            Tok::Int { value: BigInt::from(2) },
                            Tok::NonLogicalNewline,
                            Tok::Comma,
                            Tok::Lpar,
                            Tok::Int { value: BigInt::from(3) },
                            Tok::Comma,
                            Tok::NonLogicalNewline,
                            Tok::Int { value: BigInt::from(4) },
                            Tok::Comma,
                            Tok::NonLogicalNewline,
                            Tok::Rpar,
                            Tok::Comma,
                            Tok::Lbrace,
                            Tok::NonLogicalNewline,
                            Tok::Int { value: BigInt::from(5) },
                            Tok::Comma,
                            Tok::NonLogicalNewline,
                            Tok::Int { value: BigInt::from(6) },
                            Tok::Comma,
                            // Continuation here - no NonLogicalNewline.
                            Tok::Int { value: BigInt::from(7) },
                            Tok::Rbrace,
                            Tok::Rsqb,
                            Tok::Newline,
                        ]
                    );
                }
            )*
        };
    }

    test_newline_in_brackets! {
        test_newline_in_brackets_windows_eol: WINDOWS_EOL,
        test_newline_in_brackets_mac_eol: MAC_EOL,
        test_newline_in_brackets_unix_eol: UNIX_EOL,
    }

    #[test]
    fn test_non_logical_newline_in_string_continuation() {
        let source = r"(
    'a'
    'b'

    'c' \
    'd'
)";
        let tokens = lex_source(source);
        assert_eq!(
            tokens,
            vec![
                Tok::Lpar,
                Tok::NonLogicalNewline,
                stok("a"),
                Tok::NonLogicalNewline,
                stok("b"),
                Tok::NonLogicalNewline,
                Tok::NonLogicalNewline,
                stok("c"),
                stok("d"),
                Tok::NonLogicalNewline,
                Tok::Rpar,
                Tok::Newline,
            ]
        );
    }

    #[test]
    fn test_logical_newline_line_comment() {
        let source = "#Hello\n#World";
        let tokens = lex_source(source);
        assert_eq!(
            tokens,
            vec![
                Tok::Comment("#Hello".to_owned()),
                // tokenize.py does put an NL here...
                Tok::Comment("#World".to_owned()),
                // ... and here, but doesn't seem very useful.
            ]
        );
    }

    #[test]
    fn test_operators() {
        let source = "//////=/ /";
        let tokens = lex_source(source);
        assert_eq!(
            tokens,
            vec![
                Tok::DoubleSlash,
                Tok::DoubleSlash,
                Tok::DoubleSlashEqual,
                Tok::Slash,
                Tok::Slash,
                Tok::Newline,
            ]
        );
    }

    #[test]
    fn test_string() {
        let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#;
        let tokens = lex_source(source);
        assert_eq!(
            tokens,
            vec![
                stok("double"),
                stok("single"),
                stok(r"can\'t"),
                stok(r#"\\\""#),
                stok(r"\t\r\n"),
                stok(r"\g"),
                raw_stok(r"raw\'"),
                stok(r"\420"),
                stok(r"\200\0a"),
                Tok::Newline,
            ]
        );
    }

    macro_rules! test_string_continuation {
        ($($name:ident: $eol:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let source = format!("\"abc\\{}def\"", $eol);
                    let tokens = lex_source(&source);
                    assert_eq!(
                        tokens,
                        vec![
                            stok("abc\\\ndef"),
                            Tok::Newline,
                        ]
                    )
                }
            )*
        }
    }

    test_string_continuation! {
        test_string_continuation_windows_eol: WINDOWS_EOL,
        test_string_continuation_mac_eol: MAC_EOL,
        test_string_continuation_unix_eol: UNIX_EOL,
    }

    #[test]
    fn test_escape_unicode_name() {
        let source = r#""\N{EN SPACE}""#;
        let tokens = lex_source(source);
        assert_eq!(tokens, vec![stok(r"\N{EN SPACE}"), Tok::Newline])
    }

    macro_rules! test_triple_quoted {
        ($($name:ident: $eol:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let source = format!("\"\"\"{0} test string{0} \"\"\"", $eol);
                    let tokens = lex_source(&source);
                    assert_eq!(
                        tokens,
                        vec![
                            Tok::String {
                                value: "\n test string\n ".to_owned(),
                                kind: StringKind::String,
                                triple_quoted: true,
                            },
                            Tok::Newline,
                        ]
                    )
                }
            )*
        }
    }

    test_triple_quoted! {
        test_triple_quoted_windows_eol: WINDOWS_EOL,
        test_triple_quoted_mac_eol: MAC_EOL,
        test_triple_quoted_unix_eol: UNIX_EOL,
    }
}