Move token and error structs into related modules (#11957)

## Summary

This PR does some housekeeping by moving certain structs into related
modules. Specifically,
1. Move `LexicalError` from `lexer.rs` to `error.rs` which also contains
the `ParseError`
2. Move `Token`, `TokenFlags` and `TokenValue` from `lexer.rs` to
`token.rs`
This commit is contained in:
Dhruv Manilawala 2024-06-21 15:37:19 +05:30 committed by GitHub
parent 4667d8697c
commit 96da136e6a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 352 additions and 342 deletions

View file

@ -9,23 +9,19 @@
use std::cmp::Ordering;
use std::str::FromStr;
use bitflags::bitflags;
use unicode_ident::{is_xid_continue, is_xid_start};
use unicode_normalization::UnicodeNormalization;
use ruff_python_ast::str::Quote;
use ruff_python_ast::str_prefix::{
AnyStringPrefix, ByteStringPrefix, FStringPrefix, StringLiteralPrefix,
};
use ruff_python_ast::{AnyStringFlags, Int, IpyEscapeKind, StringFlags};
use ruff_python_ast::{Int, IpyEscapeKind, StringFlags};
use ruff_python_trivia::is_python_whitespace;
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::error::FStringErrorType;
use crate::error::{FStringErrorType, LexicalError, LexicalErrorType};
use crate::lexer::cursor::{Cursor, EOF_CHAR};
use crate::lexer::fstring::{FStringContext, FStrings, FStringsCheckpoint};
use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
use crate::{Mode, TokenKind};
use crate::token::{TokenFlags, TokenKind, TokenValue};
use crate::Mode;
mod cursor;
mod fstring;
@ -1511,317 +1507,6 @@ impl<'src> Lexer<'src> {
}
}
bitflags! {
    /// Flags describing lexical properties of a token, primarily the prefix
    /// kind and quoting style of string-like tokens.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub(crate) struct TokenFlags: u8 {
        /// The token is a string with double quotes (`"`).
        const DOUBLE_QUOTES = 1 << 0;
        /// The token is a triple-quoted string i.e., it starts and ends with three consecutive
        /// quote characters (`"""` or `'''`).
        const TRIPLE_QUOTED_STRING = 1 << 1;
        /// The token is a unicode string i.e., prefixed with `u` or `U`
        const UNICODE_STRING = 1 << 2;
        /// The token is a byte string i.e., prefixed with `b` or `B`
        const BYTE_STRING = 1 << 3;
        /// The token is an f-string i.e., prefixed with `f` or `F`
        const F_STRING = 1 << 4;
        /// The token is a raw string and the prefix character is in lowercase.
        const RAW_STRING_LOWERCASE = 1 << 5;
        /// The token is a raw string and the prefix character is in uppercase.
        const RAW_STRING_UPPERCASE = 1 << 6;
        /// The token is a raw string i.e., prefixed with `r` or `R`
        /// (union of the lowercase and uppercase raw-string bits).
        const RAW_STRING = Self::RAW_STRING_LOWERCASE.bits() | Self::RAW_STRING_UPPERCASE.bits();
    }
}
impl StringFlags for TokenFlags {
fn quote_style(self) -> Quote {
if self.intersects(TokenFlags::DOUBLE_QUOTES) {
Quote::Double
} else {
Quote::Single
}
}
fn is_triple_quoted(self) -> bool {
self.intersects(TokenFlags::TRIPLE_QUOTED_STRING)
}
fn prefix(self) -> AnyStringPrefix {
if self.intersects(TokenFlags::F_STRING) {
if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
AnyStringPrefix::Format(FStringPrefix::Raw { uppercase_r: false })
} else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
AnyStringPrefix::Format(FStringPrefix::Raw { uppercase_r: true })
} else {
AnyStringPrefix::Format(FStringPrefix::Regular)
}
} else if self.intersects(TokenFlags::BYTE_STRING) {
if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
AnyStringPrefix::Bytes(ByteStringPrefix::Raw { uppercase_r: false })
} else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
AnyStringPrefix::Bytes(ByteStringPrefix::Raw { uppercase_r: true })
} else {
AnyStringPrefix::Bytes(ByteStringPrefix::Regular)
}
} else if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false })
} else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: true })
} else if self.intersects(TokenFlags::UNICODE_STRING) {
AnyStringPrefix::Regular(StringLiteralPrefix::Unicode)
} else {
AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
}
}
}
impl TokenFlags {
    /// Returns `true` if the token is an f-string.
    const fn is_f_string(self) -> bool {
        self.intersects(Self::F_STRING)
    }

    /// Returns `true` if the token is a triple-quoted f-string, i.e., both
    /// the `F_STRING` and `TRIPLE_QUOTED_STRING` bits are set.
    fn is_triple_quoted_fstring(self) -> bool {
        self.contains(Self::F_STRING.union(Self::TRIPLE_QUOTED_STRING))
    }

    /// Returns `true` if the token is a raw string (either prefix case).
    const fn is_raw_string(self) -> bool {
        self.intersects(Self::RAW_STRING)
    }

    /// Converts these flags into [`AnyStringFlags`], preserving the prefix,
    /// quote style, and triple-quoting.
    pub(crate) fn as_any_string_flags(self) -> AnyStringFlags {
        AnyStringFlags::new(self.prefix(), self.quote_style(), self.is_triple_quoted())
    }
}
/// A token produced by the lexer: a kind, its source range, and a set of
/// lexical flags (quoting/prefix information for string-like tokens).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token {
    /// The kind of the token.
    kind: TokenKind,
    /// The range of the token.
    range: TextRange,
    /// The set of flags describing this token.
    flags: TokenFlags,
}
impl Token {
pub(crate) fn new(kind: TokenKind, range: TextRange, flags: TokenFlags) -> Token {
Self { kind, range, flags }
}
/// Returns the token kind.
#[inline]
pub const fn kind(&self) -> TokenKind {
self.kind
}
/// Returns the token as a tuple of (kind, range).
#[inline]
pub const fn as_tuple(&self) -> (TokenKind, TextRange) {
(self.kind, self.range)
}
/// Returns `true` if this is any kind of string token.
const fn is_any_string(self) -> bool {
matches!(
self.kind,
TokenKind::String
| TokenKind::FStringStart
| TokenKind::FStringMiddle
| TokenKind::FStringEnd
)
}
/// Returns `true` if the current token is a triple-quoted string of any kind.
///
/// # Panics
///
/// If it isn't a string or any f-string tokens.
pub fn is_triple_quoted_string(self) -> bool {
assert!(self.is_any_string());
self.flags.is_triple_quoted()
}
/// Returns the [`Quote`] style for the current string token of any kind.
///
/// # Panics
///
/// If it isn't a string or any f-string tokens.
pub fn string_quote_style(self) -> Quote {
assert!(self.is_any_string());
self.flags.quote_style()
}
}
impl Ranged for Token {
    /// Returns the source range covered by this token.
    fn range(&self) -> TextRange {
        self.range
    }
}
/// Represents an error that occurs during lexing and is
/// returned by the `parse_*` functions in the iterator in the
/// [lexer] implementation.
///
/// [lexer]: crate::lexer
#[derive(Debug, Clone, PartialEq)]
pub struct LexicalError {
    /// The type of error that occurred.
    error: LexicalErrorType,
    /// The location of the error.
    location: TextRange,
}
impl LexicalError {
/// Creates a new `LexicalError` with the given error type and location.
pub fn new(error: LexicalErrorType, location: TextRange) -> Self {
Self { error, location }
}
pub fn error(&self) -> &LexicalErrorType {
&self.error
}
pub fn into_error(self) -> LexicalErrorType {
self.error
}
pub fn location(&self) -> TextRange {
self.location
}
}
// NOTE(review): `Deref` to the inner error type is a convenience so callers can
// match on `LexicalErrorType` through a `LexicalError`. `LexicalError` is not a
// smart-pointer type, so this is an unconventional use of `Deref`; removing it
// would break existing callers.
impl std::ops::Deref for LexicalError {
    type Target = LexicalErrorType;

    fn deref(&self) -> &Self::Target {
        self.error()
    }
}
impl std::error::Error for LexicalError {
    /// Exposes the underlying [`LexicalErrorType`] as this error's source.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        Some(self.error())
    }
}
impl std::fmt::Display for LexicalError {
    /// Formats the error as "<error> at byte offset <start>".
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let offset = u32::from(self.location().start());
        write!(f, "{} at byte offset {}", self.error(), offset)
    }
}
/// Represents the different types of errors that can occur during lexing.
#[derive(Debug, Clone, PartialEq)]
pub enum LexicalErrorType {
    // TODO: Can probably be removed; the places where it is used seem to be
    // able to use the `UnicodeError` variant instead.
    #[doc(hidden)]
    StringError,
    /// A string literal without the closing quote.
    UnclosedStringError,
    /// Decoding of a unicode escape sequence in a string literal failed.
    UnicodeError,
    /// Missing the `{` for unicode escape sequence.
    MissingUnicodeLbrace,
    /// Missing the `}` for unicode escape sequence.
    MissingUnicodeRbrace,
    /// The indentation is not consistent.
    IndentationError,
    /// An unrecognized token was encountered.
    UnrecognizedToken { tok: char },
    /// An f-string error containing the [`FStringErrorType`].
    FStringError(FStringErrorType),
    /// Invalid character encountered in a byte literal.
    InvalidByteLiteral,
    /// An unexpected character was encountered after a line continuation.
    LineContinuationError,
    /// An unexpected end of file was encountered.
    Eof,
    /// An unexpected error occurred.
    OtherError(Box<str>),
}
impl std::error::Error for LexicalErrorType {}

impl std::fmt::Display for LexicalErrorType {
    /// Formats a human-readable message for each error variant.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // Arms are listed in the same order as the enum declaration.
        match self {
            LexicalErrorType::StringError => write!(f, "Got unexpected string"),
            LexicalErrorType::UnclosedStringError => {
                write!(f, "missing closing quote in string literal")
            }
            LexicalErrorType::UnicodeError => write!(f, "Got unexpected unicode"),
            LexicalErrorType::MissingUnicodeLbrace => {
                write!(f, "Missing `{{` in Unicode escape sequence")
            }
            LexicalErrorType::MissingUnicodeRbrace => {
                write!(f, "Missing `}}` in Unicode escape sequence")
            }
            LexicalErrorType::IndentationError => {
                write!(f, "unindent does not match any outer indentation level")
            }
            LexicalErrorType::UnrecognizedToken { tok } => {
                write!(f, "Got unexpected token {tok}")
            }
            LexicalErrorType::FStringError(error) => write!(f, "f-string: {error}"),
            LexicalErrorType::InvalidByteLiteral => {
                write!(f, "bytes can only contain ASCII literal characters")
            }
            LexicalErrorType::LineContinuationError => {
                write!(f, "unexpected character after line continuation character")
            }
            LexicalErrorType::Eof => write!(f, "unexpected EOF while parsing"),
            LexicalErrorType::OtherError(msg) => write!(f, "{msg}"),
        }
    }
}
/// The dynamic payload carried by certain token kinds (names, numbers,
/// strings, etc.). Tokens without a payload use [`TokenValue::None`].
#[derive(Clone, Debug, Default)]
pub(crate) enum TokenValue {
    #[default]
    None,
    /// Token value for a name, commonly known as an identifier.
    ///
    /// Unicode names are NFKC-normalized by the lexer,
    /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers)
    Name(Box<str>),
    /// Token value for an integer.
    Int(Int),
    /// Token value for a floating point number.
    Float(f64),
    /// Token value for a complex number.
    Complex {
        /// The real part of the complex number.
        real: f64,
        /// The imaginary part of the complex number.
        imag: f64,
    },
    /// Token value for a string.
    String(Box<str>),
    /// Token value that includes the portion of text inside the f-string that's not
    /// part of the expression part and isn't an opening or closing brace.
    FStringMiddle(Box<str>),
    /// Token value for IPython escape commands. These are recognized by the lexer
    /// only when the mode is [`Mode::Ipython`].
    IpyEscapeCommand {
        /// The magic command value.
        value: Box<str>,
        /// The kind of magic command.
        kind: IpyEscapeKind,
    },
}
pub(crate) struct LexerCheckpoint {
value: TokenValue,
current_kind: TokenKind,