RustPython-Parser/parser/src/token.rs

//! Token type for Python source code created by the lexer and consumed by the parser.
//!
//! This module defines the tokens that the lexer recognizes. The tokens are
//! loosely based on the token definitions found in the [CPython source].
//!
//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h
use crate::{text_size::TextSize, Mode};
use num_bigint::BigInt;
use std::fmt;

/// The set of tokens the Python source code can be tokenized in.
#[derive(Clone, Debug, PartialEq)]
pub enum Tok {
    /// Token value for a name, commonly known as an identifier.
    Name {
        /// The name value.
        name: String,
    },
    /// Token value for an integer.
    Int {
        /// The integer value.
        value: BigInt,
    },
    /// Token value for a floating point number.
    Float {
        /// The float value.
        value: f64,
    },
    /// Token value for a complex number.
    Complex {
        /// The real part of the complex number.
        real: f64,
        /// The imaginary part of the complex number.
        imag: f64,
    },
    /// Token value for a string.
    String {
        /// The string value.
        value: String,
        /// The kind of string.
        kind: StringKind,
        /// Whether the string is triple quoted.
        triple_quoted: bool,
    },
    /// Token value for a comment. These are filtered out of the token stream prior to parsing.
    Comment(String),
    /// Token value for a newline.
    Newline,
    /// Token value for a newline that is not a logical line break. These are filtered out of
    /// the token stream prior to parsing.
    NonLogicalNewline,
    /// Token value for an indent.
    Indent,
    /// Token value for a dedent.
    Dedent,
    EndOfFile,
    /// Token value for a left parenthesis `(`.
    Lpar,
    /// Token value for a right parenthesis `)`.
    Rpar,
    /// Token value for a left square bracket `[`.
    Lsqb,
    /// Token value for a right square bracket `]`.
    Rsqb,
    /// Token value for a colon `:`.
    Colon,
    /// Token value for a comma `,`.
    Comma,
    /// Token value for a semicolon `;`.
    Semi,
    /// Token value for plus `+`.
    Plus,
    /// Token value for minus `-`.
    Minus,
    /// Token value for star `*`.
    Star,
    /// Token value for slash `/`.
    Slash,
    /// Token value for vertical bar `|`.
    Vbar,
    /// Token value for ampersand `&`.
    Amper,
    /// Token value for less than `<`.
    Less,
    /// Token value for greater than `>`.
    Greater,
    /// Token value for equal `=`.
    Equal,
    /// Token value for dot `.`.
    Dot,
    /// Token value for percent `%`.
    Percent,
    /// Token value for left bracket `{`.
    Lbrace,
    /// Token value for right bracket `}`.
    Rbrace,
    /// Token value for double equal `==`.
    EqEqual,
    /// Token value for not equal `!=`.
    NotEqual,
    /// Token value for less than or equal `<=`.
    LessEqual,
    /// Token value for greater than or equal `>=`.
    GreaterEqual,
    /// Token value for tilde `~`.
    Tilde,
    /// Token value for caret `^`.
    CircumFlex,
    /// Token value for left shift `<<`.
    LeftShift,
    /// Token value for right shift `>>`.
    RightShift,
    /// Token value for double star `**`.
    DoubleStar,
    /// Token value for double star equal `**=`.
    DoubleStarEqual,
    /// Token value for plus equal `+=`.
    PlusEqual,
    /// Token value for minus equal `-=`.
    MinusEqual,
    /// Token value for star equal `*=`.
    StarEqual,
    /// Token value for slash equal `/=`.
    SlashEqual,
    /// Token value for percent equal `%=`.
    PercentEqual,
    /// Token value for ampersand equal `&=`.
    AmperEqual,
    /// Token value for vertical bar equal `|=`.
    VbarEqual,
    /// Token value for caret equal `^=`.
    CircumflexEqual,
    /// Token value for left shift equal `<<=`.
    LeftShiftEqual,
    /// Token value for right shift equal `>>=`.
    RightShiftEqual,
    /// Token value for double slash `//`.
    DoubleSlash,
    /// Token value for double slash equal `//=`.
    DoubleSlashEqual,
    /// Token value for colon equal `:=`.
    ColonEqual,
    /// Token value for at `@`.
    At,
    /// Token value for at equal `@=`.
    AtEqual,
    /// Token value for arrow `->`.
    Rarrow,
    /// Token value for ellipsis `...`.
    Ellipsis,

    // Self documenting.
    // Keywords (alphabetically):
    False,
    None,
    True,

    And,
    As,
    Assert,
    Async,
    Await,
    Break,
    Class,
    Continue,
    Def,
    Del,
    Elif,
    Else,
    Except,
    Finally,
    For,
    From,
    Global,
    If,
    Import,
    In,
    Is,
    Lambda,
    Nonlocal,
    Not,
    Or,
    Pass,
    Raise,
    Return,
    Try,
    While,
    Match,
    Case,
    With,
    Yield,

    // RustPython specific.
    StartModule,
    StartInteractive,
    StartExpression,
}

impl Tok {
    pub fn start_marker(mode: Mode) -> Self {
        match mode {
            Mode::Module => Tok::StartModule,
            Mode::Interactive => Tok::StartInteractive,
            Mode::Expression => Tok::StartExpression,
        }
    }
}

impl fmt::Display for Tok {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use Tok::*;
        match self {
            Name { name } => write!(f, "'{name}'"),
            Int { value } => write!(f, "'{value}'"),
            Float { value } => write!(f, "'{value}'"),
            Complex { real, imag } => write!(f, "{real}j{imag}"),
            String {
                value,
                kind,
                triple_quoted,
            } => {
                let quotes = "\"".repeat(if *triple_quoted { 3 } else { 1 });
                write!(f, "{kind}{quotes}{value}{quotes}")
            }
            Newline => f.write_str("Newline"),
            NonLogicalNewline => f.write_str("NonLogicalNewline"),
            Indent => f.write_str("Indent"),
            Dedent => f.write_str("Dedent"),
            StartModule => f.write_str("StartProgram"),
            StartInteractive => f.write_str("StartInteractive"),
            StartExpression => f.write_str("StartExpression"),
            EndOfFile => f.write_str("EOF"),
            Lpar => f.write_str("'('"),
            Rpar => f.write_str("')'"),
            Lsqb => f.write_str("'['"),
            Rsqb => f.write_str("']'"),
            Colon => f.write_str("':'"),
            Comma => f.write_str("','"),
            Comment(value) => f.write_str(value),
            Semi => f.write_str("';'"),
            Plus => f.write_str("'+'"),
            Minus => f.write_str("'-'"),
            Star => f.write_str("'*'"),
            Slash => f.write_str("'/'"),
            Vbar => f.write_str("'|'"),
            Amper => f.write_str("'&'"),
            Less => f.write_str("'<'"),
            Greater => f.write_str("'>'"),
            Equal => f.write_str("'='"),
            Dot => f.write_str("'.'"),
            Percent => f.write_str("'%'"),
            Lbrace => f.write_str("'{'"),
            Rbrace => f.write_str("'}'"),
            EqEqual => f.write_str("'=='"),
            NotEqual => f.write_str("'!='"),
            LessEqual => f.write_str("'<='"),
            GreaterEqual => f.write_str("'>='"),
            Tilde => f.write_str("'~'"),
            CircumFlex => f.write_str("'^'"),
            LeftShift => f.write_str("'<<'"),
            RightShift => f.write_str("'>>'"),
            DoubleStar => f.write_str("'**'"),
            DoubleStarEqual => f.write_str("'**='"),
            PlusEqual => f.write_str("'+='"),
            MinusEqual => f.write_str("'-='"),
            StarEqual => f.write_str("'*='"),
            SlashEqual => f.write_str("'/='"),
            PercentEqual => f.write_str("'%='"),
            AmperEqual => f.write_str("'&='"),
            VbarEqual => f.write_str("'|='"),
            CircumflexEqual => f.write_str("'^='"),
            LeftShiftEqual => f.write_str("'<<='"),
            RightShiftEqual => f.write_str("'>>='"),
            DoubleSlash => f.write_str("'//'"),
            DoubleSlashEqual => f.write_str("'//='"),
            At => f.write_str("'@'"),
            AtEqual => f.write_str("'@='"),
            Rarrow => f.write_str("'->'"),
            Ellipsis => f.write_str("'...'"),
            False => f.write_str("'False'"),
            None => f.write_str("'None'"),
            True => f.write_str("'True'"),
            And => f.write_str("'and'"),
            As => f.write_str("'as'"),
            Assert => f.write_str("'assert'"),
            Async => f.write_str("'async'"),
            Await => f.write_str("'await'"),
            Break => f.write_str("'break'"),
            Class => f.write_str("'class'"),
            Continue => f.write_str("'continue'"),
            Def => f.write_str("'def'"),
            Del => f.write_str("'del'"),
            Elif => f.write_str("'elif'"),
            Else => f.write_str("'else'"),
            Except => f.write_str("'except'"),
            Finally => f.write_str("'finally'"),
            For => f.write_str("'for'"),
            From => f.write_str("'from'"),
            Global => f.write_str("'global'"),
            If => f.write_str("'if'"),
            Import => f.write_str("'import'"),
            In => f.write_str("'in'"),
            Is => f.write_str("'is'"),
            Lambda => f.write_str("'lambda'"),
            Nonlocal => f.write_str("'nonlocal'"),
            Not => f.write_str("'not'"),
            Or => f.write_str("'or'"),
            Pass => f.write_str("'pass'"),
            Raise => f.write_str("'raise'"),
            Return => f.write_str("'return'"),
            Try => f.write_str("'try'"),
            While => f.write_str("'while'"),
            Match => f.write_str("'match'"),
            Case => f.write_str("'case'"),
            With => f.write_str("'with'"),
            Yield => f.write_str("'yield'"),
            ColonEqual => f.write_str("':='"),
        }
    }
}

/// The kind of string literal as described in the [String and Bytes literals]
/// section of the Python reference.
///
/// [String and Bytes literals]: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
#[derive(PartialEq, Eq, Debug, Clone)]
pub enum StringKind {
    /// A normal string literal with no prefix.
    String,
    /// A f-string literal, with a `f` or `F` prefix.
    FString,
    /// A byte string literal, with a `b` or `B` prefix.
    Bytes,
    /// A raw string literal, with a `r` or `R` prefix.
    RawString,
    /// A raw f-string literal, with a `rf`/`fr` or `rF`/`Fr` or `Rf`/`fR` or `RF`/`FR` prefix.
    RawFString,
    /// A raw byte string literal, with a `rb`/`br` or `rB`/`Br` or `Rb`/`bR` or `RB`/`BR` prefix.
    RawBytes,
    /// A unicode string literal, with a `u` or `U` prefix.
    Unicode,
}

impl TryFrom<char> for StringKind {
    type Error = String;

    fn try_from(ch: char) -> Result<Self, String> {
        match ch {
            'r' | 'R' => Ok(StringKind::RawString),
            'f' | 'F' => Ok(StringKind::FString),
            'u' | 'U' => Ok(StringKind::Unicode),
            'b' | 'B' => Ok(StringKind::Bytes),
            c => Err(format!("Unexpected string prefix: {c}")),
        }
    }
}

impl TryFrom<[char; 2]> for StringKind {
    type Error = String;

    fn try_from(chars: [char; 2]) -> Result<Self, String> {
        match chars {
            ['r' | 'R', 'f' | 'F'] => Ok(StringKind::RawFString),
            ['f' | 'F', 'r' | 'R'] => Ok(StringKind::RawFString),
            ['r' | 'R', 'b' | 'B'] => Ok(StringKind::RawBytes),
            ['b' | 'B', 'r' | 'R'] => Ok(StringKind::RawBytes),
            [c1, c2] => Err(format!("Unexpected string prefix: {c1}{c2}")),
        }
    }
}

impl fmt::Display for StringKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use StringKind::*;
        match self {
            String => f.write_str(""),
            FString => f.write_str("f"),
            Bytes => f.write_str("b"),
            RawString => f.write_str("r"),
            RawFString => f.write_str("rf"),
            RawBytes => f.write_str("rb"),
            Unicode => f.write_str("u"),
        }
    }
}

impl StringKind {
    /// Returns true if the string is a raw string, i,e one of
    /// [`StringKind::RawString`] or [`StringKind::RawFString`] or [`StringKind::RawBytes`].
    pub fn is_raw(&self) -> bool {
        use StringKind::{RawBytes, RawFString, RawString};
        matches!(self, RawString | RawFString | RawBytes)
    }

    /// Returns true if the string is an f-string, i,e one of
    /// [`StringKind::FString`] or [`StringKind::RawFString`].
    pub fn is_fstring(&self) -> bool {
        use StringKind::{FString, RawFString};
        matches!(self, FString | RawFString)
    }

    /// Returns true if the string is a byte string, i,e one of
    /// [`StringKind::Bytes`] or [`StringKind::RawBytes`].
    pub fn is_bytes(&self) -> bool {
        use StringKind::{Bytes, RawBytes};
        matches!(self, Bytes | RawBytes)
    }

    /// Returns true if the string is a unicode string, i,e [`StringKind::Unicode`].
    pub fn is_unicode(&self) -> bool {
        matches!(self, StringKind::Unicode)
    }

    /// Returns the number of characters in the prefix.
    pub fn prefix_len(&self) -> TextSize {
        use StringKind::*;
        let len = match self {
            String => 0,
            RawString | FString | Unicode | Bytes => 1,
            RawFString | RawBytes => 2,
        };
        len.into()
    }
}