//! Defines `Token` (the minimum unit in Erg source code that serves as input to the parser).
use std::collections::VecDeque;
use std::fmt;
use std::hash::{Hash, Hasher};

use erg_common::error::Location;
use erg_common::impl_displayable_deque_stream_for_wrapper;
use erg_common::opcode311::BinOpCode;
use erg_common::str::Str;
use erg_common::traits::{DequeStream, Locational};
// use erg_common::ty::Type;
// use erg_common::typaram::OpKind;
// use erg_common::value::ValueObj;

#[cfg(not(feature = "pylib"))]
use erg_proc_macros::pyclass;
#[cfg(feature = "pylib")]
use pyo3::prelude::*;

/// Semantic names and the names of the symbols themselves are mixed here; this is a holdover from Python.
#[pyclass]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum TokenKind {
    /// e.g. i, p!, $s, T, `+`, `and`, 'd/dx'
    Symbol,
    /// e.g. 0, 1
    NatLit,
    /// e.g. -1, -2
    IntLit,
    /// e.g. 0b101
    BinLit,
    /// e.g. 0o777
    OctLit,
    /// e.g. 0xdeadbeef
    HexLit,
    RatioLit,
    BoolLit,
    StrLit,
    /// e.g. "abc\{
    StrInterpLeft,
    /// e.g. }abc\{
    StrInterpMid,
    /// e.g. }def"
    StrInterpRight,
    NoneLit,
    /// ... (== Ellipsis)
    EllipsisLit,
    InfLit,
    DocComment,
    /// `+` (unary)
    PrePlus,
    /// `-` (unary)
    PreMinus,
    /// ~ (unary)
    PreBitNot,
    // PreAmp,  // & (unary)
    // PreAt,   // @ (unary)
    /// ! (unary)
    Mutate,
    PreStar,    // * (unary)
    PreDblStar, // ** (unary)
    /// ? (postfix)
    Try,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// /
    Slash,
    /// //
    FloorDiv,
    /// **
    Pow,
    /// %
    Mod,
    /// ..
    Closed,
    /// ..<
    RightOpen,
    /// <..
    LeftOpen,
    /// <..<
    Open,
    /// &&
    BitAnd,
    /// ||
    BitOr,
    /// ^^
    BitXor,
    /// <<
    Shl,
    /// >>
    Shr,
    /// <
    Less,
    /// >
    Gre,
    /// <=
    LessEq,
    /// >=
    GreEq,
    /// ==
    DblEq,
    /// !=
    NotEq,
    /// `in`
    InOp,
    /// `notin`
    NotInOp,
    // `contains`
    ContainsOp,
    /// `sub` (subtype of)
    SubOp,
    /// `is!`
    IsOp,
    /// `isnot!`
    IsNotOp,
    /// `and`
    AndOp,
    /// `or`
    OrOp,
    /// `ref` (special unary)
    RefOp,
    /// `ref!` (special unary)
    RefMutOp,
    /// =
    Assign,
    /// <-
    Inclusion,
    /// :=
    Walrus,
    /// ->
    FuncArrow,
    /// =>
    ProcArrow,
    /// (
    LParen,
    /// )
    RParen,
    /// [
    LSqBr,
    /// ]
    RSqBr,
    /// {
    LBrace,
    /// }
    RBrace,
    Indent,
    Dedent,
    /// .
    Dot,
    /// |>
    Pipe,
    /// :
    Colon,
    /// ::
    DblColon,
    /// :>
    SupertypeOf,
    /// <:
    SubtypeOf,
    /// `as`
    As,
    /// ,
    Comma,
    /// ^
    Caret,
    /// &
    Amper,
    /// @
    AtSign,
    /// |
    VBar,
    /// _
    UBar,
    /// \n
    Newline,
    /// ;
    Semi,
    Illegal,
    /// Beginning Of File
    BOF,
    EOF,
}

use TokenKind::*;

#[pyclass]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenCategory {
    Symbol,
    Literal,
    StrInterpLeft,
    StrInterpMid,
    StrInterpRight,
    BinOp,
    UnaryOp,
    /// ? <.. ..
    PostfixOp,
    /// ( [ { Indent
    LEnclosure,
    /// ) ] } Dedent
    REnclosure,
    /// , : :: :> <: . |> :=
    SpecialBinOp,
    /// =
    DefOp,
    /// -> =>
    LambdaOp,
    /// \n ;
    Separator,
    /// ^ &
    Reserved,
    /// @
    AtSign,
    /// |
    VBar,
    /// _
    UBar,
    BOF,
    EOF,
    Illegal,
}

impl fmt::Display for TokenCategory {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{self:?}")
    }
}

impl TokenCategory {
    pub const fn is_block_op(&self) -> bool {
        matches!(self, Self::DefOp | Self::LambdaOp)
    }
}

impl TokenKind {
    pub const fn category(&self) -> TokenCategory {
        match self {
            Symbol => TokenCategory::Symbol,
            NatLit | BinLit | OctLit | HexLit | IntLit | RatioLit | StrLit | BoolLit | NoneLit
            | EllipsisLit | InfLit | DocComment => TokenCategory::Literal,
            StrInterpLeft => TokenCategory::StrInterpLeft,
            StrInterpMid => TokenCategory::StrInterpMid,
            StrInterpRight => TokenCategory::StrInterpRight,
            PrePlus | PreMinus | PreBitNot | Mutate | PreStar | PreDblStar | RefOp | RefMutOp => {
                TokenCategory::UnaryOp
            }
            Try => TokenCategory::PostfixOp,
            Comma | Colon | DblColon | SupertypeOf | SubtypeOf | As | Dot | Pipe | Walrus
            | Inclusion => TokenCategory::SpecialBinOp,
            Assign => TokenCategory::DefOp,
            FuncArrow | ProcArrow => TokenCategory::LambdaOp,
            Semi | Newline => TokenCategory::Separator,
            LParen | LBrace | LSqBr | Indent => TokenCategory::LEnclosure,
            RParen | RBrace | RSqBr | Dedent => TokenCategory::REnclosure,
            Caret | Amper => TokenCategory::Reserved,
            AtSign => TokenCategory::AtSign,
            VBar => TokenCategory::VBar,
            UBar => TokenCategory::UBar,
            BOF => TokenCategory::BOF,
            EOF => TokenCategory::EOF,
            Illegal => TokenCategory::Illegal,
            _ => TokenCategory::BinOp,
        }
    }

    pub const fn precedence(&self) -> Option<usize> {
        let prec = match self {
            Dot | DblColon => 200,                                    // .
            Pow => 190,                                               // **
            PrePlus | PreMinus | PreBitNot | RefOp | RefMutOp => 180, // (unary) + - * ~ ref ref!
            Star | Slash | FloorDiv | Mod => 170,                     // * / // %
            Plus | Minus => 160,                                      // + -
            Shl | Shr => 150,                                         // << >>
            BitAnd => 140,                                            // &&
            BitXor => 130,                                            // ^^
            BitOr => 120,                                             // ||
            Closed | LeftOpen | RightOpen | Open => 100,              // range operators
            Less | Gre | LessEq | GreEq | DblEq | NotEq | InOp | NotInOp | ContainsOp | IsOp
            | IsNotOp => 90, // < > <= >= == != in notin contains is isnot
            AndOp => 80,                                              // and
            OrOp => 70,                                               // or
            FuncArrow | ProcArrow | Inclusion => 60,                  // -> => <-
            Colon | SupertypeOf | SubtypeOf | As => 50,               // : :> <: as
            Comma => 40,                                              // ,
            Assign | Walrus => 20,                                    // = :=
            Newline | Semi => 10,                                     // \n ;
            LParen | LBrace | LSqBr | Indent => 0,                    // ( { [ Indent
            _ => return None,
        };
        Some(prec)
    }

    pub const fn is_right_associative(&self) -> bool {
        matches!(
            self,
            FuncArrow | ProcArrow | Assign /* | PreDollar | PreAt */
        )
    }

    pub const fn is_range_op(&self) -> bool {
        matches!(self, Closed | LeftOpen | RightOpen | Open)
    }
}

impl fmt::Display for TokenKind {
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{self:?}")
    }
}

impl From<TokenKind> for BinOpCode {
    fn from(tk: TokenKind) -> Self {
        match tk {
            Plus => BinOpCode::Add,
            Minus => BinOpCode::Subtract,
            Star => BinOpCode::Multiply,
            Slash => BinOpCode::TrueDivide,
            FloorDiv => BinOpCode::FloorDiv,
            Mod => BinOpCode::Remainder,
            Pow => BinOpCode::Power,
            BitAnd => BinOpCode::And,
            BitOr => BinOpCode::Or,
            BitXor => BinOpCode::Xor,
            Shl => BinOpCode::LShift,
            Shr => BinOpCode::RShift,
            _ => panic!("invalid token kind for binop"),
        }
    }
}
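// Illustrative doc example (a sketch, not taken from the original source): it assumes
// this module is exposed as `erg_parser::token` and uses only `Token::new` and
// `erg_common::str::Str::rc`, both of which appear in this file.
/// A single lexical token: its `kind`, source `content`, and position in the source.
///
/// ```
/// use erg_common::str::Str;
/// use erg_parser::token::{Token, TokenKind};
///
/// // `col_end` is derived from the number of characters in `content`, not its byte length.
/// let t = Token::new(TokenKind::Symbol, Str::rc("foo"), 1, 0);
/// assert_eq!((t.col_begin, t.col_end), (0, 3));
/// ```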
#[pyclass(get_all, set_all)]
#[derive(Clone, Eq)]
pub struct Token {
    pub kind: TokenKind,
    pub content: Str,
    /// 1-origin
    // TODO: since multi-line string literals also exist, a tuple might be more appropriate
    pub lineno: u32,
    /// A pointer from which the token starts (0-origin)
    pub col_begin: u32,
    /// A pointer to the end position of the token.
    /// `col_end - col_begin` does not necessarily equal `content.len()`
    pub col_end: u32,
}

pub const COLON: Token = Token::dummy(TokenKind::Colon, ":");
pub const AS: Token = Token::dummy(TokenKind::As, "as");
pub const DOT: Token = Token::dummy(TokenKind::Dot, ".");
pub const EQUAL: Token = Token::dummy(TokenKind::Assign, "=");

impl fmt::Debug for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Token")
            .field("kind", &self.kind)
            .field("content", &self.content.replace('\n', "\\n"))
            .field("lineno", &self.lineno)
            .field("col_begin", &self.col_begin)
            .field("col_end", &self.col_end)
            .finish()
    }
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?} {}", self.kind, self.content.replace('\n', "\\n"))
    }
}

// the values of lineno and col are not relevant for comparison
// use `deep_eq` if you want to compare them
impl PartialEq for Token {
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        self.is(other.kind) && self.content == other.content
    }
}

impl Hash for Token {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.kind.hash(state);
        self.content.hash(state);
    }
}

impl Locational for Token {
    fn loc(&self) -> Location {
        if self.lineno == 0 {
            Location::Unknown
        } else {
            Location::range(self.lineno, self.col_begin, self.lineno, self.col_end)
        }
    }

    #[inline]
    fn col_end(&self) -> Option<u32> {
        Some(self.col_begin + self.content.len() as u32)
    }
}

impl Token {
    pub const DUMMY: Token = Token {
        kind: TokenKind::Illegal,
        content: Str::ever("DUMMY"),
        lineno: 0,
        col_begin: 0,
        col_end: 0,
    };

    pub const fn dummy(kind: TokenKind, content: &'static str) -> Self {
        Self {
            kind,
            content: Str::ever(content),
            lineno: 0,
            col_begin: 0,
            col_end: 0,
        }
    }

    #[inline]
    pub fn new<S: Into<Str>>(kind: TokenKind, cont: S, lineno: u32, col_begin: u32) -> Self {
        let content = cont.into();
        let col_end = col_begin + content.chars().count() as u32;
        Token {
            kind,
            content,
            lineno,
            col_begin,
            col_end,
        }
    }

    #[inline]
    pub fn new_fake<S: Into<Str>>(
        kind: TokenKind,
        cont: S,
        lineno: u32,
        col_begin: u32,
        col_end: u32,
    ) -> Self {
        Token {
            kind,
            content: cont.into(),
            lineno,
            col_begin,
            col_end,
        }
    }

    pub fn new_with_loc(kind: TokenKind, cont: impl Into<Str>, loc: Location) -> Self {
        Token {
            kind,
            content: cont.into(),
            lineno: loc.ln_begin().unwrap_or(0),
            col_begin: loc.col_begin().unwrap_or(0),
            col_end: loc.col_end().unwrap_or(1),
        }
    }

    #[inline]
    pub fn from_str(kind: TokenKind, cont: &str) -> Self {
        Token {
            kind,
            content: Str::rc(cont),
            lineno: 0,
            col_begin: 0,
            col_end: 0,
        }
    }

    #[inline]
    pub fn symbol(cont: &str) -> Self {
        Self::from_str(TokenKind::Symbol, cont)
    }

    #[inline]
    pub fn symbol_with_line(cont: &str, lineno: u32) -> Self {
        Token {
            kind: TokenKind::Symbol,
            content: Str::rc(cont),
            lineno,
            col_begin: 0,
            col_end: 1,
        }
    }

    pub fn symbol_with_loc<S: Into<Str>>(cont: S, loc: Location) -> Self {
        Token {
            kind: TokenKind::Symbol,
            content: cont.into(),
            lineno: loc.ln_begin().unwrap_or(0),
            col_begin: loc.col_begin().unwrap_or(0),
            col_end: loc.col_end().unwrap_or(1),
        }
    }

    pub const fn static_symbol(s: &'static str) -> Self {
        Token {
            kind: TokenKind::Symbol,
            content: Str::ever(s),
            lineno: 0,
            col_begin: 0,
            col_end: 1,
        }
    }

    pub fn deep_eq(&self, other: &Self) -> bool {
        self.kind == other.kind
            && self.content == other.content
            && self.lineno == other.lineno
            && self.col_begin == other.col_begin
    }

    pub fn loc(&self) -> Location {
        Locational::loc(self)
    }

    pub const fn category(&self) -> TokenCategory {
        self.kind.category()
    }

    pub fn category_is(&self, category: TokenCategory) -> bool {
        self.kind.category() == category
    }

    pub fn is(&self, kind: TokenKind) -> bool {
        self.kind == kind
    }
    pub const fn is_block_op(&self) -> bool {
        self.category().is_block_op()
    }

    pub const fn inspect(&self) -> &Str {
        &self.content
    }

    pub fn is_procedural(&self) -> bool {
        self.inspect().ends_with('!')
    }

    pub fn is_const(&self) -> bool {
        self.inspect().is_uppercase()
    }
}

#[pyclass]
#[derive(Debug, Clone)]
pub struct TokenStream(VecDeque<Token>);

impl_displayable_deque_stream_for_wrapper!(TokenStream, Token);
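// A minimal usage sketch of the API above (illustrative tests, not part of the
// original file); it relies only on items defined in this module and on
// `erg_common::str::Str`, which is already imported at the top of the file.
#[cfg(test)]
mod illustrative_tests {
    use erg_common::str::Str;

    use super::*;

    #[test]
    fn col_end_counts_chars() {
        // `Token::new` derives `col_end` from the number of characters in `content`.
        let t = Token::new(TokenKind::Symbol, Str::rc("foo"), 1, 0);
        assert_eq!(t.col_end, 3);
    }

    #[test]
    fn eq_ignores_position_but_deep_eq_does_not() {
        let a = Token::new(TokenKind::Symbol, Str::rc("x"), 1, 0);
        let b = Token::new(TokenKind::Symbol, Str::rc("x"), 5, 7);
        // `PartialEq` compares only the kind and the content ...
        assert_eq!(a, b);
        // ... while `deep_eq` also compares the position.
        assert!(!a.deep_eq(&b));
    }

    #[test]
    fn kind_queries() {
        assert_eq!(TokenKind::Pow.precedence(), Some(190));
        assert_eq!(TokenKind::Plus.category(), TokenCategory::BinOp);
        assert!(TokenKind::Closed.is_range_op());
    }
}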