//! Lexing `&str` into a sequence of Rust tokens.
//!
//! Note that strictly speaking the parser in this crate is not required to work
//! on tokens which originated from text. Macros, e.g., can synthesize tokens out
//! of thin air. So, ideally, the lexer should be an orthogonal crate. It is,
//! however, convenient to include a text-based lexer here!
//!
//! Note that these tokens, unlike the tokens we feed into the parser, do
//! include info about comments and whitespace.

use std::ops;

use rustc_literal_escaper::{
    EscapeError, Mode, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char,
    unescape_str,
};

use crate::{
    Edition,
    SyntaxKind::{self, *},
    T,
};

pub struct LexedStr<'a> {
    text: &'a str,
    kind: Vec<SyntaxKind>,
    start: Vec<u32>,
    error: Vec<LexError>,
}

struct LexError {
    msg: String,
    token: u32,
}

impl<'a> LexedStr<'a> {
    pub fn new(edition: Edition, text: &'a str) -> LexedStr<'a> {
        let _p = tracing::info_span!("LexedStr::new").entered();
        let mut conv = Converter::new(edition, text);
        if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
            conv.res.push(SHEBANG, conv.offset);
            conv.offset = shebang_len;
        };

        // Re-create the tokenizer from scratch every token because `GuardedStrPrefix` is one token in the lexer
        // but we want to split it to two in edition <2024.
        while let Some(token) =
            rustc_lexer::tokenize(&text[conv.offset..], rustc_lexer::FrontmatterAllowed::No).next()
        {
            let token_text = &text[conv.offset..][..token.len as usize];

            conv.extend_token(&token.kind, token_text);
        }

        conv.finalize_with_eof()
    }

    pub fn single_token(edition: Edition, text: &'a str) -> Option<(SyntaxKind, Option<String>)> {
        if text.is_empty() {
            return None;
        }

        let token = rustc_lexer::tokenize(text, rustc_lexer::FrontmatterAllowed::No).next()?;
        if token.len as usize != text.len() {
            return None;
        }

        let mut conv = Converter::new(edition, text);
        conv.extend_token(&token.kind, text);
        match &*conv.res.kind {
            [kind] => Some((*kind, conv.res.error.pop().map(|it| it.msg))),
            _ => None,
        }
    }

    pub fn as_str(&self) -> &str {
        self.text
    }

    pub fn len(&self) -> usize {
        self.kind.len() - 1
    }

    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    pub fn kind(&self, i: usize) -> SyntaxKind {
        assert!(i < self.len());
        self.kind[i]
    }

    pub fn text(&self, i: usize) -> &str {
        self.range_text(i..i + 1)
    }

    pub fn range_text(&self, r: ops::Range<usize>) -> &str {
        assert!(r.start < r.end && r.end <= self.len());
        let lo = self.start[r.start] as usize;
        let hi = self.start[r.end] as usize;
        &self.text[lo..hi]
    }

    // Naming is hard.
    pub fn text_range(&self, i: usize) -> ops::Range<usize> {
        assert!(i < self.len());
        let lo = self.start[i] as usize;
        let hi = self.start[i + 1] as usize;
        lo..hi
    }

    pub fn text_start(&self, i: usize) -> usize {
        assert!(i <= self.len());
        self.start[i] as usize
    }

    pub fn text_len(&self, i: usize) -> usize {
        assert!(i < self.len());
        let r = self.text_range(i);
        r.end - r.start
    }

    pub fn error(&self, i: usize) -> Option<&str> {
        assert!(i < self.len());
        let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
        Some(self.error[err].msg.as_str())
    }

    pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
        self.error.iter().map(|it| (it.token as usize, it.msg.as_str()))
    }

    fn push(&mut self, kind: SyntaxKind, offset: usize) {
        self.kind.push(kind);
        self.start.push(offset as u32);
    }
}

struct Converter<'a> {
    res: LexedStr<'a>,
    offset: usize,
    edition: Edition,
}

impl<'a> Converter<'a> {
    fn new(edition: Edition, text: &'a str) -> Self {
        Self {
            res: LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() },
            offset: 0,
            edition,
        }
    }

    /// Check for a likely unterminated string by analyzing the content of recent `STRING` tokens.
    fn has_likely_unterminated_string(&self) -> bool {
        let Some(last_idx) = self.res.kind.len().checked_sub(1) else { return false };
        for i in (0..=last_idx).rev().take(5) {
            if self.res.kind[i] == STRING {
                let start = self.res.start[i] as usize;
                let end = self.res.start.get(i + 1).map(|&s| s as usize).unwrap_or(self.offset);
                let content = &self.res.text[start..end];
                if content.contains('(') && (content.contains("//") || content.contains(";\n")) {
                    return true;
                }
            }
        }
        false
    }

    fn finalize_with_eof(mut self) -> LexedStr<'a> {
        self.res.push(EOF, self.offset);
        self.res
    }

    fn push(&mut self, kind: SyntaxKind, len: usize, errors: Vec<String>) {
        self.res.push(kind, self.offset);
        self.offset += len;

        for msg in errors {
            if !msg.is_empty() {
                self.res.error.push(LexError { msg, token: self.res.len() as u32 });
            }
        }
    }

    fn extend_token(&mut self, kind: &rustc_lexer::TokenKind, mut token_text: &str) {
        // A note on an intended tradeoff:
        // We drop some useful information here (see patterns with double dots `..`)
        // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
        // being `u16` that come from `rowan::SyntaxKind`.
        let mut errors: Vec<String> = vec![];

        let syntax_kind = {
            match kind {
                rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
                rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
                    if !terminated {
                        errors.push(
                            "Missing trailing `*/` symbols to terminate the block comment".into(),
                        );
                    }
                    COMMENT
                }
                rustc_lexer::TokenKind::Frontmatter {
                    has_invalid_preceding_whitespace,
                    invalid_infostring,
                } => {
                    if *has_invalid_preceding_whitespace {
                        errors.push("invalid preceding whitespace for frontmatter opening".into());
                    } else if *invalid_infostring {
                        errors.push("invalid infostring for frontmatter".into());
                    }
                    FRONTMATTER
                }
                rustc_lexer::TokenKind::Whitespace => WHITESPACE,

                rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE,
                rustc_lexer::TokenKind::Ident => {
                    SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
                }
                rustc_lexer::TokenKind::InvalidIdent => {
                    errors.push("Ident contains invalid characters".into());
                    IDENT
                }

                rustc_lexer::TokenKind::RawIdent => IDENT,

                rustc_lexer::TokenKind::GuardedStrPrefix if self.edition.at_least_2024() => {
                    // FIXME: rustc does something better for recovery.
                    errors.push("Invalid string literal (reserved syntax)".into());
                    ERROR
                }
                rustc_lexer::TokenKind::GuardedStrPrefix => {
                    // The token is `#"` or `##`, split it into two.
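                    // Because `LexedStr::new` re-creates the tokenizer from `conv.offset` on
                    // every iteration, pushing a one-byte `POUND` here means the remaining
                    // `"` or `#` gets re-lexed as its own token on the next pass.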
                    token_text = &token_text[1..];
                    POUND
                }

                rustc_lexer::TokenKind::Literal { kind, .. } => {
                    self.extend_literal(token_text.len(), kind);
                    return;
                }

                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    if *starts_with_number {
                        errors.push("Lifetime name cannot start with a number".into());
                    }
                    LIFETIME_IDENT
                }
                rustc_lexer::TokenKind::UnknownPrefixLifetime => {
                    errors.push("Unknown lifetime prefix".into());
                    LIFETIME_IDENT
                }
                rustc_lexer::TokenKind::RawLifetime => LIFETIME_IDENT,

                rustc_lexer::TokenKind::Semi => T![;],
                rustc_lexer::TokenKind::Comma => T![,],
                rustc_lexer::TokenKind::Dot => T![.],
                rustc_lexer::TokenKind::OpenParen => T!['('],
                rustc_lexer::TokenKind::CloseParen => T![')'],
                rustc_lexer::TokenKind::OpenBrace => T!['{'],
                rustc_lexer::TokenKind::CloseBrace => T!['}'],
                rustc_lexer::TokenKind::OpenBracket => T!['['],
                rustc_lexer::TokenKind::CloseBracket => T![']'],
                rustc_lexer::TokenKind::At => T![@],
                rustc_lexer::TokenKind::Pound => T![#],
                rustc_lexer::TokenKind::Tilde => T![~],
                rustc_lexer::TokenKind::Question => T![?],
                rustc_lexer::TokenKind::Colon => T![:],
                rustc_lexer::TokenKind::Dollar => T![$],
                rustc_lexer::TokenKind::Eq => T![=],
                rustc_lexer::TokenKind::Bang => T![!],
                rustc_lexer::TokenKind::Lt => T![<],
                rustc_lexer::TokenKind::Gt => T![>],
                rustc_lexer::TokenKind::Minus => T![-],
                rustc_lexer::TokenKind::And => T![&],
                rustc_lexer::TokenKind::Or => T![|],
                rustc_lexer::TokenKind::Plus => T![+],
                rustc_lexer::TokenKind::Star => T![*],
                rustc_lexer::TokenKind::Slash => T![/],
                rustc_lexer::TokenKind::Caret => T![^],
                rustc_lexer::TokenKind::Percent => T![%],
                rustc_lexer::TokenKind::Unknown => ERROR,
                rustc_lexer::TokenKind::UnknownPrefix if token_text == "builtin" => IDENT,
                rustc_lexer::TokenKind::UnknownPrefix => {
                    let has_unterminated = self.has_likely_unterminated_string();
                    let error_msg = if has_unterminated {
                        format!(
                            "unknown literal prefix `{token_text}` (note: check for unterminated string literal)"
                        )
                    } else {
                        "unknown literal prefix".to_owned()
                    };
                    errors.push(error_msg);
                    IDENT
                }
                rustc_lexer::TokenKind::Eof => EOF,
            }
        };

        self.push(syntax_kind, token_text.len(), errors);
    }

    fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) {
        let invalid_raw_msg = String::from("Invalid raw string literal");
        let mut errors = vec![];

        let mut no_end_quote = |c: char, kind: &str| {
            errors.push(format!("Missing trailing `{c}` symbol to terminate the {kind} literal"));
        };

        let syntax_kind = match *kind {
            rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
                if empty_int {
                    errors.push("Missing digits after the integer base prefix".into());
                }
                INT_NUMBER
            }
            rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
                if empty_exponent {
                    errors.push("Missing digits after the exponent symbol".into());
                }
                FLOAT_NUMBER
            }
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
                    no_end_quote('\'', "character");
                } else {
                    let text = &self.res.text[self.offset + 1..][..len - 1];
                    let text = &text[..text.rfind('\'').unwrap()];
                    if let Err(e) = unescape_char(text) {
                        errors.push(err_to_msg(e, Mode::Char));
                    }
                }
                CHAR
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
                    no_end_quote('\'', "byte");
                } else {
                    let text = &self.res.text[self.offset + 2..][..len - 2];
                    let text = &text[..text.rfind('\'').unwrap()];
                    if let Err(e) = unescape_byte(text) {
                        errors.push(err_to_msg(e, Mode::Byte));
                    }
                }
                BYTE
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
                    no_end_quote('"', "string");
                } else {
                    let text = &self.res.text[self.offset + 1..][..len - 1];
                    let text = &text[..text.rfind('"').unwrap()];
                    unescape_str(text, |_, res| {
                        if let Err(e) = res {
                            errors.push(err_to_msg(e, Mode::Str));
                        }
                    });
                }
                STRING
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
                    no_end_quote('"', "byte string");
                } else {
                    let text = &self.res.text[self.offset + 2..][..len - 2];
                    let text = &text[..text.rfind('"').unwrap()];
                    unescape_byte_str(text, |_, res| {
                        if let Err(e) = res {
                            errors.push(err_to_msg(e, Mode::ByteStr));
                        }
                    });
                }
                BYTE_STRING
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
                    no_end_quote('"', "C string");
                } else {
                    let text = &self.res.text[self.offset + 2..][..len - 2];
                    let text = &text[..text.rfind('"').unwrap()];
                    unescape_c_str(text, |_, res| {
                        if let Err(e) = res {
                            errors.push(err_to_msg(e, Mode::CStr));
                        }
                    });
                }
                C_STRING
            }
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                if n_hashes.is_none() {
                    errors.push(invalid_raw_msg);
                }
                STRING
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if n_hashes.is_none() {
                    errors.push(invalid_raw_msg);
                }
                BYTE_STRING
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if n_hashes.is_none() {
                    errors.push(invalid_raw_msg);
                }
                C_STRING
            }
        };

        self.push(syntax_kind, len, errors);
    }
}

fn err_to_msg(error: EscapeError, mode: Mode) -> String {
    match error {
        EscapeError::ZeroChars => "empty character literal",
        EscapeError::MoreThanOneChar => "character literal may only contain one codepoint",
        EscapeError::LoneSlash => "",
        EscapeError::InvalidEscape if mode == Mode::Byte || mode == Mode::ByteStr => {
            "unknown byte escape"
        }
        EscapeError::InvalidEscape => "unknown character escape",
        EscapeError::BareCarriageReturn => "",
        EscapeError::BareCarriageReturnInRawString => "",
        EscapeError::EscapeOnlyChar if mode == Mode::Byte => "byte constant must be escaped",
        EscapeError::EscapeOnlyChar => "character constant must be escaped",
        EscapeError::TooShortHexEscape => "numeric character escape is too short",
        EscapeError::InvalidCharInHexEscape => "invalid character in numeric character escape",
        EscapeError::OutOfRangeHexEscape => "out of range hex escape",
        EscapeError::NoBraceInUnicodeEscape => "incorrect unicode escape sequence",
        EscapeError::InvalidCharInUnicodeEscape => "invalid character in unicode escape",
        EscapeError::EmptyUnicodeEscape => "empty unicode escape",
        EscapeError::UnclosedUnicodeEscape => "unterminated unicode escape",
        EscapeError::LeadingUnderscoreUnicodeEscape => "invalid start of unicode escape",
        EscapeError::OverlongUnicodeEscape => "overlong unicode escape",
        EscapeError::LoneSurrogateUnicodeEscape => "invalid unicode character escape",
        EscapeError::OutOfRangeUnicodeEscape => "invalid unicode character escape",
        EscapeError::UnicodeEscapeInByte => "unicode escape in byte string",
        EscapeError::NonAsciiCharInByte if mode == Mode::Byte => {
            "non-ASCII character in byte literal"
        }
        EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => {
            "non-ASCII character in byte string literal"
        }
        EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal",
        EscapeError::NulInCStr => "null character in C string literal",
        EscapeError::UnskippedWhitespaceWarning => "",
        EscapeError::MultipleSkippedLinesWarning => "",
    }
    .into()
}
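
// A minimal usage sketch (not part of the original file): it illustrates how
// `LexedStr::new` and `LexedStr::single_token` behave, assuming the crate
// exposes an `Edition::CURRENT` constant; adjust the edition value to whatever
// this crate actually provides.
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn lexes_text_into_tokens_with_trivia() {
        // Unlike the tokens fed to the parser, `LexedStr` keeps whitespace and comments.
        let lexed = LexedStr::new(Edition::CURRENT, "let x = 92; // answer");
        let kinds: Vec<SyntaxKind> = (0..lexed.len()).map(|i| lexed.kind(i)).collect();
        assert!(kinds.contains(&WHITESPACE));
        assert!(kinds.contains(&COMMENT));

        // Start offsets are cumulative, so the token texts concatenate back to the input.
        let rebuilt: String = (0..lexed.len()).map(|i| lexed.text(i)).collect();
        assert_eq!(rebuilt, lexed.as_str());
    }

    #[test]
    fn single_token_reports_lex_errors() {
        // `single_token` only succeeds when the whole input is one token; an
        // unterminated literal comes back with its error message attached.
        let (kind, error) =
            LexedStr::single_token(Edition::CURRENT, "\"no closing quote").unwrap();
        assert_eq!(kind, STRING);
        assert!(error.is_some());
    }
}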