internal: move all the lexing to the parser crate

This commit is contained in:
Aleksey Kladov 2021-12-18 17:20:38 +03:00
parent 78926027e3
commit a022ad68c9
16 changed files with 159 additions and 467 deletions

View file

@ -48,7 +48,6 @@ use text_edit::Indel;
pub use crate::{
ast::{AstNode, AstToken},
parsing::lexer::{lex_single_syntax_kind, tokenize, Token},
ptr::{AstPtr, SyntaxNodePtr},
syntax_error::SyntaxError,
syntax_node::{

View file

@ -1,7 +1,6 @@
//! Lexing, bridging to parser (which does the actual parsing) and
//! incremental reparsing.
pub(crate) mod lexer;
mod text_tree_sink;
mod reparsing;
@ -10,18 +9,17 @@ use text_tree_sink::TextTreeSink;
use crate::{syntax_node::GreenNode, AstNode, SyntaxError, SyntaxNode};
pub(crate) use crate::parsing::{lexer::*, reparsing::incremental_reparse};
pub(crate) use crate::parsing::reparsing::incremental_reparse;
pub(crate) fn parse_text(text: &str) -> (GreenNode, Vec<SyntaxError>) {
let (lexer_tokens, lexer_errors) = tokenize(text);
let parser_tokens = to_parser_tokens(text, &lexer_tokens);
let lexed = parser::LexedStr::new(text);
let parser_tokens = lexed.to_tokens();
let mut tree_sink = TextTreeSink::new(text, &lexer_tokens);
let mut tree_sink = TextTreeSink::new(lexed);
parser::parse_source_file(&parser_tokens, &mut tree_sink);
let (tree, mut parser_errors) = tree_sink.finish();
parser_errors.extend(lexer_errors);
let (tree, parser_errors) = tree_sink.finish();
(tree, parser_errors)
}
@ -31,14 +29,13 @@ pub(crate) fn parse_text_as<T: AstNode>(
text: &str,
entry_point: parser::ParserEntryPoint,
) -> Result<T, ()> {
let (lexer_tokens, lexer_errors) = tokenize(text);
if !lexer_errors.is_empty() {
let lexed = parser::LexedStr::new(text);
if lexed.errors().next().is_some() {
return Err(());
}
let parser_tokens = lexed.to_tokens();
let parser_tokens = to_parser_tokens(text, &lexer_tokens);
let mut tree_sink = TextTreeSink::new(text, &lexer_tokens);
let mut tree_sink = TextTreeSink::new(lexed);
// TextTreeSink assumes that there's at least some root node to which it can attach errors and
// tokens. We arbitrarily give it a SourceFile.
@ -54,29 +51,3 @@ pub(crate) fn parse_text_as<T: AstNode>(
SyntaxNode::new_root(tree).first_child().and_then(T::cast).ok_or(())
}
/// Converts the lexer's token stream into the parser's input representation.
///
/// Trivia tokens are dropped. Identifiers are pushed via `push_ident`, carrying the
/// contextual keyword kind their text maps to (or plain `IDENT`). Every other token
/// is pushed with its own kind and is marked as joint when it directly follows
/// another non-trivia token.
pub(crate) fn to_parser_tokens(text: &str, lexer_tokens: &[lexer::Token]) -> ::parser::Tokens {
    let mut tokens = parser::Tokens::default();
    // Byte offset of the next token within `text`.
    let mut next_start = 0;
    // Whether the previously seen token was non-trivia, i.e. touches the current one.
    let mut follows_token = false;

    for token in lexer_tokens {
        let len = usize::from(token.len);
        let start = next_start;
        next_start += len;

        if token.kind.is_trivia() {
            follows_token = false;
            continue;
        }

        match token.kind {
            SyntaxKind::IDENT => {
                let ident_text = &text[start..start + len];
                let kind =
                    SyntaxKind::from_contextual_keyword(ident_text).unwrap_or(SyntaxKind::IDENT);
                tokens.push_ident(kind);
            }
            kind => {
                // Jointness is only recorded for non-identifier tokens.
                if follows_token {
                    tokens.was_joint();
                }
                tokens.push(kind);
            }
        }
        follows_token = true;
    }

    tokens
}

View file

@ -1,249 +0,0 @@
//! The lexer analyzes a raw input string and produces lexemes (tokens).
//! It is just a bridge to `rustc_lexer`.
use std::convert::TryInto;
use rustc_lexer::RawStrError;
use crate::{
SyntaxError,
SyntaxKind::{self, *},
TextRange, TextSize, T,
};
/// A token of Rust source.
///
/// Only the kind and the length are stored; a token carries neither its text nor
/// its offset into the source.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Token {
/// The kind of token.
pub kind: SyntaxKind,
/// The length of the token.
pub len: TextSize,
}
/// Splits `text` into its component tokens.
///
/// Returns the tokens in source order together with the errors reported for
/// malformed tokens. A leading shebang is checked for first and is emitted as a
/// `SHEBANG` token, so its length contributes to the offsets of the tokens that
/// follow it.
pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    // non-empty string is a precondition of `rustc_lexer::strip_shebang()`.
    if text.is_empty() {
        return (tokens, errors);
    }

    // Byte position in `text` of the token currently being processed.
    let mut pos = 0usize;
    if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
        tokens.push(Token { kind: SHEBANG, len: shebang_len.try_into().unwrap() });
        pos = shebang_len;
    }

    for raw in rustc_lexer::tokenize(&text[pos..]) {
        let len: TextSize = raw.len.try_into().unwrap();
        let range = TextRange::at(pos.try_into().unwrap(), len);

        let (kind, message) = rustc_token_kind_to_syntax_kind(&raw.kind, &text[range]);
        tokens.push(Token { kind, len });
        errors.extend(message.map(|message| SyntaxError::new(message, range)));

        pos += raw.len;
    }

    (tokens, errors)
}
/// Lexes `text` as exactly one token.
///
/// Returns the token's `SyntaxKind` and an optional `SyntaxError` when the first
/// token spans the whole of `text`. Returns `None` if the string contains zero
/// *or two or more* tokens. The token is malformed if the returned error is not
/// `None`.
///
/// Beware that unescape errors are not checked at tokenization time.
pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)> {
    lex_first_token(text)
        // Anything left over after the first token means there is more than one token.
        .filter(|(token, _)| token.len == TextSize::of(text))
        .map(|(token, error)| (token.kind, error))
}
/// Lexes the first token found at the beginning of `text`.
///
/// Returns `None` only if `text` is empty, i.e. it contains zero tokens.
/// Otherwise returns the token together with an optional error: the token is
/// malformed if the returned error is not `None`. The error range covers exactly
/// the first token.
///
/// Beware that unescape errors are not checked at tokenization time.
fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
    // non-empty string is a precondition of `rustc_lexer::first_token()`.
    if text.is_empty() {
        return None;
    }

    let rustc_token = rustc_lexer::first_token(text);
    // Classify using only the token's own text: keyword and `_` detection compare
    // against the whole string, so trailing input would misclassify the token.
    let token_text = &text[..rustc_token.len];
    let (syntax_kind, err_message) =
        rustc_token_kind_to_syntax_kind(&rustc_token.kind, token_text);

    let token = Token { kind: syntax_kind, len: rustc_token.len.try_into().unwrap() };
    // Attribute the error to the token itself, not to whatever follows it.
    let optional_error =
        err_message.map(|err_message| SyntaxError::new(err_message, TextRange::up_to(token.len)));

    Some((token, optional_error))
}
/// Returns `SyntaxKind` and an optional tokenize error message.
fn rustc_token_kind_to_syntax_kind(
rustc_token_kind: &rustc_lexer::TokenKind,
token_text: &str,
) -> (SyntaxKind, Option<&'static str>) {
// A note on an intended tradeoff:
// We drop some useful information here (see patterns with double dots `..`)
// Storing that info in `SyntaxKind` is not possible due to its layout requirements of
// being `u16` that come from `rowan::SyntaxKind`.
let syntax_kind = {
match rustc_token_kind {
rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated: true } => COMMENT,
rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated: false } => {
return (
COMMENT,
Some("Missing trailing `*/` symbols to terminate the block comment"),
);
}
rustc_lexer::TokenKind::Whitespace => WHITESPACE,
rustc_lexer::TokenKind::Ident => {
if token_text == "_" {
UNDERSCORE
} else {
SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
}
}
rustc_lexer::TokenKind::RawIdent => IDENT,
rustc_lexer::TokenKind::Literal { kind, .. } => return match_literal_kind(kind),
rustc_lexer::TokenKind::Lifetime { starts_with_number: false } => LIFETIME_IDENT,
rustc_lexer::TokenKind::Lifetime { starts_with_number: true } => {
return (LIFETIME_IDENT, Some("Lifetime name cannot start with a number"))
}
rustc_lexer::TokenKind::Semi => T![;],
rustc_lexer::TokenKind::Comma => T![,],
rustc_lexer::TokenKind::Dot => T![.],
rustc_lexer::TokenKind::OpenParen => T!['('],
rustc_lexer::TokenKind::CloseParen => T![')'],
rustc_lexer::TokenKind::OpenBrace => T!['{'],
rustc_lexer::TokenKind::CloseBrace => T!['}'],
rustc_lexer::TokenKind::OpenBracket => T!['['],
rustc_lexer::TokenKind::CloseBracket => T![']'],
rustc_lexer::TokenKind::At => T![@],
rustc_lexer::TokenKind::Pound => T![#],
rustc_lexer::TokenKind::Tilde => T![~],
rustc_lexer::TokenKind::Question => T![?],
rustc_lexer::TokenKind::Colon => T![:],
rustc_lexer::TokenKind::Dollar => T![$],
rustc_lexer::TokenKind::Eq => T![=],
rustc_lexer::TokenKind::Bang => T![!],
rustc_lexer::TokenKind::Lt => T![<],
rustc_lexer::TokenKind::Gt => T![>],
rustc_lexer::TokenKind::Minus => T![-],
rustc_lexer::TokenKind::And => T![&],
rustc_lexer::TokenKind::Or => T![|],
rustc_lexer::TokenKind::Plus => T![+],
rustc_lexer::TokenKind::Star => T![*],
rustc_lexer::TokenKind::Slash => T![/],
rustc_lexer::TokenKind::Caret => T![^],
rustc_lexer::TokenKind::Percent => T![%],
rustc_lexer::TokenKind::Unknown => ERROR,
}
};
return (syntax_kind, None);
fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
let mut err = "";
let syntax_kind = match *kind {
rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
if empty_int {
err = "Missing digits after the integer base prefix";
}
INT_NUMBER
}
rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
if empty_exponent {
err = "Missing digits after the exponent symbol";
}
FLOAT_NUMBER
}
rustc_lexer::LiteralKind::Char { terminated } => {
if !terminated {
err = "Missing trailing `'` symbol to terminate the character literal";
}
CHAR
}
rustc_lexer::LiteralKind::Byte { terminated } => {
if !terminated {
err = "Missing trailing `'` symbol to terminate the byte literal";
}
BYTE
}
rustc_lexer::LiteralKind::Str { terminated } => {
if !terminated {
err = "Missing trailing `\"` symbol to terminate the string literal";
}
STRING
}
rustc_lexer::LiteralKind::ByteStr { terminated } => {
if !terminated {
err = "Missing trailing `\"` symbol to terminate the byte string literal";
}
BYTE_STRING
}
rustc_lexer::LiteralKind::RawStr { err: raw_str_err, .. } => {
if let Some(raw_str_err) = raw_str_err {
err = match raw_str_err {
RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw string literal",
RawStrError::NoTerminator { expected, found, .. } => if expected == found {
"Missing trailing `\"` to terminate the raw string literal"
} else {
"Missing trailing `\"` with `#` symbols to terminate the raw string literal"
},
RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
};
};
STRING
}
rustc_lexer::LiteralKind::RawByteStr { err: raw_str_err, .. } => {
if let Some(raw_str_err) = raw_str_err {
err = match raw_str_err {
RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw byte string literal",
RawStrError::NoTerminator { expected, found, .. } => if expected == found {
"Missing trailing `\"` to terminate the raw byte string literal"
} else {
"Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"
},
RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols",
};
};
BYTE_STRING
}
};
let err = if err.is_empty() { None } else { Some(err) };
(syntax_kind, err)
}
}

View file

@ -10,11 +10,7 @@ use parser::Reparser;
use text_edit::Indel;
use crate::{
parsing::{
lexer::{lex_single_syntax_kind, tokenize, Token},
text_tree_sink::TextTreeSink,
to_parser_tokens,
},
parsing::text_tree_sink::TextTreeSink,
syntax_node::{GreenNode, GreenToken, NodeOrToken, SyntaxElement, SyntaxNode},
SyntaxError,
SyntaxKind::*,
@ -53,7 +49,7 @@ fn reparse_token(
}
let mut new_text = get_text_after_edit(prev_token.clone().into(), edit);
let (new_token_kind, new_err) = lex_single_syntax_kind(&new_text)?;
let (new_token_kind, new_err) = parser::LexedStr::single_token(&new_text)?;
if new_token_kind != prev_token_kind
|| (new_token_kind == IDENT && is_contextual_kw(&new_text))
@ -66,7 +62,7 @@ fn reparse_token(
// `b` no longer remains an identifier, but becomes a part of byte string literal
if let Some(next_char) = root.text().char_at(prev_token.text_range().end()) {
new_text.push(next_char);
let token_with_next_char = lex_single_syntax_kind(&new_text);
let token_with_next_char = parser::LexedStr::single_token(&new_text);
if let Some((_kind, _error)) = token_with_next_char {
return None;
}
@ -74,9 +70,10 @@ fn reparse_token(
}
let new_token = GreenToken::new(rowan::SyntaxKind(prev_token_kind.into()), &new_text);
let range = TextRange::up_to(TextSize::of(&new_text));
Some((
prev_token.replace_with(new_token),
new_err.into_iter().collect(),
new_err.into_iter().map(|msg| SyntaxError::new(msg, range)).collect(),
prev_token.text_range(),
))
}
@ -91,17 +88,17 @@ fn reparse_block(
let (node, reparser) = find_reparsable_node(root, edit.delete)?;
let text = get_text_after_edit(node.clone().into(), edit);
let (lexer_tokens, new_lexer_errors) = tokenize(&text);
if !is_balanced(&lexer_tokens) {
let lexed = parser::LexedStr::new(text.as_str());
let parser_tokens = lexed.to_tokens();
if !is_balanced(&lexed) {
return None;
}
let parser_tokens = to_parser_tokens(&text, &lexer_tokens);
let mut tree_sink = TextTreeSink::new(&text, &lexer_tokens);
let mut tree_sink = TextTreeSink::new(lexed);
reparser.parse(&parser_tokens, &mut tree_sink);
let (green, mut new_parser_errors) = tree_sink.finish();
new_parser_errors.extend(new_lexer_errors);
let (green, new_parser_errors) = tree_sink.finish();
Some((node.replace_with(green), new_parser_errors, node.text_range()))
}
@ -131,16 +128,13 @@ fn find_reparsable_node(node: &SyntaxNode, range: TextRange) -> Option<(SyntaxNo
})
}
fn is_balanced(tokens: &[Token]) -> bool {
if tokens.is_empty()
|| tokens.first().unwrap().kind != T!['{']
|| tokens.last().unwrap().kind != T!['}']
{
fn is_balanced(lexed: &parser::LexedStr<'_>) -> bool {
if lexed.is_empty() || lexed.kind(0) != T!['{'] || lexed.kind(lexed.len() - 1) != T!['}'] {
return false;
}
let mut balance = 0usize;
for t in &tokens[1..tokens.len() - 1] {
match t.kind {
for i in 1..lexed.len() - 1 {
match lexed.kind(i) {
T!['{'] => balance += 1,
T!['}'] => {
balance = match balance.checked_sub(1) {

View file

@ -2,25 +2,22 @@
use std::mem;
use parser::{ParseError, TreeSink};
use parser::{LexedStr, ParseError, TreeSink};
use crate::{
ast,
parsing::Token,
syntax_node::GreenNode,
SyntaxError,
SyntaxKind::{self, *},
SyntaxTreeBuilder, TextRange, TextSize,
SyntaxTreeBuilder, TextRange,
};
/// Bridges the parser with our specific syntax tree representation.
///
/// `TextTreeSink` also handles attachment of trivia (whitespace) to nodes.
pub(crate) struct TextTreeSink<'a> {
text: &'a str,
tokens: &'a [Token],
text_pos: TextSize,
token_pos: usize,
lexed: LexedStr<'a>,
pos: usize,
state: State,
inner: SyntaxTreeBuilder,
}
@ -39,12 +36,7 @@ impl<'a> TreeSink for TextTreeSink<'a> {
State::Normal => (),
}
self.eat_trivias();
let n_tokens = n_tokens as usize;
let len = self.tokens[self.token_pos..self.token_pos + n_tokens]
.iter()
.map(|it| it.len)
.sum::<TextSize>();
self.do_token(kind, len, n_tokens);
self.do_token(kind, n_tokens as usize);
}
fn start_node(&mut self, kind: SyntaxKind) {
@ -60,20 +52,12 @@ impl<'a> TreeSink for TextTreeSink<'a> {
}
let n_trivias =
self.tokens[self.token_pos..].iter().take_while(|it| it.kind.is_trivia()).count();
let leading_trivias = &self.tokens[self.token_pos..self.token_pos + n_trivias];
let mut trivia_end =
self.text_pos + leading_trivias.iter().map(|it| it.len).sum::<TextSize>();
let n_attached_trivias = {
let leading_trivias = leading_trivias.iter().rev().map(|it| {
let next_end = trivia_end - it.len;
let range = TextRange::new(next_end, trivia_end);
trivia_end = next_end;
(it.kind, &self.text[range])
});
n_attached_trivias(kind, leading_trivias)
};
(self.pos..self.lexed.len()).take_while(|&it| self.lexed.kind(it).is_trivia()).count();
let leading_trivias = self.pos..self.pos + n_trivias;
let n_attached_trivias = n_attached_trivias(
kind,
leading_trivias.rev().map(|it| (self.lexed.kind(it), self.lexed.text(it))),
);
self.eat_n_trivias(n_trivias - n_attached_trivias);
self.inner.start_node(kind);
self.eat_n_trivias(n_attached_trivias);
@ -88,20 +72,14 @@ impl<'a> TreeSink for TextTreeSink<'a> {
}
fn error(&mut self, error: ParseError) {
self.inner.error(error, self.text_pos);
let text_pos = self.lexed.text_start(self.pos).try_into().unwrap();
self.inner.error(error, text_pos);
}
}
impl<'a> TextTreeSink<'a> {
pub(super) fn new(text: &'a str, tokens: &'a [Token]) -> Self {
Self {
text,
tokens,
text_pos: 0.into(),
token_pos: 0,
state: State::PendingStart,
inner: SyntaxTreeBuilder::default(),
}
pub(super) fn new(lexed: parser::LexedStr<'a>) -> Self {
Self { lexed, pos: 0, state: State::PendingStart, inner: SyntaxTreeBuilder::default() }
}
pub(super) fn finish_eof(mut self) -> (GreenNode, Vec<SyntaxError>, bool) {
@ -113,8 +91,17 @@ impl<'a> TextTreeSink<'a> {
State::PendingStart | State::Normal => unreachable!(),
}
let (node, errors) = self.inner.finish_raw();
let is_eof = self.token_pos == self.tokens.len();
let (node, mut errors) = self.inner.finish_raw();
for (i, err) in self.lexed.errors() {
let text_range = self.lexed.text_range(i);
let text_range = TextRange::new(
text_range.start.try_into().unwrap(),
text_range.end.try_into().unwrap(),
);
errors.push(SyntaxError::new(err, text_range))
}
let is_eof = self.pos == self.lexed.len();
(node, errors, is_eof)
}
@ -125,27 +112,26 @@ impl<'a> TextTreeSink<'a> {
}
fn eat_trivias(&mut self) {
while let Some(&token) = self.tokens.get(self.token_pos) {
if !token.kind.is_trivia() {
while self.pos < self.lexed.len() {
let kind = self.lexed.kind(self.pos);
if !kind.is_trivia() {
break;
}
self.do_token(token.kind, token.len, 1);
self.do_token(kind, 1);
}
}
fn eat_n_trivias(&mut self, n: usize) {
for _ in 0..n {
let token = self.tokens[self.token_pos];
assert!(token.kind.is_trivia());
self.do_token(token.kind, token.len, 1);
let kind = self.lexed.kind(self.pos);
assert!(kind.is_trivia());
self.do_token(kind, 1);
}
}
fn do_token(&mut self, kind: SyntaxKind, len: TextSize, n_tokens: usize) {
let range = TextRange::at(self.text_pos, len);
let text = &self.text[range];
self.text_pos += len;
self.token_pos += n_tokens;
fn do_token(&mut self, kind: SyntaxKind, n_tokens: usize) {
let text = &self.lexed.range_text(self.pos..self.pos + n_tokens);
self.pos += n_tokens;
self.inner.token(kind, text);
}
}

View file

@ -3,7 +3,6 @@ mod sourcegen_ast;
mod ast_src;
use std::{
fmt::Write,
fs,
path::{Path, PathBuf},
};
@ -13,25 +12,7 @@ use expect_test::expect_file;
use rayon::prelude::*;
use test_utils::{bench, bench_fixture, project_root};
use crate::{ast, fuzz, tokenize, AstNode, SourceFile, SyntaxError, TextRange, TextSize, Token};
// Runs `tokenize` on the inputs of the `lexer/ok` and `lexer/err` fixture directories
// and hands `dir_tests` a dump of the resulting tokens and errors for each input.
// NOTE(review): presumably `dir_tests` compares that dump against an expected file
// stored next to each fixture — confirm against its definition.
#[test]
fn lexer_tests() {
// FIXME:
// * Add tests for unicode escapes in byte-character and [raw]-byte-string literals
// * Add tests for unescape errors
// `lexer/ok` inputs: checked with `assert_errors_are_absent` before dumping.
dir_tests(&test_data_dir(), &["lexer/ok"], "txt", |text, path| {
let (tokens, errors) = tokenize(text);
assert_errors_are_absent(&errors, path);
dump_tokens_and_errors(&tokens, &errors, text)
});
// `lexer/err` inputs: checked with `assert_errors_are_present` before dumping.
dir_tests(&test_data_dir(), &["lexer/err"], "txt", |text, path| {
let (tokens, errors) = tokenize(text);
assert_errors_are_present(&errors, path);
dump_tokens_and_errors(&tokens, &errors, text)
});
}
use crate::{ast, fuzz, AstNode, SourceFile, SyntaxError};
#[test]
fn parse_smoke_test() {
@ -206,22 +187,6 @@ fn assert_errors_are_absent(errors: &[SyntaxError], path: &Path) {
);
}
fn dump_tokens_and_errors(tokens: &[Token], errors: &[SyntaxError], text: &str) -> String {
let mut acc = String::new();
let mut offset: TextSize = 0.into();
for token in tokens {
let token_len = token.len;
let token_text = &text[TextRange::at(offset, token.len)];
offset += token.len;
writeln!(acc, "{:?} {:?} {:?}", token.kind, token_len, token_text).unwrap();
}
for err in errors {
writeln!(acc, "> error{:?} token({:?}) msg({})", err.range(), &text[err.range()], err)
.unwrap();
}
acc
}
fn fragment_parser_dir_test<T, F>(ok_paths: &[&str], err_paths: &[&str], f: F)
where
T: crate::AstNode,