Store token trees in contiguous Vec instead of as a tree

I expected this to be faster (due to fewer allocations and better cache locality), but benchmarking shows it is not (nor is it slower). Memory usage, however, drops by ~50mb (of `analysis-stats .`). I guess tt construction is just not hot.

This also paves the way to using even less memory for token trees by compressing equal spans, which I plan to do right after.

Some workflows are more easily expressed with a flat tt, while others are better expressed with a tree. With the right helpers, though (finding which was mostly a matter of trial and error), even the worst workflows become very easy indeed.
This commit is contained in:
Chayim Refael Friedman 2024-10-18 10:16:08 +03:00
parent 1c6b83852b
commit ceba289f80
50 changed files with 2356 additions and 2286 deletions

View file

@ -5,17 +5,14 @@ use std::fmt;
use intern::Symbol;
use rustc_hash::{FxHashMap, FxHashSet};
use span::{Edition, SpanAnchor, SpanData, SpanMap};
use stdx::{format_to, never, non_empty_vec::NonEmptyVec};
use stdx::{format_to, never};
use syntax::{
ast::{self, make::tokens::doc_comment},
format_smolstr, AstToken, Parse, PreorderWithTokens, SmolStr, SyntaxElement,
SyntaxKind::{self, *},
SyntaxNode, SyntaxToken, SyntaxTreeBuilder, TextRange, TextSize, WalkEvent, T,
};
use tt::{
buffer::{Cursor, TokenBuffer},
token_to_literal,
};
use tt::{buffer::Cursor, token_to_literal};
pub mod prettify_macro_expansion;
mod to_parser_input;
@ -99,7 +96,7 @@ pub fn syntax_node_to_token_tree<Ctx, SpanMap>(
map: SpanMap,
span: SpanData<Ctx>,
mode: DocCommentDesugarMode,
) -> tt::Subtree<SpanData<Ctx>>
) -> tt::TopSubtree<SpanData<Ctx>>
where
SpanData<Ctx>: Copy + fmt::Debug,
SpanMap: SpanMapper<SpanData<Ctx>>,
@ -118,7 +115,7 @@ pub fn syntax_node_to_token_tree_modified<Ctx, SpanMap>(
remove: FxHashSet<SyntaxElement>,
call_site: SpanData<Ctx>,
mode: DocCommentDesugarMode,
) -> tt::Subtree<SpanData<Ctx>>
) -> tt::TopSubtree<SpanData<Ctx>>
where
SpanMap: SpanMapper<SpanData<Ctx>>,
SpanData<Ctx>: Copy + fmt::Debug,
@ -142,7 +139,7 @@ where
/// Converts a [`tt::Subtree`] back to a [`SyntaxNode`].
/// The produced `SpanMap` contains a mapping from the syntax nodes offsets to the subtree's spans.
pub fn token_tree_to_syntax_node<Ctx>(
tt: &tt::Subtree<SpanData<Ctx>>,
tt: &tt::TopSubtree<SpanData<Ctx>>,
entry_point: parser::TopEntryPoint,
edition: parser::Edition,
) -> (Parse<SyntaxNode>, SpanMap<Ctx>)
@ -150,16 +147,10 @@ where
SpanData<Ctx>: Copy + fmt::Debug,
Ctx: PartialEq,
{
let buffer = match tt {
tt::Subtree {
delimiter: tt::Delimiter { kind: tt::DelimiterKind::Invisible, .. },
token_trees,
} => TokenBuffer::from_tokens(token_trees),
_ => TokenBuffer::from_subtree(tt),
};
let parser_input = to_parser_input(edition, &buffer);
let buffer = tt.view().strip_invisible();
let parser_input = to_parser_input(edition, buffer);
let parser_output = entry_point.parse(&parser_input, edition);
let mut tree_sink = TtTreeSink::new(buffer.begin());
let mut tree_sink = TtTreeSink::new(buffer.cursor());
for event in parser_output.iter() {
match event {
parser::Step::Token { kind, n_input_tokens: n_raw_tokens } => {
@ -183,7 +174,7 @@ pub fn parse_to_token_tree<Ctx>(
anchor: SpanAnchor,
ctx: Ctx,
text: &str,
) -> Option<tt::Subtree<SpanData<Ctx>>>
) -> Option<tt::TopSubtree<SpanData<Ctx>>>
where
SpanData<Ctx>: Copy + fmt::Debug,
Ctx: Copy,
@ -202,7 +193,7 @@ pub fn parse_to_token_tree_static_span<S>(
edition: Edition,
span: S,
text: &str,
) -> Option<tt::Subtree<S>>
) -> Option<tt::TopSubtree<S>>
where
S: Copy + fmt::Debug,
{
@ -215,47 +206,38 @@ where
Some(convert_tokens(&mut conv))
}
fn convert_tokens<S, C>(conv: &mut C) -> tt::Subtree<S>
fn convert_tokens<S, C>(conv: &mut C) -> tt::TopSubtree<S>
where
C: TokenConverter<S>,
S: Copy + fmt::Debug,
C::Token: fmt::Debug,
{
let entry = tt::SubtreeBuilder {
delimiter: tt::Delimiter::invisible_spanned(conv.call_site()),
token_trees: vec![],
};
let mut stack = NonEmptyVec::new(entry);
let mut builder =
tt::TopSubtreeBuilder::new(tt::Delimiter::invisible_spanned(conv.call_site()));
while let Some((token, abs_range)) = conv.bump() {
let tt::SubtreeBuilder { delimiter, token_trees } = stack.last_mut();
let delimiter = builder.expected_delimiter().map(|it| it.kind);
let tt = match token.as_leaf() {
Some(leaf) => tt::TokenTree::Leaf(leaf.clone()),
Some(leaf) => leaf.clone(),
None => match token.kind(conv) {
// Desugar doc comments into doc attributes
COMMENT => {
let span = conv.span_for(abs_range);
if let Some(tokens) = conv.convert_doc_comment(&token, span) {
token_trees.extend(tokens);
}
conv.convert_doc_comment(&token, span, &mut builder);
continue;
}
kind if kind.is_punct() && kind != UNDERSCORE => {
let expected = match delimiter.kind {
tt::DelimiterKind::Parenthesis => Some(T![')']),
tt::DelimiterKind::Brace => Some(T!['}']),
tt::DelimiterKind::Bracket => Some(T![']']),
tt::DelimiterKind::Invisible => None,
let expected = match delimiter {
Some(tt::DelimiterKind::Parenthesis) => Some(T![')']),
Some(tt::DelimiterKind::Brace) => Some(T!['}']),
Some(tt::DelimiterKind::Bracket) => Some(T![']']),
Some(tt::DelimiterKind::Invisible) | None => None,
};
// Current token is a closing delimiter that we expect, fix up the closing span
// and end the subtree here
if matches!(expected, Some(expected) if expected == kind) {
if let Some(mut subtree) = stack.pop() {
subtree.delimiter.close = conv.span_for(abs_range);
stack.last_mut().token_trees.push(subtree.build().into());
}
builder.close(conv.span_for(abs_range));
continue;
}
@ -268,16 +250,7 @@ where
// Start a new subtree
if let Some(kind) = delim {
let open = conv.span_for(abs_range);
stack.push(tt::SubtreeBuilder {
delimiter: tt::Delimiter {
open,
// will be overwritten on subtree close above
close: open,
kind,
},
token_trees: vec![],
});
builder.open(kind, conv.span_for(abs_range));
continue;
}
@ -289,7 +262,6 @@ where
panic!("Token from lexer must be single char: token = {token:#?}")
};
tt::Leaf::from(tt::Punct { char, spacing, span: conv.span_for(abs_range) })
.into()
}
kind => {
macro_rules! make_ident {
@ -320,7 +292,7 @@ where
span: conv
.span_for(TextRange::at(abs_range.start(), TextSize::of('\''))),
});
token_trees.push(apostrophe.into());
builder.push(apostrophe);
let ident = tt::Leaf::from(tt::Ident {
sym: Symbol::intern(&token.to_text(conv)[1..]),
@ -330,47 +302,26 @@ where
)),
is_raw: tt::IdentIsRaw::No,
});
token_trees.push(ident.into());
builder.push(ident);
continue;
}
_ => continue,
};
leaf.into()
leaf
}
},
};
token_trees.push(tt);
builder.push(tt);
}
// If we get here, we've consumed all input tokens.
// We might have more than one subtree in the stack, if the delimiters are improperly balanced.
// Merge them so we're left with one.
while let Some(entry) = stack.pop() {
let parent = stack.last_mut();
builder.flatten_unclosed_subtrees();
let leaf: tt::Leaf<_> = tt::Punct {
span: entry.delimiter.open,
char: match entry.delimiter.kind {
tt::DelimiterKind::Parenthesis => '(',
tt::DelimiterKind::Brace => '{',
tt::DelimiterKind::Bracket => '[',
tt::DelimiterKind::Invisible => '$',
},
spacing: tt::Spacing::Alone,
}
.into();
parent.token_trees.push(leaf.into());
parent.token_trees.extend(entry.token_trees);
}
let subtree = stack.into_last().build();
if let [tt::TokenTree::Subtree(first)] = &*subtree.token_trees {
first.clone()
} else {
subtree
}
builder.build_skip_top_subtree()
}
fn is_single_token_op(kind: SyntaxKind) -> bool {
@ -436,25 +387,17 @@ fn convert_doc_comment<S: Copy>(
token: &syntax::SyntaxToken,
span: S,
mode: DocCommentDesugarMode,
) -> Option<Vec<tt::TokenTree<S>>> {
let comment = ast::Comment::cast(token.clone())?;
let doc = comment.kind().doc?;
builder: &mut tt::TopSubtreeBuilder<S>,
) {
let Some(comment) = ast::Comment::cast(token.clone()) else { return };
let Some(doc) = comment.kind().doc else { return };
let mk_ident = |s: &str| {
tt::TokenTree::from(tt::Leaf::from(tt::Ident {
sym: Symbol::intern(s),
span,
is_raw: tt::IdentIsRaw::No,
}))
tt::Leaf::from(tt::Ident { sym: Symbol::intern(s), span, is_raw: tt::IdentIsRaw::No })
};
let mk_punct = |c: char| {
tt::TokenTree::from(tt::Leaf::from(tt::Punct {
char: c,
spacing: tt::Spacing::Alone,
span,
}))
};
let mk_punct =
|c: char| tt::Leaf::from(tt::Punct { char: c, spacing: tt::Spacing::Alone, span });
let mk_doc_literal = |comment: &ast::Comment| {
let prefix_len = comment.prefix().len();
@ -467,24 +410,20 @@ fn convert_doc_comment<S: Copy>(
let (text, kind) = desugar_doc_comment_text(text, mode);
let lit = tt::Literal { symbol: text, span, kind, suffix: None };
tt::TokenTree::from(tt::Leaf::from(lit))
tt::Leaf::from(lit)
};
// Make `doc="\" Comments\""
let meta_tkns = Box::new([mk_ident("doc"), mk_punct('='), mk_doc_literal(&comment)]);
let meta_tkns = [mk_ident("doc"), mk_punct('='), mk_doc_literal(&comment)];
// Make `#![]`
let mut token_trees = Vec::with_capacity(3);
token_trees.push(mk_punct('#'));
builder.push(mk_punct('#'));
if let ast::CommentPlacement::Inner = doc {
token_trees.push(mk_punct('!'));
builder.push(mk_punct('!'));
}
token_trees.push(tt::TokenTree::from(tt::Subtree {
delimiter: tt::Delimiter { open: span, close: span, kind: tt::DelimiterKind::Bracket },
token_trees: meta_tkns,
}));
Some(token_trees)
builder.open(tt::DelimiterKind::Bracket, span);
builder.extend(meta_tkns);
builder.close(span);
}
/// A raw token (straight from lexer) converter
@ -518,7 +457,12 @@ trait SrcToken<Ctx, S> {
trait TokenConverter<S>: Sized {
type Token: SrcToken<Self, S>;
fn convert_doc_comment(&self, token: &Self::Token, span: S) -> Option<Vec<tt::TokenTree<S>>>;
fn convert_doc_comment(
&self,
token: &Self::Token,
span: S,
builder: &mut tt::TopSubtreeBuilder<S>,
);
fn bump(&mut self) -> Option<(Self::Token, TextRange)>;
@ -567,9 +511,10 @@ where
&self,
&token: &usize,
span: SpanData<Ctx>,
) -> Option<Vec<tt::TokenTree<SpanData<Ctx>>>> {
builder: &mut tt::TopSubtreeBuilder<SpanData<Ctx>>,
) {
let text = self.lexed.text(token);
convert_doc_comment(&doc_comment(text), span, self.mode)
convert_doc_comment(&doc_comment(text), span, self.mode, builder);
}
fn bump(&mut self) -> Option<(Self::Token, TextRange)> {
@ -606,9 +551,9 @@ where
{
type Token = usize;
fn convert_doc_comment(&self, &token: &usize, span: S) -> Option<Vec<tt::TokenTree<S>>> {
fn convert_doc_comment(&self, &token: &usize, span: S, builder: &mut tt::TopSubtreeBuilder<S>) {
let text = self.lexed.text(token);
convert_doc_comment(&doc_comment(text), span, self.mode)
convert_doc_comment(&doc_comment(text), span, self.mode, builder);
}
fn bump(&mut self) -> Option<(Self::Token, TextRange)> {
@ -773,8 +718,13 @@ where
SpanMap: SpanMapper<S>,
{
type Token = SynToken<S>;
fn convert_doc_comment(&self, token: &Self::Token, span: S) -> Option<Vec<tt::TokenTree<S>>> {
convert_doc_comment(token.token(), span, self.mode)
fn convert_doc_comment(
&self,
token: &Self::Token,
span: S,
builder: &mut tt::TopSubtreeBuilder<S>,
) {
convert_doc_comment(token.token(), span, self.mode, builder);
}
fn bump(&mut self) -> Option<(Self::Token, TextRange)> {
@ -899,15 +849,12 @@ where
/// This occurs when a float literal is used as a field access.
fn float_split(&mut self, has_pseudo_dot: bool) {
let (text, span) = match self.cursor.token_tree() {
Some(tt::buffer::TokenTreeRef::Leaf(
tt::Leaf::Literal(tt::Literal {
symbol: text,
span,
kind: tt::LitKind::Float,
suffix: _,
}),
_,
)) => (text.as_str(), *span),
Some(tt::TokenTree::Leaf(tt::Leaf::Literal(tt::Literal {
symbol: text,
span,
kind: tt::LitKind::Float,
suffix: _,
}))) => (text.as_str(), *span),
tt => unreachable!("{tt:?}"),
};
// FIXME: Span splitting
@ -942,7 +889,7 @@ where
}
None => unreachable!(),
}
self.cursor = self.cursor.bump();
self.cursor.bump();
}
fn token(&mut self, kind: SyntaxKind, mut n_tokens: u8) {
@ -950,24 +897,24 @@ where
n_tokens = 2;
}
let mut last = self.cursor;
let mut last_two = self.cursor.peek_two_leaves();
let mut combined_span = None;
'tokens: for _ in 0..n_tokens {
let tmp: u8;
if self.cursor.eof() {
break;
}
last = self.cursor;
last_two = self.cursor.peek_two_leaves();
let (text, span) = loop {
break match self.cursor.token_tree() {
Some(tt::buffer::TokenTreeRef::Leaf(leaf, _)) => match leaf {
Some(tt::TokenTree::Leaf(leaf)) => match leaf {
tt::Leaf::Ident(ident) => {
if ident.is_raw.yes() {
self.buf.push_str("r#");
self.text_pos += TextSize::of("r#");
}
let r = (ident.sym.as_str(), ident.span);
self.cursor = self.cursor.bump();
self.cursor.bump();
r
}
tt::Leaf::Punct(punct) => {
@ -977,7 +924,7 @@ where
std::str::from_utf8(std::slice::from_ref(&tmp)).unwrap(),
punct.span,
);
self.cursor = self.cursor.bump();
self.cursor.bump();
r
}
tt::Leaf::Literal(lit) => {
@ -989,20 +936,19 @@ where
None => Some(lit.span),
Some(prev_span) => Some(Self::merge_spans(prev_span, lit.span)),
};
self.cursor = self.cursor.bump();
self.cursor.bump();
continue 'tokens;
}
},
Some(tt::buffer::TokenTreeRef::Subtree(subtree, _)) => {
self.cursor = self.cursor.subtree().unwrap();
Some(tt::TokenTree::Subtree(subtree)) => {
self.cursor.bump();
match delim_to_str(subtree.delimiter.kind, false) {
Some(it) => (it, subtree.delimiter.open),
None => continue,
}
}
None => {
let parent = self.cursor.end().unwrap();
self.cursor = self.cursor.bump();
let parent = self.cursor.end();
match delim_to_str(parent.delimiter.kind, true) {
Some(it) => (it, parent.delimiter.close),
None => continue,
@ -1023,12 +969,7 @@ where
self.buf.clear();
// FIXME: Emitting whitespace for this is really just a hack, we should get rid of it.
// Add whitespace between adjoint puncts
let next = last.bump();
if let (
Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(curr), _)),
Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(next), _)),
) = (last.token_tree(), next.token_tree())
{
if let Some([tt::Leaf::Punct(curr), tt::Leaf::Punct(next)]) = last_two {
// Note: We always assume the semi-colon would be the last token in
// other parts of RA such that we don't add whitespace here.
//

View file

@ -2,10 +2,7 @@ use rustc_hash::FxHashMap;
use span::Span;
use syntax::{ast, AstNode};
use test_utils::extract_annotations;
use tt::{
buffer::{TokenBuffer, TokenTreeRef},
Leaf, Punct, Spacing,
};
use tt::{buffer::Cursor, Leaf, Punct, Spacing};
use crate::{
dummy_test_span_utils::{DummyTestSpanMap, DUMMY},
@ -32,22 +29,22 @@ fn check_punct_spacing(fixture: &str) {
})
.collect();
let buf = TokenBuffer::from_subtree(&subtree);
let mut cursor = buf.begin();
let mut cursor = Cursor::new(&subtree.0);
while !cursor.eof() {
while let Some(token_tree) = cursor.token_tree() {
if let TokenTreeRef::Leaf(
Leaf::Punct(Punct { spacing, span: Span { range, .. }, .. }),
_,
) = token_tree
if let tt::TokenTree::Leaf(Leaf::Punct(Punct {
spacing,
span: Span { range, .. },
..
})) = token_tree
{
if let Some(expected) = annotations.remove(range) {
assert_eq!(expected, *spacing);
}
}
cursor = cursor.bump_subtree();
cursor.bump();
}
cursor = cursor.bump();
cursor.bump_or_end();
}
assert!(annotations.is_empty(), "unchecked annotations: {annotations:?}");

View file

@ -6,37 +6,34 @@ use std::fmt;
use span::Edition;
use syntax::{SyntaxKind, SyntaxKind::*, T};
use tt::buffer::TokenBuffer;
pub fn to_parser_input<S: Copy + fmt::Debug>(
edition: Edition,
buffer: &TokenBuffer<'_, S>,
buffer: tt::TokenTreesView<'_, S>,
) -> parser::Input {
let mut res = parser::Input::default();
let mut current = buffer.begin();
let mut current = buffer.cursor();
while !current.eof() {
let cursor = current;
let tt = cursor.token_tree();
let tt = current.token_tree();
// Check if it is lifetime
if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(punct), _)) = tt {
if let Some(tt::TokenTree::Leaf(tt::Leaf::Punct(punct))) = tt {
if punct.char == '\'' {
let next = cursor.bump();
match next.token_tree() {
Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Ident(_ident), _)) => {
current.bump();
match current.token_tree() {
Some(tt::TokenTree::Leaf(tt::Leaf::Ident(_ident))) => {
res.push(LIFETIME_IDENT);
current = next.bump();
current.bump();
continue;
}
_ => panic!("Next token must be ident : {:#?}", next.token_tree()),
_ => panic!("Next token must be ident"),
}
}
}
current = match tt {
Some(tt::buffer::TokenTreeRef::Leaf(leaf, _)) => {
match tt {
Some(tt::TokenTree::Leaf(leaf)) => {
match leaf {
tt::Leaf::Literal(lit) => {
let kind = match lit.kind {
@ -83,9 +80,9 @@ pub fn to_parser_input<S: Copy + fmt::Debug>(
}
}
}
cursor.bump()
current.bump();
}
Some(tt::buffer::TokenTreeRef::Subtree(subtree, _)) => {
Some(tt::TokenTree::Subtree(subtree)) => {
if let Some(kind) = match subtree.delimiter.kind {
tt::DelimiterKind::Parenthesis => Some(T!['(']),
tt::DelimiterKind::Brace => Some(T!['{']),
@ -94,22 +91,19 @@ pub fn to_parser_input<S: Copy + fmt::Debug>(
} {
res.push(kind);
}
cursor.subtree().unwrap()
current.bump();
}
None => match cursor.end() {
Some(subtree) => {
if let Some(kind) = match subtree.delimiter.kind {
tt::DelimiterKind::Parenthesis => Some(T![')']),
tt::DelimiterKind::Brace => Some(T!['}']),
tt::DelimiterKind::Bracket => Some(T![']']),
tt::DelimiterKind::Invisible => None,
} {
res.push(kind);
}
cursor.bump()
None => {
let subtree = current.end();
if let Some(kind) = match subtree.delimiter.kind {
tt::DelimiterKind::Parenthesis => Some(T![')']),
tt::DelimiterKind::Brace => Some(T!['}']),
tt::DelimiterKind::Bracket => Some(T![']']),
tt::DelimiterKind::Invisible => None,
} {
res.push(kind);
}
None => continue,
},
}
};
}