Store token trees in contiguous Vec instead of as a tree

I expected this to be faster (due to fewer allocations and better cache locality), but benchmarking shows it is not (nor is it slower). Memory usage, however, drops by ~50mb (of `analysis-stats .`). I guess tt construction is just not hot.

This also paves the way to using even less memory for token trees by compressing equal spans, which I plan to do right after.

Some workflows are more easily expressed with a flat tt, while others are better expressed with a tree. With the right helpers, though (finding which was mostly a matter of trial and error), even the worst workflows become very easy indeed.
This commit is contained in:
Chayim Refael Friedman 2024-10-18 10:16:08 +03:00
parent 1c6b83852b
commit ceba289f80
50 changed files with 2356 additions and 2286 deletions

View file

@ -5,17 +5,14 @@ use std::fmt;
use intern::Symbol;
use rustc_hash::{FxHashMap, FxHashSet};
use span::{Edition, SpanAnchor, SpanData, SpanMap};
use stdx::{format_to, never, non_empty_vec::NonEmptyVec};
use stdx::{format_to, never};
use syntax::{
ast::{self, make::tokens::doc_comment},
format_smolstr, AstToken, Parse, PreorderWithTokens, SmolStr, SyntaxElement,
SyntaxKind::{self, *},
SyntaxNode, SyntaxToken, SyntaxTreeBuilder, TextRange, TextSize, WalkEvent, T,
};
use tt::{
buffer::{Cursor, TokenBuffer},
token_to_literal,
};
use tt::{buffer::Cursor, token_to_literal};
pub mod prettify_macro_expansion;
mod to_parser_input;
@ -99,7 +96,7 @@ pub fn syntax_node_to_token_tree<Ctx, SpanMap>(
map: SpanMap,
span: SpanData<Ctx>,
mode: DocCommentDesugarMode,
) -> tt::Subtree<SpanData<Ctx>>
) -> tt::TopSubtree<SpanData<Ctx>>
where
SpanData<Ctx>: Copy + fmt::Debug,
SpanMap: SpanMapper<SpanData<Ctx>>,
@ -118,7 +115,7 @@ pub fn syntax_node_to_token_tree_modified<Ctx, SpanMap>(
remove: FxHashSet<SyntaxElement>,
call_site: SpanData<Ctx>,
mode: DocCommentDesugarMode,
) -> tt::Subtree<SpanData<Ctx>>
) -> tt::TopSubtree<SpanData<Ctx>>
where
SpanMap: SpanMapper<SpanData<Ctx>>,
SpanData<Ctx>: Copy + fmt::Debug,
@ -142,7 +139,7 @@ where
/// Converts a [`tt::Subtree`] back to a [`SyntaxNode`].
/// The produced `SpanMap` contains a mapping from the syntax nodes offsets to the subtree's spans.
pub fn token_tree_to_syntax_node<Ctx>(
tt: &tt::Subtree<SpanData<Ctx>>,
tt: &tt::TopSubtree<SpanData<Ctx>>,
entry_point: parser::TopEntryPoint,
edition: parser::Edition,
) -> (Parse<SyntaxNode>, SpanMap<Ctx>)
@ -150,16 +147,10 @@ where
SpanData<Ctx>: Copy + fmt::Debug,
Ctx: PartialEq,
{
let buffer = match tt {
tt::Subtree {
delimiter: tt::Delimiter { kind: tt::DelimiterKind::Invisible, .. },
token_trees,
} => TokenBuffer::from_tokens(token_trees),
_ => TokenBuffer::from_subtree(tt),
};
let parser_input = to_parser_input(edition, &buffer);
let buffer = tt.view().strip_invisible();
let parser_input = to_parser_input(edition, buffer);
let parser_output = entry_point.parse(&parser_input, edition);
let mut tree_sink = TtTreeSink::new(buffer.begin());
let mut tree_sink = TtTreeSink::new(buffer.cursor());
for event in parser_output.iter() {
match event {
parser::Step::Token { kind, n_input_tokens: n_raw_tokens } => {
@ -183,7 +174,7 @@ pub fn parse_to_token_tree<Ctx>(
anchor: SpanAnchor,
ctx: Ctx,
text: &str,
) -> Option<tt::Subtree<SpanData<Ctx>>>
) -> Option<tt::TopSubtree<SpanData<Ctx>>>
where
SpanData<Ctx>: Copy + fmt::Debug,
Ctx: Copy,
@ -202,7 +193,7 @@ pub fn parse_to_token_tree_static_span<S>(
edition: Edition,
span: S,
text: &str,
) -> Option<tt::Subtree<S>>
) -> Option<tt::TopSubtree<S>>
where
S: Copy + fmt::Debug,
{
@ -215,47 +206,38 @@ where
Some(convert_tokens(&mut conv))
}
fn convert_tokens<S, C>(conv: &mut C) -> tt::Subtree<S>
fn convert_tokens<S, C>(conv: &mut C) -> tt::TopSubtree<S>
where
C: TokenConverter<S>,
S: Copy + fmt::Debug,
C::Token: fmt::Debug,
{
let entry = tt::SubtreeBuilder {
delimiter: tt::Delimiter::invisible_spanned(conv.call_site()),
token_trees: vec![],
};
let mut stack = NonEmptyVec::new(entry);
let mut builder =
tt::TopSubtreeBuilder::new(tt::Delimiter::invisible_spanned(conv.call_site()));
while let Some((token, abs_range)) = conv.bump() {
let tt::SubtreeBuilder { delimiter, token_trees } = stack.last_mut();
let delimiter = builder.expected_delimiter().map(|it| it.kind);
let tt = match token.as_leaf() {
Some(leaf) => tt::TokenTree::Leaf(leaf.clone()),
Some(leaf) => leaf.clone(),
None => match token.kind(conv) {
// Desugar doc comments into doc attributes
COMMENT => {
let span = conv.span_for(abs_range);
if let Some(tokens) = conv.convert_doc_comment(&token, span) {
token_trees.extend(tokens);
}
conv.convert_doc_comment(&token, span, &mut builder);
continue;
}
kind if kind.is_punct() && kind != UNDERSCORE => {
let expected = match delimiter.kind {
tt::DelimiterKind::Parenthesis => Some(T![')']),
tt::DelimiterKind::Brace => Some(T!['}']),
tt::DelimiterKind::Bracket => Some(T![']']),
tt::DelimiterKind::Invisible => None,
let expected = match delimiter {
Some(tt::DelimiterKind::Parenthesis) => Some(T![')']),
Some(tt::DelimiterKind::Brace) => Some(T!['}']),
Some(tt::DelimiterKind::Bracket) => Some(T![']']),
Some(tt::DelimiterKind::Invisible) | None => None,
};
// Current token is a closing delimiter that we expect, fix up the closing span
// and end the subtree here
if matches!(expected, Some(expected) if expected == kind) {
if let Some(mut subtree) = stack.pop() {
subtree.delimiter.close = conv.span_for(abs_range);
stack.last_mut().token_trees.push(subtree.build().into());
}
builder.close(conv.span_for(abs_range));
continue;
}
@ -268,16 +250,7 @@ where
// Start a new subtree
if let Some(kind) = delim {
let open = conv.span_for(abs_range);
stack.push(tt::SubtreeBuilder {
delimiter: tt::Delimiter {
open,
// will be overwritten on subtree close above
close: open,
kind,
},
token_trees: vec![],
});
builder.open(kind, conv.span_for(abs_range));
continue;
}
@ -289,7 +262,6 @@ where
panic!("Token from lexer must be single char: token = {token:#?}")
};
tt::Leaf::from(tt::Punct { char, spacing, span: conv.span_for(abs_range) })
.into()
}
kind => {
macro_rules! make_ident {
@ -320,7 +292,7 @@ where
span: conv
.span_for(TextRange::at(abs_range.start(), TextSize::of('\''))),
});
token_trees.push(apostrophe.into());
builder.push(apostrophe);
let ident = tt::Leaf::from(tt::Ident {
sym: Symbol::intern(&token.to_text(conv)[1..]),
@ -330,47 +302,26 @@ where
)),
is_raw: tt::IdentIsRaw::No,
});
token_trees.push(ident.into());
builder.push(ident);
continue;
}
_ => continue,
};
leaf.into()
leaf
}
},
};
token_trees.push(tt);
builder.push(tt);
}
// If we get here, we've consumed all input tokens.
// We might have more than one subtree in the stack, if the delimiters are improperly balanced.
// Merge them so we're left with one.
while let Some(entry) = stack.pop() {
let parent = stack.last_mut();
builder.flatten_unclosed_subtrees();
let leaf: tt::Leaf<_> = tt::Punct {
span: entry.delimiter.open,
char: match entry.delimiter.kind {
tt::DelimiterKind::Parenthesis => '(',
tt::DelimiterKind::Brace => '{',
tt::DelimiterKind::Bracket => '[',
tt::DelimiterKind::Invisible => '$',
},
spacing: tt::Spacing::Alone,
}
.into();
parent.token_trees.push(leaf.into());
parent.token_trees.extend(entry.token_trees);
}
let subtree = stack.into_last().build();
if let [tt::TokenTree::Subtree(first)] = &*subtree.token_trees {
first.clone()
} else {
subtree
}
builder.build_skip_top_subtree()
}
fn is_single_token_op(kind: SyntaxKind) -> bool {
@ -436,25 +387,17 @@ fn convert_doc_comment<S: Copy>(
token: &syntax::SyntaxToken,
span: S,
mode: DocCommentDesugarMode,
) -> Option<Vec<tt::TokenTree<S>>> {
let comment = ast::Comment::cast(token.clone())?;
let doc = comment.kind().doc?;
builder: &mut tt::TopSubtreeBuilder<S>,
) {
let Some(comment) = ast::Comment::cast(token.clone()) else { return };
let Some(doc) = comment.kind().doc else { return };
let mk_ident = |s: &str| {
tt::TokenTree::from(tt::Leaf::from(tt::Ident {
sym: Symbol::intern(s),
span,
is_raw: tt::IdentIsRaw::No,
}))
tt::Leaf::from(tt::Ident { sym: Symbol::intern(s), span, is_raw: tt::IdentIsRaw::No })
};
let mk_punct = |c: char| {
tt::TokenTree::from(tt::Leaf::from(tt::Punct {
char: c,
spacing: tt::Spacing::Alone,
span,
}))
};
let mk_punct =
|c: char| tt::Leaf::from(tt::Punct { char: c, spacing: tt::Spacing::Alone, span });
let mk_doc_literal = |comment: &ast::Comment| {
let prefix_len = comment.prefix().len();
@ -467,24 +410,20 @@ fn convert_doc_comment<S: Copy>(
let (text, kind) = desugar_doc_comment_text(text, mode);
let lit = tt::Literal { symbol: text, span, kind, suffix: None };
tt::TokenTree::from(tt::Leaf::from(lit))
tt::Leaf::from(lit)
};
// Make `doc="\" Comments\""
let meta_tkns = Box::new([mk_ident("doc"), mk_punct('='), mk_doc_literal(&comment)]);
let meta_tkns = [mk_ident("doc"), mk_punct('='), mk_doc_literal(&comment)];
// Make `#![]`
let mut token_trees = Vec::with_capacity(3);
token_trees.push(mk_punct('#'));
builder.push(mk_punct('#'));
if let ast::CommentPlacement::Inner = doc {
token_trees.push(mk_punct('!'));
builder.push(mk_punct('!'));
}
token_trees.push(tt::TokenTree::from(tt::Subtree {
delimiter: tt::Delimiter { open: span, close: span, kind: tt::DelimiterKind::Bracket },
token_trees: meta_tkns,
}));
Some(token_trees)
builder.open(tt::DelimiterKind::Bracket, span);
builder.extend(meta_tkns);
builder.close(span);
}
/// A raw token (straight from lexer) converter
@ -518,7 +457,12 @@ trait SrcToken<Ctx, S> {
trait TokenConverter<S>: Sized {
type Token: SrcToken<Self, S>;
fn convert_doc_comment(&self, token: &Self::Token, span: S) -> Option<Vec<tt::TokenTree<S>>>;
fn convert_doc_comment(
&self,
token: &Self::Token,
span: S,
builder: &mut tt::TopSubtreeBuilder<S>,
);
fn bump(&mut self) -> Option<(Self::Token, TextRange)>;
@ -567,9 +511,10 @@ where
&self,
&token: &usize,
span: SpanData<Ctx>,
) -> Option<Vec<tt::TokenTree<SpanData<Ctx>>>> {
builder: &mut tt::TopSubtreeBuilder<SpanData<Ctx>>,
) {
let text = self.lexed.text(token);
convert_doc_comment(&doc_comment(text), span, self.mode)
convert_doc_comment(&doc_comment(text), span, self.mode, builder);
}
fn bump(&mut self) -> Option<(Self::Token, TextRange)> {
@ -606,9 +551,9 @@ where
{
type Token = usize;
fn convert_doc_comment(&self, &token: &usize, span: S) -> Option<Vec<tt::TokenTree<S>>> {
fn convert_doc_comment(&self, &token: &usize, span: S, builder: &mut tt::TopSubtreeBuilder<S>) {
let text = self.lexed.text(token);
convert_doc_comment(&doc_comment(text), span, self.mode)
convert_doc_comment(&doc_comment(text), span, self.mode, builder);
}
fn bump(&mut self) -> Option<(Self::Token, TextRange)> {
@ -773,8 +718,13 @@ where
SpanMap: SpanMapper<S>,
{
type Token = SynToken<S>;
fn convert_doc_comment(&self, token: &Self::Token, span: S) -> Option<Vec<tt::TokenTree<S>>> {
convert_doc_comment(token.token(), span, self.mode)
fn convert_doc_comment(
&self,
token: &Self::Token,
span: S,
builder: &mut tt::TopSubtreeBuilder<S>,
) {
convert_doc_comment(token.token(), span, self.mode, builder);
}
fn bump(&mut self) -> Option<(Self::Token, TextRange)> {
@ -899,15 +849,12 @@ where
/// This occurs when a float literal is used as a field access.
fn float_split(&mut self, has_pseudo_dot: bool) {
let (text, span) = match self.cursor.token_tree() {
Some(tt::buffer::TokenTreeRef::Leaf(
tt::Leaf::Literal(tt::Literal {
symbol: text,
span,
kind: tt::LitKind::Float,
suffix: _,
}),
_,
)) => (text.as_str(), *span),
Some(tt::TokenTree::Leaf(tt::Leaf::Literal(tt::Literal {
symbol: text,
span,
kind: tt::LitKind::Float,
suffix: _,
}))) => (text.as_str(), *span),
tt => unreachable!("{tt:?}"),
};
// FIXME: Span splitting
@ -942,7 +889,7 @@ where
}
None => unreachable!(),
}
self.cursor = self.cursor.bump();
self.cursor.bump();
}
fn token(&mut self, kind: SyntaxKind, mut n_tokens: u8) {
@ -950,24 +897,24 @@ where
n_tokens = 2;
}
let mut last = self.cursor;
let mut last_two = self.cursor.peek_two_leaves();
let mut combined_span = None;
'tokens: for _ in 0..n_tokens {
let tmp: u8;
if self.cursor.eof() {
break;
}
last = self.cursor;
last_two = self.cursor.peek_two_leaves();
let (text, span) = loop {
break match self.cursor.token_tree() {
Some(tt::buffer::TokenTreeRef::Leaf(leaf, _)) => match leaf {
Some(tt::TokenTree::Leaf(leaf)) => match leaf {
tt::Leaf::Ident(ident) => {
if ident.is_raw.yes() {
self.buf.push_str("r#");
self.text_pos += TextSize::of("r#");
}
let r = (ident.sym.as_str(), ident.span);
self.cursor = self.cursor.bump();
self.cursor.bump();
r
}
tt::Leaf::Punct(punct) => {
@ -977,7 +924,7 @@ where
std::str::from_utf8(std::slice::from_ref(&tmp)).unwrap(),
punct.span,
);
self.cursor = self.cursor.bump();
self.cursor.bump();
r
}
tt::Leaf::Literal(lit) => {
@ -989,20 +936,19 @@ where
None => Some(lit.span),
Some(prev_span) => Some(Self::merge_spans(prev_span, lit.span)),
};
self.cursor = self.cursor.bump();
self.cursor.bump();
continue 'tokens;
}
},
Some(tt::buffer::TokenTreeRef::Subtree(subtree, _)) => {
self.cursor = self.cursor.subtree().unwrap();
Some(tt::TokenTree::Subtree(subtree)) => {
self.cursor.bump();
match delim_to_str(subtree.delimiter.kind, false) {
Some(it) => (it, subtree.delimiter.open),
None => continue,
}
}
None => {
let parent = self.cursor.end().unwrap();
self.cursor = self.cursor.bump();
let parent = self.cursor.end();
match delim_to_str(parent.delimiter.kind, true) {
Some(it) => (it, parent.delimiter.close),
None => continue,
@ -1023,12 +969,7 @@ where
self.buf.clear();
// FIXME: Emitting whitespace for this is really just a hack, we should get rid of it.
// Add whitespace between adjoint puncts
let next = last.bump();
if let (
Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(curr), _)),
Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(next), _)),
) = (last.token_tree(), next.token_tree())
{
if let Some([tt::Leaf::Punct(curr), tt::Leaf::Punct(next)]) = last_two {
// Note: We always assume the semi-colon would be the last token in
// other parts of RA such that we don't add whitespace here.
//

View file

@ -2,10 +2,7 @@ use rustc_hash::FxHashMap;
use span::Span;
use syntax::{ast, AstNode};
use test_utils::extract_annotations;
use tt::{
buffer::{TokenBuffer, TokenTreeRef},
Leaf, Punct, Spacing,
};
use tt::{buffer::Cursor, Leaf, Punct, Spacing};
use crate::{
dummy_test_span_utils::{DummyTestSpanMap, DUMMY},
@ -32,22 +29,22 @@ fn check_punct_spacing(fixture: &str) {
})
.collect();
let buf = TokenBuffer::from_subtree(&subtree);
let mut cursor = buf.begin();
let mut cursor = Cursor::new(&subtree.0);
while !cursor.eof() {
while let Some(token_tree) = cursor.token_tree() {
if let TokenTreeRef::Leaf(
Leaf::Punct(Punct { spacing, span: Span { range, .. }, .. }),
_,
) = token_tree
if let tt::TokenTree::Leaf(Leaf::Punct(Punct {
spacing,
span: Span { range, .. },
..
})) = token_tree
{
if let Some(expected) = annotations.remove(range) {
assert_eq!(expected, *spacing);
}
}
cursor = cursor.bump_subtree();
cursor.bump();
}
cursor = cursor.bump();
cursor.bump_or_end();
}
assert!(annotations.is_empty(), "unchecked annotations: {annotations:?}");

View file

@ -6,37 +6,34 @@ use std::fmt;
use span::Edition;
use syntax::{SyntaxKind, SyntaxKind::*, T};
use tt::buffer::TokenBuffer;
pub fn to_parser_input<S: Copy + fmt::Debug>(
edition: Edition,
buffer: &TokenBuffer<'_, S>,
buffer: tt::TokenTreesView<'_, S>,
) -> parser::Input {
let mut res = parser::Input::default();
let mut current = buffer.begin();
let mut current = buffer.cursor();
while !current.eof() {
let cursor = current;
let tt = cursor.token_tree();
let tt = current.token_tree();
// Check if it is lifetime
if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(punct), _)) = tt {
if let Some(tt::TokenTree::Leaf(tt::Leaf::Punct(punct))) = tt {
if punct.char == '\'' {
let next = cursor.bump();
match next.token_tree() {
Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Ident(_ident), _)) => {
current.bump();
match current.token_tree() {
Some(tt::TokenTree::Leaf(tt::Leaf::Ident(_ident))) => {
res.push(LIFETIME_IDENT);
current = next.bump();
current.bump();
continue;
}
_ => panic!("Next token must be ident : {:#?}", next.token_tree()),
_ => panic!("Next token must be ident"),
}
}
}
current = match tt {
Some(tt::buffer::TokenTreeRef::Leaf(leaf, _)) => {
match tt {
Some(tt::TokenTree::Leaf(leaf)) => {
match leaf {
tt::Leaf::Literal(lit) => {
let kind = match lit.kind {
@ -83,9 +80,9 @@ pub fn to_parser_input<S: Copy + fmt::Debug>(
}
}
}
cursor.bump()
current.bump();
}
Some(tt::buffer::TokenTreeRef::Subtree(subtree, _)) => {
Some(tt::TokenTree::Subtree(subtree)) => {
if let Some(kind) = match subtree.delimiter.kind {
tt::DelimiterKind::Parenthesis => Some(T!['(']),
tt::DelimiterKind::Brace => Some(T!['{']),
@ -94,22 +91,19 @@ pub fn to_parser_input<S: Copy + fmt::Debug>(
} {
res.push(kind);
}
cursor.subtree().unwrap()
current.bump();
}
None => match cursor.end() {
Some(subtree) => {
if let Some(kind) = match subtree.delimiter.kind {
tt::DelimiterKind::Parenthesis => Some(T![')']),
tt::DelimiterKind::Brace => Some(T!['}']),
tt::DelimiterKind::Bracket => Some(T![']']),
tt::DelimiterKind::Invisible => None,
} {
res.push(kind);
}
cursor.bump()
None => {
let subtree = current.end();
if let Some(kind) = match subtree.delimiter.kind {
tt::DelimiterKind::Parenthesis => Some(T![')']),
tt::DelimiterKind::Brace => Some(T!['}']),
tt::DelimiterKind::Bracket => Some(T![']']),
tt::DelimiterKind::Invisible => None,
} {
res.push(kind);
}
None => continue,
},
}
};
}