mirror of https://github.com/rust-lang/rust-analyzer.git
synced 2025-09-27 04:19:13 +00:00

port mbe to soa tokens

This commit is contained in:
parent 965585748e
commit 1055a6111a

7 changed files with 130 additions and 183 deletions
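For context, "soa tokens" means the parser's new struct-of-arrays token buffer: instead of one vector of token structs, each per-token attribute lives in its own parallel column. This diff only shows the type's public surface (`Tokens::default`, `push`, `push_ident`, `was_joint`, `len`), so the following is a minimal sketch with assumed field names, not the actual `parser::Tokens` definition:

use crate::SyntaxKind;

// Sketch of a struct-of-arrays token buffer; field names are assumptions.
#[derive(Default)]
pub struct Tokens {
    kind: Vec<SyntaxKind>,            // classification of token i
    joint: Vec<bool>,                 // is token i glued to token i + 1 (e.g. `>` `>` in `>>`)?
    contextual_kind: Vec<SyntaxKind>, // keyword-ness of IDENTs such as `union`
}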
crates/mbe/src/lib.rs
@@ -10,7 +10,7 @@ mod parser;
 mod expander;
 mod syntax_bridge;
 mod tt_iter;
-mod subtree_source;
+mod to_parser_tokens;

 #[cfg(test)]
 mod benchmark;
crates/mbe/src/subtree_source.rs (deleted, 174 lines)
@@ -1,174 +0,0 @@
-//! Our parser is generic over the source of tokens it parses.
-//!
-//! This module defines tokens sourced from declarative macros.
-
-use parser::{Token, TokenSource};
-use syntax::{lex_single_syntax_kind, SmolStr, SyntaxKind, SyntaxKind::*, T};
-use tt::buffer::TokenBuffer;
-
-#[derive(Debug, Clone, Eq, PartialEq)]
-struct TtToken {
-    tt: Token,
-    text: SmolStr,
-}
-
-pub(crate) struct SubtreeTokenSource {
-    cached: Vec<TtToken>,
-    curr: (Token, usize),
-}
-
-impl<'a> SubtreeTokenSource {
-    pub(crate) fn new(buffer: &TokenBuffer) -> SubtreeTokenSource {
-        let mut current = buffer.begin();
-        let mut cached = Vec::with_capacity(100);
-
-        while !current.eof() {
-            let cursor = current;
-            let tt = cursor.token_tree();
-
-            // Check if it is lifetime
-            if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(punct), _)) = tt {
-                if punct.char == '\'' {
-                    let next = cursor.bump();
-                    if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Ident(ident), _)) =
-                        next.token_tree()
-                    {
-                        let text = SmolStr::new("'".to_string() + &ident.text);
-                        cached.push(TtToken {
-                            tt: Token { kind: LIFETIME_IDENT, is_jointed_to_next: false },
-                            text,
-                        });
-                        current = next.bump();
-                        continue;
-                    } else {
-                        panic!("Next token must be ident : {:#?}", next.token_tree());
-                    }
-                }
-            }
-
-            current = match tt {
-                Some(tt::buffer::TokenTreeRef::Leaf(leaf, _)) => {
-                    cached.push(convert_leaf(leaf));
-                    cursor.bump()
-                }
-                Some(tt::buffer::TokenTreeRef::Subtree(subtree, _)) => {
-                    if let Some(d) = subtree.delimiter_kind() {
-                        cached.push(convert_delim(d, false));
-                    }
-                    cursor.subtree().unwrap()
-                }
-                None => match cursor.end() {
-                    Some(subtree) => {
-                        if let Some(d) = subtree.delimiter_kind() {
-                            cached.push(convert_delim(d, true));
-                        }
-                        cursor.bump()
-                    }
-                    None => continue,
-                },
-            };
-        }
-
-        let mut res = SubtreeTokenSource {
-            curr: (Token { kind: EOF, is_jointed_to_next: false }, 0),
-            cached,
-        };
-        res.curr = (res.token(0), 0);
-        res
-    }
-
-    fn token(&self, pos: usize) -> Token {
-        match self.cached.get(pos) {
-            Some(it) => it.tt,
-            None => Token { kind: EOF, is_jointed_to_next: false },
-        }
-    }
-}
-
-impl<'a> TokenSource for SubtreeTokenSource {
-    fn current(&self) -> Token {
-        self.curr.0
-    }
-
-    /// Lookahead n token
-    fn lookahead_nth(&self, n: usize) -> Token {
-        self.token(self.curr.1 + n)
-    }
-
-    /// bump cursor to next token
-    fn bump(&mut self) {
-        if self.current().kind == EOF {
-            return;
-        }
-        self.curr = (self.token(self.curr.1 + 1), self.curr.1 + 1);
-    }
-
-    /// Is the current token a specified keyword?
-    fn is_keyword(&self, kw: &str) -> bool {
-        match self.cached.get(self.curr.1) {
-            Some(t) => t.text == *kw,
-            None => false,
-        }
-    }
-}
-
-fn convert_delim(d: tt::DelimiterKind, closing: bool) -> TtToken {
-    let (kinds, texts) = match d {
-        tt::DelimiterKind::Parenthesis => ([T!['('], T![')']], "()"),
-        tt::DelimiterKind::Brace => ([T!['{'], T!['}']], "{}"),
-        tt::DelimiterKind::Bracket => ([T!['['], T![']']], "[]"),
-    };
-
-    let idx = closing as usize;
-    let kind = kinds[idx];
-    let text = &texts[idx..texts.len() - (1 - idx)];
-    TtToken { tt: Token { kind, is_jointed_to_next: false }, text: SmolStr::new(text) }
-}
-
-fn convert_literal(l: &tt::Literal) -> TtToken {
-    let is_negated = l.text.starts_with('-');
-    let inner_text = &l.text[if is_negated { 1 } else { 0 }..];
-
-    let kind = lex_single_syntax_kind(inner_text)
-        .map(|(kind, _error)| kind)
-        .filter(|kind| {
-            kind.is_literal() && (!is_negated || matches!(kind, FLOAT_NUMBER | INT_NUMBER))
-        })
-        .unwrap_or_else(|| panic!("Fail to convert given literal {:#?}", &l));
-
-    TtToken { tt: Token { kind, is_jointed_to_next: false }, text: l.text.clone() }
-}
-
-fn convert_ident(ident: &tt::Ident) -> TtToken {
-    let kind = match ident.text.as_ref() {
-        "true" => T![true],
-        "false" => T![false],
-        "_" => UNDERSCORE,
-        i if i.starts_with('\'') => LIFETIME_IDENT,
-        _ => SyntaxKind::from_keyword(ident.text.as_str()).unwrap_or(IDENT),
-    };
-
-    TtToken { tt: Token { kind, is_jointed_to_next: false }, text: ident.text.clone() }
-}
-
-fn convert_punct(p: tt::Punct) -> TtToken {
-    let kind = match SyntaxKind::from_char(p.char) {
-        None => panic!("{:#?} is not a valid punct", p),
-        Some(kind) => kind,
-    };
-
-    let text = {
-        let mut buf = [0u8; 4];
-        let s: &str = p.char.encode_utf8(&mut buf);
-        SmolStr::new(s)
-    };
-    TtToken { tt: Token { kind, is_jointed_to_next: p.spacing == tt::Spacing::Joint }, text }
-}
-
-fn convert_leaf(leaf: &tt::Leaf) -> TtToken {
-    match leaf {
-        tt::Leaf::Literal(l) => convert_literal(l),
-        tt::Leaf::Ident(ident) => convert_ident(ident),
-        tt::Leaf::Punct(punct) => convert_punct(*punct),
-    }
-}
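For orientation, the deleted type implemented the parser's pull-based `TokenSource` trait. Its shape can be reconstructed from the impl above; this is a sketch, not the verbatim declaration from crates/parser:

// Reconstructed from the methods implemented above; exact declaration assumed.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub struct Token {
    pub kind: SyntaxKind,
    pub is_jointed_to_next: bool,
}

pub trait TokenSource {
    fn current(&self) -> Token;
    fn lookahead_nth(&self, n: usize) -> Token; // peek n tokens ahead
    fn bump(&mut self);                         // advance past the current token
    fn is_keyword(&self, kw: &str) -> bool;     // text-based keyword check
}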
crates/mbe/src/syntax_bridge.rs
@@ -12,7 +12,7 @@ use syntax::{
 use tt::buffer::{Cursor, TokenBuffer};

 use crate::{
-    subtree_source::SubtreeTokenSource, tt_iter::TtIter, ExpandError, ParserEntryPoint, TokenMap,
+    to_parser_tokens::to_parser_tokens, tt_iter::TtIter, ExpandError, ParserEntryPoint, TokenMap,
 };

 /// Convert the syntax node to a `TokenTree` (what macro
@@ -56,9 +56,9 @@ pub fn token_tree_to_syntax_node(
         }
         _ => TokenBuffer::from_subtree(tt),
     };
-    let mut token_source = SubtreeTokenSource::new(&buffer);
+    let parser_tokens = to_parser_tokens(&buffer);
     let mut tree_sink = TtTreeSink::new(buffer.begin());
-    parser::parse(&mut token_source, &mut tree_sink, entry_point);
+    parser::parse(&parser_tokens, &mut tree_sink, entry_point);
     if tree_sink.roots.len() != 1 {
         return Err(ExpandError::ConversionError);
     }
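The call sites above (and in tt_iter.rs below) imply that `parser::parse` now takes the eagerly built token list by shared reference instead of a `&mut dyn TokenSource`. The new signature is presumably along these lines, inferred from the call sites rather than copied from crates/parser:

// Presumed shape of the updated entry point; inferred from this diff.
pub fn parse(tokens: &Tokens, tree_sink: &mut dyn TreeSink, entry_point: ParserEntryPoint) {
    // ...
}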
crates/mbe/src/to_parser_tokens.rs (new file, 97 lines)
@@ -0,0 +1,97 @@
+//! Convert macro-by-example tokens which are specific to macro expansion into a
+//! format that works for our parser.
+
+use syntax::{lex_single_syntax_kind, SyntaxKind, SyntaxKind::*, T};
+use tt::buffer::TokenBuffer;
+
+pub(crate) fn to_parser_tokens(buffer: &TokenBuffer) -> parser::Tokens {
+    let mut res = parser::Tokens::default();
+
+    let mut current = buffer.begin();
+
+    while !current.eof() {
+        let cursor = current;
+        let tt = cursor.token_tree();
+
+        // Check if it is lifetime
+        if let Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Punct(punct), _)) = tt {
+            if punct.char == '\'' {
+                let next = cursor.bump();
+                match next.token_tree() {
+                    Some(tt::buffer::TokenTreeRef::Leaf(tt::Leaf::Ident(_ident), _)) => {
+                        res.push(LIFETIME_IDENT);
+                        current = next.bump();
+                        continue;
+                    }
+                    _ => panic!("Next token must be ident : {:#?}", next.token_tree()),
+                }
+            }
+        }
+
+        current = match tt {
+            Some(tt::buffer::TokenTreeRef::Leaf(leaf, _)) => {
+                match leaf {
+                    tt::Leaf::Literal(lit) => {
+                        let is_negated = lit.text.starts_with('-');
+                        let inner_text = &lit.text[if is_negated { 1 } else { 0 }..];
+
+                        let kind = lex_single_syntax_kind(inner_text)
+                            .map(|(kind, _error)| kind)
+                            .filter(|kind| {
+                                kind.is_literal()
+                                    && (!is_negated || matches!(kind, FLOAT_NUMBER | INT_NUMBER))
+                            })
+                            .unwrap_or_else(|| panic!("Fail to convert given literal {:#?}", &lit));
+
+                        res.push(kind);
+                    }
+                    tt::Leaf::Ident(ident) => match ident.text.as_ref() {
+                        "_" => res.push(T![_]),
+                        i if i.starts_with('\'') => res.push(LIFETIME_IDENT),
+                        _ => match SyntaxKind::from_keyword(&ident.text) {
+                            Some(kind) => res.push(kind),
+                            None => {
+                                let contextual_keyword =
+                                    SyntaxKind::from_contextual_keyword(&ident.text)
+                                        .unwrap_or(SyntaxKind::IDENT);
+                                res.push_ident(contextual_keyword);
+                            }
+                        },
+                    },
+                    tt::Leaf::Punct(punct) => {
+                        let kind = SyntaxKind::from_char(punct.char)
+                            .unwrap_or_else(|| panic!("{:#?} is not a valid punct", punct));
+                        res.push(kind);
+                        res.was_joint(punct.spacing == tt::Spacing::Joint);
+                    }
+                }
+                cursor.bump()
+            }
+            Some(tt::buffer::TokenTreeRef::Subtree(subtree, _)) => {
+                if let Some(d) = subtree.delimiter_kind() {
+                    res.push(match d {
+                        tt::DelimiterKind::Parenthesis => T!['('],
+                        tt::DelimiterKind::Brace => T!['{'],
+                        tt::DelimiterKind::Bracket => T!['['],
+                    });
+                }
+                cursor.subtree().unwrap()
+            }
+            None => match cursor.end() {
+                Some(subtree) => {
+                    if let Some(d) = subtree.delimiter_kind() {
+                        res.push(match d {
+                            tt::DelimiterKind::Parenthesis => T![')'],
+                            tt::DelimiterKind::Brace => T!['}'],
+                            tt::DelimiterKind::Bracket => T![']'],
+                        })
+                    }
+                    cursor.bump()
+                }
+                None => continue,
+            },
+        };
+    }

+    res
+}
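One subtlety in the function above: in a token tree, a lifetime like `'a` arrives as two leaves, a `'` punct followed by an ident, and `to_parser_tokens` re-glues the pair into a single `LIFETIME_IDENT`. A self-contained toy model of that gluing rule (illustration only, not rust-analyzer code):

// Toy model: a `'` followed by an identifier collapses into one lifetime token.
fn glue_lifetimes(leaves: &[&str]) -> Vec<String> {
    let mut out = Vec::new();
    let mut i = 0;
    while i < leaves.len() {
        if leaves[i] == "'" && i + 1 < leaves.len() {
            out.push(format!("'{}", leaves[i + 1])); // "'" + "a" => "'a"
            i += 2;
        } else {
            out.push(leaves[i].to_string());
            i += 1;
        }
    }
    out
}

fn main() {
    assert_eq!(glue_lifetimes(&["&", "'", "a", "str"]), ["&", "'a", "str"]);
}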
crates/mbe/src/tt_iter.rs
@@ -1,7 +1,7 @@
 //! A "Parser" structure for token trees. We use this when parsing a declarative
 //! macro definition into a list of patterns and templates.

-use crate::{subtree_source::SubtreeTokenSource, ExpandError, ExpandResult, ParserEntryPoint};
+use crate::{to_parser_tokens::to_parser_tokens, ExpandError, ExpandResult, ParserEntryPoint};

 use parser::TreeSink;
 use syntax::SyntaxKind;
@@ -116,10 +116,10 @@ impl<'a> TtIter<'a> {
         }

         let buffer = TokenBuffer::from_tokens(self.inner.as_slice());
-        let mut src = SubtreeTokenSource::new(&buffer);
+        let parser_tokens = to_parser_tokens(&buffer);
         let mut sink = OffsetTokenSink { cursor: buffer.begin(), error: false };

-        parser::parse(&mut src, &mut sink, entry_point);
+        parser::parse(&parser_tokens, &mut sink, entry_point);

         let mut err = if !sink.cursor.is_root() || sink.error {
             Some(err!("expected {:?}", entry_point))
crates/parser/src/lib.rs
@@ -1,8 +1,11 @@
 //! The Rust parser.
 //!
+//! NOTE: The crate is undergoing refactors, don't believe everything the docs
+//! say :-)
+//!
 //! The parser doesn't know about concrete representation of tokens and syntax
-//! trees. Abstract [`TokenSource`] and [`TreeSink`] traits are used instead.
-//! As a consequence, this crate does not contain a lexer.
+//! trees. Abstract [`TokenSource`] and [`TreeSink`] traits are used instead. As
+//! a consequence, this crate does not contain a lexer.
 //!
 //! The [`Parser`] struct from the [`parser`] module is a cursor into the
 //! sequence of tokens. Parsing routines use [`Parser`] to inspect current
crates/parser/src/tokens.rs
@@ -1,3 +1,8 @@
+//! Input for the parser -- a sequence of tokens.
+//!
+//! As of now, the parser doesn't have access to the *text* of the tokens, and makes
+//! decisions based solely on their classification.
+
 use crate::SyntaxKind;

 #[allow(non_camel_case_types)]
@@ -28,6 +33,22 @@ impl Tokens {
     pub fn push(&mut self, kind: SyntaxKind) {
         self.push_impl(kind, SyntaxKind::EOF)
     }
+    /// Sets jointness for the last token we've pushed.
+    ///
+    /// This is a separate API rather than an argument to `push` to make it
+    /// convenient for both textual and mbe tokens. With text, you know whether
+    /// the *previous* token was joint; with mbe, you know whether the *current*
+    /// one is joint. This API allows for both styles of usage:
+    ///
+    /// ```
+    /// // In text:
+    /// tokens.was_joint(prev_joint);
+    /// tokens.push(curr);
+    ///
+    /// // In MBE:
+    /// tokens.push(curr);
+    /// tokens.was_joint(curr_joint);
+    /// ```
     pub fn was_joint(&mut self, yes: bool) {
         let idx = self.len();
         if yes && idx > 0 {
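The hunk ends inside `was_joint`. Presumably the remaining body marks the previously pushed token as glued to the one that follows; a minimal sketch, assuming a hypothetical `set_joint` helper on the struct-of-arrays buffer:

// Sketch only; `set_joint` is an assumed helper, not shown in this diff.
pub fn was_joint(&mut self, yes: bool) {
    let idx = self.len();
    if yes && idx > 0 {
        // Token idx - 1 (the last one pushed) is joint with the next token.
        self.set_joint(idx - 1);
    }
}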