Mirror of https://github.com/Myriad-Dreamin/tinymist.git (synced 2025-11-24 21:19:37 +00:00)
fix: pass to_multiline_tokens2 checking by copilot (#639)
* test: generate and pass sema_tokens sanity checking by copilot
* test: pass sema_tokens sanity checking
* dev: update snapshot
parent d4492e0436 · commit b0f5398412
10 changed files with 166 additions and 52 deletions
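Most of the changed lines below are insta snapshots of LSP semantic tokens. Per the LSP specification, a `data` array stores five integers per token: `deltaLine`, `deltaStart` (relative to the previous token), `length`, `tokenType`, and `tokenModifiers`. The helper below is an illustrative sketch only, not part of this commit; it decodes such an array into absolute positions, which makes the snapshot diffs easier to compare.

    /// Illustrative sketch: decode an LSP `data` array into absolute
    /// (line, start_character, length, token_type) tuples. The fifth value of
    /// each five-integer group (the modifier bitset) is ignored here.
    fn decode_semantic_tokens(data: &[u32]) -> Vec<(u32, u32, u32, u32)> {
        let mut out = Vec::new();
        let (mut line, mut start) = (0u32, 0u32);
        for tok in data.chunks_exact(5) {
            line += tok[0]; // deltaLine
            // deltaStart is relative to the previous token only on the same line.
            start = if tok[0] == 0 { start + tok[1] } else { tok[1] };
            out.push((line, start, tok[2], tok[3]));
        }
        out
    }

Decoded this way, several of the old arrays below contain `1,0,0,20,0` groups, i.e. zero-length tokens on a following line; those groups are gone from the updated snapshots.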
@@ -3,4 +3,4 @@ source: crates/tinymist-query/src/semantic_tokens_full.rs
 expression: "serde_json::to_string(&result).unwrap()"
 input_file: crates/tinymist-query/src/fixtures/semantic_tokens/content-block.typ
 ---
-{"data":[0,0,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0,1,0,0,20,0,0,0,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,0,20,0,0,0,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,3,20,0,0,3,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0,1,0,0,20,0]}
+{"data":[0,0,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0,1,0,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,3,20,0,0,3,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0]}
@@ -3,4 +3,4 @@ source: crates/tinymist-query/src/semantic_tokens_full.rs
 expression: "serde_json::to_string(&result).unwrap()"
 input_file: crates/tinymist-query/src/fixtures/semantic_tokens/fn2.typ
 ---
-{"data":[0,0,1,2,0,0,1,3,2,0,0,3,1,20,0,0,1,2,5,0,0,2,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,3,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,2,20,0,0,2,3,5,0,0,3,1,8,0,0,1,7,1,0,0,7,1,8,0,0,1,1,20,0,0,1,4,18,0,0,4,1,8,0,0,1,1,20,0,0,1,5,1,0,0,5,1,8,0,0,1,1,20,0,1,0,2,20,0,0,2,3,20,0,0,0,3,11,0,0,3,3,20,0,0,0,3,11,0,0,3,1,20,0,1,0,2,20,0,0,2,1,20,0,0,0,1,11,0,0,1,1,20,0,1,0,2,20,0,0,2,1,20,0,0,0,1,11,0,0,1,1,20,0,1,0,2,20,0,0,2,3,20,0,0,0,3,11,0,0,3,1,20,0,1,0,2,20,0,0,2,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,0,20,0,0,0,1,8,0]}
+{"data":[0,0,1,2,0,0,1,3,2,0,0,3,1,20,0,0,1,2,5,0,0,2,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,3,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,2,20,0,0,2,3,5,0,0,3,1,8,0,0,1,7,1,0,0,7,1,8,0,0,1,1,20,0,0,1,4,18,0,0,4,1,8,0,0,1,1,20,0,0,1,5,1,0,0,5,1,8,0,0,1,1,20,0,1,0,2,20,0,0,2,3,20,0,0,3,3,20,0,0,3,1,20,0,1,0,2,20,0,0,2,1,20,0,0,1,1,20,0,1,0,2,20,0,0,2,1,20,0,0,1,1,20,0,1,0,2,20,0,0,2,3,20,0,0,3,1,20,0,1,0,2,20,0,0,2,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,1,8,0]}
@@ -3,4 +3,4 @@ source: crates/tinymist-query/src/semantic_tokens_full.rs
 expression: "serde_json::to_string(&result).unwrap()"
 input_file: crates/tinymist-query/src/fixtures/semantic_tokens/heading.typ
 ---
-{"data":[0,0,3,14,0,0,0,3,14,0,0,3,1,20,0,0,0,1,14,0,0,1,1,5,0,0,0,1,14,0,0,1,4,5,0,0,0,4,14,0,0,4,1,8,0,0,0,1,14,0,0,1,17,1,0,0,0,17,14,0,0,17,1,8,0,0,0,1,14,0]}
+{"data":[0,0,3,14,0,0,3,1,20,0,0,1,1,5,0,0,1,4,5,0,0,4,1,8,0,0,1,17,1,0,0,17,1,8,0]}
@@ -3,4 +3,4 @@ source: crates/tinymist-query/src/semantic_tokens_full.rs
 expression: "serde_json::to_string(&result).unwrap()"
 input_file: crates/tinymist-query/src/fixtures/semantic_tokens/tinymist_issue_601.typ
 ---
-{"data":[0,0,1,19,4,0,1,3,5,4,0,3,1,8,4,0,1,1,20,4,0,1,1,3,4,0,1,1,20,4,0,1,1,20,4,1,0,0,20,4,0,0,1,20,4,0,1,1,20,4,1,0,1,20,4,1,0,0,20,4,0,0,0,20,0]}
+{"data":[0,0,1,19,4,0,1,3,5,4,0,3,1,8,4,0,1,1,20,4,0,1,1,3,4,0,1,1,20,4,0,1,1,20,4,1,0,1,20,4,0,1,1,20,4,1,0,1,20,4]}
@@ -0,0 +1,6 @@
+---
+source: crates/tinymist-query/src/semantic_tokens_full.rs
+expression: "serde_json::to_string(&result).unwrap()"
+input_file: crates/tinymist-query/src/fixtures/semantic_tokens/tinymist_issue_638.typ
+---
+{"data":[0,0,9,20,0,0,9,1,20,0]}
@@ -3,4 +3,4 @@ source: crates/tinymist-query/src/semantic_tokens_full.rs
 expression: "serde_json::to_string(&result).unwrap()"
 input_file: crates/tinymist-query/src/fixtures/semantic_tokens/typst_lsp_issue_264.typ
 ---
-{"data":[0,0,3,0,0,1,0,4,0,0,1,0,3,0,0,1,0,2,0,0,0,2,1,20,0,1,0,1,20,0,1,0,0,20,0,0,0,3,20,0,0,0,3,11,0,0,3,4,20,0,0,0,4,11,0,0,4,1,20,0,1,0,0,20,0,0,0,17,20,0,0,0,17,11,0,0,17,1,20,0,1,0,0,20,0,0,0,3,20,0,0,0,3,11,0]}
+{"data":[0,0,3,0,0,1,0,4,0,0,1,0,3,0,0,1,0,2,0,0,0,2,1,20,0,1,0,1,20,0,1,0,3,20,0,0,3,4,20,0,0,4,1,20,0,1,0,17,20,0,0,17,1,20,0,1,0,3,20,0]}
@@ -0,0 +1 @@
+Bob’s car
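Assuming the illustrative `decode_semantic_tokens` helper sketched above, the snapshot added for this one-line fixture decodes to two non-overlapping tokens on line 0:

    assert_eq!(
        decode_semantic_tokens(&[0, 0, 9, 20, 0, 0, 9, 1, 20, 0]),
        vec![(0, 0, 9, 20), (0, 9, 1, 20)],
    );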
@@ -85,7 +85,7 @@ struct Tokenizer {

     allow_multiline_token: bool,

-    token: Token,
+    token: Option<Token>,
 }

 impl Tokenizer {
@@ -98,7 +98,7 @@ impl Tokenizer {
             allow_multiline_token,
             encoding,

-            token: Token::default(),
+            token: None,
         }
     }

@@ -113,33 +113,35 @@ impl Tokenizer {
             .map(|token_type| Token::new(token_type, modifiers, range.clone()));

         // Push start
-        if !self.token.range.is_empty() && self.token.range.start < range.start {
-            let end = self.token.range.end.min(range.start);
-            self.push(Token {
-                token_type: self.token.token_type,
-                modifiers: self.token.modifiers,
-                range: self.token.range.start..end,
-            });
-            self.token.range.start = end;
+        if let Some(prev_token) = self.token.as_mut() {
+            if !prev_token.range.is_empty() && prev_token.range.start < range.start {
+                let end = prev_token.range.end.min(range.start);
+                let sliced = Token {
+                    token_type: prev_token.token_type,
+                    modifiers: prev_token.modifiers,
+                    range: prev_token.range.start..end,
+                };
+                // Slice the previous token
+                prev_token.range.start = end;
+                self.push(sliced);
+            }
         }

         if !is_leaf {
-            if let Some(token) = token.as_mut() {
-                std::mem::swap(&mut self.token, token);
-            }
-
+            std::mem::swap(&mut self.token, &mut token);
             for child in root.children() {
                 self.tokenize_tree(&child, modifiers);
             }
-
-            if let Some(token) = token.as_mut() {
-                std::mem::swap(&mut self.token, token);
-            }
+            std::mem::swap(&mut self.token, &mut token);
         }

         // Push end
         if let Some(token) = token.clone() {
             if !token.range.is_empty() {
+                // Slice the previous token
+                if let Some(prev_token) = self.token.as_mut() {
+                    prev_token.range.start = token.range.end;
+                }
                 self.push(token);
             }
         }
@@ -160,16 +162,16 @@ impl Tokenizer {
         }

         // This might be a bug of typst, that `end > len` is possible
-        let utf8_end = (range.end).min(self.source.text().len());
+        let source_len = self.source.text().len();
+        let utf8_end = (range.end).min(source_len);
         self.pos_offset = utf8_start;
-        if utf8_end < range.start || range.start > self.source.text().len() {
+        if utf8_end <= utf8_start || utf8_start > source_len {
             return;
         }

         let position = typst_to_lsp::offset_to_position(utf8_start, self.encoding, &self.source);

         let delta = self.curr_pos.delta(&position);
-        self.curr_pos = position;

         let encode_length = |s, t| {
             match self.encoding {
@@ -191,6 +193,7 @@ impl Tokenizer {
                 token_type: token_type as u32,
                 token_modifiers_bitset: modifiers.bitset(),
             });
+            self.curr_pos = position;
         } else {
             let final_line = self
                 .source
@@ -199,38 +202,51 @@ impl Tokenizer {
             let next_offset = self
                 .source
                 .line_to_byte((self.curr_pos.line + 1) as usize)
-                .unwrap_or(self.source.text().len());
-            self.output.push(SemanticToken {
-                delta_line: delta.delta_line,
-                delta_start: delta.delta_start,
-                length: encode_length(utf8_start, utf8_end.min(next_offset)) as u32,
-                token_type: token_type as u32,
-                token_modifiers_bitset: modifiers.bitset(),
-            });
-            let mut utf8_cursor = next_offset;
-            if self.curr_pos.line < final_line {
-                for line in self.curr_pos.line + 1..=final_line {
-                    let next_offset = if line == final_line {
-                        utf8_end
-                    } else {
-                        self.source
-                            .line_to_byte((line + 1) as usize)
-                            .unwrap_or(self.source.text().len())
-                    };
-
-                    self.output.push(SemanticToken {
-                        delta_line: 1,
-                        delta_start: 0,
-                        length: encode_length(utf8_cursor, next_offset) as u32,
-                        token_type: token_type as u32,
-                        token_modifiers_bitset: modifiers.bitset(),
-                    });
-                    self.pos_offset = utf8_cursor;
-                    utf8_cursor = next_offset;
-                }
-                self.curr_pos.line = final_line;
-                self.curr_pos.character = 0;
-            }
+                .unwrap_or(source_len);
+            let inline_length = encode_length(utf8_start, utf8_end.min(next_offset)) as u32;
+            if inline_length != 0 {
+                self.output.push(SemanticToken {
+                    delta_line: delta.delta_line,
+                    delta_start: delta.delta_start,
+                    length: inline_length,
+                    token_type: token_type as u32,
+                    token_modifiers_bitset: modifiers.bitset(),
+                });
+                self.curr_pos = position;
+            }
+            if self.curr_pos.line >= final_line {
+                return;
+            }
+
+            let mut utf8_cursor = next_offset;
+            let mut delta_line = 0;
+            for line in self.curr_pos.line + 1..=final_line {
+                let next_offset = if line == final_line {
+                    utf8_end
+                } else {
+                    self.source
+                        .line_to_byte((line + 1) as usize)
+                        .unwrap_or(source_len)
+                };
+
+                if utf8_cursor < next_offset {
+                    let inline_length = encode_length(utf8_cursor, next_offset) as u32;
+                    self.output.push(SemanticToken {
+                        delta_line: delta_line + 1,
+                        delta_start: 0,
+                        length: inline_length,
+                        token_type: token_type as u32,
+                        token_modifiers_bitset: modifiers.bitset(),
+                    });
+                    delta_line = 0;
+                    self.curr_pos.character = 0;
+                } else {
+                    delta_line += 1;
+                }
+                self.pos_offset = utf8_cursor;
+                utf8_cursor = next_offset;
+            }
+            self.curr_pos.line = final_line - delta_line;
         }

 pub trait PositionExt {
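The rewritten loop above only emits a token when the per-line segment is non-empty (`inline_length != 0`, `utf8_cursor < next_offset`) and otherwise carries the skipped line over into the next token's `delta_line`; the old version pushed unconditionally, which is where the zero-length `1,0,0,20,0` entries in the old snapshots came from, and those are exactly what the `check_tokens` sanity check added below panics on. A minimal standalone sketch of the same idea, with hypothetical names and the token's text pre-split into per-line segments:

    /// Illustrative only: turn the per-line segments of one multi-line token into
    /// (delta_line, length) pairs, where delta_line is relative to the previously
    /// emitted line. Empty segments emit nothing; their line offset is folded into
    /// the next non-empty segment, so no zero-length token is produced.
    fn split_into_line_tokens(segments: &[&str]) -> Vec<(u32, u32)> {
        let mut out = Vec::new();
        let mut delta_line = 0; // the first entry is relative to the token's own first line
        for seg in segments {
            let len = seg.chars().count() as u32;
            if len != 0 {
                out.push((delta_line, len));
                delta_line = 1;
            } else {
                delta_line += 1;
            }
        }
        out
    }

    // A token spanning "abc", "", "de" becomes [(0, 3), (2, 2)]: the empty middle
    // line emits nothing, and the last segment sits two lines below the first.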
@@ -46,6 +46,88 @@ mod tests {
     use super::*;
     use crate::tests::*;

+    /// This is converted by Copilot from TypeScript `to_multiline_tokens2`.
+    /// <https://github.com/microsoft/vscode/blob/2acc0e52cbc7434c415f221d5c34ee1bbdd6cd71/src/vs/editor/common/services/semanticTokensProviderStyling.ts#L147>
+    fn check_tokens(tokens: &SemanticTokens) {
+        const DESIRED_TOKENS_PER_AREA: usize = 400;
+        const DESIRED_MAX_AREAS: usize = 1024;
+
+        let src_data = &tokens.data;
+        let token_count = src_data.len();
+        let tokens_per_area = std::cmp::max(
+            (token_count as f64 / DESIRED_MAX_AREAS as f64).ceil() as usize,
+            DESIRED_TOKENS_PER_AREA,
+        );
+
+        let mut token_index = 0;
+        let mut last_line_number = 1;
+        let mut last_start_character = 0;
+
+        while token_index < token_count {
+            let token_start_index = token_index;
+            let mut token_end_index =
+                std::cmp::min(token_start_index + tokens_per_area, token_count);
+
+            // Keep tokens on the same line in the same area...
+            if token_end_index < token_count {
+                let mut small_token_end_index = token_end_index;
+                while small_token_end_index - 1 > token_start_index
+                    && src_data[small_token_end_index].delta_line == 0
+                {
+                    small_token_end_index -= 1;
+                }
+
+                if small_token_end_index - 1 == token_start_index {
+                    // there are so many tokens on this line that our area would be empty, we must
+                    // now go right
+                    let mut big_token_end_index = token_end_index;
+                    while big_token_end_index + 1 < token_count
+                        && src_data[big_token_end_index].delta_line == 0
+                    {
+                        big_token_end_index += 1;
+                    }
+                    token_end_index = big_token_end_index;
+                } else {
+                    token_end_index = small_token_end_index;
+                }
+            }
+
+            let mut prev_line_number = 0;
+            let mut prev_end_character = 0;
+
+            while token_index < token_end_index {
+                let delta_line = src_data[token_index].delta_line;
+                let delta_character = src_data[token_index].delta_start;
+                let length = src_data[token_index].length;
+                let line_number = last_line_number + delta_line;
+                let start_character = if delta_line == 0 {
+                    last_start_character + delta_character
+                } else {
+                    delta_character
+                };
+                // delta_character
+                let end_character = start_character + length;
+
+                if end_character <= start_character {
+                    // this token is invalid (most likely a negative length casted to uint32)
+                    panic!(
+                        "Invalid length for semantic token at line {line_number}, character {start_character}, end: {end_character}"
+                    );
+                } else if prev_line_number == line_number && prev_end_character > start_character {
+                    // this token overlaps with the previous token
+                    panic!("Overlapping semantic tokens at line {line_number}, character {start_character}, previous line {prev_line_number}, previous end {prev_end_character}");
+                } else {
+                    prev_line_number = line_number;
+                    prev_end_character = end_character;
+                }
+
+                last_line_number = line_number;
+                last_start_character = start_character;
+                token_index += 1;
+            }
+        }
+    }
+
     #[test]
     fn test() {
         snapshot_testing("semantic_tokens", &|ctx, path| {
@@ -60,6 +142,15 @@ mod tests {
                 tokens.result_id.take();
             }

+            match &result {
+                SemanticTokensResult::Tokens(tokens) => {
+                    check_tokens(tokens);
+                }
+                SemanticTokensResult::Partial(_) => {
+                    panic!("Unexpected partial result");
+                }
+            }
+
             assert_snapshot!(serde_json::to_string(&result).unwrap());
         });
     }
@@ -374,7 +374,7 @@ fn e2e() {
         });

         let hash = replay_log(&tinymist_binary, &root.join("neovim"));
-        insta::assert_snapshot!(hash, @"siphash128_13:57d85be77d6449db5c0bd9c9b3a7a480");
+        insta::assert_snapshot!(hash, @"siphash128_13:95195259988d97cae5802b09f2aa6c0");
     }

     {
@@ -385,7 +385,7 @@ fn e2e() {
         });

         let hash = replay_log(&tinymist_binary, &root.join("vscode"));
-        insta::assert_snapshot!(hash, @"siphash128_13:2a067ea884ed66dcf681c3fa7ec167a3");
+        insta::assert_snapshot!(hash, @"siphash128_13:5ed17b7e6504fbe1d77e2df03a6bd1ce");
     }
 }