From b0f53984123f5b18714fbb45e64021c4be5d88d6 Mon Sep 17 00:00:00 2001
From: Myriad-Dreamin <35292584+Myriad-Dreamin@users.noreply.github.com>
Date: Mon, 7 Oct 2024 11:58:48 +0800
Subject: [PATCH] fix: pass `to_multiline_tokens2` checking by copilot (#639)

* test: generate and pass sema_tokens sanity checking by copilot

* test: pass sema_tokens sanity checking

* dev: update snapshot
---
 .../snaps/test@content-block.typ.snap         |   2 +-
 .../semantic_tokens/snaps/test@fn2.typ.snap   |   2 +-
 .../snaps/test@heading.typ.snap               |   2 +-
 .../snaps/test@tinymist_issue_601.typ.snap    |   2 +-
 .../snaps/test@tinymist_issue_638.typ.snap    |   6 +
 .../snaps/test@typst_lsp_issue_264.typ.snap   |   2 +-
 .../semantic_tokens/tinymist_issue_638.typ    |   1 +
 .../tinymist-query/src/semantic_tokens/mod.rs | 106 ++++++++++--------
 .../src/semantic_tokens_full.rs               |  91 +++++++++++++++
 tests/e2e/main.rs                             |   4 +-
 10 files changed, 166 insertions(+), 52 deletions(-)
 create mode 100644 crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@tinymist_issue_638.typ.snap
 create mode 100644 crates/tinymist-query/src/fixtures/semantic_tokens/tinymist_issue_638.typ

diff --git a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@content-block.typ.snap b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@content-block.typ.snap
index 86340a96..73c676f7 100644
--- a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@content-block.typ.snap
+++ b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@content-block.typ.snap
@@ -3,4 +3,4 @@ source: crates/tinymist-query/src/semantic_tokens_full.rs
 expression: "serde_json::to_string(&result).unwrap()"
 input_file: crates/tinymist-query/src/fixtures/semantic_tokens/content-block.typ
 ---
-{"data":[0,0,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0,1,0,0,20,0,0,0,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,0,20,0,0,0,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,3,20,0,0,3,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0,1,0,0,20,0]}
+{"data":[0,0,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0,1,0,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,3,20,0,0,3,1,8,0,0,1,1,8,0,0,1,1,8,0,0,1,1,20,0]}
diff --git a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@fn2.typ.snap b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@fn2.typ.snap
index 00cd2615..bfd4b895 100644
--- a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@fn2.typ.snap
+++ b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@fn2.typ.snap
@@ -3,4 +3,4 @@ source: crates/tinymist-query/src/semantic_tokens_full.rs
 expression: "serde_json::to_string(&result).unwrap()"
 input_file: crates/tinymist-query/src/fixtures/semantic_tokens/fn2.typ
 ---
-{"data":[0,0,1,2,0,0,1,3,2,0,0,3,1,20,0,0,1,2,5,0,0,2,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,3,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,2,20,0,0,2,3,5,0,0,3,1,8,0,0,1,7,1,0,0,7,1,8,0,0,1,1,20,0,0,1,4,18,0,0,4,1,8,0,0,1,1,20,0,0,1,5,1,0,0,5,1,8,0,0,1,1,20,0,1,0,2,20,0,0,2,3,20,0,0,0,3,11,0,0,3,3,20,0,0,0,3,11,0,0,3,1,20,0,1,0,2,20,0,0,2,1,20,0,0,0,1,11,0,0,1,1,20,0,1,0,2,20,0,0,2,1,20,0,0,0,1,11,0,0,1,1,20,0,1,0,2,20,0,0,2,3,20,0,0,0,3,11,0,0,3,1,20,0,1,0,2,20,0,0,2,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,0,20,0,0,0,1,8,0]}
+{"data":[0,0,1,2,0,0,1,3,2,0,0,3,1,20,0,0,1,2,5,0,0,2,1,8,0,0,1,1,8,0,0,1,1,20,0,0,1,1,3,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,2,20,0,0,2,3,5,0,0,3,1,8,0,0,1,7,1,0,0,7,1,8,0,0,1,1,20,0,0,1,4,18,0,0,4,1,8,0,0,1,1,20,0,0,1,5,1,0,0,5,1,8,0,0,1,1,20,0,1,0,2,20,0,0,2,3,20,0,0,3,3,20,0,0,3,1,20,0,1,0,2,20,0,0,2,1,20,0,0,1,1,20,0,1,0,2,20,0,0,2,1,20,0,0,1,1,20,0,1,0,2,20,0,0,2,3,20,0,0,3,1,20,0,1,0,2,20,0,0,2,1,8,0,0,1,1,20,0,0,1,1,8,0,0,1,1,20,0,1,0,1,8,0]}
diff --git a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@heading.typ.snap b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@heading.typ.snap
index 900c1a58..fca0b165 100644
--- a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@heading.typ.snap
+++ b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@heading.typ.snap
@@ -3,4 +3,4 @@ source: crates/tinymist-query/src/semantic_tokens_full.rs
 expression: "serde_json::to_string(&result).unwrap()"
 input_file: crates/tinymist-query/src/fixtures/semantic_tokens/heading.typ
 ---
-{"data":[0,0,3,14,0,0,0,3,14,0,0,3,1,20,0,0,0,1,14,0,0,1,1,5,0,0,0,1,14,0,0,1,4,5,0,0,0,4,14,0,0,4,1,8,0,0,0,1,14,0,0,1,17,1,0,0,0,17,14,0,0,17,1,8,0,0,0,1,14,0]}
+{"data":[0,0,3,14,0,0,3,1,20,0,0,1,1,5,0,0,1,4,5,0,0,4,1,8,0,0,1,17,1,0,0,17,1,8,0]}
diff --git a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@tinymist_issue_601.typ.snap b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@tinymist_issue_601.typ.snap
index b6713908..7ca58bac 100644
--- a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@tinymist_issue_601.typ.snap
+++ b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@tinymist_issue_601.typ.snap
@@ -3,4 +3,4 @@ source: crates/tinymist-query/src/semantic_tokens_full.rs
 expression: "serde_json::to_string(&result).unwrap()"
 input_file: crates/tinymist-query/src/fixtures/semantic_tokens/tinymist_issue_601.typ
 ---
-{"data":[0,0,1,19,4,0,1,3,5,4,0,3,1,8,4,0,1,1,20,4,0,1,1,3,4,0,1,1,20,4,0,1,1,20,4,1,0,0,20,4,0,0,1,20,4,0,1,1,20,4,1,0,1,20,4,1,0,0,20,4,0,0,0,20,0]}
+{"data":[0,0,1,19,4,0,1,3,5,4,0,3,1,8,4,0,1,1,20,4,0,1,1,3,4,0,1,1,20,4,0,1,1,20,4,1,0,1,20,4,0,1,1,20,4,1,0,1,20,4]}
diff --git a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@tinymist_issue_638.typ.snap b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@tinymist_issue_638.typ.snap
new file mode 100644
index 00000000..fd432adb
--- /dev/null
+++ b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@tinymist_issue_638.typ.snap
@@ -0,0 +1,6 @@
+---
+source: crates/tinymist-query/src/semantic_tokens_full.rs
+expression: "serde_json::to_string(&result).unwrap()"
+input_file: crates/tinymist-query/src/fixtures/semantic_tokens/tinymist_issue_638.typ
+---
+{"data":[0,0,9,20,0,0,9,1,20,0]}
diff --git a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@typst_lsp_issue_264.typ.snap b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@typst_lsp_issue_264.typ.snap
index 377837fe..0fdffa01 100644
--- a/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@typst_lsp_issue_264.typ.snap
+++ b/crates/tinymist-query/src/fixtures/semantic_tokens/snaps/test@typst_lsp_issue_264.typ.snap
@@ -3,4 +3,4 @@ source: crates/tinymist-query/src/semantic_tokens_full.rs
 expression: "serde_json::to_string(&result).unwrap()"
 input_file: crates/tinymist-query/src/fixtures/semantic_tokens/typst_lsp_issue_264.typ
 ---
-{"data":[0,0,3,0,0,1,0,4,0,0,1,0,3,0,0,1,0,2,0,0,0,2,1,20,0,1,0,1,20,0,1,0,0,20,0,0,0,3,20,0,0,0,3,11,0,0,3,4,20,0,0,0,4,11,0,0,4,1,20,0,1,0,0,20,0,0,0,17,20,0,0,0,17,11,0,0,17,1,20,0,1,0,0,20,0,0,0,3,20,0,0,0,3,11,0]}
+{"data":[0,0,3,0,0,1,0,4,0,0,1,0,3,0,0,1,0,2,0,0,0,2,1,20,0,1,0,1,20,0,1,0,3,20,0,0,3,4,20,0,0,4,1,20,0,1,0,17,20,0,0,17,1,20,0,1,0,3,20,0]}
diff --git a/crates/tinymist-query/src/fixtures/semantic_tokens/tinymist_issue_638.typ b/crates/tinymist-query/src/fixtures/semantic_tokens/tinymist_issue_638.typ
new file mode 100644
index 00000000..933643cf
--- /dev/null
+++ b/crates/tinymist-query/src/fixtures/semantic_tokens/tinymist_issue_638.typ
@@ -0,0 +1 @@
+Bob’s car
diff --git a/crates/tinymist-query/src/semantic_tokens/mod.rs b/crates/tinymist-query/src/semantic_tokens/mod.rs
index ea59ac21..bb077a18 100644
--- a/crates/tinymist-query/src/semantic_tokens/mod.rs
+++ b/crates/tinymist-query/src/semantic_tokens/mod.rs
@@ -85,7 +85,7 @@ struct Tokenizer {
 
     allow_multiline_token: bool,
 
-    token: Token,
+    token: Option<Token>,
 }
 
 impl Tokenizer {
@@ -98,7 +98,7 @@ impl Tokenizer {
             allow_multiline_token,
             encoding,
 
-            token: Token::default(),
+            token: None,
         }
     }
 
@@ -113,33 +113,35 @@ impl Tokenizer {
             .map(|token_type| Token::new(token_type, modifiers, range.clone()));
 
         // Push start
-        if !self.token.range.is_empty() && self.token.range.start < range.start {
-            let end = self.token.range.end.min(range.start);
-            self.push(Token {
-                token_type: self.token.token_type,
-                modifiers: self.token.modifiers,
-                range: self.token.range.start..end,
-            });
-            self.token.range.start = end;
+        if let Some(prev_token) = self.token.as_mut() {
+            if !prev_token.range.is_empty() && prev_token.range.start < range.start {
+                let end = prev_token.range.end.min(range.start);
+                let sliced = Token {
+                    token_type: prev_token.token_type,
+                    modifiers: prev_token.modifiers,
+                    range: prev_token.range.start..end,
+                };
+                // Slice the previous token
+                prev_token.range.start = end;
+                self.push(sliced);
+            }
         }
 
         if !is_leaf {
-            if let Some(token) = token.as_mut() {
-                std::mem::swap(&mut self.token, token);
-            }
-
+            std::mem::swap(&mut self.token, &mut token);
             for child in root.children() {
                 self.tokenize_tree(&child, modifiers);
             }
-
-            if let Some(token) = token.as_mut() {
-                std::mem::swap(&mut self.token, token);
-            }
+            std::mem::swap(&mut self.token, &mut token);
         }
 
         // Push end
         if let Some(token) = token.clone() {
             if !token.range.is_empty() {
+                // Slice the previous token
+                if let Some(prev_token) = self.token.as_mut() {
+                    prev_token.range.start = token.range.end;
+                }
                 self.push(token);
             }
         }
@@ -160,16 +162,16 @@ impl Tokenizer {
         }
 
         // This might be a bug of typst, that `end > len` is possible
-        let utf8_end = (range.end).min(self.source.text().len());
+        let source_len = self.source.text().len();
+        let utf8_end = (range.end).min(source_len);
         self.pos_offset = utf8_start;
 
-        if utf8_end < range.start || range.start > self.source.text().len() {
+        if utf8_end <= utf8_start || utf8_start > source_len {
             return;
         }
         let position = typst_to_lsp::offset_to_position(utf8_start, self.encoding, &self.source);
 
         let delta = self.curr_pos.delta(&position);
-        self.curr_pos = position;
 
         let encode_length = |s, t| {
             match self.encoding {
@@ -191,6 +193,7 @@ impl Tokenizer {
                 token_type: token_type as u32,
                 token_modifiers_bitset: modifiers.bitset(),
             });
+            self.curr_pos = position;
         } else {
             let final_line = self
                 .source
@@ -199,38 +202,51 @@ impl Tokenizer {
             let next_offset = self
                 .source
                 .line_to_byte((self.curr_pos.line + 1) as usize)
-                .unwrap_or(self.source.text().len());
-            self.output.push(SemanticToken {
-                delta_line: delta.delta_line,
-                delta_start: delta.delta_start,
-                length: encode_length(utf8_start, utf8_end.min(next_offset)) as u32,
-                token_type: token_type as u32,
-                token_modifiers_bitset: modifiers.bitset(),
-            });
-            let mut utf8_cursor = next_offset;
-            if self.curr_pos.line < final_line {
-                for line in self.curr_pos.line + 1..=final_line {
-                    let next_offset = if line == final_line {
-                        utf8_end
-                    } else {
-                        self.source
-                            .line_to_byte((line + 1) as usize)
-                            .unwrap_or(self.source.text().len())
-                    };
+                .unwrap_or(source_len);
+            let inline_length = encode_length(utf8_start, utf8_end.min(next_offset)) as u32;
+            if inline_length != 0 {
+                self.output.push(SemanticToken {
+                    delta_line: delta.delta_line,
+                    delta_start: delta.delta_start,
+                    length: inline_length,
+                    token_type: token_type as u32,
+                    token_modifiers_bitset: modifiers.bitset(),
+                });
+                self.curr_pos = position;
+            }
+            if self.curr_pos.line >= final_line {
+                return;
+            }
+
+            let mut utf8_cursor = next_offset;
+            let mut delta_line = 0;
+            for line in self.curr_pos.line + 1..=final_line {
+                let next_offset = if line == final_line {
+                    utf8_end
+                } else {
+                    self.source
+                        .line_to_byte((line + 1) as usize)
+                        .unwrap_or(source_len)
+                };
+
+                if utf8_cursor < next_offset {
+                    let inline_length = encode_length(utf8_cursor, next_offset) as u32;
                     self.output.push(SemanticToken {
-                        delta_line: 1,
+                        delta_line: delta_line + 1,
                         delta_start: 0,
-                        length: encode_length(utf8_cursor, next_offset) as u32,
+                        length: inline_length,
                         token_type: token_type as u32,
                         token_modifiers_bitset: modifiers.bitset(),
                     });
-                    self.pos_offset = utf8_cursor;
-                    utf8_cursor = next_offset;
+                    delta_line = 0;
+                    self.curr_pos.character = 0;
+                } else {
+                    delta_line += 1;
                 }
-                self.curr_pos.line = final_line;
-                self.curr_pos.character = 0;
+                self.pos_offset = utf8_cursor;
+                utf8_cursor = next_offset;
             }
+            self.curr_pos.line = final_line - delta_line;
         }
     }
 }
 
 pub trait PositionExt {
diff --git a/crates/tinymist-query/src/semantic_tokens_full.rs b/crates/tinymist-query/src/semantic_tokens_full.rs
index 213588b0..8932b692 100644
--- a/crates/tinymist-query/src/semantic_tokens_full.rs
+++ b/crates/tinymist-query/src/semantic_tokens_full.rs
@@ -46,6 +46,88 @@ mod tests {
     use super::*;
     use crate::tests::*;
 
+    /// This is converted by Copilot from TypeScript `to_multiline_tokens2`.
+    ///
+    fn check_tokens(tokens: &SemanticTokens) {
+        const DESIRED_TOKENS_PER_AREA: usize = 400;
+        const DESIRED_MAX_AREAS: usize = 1024;
+
+        let src_data = &tokens.data;
+        let token_count = src_data.len();
+        let tokens_per_area = std::cmp::max(
+            (token_count as f64 / DESIRED_MAX_AREAS as f64).ceil() as usize,
+            DESIRED_TOKENS_PER_AREA,
+        );
+
+        let mut token_index = 0;
+        let mut last_line_number = 1;
+        let mut last_start_character = 0;
+
+        while token_index < token_count {
+            let token_start_index = token_index;
+            let mut token_end_index =
+                std::cmp::min(token_start_index + tokens_per_area, token_count);
+
+            // Keep tokens on the same line in the same area...
+            if token_end_index < token_count {
+                let mut small_token_end_index = token_end_index;
+                while small_token_end_index - 1 > token_start_index
+                    && src_data[small_token_end_index].delta_line == 0
+                {
+                    small_token_end_index -= 1;
+                }
+
+                if small_token_end_index - 1 == token_start_index {
+                    // there are so many tokens on this line that our area would be empty, we must
+                    // now go right
+                    let mut big_token_end_index = token_end_index;
+                    while big_token_end_index + 1 < token_count
+                        && src_data[big_token_end_index].delta_line == 0
+                    {
+                        big_token_end_index += 1;
+                    }
+                    token_end_index = big_token_end_index;
+                } else {
+                    token_end_index = small_token_end_index;
+                }
+            }
+
+            let mut prev_line_number = 0;
+            let mut prev_end_character = 0;
+
+            while token_index < token_end_index {
+                let delta_line = src_data[token_index].delta_line;
+                let delta_character = src_data[token_index].delta_start;
+                let length = src_data[token_index].length;
+                let line_number = last_line_number + delta_line;
+                let start_character = if delta_line == 0 {
+                    last_start_character + delta_character
+                } else {
+                    delta_character
+                };
+                // delta_character
+                let end_character = start_character + length;
+
+                if end_character <= start_character {
+                    // this token is invalid (most likely a negative length casted to uint32)
+                    panic!(
+                        "Invalid length for semantic token at line {line_number}, character {start_character}, end: {end_character}"
+                    );
+                } else if prev_line_number == line_number && prev_end_character > start_character {
+                    // this token overlaps with the previous token
+                    panic!("Overlapping semantic tokens at line {line_number}, character {start_character}, previous line {prev_line_number}, previous end {prev_end_character}");
+                } else {
+                    prev_line_number = line_number;
+                    prev_end_character = end_character;
+                }
+
+                last_line_number = line_number;
+                last_start_character = start_character;
+                token_index += 1;
+            }
+        }
+    }
+
     #[test]
     fn test() {
         snapshot_testing("semantic_tokens", &|ctx, path| {
@@ -60,6 +142,15 @@ mod tests {
                 tokens.result_id.take();
             }
 
+            match &result {
+                SemanticTokensResult::Tokens(tokens) => {
+                    check_tokens(tokens);
+                }
+                SemanticTokensResult::Partial(_) => {
+                    panic!("Unexpected partial result");
+                }
+            }
+
             assert_snapshot!(serde_json::to_string(&result).unwrap());
         });
     }
diff --git a/tests/e2e/main.rs b/tests/e2e/main.rs
index f448bbfd..bf22f1c6 100644
--- a/tests/e2e/main.rs
+++ b/tests/e2e/main.rs
@@ -374,7 +374,7 @@ fn e2e() {
         });
 
         let hash = replay_log(&tinymist_binary, &root.join("neovim"));
-        insta::assert_snapshot!(hash, @"siphash128_13:57d85be77d6449db5c0bd9c9b3a7a480");
+        insta::assert_snapshot!(hash, @"siphash128_13:95195259988d97cae5802b09f2aa6c0");
     }
 
     {
@@ -385,7 +385,7 @@ fn e2e() {
         });
 
         let hash = replay_log(&tinymist_binary, &root.join("vscode"));
-        insta::assert_snapshot!(hash, @"siphash128_13:2a067ea884ed66dcf681c3fa7ec167a3");
+        insta::assert_snapshot!(hash, @"siphash128_13:5ed17b7e6504fbe1d77e2df03a6bd1ce");
    }
}
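
A note on the snapshot payloads above: each semantic token on the LSP wire is five u32s (deltaLine, deltaStart, length, tokenType, tokenModifiers), with the position delta-encoded against the previous token. The `check_tokens` port in this patch panics on exactly two conditions — a zero-length token, and a token that overlaps its same-line predecessor — both of which the old snapshots contained (for example, the run `1,0,0,20,4` in the old tinymist_issue_601 data is a zero-length token). The following standalone Rust sketch is illustrative only, not part of the patch (`decode_and_check` and its `main` harness are names made up here); it decodes a stream under those two rules and replays the new `tinymist_issue_638` snapshot for `Bob’s car`:

// Minimal sketch of the invariants `check_tokens` enforces, assuming only the
// LSP semantic-token relative encoding (not tied to tinymist internals).
fn decode_and_check(data: &[u32]) -> Vec<(u32, u32, u32)> {
    assert_eq!(data.len() % 5, 0, "a token is exactly five u32s");
    let (mut line, mut start) = (0u32, 0u32);
    let mut decoded: Vec<(u32, u32, u32)> = Vec::new();
    for tok in data.chunks_exact(5) {
        let (delta_line, delta_start, length) = (tok[0], tok[1], tok[2]);
        line += delta_line;
        // deltaStart is relative to the previous token only on the same line.
        start = if delta_line == 0 { start + delta_start } else { delta_start };
        // Invariant 1: no zero-length tokens (a negative length cast to u32
        // is the usual culprit in the real checker).
        assert!(length > 0, "zero-length token at {line}:{start}");
        // Invariant 2: no overlap with the same-line predecessor.
        if let Some(&(prev_line, _, prev_end)) = decoded.last() {
            assert!(prev_line != line || prev_end <= start, "overlap at {line}:{start}");
        }
        decoded.push((line, start, start + length));
    }
    decoded
}

fn main() {
    // Data from the new test@tinymist_issue_638.typ.snap (`Bob’s car`):
    // one 9-unit token at column 0, then a 1-unit token at column 9.
    let decoded = decode_and_check(&[0, 0, 9, 20, 0, 0, 9, 1, 20, 0]);
    assert_eq!(decoded, vec![(0, 0, 9), (0, 9, 10)]);
}

Run against the old heading snapshot instead, the overlap assertion fires immediately: its data begins `0,0,3,14,0, 0,0,3,14,0`, two tokens starting at the same position — exactly the duplicated output this PR's tokenizer rewrite eliminates.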