//! Semantic tokens (highlighting) support for LSP. use std::collections::HashMap; use std::{ num::NonZeroUsize, ops::Range, path::Path, sync::{Arc, OnceLock}, }; use lsp_types::SemanticToken; use lsp_types::{SemanticTokenModifier, SemanticTokenType}; use parking_lot::Mutex; use strum::EnumIter; use tinymist_std::ImmutPath; use typst::syntax::{LinkedNode, Source, SyntaxKind, ast}; use crate::{ LocalContext, LspPosition, PositionEncoding, adt::revision::{RevisionLock, RevisionManager, RevisionManagerLike, RevisionSlot}, syntax::{Expr, ExprInfo}, ty::Ty, }; /// A shared semantic tokens object. pub type SemanticTokens = Arc>; /// Get the semantic tokens for a source. #[typst_macros::time(span = source.root().span())] pub(crate) fn get_semantic_tokens(ctx: &mut LocalContext, source: &Source) -> SemanticTokens { let mut tokenizer = Tokenizer::new( source.clone(), ctx.expr_stage(source), ctx.analysis.allow_multiline_token, ctx.analysis.position_encoding, ); tokenizer.tokenize_tree(&LinkedNode::new(source.root()), ModifierSet::empty()); SemanticTokens::new(tokenizer.output) } /// A shared semantic tokens cache. #[derive(Default)] pub struct SemanticTokenCache { next_id: usize, // todo: clear cache after didClose manager: HashMap>>, } impl SemanticTokenCache { pub(crate) fn clear(&mut self) { self.next_id = 0; self.manager.clear(); } /// Lock the token cache with an optional previous id in *main thread*. pub(crate) fn acquire( cache: Arc>, path: &Path, prev: Option<&str>, ) -> SemanticTokenContext { let that = cache.clone(); let mut that = that.lock(); that.next_id += 1; let prev = prev.and_then(|id| { id.parse::() .inspect_err(|_| { log::warn!("invalid previous id: {id}"); }) .ok() }); let next = NonZeroUsize::new(that.next_id).expect("id overflow"); let path = ImmutPath::from(path); let manager = that.manager.entry(path.clone()).or_default(); let _rev_lock = manager.lock(prev.unwrap_or(next)); let prev = prev.and_then(|prev| { manager .find_revision(prev, |_| OnceLock::new()) .data .get() .cloned() }); let next = manager.find_revision(next, |_| OnceLock::new()); SemanticTokenContext { _rev_lock, cache, path, prev, next, } } } /// A semantic token context providing incremental semantic tokens rendering. pub(crate) struct SemanticTokenContext { _rev_lock: RevisionLock, cache: Arc>, path: ImmutPath, pub prev: Option, pub next: Arc>>, } impl Drop for SemanticTokenContext { fn drop(&mut self) { let mut cache = self.cache.lock(); let manager = cache.manager.get_mut(&self.path); if let Some(manager) = manager { let min_rev = manager.unlock(&mut self._rev_lock); if let Some(min_rev) = min_rev { manager.gc(min_rev); } } } } const BOOL: SemanticTokenType = SemanticTokenType::new("bool"); const PUNCTUATION: SemanticTokenType = SemanticTokenType::new("punct"); const ESCAPE: SemanticTokenType = SemanticTokenType::new("escape"); const LINK: SemanticTokenType = SemanticTokenType::new("link"); const RAW: SemanticTokenType = SemanticTokenType::new("raw"); const LABEL: SemanticTokenType = SemanticTokenType::new("label"); const REF: SemanticTokenType = SemanticTokenType::new("ref"); const HEADING: SemanticTokenType = SemanticTokenType::new("heading"); const LIST_MARKER: SemanticTokenType = SemanticTokenType::new("marker"); const LIST_TERM: SemanticTokenType = SemanticTokenType::new("term"); const DELIMITER: SemanticTokenType = SemanticTokenType::new("delim"); const INTERPOLATED: SemanticTokenType = SemanticTokenType::new("pol"); const ERROR: SemanticTokenType = SemanticTokenType::new("error"); const TEXT: SemanticTokenType = SemanticTokenType::new("text"); /// Very similar to `typst_ide::Tag`, but with convenience traits, and /// extensible because we want to further customize highlighting #[derive(Clone, Copy, Eq, PartialEq, EnumIter, Default)] #[repr(u32)] pub enum TokenType { // Standard LSP types /// A comment token. Comment, /// A string token. String, /// A keyword token. Keyword, /// An operator token. Operator, /// A number token. Number, /// A function token. Function, /// A decorator token. Decorator, /// A type token. Type, /// A namespace token. Namespace, // Custom types /// A boolean token. Bool, /// A punctuation token. Punctuation, /// An escape token. Escape, /// A link token. Link, /// A raw token. Raw, /// A label token. Label, /// A markup reference token. Ref, /// A heading token. Heading, /// A list marker token. ListMarker, /// A list term token. ListTerm, /// A delimiter token. Delimiter, /// An interpolated token. Interpolated, /// An error token. Error, /// Any text in markup without a more specific token type, possible styled. /// /// We perform styling (like bold and italics) via modifiers. That means /// everything that should receive styling needs to be a token so we can /// apply a modifier to it. This token type is mostly for that, since /// text should usually not be specially styled. Text, /// A token that is not recognized by the lexer #[default] None, } impl From for SemanticTokenType { fn from(token_type: TokenType) -> Self { use TokenType::*; match token_type { Comment => Self::COMMENT, String => Self::STRING, Keyword => Self::KEYWORD, Operator => Self::OPERATOR, Number => Self::NUMBER, Function => Self::FUNCTION, Decorator => Self::DECORATOR, Type => Self::TYPE, Namespace => Self::NAMESPACE, Bool => BOOL, Punctuation => PUNCTUATION, Escape => ESCAPE, Link => LINK, Raw => RAW, Label => LABEL, Ref => REF, Heading => HEADING, ListMarker => LIST_MARKER, ListTerm => LIST_TERM, Delimiter => DELIMITER, Interpolated => INTERPOLATED, Error => ERROR, Text => TEXT, None => unreachable!(), } } } const STRONG: SemanticTokenModifier = SemanticTokenModifier::new("strong"); const EMPH: SemanticTokenModifier = SemanticTokenModifier::new("emph"); const MATH: SemanticTokenModifier = SemanticTokenModifier::new("math"); /// A modifier to some semantic token. #[derive(Clone, Copy, EnumIter)] #[repr(u8)] pub enum Modifier { /// Strong modifier. Strong, /// Emphasis modifier. Emph, /// Math modifier. Math, /// Read-only modifier. ReadOnly, /// Static modifier. Static, /// Default library modifier. DefaultLibrary, } impl Modifier { /// Get the index of the modifier. pub const fn index(self) -> u8 { self as u8 } /// Get the bitmask of the modifier. pub const fn bitmask(self) -> u32 { 0b1 << self.index() } } impl From for SemanticTokenModifier { fn from(modifier: Modifier) -> Self { use Modifier::*; match modifier { Strong => STRONG, Emph => EMPH, Math => MATH, ReadOnly => Self::READONLY, Static => Self::STATIC, DefaultLibrary => Self::DEFAULT_LIBRARY, } } } #[derive(Default, Clone, Copy)] pub(crate) struct ModifierSet(u32); impl ModifierSet { pub fn empty() -> Self { Self::default() } pub fn new(modifiers: &[Modifier]) -> Self { let bits = modifiers .iter() .copied() .map(Modifier::bitmask) .fold(0, |bits, mask| bits | mask); Self(bits) } pub fn bitset(self) -> u32 { self.0 } } impl std::ops::BitOr for ModifierSet { type Output = Self; fn bitor(self, rhs: Self) -> Self::Output { Self(self.0 | rhs.0) } } pub(crate) struct Tokenizer { curr_pos: LspPosition, pos_offset: usize, output: Vec, source: Source, ei: ExprInfo, encoding: PositionEncoding, allow_multiline_token: bool, token: Option, } impl Tokenizer { pub fn new( source: Source, ei: ExprInfo, allow_multiline_token: bool, encoding: PositionEncoding, ) -> Self { Self { curr_pos: LspPosition::new(0, 0), pos_offset: 0, output: Vec::new(), source, ei, allow_multiline_token, encoding, token: None, } } /// Tokenize a node and its children fn tokenize_tree(&mut self, root: &LinkedNode, modifiers: ModifierSet) { let is_leaf = root.get().children().len() == 0; let mut modifiers = modifiers | modifiers_from_node(root); let range = root.range(); let mut token = token_from_node(&self.ei, root, &mut modifiers) .or_else(|| is_leaf.then_some(TokenType::Text)) .map(|token_type| Token::new(token_type, modifiers, range.clone())); // Push start if let Some(prev_token) = self.token.as_mut() && !prev_token.range.is_empty() && prev_token.range.start < range.start { let end = prev_token.range.end.min(range.start); let sliced = Token { token_type: prev_token.token_type, modifiers: prev_token.modifiers, range: prev_token.range.start..end, }; // Slice the previous token prev_token.range.start = end; self.push(sliced); } if !is_leaf { std::mem::swap(&mut self.token, &mut token); for child in root.children() { self.tokenize_tree(&child, modifiers); } std::mem::swap(&mut self.token, &mut token); } // Push end if let Some(token) = token.clone() && !token.range.is_empty() { // Slice the previous token if let Some(prev_token) = self.token.as_mut() { prev_token.range.start = token.range.end; } self.push(token); } } fn push(&mut self, token: Token) { let Token { token_type, modifiers, range, } = token; use crate::lsp_typst_boundary; use lsp_types::Position; let utf8_start = range.start; if self.pos_offset > utf8_start { return; } // This might be a bug of typst, that `end > len` is possible let source_len = self.source.text().len(); let utf8_end = (range.end).min(source_len); self.pos_offset = utf8_start; if utf8_end <= utf8_start || utf8_start > source_len { return; } let position = lsp_typst_boundary::to_lsp_position(utf8_start, self.encoding, &self.source); let delta = self.curr_pos.delta(&position); let encode_length = |s, t| { match self.encoding { PositionEncoding::Utf8 => t - s, PositionEncoding::Utf16 => { // todo: whether it is safe to unwrap let utf16_start = self.source.lines().byte_to_utf16(s).unwrap(); let utf16_end = self.source.lines().byte_to_utf16(t).unwrap(); utf16_end - utf16_start } } }; if self.allow_multiline_token { self.output.push(SemanticToken { delta_line: delta.delta_line, delta_start: delta.delta_start, length: encode_length(utf8_start, utf8_end) as u32, token_type: token_type as u32, token_modifiers_bitset: modifiers.bitset(), }); self.curr_pos = position; } else { let final_line = self.source .lines() .byte_to_line(utf8_end) .unwrap_or_else(|| self.source.lines().len_lines()) as u32; let next_offset = self .source .lines() .line_to_byte((self.curr_pos.line + 1) as usize) .unwrap_or(source_len); let inline_length = encode_length(utf8_start, utf8_end.min(next_offset)) as u32; if inline_length != 0 { self.output.push(SemanticToken { delta_line: delta.delta_line, delta_start: delta.delta_start, length: inline_length, token_type: token_type as u32, token_modifiers_bitset: modifiers.bitset(), }); self.curr_pos = position; } if self.curr_pos.line >= final_line { return; } let mut utf8_cursor = next_offset; let mut delta_line = 0; for line in self.curr_pos.line + 1..=final_line { let next_offset = if line == final_line { utf8_end } else { self.source .lines() .line_to_byte((line + 1) as usize) .unwrap_or(source_len) }; if utf8_cursor < next_offset { let inline_length = encode_length(utf8_cursor, next_offset) as u32; self.output.push(SemanticToken { delta_line: delta_line + 1, delta_start: 0, length: inline_length, token_type: token_type as u32, token_modifiers_bitset: modifiers.bitset(), }); delta_line = 0; self.curr_pos.character = 0; } else { delta_line += 1; } self.pos_offset = utf8_cursor; utf8_cursor = next_offset; } self.curr_pos.line = final_line - delta_line; } pub trait PositionExt { fn delta(&self, to: &Self) -> PositionDelta; } impl PositionExt for Position { /// Calculates the delta from `self` to `to`. This is in the /// `SemanticToken` sense, so the delta's `character` is /// relative to `self`'s `character` iff `self` and `to` /// are on the same line. Otherwise, it's relative to /// the start of the line `to` is on. fn delta(&self, to: &Self) -> PositionDelta { let line_delta = to.line - self.line; let char_delta = if line_delta == 0 { to.character - self.character } else { to.character }; PositionDelta { delta_line: line_delta, delta_start: char_delta, } } } #[derive(Debug, Eq, PartialEq, Ord, PartialOrd, Copy, Clone, Default)] pub struct PositionDelta { pub delta_line: u32, pub delta_start: u32, } } } #[derive(Clone, Default)] struct Token { pub token_type: TokenType, pub modifiers: ModifierSet, pub range: Range, } impl Token { pub fn new(token_type: TokenType, modifiers: ModifierSet, range: Range) -> Self { Self { token_type, modifiers, range, } } } /// Determines the [`Modifier`]s to be applied to a node and all its children. /// /// Note that this does not recurse up, so calling it on a child node may not /// return a modifier that should be applied to it due to a parent. fn modifiers_from_node(node: &LinkedNode) -> ModifierSet { match node.kind() { SyntaxKind::Emph => ModifierSet::new(&[Modifier::Emph]), SyntaxKind::Strong => ModifierSet::new(&[Modifier::Strong]), SyntaxKind::Math | SyntaxKind::Equation => ModifierSet::new(&[Modifier::Math]), _ => ModifierSet::empty(), } } /// Determines the best [`TokenType`] for an entire node and its children, if /// any. If there is no single `TokenType`, or none better than `Text`, returns /// `None`. /// /// In tokenization, returning `Some` stops recursion, while returning `None` /// continues and attempts to tokenize each of `node`'s children. If there are /// no children, `Text` is taken as the default. fn token_from_node( ei: &ExprInfo, node: &LinkedNode, modifier: &mut ModifierSet, ) -> Option { use SyntaxKind::*; match node.kind() { Star if node.parent_kind() == Some(Strong) => Some(TokenType::Punctuation), Star if node.parent_kind() == Some(ModuleImport) => Some(TokenType::Operator), Underscore if node.parent_kind() == Some(Emph) => Some(TokenType::Punctuation), Underscore if node.parent_kind() == Some(MathAttach) => Some(TokenType::Operator), MathIdent | Ident => Some(token_from_ident(ei, node, modifier)), Hash => token_from_hashtag(ei, node, modifier), LeftBrace | RightBrace | LeftBracket | RightBracket | LeftParen | RightParen | Comma | Semicolon | Colon => Some(TokenType::Punctuation), Linebreak | Escape | Shorthand => Some(TokenType::Escape), Link => Some(TokenType::Link), Raw => Some(TokenType::Raw), Label => Some(TokenType::Label), RefMarker => Some(TokenType::Ref), Heading | HeadingMarker => Some(TokenType::Heading), ListMarker | EnumMarker | TermMarker => Some(TokenType::ListMarker), Not | And | Or => Some(TokenType::Keyword), MathAlignPoint | Plus | Minus | Slash | Hat | Dot | Eq | EqEq | ExclEq | Lt | LtEq | Gt | GtEq | PlusEq | HyphEq | StarEq | SlashEq | Dots | Arrow => Some(TokenType::Operator), Dollar => Some(TokenType::Delimiter), None | Auto | Let | Show | If | Else | For | In | While | Break | Continue | Return | Import | Include | As | Set | Context => Some(TokenType::Keyword), Bool => Some(TokenType::Bool), Int | Float | Numeric => Some(TokenType::Number), Str => Some(TokenType::String), LineComment | BlockComment => Some(TokenType::Comment), Error => Some(TokenType::Error), // Disambiguate from `SyntaxKind::None` _ => Option::None, } } // TODO: differentiate also using tokens in scope, not just context fn token_from_ident(ei: &ExprInfo, ident: &LinkedNode, modifier: &mut ModifierSet) -> TokenType { let resolved = ei.resolves.get(&ident.span()); let context = if let Some(resolved) = resolved { match (&resolved.root, &resolved.term) { (Some(root), term) => Some(token_from_decl_expr(root, term.as_ref(), modifier)), (_, Some(ty)) => Some(token_from_term(ty, modifier)), _ => None, } } else { None }; if !matches!(context, None | Some(TokenType::Interpolated)) { return context.unwrap_or(TokenType::Interpolated); } let next = ident.next_leaf(); let next_is_adjacent = next .as_ref() .is_some_and(|n| n.range().start == ident.range().end); let next_parent = next.as_ref().and_then(|n| n.parent_kind()); let next_kind = next.map(|n| n.kind()); let lexical_function_call = next_is_adjacent && matches!(next_kind, Some(SyntaxKind::LeftParen)) && matches!(next_parent, Some(SyntaxKind::Args | SyntaxKind::Params)); if lexical_function_call { return TokenType::Function; } let function_content = next_is_adjacent && matches!(next_kind, Some(SyntaxKind::LeftBracket)) && matches!(next_parent, Some(SyntaxKind::ContentBlock)); if function_content { return TokenType::Function; } TokenType::Interpolated } fn token_from_term(t: &Ty, modifier: &mut ModifierSet) -> TokenType { use typst::foundations::Value::*; match t { Ty::Func(..) => TokenType::Function, Ty::Value(v) => { match &v.val { Func(..) => TokenType::Function, Type(..) => { *modifier = *modifier | ModifierSet::new(&[Modifier::DefaultLibrary]); TokenType::Function } Module(..) => ns(modifier), // todo: read only modifier _ => TokenType::Interpolated, } } _ => TokenType::Interpolated, } } fn token_from_decl_expr(expr: &Expr, term: Option<&Ty>, modifier: &mut ModifierSet) -> TokenType { use crate::syntax::Decl::*; match expr { Expr::Type(term) => token_from_term(term, modifier), Expr::Decl(decl) => match decl.as_ref() { Func(..) => TokenType::Function, Var(..) => TokenType::Interpolated, Module(..) => ns(modifier), ModuleAlias(..) => ns(modifier), PathStem(..) => ns(modifier), ImportAlias(..) => TokenType::Interpolated, IdentRef(..) => TokenType::Interpolated, ImportPath(..) => TokenType::Interpolated, IncludePath(..) => TokenType::Interpolated, Import(..) => TokenType::Interpolated, ContentRef(..) => TokenType::Interpolated, Label(..) => TokenType::Interpolated, StrName(..) => TokenType::Interpolated, ModuleImport(..) => TokenType::Interpolated, Closure(..) => TokenType::Interpolated, Pattern(..) => TokenType::Interpolated, Spread(..) => TokenType::Interpolated, Content(..) => TokenType::Interpolated, Constant(..) => TokenType::Interpolated, BibEntry(..) => TokenType::Interpolated, Docs(..) => TokenType::Interpolated, Generated(..) => TokenType::Interpolated, }, _ => term .map(|term| token_from_term(term, modifier)) .unwrap_or(TokenType::Interpolated), } } fn ns(modifier: &mut ModifierSet) -> TokenType { *modifier = *modifier | ModifierSet::new(&[Modifier::Static, Modifier::ReadOnly]); TokenType::Namespace } fn get_expr_following_hashtag<'a>(hashtag: &LinkedNode<'a>) -> Option> { hashtag .next_sibling() .filter(|next| next.cast::().is_some_and(|expr| expr.hash())) .and_then(|node| node.leftmost_leaf()) } fn token_from_hashtag( ei: &ExprInfo, hashtag: &LinkedNode, modifier: &mut ModifierSet, ) -> Option { get_expr_following_hashtag(hashtag) .as_ref() .and_then(|node| token_from_node(ei, node, modifier)) } #[cfg(test)] mod tests { use strum::IntoEnumIterator; use super::*; #[test] fn ensure_not_too_many_modifiers() { // Because modifiers are encoded in a 32 bit bitmask, we can't have more than 32 // modifiers assert!(Modifier::iter().len() <= 32); } }