diff --git a/i18n/edit.toml b/i18n/edit.toml index 6c0fd10..d0dd01f 100644 --- a/i18n/edit.toml +++ b/i18n/edit.toml @@ -1422,6 +1422,32 @@ uk = "Закрити" zh_hans = "关闭" zh_hant = "關閉" +[LanguageSelectMode] +en = "Select Language Mode" +de = "Sprachmodus auswählen" +es = "Seleccionar modo de lenguaje" +fr = "Sélectionner le mode du langage" +it = "Seleziona modalità del linguaggio" +ja = "言語モードの選択" +ko = "언어 모드 선택" +pt_br = "Selecionar modo de linguagem" +ru = "Выбрать режим языка" +zh_hans = "选择语言模式" +zh_hant = "選擇語言模式" + +[LanguageAutoDetect] +en = "Auto Detect" +de = "Automatisch erkennen" +es = "Detección automática" +fr = "Détection automatique" +it = "Rilevamento automatico" +ja = "自動検出" +ko = "자동 감지" +pt_br = "Detectar automaticamente" +ru = "Определить автоматически" +zh_hans = "自动检测" +zh_hant = "自動偵測" + [EncodingReopen] en = "Reopen with encoding…" bn = "এনকোডিং সহ পুনরায় খুলুন" diff --git a/src/bin/edit/documents.rs b/src/bin/edit/documents.rs index 88ce468..e2121cb 100644 --- a/src/bin/edit/documents.rs +++ b/src/bin/edit/documents.rs @@ -8,7 +8,7 @@ use std::path::{Path, PathBuf}; use edit::buffer::{RcTextBuffer, TextBuffer}; use edit::helpers::{CoordType, Point}; -use edit::lsh::language_from_path; +use edit::lsh::{Language, language_from_path}; use edit::{apperr, path, sys}; use crate::state::DisplayablePathBuf; @@ -20,6 +20,7 @@ pub struct Document { pub filename: String, pub file_id: Option, pub new_file_counter: usize, + pub language_override: Option>, } impl Document { @@ -63,15 +64,32 @@ impl Document { let filename = path.file_name().unwrap_or_default().to_string_lossy().into_owned(); let dir = path.parent().map(ToOwned::to_owned).unwrap_or_default(); - { - let mut tb = self.buffer.borrow_mut(); - tb.set_language(language_from_path(&path)); - tb.set_ruler(if filename == "COMMIT_EDITMSG" { 72 } else { 0 }); - } - self.filename = filename; self.dir = Some(DisplayablePathBuf::from_path(dir)); self.path = Some(path); + + 
self.buffer.borrow_mut().set_ruler(if self.filename == "COMMIT_EDITMSG" { 72 } else { 0 }); + self.update_language(); + } + + pub fn auto_detect_language(&mut self) { + self.language_override = None; + self.update_language(); + } + + pub fn override_language(&mut self, lang: Option<&'static Language>) { + self.language_override = Some(lang); + self.update_language(); + } + + fn update_language(&mut self) { + self.buffer.borrow_mut().set_language(if let Some(lang) = self.language_override { + lang + } else if let Some(path) = &self.path { + language_from_path(path) + } else { + None + }) } } @@ -123,6 +141,7 @@ impl DocumentManager { filename: Default::default(), file_id: None, new_file_counter: 0, + language_override: None, }; self.gen_untitled_name(&mut doc); @@ -183,6 +202,7 @@ impl DocumentManager { filename: Default::default(), file_id, new_file_counter: 0, + language_override: None, }; doc.set_path(path); diff --git a/src/bin/edit/draw_statusbar.rs b/src/bin/edit/draw_statusbar.rs index f7a631a..c897162 100644 --- a/src/bin/edit/draw_statusbar.rs +++ b/src/bin/edit/draw_statusbar.rs @@ -6,6 +6,7 @@ use edit::framebuffer::{Attributes, IndexedColor}; use edit::fuzzy::score_fuzzy; use edit::helpers::*; use edit::input::vk; +use edit::lsh::LANGUAGES; use edit::tui::*; use edit::{arena_format, icu}; @@ -26,15 +27,21 @@ pub fn draw_statusbar(ctx: &mut Context, state: &mut State) { ctx.table_next_row(); - if ctx.button("newline", if tb.is_crlf() { "CRLF" } else { "LF" }, ButtonStyle::default()) { - let is_crlf = tb.is_crlf(); - tb.normalize_newlines(!is_crlf); - } + state.wants_language_picker |= ctx.button( + "language", + tb.language().map_or("Plain Text", |l| l.name), + ButtonStyle::default(), + ); if state.wants_statusbar_focus { state.wants_statusbar_focus = false; ctx.steal_focus(); } + if ctx.button("newline", if tb.is_crlf() { "CRLF" } else { "LF" }, ButtonStyle::default()) { + let is_crlf = tb.is_crlf(); + tb.normalize_newlines(!is_crlf); + } + 
state.wants_encoding_picker |= ctx.button("encoding", tb.encoding(), ButtonStyle::default()); if state.wants_encoding_picker { @@ -199,6 +206,61 @@ pub fn draw_statusbar(ctx: &mut Context, state: &mut State) { ctx.table_end(); } +/// Draws the modal language picker: "Auto Detect", "Plain Text", then every entry of `LANGUAGES`. +/// Activating an item applies it to the active document and clears `state.wants_language_picker`. +pub fn draw_dialog_language_change(ctx: &mut Context, state: &mut State) { + let doc = state.documents.active_mut(); + let mut done = doc.is_none(); + + ctx.modal_begin("language", loc(LocId::LanguageSelectMode)); + if let Some(doc) = doc { + let width = (ctx.size().width - 20).max(10); + let height = (ctx.size().height - 10).max(10); + + ctx.scrollarea_begin("scrollarea", Size { width, height }); + ctx.attr_background_rgba(ctx.indexed_alpha(IndexedColor::Black, 1, 4)); + ctx.inherit_focus(); + { + ctx.list_begin("languages"); + ctx.inherit_focus(); + + let auto_detect = doc.language_override.is_none(); + let selected = if auto_detect { None } else { doc.buffer.borrow().language() }; + + if ctx.list_item(auto_detect, loc(LocId::LanguageAutoDetect)) + == ListSelection::Activated + { + doc.auto_detect_language(); + done = true; + } + + // `selected` is also `None` in auto-detect mode, so "Plain Text" must additionally + // require an explicit override; otherwise two items would be marked at once. + if ctx.list_item(!auto_detect && selected.is_none(), "Plain Text") + == ListSelection::Activated + { + doc.override_language(None); + done = true; + } + + for &lang in LANGUAGES { + if ctx.list_item(Some(lang) == selected, lang.name) == ListSelection::Activated { + doc.override_language(Some(lang)); + done = true; + } + } + ctx.list_end(); + } + ctx.scrollarea_end(); + } + done |= ctx.modal_end(); + + if done { + state.wants_language_picker = false; + ctx.needs_rerender(); + } +} + pub fn draw_dialog_encoding_change(ctx: &mut Context, state: &mut State) { let encoding = state.documents.active_mut().map_or("", |doc| doc.buffer.borrow().encoding()); let reopen = state.wants_encoding_change == StateEncodingChange::Reopen; diff --git a/src/bin/edit/main.rs b/src/bin/edit/main.rs index f21ae84..271f137 100644 --- a/src/bin/edit/main.rs +++ b/src/bin/edit/main.rs @@ -313,6 +313,9 @@ fn draw(ctx: &mut Context, state: &mut State) { if
state.wants_save { draw_handle_save(ctx, state); } + if state.wants_language_picker { + draw_dialog_language_change(ctx, state); + } if state.wants_encoding_change != StateEncodingChange::None { draw_dialog_encoding_change(ctx, state); } diff --git a/src/bin/edit/state.rs b/src/bin/edit/state.rs index 451060b..62c8b67 100644 --- a/src/bin/edit/state.rs +++ b/src/bin/edit/state.rs @@ -152,6 +152,8 @@ pub struct State { pub search_options: buffer::SearchOptions, pub search_success: bool, + pub wants_language_picker: bool, + pub wants_encoding_picker: bool, pub wants_encoding_change: StateEncodingChange, pub encoding_picker_needle: String, @@ -200,6 +202,8 @@ impl State { search_options: Default::default(), search_success: true, + wants_language_picker: false, + wants_encoding_picker: false, encoding_picker_needle: Default::default(), encoding_picker_results: Default::default(), diff --git a/src/buffer/line_cache.rs b/src/buffer/line_cache.rs deleted file mode 100644 index af7cd59..0000000 --- a/src/buffer/line_cache.rs +++ /dev/null @@ -1,116 +0,0 @@ -use std::ops::Range; - -use crate::{document::ReadableDocument, simd::memchr2}; - -/// Cache a line/offset pair every CACHE_EVERY lines to speed up line/offset calculations -const CACHE_EVERY: usize = 1024 * 64; - -#[derive(Clone)] -pub struct CachePoint { - pub index: usize, - pub line: usize, - // pub snapshot: ParserSnapshot -} - -pub struct LineCache { - cache: Vec, -} - -impl LineCache { - pub fn new() -> Self { - Self { cache: vec![] } - } - - pub fn from_document(&mut self, document: &T) { - self.cache.clear(); - - let mut offset = 0; - let mut line = 0; - loop { - let text = document.read_forward(offset); - if text.is_empty() { return; } - - let mut off = 0; - loop { - off = memchr2(b'\n', b'\n', text, off); - if off == text.len() { break; } - - if line % CACHE_EVERY == 0 { - self.cache.push(CachePoint { index: offset+off, line }); - } - line += 1; - off += 1; - } - - offset += text.len(); - } - } - - /// 
Updates the cache after a deletion. - /// `range` is the deleted byte range, and `text` is the content that was deleted. - pub fn delete(&mut self, range: Range, text: &Vec) { - let mut newlines = 0; - for c in text { - if *c == b'\n' { - newlines += 1; - } - } - - let mut beg_del = None; - let mut end_del = None; - for (i, point) in self.cache.iter_mut().enumerate() { - if point.index >= range.start { - if point.index < range.end { - // cache point is within the deleted range - if beg_del.is_none() { beg_del = Some(i); } - end_del = Some(i + 1); - } - else { - point.index -= text.len(); - point.line -= newlines; - } - } - } - - if let (Some(beg), Some(end)) = (beg_del, end_del) { - self.cache.drain(beg..end); - } - } - - /// Updates the cache after an insertion. - /// `offset` is where the insertion occurs, and `text` is the inserted content. - pub fn insert(&mut self, offset: usize, text: &[u8]) { - // Count how many newlines were inserted - let mut newlines = 0; - for c in text { - if *c == b'\n' { - newlines += 1; - } - } - - let len = text.len(); - for point in &mut self.cache { - if point.index > offset { - point.index += len; - point.line += newlines; - } - } - - // TODO: This also needs to insert new cache points - } - - /// Finds the nearest cached line-offset pair relative to a target line. - /// If `reverse` is false, it returns the closest *before* the target. - /// If `reverse` is true, it returns the closest *after or at* the target. 
- pub fn nearest_offset(&self, target_count: usize, reverse: bool) -> Option { - match self.cache.binary_search_by_key(&target_count, |p| p.line) { - Ok(i) => Some(self.cache[i].clone()), - Err(i) => { - if i == 0 || i == self.cache.len() { None } // target < lowest cache point || target > highest cache point - else { - Some(self.cache[ if reverse {i} else {i-1} ].clone()) - } - } - } - } -} diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs index ba2a64f..6d736dc 100644 --- a/src/buffer/mod.rs +++ b/src/buffer/mod.rs @@ -42,6 +42,7 @@ use crate::clipboard::Clipboard; use crate::document::{ReadableDocument, WriteableDocument}; use crate::framebuffer::{Framebuffer, IndexedColor}; use crate::helpers::*; +use crate::lsh::cache::HighlighterCache; use crate::lsh::{HighlightKind, Highlighter, Language}; use crate::oklab::StraightRgba; use crate::simd::memchr2; @@ -219,6 +220,7 @@ pub struct TextBuffer { active_edit_line_info: Option, active_edit_depth: i32, active_edit_off: usize, + active_edit_first_line_y: Option, stats: TextBufferStatistics, cursor: Cursor, @@ -230,6 +232,7 @@ pub struct TextBuffer { selection: Option, selection_generation: u32, search: Option>, + highlighter_cache: HighlighterCache, width: CoordType, margin_width: CoordType, @@ -272,6 +275,7 @@ impl TextBuffer { active_edit_line_info: None, active_edit_depth: 0, active_edit_off: 0, + active_edit_first_line_y: None, stats: TextBufferStatistics { logical_lines: 1, visual_lines: 1 }, cursor: Default::default(), @@ -279,6 +283,7 @@ impl TextBuffer { selection: None, selection_generation: 0, search: None, + highlighter_cache: HighlighterCache::new(), width: 0, margin_width: 0, @@ -581,8 +586,13 @@ impl TextBuffer { self.line_highlight_enabled = enabled; } + pub fn language(&self) -> Option<&'static Language> { + self.language + } + pub fn set_language(&mut self, language: Option<&'static Language>) { self.language = language; + self.highlighter_cache.clear_all(); } /// Sets a ruler column, e.g. 80. 
@@ -663,6 +673,7 @@ impl TextBuffer { self.set_selection(None); self.mark_as_clean(); self.reflow(); + self.highlighter_cache.clear_all(); } /// Copies the contents of the buffer into a string. @@ -1738,6 +1749,14 @@ impl TextBuffer { if da < db { a } else { b } }; + // If we have a highlighter and a cache, fast-forward to the last checkpoint before + // the first line of the viewport to reduce the amount of work needed. + if let Some(h) = &mut highlighter { + let first_line_cursor = + self.cursor_move_to_visual_internal(cursor, Point { x: origin.x, y: origin.y }); + self.highlighter_cache.prepare(h, first_line_cursor.logical_pos.y); + } + let [selection_beg, selection_end] = match self.selection { None => [Point::MIN, Point::MIN], Some(TextBufferSelection { beg, end }) => minmax(beg, end), @@ -1970,9 +1989,11 @@ impl TextBuffer { while h.logical_pos_y() < cursor_beg.logical_pos.y - 1 { let scratch_alt = scratch_arena(Some(&scratch)); _ = h.parse_next_line(&scratch_alt); + self.highlighter_cache.maybe_store_after_parse(h); } let highlights = h.parse_next_line(&scratch); + self.highlighter_cache.maybe_store_after_parse(h); let mut highlights = highlights.iter(); if let Some(first) = highlights.next() { @@ -2609,6 +2630,13 @@ impl TextBuffer { let cursor_before = self.cursor; self.set_cursor_internal(cursor); + // Track the first logical line affected by this edit so we can invalidate + // cached highlighter state starting from here. + if self.active_edit_first_line_y.is_none() { + let y = self.goto_line_start(cursor, cursor.logical_pos.y).logical_pos.y; + self.active_edit_first_line_y = Some(y); + } + // If both the last and this are a Write/Delete operation, we skip allocating a new undo history item. 
if history_type != self.last_history_type || !matches!(history_type, HistoryType::Write | HistoryType::Delete) @@ -2755,6 +2783,11 @@ impl TextBuffer { self.stats.visual_lines = self.stats.logical_lines; } + // Invalidate cached highlighter state starting from the first changed line. + if let Some(y) = self.active_edit_first_line_y.take() { + self.highlighter_cache.invalidate_from(y); + } + self.recalc_after_content_changed(); } diff --git a/src/lsh/cache.rs b/src/lsh/cache.rs new file mode 100644 index 0000000..9c1da0d --- /dev/null +++ b/src/lsh/cache.rs @@ -0,0 +1,77 @@ +use crate::helpers::CoordType; +use crate::lsh::highlighter::{Highlighter, ParserState}; + +#[derive(Clone)] +struct Checkpoint { + line: CoordType, // snapshot corresponds to the start of this logical line + state: ParserState, +} + +pub struct HighlighterCache { + checkpoints: Vec, + interval: CoordType, +} + +impl Default for HighlighterCache { + fn default() -> Self { + Self { checkpoints: Vec::new(), interval: 1000 } + } +} + +impl HighlighterCache { + pub fn new() -> Self { + Self::default() + } + + pub fn set_interval(&mut self, interval: CoordType) { + self.interval = interval.max(1); + } + + pub fn clear_all(&mut self) { + self.checkpoints.clear(); + } + + /// Drop any cached state starting at the given logical line. + pub fn invalidate_from(&mut self, line: CoordType) { + if self.checkpoints.is_empty() { + return; + } + let idx = match self.checkpoints.binary_search_by_key(&line, |c| c.line) { + Ok(i) => i, + Err(i) => i, // first checkpoint with line > given `line` is at position i + }; + self.checkpoints.truncate(idx); + } + + /// Prepare the highlighter to start parsing from the last checkpoint before or at `target_line`. + /// If none exists, do nothing (the caller will parse from the start). 
+ pub fn prepare(&self, h: &mut Highlighter, target_line: CoordType) { + if self.checkpoints.is_empty() { + return; + } + let idx = match self.checkpoints.binary_search_by_key(&target_line, |c| c.line) { + Ok(i) => Some(i), + Err(0) => None, + Err(i) => Some(i - 1), + }; + if let Some(i) = idx { + h.restore(&self.checkpoints[i].state); + } + } + + /// After parsing a line, maybe store a checkpoint. The snapshot at this time + /// corresponds to the start of the next logical line, which is ideal for resuming. + pub fn maybe_store_after_parse(&mut self, h: &Highlighter) { + let next_line = h.logical_pos_y().saturating_add(1); + if next_line < 0 { + return; + } + if next_line % self.interval != 0 { + return; + } + // Keep `checkpoints` sorted by line: the binary searches depend on it. Skip known lines. + if let Err(i) = self.checkpoints.binary_search_by_key(&next_line, |c| c.line) { + self.checkpoints.insert(i, Checkpoint { line: next_line, state: h.snapshot() }); + } + } +} diff --git a/src/lsh/definitions.rs b/src/lsh/definitions.rs index 2736400..3a992c9 100644 --- a/src/lsh/definitions.rs +++ b/src/lsh/definitions.rs @@ -14,6 +14,12 @@ pub struct Language { pub transitions: &'static [Transition<'static>], } +impl PartialEq for Language { + fn eq(&self, other: &Self) -> bool { + std::ptr::eq(self, other) + } +} + pub struct Transition<'a> { pub test: Test<'a>, pub kind: Option, @@ -104,40 +110,40 @@ flowchart TD 20["20 (ignore)"] 20 -->|"Chars(Line)
None"| pop1310720@{ shape: stop } **/ -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_520FD55FC3E4BA24: *const u8 = [1, 0x23].as_ptr(); -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_121F4AB352225114: *const u8 = [10, 0x64, 0x69, 0x66, 0x66, 0x20, 0x2d, 0x2d, 0x67, 0x69, 0x74].as_ptr(); -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_EC6DE3D5B94B5BD1: *const u8 = [9, 0x09, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x3a].as_ptr(); -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_6A3FE2F75546B408: *const u8 = [10, 0x09, 0x6d, 0x6f, 0x64, 0x69, 0x66, 0x69, 0x65, 0x64, 0x3a].as_ptr(); -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_1218C90A43640858: *const u8 = [10, 0x09, 0x6e, 0x65, 0x77, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x3a].as_ptr(); -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_8E3289E428F770E2: *const u8 = [9, 0x09, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x3a].as_ptr(); -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_589722BCE95B4F03: *const u8 = [4, 0x64, 0x69, 0x66, 0x66].as_ptr(); -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_89C86C21681CF1DD: *const u8 = [3, 0x2d, 0x2d, 0x2d].as_ptr(); -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_24BADF678A331FDC: *const u8 = [3, 0x2b, 0x2b, 0x2b].as_ptr(); -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_BBDBF9E07DEFE5B5: *const u8 = [1, 0x2d].as_ptr(); -#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_6D48C1B7C2CD9E76: *const u8 = [1, 0x2b].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_0: *const u8 = [1, 0x23].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_1: *const u8 = [10, 0x64, 0x69, 0x66, 0x66, 0x20, 0x2d, 0x2d, 0x67, 0x69, 0x74].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_2: *const u8 = [9, 0x09, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x3a].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_3: *const u8 = [10, 0x09, 0x6d, 0x6f, 0x64, 0x69, 0x66, 0x69, 0x65, 0x64, 
0x3a].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_4: *const u8 = [10, 0x09, 0x6e, 0x65, 0x77, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x3a].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_5: *const u8 = [9, 0x09, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x3a].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_6: *const u8 = [4, 0x64, 0x69, 0x66, 0x66].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_7: *const u8 = [3, 0x2d, 0x2d, 0x2d].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_8: *const u8 = [3, 0x2b, 0x2b, 0x2b].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_9: *const u8 = [1, 0x2d].as_ptr(); +#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_10: *const u8 = [1, 0x2b].as_ptr(); #[rustfmt::skip] pub const LANG_GIT_COMMIT_MESSAGE: &Language = &Language { name: "Git Commit Message", filenames: &["COMMIT_EDITMSG", "MERGE_MSG"], transitions: &[ - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_520FD55FC3E4BA24), Some(Comment), Push(4, 0)), - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_121F4AB352225114), None, Change(3)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_0), Some(Comment), Push(4, 0)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_1), None, Change(3)), t(Chars(0), None, Change(20)), t(Chars(usize::MAX), Some(Direct(BrightBlue)), Push(13, 0)), - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_EC6DE3D5B94B5BD1), None, Change(9)), - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_6A3FE2F75546B408), None, Change(10)), - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_1218C90A43640858), None, Change(11)), - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_8E3289E428F770E2), None, Change(12)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_2), None, Change(9)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_3), None, Change(10)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_4), None, Change(11)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_5), None, Change(12)), t(Chars(usize::MAX), None, Pop(1)), t(Chars(usize::MAX), 
Some(Direct(BrightRed)), Pop(1)), t(Chars(usize::MAX), Some(Direct(BrightBlue)), Pop(1)), t(Chars(usize::MAX), Some(Direct(BrightGreen)), Pop(1)), t(Chars(usize::MAX), Some(Direct(BrightBlue)), Pop(1)), t(Chars(0), Some(Other), Push(14, 13)), - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_589722BCE95B4F03), Some(Direct(BrightBlue)), Change(20)), - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_89C86C21681CF1DD), Some(Direct(BrightBlue)), Change(20)), - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_24BADF678A331FDC), Some(Direct(BrightBlue)), Change(20)), - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_BBDBF9E07DEFE5B5), Some(Direct(BrightRed)), Change(20)), - t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_6D48C1B7C2CD9E76), Some(Direct(BrightGreen)), Change(20)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_6), Some(Direct(BrightBlue)), Change(20)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_7), Some(Direct(BrightBlue)), Change(20)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_8), Some(Direct(BrightBlue)), Change(20)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_9), Some(Direct(BrightRed)), Change(20)), + t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_10), Some(Direct(BrightGreen)), Change(20)), t(Chars(0), None, Change(20)), t(Chars(usize::MAX), None, Pop(0)), ], @@ -153,71 +159,74 @@ config: --- flowchart TD 0["0 (ground)"] - 0 -->|"Prefix(break, drop, exec, b, d, x)
None"| 19 - 0 -->|"Prefix(edit, fixup, pick, reword, squash, e, f, p, r, s)
None"| 21 + 0 -->|"Prefix(break, exec, b, x)
None"| 19 + 0 -->|"Prefix(drop, edit, fixup, pick, reword, squash, d, e, f, p, r, s)
None"| 21 0 -->|"Prefix(#)
None"| 23 0 -->|"Charset([0x00-", $-a, c, g-o, q, t-w, y-0xFF])
None"| pop0@{ shape: stop } 0 -->|"Chars(1)
None"| pop0@{ shape: stop } 19 -->|"Charset([0-9, A-Z, _, a-z, 0xC2-0xF4])
None"| pop1245184@{ shape: stop } - 19 -->|"Chars(0)
Some(Keyword)"| push1245210[/"comment"/] + 19 -->|"Chars(0)
Some(Keyword)"| push1245211[/"comment"/] 21 -->|"Charset([0-9, A-Z, _, a-z, 0xC2-0xF4])
None"| pop1376256@{ shape: stop } 21 -->|"Chars(0)
Some(Keyword)"| push1376280[/"hash"/] 23 -->|"Chars(Line)
Some(Comment)"| pop1507328@{ shape: stop } 24["24 (hash)"] - 24 -->|"Charset([0x00-0x08, 0x0E-0x1F, !-0xFF])
Some(Variable)"| push1572890[/"comment"/] + 24 -->|"Charset([0x00-0x08, 0x0E-0x1F, !-0xFF])
Some(Variable)"| push1572891[/"comment"/] + 24 -->|"Charset([0x09-0x0D, 0x20])
None"| pop1572864@{ shape: stop } 24 -->|"Chars(Line)
None"| pop1572864@{ shape: stop } - 26["26 (comment)"] - 26 -->|"Chars(Line)
Some(Comment)"| pop1703936@{ shape: stop } + 27["27 (comment)"] + 27 -->|"Chars(Line)
Some(Comment)"| pop1769472@{ shape: stop } **/ -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_B8C5CFDCF118A06C: &[u16; 16] = &[0xe0a8, 0xe0f8, 0xf0f8, 0xf0f8, 0xf0f8, 0x70f8, 0x70f8, 0x70f8, 0x70f8, 0x70f8, 0x70f0, 0x7050, 0x7050, 0x7050, 0x7050, 0x7070]; -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_5194EFCD4A36EDF4: &[u16; 16] = &[0xfffb, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff]; -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_3E7F9F69BC96DA4F: &[u16; 16] = &[0xff7f, 0xffff, 0xff3f, 0xff7b, 0xffbf, 0xffbf, 0xffbf, 0xffff, 0xff7f, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff]; -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_303BCC458DA59CA8: *const u8 = [5, 0x62, 0x72, 0x65, 0x61, 0x6b].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_0C6400D6B9B47208: *const u8 = [4, 0x64, 0x72, 0x6f, 0x70].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_61385E31CBE176AD: *const u8 = [4, 0x65, 0x78, 0x65, 0x63].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_EBD3E4FD8943240A: *const u8 = [1, 0x62].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_669BD43167599AB5: *const u8 = [1, 0x64].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_735B2F4734CDA621: *const u8 = [1, 0x78].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_8FC5A350E96EF029: *const u8 = [4, 0x65, 0x64, 0x69, 0x74].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_F21A2AE3814D4D54: *const u8 = [5, 0x66, 0x69, 0x78, 0x75, 0x70].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_A678DDDA0C17324F: *const u8 = [4, 0x70, 0x69, 0x63, 0x6b].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_691484CDDA2E5DBA: *const u8 = [6, 0x72, 0x65, 0x77, 0x6f, 0x72, 0x64].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_A8F3DDD2082E2211: *const u8 = [6, 0x73, 
0x71, 0x75, 0x61, 0x73, 0x68].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_E561AACBDF93CFD2: *const u8 = [1, 0x65].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_56C0C61729ED1B19: *const u8 = [1, 0x66].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_1DD05CCF89ABD763: *const u8 = [1, 0x70].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_915744BB66D98775: *const u8 = [1, 0x72].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_58F0D67C30FD13C8: *const u8 = [1, 0x73].as_ptr(); -#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_520FD55FC3E4BA24: *const u8 = [1, 0x23].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_0: &[u16; 16] = &[0xe0a8, 0xe0f8, 0xf0f8, 0xf0f8, 0xf0f8, 0x70f8, 0x70f8, 0x70f8, 0x70f8, 0x70f8, 0x70f0, 0x7050, 0x7050, 0x7050, 0x7050, 0x7070]; +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_1: &[u16; 16] = &[0xfffb, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff]; +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_2: &[u16; 16] = &[0x0004, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0000, 0x0000]; +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_3: &[u16; 16] = &[0xff7f, 0xffff, 0xff3f, 0xff7b, 0xffbf, 0xffbf, 0xffbf, 0xffff, 0xff7f, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff]; +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_0: *const u8 = [5, 0x62, 0x72, 0x65, 0x61, 0x6b].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_1: *const u8 = [4, 0x65, 0x78, 0x65, 0x63].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_2: *const u8 = [1, 0x62].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_3: *const u8 = [1, 0x78].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_4: *const u8 = [4, 0x64, 0x72, 0x6f, 0x70].as_ptr(); 
+#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_5: *const u8 = [4, 0x65, 0x64, 0x69, 0x74].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_6: *const u8 = [5, 0x66, 0x69, 0x78, 0x75, 0x70].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_7: *const u8 = [4, 0x70, 0x69, 0x63, 0x6b].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_8: *const u8 = [6, 0x72, 0x65, 0x77, 0x6f, 0x72, 0x64].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_9: *const u8 = [6, 0x73, 0x71, 0x75, 0x61, 0x73, 0x68].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_10: *const u8 = [1, 0x64].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_11: *const u8 = [1, 0x65].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_12: *const u8 = [1, 0x66].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_13: *const u8 = [1, 0x70].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_14: *const u8 = [1, 0x72].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_15: *const u8 = [1, 0x73].as_ptr(); +#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_16: *const u8 = [1, 0x23].as_ptr(); #[rustfmt::skip] pub const LANG_GIT_REBASE_MESSAGE: &Language = &Language { name: "Git Rebase Message", filenames: &["git-rebase-todo"], transitions: &[ - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_303BCC458DA59CA8), None, Change(19)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_0C6400D6B9B47208), None, Change(19)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_61385E31CBE176AD), None, Change(19)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_EBD3E4FD8943240A), None, Change(19)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_669BD43167599AB5), None, Change(19)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_735B2F4734CDA621), None, Change(19)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_8FC5A350E96EF029), None, Change(21)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_F21A2AE3814D4D54), None, Change(21)), 
- t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_A678DDDA0C17324F), None, Change(21)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_691484CDDA2E5DBA), None, Change(21)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_A8F3DDD2082E2211), None, Change(21)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_E561AACBDF93CFD2), None, Change(21)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_56C0C61729ED1B19), None, Change(21)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_1DD05CCF89ABD763), None, Change(21)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_915744BB66D98775), None, Change(21)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_58F0D67C30FD13C8), None, Change(21)), - t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_520FD55FC3E4BA24), None, Change(23)), - t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_3E7F9F69BC96DA4F), None, Pop(0)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_0), None, Change(19)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_1), None, Change(19)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_2), None, Change(19)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_3), None, Change(19)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_4), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_5), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_6), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_7), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_8), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_9), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_10), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_11), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_12), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_13), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_14), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_15), None, Change(21)), + t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_16), None, Change(23)), + t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_3), None, Pop(0)), t(Chars(1), None, Pop(0)), 
- t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_B8C5CFDCF118A06C), None, Pop(0)), - t(Chars(0), Some(Keyword), Push(26, 0)), - t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_B8C5CFDCF118A06C), None, Pop(0)), + t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_0), None, Pop(0)), + t(Chars(0), Some(Keyword), Push(27, 0)), + t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_0), None, Pop(0)), t(Chars(0), Some(Keyword), Push(24, 0)), t(Chars(usize::MAX), Some(Comment), Pop(0)), - t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_5194EFCD4A36EDF4), Some(Variable), Push(26, 24)), + t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_1), Some(Variable), Push(27, 24)), + t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_2), None, Pop(0)), t(Chars(usize::MAX), None, Pop(1)), t(Chars(usize::MAX), Some(Comment), Pop(1)), ], diff --git a/src/lsh/highlighter.rs b/src/lsh/highlighter.rs new file mode 100644 index 0000000..b17bd95 --- /dev/null +++ b/src/lsh/highlighter.rs @@ -0,0 +1,408 @@ +use std::borrow::Cow; +use std::ffi::OsStr; +use std::fmt::Debug; +use std::ops::RangeInclusive; +use std::path::Path; +use std::slice; + +use crate::arena::{Arena, scratch_arena}; +use crate::document::ReadableDocument; +use crate::helpers::*; +use crate::lsh::definitions::*; +use crate::{simd, unicode}; + +pub fn language_from_path(path: &Path) -> Option<&'static Language> { + let filename = path.file_name()?.as_encoded_bytes(); + + for &l in LANGUAGES { + for f in l.filenames { + let f = f.as_bytes(); + if let Some(suffix) = f.strip_prefix(b"*") { + if filename.ends_with(suffix) { + return Some(l); + } + } else if filename == f { + return Some(l); + } + } + } + + None +} + +#[derive(Clone, PartialEq, Eq)] +pub struct Higlight { + pub start: usize, + pub kind: HighlightKind, +} + +impl Debug for Higlight { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "({}, {:?})", self.start, self.kind) + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Default)] +pub struct State {} + +#[derive(Clone)] +pub struct 
Highlighter<'a> { + doc: &'a dyn ReadableDocument, + language: &'static Language, + offset: usize, + logical_pos_y: CoordType, + state_stack: Vec<(u8, HighlightKind)>, +} + +#[derive(Clone)] +pub struct ParserState { + pub offset: usize, + pub logical_pos_y: CoordType, + pub state_stack: Vec<(u8, HighlightKind)>, +} + +impl<'doc> Highlighter<'doc> { + pub fn new(doc: &'doc dyn ReadableDocument, language: &'static Language) -> Self { + Self { + doc, + language, + offset: 0, + logical_pos_y: 0, + state_stack: vec![(0, HighlightKind::Other)], + } + } + + pub fn logical_pos_y(&self) -> CoordType { + self.logical_pos_y + } + + /// Create a restorable snapshot of the current highlighter state + /// so we can resume highlighting from this point later. + pub fn snapshot(&self) -> ParserState { + ParserState { + offset: self.offset, + logical_pos_y: self.logical_pos_y, + state_stack: self.state_stack.clone(), + } + } + + /// Restore the highlighter state from a previously captured snapshot. + pub fn restore(&mut self, snapshot: &ParserState) { + self.offset = snapshot.offset; + self.logical_pos_y = snapshot.logical_pos_y; + self.state_stack = snapshot.state_stack.clone(); + } + + pub fn parse_next_line<'a>(&mut self, arena: &'a Arena) -> Vec { + const MAX_LEN: usize = 32 * KIBI; + + let scratch = scratch_arena(Some(arena)); + let line_beg = self.offset; + let mut res = Vec::new_in(arena); + + if self.offset != 0 { + self.logical_pos_y += 1; + } + + // Accumulate a line of text into `line_buf`. + let line = 'read: { + let mut chunk; + let mut line_buf; + + // Try to read a chunk and see if it contains a newline. + // In that case we can skip concatenating chunks. 
+ { + chunk = self.doc.read_forward(self.offset); + if chunk.is_empty() { + break 'read chunk; + } + + let (off, line) = simd::lines_fwd(chunk, 0, 0, 1); + self.offset += off; + + if line == 1 { + break 'read &chunk[..off]; + } + + let next_chunk = self.doc.read_forward(self.offset); + if next_chunk.is_empty() { + break 'read &chunk[..off]; + } + + line_buf = Vec::new_in(&*scratch); + + // Ensure we don't overflow the heap size with a 1GB long line. + let end = off.min(MAX_LEN - line_buf.len()); + let end = end.min(chunk.len()); + line_buf.extend_from_slice(&chunk[..end]); + + chunk = next_chunk; + } + + // Concatenate chunks until we get a full line. + while line_buf.len() < MAX_LEN { + let (off, line) = simd::lines_fwd(chunk, 0, 0, 1); + self.offset += off; + + // Ensure we don't overflow the heap size with a 1GB long line. + let end = off.min(MAX_LEN - line_buf.len()); + let end = end.min(chunk.len()); + line_buf.extend_from_slice(&chunk[..end]); + + // Start of the next line found. + if line == 1 { + break; + } + + chunk = self.doc.read_forward(self.offset); + if chunk.is_empty() { + break; + } + } + + line_buf.leak() + }; + + // If the line is empty, we reached the end of the document. + // + // If the line is too long, we don't highlight it. + // This is to prevent performance issues with very long lines. 
+ if line.is_empty() || line.len() >= MAX_LEN { + return res; + } + + let line = unicode::strip_newline(line); + let mut off = 0usize; + let mut start = 0usize; + + let &(state, mut kind) = unsafe { self.state_stack.last().unwrap_unchecked() }; + let mut state = state as usize; + + let mut push = |start: usize, kind: HighlightKind| { + if let Some(last) = res.last_mut() { + if last.start == start { + last.kind = kind; + } + if last.kind == kind { + return; + } + } + res.push(Higlight { start, kind }); + }; + + state = state.wrapping_sub(1); + + loop { + state = state.wrapping_add(1); + let t = unsafe { self.language.transitions.get_unchecked(state) }; + + match t.test { + Test::Chars(n) => { + off = off + n.min(line.len() - off); + } + Test::Prefix(str) => { + let str = unsafe { slice::from_raw_parts(str.add(1), str.read() as usize) }; + if !Self::inlined_memcmp(line, off, str) { + continue; + } + off += str.len(); + } + Test::PrefixInsensitive(str) => { + let str = unsafe { slice::from_raw_parts(str.add(1), str.read() as usize) }; + if !Self::inlined_memicmp(line, off, str) { + continue; + } + off += str.len(); + } + Test::Charset(cs) => { + // TODO: http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html#alternative-implementation + if off >= line.len() || !Self::in_set(cs, line[off]) { + continue; + } + while { + off += 1; + off < line.len() && Self::in_set(cs, line[off]) + } {} + } + } + + match t.action { + Action::Change(dst) => { + state = dst as usize; + kind = t.kind.unwrap_or(kind); + } + Action::Push(dst, _) => { + self.state_stack.push((dst, kind)); + + state = dst as usize; + kind = t.kind.unwrap_or(kind); + push(start, kind); + + start = off; + } + Action::Pop(n) => { + kind = t.kind.unwrap_or(kind); + push(start, kind); + + if n != 0 { + let n = n as usize; + self.state_stack.truncate(self.state_stack.len().max(n + 1) - n); + } + + let v = unsafe { self.state_stack.last().unwrap_unchecked() }; + state = v.0 as usize; + kind = v.1; + + start = off; + 
+ if n == 0 && off >= line.len() { + break; + } + } + } + + state = state.wrapping_sub(1); + } + + push(start, kind); + push(line.len(), kind); + + // Adjust the range to account for the line offset. + for h in &mut res { + h.start = line_beg + h.start.min(line.len()); + } + + res + } + + /// A mini-memcmp implementation for short needles. + /// Compares the `haystack` at `off` with the `needle`. + #[inline] + fn inlined_memcmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool { + unsafe { + let needle_len = needle.len(); + if haystack.len() - off < needle_len { + return false; + } + + let mut a = haystack.as_ptr().add(off); + let mut b = needle.as_ptr(); + let mut i = 0; + + while i < needle_len { + let a = *a.add(i); + let b = *b.add(i); + i += 1; + if a != b { + return false; + } + } + + true + } + } + + /// Like `inlined_memcmp`, but case-insensitive. + #[inline] + fn inlined_memicmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool { + unsafe { + let needle_len = needle.len(); + if haystack.len() - off < needle_len { + return false; + } + + let mut a = haystack.as_ptr().add(off); + let mut b = needle.as_ptr(); + let mut i = 0; + + while i < needle_len { + // str in PrefixInsensitive(str) is expected to be lowercase, printable ASCII. + let a = a.add(i).read().to_ascii_lowercase(); + let b = b.add(i).read(); + i += 1; + if a != b { + return false; + } + } + + true + } + } + + #[inline] + fn in_set(bitmap: &[u16; 16], byte: u8) -> bool { + let lo_nibble = byte & 0xf; + let hi_nibble = byte >> 4; + + let bitset = bitmap[lo_nibble as usize]; + let bitmask = 1u16 << hi_nibble; + + (bitset & bitmask) != 0 + } +} + +/*#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_powershell() { + let doc = r#"$response = Read-Host "Delete branch '$branch'? 
[y/N]""#; + let bytes = doc.as_bytes(); + let scratch = scratch_arena(None); + let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG); + + let tokens = parser.parse_next_line(&scratch); + assert_eq!( + tokens, + &[ + Higlight { start: 0, kind: HighlightKind::Variable }, + Higlight { start: 9, kind: HighlightKind::Other }, + Higlight { start: 10, kind: HighlightKind::Operator }, + Higlight { start: 11, kind: HighlightKind::Other }, + Higlight { start: 12, kind: HighlightKind::Method }, + Higlight { start: 21, kind: HighlightKind::Other }, + Higlight { start: 22, kind: HighlightKind::String }, + Higlight { start: 38, kind: HighlightKind::Variable }, + Higlight { start: 45, kind: HighlightKind::String }, + Higlight { start: 54, kind: HighlightKind::Other }, + ] + ); + } + + #[test] + fn test_string() { + let doc = r#""$x";"#; + let bytes = doc.as_bytes(); + let scratch = scratch_arena(None); + let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG); + + let tokens = parser.parse_next_line(&scratch); + assert_eq!( + tokens, + &[ + Higlight { start: 0, kind: HighlightKind::String }, + Higlight { start: 1, kind: HighlightKind::Variable }, + Higlight { start: 3, kind: HighlightKind::String }, + Higlight { start: 4, kind: HighlightKind::Other }, + ] + ); + } + + #[test] + fn test_comment() { + let doc = r#"<#x#>"#; + let bytes = doc.as_bytes(); + let scratch = scratch_arena(None); + let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG); + + let tokens = parser.parse_next_line(&scratch); + assert_eq!( + tokens, + &[ + Higlight { start: 0, kind: HighlightKind::Comment }, + Higlight { start: 5, kind: HighlightKind::Other }, + ] + ); + } +}*/ diff --git a/src/lsh/mod.rs b/src/lsh/mod.rs index 02fd39c..919aa6b 100644 --- a/src/lsh/mod.rs +++ b/src/lsh/mod.rs @@ -1,363 +1,8 @@ //! Welcome to Leonard's Shitty Highlighter. 
+pub mod cache; mod definitions; +mod highlighter; -use std::ffi::OsStr; -use std::fmt::Debug; -use std::ops::RangeInclusive; -use std::path::Path; -use std::slice; - -pub use definitions::{HighlightKind, Language}; - -use crate::arena::{Arena, scratch_arena}; -use crate::document::ReadableDocument; -use crate::helpers::*; -use crate::lsh::definitions::*; -use crate::{simd, unicode}; - -pub fn language_from_path(path: &Path) -> Option<&'static Language> { - let filename = path.file_name()?.as_encoded_bytes(); - - for &l in LANGUAGES { - for f in l.filenames { - let f = f.as_bytes(); - if let Some(suffix) = f.strip_prefix(b"*") { - if filename.ends_with(suffix) { - return Some(l); - } - } else if filename == f { - return Some(l); - } - } - } - - None -} - -#[derive(Clone, PartialEq, Eq)] -pub struct Higlight { - pub start: usize, - pub kind: HighlightKind, -} - -impl Debug for Higlight { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "({}, {:?})", self.start, self.kind) - } -} - -#[derive(Clone, Copy, PartialEq, Eq, Default)] -pub struct State {} - -#[derive(Clone)] -pub struct Highlighter<'a> { - doc: &'a dyn ReadableDocument, - language: &'static Language, - offset: usize, - logical_pos_y: CoordType, - - state: usize, - kind: HighlightKind, - state_stack: Vec<(u8, HighlightKind)>, -} - -impl<'doc> Highlighter<'doc> { - pub fn new(doc: &'doc dyn ReadableDocument, language: &'static Language) -> Self { - Self { - doc, - language, - offset: 0, - logical_pos_y: 0, - - state: 0, - kind: Default::default(), - state_stack: Default::default(), - } - } - - pub fn logical_pos_y(&self) -> CoordType { - self.logical_pos_y - } - - pub fn parse_next_line<'a>(&mut self, arena: &'a Arena) -> Vec { - const MAX_LEN: usize = 32 * KIBI; - - let scratch = scratch_arena(Some(arena)); - let line_beg = self.offset; - let mut line_buf = Vec::new_in(&*scratch); - let mut res = Vec::new_in(arena); - - if self.offset != 0 { - self.logical_pos_y += 1; - } - 
- // Accumulate a line of text into `line_buf`. - { - let mut chunk = self.doc.read_forward(self.offset); - - // Check if the last line was the last line in the document. - if chunk.is_empty() { - return res; - } - - loop { - let (off, line) = simd::lines_fwd(chunk, 0, 0, 1); - self.offset += off; - - // Overly long lines are not highlighted, so we limit the line length to 32 KiB. - // I'm worried it may run into weird edge cases. - let end = off.min(MAX_LEN - line_buf.len()); - // If we're at it we can also help Rust understand that indexing with `end` doesn't panic. - let end = end.min(chunk.len()); - - line_buf.extend_from_slice(&chunk[..end]); - - // If the line is too long, we don't highlight it. - // This is to prevent performance issues with very long lines. - if line_buf.len() >= MAX_LEN { - return res; - } - - // Start of the next line found. - if line == 1 { - break; - } - - chunk = self.doc.read_forward(self.offset); - if chunk.is_empty() { - // End of document reached - break; - } - } - } - - let line_buf = unicode::strip_newline(&line_buf); - let mut off = 0usize; - let mut start = 0usize; - - let mut state = self.state; - let mut kind = self.kind; - - state = state.wrapping_sub(1); - - loop { - state = state.wrapping_add(1); - let t = unsafe { self.language.transitions.get_unchecked(state) }; - - match t.test { - Test::Chars(n) => { - off = off + n.min(line_buf.len() - off); - } - Test::Prefix(str) => { - let str = unsafe { slice::from_raw_parts(str.add(1), str.read() as usize) }; - if !Self::inlined_memcmp(line_buf, off, str) { - continue; - } - off += str.len(); - } - Test::PrefixInsensitive(str) => { - let str = unsafe { slice::from_raw_parts(str.add(1), str.read() as usize) }; - if !Self::inlined_memicmp(line_buf, off, str) { - continue; - } - off += str.len(); - } - Test::Charset(cs) => { - // TODO: http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html#alternative-implementation - if off >= line_buf.len() || !Self::in_set(cs, line_buf[off]) { - 
continue; - } - while { - off += 1; - off < line_buf.len() && Self::in_set(cs, line_buf[off]) - } {} - } - } - - match t.action { - Action::Change(dst) => { - state = dst as usize; - kind = t.kind.unwrap_or(kind); - } - Action::Push(dst, pop_dst) => { - self.state_stack.push((pop_dst, kind)); - - state = dst as usize; - kind = t.kind.unwrap_or(kind); - res.push(Higlight { start, kind }); - - start = off; - } - Action::Pop(n) => { - kind = t.kind.unwrap_or(kind); - res.push(Higlight { start, kind }); - - let l = self.state_stack.last().copied().unwrap_or_default(); - state = l.0 as usize; - kind = l.1; - - if n != 0 { - self.state_stack - .truncate(self.state_stack.len().saturating_sub(n as usize)); - } - - start = off; - - if n == 0 && off >= line_buf.len() { - break; - } - } - } - - state = state.wrapping_sub(1); - } - - if res.last().is_none_or(|h| h.start != start) { - res.push(Higlight { start, kind }); - } - if res.last().is_some_and(|h| h.start != line_buf.len()) { - res.push(Higlight { start: line_buf.len(), kind }); - } - - // Adjust the range to account for the line offset. - for h in &mut res { - h.start = line_beg + h.start.min(line_buf.len()); - } - - self.state = state; - self.kind = kind; - res - } - - /// A mini-memcmp implementation for short needles. - /// Compares the `haystack` at `off` with the `needle`. - #[inline] - fn inlined_memcmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool { - unsafe { - let needle_len = needle.len(); - if haystack.len() - off < needle_len { - return false; - } - - let mut a = haystack.as_ptr().add(off); - let mut b = needle.as_ptr(); - let mut i = 0; - - while i < needle_len { - let a = *a.add(i); - let b = *b.add(i); - i += 1; - if a != b { - return false; - } - } - - true - } - } - - /// Like `inlined_memcmp`, but case-insensitive. 
- #[inline] - fn inlined_memicmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool { - unsafe { - let needle_len = needle.len(); - if haystack.len() - off < needle_len { - return false; - } - - let mut a = haystack.as_ptr().add(off); - let mut b = needle.as_ptr(); - let mut i = 0; - - while i < needle_len { - // str in PrefixInsensitive(str) is expected to be lowercase, printable ASCII. - let a = a.add(i).read().to_ascii_lowercase(); - let b = b.add(i).read(); - i += 1; - if a != b { - return false; - } - } - - true - } - } - - #[inline] - fn in_set(bitmap: &[u16; 16], byte: u8) -> bool { - let lo_nibble = byte & 0xf; - let hi_nibble = byte >> 4; - - let bitset = bitmap[lo_nibble as usize]; - let bitmask = 1u16 << hi_nibble; - - (bitset & bitmask) != 0 - } -} - -/*#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_powershell() { - let doc = r#"$response = Read-Host "Delete branch '$branch'? [y/N]""#; - let bytes = doc.as_bytes(); - let scratch = scratch_arena(None); - let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG); - - let tokens = parser.parse_next_line(&scratch); - assert_eq!( - tokens, - &[ - Higlight { start: 0, kind: HighlightKind::Variable }, - Higlight { start: 9, kind: HighlightKind::Other }, - Higlight { start: 10, kind: HighlightKind::Operator }, - Higlight { start: 11, kind: HighlightKind::Other }, - Higlight { start: 12, kind: HighlightKind::Method }, - Higlight { start: 21, kind: HighlightKind::Other }, - Higlight { start: 22, kind: HighlightKind::String }, - Higlight { start: 38, kind: HighlightKind::Variable }, - Higlight { start: 45, kind: HighlightKind::String }, - Higlight { start: 54, kind: HighlightKind::Other }, - ] - ); - } - - #[test] - fn test_string() { - let doc = r#""$x";"#; - let bytes = doc.as_bytes(); - let scratch = scratch_arena(None); - let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG); - - let tokens = parser.parse_next_line(&scratch); - assert_eq!( - tokens, - &[ - Higlight { 
start: 0, kind: HighlightKind::String }, - Higlight { start: 1, kind: HighlightKind::Variable }, - Higlight { start: 3, kind: HighlightKind::String }, - Higlight { start: 4, kind: HighlightKind::Other }, - ] - ); - } - - #[test] - fn test_comment() { - let doc = r#"<#x#>"#; - let bytes = doc.as_bytes(); - let scratch = scratch_arena(None); - let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG); - - let tokens = parser.parse_next_line(&scratch); - assert_eq!( - tokens, - &[ - Higlight { start: 0, kind: HighlightKind::Comment }, - Higlight { start: 5, kind: HighlightKind::Other }, - ] - ); - } -}*/ +pub use definitions::*; +pub use highlighter::*; diff --git a/tools/highlighter-gen/src/definitions.rs b/tools/highlighter-gen/src/definitions.rs index 1fe2290..f2355ca 100644 --- a/tools/highlighter-gen/src/definitions.rs +++ b/tools/highlighter-gen/src/definitions.rs @@ -81,10 +81,10 @@ const LANG_GIT_REBASE: Language = Language { State { name: "ground", rules: &[ - re(r#"(?:break|drop|exec|b|d|x)\b{end-half}"#) + re(r#"(?:break|exec|b|x)\b{end-half}"#) .is(Keyword) .then_call("comment"), - re(r#"(?:edit|fixup|pick|reword|squash|e|f|p|r|s)\b{end-half}"#) + re(r#"(?:drop|edit|fixup|pick|reword|squash|d|e|f|p|r|s)\b{end-half}"#) .is(Keyword) .then_call("hash"), re(r#"#.*"#).is(Comment), @@ -94,6 +94,7 @@ const LANG_GIT_REBASE: Language = Language { name: "hash", rules: &[ re(r#"\S+"#).is(Variable).then_call("comment"), + re(r#"\s+"#), re(r#".*"#).then_return(), ], }, @@ -422,6 +423,13 @@ pub struct State { pub rules: &'static [Rule], } +pub enum Instruction { + Continue, + Change(&'static str), + Push(&'static str), + Pop, +} + pub struct Rule { pub pattern: &'static str, pub kind: Option, diff --git a/tools/highlighter-gen/src/interner.rs b/tools/highlighter-gen/src/interner.rs deleted file mode 100644 index 2051048..0000000 --- a/tools/highlighter-gen/src/interner.rs +++ /dev/null @@ -1,32 +0,0 @@ -use std::rc::{Rc, Weak}; - -pub struct Interner { - list: 
Vec>, -} - -impl Default for Interner { - fn default() -> Self { - Interner { list: Vec::new() } - } -} - -impl Interner { - pub fn extract(&self) -> Vec> { - self.list.iter().filter_map(Weak::upgrade).collect() - } - - pub fn intern(&mut self, value: T) -> Rc { - if let Some(rc) = self - .list - .iter() - .filter_map(|w| w.upgrade()) - .find(|c| **c == value) - { - return rc; - } - - let rc = Rc::new(value); - self.list.push(Rc::downgrade(&rc)); - rc - } -} diff --git a/tools/highlighter-gen/src/main.rs b/tools/highlighter-gen/src/main.rs index b9620ee..2591fbb 100644 --- a/tools/highlighter-gen/src/main.rs +++ b/tools/highlighter-gen/src/main.rs @@ -13,11 +13,9 @@ mod definitions; mod handles; -mod interner; mod transformer; use std::fmt::Write as _; -use std::hash::{DefaultHasher, Hash, Hasher as _}; use std::io::Write as _; use indoc::{indoc, writedoc}; @@ -46,6 +44,12 @@ fn main() { pub transitions: &'static [Transition<'static>], } + impl PartialEq for Language { + fn eq(&self, other: &Self) -> bool { + std::ptr::eq(self, other) + } + } + pub struct Transition<'a> { pub test: Test<'a>, pub kind: Option, @@ -146,13 +150,13 @@ fn main() { builder.format_as_mermaid() ); - for cs in builder.extract_charsets() { + for (h, cs) in builder.extract_charsets() { _ = writedoc!( output, " - #[rustfmt::skip] const LANG_{}_CHARSET_{:016X}: &[u16; 16] = &[", + #[rustfmt::skip] const LANG_{}_CHARSET_{}: &[u16; 16] = &[", name_uppercase, - calculate_hash(&cs) + h.0, ); for lo in 0..16 { if lo > 0 { @@ -167,13 +171,13 @@ fn main() { _ = writeln!(output, "];"); } - for s in builder.extract_strings() { + for (h, s) in builder.extract_strings() { _ = writedoc!( output, " - #[rustfmt::skip] const LANG_{}_STRING_{:016X}: *const u8 = [", + #[rustfmt::skip] const LANG_{}_STRING_{}: *const u8 = [", name_uppercase, - calculate_hash(&s), + h.0, ); _ = write!(output, "{}", s.len()); for &c in s.as_bytes() { @@ -202,25 +206,13 @@ fn main() { format!("Chars({n})") } GraphTest::Charset(cs) 
=> { - format!( - "Charset(LANG_{}_CHARSET_{:016X})", - name_uppercase, - calculate_hash(&cs) - ) + format!("Charset(LANG_{}_CHARSET_{})", name_uppercase, cs.0) } GraphTest::Prefix(s) => { - format!( - "Prefix(LANG_{}_STRING_{:016X})", - name_uppercase, - calculate_hash(&s) - ) + format!("Prefix(LANG_{}_STRING_{})", name_uppercase, s.0) } GraphTest::PrefixInsensitive(s) => { - format!( - "PrefixInsensitive(LANG_{}_STRING_{:016X})", - name_uppercase, - calculate_hash(&s) - ) + format!("PrefixInsensitive(LANG_{}_STRING_{})", name_uppercase, s.0) } }; let action = match &t.dst { @@ -271,9 +263,3 @@ fn main() { _ = std::io::stdout().write_all(output.as_bytes()); } - -fn calculate_hash(t: &T) -> u64 { - let mut s = DefaultHasher::new(); - t.hash(&mut s); - s.finish() -} diff --git a/tools/highlighter-gen/src/transformer.rs b/tools/highlighter-gen/src/transformer.rs index 137ef4a..55f7f90 100644 --- a/tools/highlighter-gen/src/transformer.rs +++ b/tools/highlighter-gen/src/transformer.rs @@ -1,23 +1,23 @@ use std::fmt::{self, Write as _}; use std::mem; use std::ops::{Index, IndexMut}; -use std::rc::Rc; use regex_syntax::hir::{Class, ClassBytes, ClassBytesRange, Hir, HirKind, Look}; use crate::definitions::*; use crate::handles::{HandleVec, declare_handle}; -use crate::interner::Interner; declare_handle!(pub StateHandle(usize)); declare_handle!(pub TransitionHandle(usize)); +declare_handle!(pub CharsetHandle(usize)); +declare_handle!(pub StringHandle(usize)); pub struct GraphBuilder { roots: RootList, states: HandleVec, transitions: HandleVec, - charsets: Interner, - strings: Interner, + charsets: HandleVec, + strings: HandleVec, origin: i32, } @@ -151,7 +151,7 @@ impl GraphBuilder { lit: &[u8], ) -> GraphAction { let prefix = String::from_utf8(lit.to_vec()).unwrap(); - let prefix = self.strings.intern(prefix); + let prefix = self.intern_string(prefix); self.add_transition(kind, src, dst, GraphTest::Prefix(prefix)) } @@ -164,7 +164,7 @@ impl GraphBuilder { class: 
&ClassBytes, ) -> GraphAction { let c = self.class_to_charset(class); - let c = self.charsets.intern(c); + let c = self.intern_charset(&c); self.add_transition(kind, src, dst, GraphTest::Charset(c)) } @@ -200,9 +200,9 @@ impl GraphBuilder { { charset[upper] = false; str.make_ascii_lowercase(); - GraphTest::PrefixInsensitive(self.strings.intern(str)) + GraphTest::PrefixInsensitive(self.intern_string(str)) } else { - GraphTest::Prefix(self.strings.intern(str)) + GraphTest::Prefix(self.intern_string(str)) }; let d = self.add_transition(kind, src, dst, test); @@ -304,7 +304,7 @@ impl GraphBuilder { }; if let Some(str) = prefix_insensitive { - let str = self.strings.intern(str); + let str = self.intern_string(str); src = self.add_transition(kind, src_idx, dst, GraphTest::PrefixInsensitive(str)); } else { src = self.transform(kind, src_idx, dst, hir); @@ -389,15 +389,19 @@ impl GraphBuilder { for t in self.transitions_from_state(src) { use GraphTest::*; - if match (&t.test, &test) { + if match (t.test, test) { (Chars(_), _) => true, - (Charset(p), Charset(n)) => n.is_superset(p), - (Charset(p), Prefix(n)) => p.covers_char(n.as_bytes()[0]), - (Charset(p), PrefixInsensitive(n)) => p.covers_char_insensitive(n.as_bytes()[0]), - (Prefix(p), Prefix(s)) => s.starts_with(p.as_str()), + (Charset(p), Charset(n)) => self.charsets[n].is_superset(&self.charsets[p]), + (Charset(p), Prefix(n)) => { + self.charsets[p].covers_char(self.strings[n].as_bytes()[0]) + } + (Charset(p), PrefixInsensitive(n)) => { + self.charsets[p].covers_char_insensitive(self.strings[n].as_bytes()[0]) + } + (Prefix(p), Prefix(s)) => self.strings[s].starts_with(self.strings[p].as_str()), (PrefixInsensitive(p), Prefix(s) | PrefixInsensitive(s)) => { - let s = s.as_bytes(); - let p = p.as_bytes(); + let s = self.strings[s].as_bytes(); + let p = self.strings[p].as_bytes(); p.len() <= s.len() && s[..p.len()].eq_ignore_ascii_case(p) } _ => false, @@ -419,7 +423,27 @@ impl GraphBuilder { dst } + fn 
intern_charset(&mut self, cs: &Charset) -> CharsetHandle { + if let Some((idx, _)) = self.charsets.enumerate().find(|&(_, v)| v == cs) { + idx + } else { + self.charsets.push(cs.clone()) + } + } + + fn intern_string(&mut self, string: String) -> StringHandle { + if let Some((idx, _)) = self.strings.enumerate().find(|&(_, v)| *v == string) { + idx + } else { + self.strings.push(string) + } + } + pub fn finalize(&mut self) { + if self.states.is_empty() { + return; + } + self.finalize_resolve_root_aliases(); self.finalize_compute_charset_coverage(); self.finalize_add_root_loops(); @@ -456,9 +480,9 @@ impl GraphBuilder { /// Technically we don't need to do that for the root states. fn finalize_compute_charset_coverage(&mut self) { for t in &self.transitions { - match &t.test { + match t.test { GraphTest::Chars(_) => self.states[t.src].coverage.fill(true), - GraphTest::Charset(c) => self.states[t.src].coverage.merge(c), + GraphTest::Charset(c) => self.states[t.src].coverage.merge(&self.charsets[c]), _ => {} } } @@ -482,20 +506,20 @@ impl GraphBuilder { let mut cs = Charset::no(); for t in self.transitions_from_state(src) { - match &t.test { + match t.test { GraphTest::Chars(_) => { cs.fill(true); break; } GraphTest::Charset(c) => { - cs.merge(c); + cs.merge(&self.charsets[c]); } GraphTest::Prefix(s) => { - let ch = s.as_bytes()[0]; + let ch = self.strings[s].as_bytes()[0]; cs.set(ch, true); } GraphTest::PrefixInsensitive(s) => { - let ch = s.as_bytes()[0]; + let ch = self.strings[s].as_bytes()[0]; cs.set(ch.to_ascii_uppercase(), true); cs.set(ch.to_ascii_lowercase(), true); } @@ -505,10 +529,11 @@ impl GraphBuilder { if !cs.covers_all() { cs.invert(); + let cs = self.intern_charset(&cs); self.transitions.push(GraphTransition { origin: -1, src, - test: GraphTest::Charset(self.charsets.intern(cs)), + test: GraphTest::Charset(cs), kind: None, dst: GraphAction::Pop(0), }); @@ -542,9 +567,9 @@ impl GraphBuilder { t.clone() }; - let fallback_cs = match &fallback.test { + let 
fallback_cs = match fallback.test { GraphTest::Chars(_) => &CS_YES, - GraphTest::Charset(c) => &**c, + GraphTest::Charset(c) => &self.charsets[c], _ => unreachable!(), }; @@ -579,7 +604,7 @@ impl GraphBuilder { match t.dst { GraphAction::Fallback => { - t.test = fallback.test.clone(); + t.test = fallback.test; t.dst = fallback.dst; } GraphAction::Change(dst) if !visited[dst.0] => { @@ -588,13 +613,13 @@ impl GraphBuilder { // Check if the fallback is a superset of this transition. // This applies recursively, which means we assert that the fallback covers // the entire "path" from the original `fallback.src` down to this state. - if match &t.test { + if match t.test { GraphTest::Chars(0) => true, GraphTest::Chars(_) => fallback_cs.covers_all(), - GraphTest::Charset(c) => fallback_cs.is_superset(c), - GraphTest::Prefix(s) => fallback_cs.covers_str(s), + GraphTest::Charset(c) => fallback_cs.is_superset(&self.charsets[c]), + GraphTest::Prefix(s) => fallback_cs.covers_str(&self.strings[s]), GraphTest::PrefixInsensitive(s) => { - fallback_cs.covers_str_insensitive(s) + fallback_cs.covers_str_insensitive(&self.strings[s]) } } { stack.push(dst); @@ -794,26 +819,19 @@ impl GraphBuilder { } } - let label = match &t.test { + let label = match t.test { GraphTest::Chars(usize::MAX) => "Chars(Line)".to_string(), GraphTest::Chars(n) => format!("Chars({n})"), - GraphTest::Charset(c) => format!("Charset({c:?})"), + GraphTest::Charset(c) => format!("Charset({:?})", &self.charsets[c]), GraphTest::Prefix(s) => { let mut label = String::new(); - _ = write!(label, "Prefix({s}"); + _ = write!(label, "Prefix({}", &self.strings[s]); - loop { - let Some(next) = iter.peek() else { - break; - }; - let GraphTest::Prefix(next_s) = &next.test else { - break; - }; - if next.dst != t.dst { - break; - } - - _ = write!(label, ", {}", next_s); + while let Some(next) = iter.peek() + && let GraphTest::Prefix(next_s) = next.test + && next.dst == t.dst + { + _ = write!(label, ", {}", 
&self.strings[next_s]); iter.next(); } @@ -822,20 +840,13 @@ impl GraphBuilder { } GraphTest::PrefixInsensitive(s) => { let mut label = String::new(); - _ = write!(label, "PrefixInsensitive({s}"); + _ = write!(label, "PrefixInsensitive({}", &self.strings[s]); - loop { - let Some(next) = iter.peek() else { - break; - }; - let GraphTest::PrefixInsensitive(next_s) = &next.test else { - break; - }; - if next.dst != t.dst { - break; - } - - _ = write!(label, ", {next_s}"); + while let Some(next) = iter.peek() + && let GraphTest::PrefixInsensitive(next_s) = next.test + && next.dst == t.dst + { + _ = write!(label, ", {}", &self.strings[next_s]); iter.next(); } @@ -885,12 +896,38 @@ impl GraphBuilder { output } - pub fn extract_charsets(&self) -> Vec> { - self.charsets.extract() + /// Filtered down to only those that are still used. + pub fn extract_charsets(&self) -> Vec<(CharsetHandle, Charset)> { + let mut used = vec![false; self.charsets.len()]; + + for t in &self.transitions { + if let GraphTest::Charset(c) = t.test { + used[c.0] = true; + } + } + + self.charsets + .enumerate() + .filter(|&(h, _)| used[h.0]) + .map(|(h, v)| (h, v.clone())) + .collect() } - pub fn extract_strings(&self) -> Vec> { - self.strings.extract() + /// Filtered down to only those that are still used. 
+ pub fn extract_strings(&self) -> Vec<(StringHandle, String)> { + let mut used = vec![false; self.strings.len()]; + + for t in &self.transitions { + if let GraphTest::Prefix(s) | GraphTest::PrefixInsensitive(s) = t.test { + used[s.0] = true; + } + } + + self.strings + .enumerate() + .filter(|&(h, _)| used[h.0]) + .map(|(h, v)| (h, v.clone())) + .collect() } /// Up to this point we've thought of this as a graph, but now we'll flatten @@ -989,28 +1026,14 @@ pub enum GraphAction { Fallback, // replace with a fallback transition (for look-aheads like \b) } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum GraphTest { Chars(usize), - Charset(Rc), - Prefix(Rc), - PrefixInsensitive(Rc), + Charset(CharsetHandle), + Prefix(StringHandle), + PrefixInsensitive(StringHandle), } -impl PartialEq for GraphTest { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (GraphTest::Chars(a), GraphTest::Chars(b)) => a == b, - (GraphTest::Charset(a), GraphTest::Charset(b)) => Rc::ptr_eq(a, b), - (GraphTest::Prefix(a), GraphTest::Prefix(b)) => Rc::ptr_eq(a, b), - (GraphTest::PrefixInsensitive(a), GraphTest::PrefixInsensitive(b)) => Rc::ptr_eq(a, b), - _ => false, - } - } -} - -impl Eq for GraphTest {} - #[derive(Debug, Clone)] pub struct GraphTransition { origin: i32, @@ -1107,7 +1130,7 @@ impl fmt::Debug for Charset { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let show_char = |f: &mut fmt::Formatter<'_>, b: usize| { let b = b as u8; - if b.is_ascii_graphic() || b == b' ' { + if b.is_ascii_graphic() { let b = b as char; write!(f, "{b}") } else {