This commit is contained in:
Leonard Hecker 2025-08-21 16:57:23 +02:00
parent 38c61115d4
commit 5fe48ed71e
15 changed files with 850 additions and 700 deletions

View file

@ -1422,6 +1422,32 @@ uk = "Закрити"
zh_hans = "关闭"
zh_hant = "關閉"
[LanguageSelectMode]
en = "Select Language Mode"
de = "Sprachmodus auswählen"
es = "Seleccionar modo de lenguaje"
fr = "Sélectionner le mode du langage"
it = "Seleziona modalità del linguaggio"
ja = "言語モードの選択"
ko = "언어 모드 선택"
pt_br = "Selecionar modo de linguagem"
ru = "Выбрать режим языка"
zh_hans = "选择语言模式"
zh_hant = "選擇語言模式"
[LanguageAutoDetect]
en = "Auto Detect"
de = "Automatisch erkennen"
es = "Detección automática"
fr = "Détection automatique"
it = "Rilevamento automatico"
ja = "自動検出"
ko = "자동 감지"
pt_br = "Detectar automaticamente"
ru = "Определить автоматически"
zh_hans = "自动检测"
zh_hant = "自動偵測"
[EncodingReopen]
en = "Reopen with encoding…"
bn = "এনকোডিং সহ পুনরায় খুলুন"

View file

@ -8,7 +8,7 @@ use std::path::{Path, PathBuf};
use edit::buffer::{RcTextBuffer, TextBuffer};
use edit::helpers::{CoordType, Point};
use edit::lsh::language_from_path;
use edit::lsh::{Language, language_from_path};
use edit::{apperr, path, sys};
use crate::state::DisplayablePathBuf;
@ -20,6 +20,7 @@ pub struct Document {
pub filename: String,
pub file_id: Option<sys::FileId>,
pub new_file_counter: usize,
pub language_override: Option<Option<&'static Language>>,
}
impl Document {
@ -63,15 +64,32 @@ impl Document {
let filename = path.file_name().unwrap_or_default().to_string_lossy().into_owned();
let dir = path.parent().map(ToOwned::to_owned).unwrap_or_default();
{
let mut tb = self.buffer.borrow_mut();
tb.set_language(language_from_path(&path));
tb.set_ruler(if filename == "COMMIT_EDITMSG" { 72 } else { 0 });
}
self.filename = filename;
self.dir = Some(DisplayablePathBuf::from_path(dir));
self.path = Some(path);
self.buffer.borrow_mut().set_ruler(if self.filename == "COMMIT_EDITMSG" { 72 } else { 0 });
self.update_language();
}
/// Clears any manual language override and re-derives the buffer language.
///
/// After this call `update_language` falls back to detection based on the
/// document path (or no language at all if the document has no path).
pub fn auto_detect_language(&mut self) {
self.language_override = None;
self.update_language();
}
/// Pins the buffer language to `lang`, bypassing path-based detection.
///
/// `None` is a valid override: it is forwarded to the buffer as "no language".
pub fn override_language(&mut self, lang: Option<&'static Language>) {
self.language_override = Some(lang);
self.update_language();
}
/// Pushes the effective language into the text buffer.
///
/// Precedence: an explicit override wins; otherwise the language is looked up
/// from the document path; a document without a path gets no language.
fn update_language(&mut self) {
    let language = match (self.language_override, &self.path) {
        (Some(forced), _) => forced,
        (None, Some(path)) => language_from_path(path),
        (None, None) => None,
    };
    self.buffer.borrow_mut().set_language(language);
}
}
@ -123,6 +141,7 @@ impl DocumentManager {
filename: Default::default(),
file_id: None,
new_file_counter: 0,
language_override: None,
};
self.gen_untitled_name(&mut doc);
@ -183,6 +202,7 @@ impl DocumentManager {
filename: Default::default(),
file_id,
new_file_counter: 0,
language_override: None,
};
doc.set_path(path);

View file

@ -6,6 +6,7 @@ use edit::framebuffer::{Attributes, IndexedColor};
use edit::fuzzy::score_fuzzy;
use edit::helpers::*;
use edit::input::vk;
use edit::lsh::LANGUAGES;
use edit::tui::*;
use edit::{arena_format, icu};
@ -26,15 +27,21 @@ pub fn draw_statusbar(ctx: &mut Context, state: &mut State) {
ctx.table_next_row();
if ctx.button("newline", if tb.is_crlf() { "CRLF" } else { "LF" }, ButtonStyle::default()) {
let is_crlf = tb.is_crlf();
tb.normalize_newlines(!is_crlf);
}
state.wants_language_picker |= ctx.button(
"language",
tb.language().map_or("Plain Text", |l| l.name),
ButtonStyle::default(),
);
if state.wants_statusbar_focus {
state.wants_statusbar_focus = false;
ctx.steal_focus();
}
if ctx.button("newline", if tb.is_crlf() { "CRLF" } else { "LF" }, ButtonStyle::default()) {
let is_crlf = tb.is_crlf();
tb.normalize_newlines(!is_crlf);
}
state.wants_encoding_picker |=
ctx.button("encoding", tb.encoding(), ButtonStyle::default());
if state.wants_encoding_picker {
@ -199,6 +206,55 @@ pub fn draw_statusbar(ctx: &mut Context, state: &mut State) {
ctx.table_end();
}
/// Draws the modal language picker for the active document.
///
/// The list contains an "Auto Detect" entry, a "Plain Text" entry and one entry
/// per item in `LANGUAGES`. Activating an entry updates the document's language
/// override and closes the dialog. The dialog also closes when there is no
/// active document or when `modal_end` reports that it should be dismissed.
pub fn draw_dialog_language_change(ctx: &mut Context, state: &mut State) {
    let doc = state.documents.active_mut();
    let mut done = doc.is_none();

    ctx.modal_begin("language", loc(LocId::LanguageSelectMode));
    if let Some(doc) = doc {
        // Leave a margin around the list, but never shrink below 10x10 cells.
        let width = (ctx.size().width - 20).max(10);
        let height = (ctx.size().height - 10).max(10);

        ctx.scrollarea_begin("scrollarea", Size { width, height });
        ctx.attr_background_rgba(ctx.indexed_alpha(IndexedColor::Black, 1, 4));
        ctx.inherit_focus();
        {
            ctx.list_begin("languages");
            ctx.inherit_focus();

            let auto_detect = doc.language_override.is_none();
            let selected = if auto_detect { None } else { doc.buffer.borrow().language() };
            // Only one entry may be pre-selected. While auto-detection is active,
            // `selected` is `None` by construction, so "Plain Text" must additionally
            // require an explicit override; otherwise it would be flagged as selected
            // together with "Auto Detect".
            let plain_text = !auto_detect && selected.is_none();

            if ctx.list_item(auto_detect, loc(LocId::LanguageAutoDetect))
                == ListSelection::Activated
            {
                doc.auto_detect_language();
                done = true;
            }

            if ctx.list_item(plain_text, "Plain Text") == ListSelection::Activated {
                doc.override_language(None);
                done = true;
            }

            for &lang in LANGUAGES {
                if ctx.list_item(Some(lang) == selected, lang.name) == ListSelection::Activated {
                    doc.override_language(Some(lang));
                    done = true;
                }
            }

            ctx.list_end();
        }
        ctx.scrollarea_end();
    }
    done |= ctx.modal_end();

    if done {
        state.wants_language_picker = false;
        ctx.needs_rerender();
    }
}
pub fn draw_dialog_encoding_change(ctx: &mut Context, state: &mut State) {
let encoding = state.documents.active_mut().map_or("", |doc| doc.buffer.borrow().encoding());
let reopen = state.wants_encoding_change == StateEncodingChange::Reopen;

View file

@ -313,6 +313,9 @@ fn draw(ctx: &mut Context, state: &mut State) {
if state.wants_save {
draw_handle_save(ctx, state);
}
if state.wants_language_picker {
draw_dialog_language_change(ctx, state);
}
if state.wants_encoding_change != StateEncodingChange::None {
draw_dialog_encoding_change(ctx, state);
}

View file

@ -152,6 +152,8 @@ pub struct State {
pub search_options: buffer::SearchOptions,
pub search_success: bool,
pub wants_language_picker: bool,
pub wants_encoding_picker: bool,
pub wants_encoding_change: StateEncodingChange,
pub encoding_picker_needle: String,
@ -200,6 +202,8 @@ impl State {
search_options: Default::default(),
search_success: true,
wants_language_picker: false,
wants_encoding_picker: false,
encoding_picker_needle: Default::default(),
encoding_picker_results: Default::default(),

View file

@ -1,116 +0,0 @@
use std::ops::Range;
use crate::{document::ReadableDocument, simd::memchr2};
/// Cache a line/offset pair every CACHE_EVERY lines to speed up line/offset calculations
const CACHE_EVERY: usize = 1024 * 64;
/// A single cached (byte offset, line number) pair.
#[derive(Clone)]
pub struct CachePoint {
// Absolute byte offset of the newline this point was recorded for.
pub index: usize,
// Value of the line counter at the time that newline was found.
pub line: usize,
// pub snapshot: ParserSnapshot
}
/// Sparse index of [`CachePoint`]s used to speed up line/offset lookups.
pub struct LineCache {
// Cache points in the order they were found while scanning the document,
// i.e. ascending by both `index` and `line`.
cache: Vec<CachePoint>,
}
impl LineCache {
    /// Creates an empty cache with no recorded points.
    pub fn new() -> Self {
        Self { cache: Vec::new() }
    }

    /// Rebuilds the cache from scratch by scanning `document` for `\n` bytes.
    ///
    /// A [`CachePoint`] is recorded for every newline whose line counter is a
    /// multiple of `CACHE_EVERY`; its `index` is the absolute byte offset of
    /// that newline.
    pub fn from_document<T: ReadableDocument>(&mut self, document: &T) {
        self.cache.clear();

        let mut offset = 0;
        let mut line = 0;
        loop {
            let text = document.read_forward(offset);
            if text.is_empty() {
                return;
            }

            let mut off = 0;
            loop {
                // NOTE(review): `memchr2` is treated as returning `text.len()` when
                // no match is found - confirm against its definition.
                off = memchr2(b'\n', b'\n', text, off);
                if off == text.len() {
                    break;
                }
                if line % CACHE_EVERY == 0 {
                    self.cache.push(CachePoint { index: offset + off, line });
                }
                line += 1;
                off += 1;
            }

            offset += text.len();
        }
    }

    /// Updates the cache after a deletion.
    ///
    /// `range` is the deleted byte range, and `text` is the content that was deleted.
    /// Cache points inside `range` are dropped; points at or after `range.end` are
    /// shifted back by the length of `text` and by the number of newlines it contained.
    pub fn delete(&mut self, range: Range<usize>, text: &[u8]) {
        let newlines = text.iter().filter(|&&b| b == b'\n').count();
        let removed = text.len();

        self.cache.retain_mut(|point| {
            if point.index < range.start {
                // Entirely before the deletion: unaffected.
                return true;
            }
            if point.index < range.end {
                // The newline this point referred to has been deleted.
                return false;
            }
            point.index -= removed;
            point.line -= newlines;
            true
        });
    }

    /// Updates the cache after an insertion.
    ///
    /// `offset` is where the insertion occurs, and `text` is the inserted content.
    pub fn insert(&mut self, offset: usize, text: &[u8]) {
        let newlines = text.iter().filter(|&&b| b == b'\n').count();
        let len = text.len();

        // A point whose newline sits exactly at `offset` is pushed forward by the
        // insertion as well, hence `>=` rather than `>`.
        for point in self.cache.iter_mut().filter(|p| p.index >= offset) {
            point.index += len;
            point.line += newlines;
        }

        // TODO: This also needs to insert new cache points
    }

    /// Finds the nearest cached line-offset pair relative to a target line.
    ///
    /// An exact match is always returned. Otherwise, if `reverse` is false the
    /// closest point *before* the target is returned, and if `reverse` is true
    /// the closest point *after* the target. `None` is returned whenever the
    /// target lies below the first or above the last cached line.
    ///
    /// NOTE(review): returning `None` above the last cached line with
    /// `reverse == false` (and below the first with `reverse == true`) discards a
    /// usable point; kept as-is because callers may rely on it - confirm.
    pub fn nearest_offset(&self, target_count: usize, reverse: bool) -> Option<CachePoint> {
        match self.cache.binary_search_by_key(&target_count, |p| p.line) {
            Ok(i) => Some(self.cache[i].clone()),
            // target < lowest cache point || target > highest cache point
            Err(i) if i == 0 || i == self.cache.len() => None,
            Err(i) => Some(self.cache[if reverse { i } else { i - 1 }].clone()),
        }
    }
}

View file

@ -42,6 +42,7 @@ use crate::clipboard::Clipboard;
use crate::document::{ReadableDocument, WriteableDocument};
use crate::framebuffer::{Framebuffer, IndexedColor};
use crate::helpers::*;
use crate::lsh::cache::HighlighterCache;
use crate::lsh::{HighlightKind, Highlighter, Language};
use crate::oklab::StraightRgba;
use crate::simd::memchr2;
@ -219,6 +220,7 @@ pub struct TextBuffer {
active_edit_line_info: Option<ActiveEditLineInfo>,
active_edit_depth: i32,
active_edit_off: usize,
active_edit_first_line_y: Option<CoordType>,
stats: TextBufferStatistics,
cursor: Cursor,
@ -230,6 +232,7 @@ pub struct TextBuffer {
selection: Option<TextBufferSelection>,
selection_generation: u32,
search: Option<UnsafeCell<ActiveSearch>>,
highlighter_cache: HighlighterCache,
width: CoordType,
margin_width: CoordType,
@ -272,6 +275,7 @@ impl TextBuffer {
active_edit_line_info: None,
active_edit_depth: 0,
active_edit_off: 0,
active_edit_first_line_y: None,
stats: TextBufferStatistics { logical_lines: 1, visual_lines: 1 },
cursor: Default::default(),
@ -279,6 +283,7 @@ impl TextBuffer {
selection: None,
selection_generation: 0,
search: None,
highlighter_cache: HighlighterCache::new(),
width: 0,
margin_width: 0,
@ -581,8 +586,13 @@ impl TextBuffer {
self.line_highlight_enabled = enabled;
}
/// Returns the language currently assigned to this buffer, if any.
pub fn language(&self) -> Option<&'static Language> {
self.language
}
/// Assigns the buffer language (`None` for no language) and discards every
/// cached highlighter checkpoint.
pub fn set_language(&mut self, language: Option<&'static Language>) {
self.language = language;
self.highlighter_cache.clear_all();
}
/// Sets a ruler column, e.g. 80.
@ -663,6 +673,7 @@ impl TextBuffer {
self.set_selection(None);
self.mark_as_clean();
self.reflow();
self.highlighter_cache.clear_all();
}
/// Copies the contents of the buffer into a string.
@ -1738,6 +1749,14 @@ impl TextBuffer {
if da < db { a } else { b }
};
// If we have a highlighter and a cache, fast-forward to the last checkpoint before
// the first line of the viewport to reduce the amount of work needed.
if let Some(h) = &mut highlighter {
let first_line_cursor =
self.cursor_move_to_visual_internal(cursor, Point { x: origin.x, y: origin.y });
self.highlighter_cache.prepare(h, first_line_cursor.logical_pos.y);
}
let [selection_beg, selection_end] = match self.selection {
None => [Point::MIN, Point::MIN],
Some(TextBufferSelection { beg, end }) => minmax(beg, end),
@ -1970,9 +1989,11 @@ impl TextBuffer {
while h.logical_pos_y() < cursor_beg.logical_pos.y - 1 {
let scratch_alt = scratch_arena(Some(&scratch));
_ = h.parse_next_line(&scratch_alt);
self.highlighter_cache.maybe_store_after_parse(h);
}
let highlights = h.parse_next_line(&scratch);
self.highlighter_cache.maybe_store_after_parse(h);
let mut highlights = highlights.iter();
if let Some(first) = highlights.next() {
@ -2609,6 +2630,13 @@ impl TextBuffer {
let cursor_before = self.cursor;
self.set_cursor_internal(cursor);
// Track the first logical line affected by this edit so we can invalidate
// cached highlighter state starting from here.
if self.active_edit_first_line_y.is_none() {
let y = self.goto_line_start(cursor, cursor.logical_pos.y).logical_pos.y;
self.active_edit_first_line_y = Some(y);
}
// If both the last and this are a Write/Delete operation, we skip allocating a new undo history item.
if history_type != self.last_history_type
|| !matches!(history_type, HistoryType::Write | HistoryType::Delete)
@ -2755,6 +2783,11 @@ impl TextBuffer {
self.stats.visual_lines = self.stats.logical_lines;
}
// Invalidate cached highlighter state starting from the first changed line.
if let Some(y) = self.active_edit_first_line_y.take() {
self.highlighter_cache.invalidate_from(y);
}
self.recalc_after_content_changed();
}

77
src/lsh/cache.rs Normal file
View file

@ -0,0 +1,77 @@
use crate::helpers::CoordType;
use crate::lsh::highlighter::{Highlighter, ParserState};
/// A saved highlighter state together with the logical line it belongs to.
#[derive(Clone)]
struct Checkpoint {
line: CoordType, // snapshot corresponds to the start of this logical line
// Parser state to hand to `Highlighter::restore` when resuming from this checkpoint.
state: ParserState,
}
/// Stores periodic highlighter checkpoints so parsing can resume mid-document.
pub struct HighlighterCache {
// Looked up with binary search on `line`, so this must stay sorted by `line`.
checkpoints: Vec<Checkpoint>,
// A checkpoint is only stored for lines that are a multiple of this value.
interval: CoordType,
}
impl Default for HighlighterCache {
fn default() -> Self {
Self { checkpoints: Vec::new(), interval: 1000 }
}
}
impl HighlighterCache {
    /// Creates an empty cache with the default checkpoint interval.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets how many lines lie between two checkpoints.
    ///
    /// Values below 1 are clamped to 1 so the modulo in
    /// `maybe_store_after_parse` never divides by zero.
    pub fn set_interval(&mut self, interval: CoordType) {
        self.interval = interval.max(1);
    }

    /// Drops every cached checkpoint.
    pub fn clear_all(&mut self) {
        self.checkpoints.clear();
    }

    /// Drop any cached state starting at the given logical line.
    ///
    /// Checkpoints whose line is `>= line` are removed; earlier ones are kept.
    pub fn invalidate_from(&mut self, line: CoordType) {
        // `checkpoints` is sorted by line, so everything before the partition
        // point is strictly older than `line` and remains valid.
        let keep = self.checkpoints.partition_point(|c| c.line < line);
        self.checkpoints.truncate(keep);
    }

    /// Prepare the highlighter to start parsing from the last checkpoint before or at `target_line`.
    /// If none exists, do nothing (the caller will parse from the start).
    pub fn prepare(&self, h: &mut Highlighter, target_line: CoordType) {
        let candidates = self.checkpoints.partition_point(|c| c.line <= target_line);
        if let Some(checkpoint) = self.checkpoints[..candidates].last() {
            h.restore(&checkpoint.state);
        }
    }

    /// After parsing a line, maybe store a checkpoint. The snapshot at this time
    /// corresponds to the start of the next logical line, which is ideal for resuming.
    ///
    /// A checkpoint is stored only for lines that are a multiple of the interval
    /// and only if none exists for that line yet.
    pub fn maybe_store_after_parse(&mut self, h: &Highlighter) {
        let next_line = h.logical_pos_y().saturating_add(1);
        if next_line < 0 || next_line % self.interval != 0 {
            return;
        }

        // The highlighter may have been resumed from an *earlier* checkpoint and is
        // now re-crossing a boundary that is already cached. Comparing against the
        // last element only would append a duplicate, out-of-order entry and break
        // the sorted invariant the binary searches depend on. Insert at the sorted
        // position instead, and only when the line is not present.
        if let Err(idx) = self.checkpoints.binary_search_by_key(&next_line, |c| c.line) {
            self.checkpoints.insert(idx, Checkpoint { line: next_line, state: h.snapshot() });
        }
    }
}

View file

@ -14,6 +14,12 @@ pub struct Language {
pub transitions: &'static [Transition<'static>],
}
impl PartialEq for Language {
    /// Two languages are equal only if they are the very same object in memory;
    /// field contents are not compared.
    ///
    /// NOTE(review): language definitions appear to be declared as `const`
    /// references. Rust does not guarantee a single address per use of a `const`,
    /// so confirm that compared values always originate from the same table.
    fn eq(&self, other: &Self) -> bool {
        let lhs: *const Self = self;
        let rhs: *const Self = other;
        lhs == rhs
    }
}
pub struct Transition<'a> {
pub test: Test<'a>,
pub kind: Option<HighlightKind>,
@ -104,40 +110,40 @@ flowchart TD
20["20 (ignore)"]
20 -->|"Chars(Line)<br/>None"| pop1310720@{ shape: stop }
**/
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_520FD55FC3E4BA24: *const u8 = [1, 0x23].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_121F4AB352225114: *const u8 = [10, 0x64, 0x69, 0x66, 0x66, 0x20, 0x2d, 0x2d, 0x67, 0x69, 0x74].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_EC6DE3D5B94B5BD1: *const u8 = [9, 0x09, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x3a].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_6A3FE2F75546B408: *const u8 = [10, 0x09, 0x6d, 0x6f, 0x64, 0x69, 0x66, 0x69, 0x65, 0x64, 0x3a].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_1218C90A43640858: *const u8 = [10, 0x09, 0x6e, 0x65, 0x77, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x3a].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_8E3289E428F770E2: *const u8 = [9, 0x09, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x3a].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_589722BCE95B4F03: *const u8 = [4, 0x64, 0x69, 0x66, 0x66].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_89C86C21681CF1DD: *const u8 = [3, 0x2d, 0x2d, 0x2d].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_24BADF678A331FDC: *const u8 = [3, 0x2b, 0x2b, 0x2b].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_BBDBF9E07DEFE5B5: *const u8 = [1, 0x2d].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_6D48C1B7C2CD9E76: *const u8 = [1, 0x2b].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_0: *const u8 = [1, 0x23].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_1: *const u8 = [10, 0x64, 0x69, 0x66, 0x66, 0x20, 0x2d, 0x2d, 0x67, 0x69, 0x74].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_2: *const u8 = [9, 0x09, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x3a].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_3: *const u8 = [10, 0x09, 0x6d, 0x6f, 0x64, 0x69, 0x66, 0x69, 0x65, 0x64, 0x3a].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_4: *const u8 = [10, 0x09, 0x6e, 0x65, 0x77, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x3a].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_5: *const u8 = [9, 0x09, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x3a].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_6: *const u8 = [4, 0x64, 0x69, 0x66, 0x66].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_7: *const u8 = [3, 0x2d, 0x2d, 0x2d].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_8: *const u8 = [3, 0x2b, 0x2b, 0x2b].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_9: *const u8 = [1, 0x2d].as_ptr();
#[rustfmt::skip] const LANG_GIT_COMMIT_MESSAGE_STRING_10: *const u8 = [1, 0x2b].as_ptr();
#[rustfmt::skip] pub const LANG_GIT_COMMIT_MESSAGE: &Language = &Language {
name: "Git Commit Message",
filenames: &["COMMIT_EDITMSG", "MERGE_MSG"],
transitions: &[
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_520FD55FC3E4BA24), Some(Comment), Push(4, 0)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_121F4AB352225114), None, Change(3)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_0), Some(Comment), Push(4, 0)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_1), None, Change(3)),
t(Chars(0), None, Change(20)),
t(Chars(usize::MAX), Some(Direct(BrightBlue)), Push(13, 0)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_EC6DE3D5B94B5BD1), None, Change(9)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_6A3FE2F75546B408), None, Change(10)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_1218C90A43640858), None, Change(11)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_8E3289E428F770E2), None, Change(12)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_2), None, Change(9)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_3), None, Change(10)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_4), None, Change(11)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_5), None, Change(12)),
t(Chars(usize::MAX), None, Pop(1)),
t(Chars(usize::MAX), Some(Direct(BrightRed)), Pop(1)),
t(Chars(usize::MAX), Some(Direct(BrightBlue)), Pop(1)),
t(Chars(usize::MAX), Some(Direct(BrightGreen)), Pop(1)),
t(Chars(usize::MAX), Some(Direct(BrightBlue)), Pop(1)),
t(Chars(0), Some(Other), Push(14, 13)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_589722BCE95B4F03), Some(Direct(BrightBlue)), Change(20)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_89C86C21681CF1DD), Some(Direct(BrightBlue)), Change(20)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_24BADF678A331FDC), Some(Direct(BrightBlue)), Change(20)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_BBDBF9E07DEFE5B5), Some(Direct(BrightRed)), Change(20)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_6D48C1B7C2CD9E76), Some(Direct(BrightGreen)), Change(20)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_6), Some(Direct(BrightBlue)), Change(20)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_7), Some(Direct(BrightBlue)), Change(20)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_8), Some(Direct(BrightBlue)), Change(20)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_9), Some(Direct(BrightRed)), Change(20)),
t(Prefix(LANG_GIT_COMMIT_MESSAGE_STRING_10), Some(Direct(BrightGreen)), Change(20)),
t(Chars(0), None, Change(20)),
t(Chars(usize::MAX), None, Pop(0)),
],
@ -153,71 +159,74 @@ config:
---
flowchart TD
0["0 (ground)"]
0 -->|"Prefix(break, drop, exec, b, d, x)<br/>None"| 19
0 -->|"Prefix(edit, fixup, pick, reword, squash, e, f, p, r, s)<br/>None"| 21
0 -->|"Prefix(break, exec, b, x)<br/>None"| 19
0 -->|"Prefix(drop, edit, fixup, pick, reword, squash, d, e, f, p, r, s)<br/>None"| 21
0 -->|"Prefix(#)<br/>None"| 23
0 -->|"Charset([0x00-&quot;, $-a, c, g-o, q, t-w, y-0xFF])<br/>None"| pop0@{ shape: stop }
0 -->|"Chars(1)<br/>None"| pop0@{ shape: stop }
19 -->|"Charset([0-9, A-Z, _, a-z, 0xC2-0xF4])<br/>None"| pop1245184@{ shape: stop }
19 -->|"Chars(0)<br/>Some(Keyword)"| push1245210[/"comment"/]
19 -->|"Chars(0)<br/>Some(Keyword)"| push1245211[/"comment"/]
21 -->|"Charset([0-9, A-Z, _, a-z, 0xC2-0xF4])<br/>None"| pop1376256@{ shape: stop }
21 -->|"Chars(0)<br/>Some(Keyword)"| push1376280[/"hash"/]
23 -->|"Chars(Line)<br/>Some(Comment)"| pop1507328@{ shape: stop }
24["24 (hash)"]
24 -->|"Charset([0x00-0x08, 0x0E-0x1F, !-0xFF])<br/>Some(Variable)"| push1572890[/"comment"/]
24 -->|"Charset([0x00-0x08, 0x0E-0x1F, !-0xFF])<br/>Some(Variable)"| push1572891[/"comment"/]
24 -->|"Charset([0x09-0x0D, 0x20])<br/>None"| pop1572864@{ shape: stop }
24 -->|"Chars(Line)<br/>None"| pop1572864@{ shape: stop }
26["26 (comment)"]
26 -->|"Chars(Line)<br/>Some(Comment)"| pop1703936@{ shape: stop }
27["27 (comment)"]
27 -->|"Chars(Line)<br/>Some(Comment)"| pop1769472@{ shape: stop }
**/
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_B8C5CFDCF118A06C: &[u16; 16] = &[0xe0a8, 0xe0f8, 0xf0f8, 0xf0f8, 0xf0f8, 0x70f8, 0x70f8, 0x70f8, 0x70f8, 0x70f8, 0x70f0, 0x7050, 0x7050, 0x7050, 0x7050, 0x7070];
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_5194EFCD4A36EDF4: &[u16; 16] = &[0xfffb, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff];
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_3E7F9F69BC96DA4F: &[u16; 16] = &[0xff7f, 0xffff, 0xff3f, 0xff7b, 0xffbf, 0xffbf, 0xffbf, 0xffff, 0xff7f, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff];
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_303BCC458DA59CA8: *const u8 = [5, 0x62, 0x72, 0x65, 0x61, 0x6b].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_0C6400D6B9B47208: *const u8 = [4, 0x64, 0x72, 0x6f, 0x70].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_61385E31CBE176AD: *const u8 = [4, 0x65, 0x78, 0x65, 0x63].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_EBD3E4FD8943240A: *const u8 = [1, 0x62].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_669BD43167599AB5: *const u8 = [1, 0x64].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_735B2F4734CDA621: *const u8 = [1, 0x78].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_8FC5A350E96EF029: *const u8 = [4, 0x65, 0x64, 0x69, 0x74].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_F21A2AE3814D4D54: *const u8 = [5, 0x66, 0x69, 0x78, 0x75, 0x70].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_A678DDDA0C17324F: *const u8 = [4, 0x70, 0x69, 0x63, 0x6b].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_691484CDDA2E5DBA: *const u8 = [6, 0x72, 0x65, 0x77, 0x6f, 0x72, 0x64].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_A8F3DDD2082E2211: *const u8 = [6, 0x73, 0x71, 0x75, 0x61, 0x73, 0x68].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_E561AACBDF93CFD2: *const u8 = [1, 0x65].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_56C0C61729ED1B19: *const u8 = [1, 0x66].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_1DD05CCF89ABD763: *const u8 = [1, 0x70].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_915744BB66D98775: *const u8 = [1, 0x72].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_58F0D67C30FD13C8: *const u8 = [1, 0x73].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_520FD55FC3E4BA24: *const u8 = [1, 0x23].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_0: &[u16; 16] = &[0xe0a8, 0xe0f8, 0xf0f8, 0xf0f8, 0xf0f8, 0x70f8, 0x70f8, 0x70f8, 0x70f8, 0x70f8, 0x70f0, 0x7050, 0x7050, 0x7050, 0x7050, 0x7070];
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_1: &[u16; 16] = &[0xfffb, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff];
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_2: &[u16; 16] = &[0x0004, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0000, 0x0000];
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_CHARSET_3: &[u16; 16] = &[0xff7f, 0xffff, 0xff3f, 0xff7b, 0xffbf, 0xffbf, 0xffbf, 0xffff, 0xff7f, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff];
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_0: *const u8 = [5, 0x62, 0x72, 0x65, 0x61, 0x6b].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_1: *const u8 = [4, 0x65, 0x78, 0x65, 0x63].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_2: *const u8 = [1, 0x62].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_3: *const u8 = [1, 0x78].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_4: *const u8 = [4, 0x64, 0x72, 0x6f, 0x70].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_5: *const u8 = [4, 0x65, 0x64, 0x69, 0x74].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_6: *const u8 = [5, 0x66, 0x69, 0x78, 0x75, 0x70].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_7: *const u8 = [4, 0x70, 0x69, 0x63, 0x6b].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_8: *const u8 = [6, 0x72, 0x65, 0x77, 0x6f, 0x72, 0x64].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_9: *const u8 = [6, 0x73, 0x71, 0x75, 0x61, 0x73, 0x68].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_10: *const u8 = [1, 0x64].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_11: *const u8 = [1, 0x65].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_12: *const u8 = [1, 0x66].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_13: *const u8 = [1, 0x70].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_14: *const u8 = [1, 0x72].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_15: *const u8 = [1, 0x73].as_ptr();
#[rustfmt::skip] const LANG_GIT_REBASE_MESSAGE_STRING_16: *const u8 = [1, 0x23].as_ptr();
#[rustfmt::skip] pub const LANG_GIT_REBASE_MESSAGE: &Language = &Language {
name: "Git Rebase Message",
filenames: &["git-rebase-todo"],
transitions: &[
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_303BCC458DA59CA8), None, Change(19)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_0C6400D6B9B47208), None, Change(19)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_61385E31CBE176AD), None, Change(19)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_EBD3E4FD8943240A), None, Change(19)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_669BD43167599AB5), None, Change(19)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_735B2F4734CDA621), None, Change(19)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_8FC5A350E96EF029), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_F21A2AE3814D4D54), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_A678DDDA0C17324F), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_691484CDDA2E5DBA), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_A8F3DDD2082E2211), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_E561AACBDF93CFD2), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_56C0C61729ED1B19), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_1DD05CCF89ABD763), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_915744BB66D98775), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_58F0D67C30FD13C8), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_520FD55FC3E4BA24), None, Change(23)),
t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_3E7F9F69BC96DA4F), None, Pop(0)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_0), None, Change(19)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_1), None, Change(19)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_2), None, Change(19)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_3), None, Change(19)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_4), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_5), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_6), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_7), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_8), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_9), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_10), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_11), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_12), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_13), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_14), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_15), None, Change(21)),
t(Prefix(LANG_GIT_REBASE_MESSAGE_STRING_16), None, Change(23)),
t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_3), None, Pop(0)),
t(Chars(1), None, Pop(0)),
t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_B8C5CFDCF118A06C), None, Pop(0)),
t(Chars(0), Some(Keyword), Push(26, 0)),
t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_B8C5CFDCF118A06C), None, Pop(0)),
t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_0), None, Pop(0)),
t(Chars(0), Some(Keyword), Push(27, 0)),
t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_0), None, Pop(0)),
t(Chars(0), Some(Keyword), Push(24, 0)),
t(Chars(usize::MAX), Some(Comment), Pop(0)),
t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_5194EFCD4A36EDF4), Some(Variable), Push(26, 24)),
t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_1), Some(Variable), Push(27, 24)),
t(Charset(LANG_GIT_REBASE_MESSAGE_CHARSET_2), None, Pop(0)),
t(Chars(usize::MAX), None, Pop(1)),
t(Chars(usize::MAX), Some(Comment), Pop(1)),
],

408
src/lsh/highlighter.rs Normal file
View file

@ -0,0 +1,408 @@
use std::borrow::Cow;
use std::ffi::OsStr;
use std::fmt::Debug;
use std::ops::RangeInclusive;
use std::path::Path;
use std::slice;
use crate::arena::{Arena, scratch_arena};
use crate::document::ReadableDocument;
use crate::helpers::*;
use crate::lsh::definitions::*;
use crate::{simd, unicode};
/// Looks up the language whose filename patterns match the final component of `path`.
///
/// A pattern starting with `*` matches any filename ending with the remainder of
/// the pattern; any other pattern must equal the filename exactly. Languages are
/// tried in `LANGUAGES` order and the first match wins. Returns `None` if `path`
/// has no filename component or nothing matches.
pub fn language_from_path(path: &Path) -> Option<&'static Language> {
    let name = path.file_name()?.as_encoded_bytes();

    LANGUAGES.iter().copied().find(|lang| {
        lang.filenames.iter().any(|pattern| {
            let pattern = pattern.as_bytes();
            match pattern.strip_prefix(b"*") {
                Some(suffix) => name.ends_with(suffix),
                None => name == pattern,
            }
        })
    })
}
/// One highlight entry produced by `Highlighter::parse_next_line`.
#[derive(Clone, PartialEq, Eq)]
pub struct Higlight {
// NOTE(review): presumably the offset within the parsed line at which `kind`
// starts to apply - confirm against the parser body.
pub start: usize,
// The highlight kind associated with this entry.
pub kind: HighlightKind,
}
impl Debug for Higlight {
    /// Formats the highlight compactly as `(start, kind)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { start, kind } = self;
        write!(f, "({start}, {kind:?})")
    }
}
/// Empty marker type; it currently carries no data.
#[derive(Clone, Copy, PartialEq, Eq, Default)]
pub struct State {}
/// Line-by-line highlighter over a document for a fixed language.
#[derive(Clone)]
pub struct Highlighter<'a> {
// Document the lines are read from.
doc: &'a dyn ReadableDocument,
// Language definition this highlighter was created with.
language: &'static Language,
// Byte offset in `doc` at which the next line read starts.
offset: usize,
// Logical line counter; `parse_next_line` bumps it whenever `offset` is non-zero.
logical_pos_y: CoordType,
// Parser stack, seeded with `(0, HighlightKind::Other)`.
// NOTE(review): the `u8` is presumably a state index - confirm.
state_stack: Vec<(u8, HighlightKind)>,
}
/// Snapshot of the mutable parts of a [`Highlighter`], as produced by
/// `Highlighter::snapshot` and consumed by `Highlighter::restore`.
#[derive(Clone)]
pub struct ParserState {
// Copy of the highlighter's byte offset.
pub offset: usize,
// Copy of the highlighter's logical line counter.
pub logical_pos_y: CoordType,
// Copy of the highlighter's parser stack.
pub state_stack: Vec<(u8, HighlightKind)>,
}
impl<'doc> Highlighter<'doc> {
/// Creates a highlighter for `doc` using `language`.
///
/// The highlighter starts at byte offset 0 and logical line 0, with a single
/// `(0, HighlightKind::Other)` entry on its state stack.
pub fn new(doc: &'doc dyn ReadableDocument, language: &'static Language) -> Self {
Self {
doc,
language,
offset: 0,
logical_pos_y: 0,
state_stack: vec![(0, HighlightKind::Other)],
}
}
/// Returns the highlighter's current logical line counter.
pub fn logical_pos_y(&self) -> CoordType {
self.logical_pos_y
}
/// Create a restorable snapshot of the current highlighter state
/// so we can resume highlighting from this point later.
pub fn snapshot(&self) -> ParserState {
ParserState {
offset: self.offset,
logical_pos_y: self.logical_pos_y,
state_stack: self.state_stack.clone(),
}
}
/// Restore the highlighter state from a previously captured snapshot.
pub fn restore(&mut self, snapshot: &ParserState) {
self.offset = snapshot.offset;
self.logical_pos_y = snapshot.logical_pos_y;
self.state_stack = snapshot.state_stack.clone();
}
/// Reads the next line from the document and returns its highlights.
///
/// The result is allocated in `arena`. Each entry's `start` has the offset of
/// the line start added to it, so it is relative to the document.
/// An empty vector is returned if no more text could be read, or if the
/// line is `MAX_LEN` bytes or longer (such lines are not highlighted).
///
/// The state stack persists across calls, so the outcome for a line depends
/// on the lines parsed before it (or on the snapshot that was restored).
pub fn parse_next_line<'a>(&mut self, arena: &'a Arena) -> Vec<Higlight, &'a Arena> {
const MAX_LEN: usize = 32 * KIBI;
let scratch = scratch_arena(Some(arena));
let line_beg = self.offset;
let mut res = Vec::new_in(arena);
// Every call that does not start at the very beginning moves on to the next line.
if self.offset != 0 {
self.logical_pos_y += 1;
}
// Accumulate a line of text into `line_buf`.
let line = 'read: {
let mut chunk;
let mut line_buf;
// Try to read a chunk and see if it contains a newline.
// In that case we can skip concatenating chunks.
{
chunk = self.doc.read_forward(self.offset);
if chunk.is_empty() {
break 'read chunk;
}
let (off, line) = simd::lines_fwd(chunk, 0, 0, 1);
self.offset += off;
// Fast path: the line ends within this chunk, so it can be borrowed directly.
if line == 1 {
break 'read &chunk[..off];
}
// Also borrow directly if this chunk was the last one in the document.
let next_chunk = self.doc.read_forward(self.offset);
if next_chunk.is_empty() {
break 'read &chunk[..off];
}
line_buf = Vec::new_in(&*scratch);
// Ensure we don't overflow the heap size with a 1GB long line.
let end = off.min(MAX_LEN - line_buf.len());
let end = end.min(chunk.len());
line_buf.extend_from_slice(&chunk[..end]);
chunk = next_chunk;
}
// Concatenate chunks until we get a full line.
// NOTE(review): if this loop ends because `MAX_LEN` was reached, `self.offset`
// may still point into the middle of the line - confirm that the next call
// is meant to resume from there.
while line_buf.len() < MAX_LEN {
let (off, line) = simd::lines_fwd(chunk, 0, 0, 1);
self.offset += off;
// Ensure we don't overflow the heap size with a 1GB long line.
let end = off.min(MAX_LEN - line_buf.len());
let end = end.min(chunk.len());
line_buf.extend_from_slice(&chunk[..end]);
// Start of the next line found.
if line == 1 {
break;
}
chunk = self.doc.read_forward(self.offset);
if chunk.is_empty() {
break;
}
}
// Turn the scratch-allocated buffer into a plain slice.
line_buf.leak()
};
// If the line is empty, we reached the end of the document.
//
// If the line is too long, we don't highlight it.
// This is to prevent performance issues with very long lines.
if line.is_empty() || line.len() >= MAX_LEN {
return res;
}
let line = unicode::strip_newline(line);
// `off` is the read position within `line`, `start` the beginning of the pending span.
let mut off = 0usize;
let mut start = 0usize;
// SAFETY (assumed): `new` seeds the stack with one entry and `Pop` below never
// truncates it to fewer than one entry. A snapshot with an empty stack passed
// to `restore` would violate this.
let &(state, mut kind) = unsafe { self.state_stack.last().unwrap_unchecked() };
let mut state = state as usize;
// Appends a highlight to `res` while coalescing: an existing last entry with the
// same `start` gets its kind overwritten, and nothing is appended if the last
// entry already has the requested kind.
let mut push = |start: usize, kind: HighlightKind| {
if let Some(last) = res.last_mut() {
if last.start == start {
last.kind = kind;
}
if last.kind == kind {
return;
}
}
res.push(Higlight { start, kind });
};
// The loop increments `state` first thing, so compensate for that here.
// A failed test `continue`s and thereby moves on to the following transition.
state = state.wrapping_sub(1);
loop {
state = state.wrapping_add(1);
// NOTE(review): unchecked index - presumably relies on the generated table
// always containing a matching transition before running past its end; confirm.
let t = unsafe { self.language.transitions.get_unchecked(state) };
match t.test {
// Always matches: consumes up to `n` bytes, clamped to the rest of the line.
Test::Chars(n) => {
off = off + n.min(line.len() - off);
}
// `str` points at a length byte which is followed by that many bytes of text.
Test::Prefix(str) => {
let str = unsafe { slice::from_raw_parts(str.add(1), str.read() as usize) };
if !Self::inlined_memcmp(line, off, str) {
continue;
}
off += str.len();
}
Test::PrefixInsensitive(str) => {
let str = unsafe { slice::from_raw_parts(str.add(1), str.read() as usize) };
if !Self::inlined_memicmp(line, off, str) {
continue;
}
off += str.len();
}
// Matches one or more consecutive bytes that are members of the set.
Test::Charset(cs) => {
// TODO: http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html#alternative-implementation
if off >= line.len() || !Self::in_set(cs, line[off]) {
continue;
}
while {
off += 1;
off < line.len() && Self::in_set(cs, line[off])
} {}
}
}
match t.action {
// Jump to `dst` without touching the stack or emitting a highlight.
Action::Change(dst) => {
state = dst as usize;
kind = t.kind.unwrap_or(kind);
}
// Enter `dst`, remembering it together with the kind that was active before
// the transition, and emit the pending span with the new kind.
Action::Push(dst, _) => {
self.state_stack.push((dst, kind));
state = dst as usize;
kind = t.kind.unwrap_or(kind);
push(start, kind);
start = off;
}
// Emit the pending span, drop up to `n` stack entries (always keeping at
// least one) and continue with the state and kind on top of the stack.
Action::Pop(n) => {
kind = t.kind.unwrap_or(kind);
push(start, kind);
if n != 0 {
let n = n as usize;
self.state_stack.truncate(self.state_stack.len().max(n + 1) - n);
}
let v = unsafe { self.state_stack.last().unwrap_unchecked() };
state = v.0 as usize;
kind = v.1;
start = off;
// `Pop(0)` at the end of the line terminates parsing of this line.
if n == 0 && off >= line.len() {
break;
}
}
}
// Compensate for the increment at the top of the loop.
state = state.wrapping_sub(1);
}
// Flush the pending span and make sure the list ends with an entry at the end of the line.
push(start, kind);
push(line.len(), kind);
// Adjust the range to account for the line offset.
for h in &mut res {
h.start = line_beg + h.start.min(line.len());
}
res
}
/// A mini-memcmp implementation for short needles.
/// Compares the `haystack` at `off` with the `needle`.
///
/// Returns `true` if `haystack[off..]` starts with `needle`. Returns `false`
/// if fewer than `needle.len()` bytes remain, or if `off` lies beyond the end
/// of `haystack` (which previously underflowed the length computation).
#[inline]
fn inlined_memcmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool {
    // A plain byte loop (rather than slice equality) keeps the comparison
    // inlined, which is the point of this helper for very short needles.
    haystack.get(off..).is_some_and(|tail| {
        tail.len() >= needle.len() && tail.iter().zip(needle).all(|(a, b)| a == b)
    })
}
/// Like `inlined_memcmp`, but case-insensitive.
///
/// Only the bytes of `haystack` are lowercased (ASCII); `needle` is compared
/// as-is and is therefore expected to be lowercase, printable ASCII.
/// Returns `false` if fewer than `needle.len()` bytes remain, or if `off` lies
/// beyond the end of `haystack` (which previously underflowed the length computation).
#[inline]
fn inlined_memicmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool {
    haystack.get(off..).is_some_and(|tail| {
        tail.len() >= needle.len()
            && tail.iter().zip(needle).all(|(a, b)| a.to_ascii_lowercase() == *b)
    })
}
/// Tests whether `byte` is a member of the 256-bit set `bitmap`.
///
/// The low nibble of `byte` selects one of the 16 rows,
/// the high nibble selects the bit within that row.
#[inline]
fn in_set(bitmap: &[u16; 16], byte: u8) -> bool {
    let row = bitmap[usize::from(byte & 0x0f)];
    (row >> (byte >> 4)) & 1 == 1
}
}
/*#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_powershell() {
let doc = r#"$response = Read-Host "Delete branch '$branch'? [y/N]""#;
let bytes = doc.as_bytes();
let scratch = scratch_arena(None);
let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG);
let tokens = parser.parse_next_line(&scratch);
assert_eq!(
tokens,
&[
Higlight { start: 0, kind: HighlightKind::Variable },
Higlight { start: 9, kind: HighlightKind::Other },
Higlight { start: 10, kind: HighlightKind::Operator },
Higlight { start: 11, kind: HighlightKind::Other },
Higlight { start: 12, kind: HighlightKind::Method },
Higlight { start: 21, kind: HighlightKind::Other },
Higlight { start: 22, kind: HighlightKind::String },
Higlight { start: 38, kind: HighlightKind::Variable },
Higlight { start: 45, kind: HighlightKind::String },
Higlight { start: 54, kind: HighlightKind::Other },
]
);
}
#[test]
fn test_string() {
let doc = r#""$x";"#;
let bytes = doc.as_bytes();
let scratch = scratch_arena(None);
let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG);
let tokens = parser.parse_next_line(&scratch);
assert_eq!(
tokens,
&[
Higlight { start: 0, kind: HighlightKind::String },
Higlight { start: 1, kind: HighlightKind::Variable },
Higlight { start: 3, kind: HighlightKind::String },
Higlight { start: 4, kind: HighlightKind::Other },
]
);
}
#[test]
fn test_comment() {
let doc = r#"<#x#>"#;
let bytes = doc.as_bytes();
let scratch = scratch_arena(None);
let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG);
let tokens = parser.parse_next_line(&scratch);
assert_eq!(
tokens,
&[
Higlight { start: 0, kind: HighlightKind::Comment },
Higlight { start: 5, kind: HighlightKind::Other },
]
);
}
}*/

View file

@ -1,363 +1,8 @@
//! Welcome to Leonard's Shitty Highlighter.
pub mod cache;
mod definitions;
mod highlighter;
use std::ffi::OsStr;
use std::fmt::Debug;
use std::ops::RangeInclusive;
use std::path::Path;
use std::slice;
pub use definitions::{HighlightKind, Language};
use crate::arena::{Arena, scratch_arena};
use crate::document::ReadableDocument;
use crate::helpers::*;
use crate::lsh::definitions::*;
use crate::{simd, unicode};
/// Finds the first language in `LANGUAGES` with a filename pattern matching `path`.
///
/// Matching is done on the last path component only. Patterns with a leading
/// `*` are suffix matches, all others are exact matches.
/// Yields `None` for paths without a filename and when nothing matches.
pub fn language_from_path(path: &Path) -> Option<&'static Language> {
    let name = path.file_name()?.as_encoded_bytes();

    LANGUAGES.iter().copied().find(|lang| {
        lang.filenames.iter().any(|pattern| {
            let pattern = pattern.as_bytes();
            match pattern.strip_prefix(b"*") {
                Some(suffix) => name.ends_with(suffix),
                None => name == pattern,
            }
        })
    })
}
/// A highlight boundary as produced by `Highlighter::parse_next_line`.
///
/// NOTE(review): the type name is a misspelling of "Highlight"; it is public,
/// so renaming it would affect callers.
#[derive(Clone, PartialEq, Eq)]
pub struct Higlight {
/// Offset at which this highlight begins. `parse_next_line` adds the offset
/// of the line start, so the value is relative to the document.
pub start: usize,
/// The kind of highlight that begins at `start`.
/// Presumably it applies until the `start` of the following entry - confirm with the consumer.
pub kind: HighlightKind,
}
/// Compact debug output of the form `(start, kind)`.
impl Debug for Higlight {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { start, kind } = self;
        f.write_fmt(format_args!("({}, {:?})", start, kind))
    }
}
/// An empty, field-less type.
///
/// NOTE(review): nothing in this file refers to it; it may be a leftover.
#[derive(Clone, Copy, PartialEq, Eq, Default)]
pub struct State {}
/// Line-by-line highlighter that walks the transition table of a [`Language`].
#[derive(Clone)]
pub struct Highlighter<'a> {
/// The document the lines are read from (via `read_forward`).
doc: &'a dyn ReadableDocument,
/// The language whose `transitions` are evaluated.
language: &'static Language,
/// Offset into `doc` at which the next call to `parse_next_line` starts reading.
offset: usize,
/// Incremented by `parse_next_line` on every call that starts at a non-zero `offset`.
logical_pos_y: CoordType,
/// Transition index that `parse_next_line` starts with; written back at the end of each parsed line.
state: usize,
/// Highlight kind that `parse_next_line` starts with; written back at the end of each parsed line.
kind: HighlightKind,
/// Stack of `(state, kind)` pairs pushed by `Action::Push` and consumed by `Action::Pop`.
state_stack: Vec<(u8, HighlightKind)>,
}
impl<'doc> Highlighter<'doc> {
pub fn new(doc: &'doc dyn ReadableDocument, language: &'static Language) -> Self {
Self {
doc,
language,
offset: 0,
logical_pos_y: 0,
state: 0,
kind: Default::default(),
state_stack: Default::default(),
}
}
/// Returns the current logical line counter.
///
/// `parse_next_line` increments it on every call that does not start at offset 0.
pub fn logical_pos_y(&self) -> CoordType {
self.logical_pos_y
}
/// Reads the next line from the document and returns its highlights.
///
/// The result is allocated in `arena`. Each entry's `start` has the offset of
/// the line start added to it, so it is relative to the document.
/// An empty vector is returned if no more text could be read, or if the
/// accumulated line reaches `MAX_LEN` bytes (such lines are not highlighted).
///
/// `self.state`, `self.kind` and the state stack persist across calls, so the
/// outcome for a line depends on the lines parsed before it.
pub fn parse_next_line<'a>(&mut self, arena: &'a Arena) -> Vec<Higlight, &'a Arena> {
const MAX_LEN: usize = 32 * KIBI;
let scratch = scratch_arena(Some(arena));
let line_beg = self.offset;
let mut line_buf = Vec::new_in(&*scratch);
let mut res = Vec::new_in(arena);
// Every call that does not start at the very beginning moves on to the next line.
if self.offset != 0 {
self.logical_pos_y += 1;
}
// Accumulate a line of text into `line_buf`.
{
let mut chunk = self.doc.read_forward(self.offset);
// Check if the last line was the last line in the document.
if chunk.is_empty() {
return res;
}
loop {
let (off, line) = simd::lines_fwd(chunk, 0, 0, 1);
self.offset += off;
// Overly long lines are not highlighted, so we limit the line length to 32 KiB.
// I'm worried it may run into weird edge cases.
let end = off.min(MAX_LEN - line_buf.len());
// If we're at it we can also help Rust understand that indexing with `end` doesn't panic.
let end = end.min(chunk.len());
line_buf.extend_from_slice(&chunk[..end]);
// If the line is too long, we don't highlight it.
// This is to prevent performance issues with very long lines.
// NOTE(review): at this point `self.offset` may still point into the middle
// of the line - confirm that the next call is meant to resume from there.
if line_buf.len() >= MAX_LEN {
return res;
}
// Start of the next line found.
if line == 1 {
break;
}
chunk = self.doc.read_forward(self.offset);
if chunk.is_empty() {
// End of document reached
break;
}
}
}
let line_buf = unicode::strip_newline(&line_buf);
// `off` is the read position within the line, `start` the beginning of the pending span.
let mut off = 0usize;
let mut start = 0usize;
let mut state = self.state;
let mut kind = self.kind;
// The loop increments `state` first thing, so compensate for that here.
// A failed test `continue`s and thereby moves on to the following transition.
state = state.wrapping_sub(1);
loop {
state = state.wrapping_add(1);
// NOTE(review): unchecked index - presumably relies on the generated table
// always containing a matching transition before running past its end; confirm.
let t = unsafe { self.language.transitions.get_unchecked(state) };
match t.test {
// Always matches: consumes up to `n` bytes, clamped to the rest of the line.
Test::Chars(n) => {
off = off + n.min(line_buf.len() - off);
}
// `str` points at a length byte which is followed by that many bytes of text.
Test::Prefix(str) => {
let str = unsafe { slice::from_raw_parts(str.add(1), str.read() as usize) };
if !Self::inlined_memcmp(line_buf, off, str) {
continue;
}
off += str.len();
}
Test::PrefixInsensitive(str) => {
let str = unsafe { slice::from_raw_parts(str.add(1), str.read() as usize) };
if !Self::inlined_memicmp(line_buf, off, str) {
continue;
}
off += str.len();
}
// Matches one or more consecutive bytes that are members of the set.
Test::Charset(cs) => {
// TODO: http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html#alternative-implementation
if off >= line_buf.len() || !Self::in_set(cs, line_buf[off]) {
continue;
}
while {
off += 1;
off < line_buf.len() && Self::in_set(cs, line_buf[off])
} {}
}
}
match t.action {
// Jump to `dst` without touching the stack or emitting a highlight.
Action::Change(dst) => {
state = dst as usize;
kind = t.kind.unwrap_or(kind);
}
// Enter `dst`, remembering `pop_dst` together with the kind that was active
// before the transition, and emit the pending span with the new kind.
Action::Push(dst, pop_dst) => {
self.state_stack.push((pop_dst, kind));
state = dst as usize;
kind = t.kind.unwrap_or(kind);
res.push(Higlight { start, kind });
start = off;
}
// Emit the pending span and continue with the entry on top of the stack
// (or the default entry if the stack is empty).
// NOTE(review): the top entry is read before the `n` entries are removed -
// confirm that this order is intended.
Action::Pop(n) => {
kind = t.kind.unwrap_or(kind);
res.push(Higlight { start, kind });
let l = self.state_stack.last().copied().unwrap_or_default();
state = l.0 as usize;
kind = l.1;
if n != 0 {
self.state_stack
.truncate(self.state_stack.len().saturating_sub(n as usize));
}
start = off;
// `Pop(0)` at the end of the line terminates parsing of this line.
if n == 0 && off >= line_buf.len() {
break;
}
}
}
// Compensate for the increment at the top of the loop.
state = state.wrapping_sub(1);
}
// Flush the pending span unless an entry with that start already exists,
// and make sure the list ends with an entry at the end of the line.
if res.last().is_none_or(|h| h.start != start) {
res.push(Higlight { start, kind });
}
if res.last().is_some_and(|h| h.start != line_buf.len()) {
res.push(Higlight { start: line_buf.len(), kind });
}
// Adjust the range to account for the line offset.
for h in &mut res {
h.start = line_beg + h.start.min(line_buf.len());
}
// Carry the state and kind over to the next line.
self.state = state;
self.kind = kind;
res
}
/// A mini-memcmp implementation for short needles.
/// Compares the `haystack` at `off` with the `needle`.
///
/// Returns `true` if `haystack[off..]` starts with `needle`. Returns `false`
/// if fewer than `needle.len()` bytes remain, or if `off` lies beyond the end
/// of `haystack` (which previously underflowed the length computation).
#[inline]
fn inlined_memcmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool {
    // A plain byte loop (rather than slice equality) keeps the comparison
    // inlined, which is the point of this helper for very short needles.
    haystack.get(off..).is_some_and(|tail| {
        tail.len() >= needle.len() && tail.iter().zip(needle).all(|(a, b)| a == b)
    })
}
/// Like `inlined_memcmp`, but case-insensitive.
///
/// Only the bytes of `haystack` are lowercased (ASCII); `needle` is compared
/// as-is and is therefore expected to be lowercase, printable ASCII.
/// Returns `false` if fewer than `needle.len()` bytes remain, or if `off` lies
/// beyond the end of `haystack` (which previously underflowed the length computation).
#[inline]
fn inlined_memicmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool {
    haystack.get(off..).is_some_and(|tail| {
        tail.len() >= needle.len()
            && tail.iter().zip(needle).all(|(a, b)| a.to_ascii_lowercase() == *b)
    })
}
/// Tests whether `byte` is a member of the 256-bit set `bitmap`.
///
/// The low nibble of `byte` selects one of the 16 rows,
/// the high nibble selects the bit within that row.
#[inline]
fn in_set(bitmap: &[u16; 16], byte: u8) -> bool {
    let row = bitmap[usize::from(byte & 0x0f)];
    (row >> (byte >> 4)) & 1 == 1
}
}
/*#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_powershell() {
let doc = r#"$response = Read-Host "Delete branch '$branch'? [y/N]""#;
let bytes = doc.as_bytes();
let scratch = scratch_arena(None);
let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG);
let tokens = parser.parse_next_line(&scratch);
assert_eq!(
tokens,
&[
Higlight { start: 0, kind: HighlightKind::Variable },
Higlight { start: 9, kind: HighlightKind::Other },
Higlight { start: 10, kind: HighlightKind::Operator },
Higlight { start: 11, kind: HighlightKind::Other },
Higlight { start: 12, kind: HighlightKind::Method },
Higlight { start: 21, kind: HighlightKind::Other },
Higlight { start: 22, kind: HighlightKind::String },
Higlight { start: 38, kind: HighlightKind::Variable },
Higlight { start: 45, kind: HighlightKind::String },
Higlight { start: 54, kind: HighlightKind::Other },
]
);
}
#[test]
fn test_string() {
let doc = r#""$x";"#;
let bytes = doc.as_bytes();
let scratch = scratch_arena(None);
let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG);
let tokens = parser.parse_next_line(&scratch);
assert_eq!(
tokens,
&[
Higlight { start: 0, kind: HighlightKind::String },
Higlight { start: 1, kind: HighlightKind::Variable },
Higlight { start: 3, kind: HighlightKind::String },
Higlight { start: 4, kind: HighlightKind::Other },
]
);
}
#[test]
fn test_comment() {
let doc = r#"<#x#>"#;
let bytes = doc.as_bytes();
let scratch = scratch_arena(None);
let mut parser = Highlighter::new(&bytes, &lang_powershell::LANG);
let tokens = parser.parse_next_line(&scratch);
assert_eq!(
tokens,
&[
Higlight { start: 0, kind: HighlightKind::Comment },
Higlight { start: 5, kind: HighlightKind::Other },
]
);
}
}*/
pub use definitions::*;
pub use highlighter::*;

View file

@ -81,10 +81,10 @@ const LANG_GIT_REBASE: Language = Language {
State {
name: "ground",
rules: &[
re(r#"(?:break|drop|exec|b|d|x)\b{end-half}"#)
re(r#"(?:break|exec|b|x)\b{end-half}"#)
.is(Keyword)
.then_call("comment"),
re(r#"(?:edit|fixup|pick|reword|squash|e|f|p|r|s)\b{end-half}"#)
re(r#"(?:drop|edit|fixup|pick|reword|squash|d|e|f|p|r|s)\b{end-half}"#)
.is(Keyword)
.then_call("hash"),
re(r#"#.*"#).is(Comment),
@ -94,6 +94,7 @@ const LANG_GIT_REBASE: Language = Language {
name: "hash",
rules: &[
re(r#"\S+"#).is(Variable).then_call("comment"),
re(r#"\s+"#),
re(r#".*"#).then_return(),
],
},
@ -422,6 +423,13 @@ pub struct State {
pub rules: &'static [Rule],
}
/// NOTE(review): presumably the action to take after a rule matched; the users
/// of this enum are not visible here, so confirm before relying on this.
pub enum Instruction {
/// Carries no payload.
Continue,
/// Carries a `&'static str`, presumably the name of the state to switch to.
Change(&'static str),
/// Carries a `&'static str`, presumably the name of the state to enter.
Push(&'static str),
/// Carries no payload.
Pop,
}
pub struct Rule {
pub pattern: &'static str,
pub kind: Option<HighlightKind>,

View file

@ -1,32 +0,0 @@
use std::rc::{Rc, Weak};
/// Deduplicates values by handing out shared `Rc<T>` handles.
///
/// Only weak references are kept, so a value lives exactly as long
/// as at least one handle returned by `intern` is alive.
pub struct Interner<T> {
    list: Vec<Weak<T>>,
}

impl<T> Default for Interner<T> {
    /// Creates an interner without any entries.
    fn default() -> Self {
        Self { list: Vec::default() }
    }
}

impl<T: PartialEq> Interner<T> {
    /// Returns handles to all values that are still alive, in insertion order.
    pub fn extract(&self) -> Vec<Rc<T>> {
        let mut alive = Vec::new();
        for weak in &self.list {
            if let Some(rc) = weak.upgrade() {
                alive.push(rc);
            }
        }
        alive
    }

    /// Returns the handle of a live value equal to `value` if there is one.
    /// Otherwise `value` is stored and a handle to it is returned.
    pub fn intern(&mut self, value: T) -> Rc<T> {
        for weak in &self.list {
            match weak.upgrade() {
                Some(existing) if *existing == value => return existing,
                _ => {}
            }
        }

        let fresh = Rc::new(value);
        self.list.push(Rc::downgrade(&fresh));
        fresh
    }
}

View file

@ -13,11 +13,9 @@
mod definitions;
mod handles;
mod interner;
mod transformer;
use std::fmt::Write as _;
use std::hash::{DefaultHasher, Hash, Hasher as _};
use std::io::Write as _;
use indoc::{indoc, writedoc};
@ -46,6 +44,12 @@ fn main() {
pub transitions: &'static [Transition<'static>],
}
impl PartialEq for Language {
fn eq(&self, other: &Self) -> bool {
std::ptr::eq(self, other)
}
}
pub struct Transition<'a> {
pub test: Test<'a>,
pub kind: Option<HighlightKind>,
@ -146,13 +150,13 @@ fn main() {
builder.format_as_mermaid()
);
for cs in builder.extract_charsets() {
for (h, cs) in builder.extract_charsets() {
_ = writedoc!(
output,
"
#[rustfmt::skip] const LANG_{}_CHARSET_{:016X}: &[u16; 16] = &[",
#[rustfmt::skip] const LANG_{}_CHARSET_{}: &[u16; 16] = &[",
name_uppercase,
calculate_hash(&cs)
h.0,
);
for lo in 0..16 {
if lo > 0 {
@ -167,13 +171,13 @@ fn main() {
_ = writeln!(output, "];");
}
for s in builder.extract_strings() {
for (h, s) in builder.extract_strings() {
_ = writedoc!(
output,
"
#[rustfmt::skip] const LANG_{}_STRING_{:016X}: *const u8 = [",
#[rustfmt::skip] const LANG_{}_STRING_{}: *const u8 = [",
name_uppercase,
calculate_hash(&s),
h.0,
);
_ = write!(output, "{}", s.len());
for &c in s.as_bytes() {
@ -202,25 +206,13 @@ fn main() {
format!("Chars({n})")
}
GraphTest::Charset(cs) => {
format!(
"Charset(LANG_{}_CHARSET_{:016X})",
name_uppercase,
calculate_hash(&cs)
)
format!("Charset(LANG_{}_CHARSET_{})", name_uppercase, cs.0)
}
GraphTest::Prefix(s) => {
format!(
"Prefix(LANG_{}_STRING_{:016X})",
name_uppercase,
calculate_hash(&s)
)
format!("Prefix(LANG_{}_STRING_{})", name_uppercase, s.0)
}
GraphTest::PrefixInsensitive(s) => {
format!(
"PrefixInsensitive(LANG_{}_STRING_{:016X})",
name_uppercase,
calculate_hash(&s)
)
format!("PrefixInsensitive(LANG_{}_STRING_{})", name_uppercase, s.0)
}
};
let action = match &t.dst {
@ -271,9 +263,3 @@ fn main() {
_ = std::io::stdout().write_all(output.as_bytes());
}
/// Hashes `t` with a freshly created `DefaultHasher` and returns the 64-bit result.
fn calculate_hash<T: Hash>(t: &T) -> u64 {
    let mut hasher = DefaultHasher::new();
    Hash::hash(t, &mut hasher);
    hasher.finish()
}

View file

@ -1,23 +1,23 @@
use std::fmt::{self, Write as _};
use std::mem;
use std::ops::{Index, IndexMut};
use std::rc::Rc;
use regex_syntax::hir::{Class, ClassBytes, ClassBytesRange, Hir, HirKind, Look};
use crate::definitions::*;
use crate::handles::{HandleVec, declare_handle};
use crate::interner::Interner;
declare_handle!(pub StateHandle(usize));
declare_handle!(pub TransitionHandle(usize));
declare_handle!(pub CharsetHandle(usize));
declare_handle!(pub StringHandle(usize));
pub struct GraphBuilder {
roots: RootList,
states: HandleVec<StateHandle, GraphState>,
transitions: HandleVec<TransitionHandle, GraphTransition>,
charsets: Interner<Charset>,
strings: Interner<String>,
charsets: HandleVec<CharsetHandle, Charset>,
strings: HandleVec<StringHandle, String>,
origin: i32,
}
@ -151,7 +151,7 @@ impl GraphBuilder {
lit: &[u8],
) -> GraphAction {
let prefix = String::from_utf8(lit.to_vec()).unwrap();
let prefix = self.strings.intern(prefix);
let prefix = self.intern_string(prefix);
self.add_transition(kind, src, dst, GraphTest::Prefix(prefix))
}
@ -164,7 +164,7 @@ impl GraphBuilder {
class: &ClassBytes,
) -> GraphAction {
let c = self.class_to_charset(class);
let c = self.charsets.intern(c);
let c = self.intern_charset(&c);
self.add_transition(kind, src, dst, GraphTest::Charset(c))
}
@ -200,9 +200,9 @@ impl GraphBuilder {
{
charset[upper] = false;
str.make_ascii_lowercase();
GraphTest::PrefixInsensitive(self.strings.intern(str))
GraphTest::PrefixInsensitive(self.intern_string(str))
} else {
GraphTest::Prefix(self.strings.intern(str))
GraphTest::Prefix(self.intern_string(str))
};
let d = self.add_transition(kind, src, dst, test);
@ -304,7 +304,7 @@ impl GraphBuilder {
};
if let Some(str) = prefix_insensitive {
let str = self.strings.intern(str);
let str = self.intern_string(str);
src = self.add_transition(kind, src_idx, dst, GraphTest::PrefixInsensitive(str));
} else {
src = self.transform(kind, src_idx, dst, hir);
@ -389,15 +389,19 @@ impl GraphBuilder {
for t in self.transitions_from_state(src) {
use GraphTest::*;
if match (&t.test, &test) {
if match (t.test, test) {
(Chars(_), _) => true,
(Charset(p), Charset(n)) => n.is_superset(p),
(Charset(p), Prefix(n)) => p.covers_char(n.as_bytes()[0]),
(Charset(p), PrefixInsensitive(n)) => p.covers_char_insensitive(n.as_bytes()[0]),
(Prefix(p), Prefix(s)) => s.starts_with(p.as_str()),
(Charset(p), Charset(n)) => self.charsets[n].is_superset(&self.charsets[p]),
(Charset(p), Prefix(n)) => {
self.charsets[p].covers_char(self.strings[n].as_bytes()[0])
}
(Charset(p), PrefixInsensitive(n)) => {
self.charsets[p].covers_char_insensitive(self.strings[n].as_bytes()[0])
}
(Prefix(p), Prefix(s)) => self.strings[s].starts_with(self.strings[p].as_str()),
(PrefixInsensitive(p), Prefix(s) | PrefixInsensitive(s)) => {
let s = s.as_bytes();
let p = p.as_bytes();
let s = self.strings[s].as_bytes();
let p = self.strings[p].as_bytes();
p.len() <= s.len() && s[..p.len()].eq_ignore_ascii_case(p)
}
_ => false,
@ -419,7 +423,27 @@ impl GraphBuilder {
dst
}
/// Returns the handle of a stored charset equal to `cs`.
/// If there is none yet, a clone of `cs` is stored and its new handle is returned.
fn intern_charset(&mut self, cs: &Charset) -> CharsetHandle {
    let existing =
        self.charsets.enumerate().find_map(|(handle, known)| (known == cs).then_some(handle));
    match existing {
        Some(handle) => handle,
        None => self.charsets.push(cs.clone()),
    }
}
/// Returns the handle of a stored string equal to `string`.
/// If there is none yet, `string` is stored and its new handle is returned.
fn intern_string(&mut self, string: String) -> StringHandle {
    let existing =
        self.strings.enumerate().find_map(|(handle, known)| (*known == string).then_some(handle));
    match existing {
        Some(handle) => handle,
        None => self.strings.push(string),
    }
}
pub fn finalize(&mut self) {
if self.states.is_empty() {
return;
}
self.finalize_resolve_root_aliases();
self.finalize_compute_charset_coverage();
self.finalize_add_root_loops();
@ -456,9 +480,9 @@ impl GraphBuilder {
/// Technically we don't need to do that for the root states.
fn finalize_compute_charset_coverage(&mut self) {
for t in &self.transitions {
match &t.test {
match t.test {
GraphTest::Chars(_) => self.states[t.src].coverage.fill(true),
GraphTest::Charset(c) => self.states[t.src].coverage.merge(c),
GraphTest::Charset(c) => self.states[t.src].coverage.merge(&self.charsets[c]),
_ => {}
}
}
@ -482,20 +506,20 @@ impl GraphBuilder {
let mut cs = Charset::no();
for t in self.transitions_from_state(src) {
match &t.test {
match t.test {
GraphTest::Chars(_) => {
cs.fill(true);
break;
}
GraphTest::Charset(c) => {
cs.merge(c);
cs.merge(&self.charsets[c]);
}
GraphTest::Prefix(s) => {
let ch = s.as_bytes()[0];
let ch = self.strings[s].as_bytes()[0];
cs.set(ch, true);
}
GraphTest::PrefixInsensitive(s) => {
let ch = s.as_bytes()[0];
let ch = self.strings[s].as_bytes()[0];
cs.set(ch.to_ascii_uppercase(), true);
cs.set(ch.to_ascii_lowercase(), true);
}
@ -505,10 +529,11 @@ impl GraphBuilder {
if !cs.covers_all() {
cs.invert();
let cs = self.intern_charset(&cs);
self.transitions.push(GraphTransition {
origin: -1,
src,
test: GraphTest::Charset(self.charsets.intern(cs)),
test: GraphTest::Charset(cs),
kind: None,
dst: GraphAction::Pop(0),
});
@ -542,9 +567,9 @@ impl GraphBuilder {
t.clone()
};
let fallback_cs = match &fallback.test {
let fallback_cs = match fallback.test {
GraphTest::Chars(_) => &CS_YES,
GraphTest::Charset(c) => &**c,
GraphTest::Charset(c) => &self.charsets[c],
_ => unreachable!(),
};
@ -579,7 +604,7 @@ impl GraphBuilder {
match t.dst {
GraphAction::Fallback => {
t.test = fallback.test.clone();
t.test = fallback.test;
t.dst = fallback.dst;
}
GraphAction::Change(dst) if !visited[dst.0] => {
@ -588,13 +613,13 @@ impl GraphBuilder {
// Check if the fallback is a superset of this transition.
// This applies recursively, which means we assert that the fallback covers
// the entire "path" from the original `fallback.src` down to this state.
if match &t.test {
if match t.test {
GraphTest::Chars(0) => true,
GraphTest::Chars(_) => fallback_cs.covers_all(),
GraphTest::Charset(c) => fallback_cs.is_superset(c),
GraphTest::Prefix(s) => fallback_cs.covers_str(s),
GraphTest::Charset(c) => fallback_cs.is_superset(&self.charsets[c]),
GraphTest::Prefix(s) => fallback_cs.covers_str(&self.strings[s]),
GraphTest::PrefixInsensitive(s) => {
fallback_cs.covers_str_insensitive(s)
fallback_cs.covers_str_insensitive(&self.strings[s])
}
} {
stack.push(dst);
@ -794,26 +819,19 @@ impl GraphBuilder {
}
}
let label = match &t.test {
let label = match t.test {
GraphTest::Chars(usize::MAX) => "Chars(Line)".to_string(),
GraphTest::Chars(n) => format!("Chars({n})"),
GraphTest::Charset(c) => format!("Charset({c:?})"),
GraphTest::Charset(c) => format!("Charset({:?})", &self.charsets[c]),
GraphTest::Prefix(s) => {
let mut label = String::new();
_ = write!(label, "Prefix({s}");
_ = write!(label, "Prefix({}", &self.strings[s]);
loop {
let Some(next) = iter.peek() else {
break;
};
let GraphTest::Prefix(next_s) = &next.test else {
break;
};
if next.dst != t.dst {
break;
}
_ = write!(label, ", {}", next_s);
while let Some(next) = iter.peek()
&& let GraphTest::Prefix(next_s) = next.test
&& next.dst == t.dst
{
_ = write!(label, ", {}", &self.strings[next_s]);
iter.next();
}
@ -822,20 +840,13 @@ impl GraphBuilder {
}
GraphTest::PrefixInsensitive(s) => {
let mut label = String::new();
_ = write!(label, "PrefixInsensitive({s}");
_ = write!(label, "PrefixInsensitive({}", &self.strings[s]);
loop {
let Some(next) = iter.peek() else {
break;
};
let GraphTest::PrefixInsensitive(next_s) = &next.test else {
break;
};
if next.dst != t.dst {
break;
}
_ = write!(label, ", {next_s}");
while let Some(next) = iter.peek()
&& let GraphTest::PrefixInsensitive(next_s) = next.test
&& next.dst == t.dst
{
_ = write!(label, ", {}", &self.strings[next_s]);
iter.next();
}
@ -885,12 +896,38 @@ impl GraphBuilder {
output
}
pub fn extract_charsets(&self) -> Vec<Rc<Charset>> {
self.charsets.extract()
/// Returns clones of all charsets, paired with their handles, that are
/// referenced by at least one `GraphTest::Charset` transition.
pub fn extract_charsets(&self) -> Vec<(CharsetHandle, Charset)> {
    // Mark every charset that some transition still tests against.
    let mut used = vec![false; self.charsets.len()];
    for t in &self.transitions {
        match t.test {
            GraphTest::Charset(c) => used[c.0] = true,
            _ => {}
        }
    }

    self.charsets
        .enumerate()
        .filter_map(|(handle, cs)| used[handle.0].then(|| (handle, cs.clone())))
        .collect()
}
pub fn extract_strings(&self) -> Vec<Rc<String>> {
self.strings.extract()
/// Returns clones of all strings, paired with their handles, that are referenced
/// by at least one `GraphTest::Prefix` or `GraphTest::PrefixInsensitive` transition.
pub fn extract_strings(&self) -> Vec<(StringHandle, String)> {
    // Mark every string that some transition still tests against.
    let mut used = vec![false; self.strings.len()];
    for t in &self.transitions {
        match t.test {
            GraphTest::Prefix(s) | GraphTest::PrefixInsensitive(s) => used[s.0] = true,
            _ => {}
        }
    }

    self.strings
        .enumerate()
        .filter_map(|(handle, s)| used[handle.0].then(|| (handle, s.clone())))
        .collect()
}
/// Up to this point we've thought of this as a graph, but now we'll flatten
@ -989,28 +1026,14 @@ pub enum GraphAction {
Fallback, // replace with a fallback transition (for look-aheads like \b)
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GraphTest {
Chars(usize),
Charset(Rc<Charset>),
Prefix(Rc<String>),
PrefixInsensitive(Rc<String>),
Charset(CharsetHandle),
Prefix(StringHandle),
PrefixInsensitive(StringHandle),
}
impl PartialEq for GraphTest {
fn eq(&self, other: &Self) -> bool {
match (self, other) {
(GraphTest::Chars(a), GraphTest::Chars(b)) => a == b,
(GraphTest::Charset(a), GraphTest::Charset(b)) => Rc::ptr_eq(a, b),
(GraphTest::Prefix(a), GraphTest::Prefix(b)) => Rc::ptr_eq(a, b),
(GraphTest::PrefixInsensitive(a), GraphTest::PrefixInsensitive(b)) => Rc::ptr_eq(a, b),
_ => false,
}
}
}
impl Eq for GraphTest {}
#[derive(Debug, Clone)]
pub struct GraphTransition {
origin: i32,
@ -1107,7 +1130,7 @@ impl fmt::Debug for Charset {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let show_char = |f: &mut fmt::Formatter<'_>, b: usize| {
let b = b as u8;
if b.is_ascii_graphic() || b == b' ' {
if b.is_ascii_graphic() {
let b = b as char;
write!(f, "{b}")
} else {