This commit is contained in:
Leonard Hecker 2025-08-27 19:37:47 +02:00
parent fa5dd54f6f
commit d661bfdf09
5 changed files with 280 additions and 135 deletions

View file

@ -128,13 +128,13 @@ fn bench_hash(c: &mut Criterion) {
}
fn bench_lsh(c: &mut Criterion) {
let bytes = include_bytes!("../assets/highlighting-tests/powershell.ps1");
let bytes = include_bytes!("../assets/highlighting-tests/COMMIT_EDITMSG");
let bytes = &bytes[..];
let lang = lsh::language_from_path(Path::new("powershell.ps1")).unwrap();
let lang = lsh::language_from_path(Path::new("COMMIT_EDITMSG")).unwrap();
let highlighter = lsh::Highlighter::new(black_box(&bytes), lang);
c.benchmark_group("lsh").throughput(Throughput::Bytes(bytes.len() as u64)).bench_function(
"powershell",
"COMMIT_EDITMSG",
|b| {
b.iter(|| {
let mut h = highlighter.clone();

View file

@ -12,20 +12,43 @@ declare_handle!(pub StateHandle(usize));
declare_handle!(pub StringHandle(usize));
declare_handle!(pub TransitionHandle(usize));
pub const REG_ZERO: u8 = 0;
pub const REG_PROGRAM_COUNTER: u8 = 1;
pub const REG_INPUT_OFFSET: u8 = 2;
pub const REG_HIGHLIGHT_START: u8 = 3;
pub const REG_HIGHLIGHT_KIND: u8 = 4;
/// Names the slots of the VM register file ([`Registers`]).
///
/// A variant is turned into an array index with `as usize`, so the declaration
/// order below defines the layout of the register array.
#[derive(Clone, Copy)]
pub enum Register {
/// Source operand used when an instruction loads a plain constant
/// (`dst = Zero + constant`). Presumably never written to — TODO confirm.
Zero,
/// Index of the next instruction to execute; writing to it performs a jump.
ProgramCounter,
/// Set to the call target by `Call`; copied back into `ProgramCounter` to
/// restart the current procedure.
ProcedureStart,
/// Current read position within the line being highlighted.
InputOffset,
/// Offset at which the highlight span currently being built starts.
HighlightStart,
/// The `HighlightKind` of the span currently being built, stored as a `usize`.
HighlightKind,

// Not an actual register: as the last variant its discriminant equals the
// number of registers, which is used to size `Registers`.
#[allow(clippy::upper_case_acronyms)]
COUNT,
}
impl fmt::Debug for Register {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(match self {
Register::Zero => "Register::Zero",
Register::ProgramCounter => "Register::ProgramCounter",
Register::ProcedureStart => "Register::ProcedureStart",
Register::InputOffset => "Register::InputOffset",
Register::HighlightStart => "Register::HighlightStart",
Register::HighlightKind => "Register::HighlightKind",
Register::COUNT => "Register::COUNT",
})
}
}
#[allow(dead_code)]
pub type Registers = [usize; 5];
pub type Registers = [usize; Register::COUNT as usize];
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub enum Instruction {
// .0 (reg) = .1 (reg) + .2 (constant)
// Note that this allows for jumps as well, by manipulating Register::ProgramCounter.
Add(u8, u8, usize),
Add(Register, Register, usize),
// Typical call/ret instructions.
// The VM takes care of saving the return address.
@ -445,7 +468,14 @@ impl Compiler {
}
}
self.transitions.push(GraphTransition { origin: self.origin, src, test, kind, dst });
self.transitions.push(GraphTransition {
origin: self.origin,
instruction_offset: usize::MAX,
src,
test,
kind,
dst,
});
dst
}
@ -482,90 +512,178 @@ impl Compiler {
let strings = self.extract_strings();
let charsets = self.extract_charsets();
let mut instructions = Vec::new();
#[derive(Clone, Copy)]
enum RelocationTarget {
None,
State(StateHandle),
Transition(TransitionHandle),
RelState(StateHandle),
RelTransition(TransitionHandle),
}
for t in &self.transitions {
use Instruction::*;
use RelocationTarget::*;
// Working state for lowering the transition graph into a flat instruction list.
struct Compiler<'a> {
// Graph states; `instruction_offset` of a state is filled in once the first
// transition leaving it has been emitted.
states: &'a mut HandleVec<StateHandle, GraphState>,
// Graph transitions; `instruction_offset` of a transition is filled in when
// its code is emitted.
transitions: &'a mut HandleVec<TransitionHandle, GraphTransition>,
// The program emitted so far.
instructions: Vec<Instruction>,
// Pending patches: (index into `instructions`, target whose offset was not
// yet known when that instruction was emitted).
relocations: Vec<(usize, RelocationTarget)>,
}
if self.states[t.src].instruction_offset == usize::MAX {
self.states[t.src].instruction_offset = instructions.len();
impl Compiler<'_> {
// Lowers every graph transition into VM instructions and returns the program.
//
// Pass 1 walks the transitions in order and emits, for each one, its test, its
// highlight-kind update and its action. Jump targets whose offset is not known
// yet are emitted as 0 and recorded in `self.relocations`.
// Pass 2 patches those recorded instructions with the final offsets.
//
// Panics if a jump target does not fit into a `u16`, or if a relocation points
// at an instruction that is not a jump.
fn compile(mut self) -> Vec<Instruction> {
use RelocationTarget::*;
for th in self.transitions.indices() {
// A failed test falls through to the transition that follows this one.
// NOTE(review): for the last transition this handle is out of range; that is
// only safe if the last transition never emits a conditional jump — confirm.
let th_next = TransitionHandle(th.0 + 1);
// Work on a copy so that `self` stays free for the emit helpers below.
let t = self.transitions[th].clone();
// Remember where this transition's code starts (target of `RelTransition`).
self.transitions[th].instruction_offset = self.instructions.len();
// The first transition emitted for a state marks the state's entry point
// (target of `RelState`); `usize::MAX` means "not assigned yet".
if self.states[t.src].instruction_offset == usize::MAX {
self.states[t.src].instruction_offset = self.instructions.len();
}
// 1. The test: either advance the input unconditionally, or try to match and
//    jump to the next transition on failure.
match t.test {
// Consumes nothing, so no instruction is needed.
GraphTest::Chars(0) => {}
// Sets the input offset to `usize::MAX` instead of adding to it; presumably
// this means "consume the rest of the line" — TODO confirm.
GraphTest::Chars(usize::MAX) => {
self.assign(Register::InputOffset, usize::MAX);
}
GraphTest::Chars(n) => {
self.add_assign(Register::InputOffset, n);
}
GraphTest::Charset(h) => {
self.jump_if_not_match_charset(RelTransition(th_next), h);
}
GraphTest::Prefix(h) => {
self.jump_if_not_match_prefix(RelTransition(th_next), h);
}
GraphTest::PrefixInsensitive(h) => {
self.jump_if_not_match_prefix_insensitive(RelTransition(th_next), h);
}
}
// 2. The highlight kind to apply, if the transition sets one.
match t.kind {
HighlightKindOp::None => {}
HighlightKindOp::Some(kind) => {
self.assign(Register::HighlightKind, kind.as_usize());
}
}
// 3. The action taken once the test has passed.
match t.dst {
GraphAction::Jump(dst) => {
self.jump(RelState(dst));
}
GraphAction::Push(dst) => {
// Emit the pending span, start a new one at the current input offset,
// then continue in the target state.
self.flush_highlight();
self.copy(Register::HighlightStart, Register::InputOffset);
self.jump(RelState(dst));
}
GraphAction::Pop(0) => {
// Emit the pending span and restart the current procedure by loading
// its start address into the program counter. The suspend opportunity
// in between gives the VM a point to stop at the end of the line.
self.flush_highlight();
self.suspend_opportunity();
self.copy(Register::ProgramCounter, Register::ProcedureStart);
}
GraphAction::Pop(1) => {
self.flush_highlight();
self.ret();
}
GraphAction::Loop(dst) => {
self.suspend_opportunity();
self.jump(RelState(dst));
}
// Any other action (e.g. `Pop` with a different count) is not expected here.
_ => unreachable!(),
}
}
// Pass 2: all transitions have been emitted, so every target offset is known.
for &(off, dst) in &self.relocations {
let instruction_offset = match dst {
RelState(h) => self.states[h].instruction_offset,
RelTransition(h) => self.transitions[h].instruction_offset,
};
// Jump targets are encoded as `u16`; this panics if the offset does not fit
// (which includes a target that is still `usize::MAX`, i.e. never emitted).
let instruction_offset: u16 = instruction_offset.try_into().unwrap();
match &mut self.instructions[off] {
// Unconditional jump, encoded as `ProgramCounter = Zero + target`.
Instruction::Add(Register::ProgramCounter, Register::Zero, d) => {
*d = instruction_offset as usize;
}
Instruction::JumpIfNotMatchCharset(d, _)
| Instruction::JumpIfNotMatchPrefix(d, _)
| Instruction::JumpIfNotMatchPrefixInsensitive(d, _) => {
*d = instruction_offset;
}
i => panic!("Unexpected relocation target: {i:?}"),
}
}
self.instructions
}
match t.test {
GraphTest::Chars(n) => {
instructions.push((None, Add(REG_INPUT_OFFSET, REG_INPUT_OFFSET, n)));
}
GraphTest::Charset(h) => {
instructions
.push((Transition(TransitionHandle(123)), JumpIfNotMatchCharset(0, h)));
}
GraphTest::Prefix(h) => {
instructions
.push((Transition(TransitionHandle(1234)), JumpIfNotMatchPrefix(0, h)));
}
GraphTest::PrefixInsensitive(h) => {
instructions.push((
Transition(TransitionHandle(1234)),
JumpIfNotMatchPrefixInsensitive(0, h),
));
}
// Emits `reg = reg + val`.
fn add_assign(&mut self, reg: Register, val: usize) {
    let op = Instruction::Add(reg, reg, val);
    self.instructions.push(op);
}
match t.kind {
HighlightKindOp::None => {}
HighlightKindOp::Some(kind) => {
instructions.push((None, Add(REG_HIGHLIGHT_KIND, REG_ZERO, kind.as_usize())));
}
// Emits `reg = val`, encoded as an addition onto the zero register.
fn assign(&mut self, reg: Register, val: usize) {
    let op = Instruction::Add(reg, Register::Zero, val);
    self.instructions.push(op);
}
match t.dst {
GraphAction::Jump(dst) => {
instructions.push((
Transition(TransitionHandle(1234)),
Add(REG_PROGRAM_COUNTER, REG_ZERO, 0),
));
// Emits `dst = src`, encoded as an addition of the constant 0.
fn copy(&mut self, dst: Register, src: Register) {
    let op = Instruction::Add(dst, src, 0);
    self.instructions.push(op);
}
// Emits an unconditional jump to `dst` by assigning the program counter.
// The target must be resolved before the instruction is pushed, because a
// pending relocation is keyed on the current instruction count.
fn jump(&mut self, dst: RelocationTarget) {
    let target = usize::from(self.resolve_relocation(dst));
    self.assign(Register::ProgramCounter, target);
}
// Emits a charset test that jumps to `dst` when the input does not match `h`.
// The target is resolved first so that a pending relocation refers to the
// instruction pushed right after.
fn jump_if_not_match_charset(&mut self, dst: RelocationTarget, h: CharsetHandle) {
    let target = self.resolve_relocation(dst);
    let op = Instruction::JumpIfNotMatchCharset(target, h);
    self.instructions.push(op);
}
// Emits a prefix test that jumps to `dst` when the input does not start with
// the string `h`. The target is resolved first so that a pending relocation
// refers to the instruction pushed right after.
fn jump_if_not_match_prefix(&mut self, dst: RelocationTarget, h: StringHandle) {
    let target = self.resolve_relocation(dst);
    let op = Instruction::JumpIfNotMatchPrefix(target, h);
    self.instructions.push(op);
}
// Emits the case-insensitive variant of the prefix test: jumps to `dst` when
// the input does not start with the string `h`. The target is resolved first
// so that a pending relocation refers to the instruction pushed right after.
fn jump_if_not_match_prefix_insensitive(&mut self, dst: RelocationTarget, h: StringHandle) {
    let target = self.resolve_relocation(dst);
    let op = Instruction::JumpIfNotMatchPrefixInsensitive(target, h);
    self.instructions.push(op);
}
// Emits a `Return` instruction.
fn ret(&mut self) {
    let op = Instruction::Return;
    self.instructions.push(op);
}
// Emits a `FlushHighlight` instruction.
fn flush_highlight(&mut self) {
    let op = Instruction::FlushHighlight;
    self.instructions.push(op);
}
// Emits a `SuspendOpportunity` instruction.
fn suspend_opportunity(&mut self) {
    let op = Instruction::SuspendOpportunity;
    self.instructions.push(op);
}
fn resolve_relocation(&mut self, dst: RelocationTarget) -> u16 {
use RelocationTarget::*;
let instruction_offset = match dst {
RelState(h) => self.states[h].instruction_offset,
RelTransition(h) => self.transitions[h].instruction_offset,
};
if instruction_offset != usize::MAX {
instruction_offset.try_into().unwrap()
} else {
self.relocations.push((self.instructions.len(), dst));
0
}
GraphAction::Push(dst) => {
instructions.push((Transition(TransitionHandle(1234)), Call(0))); // TODO: The position of the `dst` state.
instructions.push((
Transition(TransitionHandle(1234)),
Add(REG_PROGRAM_COUNTER, REG_ZERO, 0),
)); // TODO: The position of the `t.src` state.
}
GraphAction::Pop(_) => {
instructions.push((None, Return));
}
GraphAction::Loop(dst) => {
instructions.push((
Transition(TransitionHandle(1234)),
Add(REG_PROGRAM_COUNTER, REG_ZERO, 0),
)); // TODO: The position of the `t.src` state.
}
_ => unreachable!(),
}
}
// Up to this point we created jump instructions with targets referring to states.
// We now patch TODO
//for i in &mut instructions {
// match i {
// Instruction::JumpIfNotMatchCharset(dst, _)
// | Instruction::JumpIfNotMatchPrefix(dst, _)
// | Instruction::JumpIfNotMatchPrefixInsensitive(dst, _) => {
// *dst = self.states[StateHandle(*dst as usize)].instruction_offset as u16;
// }
// _ => {}
// }
//}
let instructions = instructions.into_iter().map(|(_, i)| i).collect();
let compiler = Compiler {
states: &mut self.states,
transitions: &mut self.transitions,
instructions: Default::default(),
relocations: Default::default(),
};
let instructions = compiler.compile();
Assembly { strings, charsets, instructions }
}
@ -665,6 +783,7 @@ impl Compiler {
let cs = self.intern_charset(&cs);
self.transitions.push(GraphTransition {
origin: -1,
instruction_offset: usize::MAX,
src,
test: GraphTest::Charset(cs),
kind: HighlightKindOp::None,
@ -672,6 +791,7 @@ impl Compiler {
});
self.transitions.push(GraphTransition {
origin: -1,
instruction_offset: usize::MAX,
src,
test: GraphTest::Chars(1),
kind: HighlightKindOp::None,
@ -774,6 +894,7 @@ impl Compiler {
if !s.coverage.covers_all() {
self.transitions.push(GraphTransition {
origin: -1,
instruction_offset: usize::MAX,
src,
test: GraphTest::Chars(0),
kind: HighlightKindOp::None,
@ -1087,6 +1208,7 @@ pub enum GraphTest {
#[derive(Debug, Clone)]
pub struct GraphTransition {
origin: i32,
instruction_offset: usize,
pub src: StateHandle,
pub test: GraphTest,
pub kind: HighlightKindOp,

View file

@ -24,6 +24,10 @@ where
pub fn enumerate(&self) -> impl DoubleEndedIterator<Item = (H, &T)> {
self.list.iter().enumerate().map(|(i, v)| (H::from(i), v))
}
/// Iterates over all elements mutably, pairing each one with the handle built
/// from its position in the list.
pub fn enumerate_mut(&mut self) -> impl DoubleEndedIterator<Item = (H, &mut T)> {
    let indexed = self.list.iter_mut().enumerate();
    indexed.map(|(index, item)| (H::from(index), item))
}
}
impl<H, T> Default for HandleVec<H, T> {

View file

@ -27,7 +27,6 @@ pub fn generate() -> String {
"\
// This file is generated by build.rs. Do not edit it manually.
use HighlightKind::*;
use Instruction::*;
pub struct Language {
@ -73,28 +72,36 @@ pub enum HighlightKind {
impl HighlightKind {
pub const fn as_usize(self) -> usize {
unsafe { std::mem::transmute::<HighlightKind, u8>(self) as usize }
unsafe { std::mem::transmute::<Self, u8>(self) as usize }
}
pub const unsafe fn from_usize(value: usize) -> Self {
debug_assert!(value <= Method.as_usize());
unsafe { std::mem::transmute::<u8, HighlightKind>(value as u8) }
debug_assert!(value <= Self::Method.as_usize());
unsafe { std::mem::transmute::<u8, Self>(value as u8) }
}
}
pub const REG_ZERO: u8 = 0;
pub const REG_PROGRAM_COUNTER: u8 = 1;
pub const REG_INPUT_OFFSET: u8 = 2;
pub const REG_HIGHLIGHT_START: u8 = 3;
pub const REG_HIGHLIGHT_KIND: u8 = 4;
#[derive(Debug, Clone, Copy)]
pub enum Register {
Zero,
ProgramCounter,
ProcedureStart,
InputOffset,
HighlightStart,
HighlightKind,
pub type Registers = [usize; 5];
#[allow(clippy::upper_case_acronyms)]
COUNT,
}
#[allow(dead_code)]
pub type Registers = [usize; Register::COUNT as usize];
#[derive(Debug, Clone)]
pub enum Instruction {
// .0 (reg) = .1 (reg) + .2 (constant)
// Note that this allows for jumps as well, by manipulating REG_PROGRAM_COUNTER.
Add(u8, u8, usize),
Add(Register, Register, usize),
// Typical call/ret instructions.
// The VM takes care of saving the return address.
@ -103,9 +110,9 @@ pub enum Instruction {
// Test (and consume) the given character(s) in `.1`.
// If the test fails, jump to `.0`.
JumpIfNotMatchCharset(u16, CharsetHandle),
JumpIfNotMatchPrefix(u16, StringHandle),
JumpIfNotMatchPrefixInsensitive(u16, StringHandle),
JumpIfNotMatchCharset(u16, &'static [u16; 16]),
JumpIfNotMatchPrefix(u16, *const u8),
JumpIfNotMatchPrefixInsensitive(u16, *const u8),
// Flush the current HighlightKind to the output.
FlushHighlight,
@ -201,28 +208,31 @@ config:
filenames = lang.filenames,
);
for op in &assembly.instructions {
let instruction_number_width = assembly.instructions.len().ilog10() as usize + 1;
for (i, op) in assembly.instructions.into_iter().enumerate() {
_ = write!(output, " /* {i:<instruction_number_width$} */ ");
match op {
Instruction::JumpIfNotMatchCharset(addr, h) => {
_ = writeln!(
output,
" JumpIfNotMatchCharset({addr}, LANG_{name_uppercase}_CHARSET_{h}),"
"JumpIfNotMatchCharset({addr}, LANG_{name_uppercase}_CHARSET_{h}),"
);
}
Instruction::JumpIfNotMatchPrefix(addr, h) => {
_ = writeln!(
output,
" JumpIfNotMatchPrefix({addr}, LANG_{name_uppercase}_STRING_{h}),"
"JumpIfNotMatchPrefix({addr}, LANG_{name_uppercase}_STRING_{h}),"
);
}
Instruction::JumpIfNotMatchPrefixInsensitive(addr, h) => {
_ = writeln!(
output,
" JumpIfNotMatchPrefixInsensitive({addr}, LANG_{name_uppercase}_STRING_{h}),"
"JumpIfNotMatchPrefixInsensitive({addr}, LANG_{name_uppercase}_STRING_{h}),"
);
}
_ => {
_ = writeln!(output, " {op:?},");
op => {
_ = writeln!(output, "{op:?},");
}
}
}

View file

@ -67,7 +67,11 @@ impl<'doc> Highlighter<'doc> {
offset: 0,
logical_pos_y: 0,
call_stack: Default::default(),
registers: Default::default(),
registers: {
let mut r = Registers::default();
r[Register::HighlightKind as usize] = HighlightKind::Other.as_usize();
r
},
}
}
@ -170,23 +174,22 @@ impl<'doc> Highlighter<'doc> {
let line = unicode::strip_newline(line);
let mut push = |start: usize, kind: HighlightKind| {};
self.set_reg(Register::InputOffset, 0);
self.set_reg(Register::HighlightStart, 0);
loop {
unsafe {
match *self
.language
.instructions
.get_unchecked(self.registers[REG_PROGRAM_COUNTER as usize])
{
let pc = self.get_reg(Register::ProgramCounter);
self.set_reg(Register::ProgramCounter, pc + 1);
match *self.language.instructions.get_unchecked(pc) {
Instruction::Add(dst, src, add) => {
let src = *self.registers.get_unchecked(src as usize);
let dst = self.registers.get_unchecked_mut(dst as usize);
*dst = src.saturating_add(add);
self.registers[dst as usize] = self.registers[src as usize] + add;
}
Instruction::Call(dst) => {
self.call_stack.push(self.registers);
self.registers[REG_PROGRAM_COUNTER as usize] = dst as usize;
self.set_reg(Register::ProgramCounter, dst as usize);
self.set_reg(Register::ProcedureStart, dst as usize);
}
Instruction::Return => {
if let Some(last) = self.call_stack.last() {
@ -197,40 +200,39 @@ impl<'doc> Highlighter<'doc> {
}
}
Instruction::JumpIfNotMatchCharset(dst, cs) => {
let mut off = self.registers[REG_INPUT_OFFSET as usize];
let mut off = self.get_reg(Register::InputOffset);
// TODO: http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html#alternative-implementation
if off >= line.len() || !Self::in_set(cs, line[off]) {
self.registers[REG_PROGRAM_COUNTER as usize] = dst as usize;
self.set_reg(Register::ProgramCounter, dst as usize);
} else {
while {
off += 1;
off < line.len() && Self::in_set(cs, line[off])
} {}
self.registers[REG_INPUT_OFFSET as usize] = off;
self.set_reg(Register::InputOffset, off);
}
}
Instruction::JumpIfNotMatchPrefix(dst, s) => {
let off = self.registers[REG_INPUT_OFFSET as usize];
let str = unsafe { slice::from_raw_parts(s.add(1), s.read() as usize) };
let off = self.get_reg(Register::InputOffset);
let str = slice::from_raw_parts(s.add(1), s.read() as usize);
if !Self::inlined_memcmp(line, off, str) {
self.registers[REG_PROGRAM_COUNTER as usize] = dst as usize;
self.set_reg(Register::ProgramCounter, dst as usize);
} else {
self.registers[REG_INPUT_OFFSET as usize] = off + str.len();
self.set_reg(Register::InputOffset, off + str.len());
}
}
Instruction::JumpIfNotMatchPrefixInsensitive(dst, s) => {
let off = self.registers[REG_INPUT_OFFSET as usize];
let str = unsafe { slice::from_raw_parts(s.add(1), s.read() as usize) };
let off = self.get_reg(Register::InputOffset);
let str = slice::from_raw_parts(s.add(1), s.read() as usize);
if !Self::inlined_memicmp(line, off, str) {
self.registers[REG_PROGRAM_COUNTER as usize] = dst as usize;
self.set_reg(Register::ProgramCounter, dst as usize);
} else {
self.registers[REG_INPUT_OFFSET as usize] = off + str.len();
self.set_reg(Register::InputOffset, off + str.len());
}
}
Instruction::FlushHighlight => {
let start = self.registers[REG_HIGHLIGHT_START as usize];
let kind =
HighlightKind::from_usize(self.registers[REG_HIGHLIGHT_KIND as usize]);
let start = self.get_reg(Register::HighlightStart);
let kind = HighlightKind::from_usize(self.get_reg(Register::HighlightKind));
if let Some(last) = res.last_mut()
&& (last.start == start || last.kind == kind)
@ -240,11 +242,10 @@ impl<'doc> Highlighter<'doc> {
res.push(Higlight { start, kind });
}
self.registers[REG_HIGHLIGHT_START as usize] =
self.registers[REG_INPUT_OFFSET as usize];
self.set_reg(Register::HighlightStart, self.get_reg(Register::InputOffset));
}
Instruction::SuspendOpportunity => {
let off = self.registers[REG_INPUT_OFFSET as usize];
let off = self.get_reg(Register::InputOffset);
if off >= line.len() {
break;
}
@ -268,8 +269,7 @@ impl<'doc> Highlighter<'doc> {
#[inline]
fn inlined_memcmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool {
unsafe {
let needle_len = needle.len();
if haystack.len() - off < needle_len {
if haystack.len() - off < needle.len() {
return false;
}
@ -277,7 +277,7 @@ impl<'doc> Highlighter<'doc> {
let b = needle.as_ptr();
let mut i = 0;
while i < needle_len {
while i < needle.len() {
let a = *a.add(i);
let b = *b.add(i);
i += 1;
@ -294,8 +294,7 @@ impl<'doc> Highlighter<'doc> {
#[inline]
fn inlined_memicmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool {
unsafe {
let needle_len = needle.len();
if haystack.len() - off < needle_len {
if haystack.len() - off < needle.len() {
return false;
}
@ -303,7 +302,7 @@ impl<'doc> Highlighter<'doc> {
let b = needle.as_ptr();
let mut i = 0;
while i < needle_len {
while i < needle.len() {
// str in PrefixInsensitive(str) is expected to be lowercase, printable ASCII.
let a = a.add(i).read().to_ascii_lowercase();
let b = b.add(i).read();
@ -327,6 +326,16 @@ impl<'doc> Highlighter<'doc> {
(bitset & bitmask) != 0
}
/// Reads the current value of register `reg`.
#[inline(always)]
fn get_reg(&self, reg: Register) -> usize {
    let slot = reg as usize;
    self.registers[slot]
}
/// Stores `val` into register `reg`.
#[inline(always)]
fn set_reg(&mut self, reg: Register, val: usize) {
    let slot = reg as usize;
    self.registers[slot] = val;
}
}
/*#[cfg(test)]