diff --git a/benches/lib.rs b/benches/lib.rs index 6bbfc4c..69625f0 100644 --- a/benches/lib.rs +++ b/benches/lib.rs @@ -128,13 +128,13 @@ fn bench_hash(c: &mut Criterion) { } fn bench_lsh(c: &mut Criterion) { - let bytes = include_bytes!("../assets/highlighting-tests/powershell.ps1"); + let bytes = include_bytes!("../assets/highlighting-tests/COMMIT_EDITMSG"); let bytes = &bytes[..]; - let lang = lsh::language_from_path(Path::new("powershell.ps1")).unwrap(); + let lang = lsh::language_from_path(Path::new("COMMIT_EDITMSG")).unwrap(); let highlighter = lsh::Highlighter::new(black_box(&bytes), lang); c.benchmark_group("lsh").throughput(Throughput::Bytes(bytes.len() as u64)).bench_function( - "powershell", + "COMMIT_EDITMSG", |b| { b.iter(|| { let mut h = highlighter.clone(); diff --git a/build/lsh/compiler.rs b/build/lsh/compiler.rs index c09c972..2ee31ef 100644 --- a/build/lsh/compiler.rs +++ b/build/lsh/compiler.rs @@ -12,20 +12,43 @@ declare_handle!(pub StateHandle(usize)); declare_handle!(pub StringHandle(usize)); declare_handle!(pub TransitionHandle(usize)); -pub const REG_ZERO: u8 = 0; -pub const REG_PROGRAM_COUNTER: u8 = 1; -pub const REG_INPUT_OFFSET: u8 = 2; -pub const REG_HIGHLIGHT_START: u8 = 3; -pub const REG_HIGHLIGHT_KIND: u8 = 4; +#[derive(Clone, Copy)] +pub enum Register { + Zero, + ProgramCounter, + ProcedureStart, + InputOffset, + HighlightStart, + HighlightKind, + + #[allow(clippy::upper_case_acronyms)] + COUNT, +} + +impl fmt::Debug for Register { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + Register::Zero => "Register::Zero", + Register::ProgramCounter => "Register::ProgramCounter", + Register::ProcedureStart => "Register::ProcedureStart", + Register::InputOffset => "Register::InputOffset", + Register::HighlightStart => "Register::HighlightStart", + Register::HighlightKind => "Register::HighlightKind", + + Register::COUNT => "Register::COUNT", + }) + } +} #[allow(dead_code)] -pub type Registers = [usize; 5]; +pub type Registers = [usize; Register::COUNT as usize]; +#[allow(dead_code)] #[derive(Debug, Clone)] pub enum Instruction { // .0 (reg) = .1 (reg) + .2 (constant) // Note that this allows for jumps as well, by manipulating REG_PROGRAM_COUNTER. - Add(u8, u8, usize), + Add(Register, Register, usize), // Typical call/ret instructions. // The VM takes care of saving the return address. @@ -445,7 +468,14 @@ impl Compiler { } } - self.transitions.push(GraphTransition { origin: self.origin, src, test, kind, dst }); + self.transitions.push(GraphTransition { + origin: self.origin, + instruction_offset: usize::MAX, + src, + test, + kind, + dst, + }); dst } @@ -482,90 +512,178 @@ impl Compiler { let strings = self.extract_strings(); let charsets = self.extract_charsets(); - let mut instructions = Vec::new(); - + #[derive(Clone, Copy)] enum RelocationTarget { - None, - State(StateHandle), - Transition(TransitionHandle), + RelState(StateHandle), + RelTransition(TransitionHandle), } - for t in &self.transitions { - use Instruction::*; - use RelocationTarget::*; + struct Compiler<'a> { + states: &'a mut HandleVec, + transitions: &'a mut HandleVec, + instructions: Vec, + relocations: Vec<(usize, RelocationTarget)>, + } - if self.states[t.src].instruction_offset == usize::MAX { - self.states[t.src].instruction_offset = instructions.len(); + impl Compiler<'_> { + fn compile(mut self) -> Vec { + use RelocationTarget::*; + + for th in self.transitions.indices() { + let th_next = TransitionHandle(th.0 + 1); + let t = self.transitions[th].clone(); + + self.transitions[th].instruction_offset = self.instructions.len(); + if self.states[t.src].instruction_offset == usize::MAX { + self.states[t.src].instruction_offset = self.instructions.len(); + } + + match t.test { + GraphTest::Chars(0) => {} + GraphTest::Chars(usize::MAX) => { + self.assign(Register::InputOffset, usize::MAX); + } + GraphTest::Chars(n) => { + self.add_assign(Register::InputOffset, n); + } + GraphTest::Charset(h) => { + self.jump_if_not_match_charset(RelTransition(th_next), h); + } + GraphTest::Prefix(h) => { + self.jump_if_not_match_prefix(RelTransition(th_next), h); + } + GraphTest::PrefixInsensitive(h) => { + self.jump_if_not_match_prefix_insensitive(RelTransition(th_next), h); + } + } + + match t.kind { + HighlightKindOp::None => {} + HighlightKindOp::Some(kind) => { + self.assign(Register::HighlightKind, kind.as_usize()); + } + } + + match t.dst { + GraphAction::Jump(dst) => { + self.jump(RelState(dst)); + } + GraphAction::Push(dst) => { + self.flush_highlight(); + self.copy(Register::HighlightStart, Register::InputOffset); + self.jump(RelState(dst)); + } + GraphAction::Pop(0) => { + self.flush_highlight(); + self.suspend_opportunity(); + self.copy(Register::ProgramCounter, Register::ProcedureStart); + } + GraphAction::Pop(1) => { + self.flush_highlight(); + self.ret(); + } + GraphAction::Loop(dst) => { + self.suspend_opportunity(); + self.jump(RelState(dst)); + } + _ => unreachable!(), + } + } + + for &(off, dst) in &self.relocations { + let instruction_offset = match dst { + RelState(h) => self.states[h].instruction_offset, + RelTransition(h) => self.transitions[h].instruction_offset, + }; + let instruction_offset: u16 = instruction_offset.try_into().unwrap(); + match &mut self.instructions[off] { + Instruction::Add(Register::ProgramCounter, Register::Zero, d) => { + *d = instruction_offset as usize; + } + Instruction::JumpIfNotMatchCharset(d, _) + | Instruction::JumpIfNotMatchPrefix(d, _) + | Instruction::JumpIfNotMatchPrefixInsensitive(d, _) => { + *d = instruction_offset; + } + i => panic!("Unexpected relocation target: {i:?}"), + } + } + + self.instructions } - match t.test { - GraphTest::Chars(n) => { - instructions.push((None, Add(REG_INPUT_OFFSET, REG_INPUT_OFFSET, n))); - } - GraphTest::Charset(h) => { - instructions - .push((Transition(TransitionHandle(123)), JumpIfNotMatchCharset(0, h))); - } - GraphTest::Prefix(h) => { - instructions - .push((Transition(TransitionHandle(1234)), JumpIfNotMatchPrefix(0, h))); - } - GraphTest::PrefixInsensitive(h) => { - instructions.push(( - Transition(TransitionHandle(1234)), - JumpIfNotMatchPrefixInsensitive(0, h), - )); - } + fn add_assign(&mut self, reg: Register, val: usize) { + self.instructions.push(Instruction::Add(reg, reg, val)); } - match t.kind { - HighlightKindOp::None => {} - HighlightKindOp::Some(kind) => { - instructions.push((None, Add(REG_HIGHLIGHT_KIND, REG_ZERO, kind.as_usize()))); - } + fn assign(&mut self, reg: Register, val: usize) { + self.instructions.push(Instruction::Add(reg, Register::Zero, val)); } - match t.dst { - GraphAction::Jump(dst) => { - instructions.push(( - Transition(TransitionHandle(1234)), - Add(REG_PROGRAM_COUNTER, REG_ZERO, 0), - )); + fn copy(&mut self, dst: Register, src: Register) { + self.instructions.push(Instruction::Add(dst, src, 0)); + } + + fn jump(&mut self, dst: RelocationTarget) { + let dst = self.resolve_relocation(dst); + self.assign(Register::ProgramCounter, dst as usize); + } + + fn jump_if_not_match_charset(&mut self, dst: RelocationTarget, h: CharsetHandle) { + let dst = self.resolve_relocation(dst); + self.instructions.push(Instruction::JumpIfNotMatchCharset(dst, h)); + } + + fn jump_if_not_match_prefix(&mut self, dst: RelocationTarget, h: StringHandle) { + let dst = self.resolve_relocation(dst); + self.instructions.push(Instruction::JumpIfNotMatchPrefix(dst, h)); + } + + fn jump_if_not_match_prefix_insensitive( + &mut self, + dst: RelocationTarget, + h: StringHandle, + ) { + let dst = self.resolve_relocation(dst); + self.instructions.push(Instruction::JumpIfNotMatchPrefixInsensitive(dst, h)); + } + + fn ret(&mut self) { + self.instructions.push(Instruction::Return); + } + + fn flush_highlight(&mut self) { + self.instructions.push(Instruction::FlushHighlight); + } + + fn suspend_opportunity(&mut self) { + self.instructions.push(Instruction::SuspendOpportunity); + } + + fn resolve_relocation(&mut self, dst: RelocationTarget) -> u16 { + use RelocationTarget::*; + + let instruction_offset = match dst { + RelState(h) => self.states[h].instruction_offset, + RelTransition(h) => self.transitions[h].instruction_offset, + }; + + if instruction_offset != usize::MAX { + instruction_offset.try_into().unwrap() + } else { + self.relocations.push((self.instructions.len(), dst)); + 0 } - GraphAction::Push(dst) => { - instructions.push((Transition(TransitionHandle(1234)), Call(0))); // TODO: The position of the `dst` state. - instructions.push(( - Transition(TransitionHandle(1234)), - Add(REG_PROGRAM_COUNTER, REG_ZERO, 0), - )); // TODO: The position of the `t.src` state. - } - GraphAction::Pop(_) => { - instructions.push((None, Return)); - } - GraphAction::Loop(dst) => { - instructions.push(( - Transition(TransitionHandle(1234)), - Add(REG_PROGRAM_COUNTER, REG_ZERO, 0), - )); // TODO: The position of the `t.src` state. - } - _ => unreachable!(), } } - // Up to this point we created jump instructions with targets referring to states. - // We now patch TODO - //for i in &mut instructions { - // match i { - // Instruction::JumpIfNotMatchCharset(dst, _) - // | Instruction::JumpIfNotMatchPrefix(dst, _) - // | Instruction::JumpIfNotMatchPrefixInsensitive(dst, _) => { - // *dst = self.states[StateHandle(*dst as usize)].instruction_offset as u16; - // } - // _ => {} - // } - //} - - let instructions = instructions.into_iter().map(|(_, i)| i).collect(); + let compiler = Compiler { + states: &mut self.states, + transitions: &mut self.transitions, + instructions: Default::default(), + relocations: Default::default(), + }; + let instructions = compiler.compile(); Assembly { strings, charsets, instructions } } @@ -665,6 +783,7 @@ impl Compiler { let cs = self.intern_charset(&cs); self.transitions.push(GraphTransition { origin: -1, + instruction_offset: usize::MAX, src, test: GraphTest::Charset(cs), kind: HighlightKindOp::None, @@ -672,6 +791,7 @@ impl Compiler { }); self.transitions.push(GraphTransition { origin: -1, + instruction_offset: usize::MAX, src, test: GraphTest::Chars(1), kind: HighlightKindOp::None, @@ -774,6 +894,7 @@ impl Compiler { if !s.coverage.covers_all() { self.transitions.push(GraphTransition { origin: -1, + instruction_offset: usize::MAX, src, test: GraphTest::Chars(0), kind: HighlightKindOp::None, @@ -1087,6 +1208,7 @@ pub enum GraphTest { #[derive(Debug, Clone)] pub struct GraphTransition { origin: i32, + instruction_offset: usize, pub src: StateHandle, pub test: GraphTest, pub kind: HighlightKindOp, diff --git a/build/lsh/handles.rs b/build/lsh/handles.rs index 6cb44b8..19561cb 100644 --- a/build/lsh/handles.rs +++ b/build/lsh/handles.rs @@ -24,6 +24,10 @@ where pub fn enumerate(&self) -> impl DoubleEndedIterator { self.list.iter().enumerate().map(|(i, v)| (H::from(i), v)) } + + pub fn enumerate_mut(&mut self) -> impl DoubleEndedIterator { + self.list.iter_mut().enumerate().map(|(i, v)| (H::from(i), v)) + } } impl Default for HandleVec { diff --git a/build/lsh/mod.rs b/build/lsh/mod.rs index 9abd748..c032c2d 100644 --- a/build/lsh/mod.rs +++ b/build/lsh/mod.rs @@ -27,7 +27,6 @@ pub fn generate() -> String { "\ // This file is generated by build.rs. Do not edit it manually. -use HighlightKind::*; use Instruction::*; pub struct Language { @@ -73,28 +72,36 @@ pub enum HighlightKind { impl HighlightKind { pub const fn as_usize(self) -> usize { - unsafe { std::mem::transmute::(self) as usize } + unsafe { std::mem::transmute::(self) as usize } } pub const unsafe fn from_usize(value: usize) -> Self { - debug_assert!(value <= Method.as_usize()); - unsafe { std::mem::transmute::(value as u8) } + debug_assert!(value <= Self::Method.as_usize()); + unsafe { std::mem::transmute::(value as u8) } } } -pub const REG_ZERO: u8 = 0; -pub const REG_PROGRAM_COUNTER: u8 = 1; -pub const REG_INPUT_OFFSET: u8 = 2; -pub const REG_HIGHLIGHT_START: u8 = 3; -pub const REG_HIGHLIGHT_KIND: u8 = 4; +#[derive(Debug, Clone, Copy)] +pub enum Register { + Zero, + ProgramCounter, + ProcedureStart, + InputOffset, + HighlightStart, + HighlightKind, -pub type Registers = [usize; 5]; + #[allow(clippy::upper_case_acronyms)] + COUNT, +} + +#[allow(dead_code)] +pub type Registers = [usize; Register::COUNT as usize]; #[derive(Debug, Clone)] pub enum Instruction { // .0 (reg) = .1 (reg) + .2 (constant) // Note that this allows for jumps as well, by manipulating REG_PROGRAM_COUNTER. - Add(u8, u8, usize), + Add(Register, Register, usize), // Typical call/ret instructions. // The VM takes care of saving the return address. @@ -103,9 +110,9 @@ pub enum Instruction { // Test (and consume) the given character(s) in `.1`. // If the test fails, jump to `.0`. - JumpIfNotMatchCharset(u16, CharsetHandle), - JumpIfNotMatchPrefix(u16, StringHandle), - JumpIfNotMatchPrefixInsensitive(u16, StringHandle), + JumpIfNotMatchCharset(u16, &'static [u16; 16]), + JumpIfNotMatchPrefix(u16, *const u8), + JumpIfNotMatchPrefixInsensitive(u16, *const u8), // Flush the current HighlightKind to the output. FlushHighlight, @@ -201,28 +208,31 @@ config: filenames = lang.filenames, ); - for op in &assembly.instructions { + let instruction_number_width = assembly.instructions.len().ilog10() as usize + 1; + + for (i, op) in assembly.instructions.into_iter().enumerate() { + _ = write!(output, " /* {i: { _ = writeln!( output, - " JumpIfNotMatchCharset({addr}, LANG_{name_uppercase}_CHARSET_{h})," + "JumpIfNotMatchCharset({addr}, LANG_{name_uppercase}_CHARSET_{h})," ); } Instruction::JumpIfNotMatchPrefix(addr, h) => { _ = writeln!( output, - " JumpIfNotMatchPrefix({addr}, LANG_{name_uppercase}_STRING_{h})," + "JumpIfNotMatchPrefix({addr}, LANG_{name_uppercase}_STRING_{h})," ); } Instruction::JumpIfNotMatchPrefixInsensitive(addr, h) => { _ = writeln!( output, - " JumpIfNotMatchPrefixInsensitive({addr}, LANG_{name_uppercase}_STRING_{h})," + "JumpIfNotMatchPrefixInsensitive({addr}, LANG_{name_uppercase}_STRING_{h})," ); } - _ => { - _ = writeln!(output, " {op:?},"); + op => { + _ = writeln!(output, "{op:?},"); } } } diff --git a/src/lsh/highlighter.rs b/src/lsh/highlighter.rs index 66318df..49a784b 100644 --- a/src/lsh/highlighter.rs +++ b/src/lsh/highlighter.rs @@ -67,7 +67,11 @@ impl<'doc> Highlighter<'doc> { offset: 0, logical_pos_y: 0, call_stack: Default::default(), - registers: Default::default(), + registers: { + let mut r = Registers::default(); + r[Register::HighlightKind as usize] = HighlightKind::Other.as_usize(); + r + }, } } @@ -170,23 +174,22 @@ impl<'doc> Highlighter<'doc> { let line = unicode::strip_newline(line); - let mut push = |start: usize, kind: HighlightKind| {}; + self.set_reg(Register::InputOffset, 0); + self.set_reg(Register::HighlightStart, 0); loop { unsafe { - match *self - .language - .instructions - .get_unchecked(self.registers[REG_PROGRAM_COUNTER as usize]) - { + let pc = self.get_reg(Register::ProgramCounter); + self.set_reg(Register::ProgramCounter, pc + 1); + + match *self.language.instructions.get_unchecked(pc) { Instruction::Add(dst, src, add) => { - let src = *self.registers.get_unchecked(src as usize); - let dst = self.registers.get_unchecked_mut(dst as usize); - *dst = src.saturating_add(add); + self.registers[dst as usize] = self.registers[src as usize] + add; } Instruction::Call(dst) => { self.call_stack.push(self.registers); - self.registers[REG_PROGRAM_COUNTER as usize] = dst as usize; + self.set_reg(Register::ProgramCounter, dst as usize); + self.set_reg(Register::ProcedureStart, dst as usize); } Instruction::Return => { if let Some(last) = self.call_stack.last() { @@ -197,40 +200,39 @@ impl<'doc> Highlighter<'doc> { } } Instruction::JumpIfNotMatchCharset(dst, cs) => { - let mut off = self.registers[REG_INPUT_OFFSET as usize]; + let mut off = self.get_reg(Register::InputOffset); // TODO: http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html#alternative-implementation if off >= line.len() || !Self::in_set(cs, line[off]) { - self.registers[REG_PROGRAM_COUNTER as usize] = dst as usize; + self.set_reg(Register::ProgramCounter, dst as usize); } else { while { off += 1; off < line.len() && Self::in_set(cs, line[off]) } {} - self.registers[REG_INPUT_OFFSET as usize] = off; + self.set_reg(Register::InputOffset, off); } } Instruction::JumpIfNotMatchPrefix(dst, s) => { - let off = self.registers[REG_INPUT_OFFSET as usize]; - let str = unsafe { slice::from_raw_parts(s.add(1), s.read() as usize) }; + let off = self.get_reg(Register::InputOffset); + let str = slice::from_raw_parts(s.add(1), s.read() as usize); if !Self::inlined_memcmp(line, off, str) { - self.registers[REG_PROGRAM_COUNTER as usize] = dst as usize; + self.set_reg(Register::ProgramCounter, dst as usize); } else { - self.registers[REG_INPUT_OFFSET as usize] = off + str.len(); + self.set_reg(Register::InputOffset, off + str.len()); } } Instruction::JumpIfNotMatchPrefixInsensitive(dst, s) => { - let off = self.registers[REG_INPUT_OFFSET as usize]; - let str = unsafe { slice::from_raw_parts(s.add(1), s.read() as usize) }; + let off = self.get_reg(Register::InputOffset); + let str = slice::from_raw_parts(s.add(1), s.read() as usize); if !Self::inlined_memicmp(line, off, str) { - self.registers[REG_PROGRAM_COUNTER as usize] = dst as usize; + self.set_reg(Register::ProgramCounter, dst as usize); } else { - self.registers[REG_INPUT_OFFSET as usize] = off + str.len(); + self.set_reg(Register::InputOffset, off + str.len()); } } Instruction::FlushHighlight => { - let start = self.registers[REG_HIGHLIGHT_START as usize]; - let kind = - HighlightKind::from_usize(self.registers[REG_HIGHLIGHT_KIND as usize]); + let start = self.get_reg(Register::HighlightStart); + let kind = HighlightKind::from_usize(self.get_reg(Register::HighlightKind)); if let Some(last) = res.last_mut() && (last.start == start || last.kind == kind) @@ -240,11 +242,10 @@ impl<'doc> Highlighter<'doc> { res.push(Higlight { start, kind }); } - self.registers[REG_HIGHLIGHT_START as usize] = - self.registers[REG_INPUT_OFFSET as usize]; + self.set_reg(Register::HighlightStart, self.get_reg(Register::InputOffset)); } Instruction::SuspendOpportunity => { - let off = self.registers[REG_INPUT_OFFSET as usize]; + let off = self.get_reg(Register::InputOffset); if off >= line.len() { break; } @@ -268,8 +269,7 @@ impl<'doc> Highlighter<'doc> { #[inline] fn inlined_memcmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool { unsafe { - let needle_len = needle.len(); - if haystack.len() - off < needle_len { + if haystack.len() - off < needle.len() { return false; } @@ -277,7 +277,7 @@ impl<'doc> Highlighter<'doc> { let b = needle.as_ptr(); let mut i = 0; - while i < needle_len { + while i < needle.len() { let a = *a.add(i); let b = *b.add(i); i += 1; @@ -294,8 +294,7 @@ impl<'doc> Highlighter<'doc> { #[inline] fn inlined_memicmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool { unsafe { - let needle_len = needle.len(); - if haystack.len() - off < needle_len { + if haystack.len() - off < needle.len() { return false; } @@ -303,7 +302,7 @@ impl<'doc> Highlighter<'doc> { let b = needle.as_ptr(); let mut i = 0; - while i < needle_len { + while i < needle.len() { // str in PrefixInsensitive(str) is expected to be lowercase, printable ASCII. let a = a.add(i).read().to_ascii_lowercase(); let b = b.add(i).read(); @@ -327,6 +326,16 @@ impl<'doc> Highlighter<'doc> { (bitset & bitmask) != 0 } + + #[inline(always)] + fn get_reg(&self, reg: Register) -> usize { + self.registers[reg as usize] + } + + #[inline(always)] + fn set_reg(&mut self, reg: Register, val: usize) { + self.registers[reg as usize] = val; + } } /*#[cfg(test)]