mirror of
https://github.com/microsoft/edit.git
synced 2025-08-04 19:08:31 +00:00
350 lines
13 KiB
Rust
350 lines
13 KiB
Rust
// Copyright (c) Microsoft Corporation.
|
|
// Licensed under the MIT License.
|
|
|
|
//! Our VT parser.
|
|
|
|
use std::time;
|
|
|
|
use crate::simd::memchr2;
|
|
use crate::unicode::Utf8Chars;
|
|
|
|
/// The parser produces these tokens.
|
|
pub enum Token<'parser, 'input> {
|
|
/// A bunch of text. Doesn't contain any control characters.
|
|
Text(&'input str),
|
|
/// A single control character, like backspace or return.
|
|
Ctrl(char),
|
|
/// We encountered `ESC x` and this contains `x`.
|
|
Esc(char),
|
|
/// We encountered `ESC O x` and this contains `x`.
|
|
SS3(char),
|
|
/// A CSI sequence started with `ESC [`.
|
|
///
|
|
/// They are the most common escape sequences. See [`Csi`].
|
|
Csi(&'parser Csi),
|
|
/// An OSC sequence started with `ESC ]`.
|
|
///
|
|
/// The sequence may be split up into multiple tokens if the input
|
|
/// is given in chunks. This is indicated by the `partial` field.
|
|
Osc { data: &'input str, partial: bool },
|
|
/// An DCS sequence started with `ESC P`.
|
|
///
|
|
/// The sequence may be split up into multiple tokens if the input
|
|
/// is given in chunks. This is indicated by the `partial` field.
|
|
Dcs { data: &'input str, partial: bool },
|
|
}
|
|
|
|
/// The states of the parser's state machine.
///
/// The state outlives a single input chunk, which allows a sequence
/// to be continued when the next chunk is handed to the parser.
#[derive(Clone, Copy)]
enum State {
    /// Not inside of any escape sequence.
    Ground,
    /// An `ESC` was consumed and the byte after it decides what follows.
    Esc,
    /// `ESC O` was consumed; the next character completes the SS3 sequence.
    Ss3,
    /// `ESC [` was consumed; parameters are read until the final byte shows up.
    Csi,
    /// `ESC ]` was consumed; data is read until the terminator shows up.
    Osc,
    /// `ESC P` was consumed; data is read until the terminator shows up.
    Dcs,
    /// Inside an OSC sequence, and the last byte of the input was an `ESC`.
    OscEsc,
    /// Inside a DCS sequence, and the last byte of the input was an `ESC`.
    DcsEsc,
}
|
|
|
|
/// One CSI sequence, split up into its individual components.
pub struct Csi {
    /// Storage for the numeric parameters of the sequence.
    pub params: [u16; 32],
    /// How many entries of [`Csi::params`] are in use.
    pub param_count: usize,
    /// The private byte of the sequence, or `'\0'` if there was none.
    ///
    /// That's the first character right after the `ESC [` introducer.
    /// In most cases it is a `?` or a `<`.
    pub private_byte: char,
    /// The final byte of the sequence.
    ///
    /// It is the character that terminates the sequence, e.g. `m` or `H`.
    pub final_byte: char,
}
|
|
|
|
pub struct Parser {
|
|
state: State,
|
|
// Csi is not part of State, because it allows us
|
|
// to more quickly erase and reuse the struct.
|
|
csi: Csi,
|
|
}
|
|
|
|
impl Parser {
|
|
pub fn new() -> Self {
|
|
Self {
|
|
state: State::Ground,
|
|
csi: Csi { params: [0; 32], param_count: 0, private_byte: '\0', final_byte: '\0' },
|
|
}
|
|
}
|
|
|
|
/// Suggests a timeout for the next call to `read()`.
|
|
///
|
|
/// We need this because of the ambiguity of whether a trailing
|
|
/// escape character in an input is starting another escape sequence or
|
|
/// is just the result of the user literally pressing the Escape key.
|
|
pub fn read_timeout(&mut self) -> std::time::Duration {
|
|
match self.state {
|
|
// 100ms is a upper ceiling for a responsive feel.
|
|
// Realistically though, this could be much lower.
|
|
//
|
|
// However, there seems to be issues with OpenSSH on Windows.
|
|
// See: https://github.com/PowerShell/Win32-OpenSSH/issues/2275
|
|
State::Esc => time::Duration::from_millis(100),
|
|
_ => time::Duration::MAX,
|
|
}
|
|
}
|
|
|
|
/// Parses the given input into VT sequences.
|
|
///
|
|
/// You should call this function even if your `read()`
|
|
/// had a timeout (pass an empty string in that case).
|
|
pub fn parse<'parser, 'input>(
|
|
&'parser mut self,
|
|
input: &'input str,
|
|
) -> Stream<'parser, 'input> {
|
|
Stream { parser: self, input, off: 0 }
|
|
}
|
|
}
|
|
|
|
/// An iterator that parses VT sequences into [`Token`]s.
|
|
///
|
|
/// Can't implement [`Iterator`], because this is a "lending iterator".
|
|
pub struct Stream<'parser, 'input> {
|
|
parser: &'parser mut Parser,
|
|
input: &'input str,
|
|
off: usize,
|
|
}
|
|
|
|
impl<'input> Stream<'_, 'input> {
|
|
/// Returns the input that is being parsed.
|
|
pub fn input(&self) -> &'input str {
|
|
self.input
|
|
}
|
|
|
|
/// Returns the current parser offset.
|
|
pub fn offset(&self) -> usize {
|
|
self.off
|
|
}
|
|
|
|
/// Reads and consumes raw bytes from the input.
|
|
pub fn read(&mut self, dst: &mut [u8]) -> usize {
|
|
let bytes = self.input.as_bytes();
|
|
let off = self.off.min(bytes.len());
|
|
let len = dst.len().min(bytes.len() - off);
|
|
dst[..len].copy_from_slice(&bytes[off..off + len]);
|
|
self.off += len;
|
|
len
|
|
}
|
|
|
|
fn decode_next(&mut self) -> char {
|
|
let mut iter = Utf8Chars::new(self.input.as_bytes(), self.off);
|
|
let c = iter.next().unwrap_or('\0');
|
|
self.off = iter.offset();
|
|
c
|
|
}
|
|
|
|
/// Parses the next VT sequence from the previously given input.
|
|
#[allow(
|
|
clippy::should_implement_trait,
|
|
reason = "can't implement Iterator because this is a lending iterator"
|
|
)]
|
|
pub fn next(&mut self) -> Option<Token<'_, 'input>> {
|
|
let input = self.input;
|
|
let bytes = input.as_bytes();
|
|
|
|
// If the previous input ended with an escape character, `read_timeout()`
|
|
// returned `Some(..)` timeout, and if the caller did everything correctly
|
|
// and there was indeed a timeout, we should be called with an empty
|
|
// input. In that case we'll return the escape as its own token.
|
|
if input.is_empty() && matches!(self.parser.state, State::Esc) {
|
|
self.parser.state = State::Ground;
|
|
return Some(Token::Esc('\0'));
|
|
}
|
|
|
|
while self.off < bytes.len() {
|
|
// TODO: The state machine can be roughly broken up into two parts:
|
|
// * Wants to parse 1 `char` at a time: Ground, Esc, Ss3
|
|
// These could all be unified to a single call to `decode_next()`.
|
|
// * Wants to bulk-process bytes: Csi, Osc, Dcs
|
|
// We should do that so the UTF8 handling is a bit more "unified".
|
|
match self.parser.state {
|
|
State::Ground => match bytes[self.off] {
|
|
0x1b => {
|
|
self.parser.state = State::Esc;
|
|
self.off += 1;
|
|
}
|
|
c @ (0x00..0x20 | 0x7f) => {
|
|
self.off += 1;
|
|
return Some(Token::Ctrl(c as char));
|
|
}
|
|
_ => {
|
|
let beg = self.off;
|
|
while {
|
|
self.off += 1;
|
|
self.off < bytes.len()
|
|
&& bytes[self.off] >= 0x20
|
|
&& bytes[self.off] != 0x7f
|
|
} {}
|
|
return Some(Token::Text(&input[beg..self.off]));
|
|
}
|
|
},
|
|
State::Esc => match self.decode_next() {
|
|
'[' => {
|
|
self.parser.state = State::Csi;
|
|
self.parser.csi.private_byte = '\0';
|
|
self.parser.csi.final_byte = '\0';
|
|
while self.parser.csi.param_count > 0 {
|
|
self.parser.csi.param_count -= 1;
|
|
self.parser.csi.params[self.parser.csi.param_count] = 0;
|
|
}
|
|
}
|
|
']' => {
|
|
self.parser.state = State::Osc;
|
|
}
|
|
'O' => {
|
|
self.parser.state = State::Ss3;
|
|
}
|
|
'P' => {
|
|
self.parser.state = State::Dcs;
|
|
}
|
|
c => {
|
|
self.parser.state = State::Ground;
|
|
return Some(Token::Esc(c));
|
|
}
|
|
},
|
|
State::Ss3 => {
|
|
self.parser.state = State::Ground;
|
|
return Some(Token::SS3(self.decode_next()));
|
|
}
|
|
State::Csi => {
|
|
loop {
|
|
// If we still have slots left, parse the parameter.
|
|
if self.parser.csi.param_count < self.parser.csi.params.len() {
|
|
let dst = &mut self.parser.csi.params[self.parser.csi.param_count];
|
|
while self.off < bytes.len() && bytes[self.off].is_ascii_digit() {
|
|
let add = bytes[self.off] as u32 - b'0' as u32;
|
|
let value = *dst as u32 * 10 + add;
|
|
*dst = value.min(u16::MAX as u32) as u16;
|
|
self.off += 1;
|
|
}
|
|
} else {
|
|
// ...otherwise, skip the parameters until we find the final byte.
|
|
while self.off < bytes.len() && bytes[self.off].is_ascii_digit() {
|
|
self.off += 1;
|
|
}
|
|
}
|
|
|
|
// Encountered the end of the input before finding the final byte.
|
|
if self.off >= bytes.len() {
|
|
return None;
|
|
}
|
|
|
|
let c = bytes[self.off];
|
|
self.off += 1;
|
|
|
|
match c {
|
|
0x40..=0x7e => {
|
|
self.parser.state = State::Ground;
|
|
self.parser.csi.final_byte = c as char;
|
|
if self.parser.csi.param_count != 0
|
|
|| self.parser.csi.params[0] != 0
|
|
{
|
|
self.parser.csi.param_count += 1;
|
|
}
|
|
return Some(Token::Csi(&self.parser.csi));
|
|
}
|
|
b';' => self.parser.csi.param_count += 1,
|
|
b'<'..=b'?' => self.parser.csi.private_byte = c as char,
|
|
_ => {}
|
|
}
|
|
}
|
|
}
|
|
State::Osc | State::Dcs => {
|
|
let beg = self.off;
|
|
let mut data;
|
|
let mut partial;
|
|
|
|
loop {
|
|
// Find any indication for the end of the OSC/DCS sequence.
|
|
self.off = memchr2(b'\x07', b'\x1b', bytes, self.off);
|
|
|
|
data = &input[beg..self.off];
|
|
partial = self.off >= bytes.len();
|
|
|
|
// Encountered the end of the input before finding the terminator.
|
|
if partial {
|
|
break;
|
|
}
|
|
|
|
let c = bytes[self.off];
|
|
self.off += 1;
|
|
|
|
if c == 0x1b {
|
|
// It's only a string terminator if it's followed by \.
|
|
// We're at the end so we're saving the state and will continue next time.
|
|
if self.off >= bytes.len() {
|
|
self.parser.state = match self.parser.state {
|
|
State::Osc => State::OscEsc,
|
|
_ => State::DcsEsc,
|
|
};
|
|
partial = true;
|
|
break;
|
|
}
|
|
|
|
// False alarm: Not a string terminator.
|
|
if bytes[self.off] != b'\\' {
|
|
continue;
|
|
}
|
|
|
|
self.off += 1;
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
let state = self.parser.state;
|
|
if !partial {
|
|
self.parser.state = State::Ground;
|
|
}
|
|
return match state {
|
|
State::Osc => Some(Token::Osc { data, partial }),
|
|
_ => Some(Token::Dcs { data, partial }),
|
|
};
|
|
}
|
|
State::OscEsc | State::DcsEsc => {
|
|
// We were processing an OSC/DCS sequence and the last byte was an escape character.
|
|
// It's only a string terminator if it's followed by \ (= "\x1b\\").
|
|
if bytes[self.off] == b'\\' {
|
|
// It was indeed a string terminator and we can now tell the caller about it.
|
|
let state = self.parser.state;
|
|
|
|
// Consume the terminator (one byte in the previous input and this byte).
|
|
self.parser.state = State::Ground;
|
|
self.off += 1;
|
|
|
|
return match state {
|
|
State::OscEsc => Some(Token::Osc { data: "", partial: false }),
|
|
_ => Some(Token::Dcs { data: "", partial: false }),
|
|
};
|
|
} else {
|
|
// False alarm: Not a string terminator.
|
|
// We'll return the escape character as a separate token.
|
|
// Processing will continue from the current state (`bytes[self.off]`).
|
|
self.parser.state = match self.parser.state {
|
|
State::OscEsc => State::Osc,
|
|
_ => State::Dcs,
|
|
};
|
|
return match self.parser.state {
|
|
State::Osc => Some(Token::Osc { data: "\x1b", partial: true }),
|
|
_ => Some(Token::Dcs { data: "\x1b", partial: true }),
|
|
};
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
}
|