From 579898fd80eb1d56c8e7d3aa8147d16737d34f63 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Fri, 4 Jul 2025 21:01:01 +0200 Subject: [PATCH] wip --- .vscode/launch.json | 2 +- src/helpers.rs | 10 ++ src/highlighter/lang_bash.rs | 90 ++++++++----- src/highlighter/lang_batch.rs | 98 ++++++++------ src/highlighter/lang_json.rs | 64 +++++---- src/highlighter/lang_powershell.rs | 206 ++++++++++++++++------------- src/highlighter/lang_yaml.rs | 134 +++++++++++++------ src/highlighter/mod.rs | 82 +++++++----- 8 files changed, 421 insertions(+), 265 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 73b1a20..0345e47 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -12,9 +12,9 @@ "args": [ "assets/highlighting-tests/bash.sh", "assets/highlighting-tests/batch.bat", + "assets/highlighting-tests/json.json", "assets/highlighting-tests/powershell.ps1", "assets/highlighting-tests/yaml.yml", - "assets/highlighting-tests/json.json", ], }, { diff --git a/src/helpers.rs b/src/helpers.rs index bd43358..e9d232a 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -291,3 +291,13 @@ impl AsciiStringHelpers for str { p.len() <= s.len() && s[..p.len()].eq_ignore_ascii_case(p) } } + +impl AsciiStringHelpers for [u8] { + fn starts_with_ignore_ascii_case(&self, prefix: &str) -> bool { + // Casting to bytes first ensures we skip any UTF8 boundary checks. + // Since the comparison is ASCII, we don't need to worry about that. + let s = self; + let p = prefix.as_bytes(); + p.len() <= s.len() && s[..p.len()].eq_ignore_ascii_case(p) + } +} diff --git a/src/highlighter/lang_bash.rs b/src/highlighter/lang_bash.rs index 26d1757..06359b0 100644 --- a/src/highlighter/lang_bash.rs +++ b/src/highlighter/lang_bash.rs @@ -5,44 +5,66 @@ use super::*; type T = Transition; +// NOTE: These are indices into the `LANG.charsets` array. +const C_DIGITS: usize = 0; +const C_VARIABLE: usize = 1; + // NOTE: These are indices into the `LANG.states` array. 
-const _GROUND: u8 = 0; -const COMMENT: u8 = 1; -const STRING_SINGLE: u8 = 2; -const STRING_DOUBLE: u8 = 3; -const STRING_ESCAPE: u8 = 4; -const VARIABLE: u8 = 5; +const _S_GROUND: u8 = 0; +const S_COMMENT: u8 = 1; +const S_STRING_SINGLE: u8 = 2; +const S_STRING_DOUBLE: u8 = 3; +const S_STRING_ESCAPE: u8 = 4; +const S_VARIABLE: u8 = 5; pub const LANG: Language = Language { name: "Bash", extensions: &["sh", "bash", "zsh", "ksh", "csh", "tcsh"], - word_chars: &[ - // /.-,+*)('&%$#"! - 0b_0000110000000000, - // ?>=<;:9876543210 - 0b_0000001111111111, - // ONMLKJIHGFEDCBA@ - 0b_1111111111111110, - // _^]\[ZYXWVUTSRQP - 0b_1000000000000000, - // onmlkjihgfedcba` - 0b_1111111111111111, - // ~}|{zyxwvutsrqp - 0b_0000000000000000, + charsets: &[ + // C_DIGITS + &[ + // /.-,+*)('&%$#"! + 0b_0010100000000000, + // ?>=<;:9876543210 + 0b_0000001111111111, + // ONMLKJIHGFEDCBA@ + 0b_0000000000000000, + // _^]\[ZYXWVUTSRQP + 0b_0000000000000000, + // onmlkjihgfedcba` + 0b_0000000000000000, + // ~}|{zyxwvutsrqp + 0b_0000000000000000, + ], + // C_VARIABLE + &[ + // /.-,+*)('&%$#"! 
+ 0b_0000110000000000, + // ?>=<;:9876543210 + 0b_0000001111111111, + // ONMLKJIHGFEDCBA@ + 0b_1111111111111110, + // _^]\[ZYXWVUTSRQP + 0b_1000011111111111, + // onmlkjihgfedcba` + 0b_1111111111111111, + // ~}|{zyxwvutsrqp + 0b_0000011111111111, + ], ], states: &[ - // GROUND + // S_GROUND &[ // Comments - T { test: Prefix("#"), kind: Comment, state: Push(COMMENT) }, - T { test: Prefix("<#"), kind: Comment, state: Push(COMMENT) }, + T { test: Prefix("#"), kind: Comment, state: Push(S_COMMENT) }, + T { test: Prefix("<#"), kind: Comment, state: Push(S_COMMENT) }, // Strings - T { test: Prefix("'"), kind: String, state: Push(STRING_SINGLE) }, - T { test: Prefix("\""), kind: String, state: Push(STRING_DOUBLE) }, + T { test: Prefix("'"), kind: String, state: Push(S_STRING_SINGLE) }, + T { test: Prefix("\""), kind: String, state: Push(S_STRING_DOUBLE) }, // Variables - T { test: Prefix("$"), kind: Variable, state: Push(VARIABLE) }, + T { test: Prefix("$"), kind: Variable, state: Push(S_VARIABLE) }, // Numbers - T { test: Digits, kind: Number, state: Pop(1) }, + T { test: Charset(C_DIGITS), kind: Number, state: Pop(1) }, // Operators T { test: Prefix("|"), kind: Operator, state: Pop(1) }, T { test: Prefix("&"), kind: Operator, state: Pop(1) }, @@ -63,19 +85,19 @@ pub const LANG: Language = Language { T { test: Prefix("esac"), kind: Keyword, state: Pop(1) }, T { test: Prefix("function"), kind: Keyword, state: Pop(1) }, ], - // COMMENT + // S_COMMENT &[T { test: Line, kind: Comment, state: Pop(1) }], - // STRING_SINGLE + // S_STRING_SINGLE &[T { test: Prefix("'"), kind: String, state: Pop(1) }], - // STRING_DOUBLE + // S_STRING_DOUBLE &[ - T { test: Prefix("\\"), kind: String, state: Push(STRING_ESCAPE) }, - T { test: Prefix("$"), kind: Variable, state: Push(VARIABLE) }, + T { test: Prefix("\\"), kind: String, state: Push(S_STRING_ESCAPE) }, + T { test: Prefix("$"), kind: Variable, state: Push(S_VARIABLE) }, T { test: Prefix("\""), kind: String, state: Pop(1) }, ], - // 
STRING_ESCAPE + // S_STRING_ESCAPE &[T { test: Chars(1), kind: String, state: Pop(1) }], - // VARIABLE - &[T { test: Word, kind: Variable, state: Pop(1) }], + // S_VARIABLE + &[T { test: Charset(C_VARIABLE), kind: Variable, state: Pop(1) }], ], }; diff --git a/src/highlighter/lang_batch.rs b/src/highlighter/lang_batch.rs index 2d35523..af3d833 100644 --- a/src/highlighter/lang_batch.rs +++ b/src/highlighter/lang_batch.rs @@ -5,67 +5,89 @@ use super::*; type T = Transition; +// NOTE: These are indices into the `LANG.charsets` array. +const C_DIGITS: usize = 0; +const C_VARIABLE: usize = 1; + // NOTE: These are indices into the `LANG.states` array. -const _GROUND: u8 = 0; -const COMMENT: u8 = 1; -const STRING: u8 = 2; -const VARIABLE: u8 = 3; +const _S_GROUND: u8 = 0; +const S_COMMENT: u8 = 1; +const S_STRING: u8 = 2; +const S_VARIABLE: u8 = 3; pub const LANG: Language = Language { name: "Batch", extensions: &["bat", "cmd"], - word_chars: &[ - // /.-,+*)('&%$#"! - 0b_0000000000000000, - // ?>=<;:9876543210 - 0b_0000001111111111, - // ONMLKJIHGFEDCBA@ - 0b_1111111111111110, - // _^]\[ZYXWVUTSRQP - 0b_1000000000000000, - // onmlkjihgfedcba` - 0b_1111111111111111, - // ~}|{zyxwvutsrqp - 0b_0000000000000000, + charsets: &[ + // C_DIGITS + &[ + // /.-,+*)('&%$#"! + 0b_0010100000000000, + // ?>=<;:9876543210 + 0b_0000001111111111, + // ONMLKJIHGFEDCBA@ + 0b_0000000000000000, + // _^]\[ZYXWVUTSRQP + 0b_0000000000000000, + // onmlkjihgfedcba` + 0b_0000000000000000, + // ~}|{zyxwvutsrqp + 0b_0000000000000000, + ], + // C_VARIABLE + &[ + // /.-,+*)('&%$#"! 
+ 0b_0000000000000000, + // ?>=<;:9876543210 + 0b_0000001111111111, + // ONMLKJIHGFEDCBA@ + 0b_1111111111111110, + // _^]\[ZYXWVUTSRQP + 0b_1000011111111111, + // onmlkjihgfedcba` + 0b_1111111111111111, + // ~}|{zyxwvutsrqp + 0b_0100011111111111, + ], ], states: &[ - // GROUND + // S_GROUND &[ // Comments (REM or ::) - T { test: Prefix("REM "), kind: Comment, state: Push(COMMENT) }, - T { test: Prefix("::"), kind: Comment, state: Push(COMMENT) }, + T { test: PrefixInsensitive("rem "), kind: Comment, state: Push(S_COMMENT) }, + T { test: Prefix("::"), kind: Comment, state: Push(S_COMMENT) }, // Strings (quoted) - T { test: Prefix("\""), kind: String, state: Push(STRING) }, + T { test: Prefix("\""), kind: String, state: Push(S_STRING) }, // Variables - T { test: Prefix("%"), kind: Variable, state: Push(VARIABLE) }, + T { test: Prefix("%"), kind: Variable, state: Push(S_VARIABLE) }, // Numbers - T { test: Digits, kind: Number, state: Pop(1) }, + T { test: Charset(C_DIGITS), kind: Number, state: Pop(1) }, // Operators T { test: Prefix("|"), kind: Operator, state: Pop(1) }, T { test: Prefix("&"), kind: Operator, state: Pop(1) }, T { test: Prefix("<"), kind: Operator, state: Pop(1) }, T { test: Prefix(">"), kind: Operator, state: Pop(1) }, // Keywords (common) - T { test: Prefix("if"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("else"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("for"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("in"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("do"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("not"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("exist"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("set"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("echo"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("goto"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("call"), kind: Keyword, state: Pop(1) }, + T { test: PrefixInsensitive("if"), kind: Keyword, state: Pop(1) }, + T 
{ test: PrefixInsensitive("else"), kind: Keyword, state: Pop(1) }, + T { test: PrefixInsensitive("for"), kind: Keyword, state: Pop(1) }, + T { test: PrefixInsensitive("in"), kind: Keyword, state: Pop(1) }, + T { test: PrefixInsensitive("do"), kind: Keyword, state: Pop(1) }, + T { test: PrefixInsensitive("not"), kind: Keyword, state: Pop(1) }, + T { test: PrefixInsensitive("exist"), kind: Keyword, state: Pop(1) }, + T { test: PrefixInsensitive("set"), kind: Keyword, state: Pop(1) }, + T { test: PrefixInsensitive("echo"), kind: Keyword, state: Pop(1) }, + T { test: PrefixInsensitive("goto"), kind: Keyword, state: Pop(1) }, + T { test: PrefixInsensitive("call"), kind: Keyword, state: Pop(1) }, ], - // COMMENT + // S_COMMENT &[T { test: Line, kind: Comment, state: Pop(1) }], - // STRING + // S_STRING &[T { test: Prefix("\""), kind: String, state: Pop(1) }], - // VARIABLE + // S_VARIABLE &[ T { test: Prefix("%"), kind: Variable, state: Pop(1) }, - T { test: Word, kind: Variable, state: Pop(1) }, + T { test: Charset(C_VARIABLE), kind: Variable, state: Pop(1) }, ], ], }; diff --git a/src/highlighter/lang_json.rs b/src/highlighter/lang_json.rs index ef8272f..cfe42ed 100644 --- a/src/highlighter/lang_json.rs +++ b/src/highlighter/lang_json.rs @@ -5,55 +5,61 @@ use super::*; type T = Transition; +// NOTE: These are indices into the `LANG.charsets` array. +const C_DIGITS: usize = 0; + // NOTE: These are indices into the `LANG.states` array. -const _GROUND: u8 = 0; -const LINE_COMMENT: u8 = 1; -const BLOCK_COMMENT: u8 = 2; -const STRING: u8 = 3; -const STRING_ESCAPE: u8 = 4; +const _S_GROUND: u8 = 0; +const S_LINE_COMMENT: u8 = 1; +const S_BLOCK_COMMENT: u8 = 2; +const S_STRING: u8 = 3; +const S_STRING_ESCAPE: u8 = 4; pub const LANG: Language = Language { name: "JSON", extensions: &["json", "jsonc"], - word_chars: &[ - // /.-,+*)('&%$#"! 
- 0b_0000000000000000, - // ?>=<;:9876543210 - 0b_0000000000000000, - // ONMLKJIHGFEDCBA@ - 0b_0000000000000000, - // _^]\[ZYXWVUTSRQP - 0b_0000000000000000, - // onmlkjihgfedcba` - 0b_0000000000000000, - // ~}|{zyxwvutsrqp - 0b_0000000000000000, + charsets: &[ + // C_DIGITS + &[ + // /.-,+*)('&%$#"! + 0b_0110100000000000, + // ?>=<;:9876543210 + 0b_0000001111111111, + // ONMLKJIHGFEDCBA@ + 0b_0000000000100000, + // _^]\[ZYXWVUTSRQP + 0b_0000000000000000, + // onmlkjihgfedcba` + 0b_0000000000100000, + // ~}|{zyxwvutsrqp + 0b_0000000000000000, + ], ], states: &[ - // GROUND + // S_GROUND &[ // Comments (jsonc) - T { test: Prefix("//"), kind: Comment, state: Push(LINE_COMMENT) }, - T { test: Prefix("/*"), kind: Comment, state: Push(BLOCK_COMMENT) }, + T { test: Prefix("//"), kind: Comment, state: Push(S_LINE_COMMENT) }, + T { test: Prefix("/*"), kind: Comment, state: Push(S_BLOCK_COMMENT) }, // Strings - T { test: Prefix("\""), kind: String, state: Push(STRING) }, - // Numbers - T { test: Digits, kind: Number, state: Pop(1) }, + T { test: Prefix("\""), kind: String, state: Push(S_STRING) }, + // Numbers (start: minus or digit) + T { test: Charset(C_DIGITS), kind: Number, state: Pop(1) }, // Booleans/null T { test: Prefix("true"), kind: Keyword, state: Pop(1) }, T { test: Prefix("false"), kind: Keyword, state: Pop(1) }, T { test: Prefix("null"), kind: Keyword, state: Pop(1) }, ], - // LINE_COMMENT (// single-line) + // S_LINE_COMMENT (// single-line) &[T { test: Line, kind: Comment, state: Pop(1) }], - // BLOCK_COMMENT (/* ... */) + // S_BLOCK_COMMENT (/* ... 
*/) &[T { test: Prefix("*/"), kind: Comment, state: Pop(1) }], - // STRING ("...") + // S_STRING ("...") &[ - T { test: Prefix("\\"), kind: String, state: Push(STRING_ESCAPE) }, + T { test: Prefix("\\"), kind: String, state: Push(S_STRING_ESCAPE) }, T { test: Prefix("\""), kind: String, state: Pop(1) }, ], - // STRING_ESCAPE (escape in string) + // S_STRING_ESCAPE (escape in string) &[T { test: Chars(1), kind: String, state: Pop(1) }], ], }; diff --git a/src/highlighter/lang_powershell.rs b/src/highlighter/lang_powershell.rs index e3d4920..17a35c1 100644 --- a/src/highlighter/lang_powershell.rs +++ b/src/highlighter/lang_powershell.rs @@ -5,53 +5,75 @@ use super::*; type T = Transition; +// NOTE: These are indices into the `LANG.charsets` array. +const C_DIGITS: usize = 0; +const C_METHOD: usize = 1; + // NOTE: These are indices into the `LANG.states` array. -const _GROUND: u8 = 0; -const LINE_COMMENT: u8 = 1; -const BLOCK_COMMENT: u8 = 2; -const STRING_SINGLE: u8 = 3; -const STRING_DOUBLE: u8 = 4; -const STRING_ESCAPE: u8 = 5; -const VARIABLE: u8 = 6; -const VARIABLE_BRACE: u8 = 7; -const VARIABLE_PAREN: u8 = 8; -const PARAMETER: u8 = 9; -const KEYWORD: u8 = 10; -const METHOD: u8 = 11; +const _S_GROUND: u8 = 0; +const S_LINE_COMMENT: u8 = 1; +const S_BLOCK_COMMENT: u8 = 2; +const S_STRING_SINGLE: u8 = 3; +const S_STRING_DOUBLE: u8 = 4; +const S_STRING_ESCAPE: u8 = 5; +const S_VARIABLE: u8 = 6; +const S_VARIABLE_BRACE: u8 = 7; +const S_VARIABLE_PAREN: u8 = 8; +const S_PARAMETER: u8 = 9; +const S_KEYWORD: u8 = 10; +const S_METHOD: u8 = 11; pub const LANG: Language = Language { name: "PowerShell", extensions: &["ps1", "psm1", "psd1"], - word_chars: &[ - // /.-,+*)('&%$#"! 
- 0b_1110110000101010, - // ?>=<;:9876543210 - 0b_1111011111111111, - // ONMLKJIHGFEDCBA@ - 0b_1111111111111110, - // _^]\[ZYXWVUTSRQP - 0b_1111111111111111, - // onmlkjihgfedcba` - 0b_1111111111111111, - // ~}|{zyxwvutsrqp - 0b_0100011111111111, + charsets: &[ + // C_DIGITS + &[ + // /.-,+*)('&%$#"! + 0b_0110100000000000, + // ?>=<;:9876543210 + 0b_0000001111111111, + // ONMLKJIHGFEDCBA@ + 0b_0000000000100000, + // _^]\[ZYXWVUTSRQP + 0b_0000000000000000, + // onmlkjihgfedcba` + 0b_0000000000100000, + // ~}|{zyxwvutsrqp + 0b_0000000000000000, + ], + // C_METHOD + &[ + // /.-,+*)('&%$#"! + 0b_1110110000101010, + // ?>=<;:9876543210 + 0b_1111011111111111, + // ONMLKJIHGFEDCBA@ + 0b_1111111111111110, + // _^]\[ZYXWVUTSRQP + 0b_1111111111111111, + // onmlkjihgfedcba` + 0b_1111111111111111, + // ~}|{zyxwvutsrqp + 0b_0100011111111111, + ], ], states: &[ - // GROUND + // S_GROUND &[ // Comments - T { test: Prefix("#"), kind: Comment, state: Push(LINE_COMMENT) }, - T { test: Prefix("<#"), kind: Comment, state: Push(BLOCK_COMMENT) }, + T { test: Prefix("#"), kind: Comment, state: Push(S_LINE_COMMENT) }, + T { test: Prefix("<#"), kind: Comment, state: Push(S_BLOCK_COMMENT) }, // Numbers - T { test: Digits, kind: Number, state: Pop(1) }, + T { test: Charset(C_DIGITS), kind: Number, state: Pop(1) }, // Strings - T { test: Prefix("'"), kind: String, state: Push(STRING_SINGLE) }, - T { test: Prefix("\""), kind: String, state: Push(STRING_DOUBLE) }, + T { test: Prefix("'"), kind: String, state: Push(S_STRING_SINGLE) }, + T { test: Prefix("\""), kind: String, state: Push(S_STRING_DOUBLE) }, // Variables - T { test: Prefix("$("), kind: Other, state: Push(VARIABLE_PAREN) }, - T { test: Prefix("$"), kind: Variable, state: Push(VARIABLE) }, + T { test: Prefix("$("), kind: Other, state: Push(S_VARIABLE_PAREN) }, + T { test: Prefix("$"), kind: Variable, state: Push(S_VARIABLE) }, // Operators - T { test: Prefix("-"), kind: Operator, state: Push(PARAMETER) }, + T { test: Prefix("-"), 
kind: Operator, state: Push(S_PARAMETER) }, T { test: Prefix("!"), kind: Operator, state: Pop(1) }, T { test: Prefix("*"), kind: Operator, state: Pop(1) }, T { test: Prefix("/"), kind: Operator, state: Pop(1) }, @@ -62,61 +84,61 @@ pub const LANG: Language = Language { T { test: Prefix(">"), kind: Operator, state: Pop(1) }, T { test: Prefix("|"), kind: Operator, state: Pop(1) }, // Keywords - T { test: Prefix("break"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("catch"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("continue"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("do"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("else"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("finally"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("foreach"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("function"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("if"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("return"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("switch"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("throw"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("try"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("using"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("while"), kind: Keyword, state: Push(KEYWORD) }, + T { test: PrefixInsensitive("break"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("catch"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("continue"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("do"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("else"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("finally"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("foreach"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: 
PrefixInsensitive("function"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("if"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("return"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("switch"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("throw"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("try"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("using"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("while"), kind: Keyword, state: Push(S_KEYWORD) }, // Methods - T { test: Word, kind: Method, state: Push(METHOD) }, + T { test: Charset(C_METHOD), kind: Method, state: Push(S_METHOD) }, ], - // LINE_COMMENT: # comment + // S_LINE_COMMENT: # comment &[T { test: Line, kind: Comment, state: Pop(1) }], - // BLOCK_COMMENT: <# comment #> + // S_BLOCK_COMMENT: <# comment #> &[T { test: Prefix("#>"), kind: Comment, state: Pop(1) }], - // STRING_SINGLE: 'string' + // S_STRING_SINGLE: 'string' &[T { test: Prefix("'"), kind: String, state: Pop(1) }], - // STRING_DOUBLE: "string" + // S_STRING_DOUBLE: "string" &[ - T { test: Prefix("`"), kind: String, state: Push(STRING_ESCAPE) }, - T { test: Prefix("$("), kind: Other, state: Push(VARIABLE_PAREN) }, - T { test: Prefix("$"), kind: Variable, state: Push(VARIABLE) }, + T { test: Prefix("`"), kind: String, state: Push(S_STRING_ESCAPE) }, + T { test: Prefix("$("), kind: Other, state: Push(S_VARIABLE_PAREN) }, + T { test: Prefix("$"), kind: Variable, state: Push(S_VARIABLE) }, T { test: Prefix("\""), kind: String, state: Pop(1) }, ], - // STRING_ESCAPE: "`a" + // S_STRING_ESCAPE: "`a" &[T { test: Chars(1), kind: String, state: Pop(1) }], - // VARIABLE: $variable + // S_VARIABLE: $variable &[ - T { test: Prefix("{"), kind: Variable, state: Change(VARIABLE_BRACE) }, - T { test: Word, kind: Variable, state: Pop(1) }, + T { test: Prefix("{"), kind: Variable, 
state: Change(S_VARIABLE_BRACE) }, + T { test: Charset(C_METHOD), kind: Variable, state: Pop(1) }, ], - // VARIABLE_BRACE: ${variable} + // S_VARIABLE_BRACE: ${variable} &[T { test: Prefix("}"), kind: Variable, state: Pop(1) }], - // VARIABLE_PAREN: $(command) + // S_VARIABLE_PAREN: $(command) // This is largely a copy of the ground state. &[ // Ground state Overrides - T { test: Prefix("("), kind: Other, state: Push(VARIABLE_PAREN) }, + T { test: Prefix("("), kind: Other, state: Push(S_VARIABLE_PAREN) }, T { test: Prefix(")"), kind: Other, state: Pop(1) }, // Numbers - T { test: Digits, kind: Number, state: Pop(1) }, + T { test: Charset(C_DIGITS), kind: Number, state: Pop(1) }, // Strings - T { test: Prefix("'"), kind: String, state: Push(STRING_SINGLE) }, - T { test: Prefix("\""), kind: String, state: Push(STRING_DOUBLE) }, + T { test: Prefix("'"), kind: String, state: Push(S_STRING_SINGLE) }, + T { test: Prefix("\""), kind: String, state: Push(S_STRING_DOUBLE) }, // Variables - T { test: Prefix("$"), kind: Variable, state: Push(VARIABLE) }, + T { test: Prefix("$"), kind: Variable, state: Push(S_VARIABLE) }, // Operators - T { test: Prefix("-"), kind: Operator, state: Push(PARAMETER) }, + T { test: Prefix("-"), kind: Operator, state: Push(S_PARAMETER) }, T { test: Prefix("!"), kind: Operator, state: Pop(1) }, T { test: Prefix("*"), kind: Operator, state: Pop(1) }, T { test: Prefix("/"), kind: Operator, state: Pop(1) }, @@ -127,38 +149,38 @@ pub const LANG: Language = Language { T { test: Prefix(">"), kind: Operator, state: Pop(1) }, T { test: Prefix("|"), kind: Operator, state: Pop(1) }, // Keywords - T { test: Prefix("break"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("catch"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("continue"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("do"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("else"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("finally"), 
kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("foreach"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("function"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("if"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("return"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("switch"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("throw"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("try"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("using"), kind: Keyword, state: Push(KEYWORD) }, - T { test: Prefix("while"), kind: Keyword, state: Push(KEYWORD) }, + T { test: PrefixInsensitive("break"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("catch"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("continue"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("do"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("else"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("finally"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("foreach"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("function"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("if"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("return"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("switch"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("throw"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("try"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("using"), kind: Keyword, state: Push(S_KEYWORD) }, + T { test: PrefixInsensitive("while"), kind: Keyword, state: Push(S_KEYWORD) }, // Methods - T { test: Word, kind: Method, state: Push(METHOD) }, + T { test: Charset(C_METHOD), kind: Method, state: 
Push(S_METHOD) }, ], - // PARAMETER: -parameter + // S_PARAMETER: -parameter &[ - T { test: Word, kind: Operator, state: Pop(1) }, + T { test: Charset(C_METHOD), kind: Operator, state: Pop(1) }, T { test: Chars(0), kind: Operator, state: Pop(1) }, ], - // KEYWORD: foreach, if, etc. + // S_KEYWORD: foreach, if, etc. &[ - T { test: Word, kind: Method, state: Change(METHOD) }, + T { test: Charset(C_METHOD), kind: Method, state: Change(S_METHOD) }, T { test: Chars(0), kind: Keyword, state: Pop(1) }, ], - // METHOD: Foo-Bar + // S_METHOD: Foo-Bar &[ - T { test: Word, kind: Method, state: Change(METHOD) }, - T { test: Prefix("-"), kind: Method, state: Change(METHOD) }, + T { test: Charset(C_METHOD), kind: Method, state: Change(S_METHOD) }, + T { test: Prefix("-"), kind: Method, state: Change(S_METHOD) }, T { test: Chars(0), kind: Method, state: Pop(1) }, ], ], diff --git a/src/highlighter/lang_yaml.rs b/src/highlighter/lang_yaml.rs index 077fbe7..bdc704b 100644 --- a/src/highlighter/lang_yaml.rs +++ b/src/highlighter/lang_yaml.rs @@ -5,58 +5,116 @@ use super::*; type T = Transition; +// NOTE: These are indices into the `LANG.charsets` array. +const C_DIGITS: usize = 0; +const C_KEY_STRING: usize = 1; +const C_VALUE_STRING: usize = 2; + // NOTE: These are indices into the `LANG.states` array. -const _GROUND: u8 = 0; -const COMMENT: u8 = 1; -const STRING_SINGLE: u8 = 2; -const STRING_DOUBLE: u8 = 3; +const _S_GROUND: u8 = 0; +const S_COMMENT: u8 = 1; +const S_STRING_SINGLE: u8 = 2; +const S_STRING_DOUBLE: u8 = 3; +const S_KEYWORD_MAYBE: u8 = 4; +const S_KEYVALUE: u8 = 5; pub const LANG: Language = Language { name: "YAML", extensions: &["yaml", "yml"], - word_chars: &[ - // /.-,+*)('&%$#"! 
- 0b_0000000000000000, - // ?>=<;:9876543210 - 0b_0000000000000000, - // ONMLKJIHGFEDCBA@ - 0b_0000000000000000, - // _^]\[ZYXWVUTSRQP - 0b_0000000000000000, - // onmlkjihgfedcba` - 0b_0000000000000000, - // ~}|{zyxwvutsrqp - 0b_0000000000000000, + charsets: &[ + // C_DIGITS + &[ + // /.-,+*)('&%$#"! + 0b_0000000000000000, + // ?>=<;:9876543210 + 0b_0000001111111111, + // ONMLKJIHGFEDCBA@ + 0b_0000000000000000, + // _^]\[ZYXWVUTSRQP + 0b_0000000000000000, + // onmlkjihgfedcba` + 0b_0000000000000000, + // ~}|{zyxwvutsrqp + 0b_0000000000000000, + ], + // C_KEY_STRING + &[ + // /.-,+*)('&%$#"! + 0b_1111111111111110, + // ?>=<;:9876543210 + 0b_1111101111111111, + // ONMLKJIHGFEDCBA@ + 0b_1111111111111111, + // _^]\[ZYXWVUTSRQP + 0b_1111111111111111, + // onmlkjihgfedcba` + 0b_1111111111111111, + // ~}|{zyxwvutsrqp + 0b_1111111111111111, + ], + // C_VALUE_STRING + &[ + // /.-,+*)('&%$#"! + 0b_1111111101110011, + // ?>=<;:9876543210 + 0b_1111111111111111, + // ONMLKJIHGFEDCBA@ + 0b_1111111111111111, + // _^]\[ZYXWVUTSRQP + 0b_1111111111111111, + // onmlkjihgfedcba` + 0b_1111111111111111, + // ~}|{zyxwvutsrqp + 0b_1111111111111111, + ], ], states: &[ - // GROUND + // S_GROUND &[ // Comments - T { test: Prefix("#"), kind: Comment, state: Push(COMMENT) }, - T { test: Prefix("<#"), kind: Comment, state: Push(COMMENT) }, + T { test: Prefix("#"), kind: Comment, state: Push(S_COMMENT) }, // Strings - T { test: Prefix("'"), kind: String, state: Push(STRING_SINGLE) }, - T { test: Prefix("\""), kind: String, state: Push(STRING_DOUBLE) }, + T { test: Prefix("'"), kind: String, state: Push(S_STRING_SINGLE) }, + T { test: Prefix("\""), kind: String, state: Push(S_STRING_DOUBLE) }, // Numbers - T { test: Digits, kind: Number, state: Pop(1) }, - // Booleans/null - T { test: Prefix("true"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("false"), kind: Keyword, state: Pop(1) }, - T { test: Prefix("null"), kind: Keyword, state: Pop(1) }, - // Punctuation - T { test: Prefix("-"), 
kind: Operator, state: Pop(1) }, - T { test: Prefix(":"), kind: Operator, state: Pop(1) }, - T { test: Prefix(","), kind: Operator, state: Pop(1) }, - T { test: Prefix("["), kind: Operator, state: Pop(1) }, - T { test: Prefix("]"), kind: Operator, state: Pop(1) }, - T { test: Prefix("{"), kind: Operator, state: Pop(1) }, - T { test: Prefix("}"), kind: Operator, state: Pop(1) }, + T { test: Charset(C_DIGITS), kind: Number, state: Pop(1) }, + // Booleans/Null + T { test: Prefix("true"), kind: Keyword, state: Push(S_KEYWORD_MAYBE) }, + T { test: Prefix("false"), kind: Keyword, state: Push(S_KEYWORD_MAYBE) }, + T { test: Prefix("null"), kind: Keyword, state: Push(S_KEYWORD_MAYBE) }, + T { test: Charset(C_KEY_STRING), kind: Other, state: Push(S_KEYVALUE) }, ], - // COMMENT + // S_COMMENT &[T { test: Line, kind: Comment, state: Pop(1) }], - // STRING_SINGLE + // S_STRING_SINGLE &[T { test: Prefix("'"), kind: String, state: Pop(1) }], - // STRING_DOUBLE + // S_STRING_DOUBLE &[T { test: Prefix("\""), kind: String, state: Pop(1) }], + // S_KEYWORD_MAYBE + &[ + T { test: Charset(C_KEY_STRING), kind: Other, state: Push(S_KEYVALUE) }, + T { test: Chars(0), kind: Keyword, state: Pop(1) }, + ], + // S_KEYVALUE + &[ + T { test: Prefix(":"), kind: Other, state: Push(S_KEYWORD_MAYBE) }, + T { test: Chars(0), kind: Other, state: Pop(1) }, + ], + // S_VALUE + &[ + // Comments + T { test: Prefix("#"), kind: Comment, state: Push(S_COMMENT) }, + // Strings + T { test: Prefix("'"), kind: String, state: Push(S_STRING_SINGLE) }, + T { test: Prefix("\""), kind: String, state: Push(S_STRING_DOUBLE) }, + // Numbers + T { test: Charset(C_DIGITS), kind: Number, state: Pop(1) }, + // Booleans/Null + T { test: Prefix("true"), kind: Keyword, state: Push(S_KEYWORD_MAYBE) }, + T { test: Prefix("false"), kind: Keyword, state: Push(S_KEYWORD_MAYBE) }, + T { test: Prefix("null"), kind: Keyword, state: Push(S_KEYWORD_MAYBE) }, + T { test: Charset(C_KEY_STRING), kind: Other, state: Push(S_KEYVALUE) }, + 
T { test: Chars(0), kind: Other, state: Pop(1) }, + ], ], }; diff --git a/src/highlighter/mod.rs b/src/highlighter/mod.rs index a96a057..fa820ac 100644 --- a/src/highlighter/mod.rs +++ b/src/highlighter/mod.rs @@ -25,13 +25,13 @@ pub struct Language { #[allow(dead_code)] name: &'static str, extensions: &'static [&'static str], - word_chars: &'static [u16; 6], + charsets: &'static [&'static [u16; 6]], states: &'static [&'static [Transition]], } impl Language { pub fn from_path(path: &Path) -> Option<&'static Language> { let ext = path.extension()?; LANGUAGES.iter().copied().find(|lang| lang.extensions.iter().any(|&e| OsStr::new(e) == ext)) } } @@ -45,8 +45,8 @@ struct Transition { enum Consume { Chars(usize), Prefix(&'static str), - Digits, - Word, + PrefixInsensitive(&'static str), + Charset(usize), Line, } @@ -90,7 +90,7 @@ pub struct Highlighter<'a> { logical_pos_y: CoordType, language: &'static Language, - word_chars: [bool; 256], + charsets: Vec<[bool; 256]>, starter: Vec<[bool; 256]>, state: usize, @@ -100,31 +100,46 @@ pub struct Highlighter<'a> { impl<'doc> Highlighter<'doc> { pub fn new(doc: &'doc dyn ReadableDocument, language: &'static Language) -> Self { - let mut word_chars = [false; 256]; - Self::fill_word_chars(&mut word_chars, language.word_chars); - - let starter = Vec::from_iter(language.states.iter().map(|&transitions| { - let mut starter = [false; 256]; - for t in transitions { - match t.test { - Consume::Chars(_) => starter.fill(true), - Consume::Prefix(prefix) => starter[prefix.as_bytes()[0] as usize] = true, - Consume::Digits => starter[b'0' as usize..=b'9' as usize].fill(true), - Consume::Word => Self::fill_word_chars(&mut starter, language.word_chars), - Consume::Line => {} - } - } - starter - })); - Self { doc, offset: 0, logical_pos_y: 0, language, - word_chars, - starter, + charsets: language + .charsets + .iter() + .map(|&charset| { + let mut word_chars = [false; 256]; + Self::fill_word_chars(&mut 
word_chars, charset); + word_chars + }) + .collect(), + starter: language + .states + .iter() + .map(|&transitions| { + let mut starter = [false; 256]; + for t in transitions { + match t.test { + Consume::Chars(_) => starter.fill(true), + Consume::Prefix(prefix) => { + starter[prefix.as_bytes()[0] as usize] = true; + } + Consume::PrefixInsensitive(prefix) => { + let ch = prefix.as_bytes()[0]; + starter[ch.to_ascii_lowercase() as usize] = true; + starter[ch.to_ascii_uppercase() as usize] = true; + } + Consume::Charset(i) => { + Self::fill_word_chars(&mut starter, language.charsets[i]); + } + Consume::Line => {} + } + } + starter + }) + .collect(), state: 0, kind: Default::default(), @@ -227,21 +242,19 @@ impl<'doc> Highlighter<'doc> { break; } } - Consume::Digits => { - if off < line_buf.len() && line_buf[off].is_ascii_digit() { - while { - off += 1; - off < line_buf.len() && line_buf[off].is_ascii_digit() - } {} + Consume::PrefixInsensitive(str) => { + if line_buf[off..].starts_with_ignore_ascii_case(str) { + off += str.len(); hit = Some(t); break; } } - Consume::Word => { - if off < line_buf.len() && self.word_chars[line_buf[off] as usize] { + Consume::Charset(i) => { + let charset = &self.charsets[i]; + if off < line_buf.len() && charset[line_buf[off] as usize] { while { off += 1; - off < line_buf.len() && self.word_chars[line_buf[off] as usize] + off < line_buf.len() && charset[line_buf[off] as usize] } {} hit = Some(t); break; @@ -270,6 +283,9 @@ impl<'doc> Highlighter<'doc> { match t.state { StateStack::Change(to) => { + if let Some(last) = res.last_mut() { + last.kind = t.kind; + } self.state = to as usize; self.kind = t.kind; }