slint/internal/compiler/lexer.rs

// Copyright © SixtyFPS GmbH <info@slint-ui.com>
// SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-Slint-commercial
//! This module contains the code for the lexer.
//!
//! It is shared to some extent with parser.rs, which implements `lex_next_token` based on the
//! macro_rules that declare the tokens.
use crate::parser::SyntaxKind;
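
/// State kept by the lexer across tokens (currently only the brace nesting
/// information needed to lex template strings).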
#[derive(Default)]
pub struct LexState {
    /// The top of the stack is the number of braces `{` currently open inside a string template expression:
    /// that many `}` must still be lexed before we re-enter string mode and pop the stack.
    template_string_stack: Vec<u32>,
}

/// This trait is used by the `crate::parser::lex_next_token` function and is implemented
/// for the rules passed to the macro, each of which can be either a string literal or a function.
pub trait LexingRule {
    /// Return the size of the match for this rule, or 0 if there is no match
    fn lex(&self, text: &str, state: &mut LexState) -> usize;
}

impl<'a> LexingRule for &'a str {
    #[inline]
    fn lex(&self, text: &str, _: &mut LexState) -> usize {
        if text.starts_with(*self) {
            self.len()
        } else {
            0
        }
    }
}

impl<F: Fn(&str, &mut LexState) -> usize> LexingRule for F {
    #[inline]
    fn lex(&self, text: &str, state: &mut LexState) -> usize {
        (self)(text, state)
    }
}
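
/// Lex a run of whitespace characters; returns the number of bytes matched (0 if none).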
pub fn lex_whitespace(text: &str, _: &mut LexState) -> usize {
    let mut len = 0;
    let chars = text.chars();
    for c in chars {
        if !c.is_whitespace() {
            break;
        }
        len += c.len_utf8();
    }
    len
}
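
/// Lex a `//` line comment or a `/* ... */` block comment (block comments may nest);
/// returns the length of the comment, or 0 if `text` does not start with a comment
/// or the block comment is unterminated.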
pub fn lex_comment(text: &str, _: &mut LexState) -> usize {
    // FIXME: could report proper error if not properly terminated
    if text.starts_with("//") {
        return text.find(&['\n', '\r'] as &[_]).unwrap_or(text.len());
    }
    if text.starts_with("/*") {
        let mut nested = 0;
        let mut offset = 2;
        let bytes = text.as_bytes();
        while offset < bytes.len() {
            if let Some(star) = bytes[offset..].iter().position(|c| *c == b'*') {
                let star = star + offset;
                if star > offset && bytes[star - 1] == b'/' {
                    nested += 1;
                    offset = star + 1;
                } else if star < bytes.len() - 1 && bytes[star + 1] == b'/' {
                    if nested == 0 {
                        return star + 2;
                    }
                    nested -= 1;
                    offset = star + 2;
                } else {
                    offset = star + 1;
                }
            } else {
                // Unterminated
                return 0;
            }
        }
        // Unterminated
        return 0;
    }
    0
}
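
/// Lex a string literal, including the pieces of a template string: a `\{` switches to
/// expression mode (pushing onto `template_string_stack`) and a `}` at brace level 0
/// resumes the string. Returns the length of the matched fragment, or 0 on no match or
/// an unterminated string.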
pub fn lex_string(text: &str, state: &mut LexState) -> usize {
    if let Some(brace_level) = state.template_string_stack.last_mut() {
        if text.starts_with('{') {
            *brace_level += 1;
            return 0;
        } else if text.starts_with('}') {
            if *brace_level > 0 {
                *brace_level -= 1;
                return 0;
            } else {
                state.template_string_stack.pop();
            }
        } else if !text.starts_with('"') {
            return 0;
        }
    } else if !text.starts_with('"') {
        return 0;
    }
    let text_len = text.as_bytes().len();
    let mut end = 1; // skip the '"'
    loop {
        let stop = match text[end..].find(&['"', '\\'][..]) {
            Some(stop) => end + stop,
            // FIXME: report an error for unterminated string
            None => return 0,
        };
        match text.as_bytes()[stop] {
            b'"' => {
                return stop + 1;
            }
            b'\\' => {
                if text_len <= stop + 1 {
                    // FIXME: report an error for unterminated string
                    return 0;
                }
                if text.as_bytes()[stop + 1] == b'{' {
                    state.template_string_stack.push(0);
                    return stop + 2;
                }
                end = stop + 1 + text[stop + 1..].chars().next().map_or(0, |c| c.len_utf8())
            }
            _ => unreachable!(),
        }
    }
}
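
/// Lex a number literal: digits with an optional fractional part, optionally followed by
/// `%` or an alphabetic unit suffix (e.g. `12px`). Returns the number of bytes matched.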
pub fn lex_number(text: &str, _: &mut LexState) -> usize {
    let mut len = 0;
    let mut chars = text.chars();
    let mut had_period = false;
    while let Some(c) = chars.next() {
        if !c.is_ascii_digit() {
            if !had_period && c == '.' && len > 0 {
                had_period = true;
            } else {
                if len > 0 {
                    if c == '%' {
                        return len + 1;
                    }
                    if c.is_ascii_alphabetic() {
                        len += c.len_utf8();
                        // The unit
                        for c in chars {
                            if !c.is_ascii_alphabetic() {
                                return len;
                            }
                            len += c.len_utf8();
                        }
                    }
                }
                break;
            }
        }
        len += c.len_utf8();
    }
    len
}
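
/// Lex a color literal: `#` followed by alphanumeric characters (e.g. `#ff00ff`).
/// Returns the number of bytes matched, or 0 if `text` does not start with `#`.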
pub fn lex_color(text: &str, _: &mut LexState) -> usize {
    if !text.starts_with('#') {
        return 0;
    }
    let mut len = 1;
    let chars = text[1..].chars();
    for c in chars {
        if !c.is_ascii_alphanumeric() {
            break;
        }
        len += c.len_utf8();
    }
    len
}
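
/// Lex an identifier made of alphanumeric characters, `_`, and (except in first position) `-`.
/// Returns the number of bytes matched.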
pub fn lex_identifier(text: &str, _: &mut LexState) -> usize {
    let mut len = 0;
    let chars = text.chars();
    for c in chars {
        if !c.is_alphanumeric() && c != '_' && (c != '-' || len == 0) {
            break;
        }
        len += c.len_utf8();
    }
    len
}
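
/// Lex a whole source string into a vector of tokens. On a lexing error, the remaining
/// source is pushed as a single `SyntaxKind::Error` token and lexing stops.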
#[allow(clippy::needless_update)] // Token may have extra fields depending on selected features
pub fn lex(mut source: &str) -> Vec<crate::parser::Token> {
    let mut result = vec![];
    let mut offset = 0;
    let mut state = LexState::default();
    while !source.is_empty() {
        if let Some((len, kind)) = crate::parser::lex_next_token(source, &mut state) {
            result.push(crate::parser::Token {
                kind,
                text: source[..len].into(),
                offset,
                ..Default::default()
            });
            offset += len;
            source = &source[len..];
        } else {
            // FIXME: recover
            result.push(crate::parser::Token {
                kind: SyntaxKind::Error,
                text: source.into(),
                offset,
                ..Default::default()
            });
            //offset += source.len();
            break;
        }
    }
    result
}

#[test]
fn basic_lexer_test() {
    fn compare(source: &str, expected: &[(SyntaxKind, &str)]) {
        let actual = lex(source);
        let actual =
            actual.iter().map(|token| (token.kind, token.text.as_str())).collect::<Vec<_>>();
        assert_eq!(actual.as_slice(), expected);
    }
    compare(
        r#"45 /*hi/*_*/ho*/ "string""#,
        &[
            (SyntaxKind::NumberLiteral, "45"),
            (SyntaxKind::Whitespace, " "),
            (SyntaxKind::Comment, "/*hi/*_*/ho*/"),
            (SyntaxKind::Whitespace, " "),
            (SyntaxKind::StringLiteral, r#""string""#),
        ],
    );
    compare(
        r#"12px+5.2+=0.7%"#,
        &[
            (SyntaxKind::NumberLiteral, "12px"),
            (SyntaxKind::Plus, "+"),
            (SyntaxKind::NumberLiteral, "5.2"),
            (SyntaxKind::PlusEqual, "+="),
            (SyntaxKind::NumberLiteral, "0.7%"),
        ],
    );
    compare(
        r#"aa_a.b1,c"#,
        &[
            (SyntaxKind::Identifier, "aa_a"),
            (SyntaxKind::Dot, "."),
            (SyntaxKind::Identifier, "b1"),
            (SyntaxKind::Comma, ","),
            (SyntaxKind::Identifier, "c"),
        ],
    );
    compare(
        r#"/*/**/*//**/*"#,
        &[
            (SyntaxKind::Comment, "/*/**/*/"),
            (SyntaxKind::Comment, "/**/"),
            (SyntaxKind::Star, "*"),
        ],
    );
    compare(
        "a//x\nb//y\r\nc//z",
        &[
            (SyntaxKind::Identifier, "a"),
            (SyntaxKind::Comment, "//x"),
            (SyntaxKind::Whitespace, "\n"),
            (SyntaxKind::Identifier, "b"),
            (SyntaxKind::Comment, "//y"),
            (SyntaxKind::Whitespace, "\r\n"),
            (SyntaxKind::Identifier, "c"),
            (SyntaxKind::Comment, "//z"),
        ],
    );
    compare(r#""x""#, &[(SyntaxKind::StringLiteral, r#""x""#)]);
    compare(
        r#"a"\"\\"x"#,
        &[
            (SyntaxKind::Identifier, "a"),
            (SyntaxKind::StringLiteral, r#""\"\\""#),
            (SyntaxKind::Identifier, "x"),
        ],
    );
    compare(
        r#""a\{b{c}d"e\{f}g"h}i"j"#,
        &[
            (SyntaxKind::StringLiteral, r#""a\{"#),
            (SyntaxKind::Identifier, "b"),
            (SyntaxKind::LBrace, "{"),
            (SyntaxKind::Identifier, "c"),
            (SyntaxKind::RBrace, "}"),
            (SyntaxKind::Identifier, "d"),
            (SyntaxKind::StringLiteral, r#""e\{"#),
            (SyntaxKind::Identifier, "f"),
            (SyntaxKind::StringLiteral, r#"}g""#),
            (SyntaxKind::Identifier, "h"),
            (SyntaxKind::StringLiteral, r#"}i""#),
            (SyntaxKind::Identifier, "j"),
        ],
    );
    // Fuzzer tests:
    compare(r#"/**"#, &[(SyntaxKind::Div, "/"), (SyntaxKind::Star, "*"), (SyntaxKind::Star, "*")]);
    compare(r#""\"#, &[(SyntaxKind::Error, "\"\\")]);
    compare(r#""\ޱ"#, &[(SyntaxKind::Error, "\"\\ޱ")]);
}

/// Given the source of a Rust file, find each occurrence of the `slint!(...)` macro.
/// Return an iterator over the ranges where the macro contents are located in the original source.
pub fn locate_slint_macro(rust_source: &str) -> impl Iterator<Item = core::ops::Range<usize>> + '_ {
    let mut begin = 0;
    std::iter::from_fn(move || {
        let (open, close) = loop {
            if let Some(m) = rust_source[begin..].find("slint") {
                // heuristics to find if we are not in a comment or a string literal. Not perfect, but should work in most cases
                if let Some(x) = rust_source[begin..(begin + m)].rfind(['\\', '\n', '/', '\"']) {
                    if rust_source.as_bytes()[begin + x] != b'\n' {
                        begin += m + 5;
                        begin += rust_source[begin..].find(['\n']).unwrap_or(0);
                        continue;
                    }
                }
                begin += m + 5;
                while rust_source[begin..].starts_with(' ') {
                    begin += 1;
                }
                if !rust_source[begin..].starts_with('!') {
                    continue;
                }
                begin += 1;
                while rust_source[begin..].starts_with(' ') {
                    begin += 1;
                }
                let Some(open) = rust_source.as_bytes().get(begin) else { continue };
                match open {
                    b'{' => break (SyntaxKind::LBrace, SyntaxKind::RBrace),
                    b'[' => break (SyntaxKind::LBracket, SyntaxKind::RBracket),
                    b'(' => break (SyntaxKind::LParent, SyntaxKind::RParent),
                    _ => continue,
                }
            } else {
                // No macro found, just return
                return None;
            }
        };
        begin += 1;
        // Now find the matching closing delimiter
        // Technically, we should be lexing rust, not slint
        let mut state = LexState::default();
        let start = begin;
        let mut end = begin;
        let mut level = 0;
        while !rust_source[end..].is_empty() {
            let len = match crate::parser::lex_next_token(&rust_source[end..], &mut state) {
                Some((len, x)) if x == open => {
                    level += 1;
                    len
                }
                Some((_, x)) if x == close && level == 0 => {
                    break;
                }
                Some((len, x)) if x == close => {
                    level -= 1;
                    len
                }
                Some((len, _)) => len,
                None => {
                    // Lex error
                    break;
                }
            };
            if len == 0 {
                break; // Shouldn't happen
            }
            end += len;
        }
        begin = end;
        Some(start..end)
    })
}

#[test]
fn test_locate_rust_macro() {
    #[track_caller]
    fn do_test(source: &str, captures: &[&str]) {
        let result = locate_slint_macro(source).map(|r| &source[r]).collect::<Vec<_>>();
        assert_eq!(&result, captures);
    }
    do_test("\nslint{!{}}", &[]);
    do_test(
        "//slint!(123)\nslint!(456)\nslint ![789]\n/*slint!{abc}*/\nslint! {def}",
        &["456", "789", "def"],
    );
    do_test("slint!(slint!(abc))slint!()", &["slint!(abc)", ""]);
}