mirror of
https://github.com/roc-lang/roc.git
synced 2025-09-28 14:24:45 +00:00
Parse unicode escape sequences
This commit is contained in:
parent
b995ccef75
commit
5080a7e24b
4 changed files with 103 additions and 46 deletions
|
@ -609,6 +609,7 @@ pub enum Attempting {
|
||||||
Module,
|
Module,
|
||||||
Record,
|
Record,
|
||||||
Identifier,
|
Identifier,
|
||||||
|
HexDigit,
|
||||||
ConcreteType,
|
ConcreteType,
|
||||||
TypeVariable,
|
TypeVariable,
|
||||||
WhenCondition,
|
WhenCondition,
|
||||||
|
|
|
@ -445,6 +445,29 @@ pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// One or more ASCII hex digits. (Useful when parsing unicode escape codes,
|
||||||
|
/// which must consist entirely of ASCII hex digits.)
|
||||||
|
pub fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str> {
|
||||||
|
move |arena, state: State<'a>| {
|
||||||
|
let mut buf = bumpalo::collections::String::new_in(arena);
|
||||||
|
|
||||||
|
for &byte in state.bytes.iter() {
|
||||||
|
if (byte as char).is_ascii_hexdigit() {
|
||||||
|
buf.push(byte as char);
|
||||||
|
} else if buf.is_empty() {
|
||||||
|
// We didn't find any hex digits!
|
||||||
|
return Err(unexpected(0, state, Attempting::Keyword));
|
||||||
|
} else {
|
||||||
|
let state = state.advance_without_indenting(buf.len())?;
|
||||||
|
|
||||||
|
return Ok((buf.into_bump_str(), state));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(unexpected_eof(0, Attempting::HexDigit, state))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// A single UTF-8-encoded char. This will both parse *and* validate that the
|
/// A single UTF-8-encoded char. This will both parse *and* validate that the
|
||||||
/// char is valid UTF-8, but it will *not* advance the state.
|
/// char is valid UTF-8, but it will *not* advance the state.
|
||||||
pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<(char, usize), FailReason> {
|
pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<(char, usize), FailReason> {
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
use crate::ast::{Attempting, StrLiteral, StrSegment};
|
use crate::ast::{Attempting, StrLiteral, StrSegment};
|
||||||
use crate::expr;
|
use crate::expr;
|
||||||
use crate::parser::{
|
use crate::parser::{
|
||||||
allocated, ascii_char, loc, parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State,
|
allocated, ascii_char, ascii_hex_digits, loc, parse_utf8, unexpected, unexpected_eof,
|
||||||
|
ParseResult, Parser, State,
|
||||||
};
|
};
|
||||||
use bumpalo::collections::vec::Vec;
|
use bumpalo::collections::vec::Vec;
|
||||||
use bumpalo::Bump;
|
use bumpalo::Bump;
|
||||||
use roc_region::all::{Located, Region};
|
|
||||||
|
|
||||||
pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
|
pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
|
||||||
use StrLiteral::*;
|
use StrLiteral::*;
|
||||||
|
@ -176,38 +176,28 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
|
||||||
state = new_state;
|
state = new_state;
|
||||||
}
|
}
|
||||||
Some(b'u') => {
|
Some(b'u') => {
|
||||||
// This is an escaped unicode character
|
// Advance past the `\u` before using the hex digit parser
|
||||||
if let Some(b'(') = bytes.next() {
|
state = state.advance_without_indenting(2)?;
|
||||||
segment_parsed_bytes += 1;
|
|
||||||
} else {
|
let original_byte_count = state.bytes.len();
|
||||||
// Whenever we encounter `\u` it must be followed
|
|
||||||
// by a `(` char!
|
// Parse the hex digits, surrounded by parens, then
|
||||||
return Err(unexpected(0, state, Attempting::StrLiteral));
|
// give a canonicalization error if the digits form
|
||||||
|
// an invalid unicode code point.
|
||||||
|
let (loc_digits, new_state) =
|
||||||
|
between!(ascii_char('('), loc(ascii_hex_digits()), ascii_char(')'))
|
||||||
|
.parse(arena, state)?;
|
||||||
|
|
||||||
|
// Advance the iterator past the unicode escape we just parsed.
|
||||||
|
for _ in 0..(original_byte_count - new_state.bytes.len()) {
|
||||||
|
bytes.next();
|
||||||
}
|
}
|
||||||
|
|
||||||
while let Some(&byte) = bytes.next() {
|
segments.push(StrSegment::Unicode(loc_digits));
|
||||||
segment_parsed_bytes += 1;
|
|
||||||
|
|
||||||
if (byte as char).is_ascii_hexdigit() {
|
// Reset the segment
|
||||||
// This is the most common case.
|
segment_parsed_bytes = 0;
|
||||||
} else if byte == b')' {
|
state = new_state;
|
||||||
// Add the segment
|
|
||||||
end_segment!(|string: &'a str| {
|
|
||||||
let value = &string[0..string.len() - 1];
|
|
||||||
|
|
||||||
StrSegment::Unicode(Located {
|
|
||||||
region: Region::zero(), // TODO calculate the right region
|
|
||||||
value,
|
|
||||||
})
|
|
||||||
});
|
|
||||||
|
|
||||||
// We're done parsing digits now.
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
// Unicode escapes must all be digits!
|
|
||||||
return Err(unexpected(0, state, Attempting::StrLiteral));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Some(b'\\') => {
|
Some(b'\\') => {
|
||||||
escaped_char!('\\');
|
escaped_char!('\\');
|
||||||
|
|
|
@ -50,6 +50,15 @@ mod test_parse {
|
||||||
assert_eq!(Err(expected_fail), actual);
|
assert_eq!(Err(expected_fail), actual);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn assert_segments<E: Fn(&Bump) -> Vec<'_, ast::StrSegment<'_>>>(input: &str, to_expected: E) {
|
||||||
|
let arena = Bump::new();
|
||||||
|
let actual = parse_with(&arena, arena.alloc(input));
|
||||||
|
let expected_slice = to_expected(&arena).into_bump_slice();
|
||||||
|
let expected_expr = Expr::Str(Line(expected_slice));
|
||||||
|
|
||||||
|
assert_eq!(Ok(expected_expr), actual);
|
||||||
|
}
|
||||||
|
|
||||||
fn parses_with_escaped_char<
|
fn parses_with_escaped_char<
|
||||||
I: Fn(&str) -> String,
|
I: Fn(&str) -> String,
|
||||||
E: Fn(char, &Bump) -> Vec<'_, ast::StrSegment<'_>>,
|
E: Fn(char, &Bump) -> Vec<'_, ast::StrSegment<'_>>,
|
||||||
|
@ -162,23 +171,57 @@ mod test_parse {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// INTERPOLATION
|
// UNICODE ESCAPES
|
||||||
|
|
||||||
fn assert_interpolations<E: Fn(&Bump) -> Vec<'_, ast::StrSegment<'_>>>(
|
#[test]
|
||||||
input: &str,
|
fn unicode_escape_in_middle() {
|
||||||
to_expected: E,
|
assert_segments(r#""Hi, \u(123)!""#, |arena| {
|
||||||
) {
|
bumpalo::vec![in arena;
|
||||||
let arena = Bump::new();
|
Plaintext("Hi, "),
|
||||||
let actual = parse_with(&arena, arena.alloc(input));
|
Unicode(Located::new(0, 0, 8, 11, "123")),
|
||||||
let expected_slice = to_expected(&arena).into_bump_slice();
|
Plaintext("!")
|
||||||
let expected_expr = Expr::Str(Line(expected_slice));
|
]
|
||||||
|
});
|
||||||
assert_eq!(Ok(expected_expr), actual);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn unicode_escape_in_front() {
    // A string that opens with a `\u(...)` escape should yield a `Unicode`
    // segment holding the digits, followed by the remaining plaintext.
    let input = r#""\u(1234) is a unicode char""#;

    assert_segments(input, |arena| {
        bumpalo::vec![in arena;
            Unicode(Located::new(0, 0, 4, 8, "1234")),
            Plaintext(" is a unicode char")
        ]
    });
}
|
||||||
|
|
||||||
|
#[test]
fn unicode_escape_in_back() {
    // A string that ends with a `\u(...)` escape should yield the leading
    // plaintext followed by a `Unicode` segment holding the digits.
    let input = r#""this is unicode: \u(1)""#;

    assert_segments(input, |arena| {
        bumpalo::vec![in arena;
            Plaintext("this is unicode: "),
            Unicode(Located::new(0, 0, 21, 22, "1"))
        ]
    });
}
|
||||||
|
|
||||||
|
#[test]
fn unicode_escape_multiple() {
    // Several escapes in one string (with lowercase and mixed-case hex
    // digits) should alternate `Unicode` and `Plaintext` segments in order.
    let input = r#""\u(a1) this is \u(2Bcd) unicode \u(ef97)""#;

    assert_segments(input, |arena| {
        bumpalo::vec![in arena;
            Unicode(Located::new(0, 0, 4, 6, "a1")),
            Plaintext(" this is "),
            Unicode(Located::new(0, 0, 19, 23, "2Bcd")),
            Plaintext(" unicode "),
            Unicode(Located::new(0, 0, 36, 40, "ef97"))
        ]
    });
}
|
||||||
|
|
||||||
|
// INTERPOLATION
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn string_with_interpolation_in_middle() {
|
fn string_with_interpolation_in_middle() {
|
||||||
assert_interpolations(r#""Hi, \(name)!""#, |arena| {
|
assert_segments(r#""Hi, \(name)!""#, |arena| {
|
||||||
let expr = arena.alloc(Var {
|
let expr = arena.alloc(Var {
|
||||||
module_name: "",
|
module_name: "",
|
||||||
ident: "name",
|
ident: "name",
|
||||||
|
@ -194,7 +237,7 @@ mod test_parse {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn string_with_interpolation_in_front() {
|
fn string_with_interpolation_in_front() {
|
||||||
assert_interpolations(r#""\(name), hi!""#, |arena| {
|
assert_segments(r#""\(name), hi!""#, |arena| {
|
||||||
let expr = arena.alloc(Var {
|
let expr = arena.alloc(Var {
|
||||||
module_name: "",
|
module_name: "",
|
||||||
ident: "name",
|
ident: "name",
|
||||||
|
@ -209,7 +252,7 @@ mod test_parse {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn string_with_interpolation_in_back() {
|
fn string_with_interpolation_in_back() {
|
||||||
assert_interpolations(r#""Hello \(name)""#, |arena| {
|
assert_segments(r#""Hello \(name)""#, |arena| {
|
||||||
let expr = arena.alloc(Var {
|
let expr = arena.alloc(Var {
|
||||||
module_name: "",
|
module_name: "",
|
||||||
ident: "name",
|
ident: "name",
|
||||||
|
@ -224,7 +267,7 @@ mod test_parse {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn string_with_multiple_interpolations() {
|
fn string_with_multiple_interpolations() {
|
||||||
assert_interpolations(r#""Hi, \(name)! How is \(project) going?""#, |arena| {
|
assert_segments(r#""Hi, \(name)! How is \(project) going?""#, |arena| {
|
||||||
let expr1 = arena.alloc(Var {
|
let expr1 = arena.alloc(Var {
|
||||||
module_name: "",
|
module_name: "",
|
||||||
ident: "name",
|
ident: "name",
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue