Parse unicode escape sequences

This commit is contained in:
Richard Feldman 2020-08-29 14:14:16 -04:00
parent b995ccef75
commit 5080a7e24b
4 changed files with 103 additions and 46 deletions

View file

@ -609,6 +609,7 @@ pub enum Attempting {
Module,
Record,
Identifier,
HexDigit,
ConcreteType,
TypeVariable,
WhenCondition,

View file

@ -445,6 +445,29 @@ pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
}
}
/// One or more ASCII hex digits. (Useful when parsing unicode escape codes,
/// which must consist entirely of ASCII hex digits.)
///
/// Scans the input bytes from the current position, collecting them for as
/// long as they are ASCII hex digits (`0-9`, `a-f`, `A-F`). When the first
/// non-hex byte is reached:
///
/// * if at least one digit was collected, the state is advanced past the
///   digits and they are returned as an arena-allocated string;
/// * otherwise parsing fails with an "unexpected" error tagged
///   `Attempting::HexDigit`.
///
/// If the input runs out before any non-hex byte is seen (including when the
/// input is empty or consists solely of hex digits), parsing fails with an
/// unexpected-EOF error tagged `Attempting::HexDigit`.
pub fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str> {
    move |arena, state: State<'a>| {
        let mut buf = bumpalo::collections::String::new_in(arena);

        for &byte in state.bytes.iter() {
            if byte.is_ascii_hexdigit() {
                buf.push(byte as char);
            } else if buf.is_empty() {
                // We didn't find any hex digits! Tag the failure as a hex
                // digit attempt, matching the EOF error at the end of this
                // parser.
                return Err(unexpected(0, state, Attempting::HexDigit));
            } else {
                // Every char pushed so far is ASCII (one byte each), so the
                // buffer's byte length is the number of input bytes consumed.
                let state = state.advance_without_indenting(buf.len())?;

                return Ok((buf.into_bump_str(), state));
            }
        }

        // Ran out of input without seeing a byte that terminates the digits.
        Err(unexpected_eof(0, Attempting::HexDigit, state))
    }
}
/// A single UTF-8-encoded char. This will both parse *and* validate that the
/// char is valid UTF-8, but it will *not* advance the state.
pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<(char, usize), FailReason> {

View file

@ -1,11 +1,11 @@
use crate::ast::{Attempting, StrLiteral, StrSegment};
use crate::expr;
use crate::parser::{
allocated, ascii_char, loc, parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State,
allocated, ascii_char, ascii_hex_digits, loc, parse_utf8, unexpected, unexpected_eof,
ParseResult, Parser, State,
};
use bumpalo::collections::vec::Vec;
use bumpalo::Bump;
use roc_region::all::{Located, Region};
pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
use StrLiteral::*;
@ -176,38 +176,28 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
state = new_state;
}
Some(b'u') => {
// This is an escaped unicode character
if let Some(b'(') = bytes.next() {
segment_parsed_bytes += 1;
} else {
// Whenever we encounter `\u` it must be followed
// by a `(` char!
return Err(unexpected(0, state, Attempting::StrLiteral));
// Advance past the `\u` before using the expr parser
state = state.advance_without_indenting(2)?;
let original_byte_count = state.bytes.len();
// Parse the hex digits, surrounded by parens, then
// give a canonicalization error if the digits form
// an invalid unicode code point.
let (loc_digits, new_state) =
between!(ascii_char('('), loc(ascii_hex_digits()), ascii_char(')'))
.parse(arena, state)?;
// Advance the iterator past the expr we just parsed.
for _ in 0..(original_byte_count - new_state.bytes.len()) {
bytes.next();
}
while let Some(&byte) = bytes.next() {
segment_parsed_bytes += 1;
segments.push(StrSegment::Unicode(loc_digits));
if (byte as char).is_ascii_hexdigit() {
// This is the most common case.
} else if byte == b')' {
// Add the segment
end_segment!(|string: &'a str| {
let value = &string[0..string.len() - 1];
StrSegment::Unicode(Located {
region: Region::zero(), // TODO calculate the right region
value,
})
});
// We're done parsing digits now.
break;
} else {
// Unicode escapes must all be digits!
return Err(unexpected(0, state, Attempting::StrLiteral));
}
}
// Reset the segment
segment_parsed_bytes = 0;
state = new_state;
}
Some(b'\\') => {
escaped_char!('\\');