Canonicalize unicode code point escapes

This commit is contained in:
Richard Feldman 2020-08-30 22:21:19 -04:00
parent 7682e09b0a
commit 2e15443c8c
4 changed files with 125 additions and 61 deletions

View file

@ -22,7 +22,7 @@ use roc_region::all::{Located, Region};
use roc_types::subs::{VarStore, Variable}; use roc_types::subs::{VarStore, Variable};
use roc_types::types::Alias; use roc_types::types::Alias;
use std::fmt::Debug; use std::fmt::Debug;
use std::i64; use std::{char, i64, u32};
#[derive(Clone, Default, Debug, PartialEq)] #[derive(Clone, Default, Debug, PartialEq)]
pub struct Output { pub struct Output {
@ -1359,7 +1359,7 @@ fn flatten_str_lines<'a>(
use ast::StrSegment::*; use ast::StrSegment::*;
let mut buf = String::new(); let mut buf = String::new();
let mut interpolations = Vec::new(); let mut segments = Vec::new();
let mut output = Output::default(); let mut output = Output::default();
for line in lines { for line in lines {
@ -1368,11 +1368,41 @@ fn flatten_str_lines<'a>(
Plaintext(string) => { Plaintext(string) => {
buf.push_str(string); buf.push_str(string);
} }
Unicode(loc_digits) => { Unicode(loc_hex_digits) => match u32::from_str_radix(loc_hex_digits.value, 16) {
todo!("parse unicode digits {:?}", loc_digits); Ok(code_pt) => match char::from_u32(code_pt) {
} Some(ch) => {
buf.push(ch);
}
None => {
env.problem(Problem::InvalidUnicodeCodePoint(loc_hex_digits.region));
return (
Expr::RuntimeError(RuntimeError::InvalidUnicodeCodePoint(
loc_hex_digits.region,
)),
output,
);
}
},
Err(_) => {
env.problem(Problem::InvalidHexadecimal(loc_hex_digits.region));
return (
Expr::RuntimeError(RuntimeError::InvalidHexadecimal(
loc_hex_digits.region,
)),
output,
);
}
},
Interpolated(loc_expr) => { Interpolated(loc_expr) => {
if is_valid_interpolation(loc_expr.value) { if is_valid_interpolation(loc_expr.value) {
if !buf.is_empty() {
segments.push(StrSegment::Plaintext(buf.into()));
buf = String::new();
}
let (loc_expr, new_output) = canonicalize_expr( let (loc_expr, new_output) = canonicalize_expr(
env, env,
var_store, var_store,
@ -1383,7 +1413,7 @@ fn flatten_str_lines<'a>(
output.union(new_output); output.union(new_output);
interpolations.push(StrSegment::Interpolation(loc_expr)); segments.push(StrSegment::Interpolation(loc_expr));
} else { } else {
env.problem(Problem::InvalidInterpolation(loc_expr.region)); env.problem(Problem::InvalidInterpolation(loc_expr.region));
@ -1398,7 +1428,11 @@ fn flatten_str_lines<'a>(
} }
} }
(Expr::Str(interpolations), output) if !buf.is_empty() {
segments.push(StrSegment::Plaintext(buf.into()));
}
(Expr::Str(segments), output)
} }
/// Returns the char that would have been originally parsed to /// Returns the char that would have been originally parsed to

View file

@ -15,7 +15,7 @@ mod test_can {
use crate::helpers::{can_expr_with, test_home, CanExprOut}; use crate::helpers::{can_expr_with, test_home, CanExprOut};
use bumpalo::Bump; use bumpalo::Bump;
use roc_can::expr::Expr::{self, *}; use roc_can::expr::Expr::{self, *};
use roc_can::expr::Recursive; use roc_can::expr::{Recursive, StrSegment};
use roc_problem::can::{FloatErrorKind, IntErrorKind, Problem, RuntimeError}; use roc_problem::can::{FloatErrorKind, IntErrorKind, Problem, RuntimeError};
use roc_region::all::Region; use roc_region::all::Region;
use std::{f64, i64}; use std::{f64, i64};
@ -69,6 +69,10 @@ mod test_can {
} }
} }
/// Test helper: wraps `contents` in a single `StrSegment::Plaintext` and
/// returns it as an `Expr::Str`, for use as the expected value in assertions.
fn expr_str(contents: &str) -> Expr {
    let plaintext = StrSegment::Plaintext(contents.into());

    Expr::Str(vec![plaintext])
}
// NUMBER LITERALS // NUMBER LITERALS
#[test] #[test]
@ -1179,62 +1183,61 @@ mod test_can {
//} //}
// //
// //
//// STRING LITERALS // STRING LITERALS
// #[test]
// #[test] fn string_with_valid_unicode_escapes() {
// fn string_with_valid_unicode_escapes() { assert_can(r#""x\u(00A0)x""#, expr_str("x\u{00A0}x"));
// expect_parsed_str("x\u{00A0}x", r#""x\u{00A0}x""#); assert_can(r#""x\u(101010)x""#, expr_str("x\u{101010}x"));
// expect_parsed_str("x\u{101010}x", r#""x\u{101010}x""#); }
// }
// #[test] // #[test]
// fn string_with_too_large_unicode_escape() { // fn string_with_too_large_unicode_escape() {
// // Should be too big - max size should be 10FFFF. // // Should be too big - max size should be 10FFFF.
// // (Rust has this restriction. I assume it's a good idea.) // // (Rust has this restriction. I assume it's a good idea.)
// assert_malformed_str( // assert_malformed_str(
// r#""abc\u{110000}def""#, // r#""abc\u{110000}def""#,
// vec![Located::new(0, 7, 0, 12, Problem::UnicodeCodePointTooLarge)], // vec![Located::new(0, 7, 0, 12, Problem::UnicodeCodePointTooLarge)],
// ); // );
// } // }
// #[test] // #[test]
// fn string_with_no_unicode_digits() { // fn string_with_no_unicode_digits() {
// // No digits specified // // No digits specified
// assert_malformed_str( // assert_malformed_str(
// r#""blah\u{}foo""#, // r#""blah\u{}foo""#,
// vec![Located::new(0, 5, 0, 8, Problem::NoUnicodeDigits)], // vec![Located::new(0, 5, 0, 8, Problem::NoUnicodeDigits)],
// ); // );
// } // }
// #[test] // #[test]
// fn string_with_no_unicode_opening_brace() { // fn string_with_no_unicode_opening_brace() {
// // No opening curly brace. It can't be sure if the closing brace // // No opening curly brace. It can't be sure if the closing brace
// // was intended to be a closing brace for the unicode escape, so // // was intended to be a closing brace for the unicode escape, so
// // report that there were no digits specified. // // report that there were no digits specified.
// assert_malformed_str( // assert_malformed_str(
// r#""abc\u00A0}def""#, // r#""abc\u00A0}def""#,
// vec![Located::new(0, 4, 0, 5, Problem::NoUnicodeDigits)], // vec![Located::new(0, 4, 0, 5, Problem::NoUnicodeDigits)],
// ); // );
// } // }
// #[test] // #[test]
// fn string_with_no_unicode_closing_brace() { // fn string_with_no_unicode_closing_brace() {
// // No closing curly brace // // No closing curly brace
// assert_malformed_str( // assert_malformed_str(
// r#""blah\u{stuff""#, // r#""blah\u{stuff""#,
// vec![Located::new(0, 5, 0, 12, Problem::MalformedEscapedUnicode)], // vec![Located::new(0, 5, 0, 12, Problem::MalformedEscapedUnicode)],
// ); // );
// } // }
// #[test] // #[test]
// fn string_with_no_unicode_braces() { // fn string_with_no_unicode_braces() {
// // No curly braces // // No curly braces
// assert_malformed_str( // assert_malformed_str(
// r#""zzzz\uzzzzz""#, // r#""zzzz\uzzzzz""#,
// vec![Located::new(0, 5, 0, 6, Problem::NoUnicodeDigits)], // vec![Located::new(0, 5, 0, 6, Problem::NoUnicodeDigits)],
// ); // );
// } // }
// #[test] // #[test]
// fn string_with_escaped_interpolation() { // fn string_with_escaped_interpolation() {
@ -1242,13 +1245,12 @@ mod test_can {
// // This should NOT be string interpolation, because of the \\ // // This should NOT be string interpolation, because of the \\
// indoc!( // indoc!(
// r#" // r#"
// "abcd\\(efg)hij" // "abcd\\(efg)hij"
// "# // "#
// ), // ),
// Str(r#"abcd\(efg)hij"#.into()), // Str(r#"abcd\(efg)hij"#.into()),
// ); // );
// } // }
//
// #[test] // #[test]
// fn string_without_escape() { // fn string_without_escape() {

View file

@ -56,6 +56,8 @@ pub enum Problem {
region: Region, region: Region,
}, },
InvalidInterpolation(Region), InvalidInterpolation(Region),
InvalidHexadecimal(Region),
InvalidUnicodeCodePoint(Region),
} }
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq)]
@ -127,6 +129,8 @@ pub enum RuntimeError {
NonExhaustivePattern, NonExhaustivePattern,
InvalidInterpolation(Region), InvalidInterpolation(Region),
InvalidHexadecimal(Region),
InvalidUnicodeCodePoint(Region),
/// When the author specifies a type annotation but no implementation /// When the author specifies a type annotation but no implementation
NoImplementation, NoImplementation,

View file

@ -262,6 +262,18 @@ pub fn can_problem<'b>(
alloc.reflow(" can occur in this position."), alloc.reflow(" can occur in this position."),
]), ]),
]), ]),
Problem::InvalidHexadecimal(region) => {
todo!(
"TODO report an invalid hexadecimal number in a \\u(...) code point at region {:?}",
region
);
}
Problem::InvalidUnicodeCodePoint(region) => {
todo!(
"TODO report an invalid \\u(...) code point at region {:?}",
region
);
}
Problem::InvalidInterpolation(region) => { Problem::InvalidInterpolation(region) => {
todo!( todo!(
"TODO report an invalid string interpolation at region {:?}", "TODO report an invalid string interpolation at region {:?}",
@ -530,6 +542,18 @@ fn pretty_runtime_error<'b>(
alloc.region(region), alloc.region(region),
alloc.reflow("Only variables can be updated with record update syntax."), alloc.reflow("Only variables can be updated with record update syntax."),
]), ]),
RuntimeError::InvalidHexadecimal(region) => {
todo!(
"TODO runtime error for an invalid hexadecimal number in a \\u(...) code point at region {:?}",
region
);
}
RuntimeError::InvalidUnicodeCodePoint(region) => {
todo!(
"TODO runtime error for an invalid \\u(...) code point at region {:?}",
region
);
}
RuntimeError::InvalidInterpolation(region) => { RuntimeError::InvalidInterpolation(region) => {
todo!( todo!(
"TODO runtime error for an invalid string interpolation at region {:?}", "TODO runtime error for an invalid string interpolation at region {:?}",