share literal validation logic with compiler

This commit is contained in:
Aleksey Kladov 2019-05-07 19:38:26 +03:00
parent ef782adc29
commit 313314e14b
10 changed files with 620 additions and 1201 deletions

View file

@ -1,199 +0,0 @@
//! Validation of byte literals
use crate::{
string_lexing::{self, StringComponentKind},
TextRange,
validation::char,
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
/// Validates a byte literal token (`b'…'`), appending every problem found to `errors`.
///
/// Each component yielded by the quoted-literal lexer is validated on its own;
/// afterwards the literal as a whole is checked for a missing closing quote, a
/// suffix, and for containing exactly one component.
pub(super) fn validate_byte_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let mut components = string_lexing::parse_quoted_literal(Some('b'), '\'', literal_text);

    let mut component_count = 0;
    for component in &mut components {
        component_count += 1;
        // Component ranges index into `literal_text`; offset them by the literal's
        // own start before attaching them to errors.
        let error_range = component.range + literal_range.start();
        let component_text = &literal_text[component.range];
        validate_byte_component(component_text, component.kind, error_range, errors);
    }

    if !components.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedByte, literal_range));
    }
    if let Some(suffix_range) = components.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix_range + literal_range.start()));
    }
    match component_count {
        0 => errors.push(SyntaxError::new(EmptyByte, literal_range)),
        1 => { /* exactly one component: well-formed length */ }
        _ => errors.push(SyntaxError::new(OverlongByte, literal_range)),
    }
}
/// Validates one component of a byte or byte-string literal.
///
/// `text` is the component's source text, `kind` its lexer classification, and
/// `range` the location attached to any error pushed onto `errors`.
///
/// # Panics
///
/// Panics if a `CodePoint` component has empty `text`.
pub(super) fn validate_byte_component(
    text: &str,
    kind: StringComponentKind,
    range: TextRange,
    errors: &mut Vec<SyntaxError>,
) {
    use self::StringComponentKind::*;
    match kind {
        IgnoreNewline => { /* always valid */ }
        UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)),
        AsciiCodeEscape => validate_byte_code_escape(text, range, errors),
        AsciiEscape => validate_byte_escape(text, range, errors),
        CodePoint => {
            let c = text.chars().next().expect("Code points should be one character long");
            // Raw tab, carriage return and newline are only accepted in escaped form.
            match c {
                '\t' | '\r' | '\n' => errors.push(SyntaxError::new(UnescapedByte, range)),
                _ => {}
            }
            // A byte literal can only hold an ASCII character.
            if !c.is_ascii() {
                errors.push(SyntaxError::new(ByteOutOfRange, range));
            }
        }
    }
}
/// Validates a simple backslash escape (such as `\n`) inside a byte literal.
///
/// Reports `EmptyByteEscape` when `text` is just the backslash, and
/// `InvalidByteEscape` when the character after it is not a recognised escape code.
fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    // Escape sequence consists only of leading `\`
    if text.len() == 1 {
        errors.push(SyntaxError::new(EmptyByteEscape, range));
        return;
    }

    let escape_code = text.chars().nth(1).unwrap();
    if !char::is_ascii_escape(escape_code) {
        errors.push(SyntaxError::new(InvalidByteEscape, range));
    }
}
/// Validates a `\xDD` escape inside a byte literal.
///
/// Pushes `MalformedByteCodeEscape` when `text` contains non-ASCII characters or
/// when either of the two characters after `\x` is not a hexadecimal digit, and
/// `TooShortByteCodeEscape` when fewer than four characters are present.
///
/// # Panics
///
/// Panics if `text` is ASCII and longer than four characters.
fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    // A ByteCodeEscape has 4 chars, example: `\xDD`
    if !text.is_ascii() {
        errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
    } else if text.len() < 4 {
        // `text` is ASCII here, so its byte length equals its character count.
        errors.push(SyntaxError::new(TooShortByteCodeEscape, range));
    } else {
        assert!(text.len() == 4, "ByteCodeEscape cannot be longer than 4 chars");
        // Check the digits one by one instead of relying on `u8::from_str_radix`:
        // that parser also accepts a leading `+`, so `\x+7` would be treated as valid.
        // Two hexadecimal digits always fit in a `u8`, so no range check is needed.
        if !text[2..].chars().all(|c| c.is_digit(16)) {
            errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
        }
    }
}
// Tests for byte literal validation. Every case is embedded in a small constant
// declaration, parsed as a whole file, and judged by whether any errors are reported.
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
/// Parses `const C: u8 = b'<literal>';` with `literal` spliced in verbatim.
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!("const C: u8 = b'{}';", literal);
SourceFile::parse(&src)
}
/// Asserts that the file built around `literal` has no errors at all.
fn assert_valid_byte(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
/// Asserts that the file built around `literal` has at least one error.
fn assert_invalid_byte(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
/// Every raw ASCII character is accepted except tab, CR and LF; the quote and the
/// backslash are skipped. Characters U+0080..=U+00FF are rejected.
#[test]
fn test_ansi_codepoints() {
for byte in 0..128 {
match byte {
b'\n' | b'\r' | b'\t' => assert_invalid_byte(&(byte as char).to_string()),
b'\'' | b'\\' => { /* Ignore character close and backslash */ }
_ => assert_valid_byte(&(byte as char).to_string()),
}
}
// `byte as char` maps 128..=255 to the code points U+0080..=U+00FF, which are not ASCII.
for byte in 128..=255u8 {
assert_invalid_byte(&(byte as char).to_string());
}
}
/// Each listed literal body is expected to be rejected.
#[test]
fn test_unicode_codepoints() {
let invalid = ["Ƒ", "", "", ""];
for c in &invalid {
assert_invalid_byte(c);
}
}
/// Literal bodies made of several code points are rejected.
#[test]
fn test_unicode_multiple_codepoints() {
let invalid = ["नी", "👨‍👨‍"];
for c in &invalid {
assert_invalid_byte(c);
}
}
/// The simple escapes (and a raw double quote) are accepted.
#[test]
fn test_valid_byte_escape() {
let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
for c in &valid {
assert_valid_byte(c);
}
}
/// Unknown escape codes and a lone backslash are rejected.
#[test]
fn test_invalid_byte_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_byte(c);
}
}
/// `\x` escapes are accepted for byte literals, including values above 0x7F (`\xF0`).
#[test]
fn test_valid_byte_code_escape() {
let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
for c in &valid {
assert_valid_byte(c);
}
}
/// `\x` escapes with fewer than two digits are rejected.
#[test]
fn test_invalid_byte_code_escape() {
let invalid = [r"\x", r"\x7"];
for c in &invalid {
assert_invalid_byte(c);
}
}
/// Unicode escapes are rejected in byte literals whether or not they are well formed.
#[test]
fn test_invalid_unicode_escape() {
let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &well_formed {
assert_invalid_byte(c);
}
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_byte(c);
}
}
}

View file

@ -1,169 +0,0 @@
use crate::{
string_lexing::{self, StringComponentKind},
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
use super::byte;
/// Validates a byte-string literal token (`b"…"`), appending every problem found
/// to `errors`.
///
/// Components are checked with the byte-literal rules, except that raw tab, newline
/// and carriage return are accepted. A missing closing quote and a literal suffix
/// are reported afterwards.
pub(crate) fn validate_byte_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let mut components = string_lexing::parse_quoted_literal(Some('b'), '"', literal_text);

    for component in &mut components {
        let range = component.range + literal_range.start();
        if let StringComponentKind::IgnoreNewline = component.kind {
            // always valid
            continue;
        }
        // Chars must escape \t, \n and \r codepoints, but strings don't
        let text = &literal_text[component.range];
        let is_raw_whitespace = text == "\t" || text == "\n" || text == "\r";
        if !is_raw_whitespace {
            byte::validate_byte_component(text, component.kind, range, errors);
        }
    }

    if !components.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedString, literal_range));
    }
    if let Some(suffix_range) = components.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix_range + literal_range.start()));
    }
}
// Tests for byte-string literal validation. Every case is embedded in a small
// constant declaration, parsed as a whole file, and judged by whether any errors
// are reported.
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
/// Parses `const S: &'static [u8] = b"<literal>";` with `literal` spliced in
/// verbatim, printing the generated source first.
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal);
println!("Source: {}", src);
SourceFile::parse(&src)
}
/// Asserts that the file built around `literal` has no errors at all.
fn assert_valid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
/// Asserts that the file built around `literal` has at least one error.
fn assert_invalid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
/// Every raw ASCII character is accepted (the double quote and the backslash are
/// skipped); characters U+0080..=U+00FF are rejected.
#[test]
fn test_ansi_codepoints() {
for byte in 0..128 {
match byte {
b'\"' | b'\\' => { /* Ignore string close and backslash */ }
_ => assert_valid_str(&(byte as char).to_string()),
}
}
// `byte as char` maps 128..=255 to the code points U+0080..=U+00FF, which are not ASCII.
for byte in 128..=255u8 {
assert_invalid_str(&(byte as char).to_string());
}
}
/// Each listed literal body is expected to be rejected.
#[test]
fn test_unicode_codepoints() {
let invalid = ["Ƒ", "", "", ""];
for c in &invalid {
assert_invalid_str(c);
}
}
/// Bodies made of several non-ASCII code points are rejected.
#[test]
fn test_unicode_multiple_codepoints() {
let invalid = ["नी", "👨‍👨‍"];
for c in &invalid {
assert_invalid_str(c);
}
}
/// The simple escapes and plain ASCII letters are accepted.
#[test]
fn test_valid_ascii_escape() {
let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
for c in &valid {
assert_valid_str(c);
}
}
/// Unknown escape codes and a trailing lone backslash are rejected.
#[test]
fn test_invalid_ascii_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_str(c);
}
}
/// `\x` escapes are accepted for byte strings, including values above 0x7F (`\xF0`).
#[test]
fn test_valid_ascii_code_escape() {
let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
for c in &valid {
assert_valid_str(c);
}
}
/// `\x` escapes with fewer than two digits are rejected.
#[test]
fn test_invalid_ascii_code_escape() {
let invalid = [r"\x", r"\x7"];
for c in &invalid {
assert_invalid_str(c);
}
}
/// Unicode escapes are rejected in byte strings whether or not they are well formed.
#[test]
fn test_invalid_unicode_escape() {
let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &well_formed {
assert_invalid_str(c);
}
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_str(c);
}
}
/// A multi-line body containing emoji and a unicode escape is rejected.
#[test]
fn test_mixed_invalid() {
assert_invalid_str(
r"This is the tale of a string
with a newline in between, some emoji (👨👨) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ",
);
}
/// A multi-line, ASCII-only body is accepted: raw newlines are fine in byte strings.
#[test]
fn test_mixed_valid() {
assert_valid_str(
r"This is the tale of a string
with a newline in between, no emoji at all,
nor unicode escapes or weird stuff",
);
}
/// NOTE(review): the `\`-newline below sits in a non-raw Rust string, so the compiler
/// removes it before the text reaches the parser under test; this case presumably
/// does not exercise `IgnoreNewline` — confirm.
#[test]
fn test_ignore_newline() {
assert_valid_str(
"Hello \
World",
);
}
}

View file

@ -1,273 +0,0 @@
//! Validation of char literals
use std::u32;
use arrayvec::ArrayString;
use crate::{
string_lexing::{self, StringComponentKind},
TextRange,
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
/// Validates a char literal token (`'…'`), appending every problem found to `errors`.
///
/// Each component yielded by the quoted-literal lexer is validated on its own;
/// afterwards the literal as a whole is checked for a missing closing quote, a
/// suffix, and for containing exactly one component.
pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text);

    let mut component_count = 0;
    for component in &mut components {
        component_count += 1;
        // Component ranges index into `literal_text`; offset them by the literal's
        // own start before attaching them to errors.
        let error_range = component.range + literal_range.start();
        let component_text = &literal_text[component.range];
        validate_char_component(component_text, component.kind, error_range, errors);
    }

    if !components.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedChar, literal_range));
    }
    if let Some(suffix_range) = components.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix_range + literal_range.start()));
    }
    match component_count {
        0 => errors.push(SyntaxError::new(EmptyChar, literal_range)),
        1 => { /* exactly one component: well-formed length */ }
        _ => errors.push(SyntaxError::new(OverlongChar, literal_range)),
    }
}
/// Validates one component of a char or string literal.
///
/// Escape components are forwarded to the matching escape validator. A plain code
/// point is rejected only when it is a raw tab, carriage return or newline.
pub(super) fn validate_char_component(
    text: &str,
    kind: StringComponentKind,
    range: TextRange,
    errors: &mut Vec<SyntaxError>,
) {
    use self::StringComponentKind::*;
    match kind {
        IgnoreNewline => { /* always valid */ }
        CodePoint => match text {
            // These code points must always be escaped
            "\t" | "\r" | "\n" => errors.push(SyntaxError::new(UnescapedCodepoint, range)),
            _ => {}
        },
        // Validate escapes
        AsciiEscape => validate_ascii_escape(text, range, errors),
        AsciiCodeEscape => validate_ascii_code_escape(text, range, errors),
        UnicodeEscape => validate_unicode_escape(text, range, errors),
    }
}
/// Validates a simple backslash escape (such as `\n`) inside a char or string literal.
///
/// Reports `EmptyAsciiEscape` when `text` is just the backslash, and
/// `InvalidAsciiEscape` when the character after it is not a recognised escape code.
fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    // Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. '\' is
    // treated as an unclosed char containing a single quote `'`)
    if text.len() == 1 {
        errors.push(SyntaxError::new(EmptyAsciiEscape, range));
        return;
    }

    let escape_code = text.chars().nth(1).unwrap();
    if !is_ascii_escape(escape_code) {
        errors.push(SyntaxError::new(InvalidAsciiEscape, range));
    }
}
/// Returns `true` if `code` is a character that may follow a backslash in a simple
/// escape: `\\`, `\'`, `\"`, `\n`, `\r`, `\t` or `\0`.
pub(super) fn is_ascii_escape(code: char) -> bool {
    const ESCAPE_CODES: [char; 7] = ['\\', '\'', '"', 'n', 'r', 't', '0'];
    ESCAPE_CODES.iter().any(|&known| known == code)
}
/// Validates a `\xDD` escape inside a char or string literal.
///
/// Pushes `AsciiCodeEscapeOutOfRange` when `text` contains non-ASCII characters or
/// the escaped value is 128 or above, `TooShortAsciiCodeEscape` when fewer than four
/// characters are present, and `MalformedAsciiCodeEscape` when either character
/// after `\x` is not a hexadecimal digit.
///
/// # Panics
///
/// Panics if `text` is ASCII and longer than four characters.
fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    // An AsciiCodeEscape has 4 chars, example: `\xDD`
    if !text.is_ascii() {
        // FIXME: Give a more precise error message (say what the invalid character was)
        errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range));
        return;
    }

    let char_count = text.chars().count();
    if char_count < 4 {
        errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
        return;
    }
    assert_eq!(
        char_count,
        4,
        "AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is",
        text,
    );

    let digits = &text[2..];
    // `u8::from_str_radix` also accepts a leading `+`, so `\x+7` would otherwise be
    // treated as a valid escape. Require real hexadecimal digits up front.
    if !digits.chars().all(|c| c.is_digit(16)) {
        errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range));
        return;
    }
    match u8::from_str_radix(digits, 16) {
        Ok(code) if code < 128 => { /* Escape code is valid */ }
        Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
        Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
    }
}
/// Validates a `\u{…}` escape, pushing at most a couple of errors onto `errors`.
///
/// `text` is the whole escape starting at the backslash. Hex digits between the
/// braces are collected (underscores are skipped, but a leading underscore is an
/// error), and the resulting value must not exceed `0x10FFFF`.
///
/// # Panics
///
/// Panics if `text` does not start with `\u`, or if any character follows the
/// closing `}`.
///
/// NOTE(review): values in the surrogate range (`0xD800..=0xDFFF`) pass the range
/// check below even though they are not valid `char`s — confirm whether that is
/// intended.
fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");
if text.len() == 2 {
// No starting `{`
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
return;
}
if text.len() == 3 {
// Only starting `{`
errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
return;
}
// Holds at most six hex digits, the maximum length of a code point in an escape.
let mut code = ArrayString::<[_; 6]>::new();
let mut closed = false;
// Skip the three bytes before the digits (`\u` plus what is presumably `{`; the
// third byte itself is never inspected here).
for c in text[3..].chars() {
assert!(!closed, "no characters after escape is closed");
if c.is_digit(16) {
// A seventh digit would not fit into `code`.
if code.len() == 6 {
errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
return;
}
code.push(c);
} else if c == '_' {
// Reject leading _
if code.len() == 0 {
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
return;
}
} else if c == '}' {
closed = true;
} else {
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
return;
}
}
// No early return here: an unclosed escape still has its digits checked below.
if !closed {
errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
}
if code.len() == 0 {
errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
return;
}
match u32::from_str_radix(&code, 16) {
Ok(code_u32) if code_u32 > 0x10FFFF => {
errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
}
Ok(_) => {
// Valid escape code
}
Err(_) => {
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
}
}
}
// Tests for char literal validation. Every case is embedded in a small constant
// declaration, parsed as a whole file, and judged by whether any errors are reported.
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
/// Parses `const C: char = '<literal>';` with `literal` spliced in verbatim.
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!("const C: char = '{}';", literal);
SourceFile::parse(&src)
}
/// Asserts that the file built around `literal` has no errors at all.
fn assert_valid_char(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
/// Asserts that the file built around `literal` has at least one error.
fn assert_invalid_char(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
/// Every character U+0000..=U+00FF is accepted raw except tab, CR and LF; the quote
/// and the backslash are skipped.
#[test]
fn test_ansi_codepoints() {
for byte in 0..=255u8 {
match byte {
b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()),
b'\'' | b'\\' => { /* Ignore character close and backslash */ }
_ => assert_valid_char(&(byte as char).to_string()),
}
}
}
/// Each listed literal body is expected to be accepted.
#[test]
fn test_unicode_codepoints() {
let valid = ["Ƒ", "", "", ""];
for c in &valid {
assert_valid_char(c);
}
}
/// Bodies made of several code points are rejected.
#[test]
fn test_unicode_multiple_codepoints() {
let invalid = ["नी", "👨‍👨‍"];
for c in &invalid {
assert_invalid_char(c);
}
}
/// The simple escapes (and a raw double quote) are accepted.
#[test]
fn test_valid_ascii_escape() {
let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
for c in &valid {
assert_valid_char(c);
}
}
/// Unknown escape codes and a lone backslash are rejected.
#[test]
fn test_invalid_ascii_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_char(c);
}
}
/// `\x` escapes up to `\x7F` are accepted.
#[test]
fn test_valid_ascii_code_escape() {
let valid = [r"\x00", r"\x7F", r"\x55"];
for c in &valid {
assert_valid_char(c);
}
}
/// Short `\x` escapes and values above 0x7F (`\xF0`) are rejected.
#[test]
fn test_invalid_ascii_code_escape() {
let invalid = [r"\x", r"\x7", r"\xF0"];
for c in &invalid {
assert_invalid_char(c);
}
}
/// Well-formed unicode escapes are accepted, including underscores after the first digit.
#[test]
fn test_valid_unicode_escape() {
let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &valid {
assert_valid_char(c);
}
}
/// Missing braces, empty or unclosed escapes, a leading underscore, more than six
/// digits and values above 0x10FFFF are all rejected.
#[test]
fn test_invalid_unicode_escape() {
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_char(c);
}
}
}

View file

@ -1,154 +0,0 @@
use crate::{
string_lexing,
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
use super::char;
/// Validates a string literal token (`"…"`), appending every problem found to `errors`.
///
/// Components are checked with the char-literal rules, except that raw tab, newline
/// and carriage return are accepted. A missing closing quote and a literal suffix
/// are reported afterwards.
pub(crate) fn validate_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let mut components = string_lexing::parse_quoted_literal(None, '"', literal_text);

    for component in &mut components {
        let range = component.range + literal_range.start();
        let text = &literal_text[component.range];
        // Chars must escape \t, \n and \r codepoints, but strings don't
        let is_raw_whitespace = text == "\t" || text == "\n" || text == "\r";
        if !is_raw_whitespace {
            char::validate_char_component(text, component.kind, range, errors);
        }
    }

    if !components.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedString, literal_range));
    }
    if let Some(suffix_range) = components.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix_range + literal_range.start()));
    }
}
// Tests for string literal validation. Every case is embedded in a small constant
// declaration, parsed as a whole file, and judged by whether any errors are reported.
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
/// Parses `const S: &'static str = "<literal>";` with `literal` spliced in verbatim,
/// printing the generated source first.
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!(r#"const S: &'static str = "{}";"#, literal);
println!("Source: {}", src);
SourceFile::parse(&src)
}
/// Asserts that the file built around `literal` has no errors at all.
fn assert_valid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
/// Asserts that the file built around `literal` has at least one error.
fn assert_invalid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
/// Every character U+0000..=U+00FF is accepted raw; the double quote and the
/// backslash are skipped.
#[test]
fn test_ansi_codepoints() {
for byte in 0..=255u8 {
match byte {
b'\"' | b'\\' => { /* Ignore string close and backslash */ }
_ => assert_valid_str(&(byte as char).to_string()),
}
}
}
/// Each listed literal body is expected to be accepted.
#[test]
fn test_unicode_codepoints() {
let valid = ["Ƒ", "", "", ""];
for c in &valid {
assert_valid_str(c);
}
}
/// Unlike char literals, strings may contain several code points.
#[test]
fn test_unicode_multiple_codepoints() {
let valid = ["नी", "👨‍👨‍"];
for c in &valid {
assert_valid_str(c);
}
}
/// The simple escapes and plain ASCII letters are accepted.
#[test]
fn test_valid_ascii_escape() {
let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
for c in &valid {
assert_valid_str(c);
}
}
/// Unknown escape codes and a trailing lone backslash are rejected.
#[test]
fn test_invalid_ascii_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_str(c);
}
}
/// `\x` escapes up to `\x7F` are accepted.
#[test]
fn test_valid_ascii_code_escape() {
let valid = [r"\x00", r"\x7F", r"\x55"];
for c in &valid {
assert_valid_str(c);
}
}
/// Short `\x` escapes and values above 0x7F (`\xF0`) are rejected.
#[test]
fn test_invalid_ascii_code_escape() {
let invalid = [r"\x", r"\x7", r"\xF0"];
for c in &invalid {
assert_invalid_str(c);
}
}
/// Well-formed unicode escapes are accepted, including underscores after the first digit.
#[test]
fn test_valid_unicode_escape() {
let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &valid {
assert_valid_str(c);
}
}
/// Missing braces, empty or unclosed escapes, a leading underscore, more than six
/// digits and values above 0x10FFFF are all rejected.
#[test]
fn test_invalid_unicode_escape() {
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_str(c);
}
}
/// A multi-line body with raw newlines, emoji and a unicode escape is accepted.
#[test]
fn test_mixed() {
assert_valid_str(
r"This is the tale of a string
with a newline in between, some emoji (👨👨) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ",
);
}
/// NOTE(review): the `\`-newline below sits in a non-raw Rust string, so the compiler
/// removes it before the text reaches the parser under test; this case presumably
/// does not exercise `IgnoreNewline` — confirm.
#[test]
fn test_ignore_newline() {
assert_valid_str(
"Hello \
World",
);
}
}

View file

@ -0,0 +1,521 @@
//! Utilities for validating string and char literals and turning them into
//! values they represent.
//!
//! This file is copy-pasted from the compiler
//!
//! https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs
//!
//! Hopefully, we'll share this code in a proper way some day
use std::str::Chars;
use std::ops::Range;
/// Errors that can occur while unescaping the contents of a char, byte, string or
/// byte-string literal.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub enum EscapeError {
/// A char or byte literal with no contents at all.
ZeroChars,
/// A char or byte literal with input left over after its first (possibly escaped) character.
MoreThanOneChar,
/// A backslash that is the last character of the input.
LoneSlash,
/// A backslash followed by a character that does not start any known escape, e.g. `\v`.
InvalidEscape,
/// A raw carriage return that is not followed by a newline.
BareCarriageReturn,
/// A raw character that is only allowed in escaped form: tab, newline, a carriage
/// return followed by a newline, or the literal's own quote character.
EscapeOnlyChar,
/// A `\x` escape where the input ends before two digits have been read.
TooShortHexEscape,
/// A `\x` escape containing a character that is not a hexadecimal digit.
InvalidCharInHexEscape,
/// A `\x` escape above `0x7F` in a char or string literal.
OutOfRangeHexEscape,
/// A `\u` that is not followed by `{`.
NoBraceInUnicodeEscape,
/// A character inside `\u{…}` that is neither a hexadecimal digit, `_` nor `}`.
InvalidCharInUnicodeEscape,
/// A `\u{}` escape without any digits.
EmptyUnicodeEscape,
/// A `\u{…` escape where the input ends before the closing `}`.
UnclosedUnicodeEscape,
/// A `\u{_…}` escape whose first character after the brace is an underscore.
LeadingUnderscoreUnicodeEscape,
/// A `\u{…}` escape with more than six hexadecimal digits.
OverlongUnicodeEscape,
/// A `\u{…}` escape whose value is at most `0x10FFFF` but still not a valid `char`
/// (the tests exercise `D800` through `DFFF`).
LoneSurrogateUnicodeEscape,
/// A `\u{…}` escape whose value is greater than `0x10FFFF`.
OutOfRangeUnicodeEscape,
/// A well-formed `\u{…}` escape inside a byte or byte-string literal.
UnicodeEscapeInByte,
/// A raw non-ASCII character inside a byte or byte-string literal.
NonAsciiCharInByte,
}
/// Unescapes the contents of a char literal (the text between the quotes).
///
/// Returns the character the literal denotes. On failure the error is paired with
/// the number of bytes of `literal_text` that had been consumed when the problem
/// was detected.
pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
    let mut rest = literal_text.chars();
    match unescape_char_or_byte(&mut rest, Mode::Char) {
        Ok(c) => Ok(c),
        Err(kind) => {
            let consumed = literal_text.len() - rest.as_str().len();
            Err((consumed, kind))
        }
    }
}
/// Takes the contents of a string literal (without quotes) and reports each
/// unescaped character, or the error found in its place, to `callback`.
///
/// The `Range<usize>` passed to `callback` is the byte range within `literal_text`
/// that the character or error came from.
pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
unescape_str_or_byte_str(literal_text, Mode::Str, callback)
}
pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
let mut chars = literal_text.chars();
unescape_char_or_byte(&mut chars, Mode::Byte)
.map(byte_from_char)
.map_err(|err| (literal_text.len() - chars.as_str().len(), err))
}
/// Takes the contents of a byte-string literal (without quotes) and reports each
/// unescaped byte, or the error found in its place, to `callback`.
///
/// The `Range<usize>` passed to `callback` is the byte range within `literal_text`
/// that the byte or error came from.
pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<u8, EscapeError>),
{
    // The shared scanner works on `char`s; narrow each success to a byte.
    let mut narrow_to_byte = |span: Range<usize>, unescaped: Result<char, EscapeError>| {
        callback(span, unescaped.map(byte_from_char))
    };
    unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut narrow_to_byte)
}
/// The kind of literal whose contents are being unescaped.
#[derive(Debug, Clone, Copy)]
pub(crate) enum Mode {
    Char,
    Str,
    Byte,
    ByteStr,
}

impl Mode {
    /// Returns `true` for the literal kinds delimited by single quotes (`Char`, `Byte`).
    fn in_single_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => false,
            Mode::Char | Mode::Byte => true,
        }
    }

    /// Returns `true` for the literal kinds delimited by double quotes (`Str`, `ByteStr`).
    pub(crate) fn in_double_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => true,
            Mode::Char | Mode::Byte => false,
        }
    }

    /// Returns `true` for the literal kinds that denote bytes (`Byte`, `ByteStr`).
    pub(crate) fn is_bytes(self) -> bool {
        match self {
            Mode::Char | Mode::Str => false,
            Mode::Byte | Mode::ByteStr => true,
        }
    }
}
/// Unescapes one character whose first source character is `first_char`.
///
/// `chars` must be positioned just after `first_char`; any further characters that
/// belong to an escape sequence are consumed from it. `mode` selects the rules that
/// differ between literal kinds (which quote must be escaped, whether non-ASCII
/// characters, `\x80`..`\xFF` and `\u{…}` are allowed).
fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
// Not an escape sequence: either a character that stands for itself or one that
// is not allowed to appear raw.
if first_char != '\\' {
return match first_char {
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
// Peek (without consuming) to tell `\r\n` apart from a bare carriage return.
'\r' => Err(if chars.clone().next() == Some('\n') {
EscapeError::EscapeOnlyChar
} else {
EscapeError::BareCarriageReturn
}),
// Only the quote character that delimits this literal kind must be escaped.
'\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
_ => {
if mode.is_bytes() && !first_char.is_ascii() {
return Err(EscapeError::NonAsciiCharInByte);
}
Ok(first_char)
}
};
}
// From here on `first_char` is a backslash; the next character selects the escape.
let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
let res = match second_char {
'"' => '"',
'n' => '\n',
'r' => '\r',
't' => '\t',
'\\' => '\\',
'\'' => '\'',
'0' => '\0',
// `\xHH`: exactly two hexadecimal digits.
'x' => {
let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
let value = hi * 16 + lo;
// Only byte literals may use the upper half of the range.
if !mode.is_bytes() && !is_ascii(value) {
return Err(EscapeError::OutOfRangeHexEscape);
}
// Two hex digits are at most 0xFF, so the cast cannot truncate.
let value = value as u8;
value as char
}
// `\u{…}`: one to six hexadecimal digits, with `_` allowed after the first digit.
'u' => {
if chars.next() != Some('{') {
return Err(EscapeError::NoBraceInUnicodeEscape);
}
// `n_digits` already counts the first digit, which is read just below.
let mut n_digits = 1;
let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
'_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
'}' => return Err(EscapeError::EmptyUnicodeEscape),
c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
};
loop {
match chars.next() {
None => return Err(EscapeError::UnclosedUnicodeEscape),
Some('_') => continue,
Some('}') => {
// Syntax errors were reported while scanning; overlong is checked first,
// then the byte-mode restriction, then the value itself.
if n_digits > 6 {
return Err(EscapeError::OverlongUnicodeEscape);
}
if mode.is_bytes() {
return Err(EscapeError::UnicodeEscapeInByte);
}
break std::char::from_u32(value).ok_or_else(|| {
if value > 0x10FFFF {
EscapeError::OutOfRangeUnicodeEscape
} else {
EscapeError::LoneSurrogateUnicodeEscape
}
})?;
}
Some(c) => {
let digit =
c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
n_digits += 1;
// Keep counting but stop accumulating past six digits; the overlong
// error is reported once the closing brace is reached.
if n_digits > 6 {
continue;
}
let digit = digit as u32;
value = value * 16 + digit;
}
};
}
}
_ => return Err(EscapeError::InvalidEscape),
};
Ok(res)
}
/// Unescapes the contents of a char or byte literal, which must consist of exactly
/// one (possibly escaped) character.
///
/// Returns `ZeroChars` for empty input and `MoreThanOneChar` if anything is left
/// in `chars` after the first character has been scanned successfully.
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
    let unescaped = match chars.next() {
        Some(first_char) => scan_escape(first_char, chars, mode)?,
        None => return Err(EscapeError::ZeroChars),
    };
    match chars.next() {
        None => Ok(unescaped),
        Some(_) => Err(EscapeError::MoreThanOneChar),
    }
}
/// Takes the contents of a string or byte-string literal (without quotes) and
/// reports each unescaped character, or the error found in its place, to `callback`
/// together with the byte range of `src` it came from.
///
/// A backslash directly followed by a line break starts a line continuation: it and
/// all ASCII whitespace after it are skipped without calling `callback`. A raw
/// `\r\n` pair is reported as a single `'\n'`.
///
/// # Panics
///
/// Panics if `mode` is not a double-quoted literal kind.
fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
{
    /// Advances `chars` past any leading run of ASCII space, tab, LF and CR.
    fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
        let rest = chars.as_str();
        let skipped = rest
            .bytes()
            .take_while(|&b| b == b' ' || b == b'\t' || b == b'\n' || b == b'\r')
            .count();
        *chars = rest[skipped..].chars()
    }

    assert!(mode.in_double_quotes());
    let total_len = src.len();
    let mut chars = src.chars();
    loop {
        // Byte offset of the character about to be read.
        let start = total_len - chars.as_str().len();
        let first_char = match chars.next() {
            Some(c) => c,
            None => break,
        };
        let unescaped_char = match first_char {
            '\\' => {
                // Look at the next two characters without consuming them.
                let mut lookahead = chars.clone();
                match (lookahead.next(), lookahead.next()) {
                    (Some('\n'), _) | (Some('\r'), Some('\n')) => {
                        skip_ascii_whitespace(&mut chars);
                        continue;
                    }
                    _ => scan_escape(first_char, &mut chars, mode),
                }
            }
            '\r' if chars.as_str().starts_with('\n') => {
                chars.next();
                Ok('\n')
            }
            '\n' => Ok('\n'),
            '\t' => Ok('\t'),
            // Everything else, including a `\r` not followed by `\n`.
            _ => scan_escape(first_char, &mut chars, mode),
        };
        let end = total_len - chars.as_str().len();
        callback(start..end, unescaped_char);
    }
}
/// Narrows a `char` to the byte with the same numeric value.
///
/// # Panics
///
/// Panics if `c` is above `0xFF`.
fn byte_from_char(c: char) -> u8 {
    let code_point = u32::from(c);
    assert!(code_point <= u32::from(u8::max_value()), "guaranteed because of Mode::Byte");
    code_point as u8
}
/// Returns `true` if `x` lies in the ASCII range `0..=0x7F`.
fn is_ascii(x: u32) -> bool {
    x < 0x80
}
// Unit tests for the unescaping entry points. Each test defines a local `check`
// helper and then lists literal contents with the expected value or error.
#[cfg(test)]
mod tests {
use super::*;
/// Malformed char literal contents and the error each one produces.
#[test]
fn test_unescape_char_bad() {
// Compares only the error kind; the byte offset is discarded.
fn check(literal_text: &str, expected_error: EscapeError) {
let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
assert_eq!(actual_result, Err(expected_error));
}
check("", EscapeError::ZeroChars);
check(r"\", EscapeError::LoneSlash);
check("\n", EscapeError::EscapeOnlyChar);
check("\r\n", EscapeError::EscapeOnlyChar);
check("\t", EscapeError::EscapeOnlyChar);
check("'", EscapeError::EscapeOnlyChar);
check("\r", EscapeError::BareCarriageReturn);
check("spam", EscapeError::MoreThanOneChar);
check(r"\x0ff", EscapeError::MoreThanOneChar);
check(r#"\"a"#, EscapeError::MoreThanOneChar);
check(r"\na", EscapeError::MoreThanOneChar);
check(r"\ra", EscapeError::MoreThanOneChar);
check(r"\ta", EscapeError::MoreThanOneChar);
check(r"\\a", EscapeError::MoreThanOneChar);
check(r"\'a", EscapeError::MoreThanOneChar);
check(r"\0a", EscapeError::MoreThanOneChar);
check(r"\u{0}x", EscapeError::MoreThanOneChar);
check(r"\u{1F63b}}", EscapeError::MoreThanOneChar);
check(r"\v", EscapeError::InvalidEscape);
check(r"\💩", EscapeError::InvalidEscape);
check(r"\●", EscapeError::InvalidEscape);
check(r"\x", EscapeError::TooShortHexEscape);
check(r"\x0", EscapeError::TooShortHexEscape);
check(r"\xf", EscapeError::TooShortHexEscape);
check(r"\xa", EscapeError::TooShortHexEscape);
check(r"\xx", EscapeError::InvalidCharInHexEscape);
check(r"\xы", EscapeError::InvalidCharInHexEscape);
check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
check(r"\xtt", EscapeError::InvalidCharInHexEscape);
check(r"\xff", EscapeError::OutOfRangeHexEscape);
check(r"\xFF", EscapeError::OutOfRangeHexEscape);
check(r"\x80", EscapeError::OutOfRangeHexEscape);
check(r"\u", EscapeError::NoBraceInUnicodeEscape);
check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
check(r"\u{", EscapeError::UnclosedUnicodeEscape);
check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
check(r"\u{}", EscapeError::EmptyUnicodeEscape);
check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape);
check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
// NOTE(review): identical to the previous case.
check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape);
}
/// Well-formed char literal contents and the character each one denotes.
#[test]
fn test_unescape_char_good() {
fn check(literal_text: &str, expected_char: char) {
let actual_result = unescape_char(literal_text);
assert_eq!(actual_result, Ok(expected_char));
}
check("a", 'a');
check("ы", 'ы');
check("🦀", '🦀');
check(r#"\""#, '"');
check(r"\n", '\n');
check(r"\r", '\r');
check(r"\t", '\t');
check(r"\\", '\\');
check(r"\'", '\'');
check(r"\0", '\0');
check(r"\x00", '\0');
check(r"\x5a", 'Z');
check(r"\x5A", 'Z');
check(r"\x7f", 127 as char);
check(r"\u{0}", '\0');
check(r"\u{000000}", '\0');
check(r"\u{41}", 'A');
check(r"\u{0041}", 'A');
check(r"\u{00_41}", 'A');
check(r"\u{4__1__}", 'A');
check(r"\u{1F63b}", '😻');
}
/// Well-formed string literal contents and the text each one denotes.
#[test]
fn test_unescape_str_good() {
// Collects unescaped characters into a `String`, keeping only the first error.
fn check(literal_text: &str, expected: &str) {
let mut buf = Ok(String::with_capacity(literal_text.len()));
unescape_str(literal_text, &mut |range, c| {
if let Ok(b) = &mut buf {
match c {
Ok(c) => b.push(c),
Err(e) => buf = Err((range, e)),
}
}
});
let buf = buf.as_ref().map(|it| it.as_ref());
assert_eq!(buf, Ok(expected))
}
check("foo", "foo");
check("", "");
// A raw `\r\n` pair collapses to a single `\n`.
check(" \t\n\r\n", " \t\n\n");
// A backslash before a line break swallows the break and the whitespace after it.
check("hello \\\n world", "hello world");
check("hello \\\r\n world", "hello world");
check("thread's", "thread's")
}
/// Malformed byte literal contents and the error each one produces.
#[test]
fn test_unescape_byte_bad() {
// Compares only the error kind; the byte offset is discarded.
fn check(literal_text: &str, expected_error: EscapeError) {
let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
assert_eq!(actual_result, Err(expected_error));
}
check("", EscapeError::ZeroChars);
check(r"\", EscapeError::LoneSlash);
check("\n", EscapeError::EscapeOnlyChar);
check("\r\n", EscapeError::EscapeOnlyChar);
check("\t", EscapeError::EscapeOnlyChar);
check("'", EscapeError::EscapeOnlyChar);
check("\r", EscapeError::BareCarriageReturn);
check("spam", EscapeError::MoreThanOneChar);
check(r"\x0ff", EscapeError::MoreThanOneChar);
check(r#"\"a"#, EscapeError::MoreThanOneChar);
check(r"\na", EscapeError::MoreThanOneChar);
check(r"\ra", EscapeError::MoreThanOneChar);
check(r"\ta", EscapeError::MoreThanOneChar);
check(r"\\a", EscapeError::MoreThanOneChar);
check(r"\'a", EscapeError::MoreThanOneChar);
check(r"\0a", EscapeError::MoreThanOneChar);
check(r"\v", EscapeError::InvalidEscape);
check(r"\💩", EscapeError::InvalidEscape);
check(r"\●", EscapeError::InvalidEscape);
check(r"\x", EscapeError::TooShortHexEscape);
check(r"\x0", EscapeError::TooShortHexEscape);
check(r"\xa", EscapeError::TooShortHexEscape);
check(r"\xf", EscapeError::TooShortHexEscape);
check(r"\xx", EscapeError::InvalidCharInHexEscape);
check(r"\xы", EscapeError::InvalidCharInHexEscape);
check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
check(r"\xtt", EscapeError::InvalidCharInHexEscape);
// Syntax errors in a unicode escape are reported ahead of `UnicodeEscapeInByte`.
check(r"\u", EscapeError::NoBraceInUnicodeEscape);
check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
check(r"\u{", EscapeError::UnclosedUnicodeEscape);
check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
check(r"\u{}", EscapeError::EmptyUnicodeEscape);
check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
check("ы", EscapeError::NonAsciiCharInByte);
check("🦀", EscapeError::NonAsciiCharInByte);
// Syntactically valid unicode escapes are rejected in byte literals; this takes
// precedence over the trailing-input and value-range errors seen for chars.
check(r"\u{0}", EscapeError::UnicodeEscapeInByte);
check(r"\u{000000}", EscapeError::UnicodeEscapeInByte);
check(r"\u{41}", EscapeError::UnicodeEscapeInByte);
check(r"\u{0041}", EscapeError::UnicodeEscapeInByte);
check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte);
check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte);
check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte);
check(r"\u{0}x", EscapeError::UnicodeEscapeInByte);
check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte);
check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte);
check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
// NOTE(review): identical to the previous case.
check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte);
check(r"\u{D800}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte);
}
/// Well-formed byte literal contents and the byte each one denotes.
#[test]
fn test_unescape_byte_good() {
fn check(literal_text: &str, expected_byte: u8) {
let actual_result = unescape_byte(literal_text);
assert_eq!(actual_result, Ok(expected_byte));
}
check("a", b'a');
check(r#"\""#, b'"');
check(r"\n", b'\n');
check(r"\r", b'\r');
check(r"\t", b'\t');
check(r"\\", b'\\');
check(r"\'", b'\'');
check(r"\0", b'\0');
check(r"\x00", b'\0');
check(r"\x5a", b'Z');
check(r"\x5A", b'Z');
check(r"\x7f", 127);
// Unlike char literals, byte literals accept `\x` escapes above 0x7F.
check(r"\x80", 128);
check(r"\xff", 255);
check(r"\xFF", 255);
}
/// Well-formed byte-string literal contents and the bytes each one denotes.
#[test]
fn test_unescape_byte_str_good() {
// Collects unescaped bytes into a `Vec`, keeping only the first error.
fn check(literal_text: &str, expected: &[u8]) {
let mut buf = Ok(Vec::with_capacity(literal_text.len()));
unescape_byte_str(literal_text, &mut |range, c| {
if let Ok(b) = &mut buf {
match c {
Ok(c) => b.push(c),
Err(e) => buf = Err((range, e)),
}
}
});
let buf = buf.as_ref().map(|it| it.as_ref());
assert_eq!(buf, Ok(expected))
}
check("foo", b"foo");
check("", b"");
// A raw `\r\n` pair collapses to a single `\n`.
check(" \t\n\r\n", b" \t\n\n");
// A backslash before a line break swallows the break and the whitespace after it.
check("hello \\\n world", b"hello world");
check("hello \\\r\n world", b"hello world");
check("thread's", b"thread's")
}
}