Attempt UTF-8 decoding before falling back to Latin-1

This commit is contained in:
Tad Hardesty 2019-11-02 13:11:27 -07:00
parent 50e8d95961
commit 3bc83b11d1
3 changed files with 24 additions and 33 deletions

View file

@ -345,14 +345,14 @@ impl fmt::Display for ConstFn {
// The constant evaluator
pub fn evaluate_str(location: Location, input: &[u8]) -> Result<Constant, DMError> {
use super::lexer::{Lexer, from_latin1_borrowed};
use super::lexer::{Lexer, from_utf8_or_latin1_borrowed};
use super::parser::Parser;
let mut bytes = input.iter().map(|&x| Ok(x));
let ctx = Context::default();
let expr = Parser::new(&ctx, Lexer::new(&ctx, Default::default(), &mut bytes)).require_expression()?;
if bytes.next().is_some() {
return Err(DMError::new(location, format!("leftover: {:?} {}", from_latin1_borrowed(&input), bytes.len())));
return Err(DMError::new(location, format!("leftover: {:?} {}", from_utf8_or_latin1_borrowed(&input), bytes.len())));
}
expr.simple_evaluate(location)
}

View file

@ -429,17 +429,8 @@ fn is_ident(ch: u8) -> bool {
(ch >= b'a' && ch <= b'z') || (ch >= b'A' && ch <= b'Z') || ch == b'_'
}
/// Convert the input bytes to a `String` assuming Latin-1 encoding.
pub fn from_latin1(mut bytes: Vec<u8>) -> String {
fn from_latin1(bytes: &[u8]) -> String {
let non_ascii = bytes.iter().filter(|&&i| i > 0x7f).count();
if non_ascii == 0 {
match String::from_utf8(bytes) {
Ok(v) => return v,
// shouldn't happen, but try to produce a sensible result anyways
Err(e) => bytes = e.into_bytes(),
}
}
let mut output = String::with_capacity(bytes.len() + non_ascii);
for &byte in bytes.iter() {
output.push(byte as char);
@ -447,20 +438,20 @@ pub fn from_latin1(mut bytes: Vec<u8>) -> String {
output
}
/// Convert the input bytes to a `String` assuming Latin-1 encoding.
pub fn from_latin1_borrowed(bytes: &[u8]) -> Cow<str> {
let non_ascii = bytes.iter().filter(|&&i| i > 0x7f).count();
if non_ascii == 0 {
if let Ok(v) = ::std::str::from_utf8(bytes) {
return Cow::Borrowed(v);
}
/// Convert the input bytes to a `String` attempting UTF-8 or falling back to Latin-1.
pub fn from_utf8_or_latin1(bytes: Vec<u8>) -> String {
match String::from_utf8(bytes) {
Ok(v) => v,
Err(e) => from_latin1(e.as_bytes()),
}
}
let mut output = String::with_capacity(bytes.len() + non_ascii);
for &byte in bytes.iter() {
output.push(byte as char);
/// Convert the input bytes to a `String` attempting UTF-8 or falling back to Latin-1.
pub fn from_utf8_or_latin1_borrowed(bytes: &[u8]) -> Cow<str> {
match ::std::str::from_utf8(bytes) {
Ok(v) => Cow::Borrowed(v),
Err(_) => Cow::Owned(from_latin1(bytes)),
}
Cow::Owned(output)
}
// Used to track nested string interpolations and know when they end.
@ -841,7 +832,7 @@ impl<'ctx, I: Iterator<Item=io::Result<u8>>> Lexer<'ctx, I> {
}
}
}
from_latin1(ident)
from_utf8_or_latin1(ident)
}
fn read_resource(&mut self) -> String {
@ -863,7 +854,7 @@ impl<'ctx, I: Iterator<Item=io::Result<u8>>> Lexer<'ctx, I> {
}
}
}
from_latin1(buf)
from_utf8_or_latin1(buf)
}
fn read_string(&mut self, end: &'static [u8], interp_closed: bool) -> Token {
@ -925,7 +916,7 @@ impl<'ctx, I: Iterator<Item=io::Result<u8>>> Lexer<'ctx, I> {
}
}
let string = from_latin1(buf);
let string = from_utf8_or_latin1(buf);
match (interp_opened, interp_closed) {
(true, true) => Token::InterpStringPart(string),
(true, false) => Token::InterpStringBegin(string),
@ -952,7 +943,7 @@ impl<'ctx, I: Iterator<Item=io::Result<u8>>> Lexer<'ctx, I> {
break;
}
}
Token::String(from_latin1(buf))
Token::String(from_utf8_or_latin1(buf))
}
fn read_raw_string(&mut self) -> Token {

View file

@ -8,7 +8,7 @@ use ndarray::{self, Array3, Axis};
use linked_hash_map::LinkedHashMap;
use dm::{DMError, Location, HasLocation};
use dm::lexer::{LocationTracker, from_latin1};
use dm::lexer::{LocationTracker, from_utf8_or_latin1};
use dm::constants::Constant;
const MAX_KEY_LENGTH: u8 = 3;
@ -329,14 +329,14 @@ fn parse_map(map: &mut Map, f: File) -> Result<(), DMError> {
skip_whitespace = true;
} else if ch == b';' {
curr_prefab.vars.insert(
from_latin1(take(&mut curr_var)),
from_utf8_or_latin1(take(&mut curr_var)),
dm::constants::evaluate_str(chars.location(), &take(&mut curr_datum))?,
);
skip_whitespace = true;
} else if ch == b'}' {
if !curr_var.is_empty() {
curr_prefab.vars.insert(
from_latin1(take(&mut curr_var)),
from_utf8_or_latin1(take(&mut curr_var)),
dm::constants::evaluate_str(chars.location(), &take(&mut curr_datum))?,
);
}
@ -346,16 +346,16 @@ fn parse_map(map: &mut Map, f: File) -> Result<(), DMError> {
}
}
} else if ch == b'{' {
curr_prefab.path = from_latin1(take(&mut curr_datum));
curr_prefab.path = from_utf8_or_latin1(take(&mut curr_datum));
in_varedit_block = true;
} else if ch == b',' {
if curr_prefab.path.is_empty() && !curr_datum.is_empty() {
curr_prefab.path = from_latin1(take(&mut curr_datum));
curr_prefab.path = from_utf8_or_latin1(take(&mut curr_datum));
}
curr_data.push(take(&mut curr_prefab));
} else if ch == b')' {
if curr_prefab.path.is_empty() && !curr_datum.is_empty() {
curr_prefab.path = from_latin1(take(&mut curr_datum));
curr_prefab.path = from_utf8_or_latin1(take(&mut curr_datum));
}
curr_data.push(take(&mut curr_prefab));
let key = take(&mut curr_key);