From 3bc83b11d1315f9dac4ccab89a01a41d4b930374 Mon Sep 17 00:00:00 2001 From: Tad Hardesty Date: Sat, 2 Nov 2019 13:11:27 -0700 Subject: [PATCH] Attempt UTF-8 decoding before falling back to Latin-1 --- src/dreammaker/constants.rs | 4 ++-- src/dreammaker/lexer.rs | 41 +++++++++++++++---------------------- src/tools/dmm.rs | 12 +++++------ 3 files changed, 24 insertions(+), 33 deletions(-) diff --git a/src/dreammaker/constants.rs b/src/dreammaker/constants.rs index 481d0c25..5a755b98 100644 --- a/src/dreammaker/constants.rs +++ b/src/dreammaker/constants.rs @@ -345,14 +345,14 @@ impl fmt::Display for ConstFn { // The constant evaluator pub fn evaluate_str(location: Location, input: &[u8]) -> Result<Constant, DMError> { - use super::lexer::{Lexer, from_latin1_borrowed}; + use super::lexer::{Lexer, from_utf8_or_latin1_borrowed}; use super::parser::Parser; let mut bytes = input.iter().map(|&x| Ok(x)); let ctx = Context::default(); let expr = Parser::new(&ctx, Lexer::new(&ctx, Default::default(), &mut bytes)).require_expression()?; if bytes.next().is_some() { - return Err(DMError::new(location, format!("leftover: {:?} {}", from_latin1_borrowed(&input), bytes.len()))); + return Err(DMError::new(location, format!("leftover: {:?} {}", from_utf8_or_latin1_borrowed(&input), bytes.len()))); } expr.simple_evaluate(location) } diff --git a/src/dreammaker/lexer.rs b/src/dreammaker/lexer.rs index e3e1edfa..f6afec6e 100644 --- a/src/dreammaker/lexer.rs +++ b/src/dreammaker/lexer.rs @@ -429,17 +429,8 @@ fn is_ident(ch: u8) -> bool { (ch >= b'a' && ch <= b'z') || (ch >= b'A' && ch <= b'Z') || ch == b'_' } -/// Convert the input bytes to a `String` assuming Latin-1 encoding. 
-pub fn from_latin1(mut bytes: Vec<u8>) -> String { +fn from_latin1(bytes: &[u8]) -> String { let non_ascii = bytes.iter().filter(|&&i| i > 0x7f).count(); - if non_ascii == 0 { - match String::from_utf8(bytes) { - Ok(v) => return v, - // shouldn't happen, but try to produce a sensible result anyways - Err(e) => bytes = e.into_bytes(), - } - } - let mut output = String::with_capacity(bytes.len() + non_ascii); for &byte in bytes.iter() { output.push(byte as char); @@ -447,20 +438,20 @@ pub fn from_latin1(mut bytes: Vec<u8>) -> String { output } -/// Convert the input bytes to a `String` assuming Latin-1 encoding. -pub fn from_latin1_borrowed(bytes: &[u8]) -> Cow<str> { - let non_ascii = bytes.iter().filter(|&&i| i > 0x7f).count(); - if non_ascii == 0 { - if let Ok(v) = ::std::str::from_utf8(bytes) { - return Cow::Borrowed(v); - } +/// Convert the input bytes to a `String` attempting UTF-8 or falling back to Latin-1. +pub fn from_utf8_or_latin1(bytes: Vec<u8>) -> String { + match String::from_utf8(bytes) { + Ok(v) => v, + Err(e) => from_latin1(e.as_bytes()), } +} - let mut output = String::with_capacity(bytes.len() + non_ascii); - for &byte in bytes.iter() { - output.push(byte as char); +/// Convert the input bytes to a `String` attempting UTF-8 or falling back to Latin-1. +pub fn from_utf8_or_latin1_borrowed(bytes: &[u8]) -> Cow<str> { + match ::std::str::from_utf8(bytes) { + Ok(v) => Cow::Borrowed(v), + Err(_) => Cow::Owned(from_latin1(bytes)), } - Cow::Owned(output) } // Used to track nested string interpolations and know when they end. 
@@ -841,7 +832,7 @@ impl<'ctx, I: Iterator>> Lexer<'ctx, I> { } } } - from_latin1(ident) + from_utf8_or_latin1(ident) } fn read_resource(&mut self) -> String { @@ -863,7 +854,7 @@ impl<'ctx, I: Iterator>> Lexer<'ctx, I> { } } } - from_latin1(buf) + from_utf8_or_latin1(buf) } fn read_string(&mut self, end: &'static [u8], interp_closed: bool) -> Token { @@ -925,7 +916,7 @@ impl<'ctx, I: Iterator>> Lexer<'ctx, I> { } } - let string = from_latin1(buf); + let string = from_utf8_or_latin1(buf); match (interp_opened, interp_closed) { (true, true) => Token::InterpStringPart(string), (true, false) => Token::InterpStringBegin(string), @@ -952,7 +943,7 @@ impl<'ctx, I: Iterator>> Lexer<'ctx, I> { break; } } - Token::String(from_latin1(buf)) + Token::String(from_utf8_or_latin1(buf)) } fn read_raw_string(&mut self) -> Token { diff --git a/src/tools/dmm.rs b/src/tools/dmm.rs index 7924c2dd..12f74078 100644 --- a/src/tools/dmm.rs +++ b/src/tools/dmm.rs @@ -8,7 +8,7 @@ use ndarray::{self, Array3, Axis}; use linked_hash_map::LinkedHashMap; use dm::{DMError, Location, HasLocation}; -use dm::lexer::{LocationTracker, from_latin1}; +use dm::lexer::{LocationTracker, from_utf8_or_latin1}; use dm::constants::Constant; const MAX_KEY_LENGTH: u8 = 3; @@ -329,14 +329,14 @@ fn parse_map(map: &mut Map, f: File) -> Result<(), DMError> { skip_whitespace = true; } else if ch == b';' { curr_prefab.vars.insert( - from_latin1(take(&mut curr_var)), + from_utf8_or_latin1(take(&mut curr_var)), dm::constants::evaluate_str(chars.location(), &take(&mut curr_datum))?, ); skip_whitespace = true; } else if ch == b'}' { if !curr_var.is_empty() { curr_prefab.vars.insert( - from_latin1(take(&mut curr_var)), + from_utf8_or_latin1(take(&mut curr_var)), dm::constants::evaluate_str(chars.location(), &take(&mut curr_datum))?, ); } @@ -346,16 +346,16 @@ fn parse_map(map: &mut Map, f: File) -> Result<(), DMError> { } } } else if ch == b'{' { - curr_prefab.path = from_latin1(take(&mut curr_datum)); + curr_prefab.path = 
from_utf8_or_latin1(take(&mut curr_datum)); in_varedit_block = true; } else if ch == b',' { if curr_prefab.path.is_empty() && !curr_datum.is_empty() { - curr_prefab.path = from_latin1(take(&mut curr_datum)); + curr_prefab.path = from_utf8_or_latin1(take(&mut curr_datum)); } curr_data.push(take(&mut curr_prefab)); } else if ch == b')' { if curr_prefab.path.is_empty() && !curr_datum.is_empty() { - curr_prefab.path = from_latin1(take(&mut curr_datum)); + curr_prefab.path = from_utf8_or_latin1(take(&mut curr_datum)); } curr_data.push(take(&mut curr_prefab)); let key = take(&mut curr_key);