From 3bc83b11d1315f9dac4ccab89a01a41d4b930374 Mon Sep 17 00:00:00 2001
From: Tad Hardesty <tad@platymuus.com>
Date: Sat, 2 Nov 2019 13:11:27 -0700
Subject: [PATCH] Attempt UTF-8 decoding before falling back to Latin-1

---
 src/dreammaker/constants.rs |  4 ++--
 src/dreammaker/lexer.rs     | 41 +++++++++++++++----------------------
 src/tools/dmm.rs            | 12 +++++------
 3 files changed, 24 insertions(+), 33 deletions(-)
diff --git a/src/dreammaker/constants.rs b/src/dreammaker/constants.rs
index 481d0c25..5a755b98 100644
--- a/src/dreammaker/constants.rs
+++ b/src/dreammaker/constants.rs
@@ -345,14 +345,14 @@ impl fmt::Display for ConstFn {
 // The constant evaluator
 
 pub fn evaluate_str(location: Location, input: &[u8]) -> Result<Constant, DMError> {
-    use super::lexer::{Lexer, from_latin1_borrowed};
+    use super::lexer::{Lexer, from_utf8_or_latin1_borrowed};
     use super::parser::Parser;
 
     let mut bytes = input.iter().map(|&x| Ok(x));
     let ctx = Context::default();
     let expr = Parser::new(&ctx, Lexer::new(&ctx, Default::default(), &mut bytes)).require_expression()?;
     if bytes.next().is_some() {
-        return Err(DMError::new(location, format!("leftover: {:?} {}", from_latin1_borrowed(&input), bytes.len())));
+        return Err(DMError::new(location, format!("leftover: {:?} {}", from_utf8_or_latin1_borrowed(&input), bytes.len())));
     }
     expr.simple_evaluate(location)
 }
diff --git a/src/dreammaker/lexer.rs b/src/dreammaker/lexer.rs
index e3e1edfa..f6afec6e 100644
--- a/src/dreammaker/lexer.rs
+++ b/src/dreammaker/lexer.rs
@@ -429,17 +429,8 @@ fn is_ident(ch: u8) -> bool {
     (ch >= b'a' && ch <= b'z') || (ch >= b'A' && ch <= b'Z') || ch == b'_'
 }
 
-/// Convert the input bytes to a `String` assuming Latin-1 encoding.
-pub fn from_latin1(mut bytes: Vec<u8>) -> String {
+fn from_latin1(bytes: &[u8]) -> String {
     let non_ascii = bytes.iter().filter(|&&i| i > 0x7f).count();
-    if non_ascii == 0 {
-        match String::from_utf8(bytes) {
-            Ok(v) => return v,
-            // shouldn't happen, but try to produce a sensible result anyways
-            Err(e) => bytes = e.into_bytes(),
-        }
-    }
-
     let mut output = String::with_capacity(bytes.len() + non_ascii);
     for &byte in bytes.iter() {
         output.push(byte as char);
@@ -447,20 +438,20 @@ pub fn from_latin1(mut bytes: Vec<u8>) -> String {
     output
 }
 
-/// Convert the input bytes to a `String` assuming Latin-1 encoding.
-pub fn from_latin1_borrowed(bytes: &[u8]) -> Cow<str> {
-    let non_ascii = bytes.iter().filter(|&&i| i > 0x7f).count();
-    if non_ascii == 0 {
-        if let Ok(v) = ::std::str::from_utf8(bytes) {
-            return Cow::Borrowed(v);
-        }
+/// Convert the input bytes to a `String`, trying UTF-8 first and falling back to Latin-1 if they are not valid UTF-8.
+pub fn from_utf8_or_latin1(bytes: Vec<u8>) -> String {
+    match String::from_utf8(bytes) {
+        Ok(v) => v,
+        Err(e) => from_latin1(e.as_bytes()),
     }
+}
 
-    let mut output = String::with_capacity(bytes.len() + non_ascii);
-    for &byte in bytes.iter() {
-        output.push(byte as char);
+/// Convert the input bytes to a `Cow<str>`, borrowing them if they are valid UTF-8 and otherwise allocating a Latin-1 decoding.
+pub fn from_utf8_or_latin1_borrowed(bytes: &[u8]) -> Cow<str> {
+    match ::std::str::from_utf8(bytes) {
+        Ok(v) => Cow::Borrowed(v),
+        Err(_) => Cow::Owned(from_latin1(bytes)),
     }
-    Cow::Owned(output)
 }
 
 // Used to track nested string interpolations and know when they end.
@@ -841,7 +832,7 @@ impl<'ctx, I: Iterator<Item=io::Result<u8>>> Lexer<'ctx, I> {
                 }
             }
         }
-        from_latin1(ident)
+        from_utf8_or_latin1(ident)
     }
 
     fn read_resource(&mut self) -> String {
@@ -863,7 +854,7 @@ impl<'ctx, I: Iterator<Item=io::Result<u8>>> Lexer<'ctx, I> {
                 }
             }
         }
-        from_latin1(buf)
+        from_utf8_or_latin1(buf)
     }
 
     fn read_string(&mut self, end: &'static [u8], interp_closed: bool) -> Token {
@@ -925,7 +916,7 @@ impl<'ctx, I: Iterator<Item=io::Result<u8>>> Lexer<'ctx, I> {
             }
         }
 
-        let string = from_latin1(buf);
+        let string = from_utf8_or_latin1(buf);
         match (interp_opened, interp_closed) {
             (true, true) => Token::InterpStringPart(string),
             (true, false) => Token::InterpStringBegin(string),
@@ -952,7 +943,7 @@ impl<'ctx, I: Iterator<Item=io::Result<u8>>> Lexer<'ctx, I> {
                 break;
             }
         }
-        Token::String(from_latin1(buf))
+        Token::String(from_utf8_or_latin1(buf))
     }
 
     fn read_raw_string(&mut self) -> Token {
diff --git a/src/tools/dmm.rs b/src/tools/dmm.rs
index 7924c2dd..12f74078 100644
--- a/src/tools/dmm.rs
+++ b/src/tools/dmm.rs
@@ -8,7 +8,7 @@ use ndarray::{self, Array3, Axis};
 use linked_hash_map::LinkedHashMap;
 
 use dm::{DMError, Location, HasLocation};
-use dm::lexer::{LocationTracker, from_latin1};
+use dm::lexer::{LocationTracker, from_utf8_or_latin1};
 use dm::constants::Constant;
 
 const MAX_KEY_LENGTH: u8 = 3;
@@ -329,14 +329,14 @@ fn parse_map(map: &mut Map, f: File) -> Result<(), DMError> {
                         skip_whitespace = true;
                     } else if ch == b';' {
                         curr_prefab.vars.insert(
-                            from_latin1(take(&mut curr_var)),
+                            from_utf8_or_latin1(take(&mut curr_var)),
                             dm::constants::evaluate_str(chars.location(), &take(&mut curr_datum))?,
                         );
                         skip_whitespace = true;
                     } else if ch == b'}' {
                         if !curr_var.is_empty() {
                             curr_prefab.vars.insert(
-                                from_latin1(take(&mut curr_var)),
+                                from_utf8_or_latin1(take(&mut curr_var)),
                                 dm::constants::evaluate_str(chars.location(), &take(&mut curr_datum))?,
                             );
                         }
@@ -346,16 +346,16 @@ fn parse_map(map: &mut Map, f: File) -> Result<(), DMError> {
                     }
                 }
             } else if ch == b'{' {
-                curr_prefab.path = from_latin1(take(&mut curr_datum));
+                curr_prefab.path = from_utf8_or_latin1(take(&mut curr_datum));
                 in_varedit_block = true;
             } else if ch == b',' {
                 if curr_prefab.path.is_empty() && !curr_datum.is_empty() {
-                    curr_prefab.path = from_latin1(take(&mut curr_datum));
+                    curr_prefab.path = from_utf8_or_latin1(take(&mut curr_datum));
                 }
                 curr_data.push(take(&mut curr_prefab));
             } else if ch == b')' {
                 if curr_prefab.path.is_empty() && !curr_datum.is_empty() {
-                    curr_prefab.path = from_latin1(take(&mut curr_datum));
+                    curr_prefab.path = from_utf8_or_latin1(take(&mut curr_datum));
                 }
                 curr_data.push(take(&mut curr_prefab));
                 let key = take(&mut curr_key);