Fix remaining UTF-8 parsing issues

2025-09-28 22:34:45 +00:00 · 2020-07-26 21:38:29 -04:00 · 2020-07-26 21:38:29 -04:00 · 273528db77
commit 273528db77
parent eaaeda728a
5 changed files with 358 additions and 267 deletions
--- a/compiler/parse/src/blankspace.rs
+++ b/compiler/parse/src/blankspace.rs
@ -1,7 +1,7 @@
 use crate::ast::CommentOrNewline::{self, *};
 use crate::ast::Spaceable;
 use crate::parser::{
-    self, and, peek_utf8_char, unexpected, unexpected_eof, Fail, FailReason, Parser, State,
+    self, and, peek_utf8_char, unexpected, unexpected_eof, FailReason, Parser, State,
 };
 use bumpalo::collections::string::String;
 use bumpalo::collections::vec::Vec;
@ -219,7 +219,7 @@ fn spaces<'a>(
    move |arena: &'a Bump, state: State<'a>| {
        let original_state = state.clone();
        let mut space_list = Vec::new_in(arena);
-        let mut chars_parsed = 0;
+        let mut bytes_parsed = 0;
        let mut comment_line_buf = String::new_in(arena);
        let mut line_state = LineState::Normal;
        let mut state = state;
@ -227,8 +227,8 @@ fn spaces<'a>(
        while !state.bytes.is_empty() {
            match peek_utf8_char(&state) {
-                Ok(ch) => {
+                Ok((ch, utf8_len)) => {
-                    chars_parsed += 1;
+                    bytes_parsed += utf8_len;
                    match line_state {
                        LineState::Normal => {
@ -263,7 +263,7 @@ fn spaces<'a>(
                                    line_state = LineState::Comment;
                                }
                                _ => {
-                                    return if require_at_least_one && chars_parsed <= 1 {
+                                    return if require_at_least_one && bytes_parsed <= 1 {
                                        // We've parsed 1 char and it was not a space,
                                        // but we require parsing at least one space!
                                        Err(unexpected(0, state.clone(), state.attempting))
@ -349,8 +349,7 @@ fn spaces<'a>(
                                    line_state = LineState::Normal;
                                }
                                nonblank => {
-                                    // Chars can have btye lengths of more than 1!
+                                    state = state.advance_without_indenting(utf8_len)?;
                                    state = state.advance_without_indenting(nonblank.len_utf8())?;
                                    comment_line_buf.push(nonblank);
                                }
@ -358,21 +357,12 @@ fn spaces<'a>(
                        }
                    }
                }
-                Err(Fail {
+                Err(FailReason::BadUtf8) => {
                    reason: FailReason::BadUtf8,
                    attempting,
                }) => {
                    // If we hit an invalid UTF-8 character, bail out immediately.
-                    return Err((
+                    return state.fail(FailReason::BadUtf8);
                        Fail {
                            reason: dbg!(FailReason::BadUtf8),
                            attempting,
                        },
                        state,
                    ));
                }
                Err(_) => {
-                    if require_at_least_one && chars_parsed == 0 {
+                    if require_at_least_one && bytes_parsed == 0 {
                        return Err(unexpected_eof(0, state.attempting, state));
                    } else {
                        let space_slice = space_list.into_bump_slice();
--- a/compiler/parse/src/ident.rs
+++ b/compiler/parse/src/ident.rs
@ -1,6 +1,6 @@
 use crate::ast::Attempting;
 use crate::keyword;
-use crate::parser::{unexpected, utf8_char, Fail, FailReason, ParseResult, Parser, State};
+use crate::parser::{peek_utf8_char, unexpected, Fail, FailReason, ParseResult, Parser, State};
 use bumpalo::collections::string::String;
 use bumpalo::collections::vec::Vec;
 use bumpalo::Bump;
@ -69,7 +69,7 @@ impl<'a> Ident<'a> {
 #[inline(always)]
 pub fn parse_ident<'a>(
    arena: &'a Bump,
-    state: State<'a>,
+    mut state: State<'a>,
 ) -> ParseResult<'a, (Ident<'a>, Option<char>)> {
    let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
    let mut capitalized_parts: Vec<&'a str> = Vec::new_in(arena);
@ -80,93 +80,112 @@ pub fn parse_ident<'a>(
    // Identifiers and accessor functions must start with either a letter or a dot.
    // If this starts with neither, it must be something else!
-    let (first_ch, mut state) = utf8_char().parse(arena, state)?;
+    match peek_utf8_char(&state) {
        Ok((first_ch, bytes_parsed)) => {
            if first_ch.is_alphabetic() {
                part_buf.push(first_ch);
-    if first_ch.is_alphabetic() {
+                is_capitalized = first_ch.is_uppercase();
-        part_buf.push(first_ch);
+                is_accessor_fn = false;
-        is_capitalized = first_ch.is_uppercase();
+                state = state.advance_without_indenting(bytes_parsed)?;
-        is_accessor_fn = false;
+            } else if first_ch == '.' {
-    } else if first_ch == '.' {
+                is_capitalized = false;
-        is_capitalized = false;
+                is_accessor_fn = true;
        is_accessor_fn = true;
    } else if first_ch == '@' {
        // '@' must always be followed by a capital letter!
        let (next_ch, new_state) = utf8_char().parse(arena, state)?;
-        state = new_state;
+                state = state.advance_without_indenting(bytes_parsed)?;
            } else if first_ch == '@' {
                state = state.advance_without_indenting(bytes_parsed)?;
-        if next_ch.is_uppercase() {
+                // '@' must always be followed by a capital letter!
-            part_buf.push('@');
+                match peek_utf8_char(&state) {
-            part_buf.push(next_ch);
+                    Ok((next_ch, next_bytes_parsed)) => {
                        if next_ch.is_uppercase() {
                            state = state.advance_without_indenting(next_bytes_parsed)?;
-            is_private_tag = true;
+                            part_buf.push('@');
-            is_capitalized = true;
+                            part_buf.push(next_ch);
-            is_accessor_fn = false;
+
-        } else {
+                            is_private_tag = true;
-            return Err(unexpected(0, state, Attempting::Identifier));
+                            is_capitalized = true;
                            is_accessor_fn = false;
                        } else {
                            return Err(unexpected(
                                bytes_parsed + next_bytes_parsed,
                                state,
                                Attempting::Identifier,
                            ));
                        }
                    }
                    Err(reason) => return state.fail(reason),
                }
            } else {
                return Err(unexpected(0, state, Attempting::Identifier));
            }
        }
-    } else {
+        Err(reason) => return state.fail(reason),
        return Err(unexpected(0, state, Attempting::Identifier));
    }
    while !state.bytes.is_empty() {
-        let (ch, new_state) = utf8_char().parse(arena, state)?;
+        match peek_utf8_char(&state) {
            Ok((ch, bytes_parsed)) => {
                // After the first character, only these are allowed:
                //
                // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
                // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
                // * A dot ('.')
                if ch.is_alphabetic() {
                    if part_buf.is_empty() {
                        // Capitalization is determined by the first character in the part.
                        is_capitalized = ch.is_uppercase();
                    }
-        state = new_state;
+                    part_buf.push(ch);
                } else if ch.is_ascii_digit() {
                    // Parts may not start with numbers!
                    if part_buf.is_empty() {
                        return malformed(
                            Some(ch),
                            arena,
                            state,
                            capitalized_parts,
                            noncapitalized_parts,
                        );
                    }
-        // After the first character, only these are allowed:
+                    part_buf.push(ch);
-        //
+                } else if ch == '.' {
-        // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
+                    // There are two possible errors here:
-        // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
+                    //
-        // * A dot ('.')
+                    // 1. Having two consecutive dots is an error.
-        if ch.is_alphabetic() {
+                    // 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error.
-            if part_buf.is_empty() {
+                    if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) {
-                // Capitalization is determined by the first character in the part.
+                        return malformed(
-                is_capitalized = ch.is_uppercase();
+                            Some(ch),
                            arena,
                            state,
                            capitalized_parts,
                            noncapitalized_parts,
                        );
                    }
                    if is_capitalized {
                        capitalized_parts.push(part_buf.into_bump_str());
                    } else {
                        noncapitalized_parts.push(part_buf.into_bump_str());
                    }
                    // Now that we've recorded the contents of the current buffer, reset it.
                    part_buf = String::new_in(arena);
                } else {
                    // This must be the end of the identifier. We're done!
                    break;
                }
                state = state.advance_without_indenting(bytes_parsed)?;
            }
-
+            Err(reason) => return state.fail(reason),
            part_buf.push(ch);
        } else if ch.is_ascii_digit() {
            // Parts may not start with numbers!
            if part_buf.is_empty() {
                return malformed(
                    Some(ch),
                    arena,
                    state,
                    capitalized_parts,
                    noncapitalized_parts,
                );
            }
            part_buf.push(ch);
        } else if ch == '.' {
            // There are two possible errors here:
            //
            // 1. Having two consecutive dots is an error.
            // 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error.
            if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) {
                return malformed(
                    Some(ch),
                    arena,
                    state,
                    capitalized_parts,
                    noncapitalized_parts,
                );
            }
            if is_capitalized {
                capitalized_parts.push(part_buf.into_bump_str());
            } else {
                noncapitalized_parts.push(part_buf.into_bump_str());
            }
            // Now that we've recorded the contents of the current buffer, reset it.
            part_buf = String::new_in(arena);
        } else {
            // This must be the end of the identifier. We're done!
            break;
        }
    }
@ -262,26 +281,27 @@ fn malformed<'a>(
    let mut next_char = None;
    while !state.bytes.is_empty() {
-        let (ch, new_state) = utf8_char().parse(arena, state)?;
+        match peek_utf8_char(&state) {
            Ok((ch, bytes_parsed)) => {
                // We can't use ch.is_alphanumeric() here because that passes for
                // things that are "numeric" but not ASCII digits, like `¾`
                if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
                    full_string.push(ch);
                } else {
                    next_char = Some(ch);
-        state = new_state;
+                    break;
                }
-        // We can't use ch.is_alphanumeric() here because that passes for
+                state = state.advance_without_indenting(bytes_parsed)?;
-        // things that are "numeric" but not ASCII digits, like `¾`
+            }
-        if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
+            Err(reason) => return state.fail(reason),
            full_string.push(ch);
        } else {
            next_char = Some(ch);
            break;
        }
    }
    let chars_parsed = full_string.len();
    Ok((
        (Ident::Malformed(full_string.into_bump_str()), next_char),
-        state.advance_without_indenting(chars_parsed)?,
+        state,
    ))
 }
@ -298,42 +318,47 @@ pub fn global_tag_or_ident<'a, F>(pred: F) -> impl Parser<'a, &'a str>
 where
    F: Fn(char) -> bool,
 {
-    move |arena, state: State<'a>| {
+    move |arena, mut state: State<'a>| {
        // pred will determine if this is a tag or ident (based on capitalization)
-        let (first_letter, mut state) = utf8_char().parse(arena, state)?;
+        let (first_letter, bytes_parsed) = match peek_utf8_char(&state) {
            Ok((first_letter, bytes_parsed)) => {
                if !pred(first_letter) {
                    return Err(unexpected(0, state, Attempting::RecordFieldLabel));
                }
-        if !pred(first_letter) {
+                (first_letter, bytes_parsed)
-            return Err(unexpected(0, state, Attempting::RecordFieldLabel));
+            }
-        }
+            Err(reason) => return state.fail(reason),
        };
        let mut buf = String::with_capacity_in(1, arena);
        buf.push(first_letter);
        state = state.advance_without_indenting(bytes_parsed)?;
        while !state.bytes.is_empty() {
-            let (ch, new_state) = utf8_char().parse(arena, state)?;
+            match peek_utf8_char(&state) {
                Ok((ch, bytes_parsed)) => {
                    // After the first character, only these are allowed:
                    //
                    // * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
                    // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
                    // * A ':' indicating the end of the field
                    if ch.is_alphabetic() || ch.is_ascii_digit() {
                        buf.push(ch);
-            state = new_state;
+                        state = state.advance_without_indenting(bytes_parsed)?;
-
+                    } else {
-            // After the first character, only these are allowed:
+                        // This is the end of the field. We're done!
-            //
+                        break;
-            // * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
+                    }
-            // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
+                }
-            // * A ':' indicating the end of the field
+                Err(reason) => return state.fail(reason),
-            if ch.is_alphabetic() || ch.is_ascii_digit() {
+            };
                buf.push(ch);
            } else {
                // This is the end of the field. We're done!
                break;
            }
        }
-        let chars_parsed = buf.len();
+        Ok((buf.into_bump_str(), state))
        Ok((
            buf.into_bump_str(),
            state.advance_without_indenting(chars_parsed)?,
        ))
    }
 }
--- a/compiler/parse/src/module.rs
+++ b/compiler/parse/src/module.rs
@ -7,7 +7,8 @@ use crate::expr::def;
 use crate::header::ModuleName;
 use crate::ident::unqualified_ident;
 use crate::parser::{
-    self, ascii_char, ascii_string, loc, optional, unexpected, utf8_char, Parser, State,
+    self, ascii_char, ascii_string, loc, optional, peek_utf8_char, peek_utf8_char_at, unexpected,
    Parser, State,
 };
 use bumpalo::collections::{String, Vec};
 use roc_region::all::Located;
@ -61,57 +62,68 @@ pub fn interface_header<'a>() -> impl Parser<'a, InterfaceHeader<'a>> {
 #[inline(always)]
 pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> {
-    move |arena, state: State<'a>| {
+    move |arena, mut state: State<'a>| {
-        let (first_letter, mut state) = utf8_char().parse(arena, state)?;
+        match peek_utf8_char(&state) {
            Ok((first_letter, bytes_parsed)) => {
                if !first_letter.is_uppercase() {
                    return Err(unexpected(0, state, Attempting::Module));
                };
-        if !first_letter.is_uppercase() {
+                let mut buf = String::with_capacity_in(1, arena);
            return Err(unexpected(0, state, Attempting::Module));
        };
-        let mut buf = String::with_capacity_in(1, arena);
+                buf.push(first_letter);
-        buf.push(first_letter);
+                state = state.advance_without_indenting(bytes_parsed)?;
-        while !state.bytes.is_empty() {
+                while !state.bytes.is_empty() {
-            let (ch, new_state) = utf8_char().parse(arena, state)?;
+                    match peek_utf8_char(&state) {
                        Ok((ch, bytes_parsed)) => {
                            // After the first character, only these are allowed:
                            //
                            // * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
                            // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
                            // * A '.' separating module parts
                            if ch.is_alphabetic() || ch.is_ascii_digit() {
                                buf.push(ch);
-            state = new_state;
+                                state = state.advance_without_indenting(bytes_parsed)?;
                            } else if ch == '.' {
                                match peek_utf8_char_at(&state, 1) {
                                    Ok((next, next_bytes_parsed)) => {
                                        if next.is_uppercase() {
                                            // If we hit another uppercase letter, keep going!
                                            buf.push('.');
                                            buf.push(next);
-            // After the first character, only these are allowed:
+                                            state = state.advance_without_indenting(
-            //
+                                                bytes_parsed + next_bytes_parsed,
-            // * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
+                                            )?;
-            // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
+                                        } else {
-            // * A '.' separating module parts
+                                            // We have finished parsing the module name.
-            if ch.is_alphabetic() || ch.is_ascii_digit() {
+                                            //
-                buf.push(ch);
+                                            // There may be an identifier after this '.',
-            } else if ch == '.' {
+                                            // e.g. "baz" in `Foo.Bar.baz`
-                let (next, new_state) = utf8_char().parse(arena, state)?;
+                                            return Ok((
-
+                                                ModuleName::new(buf.into_bump_str()),
-                state = new_state;
+                                                state,
-
+                                            ));
-                if next.is_uppercase() {
+                                        }
-                    // If we hit another uppercase letter, keep going!
+                                    }
-                    buf.push('.');
+                                    Err(reason) => return state.fail(reason),
-                    buf.push(next);
+                                }
-                } else {
+                            } else {
-                    let chars_parsed = buf.len();
+                                // This is the end of the module name. We're done!
-
+                                break;
-                    // We have finished parsing the module name.
+                            }
-                    //
+                        }
-                    // There may be an identifier after this '.',
+                        Err(reason) => return state.fail(reason),
-                    // e.g. "baz" in `Foo.Bar.baz`
+                    }
                    return Ok((
                        ModuleName::new(buf.into_bump_str()),
                        state.advance_without_indenting(chars_parsed)?,
                    ));
                }
            } else {
                // This is the end of the module name. We're done!
                break;
            }
        }
-        Ok((ModuleName::new(buf.into_bump_str()), state))
+                Ok((ModuleName::new(buf.into_bump_str()), state))
            }
            Err(reason) => state.fail(reason),
        }
    }
 }
--- a/compiler/parse/src/parser.rs
+++ b/compiler/parse/src/parser.rs
@ -3,11 +3,12 @@ use bumpalo::collections::vec::Vec;
 use bumpalo::Bump;
 use encode_unicode::CharExt;
 use roc_region::all::{Located, Region};
 use std::fmt;
 use std::str::from_utf8;
 use std::{char, mem, u16};
 /// A position in a source file.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Clone, PartialEq, Eq)]
 pub struct State<'a> {
    /// The raw input bytes from the file.
    pub bytes: &'a [u8],
@ -101,7 +102,7 @@ impl<'a> State<'a> {
    /// This assumes we are *not* advancing with spaces, or at least that
    /// any spaces on the line were preceded by non-spaces - which would mean
    /// they weren't eligible to indent anyway.
-    pub fn advance_without_indenting(&self, quantity: usize) -> Result<Self, (Fail, Self)> {
+    pub fn advance_without_indenting(self, quantity: usize) -> Result<Self, (Fail, Self)> {
        match (self.column as usize).checked_add(quantity) {
            Some(column_usize) if column_usize <= u16::MAX as usize => {
                Ok(State {
@ -184,6 +185,24 @@ impl<'a> State<'a> {
    }
 }
 impl<'a> fmt::Debug for State<'a> {
    /// Writes a multi-line, tab-indented dump of this state.
    ///
    /// The `bytes` field is rendered as a string when it is valid UTF-8 and
    /// as the raw byte slice otherwise; the remaining fields follow, one per line.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "State {{")?;

        // Prefer the readable string form; fall back to raw bytes on invalid UTF-8.
        if let Ok(text) = from_utf8(self.bytes) {
            write!(f, "\n\tbytes: [utf8] {:?}", text)?;
        } else {
            write!(f, "\n\tbytes: [invalid utf8] {:?}", self.bytes)?;
        }

        // Position within the source.
        write!(f, "\n\t(line, col): ({}, {}),", self.line, self.column)?;

        // Indentation bookkeeping.
        write!(f, "\n\tindent_col: {}", self.indent_col)?;
        write!(f, "\n\tis_indenting: {:?}", self.is_indenting)?;

        // Remaining fields.
        write!(f, "\n\tattempting: {:?}", self.attempting)?;
        write!(f, "\n\toriginal_len: {}", self.original_len)?;

        write!(f, "\n}}")
    }
 }
 #[test]
 fn state_size() {
    // State should always be under 8 machine words, so it fits in a typical
@ -428,14 +447,14 @@ pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
 /// A single UTF-8-encoded char. This will both parse *and* validate that the
 /// char is valid UTF-8.
-pub fn utf8_char<'a>() -> impl Parser<'a, char> {
+pub fn utf8_char2<'a>() -> impl Parser<'a, char> {
    move |_arena, state: State<'a>| {
        if !state.bytes.is_empty() {
            match char::from_utf8_slice_start(state.bytes) {
                Ok((ch, bytes_parsed)) => {
                    return Ok((ch, state.advance_without_indenting(bytes_parsed)?))
                }
-                Err(_) => return state.fail(dbg!(FailReason::BadUtf8)),
+                Err(_) => return state.fail(FailReason::BadUtf8),
            }
        } else {
            Err(unexpected_eof(0, state.attempting, state))
@ -445,17 +464,40 @@ pub fn utf8_char<'a>() -> impl Parser<'a, char> {
 /// A single UTF-8-encoded char. This will both parse *and* validate that the
 /// char is valid UTF-8, but it will *not* advance the state.
-pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<char, Fail> {
+pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<(char, usize), FailReason> {
-    match char::from_utf8_slice_start(state.bytes) {
+    if !state.bytes.is_empty() {
-        Ok((ch, _)) => Ok(ch),
+        match char::from_utf8_slice_start(state.bytes) {
-        Err(_) => Err(Fail {
+            Ok((ch, len_utf8)) => Ok((ch, len_utf8)),
-            reason: dbg!(FailReason::BadUtf8),
+            Err(_) => Err(FailReason::BadUtf8),
-            attempting: state.attempting,
+        }
-        }),
+    } else {
        Err(FailReason::Eof(
            Region::zero(), /* TODO get a better region */
        ))
    }
 }
-/// A hardcoded string consisting only of ASCII characters.
+/// A single UTF-8-encoded char, with an offset. This will both parse *and*
 /// validate that the char is valid UTF-8, but it will *not* advance the state.
 pub fn peek_utf8_char_at<'a>(
    state: &State<'a>,
    offset: usize,
 ) -> Result<(char, usize), FailReason> {
    // `get(offset..)` yields `Some` for any offset up to and including the
    // length; the non-empty guard narrows that to "at least one byte remains
    // at `offset`", i.e. `state.bytes.len() > offset`.
    match state.bytes.get(offset..) {
        Some(remaining) if !remaining.is_empty() => {
            // Decode one char from the front of the remaining bytes, returning
            // it with its encoded length. Any decoding error is reported as BadUtf8.
            char::from_utf8_slice_start(remaining).map_err(|_| FailReason::BadUtf8)
        }
        // Nothing left at (or past) the requested offset.
        _ => Err(FailReason::Eof(
            Region::zero(), /* TODO get a better region */
        )),
    }
 }
 /// A hardcoded string with no newlines, consisting only of ASCII characters
 pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
    // Verify that this really is exclusively ASCII characters.
    // The `unsafe` block in this function relies upon this assumption!
@ -472,10 +514,12 @@ pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
            // SAFETY: Roc language keywords are statically known to contain only
            // ASCII characters, which means their &str will be 100% u8 values in
            // memory, and thus can be safely interpreted as &[u8]
-            Some(next_str)
+            Some(next_str) => {
-                if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } =>
+                if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } {
-            {
+                    Ok(((), state.advance_without_indenting(len)?))
-                Ok(((), state.advance_without_indenting(len)?))
+                } else {
                    Err(unexpected(len, state, Attempting::Keyword))
                }
            }
            _ => Err(unexpected_eof(0, Attempting::Keyword, state)),
        }
@ -1126,6 +1170,6 @@ where
 pub fn parse_utf8<'a>(bytes: &'a [u8]) -> Result<&'a str, FailReason> {
    match from_utf8(bytes) {
        Ok(string) => Ok(string),
-        Err(_) => Err(dbg!(FailReason::BadUtf8)),
+        Err(_) => Err(FailReason::BadUtf8),
    }
 }
--- a/compiler/parse/src/type_annotation.rs
+++ b/compiler/parse/src/type_annotation.rs
@ -4,8 +4,8 @@ use crate::expr::{global_tag, private_tag};
 use crate::ident::join_module_parts;
 use crate::keyword;
 use crate::parser::{
-    allocated, ascii_char, ascii_string, not, optional, unexpected, utf8_char, Either, ParseResult,
+    allocated, ascii_char, ascii_string, not, optional, peek_utf8_char, unexpected, Either,
-    Parser, State,
+    ParseResult, Parser, State,
 };
 use bumpalo::collections::string::String;
 use bumpalo::collections::vec::Vec;
@ -263,61 +263,69 @@ fn expression<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>
 fn parse_concrete_type<'a>(
    arena: &'a Bump,
-    state: State<'a>,
+    mut state: State<'a>,
 ) -> ParseResult<'a, TypeAnnotation<'a>> {
    let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
    let mut parts: Vec<&'a str> = Vec::new_in(arena);
    // Qualified types must start with a capitalized letter.
-    let (first_letter, mut state) = utf8_char().parse(arena, state)?;
+    match peek_utf8_char(&state) {
        Ok((first_letter, bytes_parsed)) => {
            if first_letter.is_alphabetic() && first_letter.is_uppercase() {
                part_buf.push(first_letter);
            } else {
                return Err(unexpected(0, state, Attempting::ConcreteType));
            }
-    if first_letter.is_alphabetic() && first_letter.is_uppercase() {
+            state = state.advance_without_indenting(bytes_parsed)?;
-        part_buf.push(first_letter);
+        }
-    } else {
+        Err(reason) => return state.fail(reason),
        return Err(unexpected(0, state, Attempting::ConcreteType));
    }
    let mut next_char = None;
    while !state.bytes.is_empty() {
-        let (ch, new_state) = utf8_char().parse(arena, state)?;
+        match peek_utf8_char(&state) {
            Ok((ch, bytes_parsed)) => {
                // After the first character, only these are allowed:
                //
                // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
                // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
                // * A dot ('.')
                if ch.is_alphabetic() {
                    if part_buf.is_empty() && !ch.is_uppercase() {
                        // Each part must begin with a capital letter.
                        return malformed(Some(ch), arena, state, parts);
                    }
-        state = new_state;
+                    part_buf.push(ch);
                } else if ch.is_ascii_digit() {
                    // Parts may not start with numbers!
                    if part_buf.is_empty() {
                        return malformed(Some(ch), arena, state, parts);
                    }
-        // After the first character, only these are allowed:
+                    part_buf.push(ch);
-        //
+                } else if ch == '.' {
-        // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
+                    // Having two consecutive dots is an error.
-        // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
+                    if part_buf.is_empty() {
-        // * A dot ('.')
+                        return malformed(Some(ch), arena, state, parts);
-        if ch.is_alphabetic() {
+                    }
-            if part_buf.is_empty() && !ch.is_uppercase() {
+
-                // Each part must begin with a capital letter.
+                    parts.push(part_buf.into_bump_str());
-                return malformed(Some(ch), arena, state, parts);
+
                    // Now that we've recorded the contents of the current buffer, reset it.
                    part_buf = String::new_in(arena);
                } else {
                    // This must be the end of the type. We're done!
                    next_char = Some(ch);
                    break;
                }
                state = state.advance_without_indenting(bytes_parsed)?;
            }
-
+            Err(reason) => return state.fail(reason),
            part_buf.push(ch);
        } else if ch.is_ascii_digit() {
            // Parts may not start with numbers!
            if part_buf.is_empty() {
                return malformed(Some(ch), arena, state, parts);
            }
            part_buf.push(ch);
        } else if ch == '.' {
            // Having two consecutive dots is an error.
            if part_buf.is_empty() {
                return malformed(Some(ch), arena, state, parts);
            }
            parts.push(part_buf.into_bump_str());
            // Now that we've recorded the contents of the current buffer, reset it.
            part_buf = String::new_in(arena);
        } else {
            // This must be the end of the type. We're done!
            next_char = Some(ch);
            break;
        }
    }
@ -349,31 +357,41 @@ fn parse_concrete_type<'a>(
 fn parse_type_variable<'a>(
    arena: &'a Bump,
-    state: State<'a>,
+    mut state: State<'a>,
 ) -> ParseResult<'a, TypeAnnotation<'a>> {
    let mut buf = String::new_in(arena);
    let (first_letter, mut state) = utf8_char().parse(arena, state)?;
-    // Type variables must start with a lowercase letter.
+    match peek_utf8_char(&state) {
-    if first_letter.is_alphabetic() && first_letter.is_lowercase() {
+        Ok((first_letter, bytes_parsed)) => {
-        buf.push(first_letter);
+            // Type variables must start with a lowercase letter.
-    } else {
+            if first_letter.is_alphabetic() && first_letter.is_lowercase() {
-        return Err(unexpected(0, state, Attempting::TypeVariable));
+                buf.push(first_letter);
            } else {
                return Err(unexpected(0, state, Attempting::TypeVariable));
            }
            state = state.advance_without_indenting(bytes_parsed)?;
        }
        Err(reason) => return state.fail(reason),
    }
    while !state.bytes.is_empty() {
-        let (ch, new_state) = utf8_char().parse(arena, state)?;
+        match peek_utf8_char(&state) {
            Ok((ch, bytes_parsed)) => {
                // After the first character, only these are allowed:
                //
                // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
                // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
                if ch.is_alphabetic() || ch.is_ascii_digit() {
                    buf.push(ch);
                } else {
                    // This must be the end of the type. We're done!
                    break;
                }
-        state = new_state;
+                state = state.advance_without_indenting(bytes_parsed)?;
-        // After the first character, only these are allowed:
+            }
-        //
+            Err(reason) => return state.fail(reason),
        // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
        // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
        if ch.is_alphabetic() || ch.is_ascii_digit() {
            buf.push(ch);
        } else {
            // This must be the end of the type. We're done!
            break;
        }
    }
@ -399,22 +417,24 @@ fn malformed<'a>(
    // Consume the remaining chars in the identifier.
    while !state.bytes.is_empty() {
-        let (ch, new_state) = utf8_char().parse(arena, state)?;
+        match peek_utf8_char(&state) {
            Ok((ch, bytes_parsed)) => {
                // We can't use ch.is_alphanumeric() here because that passes for
                // things that are "numeric" but not ASCII digits, like `¾`
                if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
                    full_string.push(ch);
                } else {
                    break;
                }
-        state = new_state;
+                state = state.advance_without_indenting(bytes_parsed)?;
-        // We can't use ch.is_alphanumeric() here because that passes for
+            }
-        // things that are "numeric" but not ASCII digits, like `¾`
+            Err(reason) => return state.fail(reason),
        if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
            full_string.push(ch);
        } else {
            break;
        }
    }
    let chars_parsed = full_string.len();
    Ok((
        TypeAnnotation::Malformed(full_string.into_bump_str()),
-        state.advance_without_indenting(chars_parsed)?,
+        state,
    ))
 }