moved all crates into seperate folder + related path fixes

2025-09-26 13:29:12 +00:00 · 2022-07-01 17:37:43 +02:00 · 2022-07-01 17:37:43 +02:00 · eee85fa45d
commit eee85fa45d
parent 12ef03bb86
1063 changed files with 92 additions and 93 deletions
--- a/crates/compiler/parse/src/ident.rs
+++ b/crates/compiler/parse/src/ident.rs
@ -0,0 +1,546 @@
+use crate::parser::Progress::{self, *};
+use crate::parser::{BadInputError, EExpr, ParseResult, Parser};
+use crate::state::State;
+use bumpalo::collections::vec::Vec;
+use bumpalo::Bump;
+use roc_region::all::Position;
+
+/// A tag, for example. Must start with an uppercase letter
+/// and then contain only letters and numbers afterwards - no dots allowed!
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub struct UppercaseIdent<'a>(&'a str);
+
+impl<'a> From<&'a str> for UppercaseIdent<'a> {
+    fn from(string: &'a str) -> Self {
+        UppercaseIdent(string)
+    }
+}
+
+impl<'a> From<UppercaseIdent<'a>> for &'a str {
+    fn from(ident: UppercaseIdent<'a>) -> Self {
+        ident.0
+    }
+}
+
+impl<'a> From<&'a UppercaseIdent<'a>> for &'a str {
+    fn from(ident: &'a UppercaseIdent<'a>) -> Self {
+        ident.0
+    }
+}
+
+/// The parser accepts all of these in any position where any one of them could
+/// appear. This way, canonicalization can give more helpful error messages like
+/// "you can't redefine this tag!" if you wrote `Foo = ...` or
+/// "you can only define unqualified constants" if you wrote `Foo.bar = ...`
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Ident<'a> {
+    /// Foo or Bar
+    Tag(&'a str),
+    /// @Foo or @Bar
+    OpaqueRef(&'a str),
+    /// foo or foo.bar or Foo.Bar.baz.qux
+    Access {
+        module_name: &'a str,
+        parts: &'a [&'a str],
+    },
+    /// .foo { foo: 42 }
+    AccessorFunction(&'a str),
+    /// .Foo or foo. or something like foo.Bar
+    Malformed(&'a str, BadIdent),
+}
+
+impl<'a> Ident<'a> {
+    pub fn len(&self) -> usize {
+        use self::Ident::*;
+
+        match self {
+            Tag(string) | OpaqueRef(string) => string.len(),
+            Access { module_name, parts } => {
+                let mut len = if module_name.is_empty() {
+                    0
+                } else {
+                    module_name.len() + 1
+                    // +1 for the dot
+                };
+
+                for part in parts.iter() {
+                    len += part.len() + 1 // +1 for the dot
+                }
+
+                len - 1
+            }
+            AccessorFunction(string) => string.len(),
+            Malformed(string, _) => string.len(),
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+
+/// This could be:
+///
+/// * A record field, e.g. "email" in `.email` or in `email:`
+/// * A named pattern match, e.g. "foo" in `foo =` or `foo ->` or `\foo ->`
+pub fn lowercase_ident<'a>() -> impl Parser<'a, &'a str, ()> {
+    move |_, state: State<'a>| match chomp_lowercase_part(state.bytes()) {
+        Err(progress) => Err((progress, (), state)),
+        Ok(ident) => {
+            if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) {
+                Err((NoProgress, (), state))
+            } else {
+                let width = ident.len();
+                Ok((MadeProgress, ident, state.advance(width)))
+            }
+        }
+    }
+}
+
+pub fn tag_name<'a>() -> impl Parser<'a, &'a str, ()> {
+    move |arena, state: State<'a>| uppercase_ident().parse(arena, state)
+}
+
+/// This could be:
+///
+/// * A module name
+/// * A type name
+/// * A tag
+pub fn uppercase<'a>() -> impl Parser<'a, UppercaseIdent<'a>, ()> {
+    move |_, state: State<'a>| match chomp_uppercase_part(state.bytes()) {
+        Err(progress) => Err((progress, (), state)),
+        Ok(ident) => {
+            let width = ident.len();
+            Ok((MadeProgress, ident.into(), state.advance(width)))
+        }
+    }
+}
+
+/// This could be:
+///
+/// * A module name
+/// * A type name
+/// * A tag
+pub fn uppercase_ident<'a>() -> impl Parser<'a, &'a str, ()> {
+    move |_, state: State<'a>| match chomp_uppercase_part(state.bytes()) {
+        Err(progress) => Err((progress, (), state)),
+        Ok(ident) => {
+            let width = ident.len();
+            Ok((MadeProgress, ident, state.advance(width)))
+        }
+    }
+}
+
+pub fn unqualified_ident<'a>() -> impl Parser<'a, &'a str, ()> {
+    move |_, state: State<'a>| match chomp_part(|c| c.is_alphabetic(), state.bytes()) {
+        Err(progress) => Err((progress, (), state)),
+        Ok(ident) => {
+            if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) {
+                Err((MadeProgress, (), state))
+            } else {
+                let width = ident.len();
+                Ok((MadeProgress, ident, state.advance(width)))
+            }
+        }
+    }
+}
+
+macro_rules! advance_state {
+    ($state:expr, $n:expr) => {
+        Ok($state.advance($n))
+    };
+}
+
+pub fn parse_ident<'a>(arena: &'a Bump, state: State<'a>) -> ParseResult<'a, Ident<'a>, EExpr<'a>> {
+    let initial = state.clone();
+
+    match parse_ident_help(arena, state) {
+        Ok((progress, ident, state)) => {
+            if let Ident::Access { module_name, parts } = ident {
+                if module_name.is_empty() {
+                    if let Some(first) = parts.first() {
+                        for keyword in crate::keyword::KEYWORDS.iter() {
+                            if first == keyword {
+                                return Err((NoProgress, EExpr::Start(initial.pos()), initial));
+                            }
+                        }
+                    }
+                }
+            }
+
+            Ok((progress, ident, state))
+        }
+        Err((NoProgress, _, state)) => Err((NoProgress, EExpr::Start(state.pos()), state)),
+        Err((MadeProgress, fail, state)) => match fail {
+            BadIdent::Start(pos) => Err((NoProgress, EExpr::Start(pos), state)),
+            BadIdent::Space(e, pos) => Err((NoProgress, EExpr::Space(e, pos), state)),
+            _ => malformed_identifier(initial.bytes(), fail, state),
+        },
+    }
+}
+
+fn malformed_identifier<'a>(
+    initial_bytes: &'a [u8],
+    problem: BadIdent,
+    mut state: State<'a>,
+) -> ParseResult<'a, Ident<'a>, EExpr<'a>> {
+    let chomped = chomp_malformed(state.bytes());
+    let delta = initial_bytes.len() - state.bytes().len();
+    let parsed_str = unsafe { std::str::from_utf8_unchecked(&initial_bytes[..chomped + delta]) };
+
+    state = state.advance(chomped);
+
+    Ok((MadeProgress, Ident::Malformed(parsed_str, problem), state))
+}
+
+/// skip forward to the next non-identifier character
+pub fn chomp_malformed(bytes: &[u8]) -> usize {
+    use encode_unicode::CharExt;
+    let mut chomped = 0;
+    while let Ok((ch, width)) = char::from_utf8_slice_start(&bytes[chomped..]) {
+        // We can't use ch.is_alphanumeric() here because that passes for
+        // things that are "numeric" but not ASCII digits, like `¾`
+        if ch == '.' || ch == '_' || ch.is_alphabetic() || ch.is_ascii_digit() {
+            chomped += width;
+            continue;
+        } else {
+            break;
+        }
+    }
+
+    chomped
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum BadIdent {
+    Start(Position),
+    Space(BadInputError, Position),
+
+    Underscore(Position),
+    QualifiedTag(Position),
+    WeirdAccessor(Position),
+    WeirdDotAccess(Position),
+    WeirdDotQualified(Position),
+    StrayDot(Position),
+    BadOpaqueRef(Position),
+}
+
+fn chomp_lowercase_part(buffer: &[u8]) -> Result<&str, Progress> {
+    chomp_part(|c: char| c.is_lowercase(), buffer)
+}
+
+fn chomp_uppercase_part(buffer: &[u8]) -> Result<&str, Progress> {
+    chomp_part(|c: char| c.is_uppercase(), buffer)
+}
+
+#[inline(always)]
+fn chomp_part<F>(leading_is_good: F, buffer: &[u8]) -> Result<&str, Progress>
+where
+    F: Fn(char) -> bool,
+{
+    use encode_unicode::CharExt;
+
+    let mut chomped = 0;
+
+    if let Ok((ch, width)) = char::from_utf8_slice_start(&buffer[chomped..]) {
+        if leading_is_good(ch) {
+            chomped += width;
+        } else {
+            return Err(NoProgress);
+        }
+    }
+
+    while let Ok((ch, width)) = char::from_utf8_slice_start(&buffer[chomped..]) {
+        if ch.is_alphabetic() || ch.is_ascii_digit() {
+            chomped += width;
+        } else {
+            // we're done
+            break;
+        }
+    }
+
+    if chomped == 0 {
+        Err(NoProgress)
+    } else {
+        let name = unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) };
+
+        Ok(name)
+    }
+}
+
+/// a `.foo` accessor function
+fn chomp_accessor(buffer: &[u8], pos: Position) -> Result<&str, BadIdent> {
+    // assumes the leading `.` has been chomped already
+    use encode_unicode::CharExt;
+
+    match chomp_lowercase_part(buffer) {
+        Ok(name) => {
+            let chomped = name.len();
+
+            if let Ok(('.', _)) = char::from_utf8_slice_start(&buffer[chomped..]) {
+                Err(BadIdent::WeirdAccessor(pos))
+            } else {
+                Ok(name)
+            }
+        }
+        Err(_) => {
+            // we've already made progress with the initial `.`
+            Err(BadIdent::StrayDot(pos.bump_column(1)))
+        }
+    }
+}
+
+/// a `@Token` opaque
+fn chomp_opaque_ref(buffer: &[u8], pos: Position) -> Result<&str, BadIdent> {
+    // assumes the leading `@` has NOT been chomped already
+    debug_assert_eq!(buffer.get(0), Some(&b'@'));
+    use encode_unicode::CharExt;
+
+    let bad_ident = BadIdent::BadOpaqueRef;
+
+    match chomp_uppercase_part(&buffer[1..]) {
+        Ok(name) => {
+            let width = 1 + name.len();
+
+            if let Ok(('.', _)) = char::from_utf8_slice_start(&buffer[width..]) {
+                Err(bad_ident(pos.bump_column(width as u32)))
+            } else {
+                let value = unsafe { std::str::from_utf8_unchecked(&buffer[..width]) };
+                Ok(value)
+            }
+        }
+        Err(_) => Err(bad_ident(pos.bump_column(1))),
+    }
+}
+
+fn chomp_identifier_chain<'a>(
+    arena: &'a Bump,
+    buffer: &'a [u8],
+    pos: Position,
+) -> Result<(u32, Ident<'a>), (u32, BadIdent)> {
+    use encode_unicode::CharExt;
+
+    let first_is_uppercase;
+    let mut chomped = 0;
+
+    match char::from_utf8_slice_start(&buffer[chomped..]) {
+        Ok((ch, width)) => match ch {
+            '.' => match chomp_accessor(&buffer[1..], pos) {
+                Ok(accessor) => {
+                    let bytes_parsed = 1 + accessor.len();
+
+                    return Ok((bytes_parsed as u32, Ident::AccessorFunction(accessor)));
+                }
+                Err(fail) => return Err((1, fail)),
+            },
+            '@' => match chomp_opaque_ref(buffer, pos) {
+                Ok(tagname) => {
+                    let bytes_parsed = tagname.len();
+
+                    let ident = Ident::OpaqueRef;
+
+                    return Ok((bytes_parsed as u32, ident(tagname)));
+                }
+                Err(fail) => return Err((1, fail)),
+            },
+            c if c.is_alphabetic() => {
+                // fall through
+                chomped += width;
+                first_is_uppercase = c.is_uppercase();
+            }
+            _ => {
+                return Err((0, BadIdent::Start(pos)));
+            }
+        },
+        Err(_) => return Err((0, BadIdent::Start(pos))),
+    }
+
+    while let Ok((ch, width)) = char::from_utf8_slice_start(&buffer[chomped..]) {
+        if ch.is_alphabetic() || ch.is_ascii_digit() {
+            chomped += width;
+        } else {
+            // we're done
+            break;
+        }
+    }
+
+    if let Ok(('.', _)) = char::from_utf8_slice_start(&buffer[chomped..]) {
+        let module_name = if first_is_uppercase {
+            match chomp_module_chain(&buffer[chomped..]) {
+                Ok(width) => {
+                    chomped += width as usize;
+                    unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) }
+                }
+                Err(MadeProgress) => todo!(),
+                Err(NoProgress) => unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) },
+            }
+        } else {
+            ""
+        };
+
+        let mut parts = Vec::with_capacity_in(4, arena);
+
+        if !first_is_uppercase {
+            let first_part = unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) };
+            parts.push(first_part);
+        }
+
+        match chomp_access_chain(&buffer[chomped..], &mut parts) {
+            Ok(width) => {
+                chomped += width as usize;
+
+                let ident = Ident::Access {
+                    module_name,
+                    parts: parts.into_bump_slice(),
+                };
+
+                Ok((chomped as u32, ident))
+            }
+            Err(0) if !module_name.is_empty() => Err((
+                chomped as u32,
+                BadIdent::QualifiedTag(pos.bump_column(chomped as u32)),
+            )),
+            Err(1) if parts.is_empty() => Err((
+                chomped as u32 + 1,
+                BadIdent::WeirdDotQualified(pos.bump_column(chomped as u32 + 1)),
+            )),
+            Err(width) => Err((
+                chomped as u32 + width,
+                BadIdent::WeirdDotAccess(pos.bump_column(chomped as u32 + width)),
+            )),
+        }
+    } else if let Ok(('_', _)) = char::from_utf8_slice_start(&buffer[chomped..]) {
+        // we don't allow underscores in the middle of an identifier
+        // but still parse them (and generate a malformed identifier)
+        // to give good error messages for this case
+        Err((
+            chomped as u32 + 1,
+            BadIdent::Underscore(pos.bump_column(chomped as u32 + 1)),
+        ))
+    } else if first_is_uppercase {
+        // just one segment, starting with an uppercase letter; that's a tag
+        let value = unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) };
+        Ok((chomped as u32, Ident::Tag(value)))
+    } else {
+        // just one segment, starting with a lowercase letter; that's a normal identifier
+        let value = unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) };
+        let ident = Ident::Access {
+            module_name: "",
+            parts: arena.alloc([value]),
+        };
+        Ok((chomped as u32, ident))
+    }
+}
+
+fn chomp_module_chain(buffer: &[u8]) -> Result<u32, Progress> {
+    let mut chomped = 0;
+
+    while let Some(b'.') = buffer.get(chomped) {
+        match &buffer.get(chomped + 1..) {
+            Some(slice) => match chomp_uppercase_part(slice) {
+                Ok(name) => {
+                    chomped += name.len() + 1;
+                }
+                Err(MadeProgress) => return Err(MadeProgress),
+                Err(NoProgress) => break,
+            },
+            None => return Err(MadeProgress),
+        }
+    }
+
+    if chomped == 0 {
+        Err(NoProgress)
+    } else {
+        Ok(chomped as u32)
+    }
+}
+
+pub fn concrete_type<'a>() -> impl Parser<'a, (&'a str, &'a str), ()> {
+    move |_, state: State<'a>| match chomp_concrete_type(state.bytes()) {
+        Err(progress) => Err((progress, (), state)),
+        Ok((module_name, type_name, width)) => {
+            Ok((MadeProgress, (module_name, type_name), state.advance(width)))
+        }
+    }
+}
+
+// parse a type name like `Result` or `Result.Result`
+fn chomp_concrete_type(buffer: &[u8]) -> Result<(&str, &str, usize), Progress> {
+    let first = crate::ident::chomp_uppercase_part(buffer)?;
+
+    if let Some(b'.') = buffer.get(first.len()) {
+        match crate::ident::chomp_module_chain(&buffer[first.len()..]) {
+            Err(_) => Err(MadeProgress),
+            Ok(rest) => {
+                let width = first.len() + rest as usize;
+
+                // we must explicitly check here for a trailing `.`
+                if let Some(b'.') = buffer.get(width) {
+                    return Err(MadeProgress);
+                }
+
+                let slice = &buffer[..width];
+
+                match slice.iter().rev().position(|c| *c == b'.') {
+                    None => Ok(("", first, first.len())),
+                    Some(rev_index) => {
+                        let index = slice.len() - rev_index;
+                        let module_name =
+                            unsafe { std::str::from_utf8_unchecked(&slice[..index - 1]) };
+                        let type_name = unsafe { std::str::from_utf8_unchecked(&slice[index..]) };
+
+                        Ok((module_name, type_name, width))
+                    }
+                }
+            }
+        }
+    } else {
+        Ok(("", first, first.len()))
+    }
+}
+
+fn chomp_access_chain<'a>(buffer: &'a [u8], parts: &mut Vec<'a, &'a str>) -> Result<u32, u32> {
+    let mut chomped = 0;
+
+    while let Some(b'.') = buffer.get(chomped) {
+        match &buffer.get(chomped + 1..) {
+            Some(slice) => match chomp_lowercase_part(slice) {
+                Ok(name) => {
+                    let value = unsafe {
+                        std::str::from_utf8_unchecked(
+                            &buffer[chomped + 1..chomped + 1 + name.len()],
+                        )
+                    };
+                    parts.push(value);
+
+                    chomped += name.len() + 1;
+                }
+                Err(_) => return Err(chomped as u32 + 1),
+            },
+            None => return Err(chomped as u32 + 1),
+        }
+    }
+
+    if chomped == 0 {
+        Err(0)
+    } else {
+        Ok(chomped as u32)
+    }
+}
+
+fn parse_ident_help<'a>(
+    arena: &'a Bump,
+    mut state: State<'a>,
+) -> ParseResult<'a, Ident<'a>, BadIdent> {
+    match chomp_identifier_chain(arena, state.bytes(), state.pos()) {
+        Ok((width, ident)) => {
+            state = advance_state!(state, width as usize)?;
+            Ok((MadeProgress, ident, state))
+        }
+        Err((0, fail)) => Err((NoProgress, fail, state)),
+        Err((width, fail)) => {
+            state = advance_state!(state, width as usize)?;
+            Err((MadeProgress, fail, state))
+        }
+    }
+}