First-class Usv type

2025-09-30 23:31:12 +00:00 · 2021-04-29 21:41:04 -04:00 · 2021-04-29 21:41:04 -04:00 · 81014c3790
commit 81014c3790
parent 391a4f13db
5 changed files with 119 additions and 96 deletions
--- a/compiler/builtins/docs/Bytes.roc
+++ b/compiler/builtins/docs/Bytes.roc
@ -8,22 +8,14 @@ interface Bytes
            parseUtf16Grapheme,
            parsePastUtf8,
            parsePastUtf16,
-            parseLeU16,
-            parseLeI16,
-            parseLeU32,
-            parseLeI32,
-            parseLeU64,
-            parseLeI64,
-            parseLeU128,
-            parseLeI128,
-            parseBeU16,
-            parseBeI16,
-            parseBeU32,
-            parseBeI32,
-            parseBeU64,
-            parseBeI64,
-            parseBeU128,
-            parseBeI128
+            parseU16,
+            parseI16,
+            parseU32,
+            parseI32,
+            parseU64,
+            parseI64,
+            parseU128,
+            parseI128,
        ]
    imports []

@ -37,8 +29,14 @@ len : Bytes -> Nat

 isEmpty : Bytes -> Bool

-## The endianness of the currently running system.
-hostEndianness : [ Big, Little ]
+## The [endianness](https://en.wikipedia.org/wiki/Endianness) of the currently running system.
+hostEndi : Endi
+
+## [Endianness](https://en.wikipedia.org/wiki/Endianness)
+##
+## Be - Big Endian
+## Le - Little Endian
+Endi : [ Be, Le ]

 # Access

@ -54,38 +52,30 @@ concat : Bytes, Bytes -> Bytes

 # Parsing

-
 ## Parse a [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)
 ## (USV) encoded as UTF-8.
 ##
 ## To parse an entire UTF-8 string, you can use #Bytes.toUtf8 or #Bytes.parsePastUtf8.
-parseUsvUtf8 : Bytes -> Result { answer : U32, rest : Bytes } [ Expected [ Utf8Usv ]* Bytes ]*
-parseUsvUtf16Le : Bytes -> Result { answer : U32, rest : Bytes } [ Expected [ Utf16BeUsv ]* Bytes ]*
-parseUsvUtf16Be : Bytes -> Result { answer : U32, rest : Bytes } [ Expected [ Utf16BeUsv ]* Bytes ]*
-parseGraphemeUtf8 : Bytes -> Result { answer : Str, rest : Bytes } [ Expected [ Utf8Grapheme ]* Bytes ]*
-parseGraphemeUtf16Le : Bytes -> Result { answer : Str, rest : Bytes } [ Expected [ Utf16LeGrapheme ]* Bytes ]*
-parseGraphemeUtf16Be : Bytes -> Result { answer : Str, rest : Bytes } [ Expected [ Utf16BeGrapheme ]* Bytes ]*
+parseUsvUtf8 : Bytes -> Result { val : Usv, rest : Bytes } [ Expected [ Utf8Usv ]* Bytes ]*
+parseUsvUtf16 : Bytes, Endi -> Result { val : Usv, rest : Bytes } [ Expected [ Utf16Usv Endi ]* Bytes ]*
+parseGraphemeUtf8 : Bytes -> Result { val : Str, rest : Bytes } [ Expected [ Utf8Grapheme ]* Bytes ]*
+parseGraphemeUtf16 : Bytes, Endi -> Result { val : Str, rest : Bytes } [ Expected [ Utf16Grapheme Endi ]* Bytes ]*

-## If the bytes begin with the given string, return whatever bytes come
+## If the bytes begin with the given UTF-8 string, return whatever bytes come
 ## after it.
-parsePastStr : Bytes, Str -> Result Bytes [ Expected [ ExactStr Str ]* Bytes ]*
+chompUtf8 : Bytes, Str -> Result Bytes [ Expected [ ExactStr Str ]* Bytes ]*
+chompUtf16 : Bytes, Endi, Str -> Result Bytes [ Expected [ ExactStr Str ]* Bytes ]*
+chompUsvUtf8 : Bytes, Usv -> Result Bytes [ Expected [ ExactUsv Usv ]* Bytes ]*
+chompUsvUtf16 : Bytes, Endi, Usv -> Result Bytes [ Expected [ ExactUsv Usv ]* Bytes ]*
+## If the bytes begin with the given bytes, return whatever bytes come
+## after them.
+chompBytes : Bytes, Bytes -> Result Bytes [ Expected [ ExactStr Str ]* Bytes ]*

-# Little-Endian
-parseU16Le : Bytes -> Result { answer : U16, rest : Bytes } [ Expected [ U16 ]* Bytes ]*
-parseI16Le : Bytes -> Result { answer : I16, rest : Bytes } [ Expected [ I16 ]* Bytes ]*
-parseU32Le : Bytes -> Result { answer : U32, rest : Bytes } [ Expected [ U32 ]* Bytes ]*
-parseI32Le : Bytes -> Result { answer : I32, rest : Bytes } [ Expected [ I32 ]* Bytes ]*
-parseU64Le : Bytes -> Result { answer : U64, rest : Bytes } [ Expected [ U64 ]* Bytes ]*
-parseI64Le : Bytes -> Result { answer : I64, rest : Bytes } [ Expected [ I64 ]* Bytes ]*
-parseU128Le : Bytes -> Result { answer : U128, rest : Bytes } [ Expected [ U128 ]* Bytes ]*
-parseI128Le : Bytes -> Result { answer : I128, rest : Bytes } [ Expected [ I128 ]* Bytes ]*
-
-# Big-Endian
-parseU16Be : Bytes -> Result { answer : U16, rest : Bytes } [ Expected [ U16 ]* Bytes ]*
-parseI16Be : Bytes -> Result { answer : I16, rest : Bytes } [ Expected [ I16 ]* Bytes ]*
-parseU32Be : Bytes -> Result { answer : U32, rest : Bytes } [ Expected [ U32 ]* Bytes ]*
-parseI32Be : Bytes -> Result { answer : I32, rest : Bytes } [ Expected [ I32 ]* Bytes ]*
-parseU64Be : Bytes -> Result { answer : U64, rest : Bytes } [ Expected [ U64 ]* Bytes ]*
-parseI64Be : Bytes -> Result { answer : I64, rest : Bytes } [ Expected [ I64 ]* Bytes ]*
-parseU128Be : Bytes -> Result { answer : U128, rest : Bytes } [ Expected [ U128 ]* Bytes ]*
-parseI128Be : Bytes -> Result { answer : I128, rest : Bytes } [ Expected [ I128 ]* Bytes ]*
+parseU16 : Bytes, Endi -> Result { val : U16, rest : Bytes } [ Expected [ U16 Endi ]* Bytes ]*
+parseI16 : Bytes, Endi -> Result { val : I16, rest : Bytes } [ Expected [ I16 Endi ]* Bytes ]*
+parseU32 : Bytes, Endi -> Result { val : U32, rest : Bytes } [ Expected [ U32 Endi ]* Bytes ]*
+parseI32 : Bytes, Endi -> Result { val : I32, rest : Bytes } [ Expected [ I32 Endi ]* Bytes ]*
+parseU64 : Bytes, Endi -> Result { val : U64, rest : Bytes } [ Expected [ U64 Endi ]* Bytes ]*
+parseI64 : Bytes, Endi -> Result { val : I64, rest : Bytes } [ Expected [ I64 Endi ]* Bytes ]*
+parseU128 : Bytes, Endi -> Result { val : U128, rest : Bytes } [ Expected [ U128 Endi ]* Bytes ]*
+parseI128 : Bytes, Endi -> Result { val : I128, rest : Bytes } [ Expected [ I128 Endi ]* Bytes ]*
--- a/compiler/builtins/docs/Num.roc
+++ b/compiler/builtins/docs/Num.roc
@ -336,7 +336,6 @@ Int size : Num [ @Int size ]
 ## eliminate the performance difference between loud and quiet errors in
 ## the situation where no error occurs.

-
 ## Convert

 ## Return a negative number when given a positive one, and vice versa.
@ -829,3 +828,22 @@ tryRecip : Float a -> Result (Float a) [ DivByZero ]*
 ##
 ## >>> Float.sqrt -4.0
 sqrt : Float a -> [Ok (Float a), InvalidSqrt]*
+
+## Try to convert a [Num] to a [Usv].
+##
+## Although [Usv]s are [U32]s under the hood,
+## not all [U32]s are valid [Usv]s.
+##
+## If you specifically have a [U8], the [Num.ascii]
+## function will convert it directly to a [Usv]
+## with no possibility of failure.
+toUsv : Num * -> Result Usv [ InvalidUsv ]*
+
+## Convert a raw [U8] to a [Usv].
+##
+## Since all [U8] values are valid [Usv]s, this
+## operation cannot fail.
+ascii : U8 -> Usv
+
+## Convert a [Usv] into a [U32].
+fromUsv : Usv -> U32
--- a/compiler/builtins/docs/Str.roc
+++ b/compiler/builtins/docs/Str.roc
@ -101,6 +101,15 @@ interface Str
 ## A [Unicode](https://unicode.org) text value.
 Str : [ @Str ]

+## A [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value).
+##
+## This is a [U32] that has been validated to be in the acceptable range for
+## a USV.
+##
+## You can make one of these using single quote literals - e.g. `'x'` - or
+## convert to and from a raw [Num] using [Num.toUsv] and [Num.fromUsv].
+Usv : [ @Usv U32 ]
+
 ## Convert

 ## Convert a #Float to a decimal string, rounding off to the given number of decimal places.
@ -334,7 +343,7 @@ toUtf16Be : Str -> Bytes

 ## Unicode Scalar Values

-## Besides graphemes, another way to break down strings is into
+## Besides graphemes and bytes, another way to break down strings is into
 ## Unicode Scalar Values.
 ##
 ## USVs are no substitute for graphemes!
@ -350,22 +359,22 @@ toUtf16Be : Str -> Bytes

 ## Walk through the string's [Unicode Scalar Values](http://www.unicode.org/glossary/#unicode_scalar_value)
 ## (USVs) to build up a state.
-## (If you want a `step` function which receives a #Str instead of an #U32, see #Str.walkGraphemes.)
+## (If you want a `step` function which receives a #Str instead of a #Usv, see #Str.walkGraphemes.)
 ##
-## Here are the #U32 values that will be passed to `step` when this function is
+## Here are the #Usv values that will be passed to `step` when this function is
 ## called on various strings:
 ##
 ## * `"👩‍👩‍👦‍👦"` passes 128105, 8205, 128105, 8205, 128102, 8205, 128102
 ## * `"Roc"` passes 82, 111, 99
 ## * `"鹏"` passes 40527
 ## * `"🐦"` passes 128038
-walkUsv : Str, { start: state, step: (state, U32 -> state) } -> state
+walkUsv : Str, { start: state, step: (state, Usv -> state) } -> state

 ## Walk backwards through the string's [Unicode Scalar Values](http://www.unicode.org/glossary/#unicode_scalar_value)
 ## (USVs) to build up a state.
-## (If you want a `step` function which receives a #Str instead of an #U32, see #Str.walkGraphemes.)
+## (If you want a `step` function which receives a #Str instead of a #Usv, see #Str.walkGraphemes.)
 ##
-## Here are the #U32 values that will be passed to `step` when this function is
+## Here are the #Usv values that will be passed to `step` when this function is
 ## called on various strings:
 ##
 ## * `"👩‍👩‍👦‍👦"` passes 128102, 8205, 128102, 8205, 128105, 8205, 128105
@ -373,36 +382,36 @@ walkUsv : Str, { start: state, step: (state, U32 -> state) } -> state
 ## * `"鹏"` passes 40527
 ## * `"🐦"` passes 128038
 ##
-## To convert a #Str into a plain `List U32` of UTF-32 code units, see #Str.toUtf32.
-walkBackwardsUsv : Str, { start: state, step: (state, U32 -> state) } -> state
+## To convert a #Str into a plain `List Usv` of UTF-32 code units, see #Str.toUtf32.
+walkBackwardsUsv : Str, { start: state, step: (state, Usv -> state) } -> state

 # Parsing

 ## Return the first [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)
 ## in the string, along with the rest of the string after that USV.
-parseUsv : Str -> Result { answer : U32, rest : Str } [ StrWasEmpty ]*
+parseUsv : Str -> Result { val : Usv, rest : Str } [ Expected [ ValidUsv ]* Str ]*

 ## Return the first [extended grapheme cluster](http://www.unicode.org/glossary/#extended_grapheme_cluster)
 ## in the string, along with the rest of the string after that grapheme.
-parseGrapheme : Str -> Result { answer : Str, rest : Str } [ Expected [ Grapheme ]* Str ]*
+parseGrapheme : Str -> Result { val : Str, rest : Str } [ Expected [ Grapheme ]* Str ]*

 ## If the first string begins with the second, return whatever comes
 ## after the second.
 chompStr : Str, Str -> Result Str [ Expected [ ExactStr Str ]* Bytes ]*
-chompUsv, U32 -> Result Str [ Expected [ Usv U32 ]* Bytes ]*
+chompUsv : Str, Usv -> Result Str [ Expected [ ExactUsv Usv ]* Bytes ]*

 ## If the string begins with digits which can represent a valid #U8, return
 ## that number along with the rest of the string after the digits.
-parseU8 : Str -> Result { answer : U8, rest : Str } [ Expected [ NumU8 ]* Str ]*
-parseI8 : Str -> Result { answer : I8, rest : Str } [ Expected [ NumI8 ]* Str ]*
-parseU16 : Str -> Result { answer : U16, rest : Str } [ Expected [ NumU16 ]* Str ]*
-parseI16 : Str -> Result { answer : I16, rest : Str } [ Expected [ NumI16 ]* Str ]*
-parseU32 : Str -> Result { answer : U32, rest : Str } [ Expected [ NumU32 ]* Str ]*
-parseI32 : Str -> Result { answer : I32, rest : Str } [ Expected [ NumI32 ]* Str ]*
-parseU64 : Str -> Result { answer : U64, rest : Str } [ Expected [ NumU64 ]* Str ]*
-parseI64 : Str -> Result { answer : I64, rest : Str } [ Expected [ NumI64 ]* Str ]*
-parseU128 : Str -> Result { answer : U128, rest : Str } [ Expected [ NumU128 ]* Str ]*
-parseI128 : Str -> Result { answer : I128, rest : Str } [ Expected [ NumI128 ]* Str ]*
+parseU8 : Str -> Result { val : U8, rest : Str } [ Expected [ NumU8 ]* Str ]*
+parseI8 : Str -> Result { val : I8, rest : Str } [ Expected [ NumI8 ]* Str ]*
+parseU16 : Str -> Result { val : U16, rest : Str } [ Expected [ NumU16 ]* Str ]*
+parseI16 : Str -> Result { val : I16, rest : Str } [ Expected [ NumI16 ]* Str ]*
+parseU32 : Str -> Result { val : U32, rest : Str } [ Expected [ NumU32 ]* Str ]*
+parseI32 : Str -> Result { val : I32, rest : Str } [ Expected [ NumI32 ]* Str ]*
+parseU64 : Str -> Result { val : U64, rest : Str } [ Expected [ NumU64 ]* Str ]*
+parseI64 : Str -> Result { val : I64, rest : Str } [ Expected [ NumI64 ]* Str ]*
+parseU128 : Str -> Result { val : U128, rest : Str } [ Expected [ NumU128 ]* Str ]*
+parseI128 : Str -> Result { val : I128, rest : Str } [ Expected [ NumI128 ]* Str ]*

-parseF64 : Str -> Result { answer : U128, rest : Str } [ Expected [ NumF64 ]* Str ]*
-parseF32 : Str -> Result { answer : I128, rest : Str } [ Expected [ NumF32 ]* Str ]*
+parseF64 : Str -> Result { val : F64, rest : Str } [ Expected [ NumF64 ]* Str ]*
+parseF32 : Str -> Result { val : F32, rest : Str } [ Expected [ NumF32 ]* Str ]*
--- a/packages/parser/src/Bytes/Parser.roc
+++ b/packages/parser/src/Bytes/Parser.roc
@ -28,40 +28,44 @@ Problem :
            [
                NumU8,
                NumI8,
-                NumU16,
-                NumI16,
-                NumU32,
-                NumI32,
-                NumU64,
-                NumI64,
-                NumU128,
-                NumI128,
-                NumF64,
-                NumF32,
-                Usv U32,
+                NumU16 Endi,
+                NumI16 Endi,
+                NumU32 Endi,
+                NumI32 Endi,
+                NumU64 Endi,
+                NumI64 Endi,
+                NumU128 Endi,
+                NumI128 Endi,
+                NumF64 Endi,
+                NumF32 Endi,
                Utf8 Str,
-                Utf16Le Str,
-                Utf16Be Str,
+                Utf16 Str Endi,
+                UsvUtf8,
+                UsvUtf16 Endi,
                GraphemeUtf8,
-                GraphemeUtf16Le,
-                GraphemeUtf16Be,
+                GraphemeUtf16 Endi,
                End,
            ]
            Str
    ]

 keep : Parser a, (a -> Parser b) -> Parser b
-
 skip : Parser *, ({} -> Parser b) -> Parser b

-utf8 : Parser Str
-utf16 : Parser Str
-
 graphemeUtf8 : Parser Str
-graphemeUtf16Le : Parser Str
-graphemeUtf16Be : Parser Str
-
-usv : Parser U32
+graphemeUtf16 : Endi -> Parser Str
+utf8 : Str -> Parser Str
+utf16 : Str, Endi -> Parser Str
+usvUtf8 : Parser U32 # No Endi argument: UTF-8 is byte-oriented, so its byte order is fixed by the encoding itself
+usvUtf16 : Endi -> Parser U32

 u8 : Parser U8
 i8 : Parser I8
+u16 : Endi -> Parser U16
+i16 : Endi -> Parser I16
+u32 : Endi -> Parser U32
+i32 : Endi -> Parser I32
+u64 : Endi -> Parser U64
+i64 : Endi -> Parser I64
+u128 : Endi -> Parser U128
+i128 : Endi -> Parser I128
--- a/packages/parser/src/Str/Parser.roc
+++ b/packages/parser/src/Str/Parser.roc
@ -24,7 +24,7 @@ Parser a :
        @Parser (Str -> Result { answer : a, rest : Str } RawProblem),
    ]

-RawProblem :
+Problem :
    [
        Expected
            [
@ -40,12 +40,14 @@ RawProblem :
                NumI128,
                NumF64,
                NumF32,
-                End
+                ExactStr Str,
+                Grapheme,
+                End,
            ]
+            Str
    ]

 keep : Parser a, (a -> Parser b) -> Parser b
-
 skip : Parser *, ({} -> Parser b) -> Parser b

 symbol : Str -> Parser {}