Move things into roc/bytes and roc/unicode

2025-09-28 14:24:45 +00:00 · 2021-05-01 09:40:55 -04:00 · 2021-05-01 09:40:55 -04:00 · 877cc328d2
commit 877cc328d2
parent 9748aa00da
10 changed files with 209 additions and 177 deletions
--- a/packages/unicode/Package-Config.roc
+++ b/packages/unicode/Package-Config.roc
@ -0,0 +1,9 @@
+package roc/unicode 0.1.0
+    roc 0.0.0
+    exposes [ Unicode, Unicode.Scalar, Unicode.CodePoint ]
+    packages {}
+    license UPL-1.0
+
+# TODO should we handle Latin1 encoding? Other encodings? Should there be
+# an Ascii module, or a separate roc/ascii package? Consider that ASCII is
+# 7-bit, so not all U8s are valid ASCII! There's also Extended ASCII to consider.
--- a/packages/unicode/src/Unicode/CodePoint.roc
+++ b/packages/unicode/src/Unicode/CodePoint.roc
@ -0,0 +1,41 @@
+interface Unicode.CodePoint
+    exposes
+        [
+            CodePoint,
+            toU32,
+            fromU32,
+            parseUtf8,
+            parseUtf16,
+            chompUtf8,
+            chompUtf16
+        ]
+    imports
+        [
+            Unicode.CodePoint.Internal as Internal
+        ]
+
+## A [Unicode Code Point](http://www.unicode.org/glossary/#code_point). This is an alias of `Internal.CodePoint`.
+CodePoint : Internal.CodePoint
+
+toU32 : CodePoint -> U32
+toU32 = \point -> point |> Internal.toU32
+
+## Builds a [Str] from a list of [CodePoint]s by passing their U32 values to
+## `Str.fromCodePoints`. To convert exactly one code point, it must be a valid
+## [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value):
+## get one from [Unicode.Scalar.fromCodePoint], then call [Unicode.Scalar.toStr].
+toStr : List CodePoint -> Result Str [ BadCodePoint U32 ]*
+toStr = \points ->
+    points
+        |> List.map toU32
+        |> Str.fromCodePoints
+
+fromU32 : U32 -> Result CodePoint [ BadCodePoint ]*
+
+parseUtf8 : Bytes -> Result { val : CodePoint, rest : Bytes } [ Expected [ Utf8CodePoint ]* Bytes ]*
+parseUtf16 : Bytes, Endi -> Result { val : CodePoint, rest : Bytes } [ Expected [ Utf16CodePoint Endi ]* Bytes ]*
+
+chompUtf8 : Bytes, CodePoint -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]*
+chompUtf16 : Bytes, CodePoint, Endi -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]*
+
+isAsciiDigit : CodePoint -> Bool
--- a/packages/unicode/src/Unicode/CodePoint/Internal.roc
+++ b/packages/unicode/src/Unicode/CodePoint/Internal.roc
@ -0,0 +1,21 @@
+interface Unicode.CodePoint.Internal
+    exposes
+        [
+            CodePoint,
+            toU32,
+            fromU32,
+            fromU32Unchecked,
+        ]
+    imports
+        []
+
+## A U32 wrapped in the `@CodePoint` tag. It is defined in this Internal module so that both Unicode.Scalar and Unicode.CodePoint can access it.
+CodePoint : [ @CodePoint U32 ]
+
+fromU32Unchecked : U32 -> CodePoint
+fromU32Unchecked = \raw -> @CodePoint raw
+
+toU32 : CodePoint -> U32
+toU32 = \@CodePoint raw -> raw
+
+fromU32 : U32 -> Result CodePoint [ BadCodePoint ]*
--- a/packages/unicode/src/Unicode/Scalar.roc
+++ b/packages/unicode/src/Unicode/Scalar.roc
@ -0,0 +1,45 @@
+interface Unicode.Scalar
+    exposes
+        [
+            Scalar,
+            toStr,
+            toCodePoint,
+            fromCodePoint,
+            parseUtf8,
+            parseUtf16,
+            chompUtf8,
+            chompUtf16
+        ]
+    imports
+        [
+            Unicode.CodePoint.Internal as Internal,
+            Unicode.CodePoint.{ CodePoint },
+            Bytes.{ Bytes }
+        ]
+
+## A [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value), stored as a U32 inside the `@Scalar` tag.
+Scalar : [ @Scalar U32 ]
+
+toStr : Scalar -> Str
+toStr = \@Scalar u32 ->
+    when Str.fromScalar u32 is
+        Ok str -> str
+        Err _ ->
+            # This will quickly crash if it ever runs, but we're confident
+            # this Err branch will never run. That's because it only runs
+            # if Str.fromScalar receives an invalid scalar value, and we've
+            # already validated this!
+            toStr (@Scalar (u32 * 256))
+
+toCodePoint : Scalar -> CodePoint
+toCodePoint = \@Scalar raw -> raw |> Internal.fromU32Unchecked
+
+fromCodePoint : CodePoint -> Result Scalar [ PointWasSurrogate ]*
+
+parseUtf8 : Bytes -> Result { val : Scalar, rest : Bytes } [ Expected [ Utf8CodePoint ]* Bytes ]*
+parseUtf16 : Bytes, Endi -> Result { val : Scalar, rest : Bytes } [ Expected [ Utf16CodePoint Endi ]* Bytes ]*
+
+chompUtf8 : Bytes, CodePoint -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]*
+chompUtf16 : Bytes, CodePoint, Endi -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]*
+
+isAsciiDigit : CodePoint -> Bool