Move things into roc/bytes and roc/unicode

This commit is contained in:
Richard Feldman 2021-05-01 09:40:55 -04:00
parent 9748aa00da
commit 877cc328d2
10 changed files with 209 additions and 177 deletions

View file

@ -0,0 +1,9 @@
package roc/unicode 0.1.0
roc 0.0.0
exposes [ Unicode, Unicode.Scalar, Unicode.CodePoint ]
packages {}
license UPL-1.0
# TODO should we handle Latin1 encoding? Other encodings? Should there be
# an Ascii module, or a separate roc/ascii package? Consider that ASCII is
# 7-bit, so not all U8s are valid ASCII! There's also Extended ASCII to consider.

View file

@ -0,0 +1,41 @@
# Module header for Unicode.CodePoint.
#
# `Bytes` appears in the signatures of parseUtf8, parseUtf16, chompUtf8 and
# chompUtf16 below, so it is imported here the same way Unicode.Scalar
# imports it.
# NOTE(review): `Endi` is also used in those signatures but is not imported
# by either module - confirm which module is meant to provide it.
interface Unicode.CodePoint
    exposes
        [
            CodePoint,
            toU32,
            fromU32,
            parseUtf8,
            parseUtf16,
            chompUtf8,
            chompUtf16
        ]
    imports
        [
            Unicode.CodePoint.Internal as Internal,
            Bytes.{ Bytes }
        ]
## A [Unicode Code Point](http://www.unicode.org/glossary/#code_point)
##
## This is an alias for the `CodePoint` type defined in `Unicode.CodePoint.Internal`.
CodePoint : Internal.CodePoint
## Get the [U32] value of a [CodePoint].
##
## Delegates to `Internal.toU32`.
toU32 : CodePoint -> U32
toU32 = \point ->
    point |> Internal.toU32
## Convert a list of [CodePoint]s to a [Str] by passing their [U32] values
## to `Str.fromCodePoints`.
##
## To convert exactly one [CodePoint] to a [Str], that code point must be
## a valid [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value).
## You can get one of those by calling [Unicode.Scalar.fromCodePoint], and then
## you can call [Unicode.Scalar.toStr] to get a [Str] from it.
toStr : List CodePoint -> Result Str [ BadCodePoint U32 ]*
toStr = \points ->
    points
        |> List.map toU32
        |> Str.fromCodePoints
# NOTE(review): the six declarations below are type annotations only - no
# implementations are visible in this file. The descriptions are read off the
# signatures and should be confirmed once bodies exist.

## Convert a [U32] to a [CodePoint], or return `BadCodePoint` on failure.
fromU32 : U32 -> Result CodePoint [ BadCodePoint ]*
## Presumably reads one UTF-8 encoded [CodePoint] from the front of the given
## bytes, returning it as `val` and the remaining bytes as `rest` - TODO confirm.
parseUtf8 : Bytes -> Result { val : CodePoint, rest : Bytes } [ Expected [ Utf8CodePoint ]* Bytes ]*
## Presumably the UTF-16 counterpart of [parseUtf8], taking the endianness
## to decode with - TODO confirm.
parseUtf16 : Bytes, Endi -> Result { val : CodePoint, rest : Bytes } [ Expected [ Utf16CodePoint Endi ]* Bytes ]*
## Presumably expects the given bytes to begin with the UTF-8 encoding of the
## given [CodePoint] - TODO confirm.
## NOTE(review): the success type is [Str] while the input is Bytes; confirm
## whether the remaining bytes were intended instead.
chompUtf8 : Bytes, CodePoint -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]*
## Presumably the UTF-16 counterpart of [chompUtf8], taking the endianness
## to decode with - TODO confirm.
chompUtf16 : Bytes, CodePoint, Endi -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]*
## Takes a [CodePoint] and returns a Bool.
## NOTE(review): not listed in this module's `exposes` - confirm whether it should be.
isAsciiDigit : CodePoint -> Bool

View file

@ -0,0 +1,21 @@
interface Unicode.CodePoint.Internal
exposes
[
CodePoint,
toU32,
fromU32,
fromU32Unchecked,
]
imports
[]
## This is just here so that both Unicode.Scalar and Unicode.CodePoint can access it.
##
## A `@CodePoint` tag wrapping a [U32].
CodePoint : [ @CodePoint U32 ]
## Wrap a [U32] in a [CodePoint] without performing any validation.
fromU32Unchecked : U32 -> CodePoint
fromU32Unchecked = \raw ->
    @CodePoint raw
## Unwrap a [CodePoint] to get the [U32] it holds.
toU32 : CodePoint -> U32
toU32 = \@CodePoint raw ->
    raw
## Convert a [U32] to a [CodePoint], or return `BadCodePoint` on failure.
## NOTE(review): annotation only - no implementation is visible in this file.
fromU32 : U32 -> Result CodePoint [ BadCodePoint ]*

View file

@ -0,0 +1,45 @@
# Module header for Unicode.Scalar.
#
# Entries in the imports list are comma-separated; the comma after the
# `Internal` import was missing.
interface Unicode.Scalar
    exposes
        [
            Scalar,
            toStr,
            toCodePoint,
            fromCodePoint,
            parseUtf8,
            parseUtf16,
            chompUtf8,
            chompUtf16
        ]
    imports
        [
            Unicode.CodePoint.Internal as Internal,
            Unicode.CodePoint.{ CodePoint },
            Bytes.{ Bytes }
        ]
## A [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)
##
## Represented as a `@Scalar` tag wrapping a [U32].
Scalar : [ @Scalar U32 ]
## Convert a [Scalar] to a [Str] using `Str.fromScalar`.
toStr : Scalar -> Str
toStr = \@Scalar u32 ->
    when Str.fromScalar u32 is
        Ok str -> str
        Err _ ->
            # This will quickly crash if it ever runs, but we're confident
            # this Err branch will never run. That's because it only runs
            # if Str.fromScalar receives an invalid scalar value, and we've
            # already validated this!
            #
            # `u32` is the only value bound in this scope, so it is what gets
            # multiplied here.
            toStr (@Scalar (u32 * 256))
## Convert a [Scalar] to a [CodePoint].
##
## The wrapped [U32] is passed to `Internal.fromU32Unchecked`.
toCodePoint : Scalar -> CodePoint
toCodePoint = \@Scalar raw ->
    raw |> Internal.fromU32Unchecked
# NOTE(review): the six declarations below are type annotations only - no
# implementations are visible in this file. The descriptions are read off the
# signatures and should be confirmed once bodies exist.

## Convert a [CodePoint] to a [Scalar], or return `PointWasSurrogate` on failure.
fromCodePoint : CodePoint -> Result Scalar [ PointWasSurrogate ]*
## Presumably reads one UTF-8 encoded [Scalar] from the front of the given
## bytes, returning it as `val` and the remaining bytes as `rest` - TODO confirm.
## NOTE(review): the error tag is named `Utf8CodePoint` although this returns
## a [Scalar]; confirm that is intended.
parseUtf8 : Bytes -> Result { val : Scalar, rest : Bytes } [ Expected [ Utf8CodePoint ]* Bytes ]*
## Presumably the UTF-16 counterpart of [parseUtf8], taking the endianness
## to decode with - TODO confirm.
## NOTE(review): `Endi` is not in this module's imports list.
parseUtf16 : Bytes, Endi -> Result { val : Scalar, rest : Bytes } [ Expected [ Utf16CodePoint Endi ]* Bytes ]*
## NOTE(review): this signature takes a [CodePoint] and reports
## `ExactCodePoint`, exactly as in Unicode.CodePoint; confirm whether a
## [Scalar] was intended in this module.
chompUtf8 : Bytes, CodePoint -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]*
## NOTE(review): same question as [chompUtf8] - takes a [CodePoint] rather
## than a [Scalar].
chompUtf16 : Bytes, CodePoint, Endi -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]*
## Takes a [CodePoint] and returns a Bool.
## NOTE(review): not listed in this module's `exposes`, and takes a
## [CodePoint] rather than a [Scalar] - confirm both are intended.
isAsciiDigit : CodePoint -> Bool