diff --git a/compiler/builtins/docs/Bytes.roc b/compiler/builtins/docs/Bytes.roc deleted file mode 100644 index e93afe948b..0000000000 --- a/compiler/builtins/docs/Bytes.roc +++ /dev/null @@ -1,81 +0,0 @@ -interface Bytes - exposes - [ - Bytes, - parseUtf8Usv, - parseUtf16Usv, - parseUtf8Grapheme, - parseUtf16Grapheme, - parsePastUtf8, - parsePastUtf16, - parseU16, - parseI16, - parseU32, - parseI32, - parseU64, - parseI64, - parseU128, - parseI128, - ] - imports [] - -# Conversion - -fromList : List U8 -> Bytes - -toList : Bytes -> List U8 - -len : Bytes -> Nat - -isEmpty : Bytes -> Bool - -## The [endianness](https://en.wikipedia.org/wiki/Endianness) of the currently running system. -hostEndi : Endi - -## [Endianness](https://en.wikipedia.org/wiki/Endianness) -## -## Be - Big Endian -## Le - Little Endian -Endi : [ Be, Le ] - -# Access - -splitFirst : Bytes -> Result { first : U8, rest : Bytes } [ NoBytes ]* - -take : Bytes, Nat -> Bytes - -# Building - -appendLe : Bytes, Num * -> Bytes -appendBe : Bytes, Num * -> Bytes -concat : Bytes, Bytes -> Bytes - -# Parsing - -## Parse a [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value) -## (USV) encoded as UTF-8. -## -## To parse an entire UTF-8 string, you can use #Bytes.toUtf8 or #Bytes.parsePastUtf8. -parseUsvUtf8 : Bytes -> Result { val : Usv, rest : Bytes } [ Expected [ Utf8Usv ]* Bytes ]* -parseUsvUtf16 : Bytes, Endi -> Result { val : Usv, rest : Bytes } [ Expected [ Utf16Usv Endi ]* Bytes ]* -parseGraphemeUtf8 : Bytes -> Result { val : Str, rest : Bytes } [ Expected [ Utf8Grapheme ]* Bytes ]* -parseGraphemeUtf16 : Bytes, Endi -> Result { val : Str, rest : Bytes } [ Expected [ Utf16Grapheme Endi ]* Bytes ]* - -## If the bytes begin with the given UTF-8 string, return whatever bytes come -## after it. 
-chompUtf8 : Bytes, Str -> Result Bytes [ Expected [ ExactStr Str ]* Bytes ]* -chompUtf16 : Bytes, Endi, Str -> Result Bytes [ Expected [ ExactStr Str ]* Bytes ]* -chompUsvUtf8 : Usv -> Result Str [ Expected [ ExactUsv Usv ]* Bytes ]* -chompUsvUtf16 : Usv, Endi -> Result Str [ Expected [ ExactUsv Usv ]* Bytes ]* -## If the bytes begin with the given bytes, return whatever bytes come -## after them. -chompBytes : Bytes, Bytes -> Result Bytes [ Expected [ ExactStr Str ]* Bytes ]* - -parseU16 : Bytes, Endi -> Result { val : U16, rest : Bytes } [ Expected [ U16 Endi ]* Bytes ]* -parseI16 : Bytes, Endi -> Result { val : I16, rest : Bytes } [ Expected [ I16 Endi ]* Bytes ]* -parseU32 : Bytes, Endi -> Result { val : U32, rest : Bytes } [ Expected [ U32 Endi ]* Bytes ]* -parseI32 : Bytes, Endi -> Result { val : I32, rest : Bytes } [ Expected [ I32 Endi ]* Bytes ]* -parseU64 : Bytes, Endi -> Result { val : U64, rest : Bytes } [ Expected [ U64 Endi ]* Bytes ]* -parseI64 : Bytes, Endi -> Result { val : I64, rest : Bytes } [ Expected [ I64 Endi ]* Bytes ]* -parseU128 : Bytes, Endi -> Result { val : U128, rest : Bytes } [ Expected [ U128 Endi ]* Bytes ]* -parseI128 : Bytes, Endi -> Result { val : I128, rest : Bytes } [ Expected [ I128 Endi ]* Bytes ]* diff --git a/compiler/builtins/docs/Num.roc b/compiler/builtins/docs/Num.roc index 430bd10cc1..82d2fc3d3e 100644 --- a/compiler/builtins/docs/Num.roc +++ b/compiler/builtins/docs/Num.roc @@ -829,21 +829,12 @@ tryRecip : Float a -> Result (Float a) [ DivByZero ]* ## >>> Float.sqrt -4.0 sqrt : Float a -> [Ok (Float a), InvalidSqrt]* -## Try to convert a [Num] to a [Usv]. -## -## Although [Usv]s are [U32]s under the hood, -## not all [U32]s are valid [Usv]s. -## -## If you specifically have a [U8], the [Num.ascii] -## function will convert it directly to a [Usv] -## with no possibility of failure. -toUsv : Num * -> Result Usv [ InvalidUsv ]* -## Convert a raw [U8] to a [Usv]. 
-## -## Since all [U8] values are valid [Usv]s, this -## operation cannot fail. -ascii : U8 -> Usv +## [Endianness](https://en.wikipedia.org/wiki/Endianness) +Endi : [ Big, Little ] -## Convert a [Usv] into a [U32]. -fromUsv : Usv -> U32 +## The [endianness](https://en.wikipedia.org/wiki/Endianness) of [Num] values on +## the currently running system. +endi : Endi + +toBytes : Num *, Endi -> List U8 diff --git a/compiler/builtins/docs/Str.roc b/compiler/builtins/docs/Str.roc index f596eab552..875f6d5c59 100644 --- a/compiler/builtins/docs/Str.roc +++ b/compiler/builtins/docs/Str.roc @@ -101,15 +101,6 @@ interface Str ## A [Unicode](https://unicode.org) text value. Str : [ @Str ] -## A [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value). -## -## This is a [U32] that has been validated to be in the acceptable range for -## a USV. -## -## You can make one of these using single quote literals - e.g. `'x'` - or -## convert to and from a raw [Num] using [Num.toUsv] and [Num.fromUsv]. -Usv : [ @Usv U32 ] - ## Convert ## Convert a #Float to a decimal string, rounding off to the given number of decimal places. @@ -235,6 +226,9 @@ isCaseInsensitiveEq : Str, Str -> Bool isCaseInsensitiveNeq : Str, Str -> Bool walkGraphemes : Str, { start: state, step: (state, Str -> state) } -> state +walkGraphemesUntil : Str, { start: state, step: (state, Str -> [ Continue state, Done state ]) } -> state +walkGraphemesBackwards : Str, { start: state, step: (state, Str -> state) } -> state +walkGraphemesBackwardsUntil : Str, { start: state, step: (state, Str -> [ Continue state, Done state ]) } -> state ## Returns #True if the string begins with an uppercase letter. ## @@ -307,19 +301,17 @@ isAllLowercase : Str -> Bool ## as well as the end. trim : Str -> Str -fromUtf8 : Bytes -> Result Str [ BadUtf8 ]* +## If the given [U32] is a valid [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value), +## return a [Str] containing only that scalar. 
+fromScalar : U32 -> Result Str [ BadScalar ]* +fromCodePoints : List U32 -> Result Str [ BadCodePoint U32 ]* +fromUtf8 : List U8 -> Result Str [ BadUtf8 ]* +fromUtf16 : List U8, Endi -> Result Str [ BadUtf16 Endi ]* ## Convert from UTF-8, substituting the replacement character ("�") for any ## invalid sequences encountered. -fromUtf8Sub : Bytes -> Str - -fromUtf16Le : Bytes -> Result Str [ BadUtf16Le ]* - -fromUtf16LeSub : Bytes -> Str - -fromUtf16Be : Bytes -> Result Str [ BadUtf16Be ]* - -fromUtf16BeSub : Bytes -> Str +fromUtf8Sub : List U8 -> Str +fromUtf16Sub : List U8, Endi -> Str ## Return a #List of the string's #U8 UTF-8 [code units](https://unicode.org/glossary/#code_unit). ## (To split the string into a #List of smaller #Str values instead of #U8 values, @@ -335,70 +327,21 @@ fromUtf16BeSub : Bytes -> Str ## ## For a more flexible function that walks through each of these #U8 code units ## without creating a #List, see #Str.walkUtf8 and #Str.walkRevUtf8. -toUtf8 : Str -> Bytes - -toUtf16Le : Str -> Bytes - -toUtf16Be : Str -> Bytes - -## Unicode Scalar Values - -## Besides graphemes and bytes, another way to break down strings is into -## Unicode Scalar Values. -## -## USVs are no substitute for graphemes! -## These functions exist to support advanced use cases like those found in -## [roc/unicode](roc/unicode), and using USVs when graphemes would -## be more appropriate can very easily lead to bugs. -## -## For example, `Str.countGraphemes "👩‍👩‍👦‍👦"` returns `1`, -## whereas `Str.toUtf8 "👩‍👩‍👦‍👦"` returns a list with a length of 25, -## `Str.toUtf16 "👩‍👩‍👦‍👦"` returns a list with a length of 11. -## and `Str.toUtf32 "👩‍👩‍👦‍👦"` returns a list with a length of 7. - - -## Walk through the string's [Unicode Scalar Values](http://www.unicode.org/glossary/#unicode_scalar_value) -## (USVs) to build up a state. -## (If you want a `step` function which receives a #Str instead of an #Usv, see #Str.walkGraphemes.) 
-## -## Here are the #Usv values that will be passed to `step` when this function is -## called on various strings: -## -## * `"👩‍👩‍👦‍👦"` passes 128105, 8205, 128105, 8205, 128102, 8205, 128102 -## * `"Roc"` passes 82, 111, 99 -## * `"鹏"` passes 40527 -## * `"🐦"` passes 128038 -walkUsv : Str, { start: state, step: (state, Usv -> state) } -> state - -## Walk backwards through the string's [Unicode Scalar Values](http://www.unicode.org/glossary/#unicode_scalar_value) -## (USVs) to build up a state. -## (If you want a `step` function which receives a #Str instead of an #Usv, see #Str.walkGraphemes.) -## -## Here are the #Usv values that will be passed to `step` when this function is -## called on various strings: -## -## * `"👩‍👩‍👦‍👦"` passes 128102, 8205, 128102, 8205, 128105, 8205, 128105 -## * `"Roc"` passes 99, 111, 82 -## * `"鹏"` passes 40527 -## * `"🐦"` passes 128038 -## -## To convert a #Str into a plain `List Usv` of UTF-32 code units, see #Str.toUtf32. -walkBackwardsUsv : Str, { start: state, step: (state, Usv -> state) } -> state +toUtf8 : Str -> List U8 +toUtf16 : Str, Endi -> List U8 # Parsing -## Return the first [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value) -## in the string, along with the rest of the string after that USV. -parseUsv : Str -> Result { val : Usv, rest : Str } [ Expected [ ValidUsv ]* Str ]* - ## Return the first [extended grapheme cluster](http://www.unicode.org/glossary/#extended_grapheme_cluster) ## in the string, along with the rest of the string after that grapheme. +## +## If the string does not contain a full grapheme, for example because it was +## empty, returns `Err`. parseGrapheme : Str -> Result { val : Str, rest : Str } [ Expected [ Grapheme ]* Str ]* ## If the first string begins with the second, return whatever comes ## after the second. 
-chompStr : Str, Str -> Result Str [ Expected [ ExactStr Str ]* Bytes ]*
-chompUsv : Usv -> Result Str [ Expected [ ExactUsv Usv ]* Bytes ]*
+chomp : Str, Str -> Result Str [ Expected [ ExactStr Str ]* Str ]*
 
 ## If the string begins with digits which can represent a valid #U8, return
 ## that number along with the rest of the string after the digits.
diff --git a/packages/bytes/Package-Config.roc b/packages/bytes/Package-Config.roc
new file mode 100644
index 0000000000..cfd9df8152
--- /dev/null
+++ b/packages/bytes/Package-Config.roc
@@ -0,0 +1,5 @@
+package roc/bytes 0.1.0
+    roc 0.0.0
+    exposes [ Bytes ]
+    packages {}
+    license UPL-1.0
diff --git a/packages/bytes/src/Bytes.roc b/packages/bytes/src/Bytes.roc
new file mode 100644
index 0000000000..3a291e1784
--- /dev/null
+++ b/packages/bytes/src/Bytes.roc
@@ -0,0 +1,65 @@
+interface Bytes
+    exposes
+        [
+            Bytes,
+            splitFirst,
+            take,
+            append,
+            concat,
+            parseGraphemesUtf8,
+            parseGraphemesUtf16,
+            chompUtf8,
+            chompUtf16,
+            chomp,
+            parseU16,
+            parseI16,
+            parseU32,
+            parseI32,
+            parseU64,
+            parseI64,
+            parseU128,
+            parseI128,
+            parseF64,
+            parseF32,
+        ]
+    imports []
+
+Bytes : List U8
+
+# Access
+
+splitFirst : Bytes -> Result { val : U8, rest : Bytes } [ NoBytes ]*
+
+take : Bytes, Nat -> Bytes
+
+# Building
+
+append : Bytes, Endi, Num * -> Bytes
+concat : Bytes, Bytes -> Bytes
+
+# Parsing
+
+## Parse an exact number of UTF-8 [extended grapheme clusters](http://www.unicode.org/glossary/#extended_grapheme_cluster)
+## into a [Str], and return the rest of the bytes after those graphemes.
+parseGraphemesUtf8 : Bytes, Nat -> Result { val : Str, rest : Bytes } [ Expected [ Utf8Grapheme ]* Bytes ]*
+parseGraphemesUtf16 : Bytes, Endi, Nat -> Result { val : Str, rest : Bytes } [ Expected [ Utf16Grapheme Endi ]* Bytes ]*
+
+## If the bytes begin with the given string encoded as UTF-8, return whatever
+## bytes come after. 
+chompUtf8 : Bytes, Str -> Result Bytes [ Expected [ ExactStr Str ]* Bytes ]*
+chompUtf16 : Bytes, Endi, Str -> Result Bytes [ Expected [ ExactStr Str ]* Bytes ]*
+
+## If the bytes begin with the given bytes, return whatever comes after.
+chomp : Bytes, Bytes -> Result Bytes [ Expected [ ExactBytes Bytes ]* Bytes ]*
+
+parseU16 : Bytes, Endi -> Result { val : U16, rest : Bytes } [ Expected [ NumU16 Endi ]* Bytes ]*
+parseI16 : Bytes, Endi -> Result { val : I16, rest : Bytes } [ Expected [ NumI16 Endi ]* Bytes ]*
+parseU32 : Bytes, Endi -> Result { val : U32, rest : Bytes } [ Expected [ NumU32 Endi ]* Bytes ]*
+parseI32 : Bytes, Endi -> Result { val : I32, rest : Bytes } [ Expected [ NumI32 Endi ]* Bytes ]*
+parseU64 : Bytes, Endi -> Result { val : U64, rest : Bytes } [ Expected [ NumU64 Endi ]* Bytes ]*
+parseI64 : Bytes, Endi -> Result { val : I64, rest : Bytes } [ Expected [ NumI64 Endi ]* Bytes ]*
+parseU128 : Bytes, Endi -> Result { val : U128, rest : Bytes } [ Expected [ NumU128 Endi ]* Bytes ]*
+parseI128 : Bytes, Endi -> Result { val : I128, rest : Bytes } [ Expected [ NumI128 Endi ]* Bytes ]*
+
+parseF64 : Bytes, Endi -> Result { val : F64, rest : Bytes } [ Expected [ NumF64 Endi ]* Bytes ]*
+parseF32 : Bytes, Endi -> Result { val : F32, rest : Bytes } [ Expected [ NumF32 Endi ]* Bytes ]*
diff --git a/packages/parser/src/Bytes/Parser.roc b/packages/parser/src/Bytes/Parser.roc
index 958d08bb51..3d7d8c44d2 100644
--- a/packages/parser/src/Bytes/Parser.roc
+++ b/packages/parser/src/Bytes/Parser.roc
@@ -40,8 +40,8 @@ Problem :
     NumF32 Endi,
     Utf8 Str,
     Utf16 Str Endi,
-    UsvUtf8,
-    UsvUtf16 Endi,
+    CodePointUtf8,
+    CodePointUtf16 Endi,
     GraphemeUtf8,
     GraphemeUtf16 Endi,
     End,
@@ -52,12 +52,10 @@ Problem :
 keep : Parser a, (a -> Parser b) -> Parser b
 skip : Parser *, ({} -> Parser b) -> Parser b
 
-graphemeUtf8 : Parser Str
-graphemeUtf16 : Endi -> Parser Str
 utf8 : Str -> Parser Str
 utf16 : Str, Endi -> Parser Str
-usvUtf8 : Parser U32 # UTF-8 defines endianness 
-usvUtf16 : Endi -> Parser U32
+graphemeUtf8 : Parser Str
+graphemeUtf16 : Endi -> Parser Str
 
 u8 : Parser U8
 i8 : Parser I8
diff --git a/packages/unicode/Package-Config.roc b/packages/unicode/Package-Config.roc
new file mode 100644
index 0000000000..5e73e18ca9
--- /dev/null
+++ b/packages/unicode/Package-Config.roc
@@ -0,0 +1,9 @@
+package roc/unicode 0.1.0
+    roc 0.0.0
+    exposes [ Unicode, Unicode.Scalar, Unicode.CodePoint ]
+    packages {}
+    license UPL-1.0
+
+# TODO should we handle Latin1 encoding? Other encodings? Should there be
+# an Ascii module, or a separate roc/ascii package? Consider that ASCII is
+# 7-bit, so not all U8s are valid ASCII! There's also Extended ASCII to consider.
diff --git a/packages/unicode/src/Unicode/CodePoint.roc b/packages/unicode/src/Unicode/CodePoint.roc
new file mode 100644
index 0000000000..a700b0d802
--- /dev/null
+++ b/packages/unicode/src/Unicode/CodePoint.roc
@@ -0,0 +1,41 @@
+interface Unicode.CodePoint
+    exposes
+        [
+            CodePoint,
+            toU32,
+            fromU32,
+            parseUtf8,
+            parseUtf16,
+            chompUtf8,
+            chompUtf16
+        ]
+    imports
+        [
+            Unicode.CodePoint.Internal as Internal
+        ]
+
+## A [Unicode Code Point](http://www.unicode.org/glossary/#code_point)
+CodePoint : Internal.CodePoint
+
+toU32 : CodePoint -> U32
+toU32 = \codePoint -> Internal.toU32 codePoint
+
+## To convert exactly one [CodePoint] to a [Str], that code point must be
+## a valid [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value).
+## You can get one of those by calling [Unicode.Scalar.fromCodePoint], and then
+## you can call [Unicode.Scalar.toStr] to get a [Str] from it. 
+toStr : List CodePoint -> Result Str [ BadCodePoint U32 ]* +toStr = \points -> + u32s = List.map points toU32 + + Str.fromCodePoints u32s + +fromU32 : U32 -> Result CodePoint [ BadCodePoint ]* + +parseUtf8 : Bytes -> Result { val : CodePoint, rest : Bytes } [ Expected [ Utf8CodePoint ]* Bytes ]* +parseUtf16 : Bytes, Endi -> Result { val : CodePoint, rest : Bytes } [ Expected [ Utf16CodePoint Endi ]* Bytes ]* + +chompUtf8 : Bytes, CodePoint -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]* +chompUtf16 : Bytes, CodePoint, Endi -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]* + +isAsciiDigit : CodePoint -> Bool diff --git a/packages/unicode/src/Unicode/CodePoint/Internal.roc b/packages/unicode/src/Unicode/CodePoint/Internal.roc new file mode 100644 index 0000000000..5bac8917c7 --- /dev/null +++ b/packages/unicode/src/Unicode/CodePoint/Internal.roc @@ -0,0 +1,21 @@ +interface Unicode.CodePoint.Internal + exposes + [ + CodePoint, + toU32, + fromU32, + fromU32Unchecked, + ] + imports + [] + +## This is just here so that both Unicode.Scalar and Unicode.CodePoint can access it. 
+CodePoint : [ @CodePoint U32 ]
+
+fromU32Unchecked : U32 -> CodePoint
+fromU32Unchecked = \u32 -> @CodePoint u32
+
+toU32 : CodePoint -> U32
+toU32 = \@CodePoint u32 -> u32
+
+fromU32 : U32 -> Result CodePoint [ BadCodePoint ]*
diff --git a/packages/unicode/src/Unicode/Scalar.roc b/packages/unicode/src/Unicode/Scalar.roc
new file mode 100644
index 0000000000..18bba5854a
--- /dev/null
+++ b/packages/unicode/src/Unicode/Scalar.roc
@@ -0,0 +1,45 @@
+interface Unicode.Scalar
+    exposes
+        [
+            Scalar,
+            toStr,
+            toCodePoint,
+            fromCodePoint,
+            parseUtf8,
+            parseUtf16,
+            chompUtf8,
+            chompUtf16
+        ]
+    imports
+        [
+            Unicode.CodePoint.Internal as Internal,
+            Unicode.CodePoint.{ CodePoint },
+            Bytes.{ Bytes }
+        ]
+
+## A [Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)
Scalar : [ @Scalar U32 ]
+
+toStr : Scalar -> Str
+toStr = \@Scalar u32 ->
+    when Str.fromScalar u32 is
+        Ok str -> str
+        Err _ ->
+            # This will quickly crash if it ever runs, but we're confident
+            # this Err branch will never run. That's because it only runs
+            # if Str.fromScalar receives an invalid scalar value, and we've
+            # already validated this!
+            toStr (@Scalar (u32 * 256))
+
+toCodePoint : Scalar -> CodePoint
+toCodePoint = \@Scalar u32 -> Internal.fromU32Unchecked u32
+
+fromCodePoint : CodePoint -> Result Scalar [ PointWasSurrogate ]*
+
+parseUtf8 : Bytes -> Result { val : Scalar, rest : Bytes } [ Expected [ Utf8CodePoint ]* Bytes ]*
+parseUtf16 : Bytes, Endi -> Result { val : Scalar, rest : Bytes } [ Expected [ Utf16CodePoint Endi ]* Bytes ]*
+
+chompUtf8 : Bytes, CodePoint -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]*
+chompUtf16 : Bytes, CodePoint, Endi -> Result Str [ Expected [ ExactCodePoint CodePoint ]* Bytes ]*
+
+isAsciiDigit : CodePoint -> Bool