Merge branch 'main' into module-params

This commit is contained in:
Agus Zubiaga 2024-01-27 09:36:20 -03:00
commit eb68bf943a
No known key found for this signature in database
159 changed files with 5937 additions and 22739 deletions

File diff suppressed because it is too large Load diff

View file

@ -962,6 +962,14 @@ pub fn listIsUnique(
return list.isEmpty() or list.isUnique();
}
/// C-ABI entry point exported as the `clone` list builtin.
///
/// Forwards to `RocList.makeUnique`, passing through the element `alignment`
/// and `element_width` supplied by the caller, and returns its result.
pub fn listClone(
    list: RocList,
    alignment: u32,
    element_width: usize,
) callconv(.C) RocList {
    const unique_list = list.makeUnique(alignment, element_width);
    return unique_list;
}
pub fn listCapacity(
list: RocList,
) callconv(.C) usize {

View file

@ -75,6 +75,7 @@ comptime {
exportListFn(list.listReplaceInPlace, "replace_in_place");
exportListFn(list.listSwap, "swap");
exportListFn(list.listIsUnique, "is_unique");
exportListFn(list.listClone, "clone");
exportListFn(list.listCapacity, "capacity");
exportListFn(list.listAllocationPtr, "allocation_ptr");
exportListFn(list.listReleaseExcessCapacity, "release_excess_capacity");
@ -110,19 +111,6 @@ comptime {
exportNumFn(num.greaterThanU128, "greater_than.u128");
exportNumFn(num.greaterThanOrEqualU128, "greater_than_or_equal.u128");
exportNumFn(num.compareI128, "compare.i128");
exportNumFn(num.compareU128, "compare.u128");
exportNumFn(num.lessThanI128, "less_than.i128");
exportNumFn(num.lessThanOrEqualI128, "less_than_or_equal.i128");
exportNumFn(num.greaterThanI128, "greater_than.i128");
exportNumFn(num.greaterThanOrEqualI128, "greater_than_or_equal.i128");
exportNumFn(num.lessThanU128, "less_than.u128");
exportNumFn(num.lessThanOrEqualU128, "less_than_or_equal.u128");
exportNumFn(num.greaterThanU128, "greater_than.u128");
exportNumFn(num.greaterThanOrEqualU128, "greater_than_or_equal.u128");
inline for (INTEGERS, 0..) |T, i| {
num.exportPow(T, ROC_BUILTINS ++ "." ++ NUM ++ ".pow_int.");
num.exportDivCeil(T, ROC_BUILTINS ++ "." ++ NUM ++ ".div_ceil.");
@ -190,15 +178,12 @@ comptime {
const str = @import("str.zig");
comptime {
exportStrFn(str.init, "init");
exportStrFn(str.strToScalarsC, "to_scalars");
exportStrFn(str.strSplit, "str_split");
exportStrFn(str.countSegments, "count_segments");
exportStrFn(str.countGraphemeClusters, "count_grapheme_clusters");
exportStrFn(str.countUtf8Bytes, "count_utf8_bytes");
exportStrFn(str.isEmpty, "is_empty");
exportStrFn(str.getCapacity, "capacity");
exportStrFn(str.startsWith, "starts_with");
exportStrFn(str.startsWithScalar, "starts_with_scalar");
exportStrFn(str.endsWith, "ends_with");
exportStrFn(str.strConcatC, "concat");
exportStrFn(str.strJoinWithC, "joinWith");
@ -207,8 +192,6 @@ comptime {
exportStrFn(str.substringUnsafe, "substring_unsafe");
exportStrFn(str.getUnsafe, "get_unsafe");
exportStrFn(str.reserve, "reserve");
exportStrFn(str.getScalarUnsafe, "get_scalar_unsafe");
exportStrFn(str.appendScalar, "append_scalar");
exportStrFn(str.strToUtf8C, "to_utf8");
exportStrFn(str.fromUtf8RangeC, "from_utf8_range");
exportStrFn(str.repeat, "repeat");
@ -217,7 +200,6 @@ comptime {
exportStrFn(str.strTrimEnd, "trim_end");
exportStrFn(str.strCloneTo, "clone_to");
exportStrFn(str.withCapacity, "with_capacity");
exportStrFn(str.strGraphemes, "graphemes");
exportStrFn(str.strAllocationPtr, "allocation_ptr");
exportStrFn(str.strReleaseExcessCapacity, "release_excess_capacity");

View file

@ -1,6 +1,5 @@
const utils = @import("utils.zig");
const RocList = @import("list.zig").RocList;
const grapheme = @import("helpers/grapheme.zig");
const UpdateMode = utils.UpdateMode;
const std = @import("std");
const mem = std.mem;
@ -552,242 +551,6 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize {
return string.len();
}
// Str.toScalars
// C-ABI entry point exported as the `to_scalars` string builtin.
// It does no work of its own: it forwards `str` to `strToScalars`,
// forcing that call to be inlined, and returns the resulting list.
pub fn strToScalarsC(str: RocStr) callconv(.C) RocList {
return @call(.always_inline, strToScalars, .{str});
}
/// Decode `string` into a newly allocated `RocList` of `u32` scalar values,
/// one element per UTF-8 sequence. An empty string yields `RocList.empty()`.
fn strToScalars(string: RocStr) callconv(.C) RocList {
    const byte_count = string.len();
    if (byte_count == 0) return RocList.empty();

    // Preallocate one slot per byte (or per byte of capacity for a string
    // that is not a small string). Every scalar consumes at least one byte,
    // so this may overshoot but never requires a second allocation.
    const slot_count = if (string.isSmallStr()) byte_count else string.getCapacity();
    var scalars = RocList.allocate(@alignOf(u32), slot_count, @sizeOf(u32));

    // The early return above already ruled out the empty case, so the
    // element pointer is expected to be present here.
    const out = scalars.elements(u32) orelse unreachable;

    var read_at: usize = 0;
    var written: usize = 0;
    while (read_at < byte_count) : (written += 1) {
        read_at += writeNextScalar(string, read_at, out, written);
    }

    // Only the slots actually written count as list elements.
    scalars.length = written;
    return scalars;
}
// Decodes one scalar value from `non_empty_string`, starting at byte
// `src_index`, stores it at `dest[dest_index]`, and returns how many bytes
// of the string were consumed (1 to 4).
//
// No validation is performed: continuation bytes are read with
// `getUnchecked`, and any lead byte that is not ASCII, `110xxxxx`, or
// `1110xxxx` is treated as the start of a four-byte sequence.
//
// How UTF-8 bytes work:
// https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences
inline fn writeNextScalar(non_empty_string: RocStr, src_index: usize, dest: [*]u32, dest_index: usize) usize {
    const lead = non_empty_string.getUnchecked(src_index);

    // Classify the lead byte: how long the sequence is, and which of the
    // lead byte's low bits carry payload. The fallback is the four-byte form
    // (lead byte expected to look like 11110xxx).
    var seq_len: usize = 4;
    var lead_payload_mask: u8 = 0b0000_0111;
    if (lead <= 127) {
        // ASCII: the byte is the code point.
        seq_len = 1;
        lead_payload_mask = 0b0111_1111;
    } else if (lead >> 5 == 0b0000_0110) {
        // High bits 110: two-byte sequence.
        // Example:
        //     utf-8:   1100 1111   1011 0001
        //     code pt: 0000 0011   1111 0001 (decimal: 1009)
        seq_len = 2;
        lead_payload_mask = 0b0001_1111;
    } else if (lead >> 4 == 0b0000_1110) {
        // High bits 1110: three-byte sequence.
        seq_len = 3;
        lead_payload_mask = 0b0000_1111;
    }

    // Start with the lead byte's payload, then append six payload bits from
    // each continuation byte (dropping its `10` prefix).
    var code_pt: u32 = lead & lead_payload_mask;
    var offset: usize = 1;
    while (offset < seq_len) : (offset += 1) {
        code_pt <<= 6;
        code_pt |= non_empty_string.getUnchecked(src_index + offset) & 0b0011_1111;
    }

    dest[dest_index] = code_pt;
    return seq_len;
}
test "strToScalars: empty string" {
    // An empty input must come back as the empty list.
    const input = RocStr.fromSlice("");
    defer input.decref();

    const want = RocList.empty();

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: One ASCII char" {
    // A single ASCII byte decodes to its own value: 'R' is 82.
    const input = RocStr.fromSlice("R");
    defer input.decref();

    const want_scalars = [_]u32{82};
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: Multiple ASCII chars" {
    // Each ASCII byte becomes one scalar, in order.
    const input = RocStr.fromSlice("Roc!");
    defer input.decref();

    const want_scalars = [_]u32{ 82, 111, 99, 33 };
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: One 2-byte UTF-8 character" {
    // A two-byte sequence collapses into a single scalar (233).
    const input = RocStr.fromSlice("é");
    defer input.decref();

    const want_scalars = [_]u32{233};
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: Multiple 2-byte UTF-8 characters" {
    // One-byte and two-byte sequences mixed in the same string.
    const input = RocStr.fromSlice("Cäfés");
    defer input.decref();

    const want_scalars = [_]u32{ 67, 228, 102, 233, 115 };
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: One 3-byte UTF-8 character" {
    // The input must actually contain the three-byte character whose scalar
    // value is 40527 ("鹏", the first character of the multi-character test
    // that follows). With an empty literal here, `strToScalars` returns the
    // empty list and the comparison against `{40527}` can never succeed.
    const str = RocStr.fromSlice("鹏");
    defer RocStr.decref(str);

    const expected_array = [_]u32{40527};
    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
    defer expected.decref(@sizeOf(u32));

    const actual = strToScalars(str);
    defer actual.decref(@sizeOf(u32));

    try expect(RocList.eql(actual, expected));
}
test "strToScalars: Multiple 3-byte UTF-8 characters" {
    // Four consecutive three-byte sequences, one scalar each.
    const input = RocStr.fromSlice("鹏很有趣");
    defer input.decref();

    const want_scalars = [_]u32{ 40527, 24456, 26377, 36259 };
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: One 4-byte UTF-8 character" {
    // Sample character taken from https://design215.com/toolbox/utf8-4byte-characters.php
    const input = RocStr.fromSlice("𒀀");
    defer input.decref();

    const want_scalars = [_]u32{73728};
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: Multiple 4-byte UTF-8 characters" {
    // Sample characters taken from https://design215.com/toolbox/utf8-4byte-characters.php
    const input = RocStr.fromSlice("𒀀𒀁");
    defer input.decref();

    const want_scalars = [_]u32{ 73728, 73729 };
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
// Str.fromInt
pub fn exportFromInt(comptime T: type, comptime name: []const u8) void {
comptime var f = struct {
@ -1371,125 +1134,6 @@ test "countSegments: overlapping delimiter 2" {
try expectEqual(segments_count, 3);
}
// Str.countGraphemeClusters
/// Count the grapheme clusters in `string`.
///
/// Returns 0 for an empty string. Otherwise walks the code points once,
/// counts every boundary reported by `grapheme.isGraphemeBreak` between two
/// adjacent code points, and adds one for the final cluster. The bytes are
/// assumed to be valid UTF-8 (`Utf8View.init` failure is `unreachable`).
pub fn countGraphemeClusters(string: RocStr) callconv(.C) usize {
    if (string.isEmpty()) return 0;

    const byte_count = string.len();
    const utf8_bytes = string.asU8ptr()[0..byte_count];
    var codepoints = (unicode.Utf8View.init(utf8_bytes) catch unreachable).iterator();

    var boundaries: usize = 0;
    var break_state: ?grapheme.BoundClass = null;
    var previous: ?u21 = null;

    while (codepoints.nextCodepoint()) |current| : (previous = current) {
        if (previous) |prev| {
            if (grapheme.isGraphemeBreak(prev, current, &break_state)) {
                boundaries += 1;
                // The break state is reset whenever a new cluster begins.
                break_state = null;
            }
        }
    }

    // A non-empty string holds one more grapheme than it has boundaries;
    // with no boundaries at all it is a single grapheme.
    return if (byte_count != 0) boundaries + 1 else boundaries;
}
// Str.graphemes
/// Split `roc_str` into a `RocList` of `RocStr`, one element per grapheme
/// cluster, using `grapheme.isGraphemeBreak` to find the boundaries.
///
/// The list is allocated with `countGraphemeClusters(roc_str)` slots. If the
/// allocation exposes no element pointer (the empty-string case), it is
/// returned as is. Each element is built by `initFromSmallStr` or
/// `initFromBigStr`, chosen by `roc_str.isSmallStr()`. When `roc_str` is not
/// a small string its refcount is incremented by the number of elements
/// produced. The bytes are assumed to be valid UTF-8.
pub fn strGraphemes(roc_str: RocStr) callconv(.C) RocList {
    var break_state: ?grapheme.BoundClass = null;
    var opt_last_codepoint: ?u21 = null;
    var index: usize = 0;
    // Byte length of the grapheme currently being accumulated. This has to
    // be a `usize`: a single grapheme may consist of many code points, so its
    // byte length can exceed 255 and would overflow a `u8` accumulator.
    var grapheme_len: usize = 0;

    // NOTE(review): the allocation pointer is shifted right by one before
    // being handed to the init helpers, exactly as before; presumably this is
    // the encoding `initFromBigStr` expects — confirm against that helper.
    const alloc_ptr = @intFromPtr(roc_str.getAllocationPtr()) >> 1;

    const init_fn = if (roc_str.isSmallStr())
        &initFromSmallStr
    else
        &initFromBigStr;

    var result = RocList.allocate(@alignOf(RocStr), countGraphemeClusters(roc_str), @sizeOf(RocStr));
    const graphemes = result.elements(RocStr) orelse return result;
    var slice = roc_str.asSlice();
    var iter = (unicode.Utf8View.init(slice) catch unreachable).iterator();

    while (iter.nextCodepoint()) |cur_codepoint| {
        const cur_codepoint_len = unicode.utf8CodepointSequenceLength(cur_codepoint) catch unreachable;
        if (opt_last_codepoint) |last_codepoint| {
            const did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, &break_state);
            if (did_break) {
                // Emit everything accumulated so far as one grapheme and
                // advance the remaining slice past it.
                graphemes[index] = init_fn(@constCast(slice.ptr), grapheme_len, alloc_ptr);
                slice = slice[grapheme_len..];
                index += 1;
                break_state = null;
                grapheme_len = 0;
            }
        }
        grapheme_len += cur_codepoint_len;
        opt_last_codepoint = cur_codepoint;
    }

    // Append last grapheme: whatever is left of the slice.
    graphemes[index] = init_fn(@constCast(slice.ptr), slice.len, alloc_ptr);

    if (!roc_str.isSmallStr()) {
        // Correct refcount for all of the splits made.
        roc_str.incref(index + 1);
    }
    return result;
}
// Shared helper for the grapheme tests: checks both countGraphemeClusters()
// and strGraphemes() against the same expected segmentation of `input`.
fn graphemesTest(input: []const u8, expected: []const []const u8) !void {
    const subject = RocStr.fromSlice(input);
    defer subject.decref();

    try expectEqual(expected.len, countGraphemeClusters(subject));

    const split = strGraphemes(subject);
    // NOTE(review): the list holds `RocStr` elements but is released with
    // `@sizeOf(u8)`; verify against `RocList.decref` that this is the
    // intended argument.
    defer split.decref(@sizeOf(u8));

    // An empty input produces no elements to compare.
    if (input.len == 0) return;

    const actual = split.elements(RocStr) orelse unreachable;
    var i: usize = 0;
    while (i < expected.len) : (i += 1) {
        try std.testing.expectEqualStrings(expected[i], actual[i].asSlice());
    }
}
test "graphemes: empty string" {
    // No input, no graphemes.
    const expected = [_][]const u8{};
    try graphemesTest("", &expected);
}
test "graphemes: ascii characters" {
    // Every ASCII letter is its own grapheme.
    const expected = [_][]const u8{ "a", "b", "c", "d" };
    try graphemesTest("abcd", &expected);
}
test "graphemes: utf8 characters" {
    // Accented letters mixed with a plain ASCII letter.
    const expected = [_][]const u8{ "ã", "x", "ā" };
    try graphemesTest("ãxā", &expected);
}
test "graphemes: emojis" {
    // Three identical emoji in a row split into three graphemes.
    const expected = [_][]const u8{ "🤔", "🤔", "🤔" };
    try graphemesTest("🤔🤔🤔", &expected);
}
test "graphemes: emojis and ut8 characters" {
    // Emoji alternating with non-ASCII characters.
    const expected = [_][]const u8{ "🤔", "å", "🤔", "¥", "🤔", "ç" };
    try graphemesTest("🤔å🤔¥🤔ç", &expected);
}
test "graphemes: emojis, ut8, and ascii characters" {
    // Emoji, non-ASCII characters and ASCII characters in one string.
    const expected = [_][]const u8{ "6", "🤔", "å", "🤔", "e", "¥", "🤔", "ç", "p", "p" };
    try graphemesTest("6🤔å🤔e¥🤔çpp", &expected);
}
/// C-ABI entry point exported as the `count_utf8_bytes` string builtin.
/// Returns `string.len()`.
pub fn countUtf8Bytes(string: RocStr) callconv(.C) usize {
    const byte_count = string.len();
    return byte_count;
}
@ -1614,44 +1258,6 @@ pub fn repeat(string: RocStr, count: usize) callconv(.C) RocStr {
return ret_string;
}
// Str.startsWithScalar
/// Report whether the first scalar value of `string` equals `prefix`.
/// An empty string never matches.
pub fn startsWithScalar(string: RocStr, prefix: u32) callconv(.C) bool {
    if (string.len() == 0) return false;

    // Decode the leading scalar of the (non-empty) string into a
    // one-element buffer, then compare it with `prefix`.
    var decoded: [1]u32 = undefined;
    _ = writeNextScalar(string, 0, &decoded, 0);

    return decoded[0] == prefix;
}
test "startsWithScalar: empty string" {
    // An empty string has no first scalar, so nothing can match.
    const candidate: u32 = 'x';
    const nothing = RocStr.empty();
    try expect(!startsWithScalar(nothing, candidate));
}
test "startsWithScalar: ascii char" {
    // The first byte of "foobar" is 'f'.
    const candidate: u32 = 'f';
    const subject = RocStr.fromSlice("foobar");
    try expect(startsWithScalar(subject, candidate));
}
test "startsWithScalar: emoji" {
    // A multi-byte leading scalar matches only the string that starts with it.
    const heart: u32 = '💖';
    const with_heart = RocStr.fromSlice("💖foobar");
    const without_heart = RocStr.fromSlice("foobar");

    try expect(startsWithScalar(with_heart, heart));
    try expect(!startsWithScalar(without_heart, heart));
}
test "startsWith: foo starts with fo" {
const foo = RocStr.fromSlice("foo");
const fo = RocStr.fromSlice("fo");
@ -2761,78 +2367,6 @@ test "capacity: big string" {
try expect(data.getCapacity() >= data_bytes.len);
}
/// C-ABI entry point exported as the `append_scalar` string builtin.
///
/// Reallocates `string` to make room for the UTF-8 encoding of `scalar_u32`,
/// writes that encoding directly after the existing bytes, and returns the
/// reallocated string. The scalar is not validated: a value that does not fit
/// in a `u21` or cannot be encoded hits `@intCast` / `unreachable`.
pub fn appendScalar(string: RocStr, scalar_u32: u32) callconv(.C) RocStr {
    const code_point = @as(u21, @intCast(scalar_u32));
    const encoded_len = std.unicode.utf8CodepointSequenceLength(code_point) catch unreachable;

    const old_len = string.len();
    const new_len = old_len + encoded_len;

    var grown = string.reallocate(new_len);
    // Encode straight into the tail that the reallocation made room for.
    const tail = grown.asSliceWithCapacityMut()[old_len..new_len];
    _ = std.unicode.utf8Encode(code_point, tail) catch unreachable;

    return grown;
}
test "appendScalar: small A" {
    // Appending a one-byte scalar to a short string.
    const start_bytes = "hello";
    const want_bytes = "helloA";

    const input = RocStr.init(start_bytes, start_bytes.len);
    const got = appendScalar(input, 'A');
    defer got.decref();

    const want = RocStr.init(want_bytes, want_bytes.len);
    defer want.decref();

    try expect(got.eq(want));
}
test "appendScalar: small 😀" {
    // Appending a four-byte scalar (0x1F600) to a short string.
    const start_bytes = "hello";
    const want_bytes = "hello😀";

    const input = RocStr.init(start_bytes, start_bytes.len);
    const got = appendScalar(input, 0x1F600);
    defer got.decref();

    const want = RocStr.init(want_bytes, want_bytes.len);
    defer want.decref();

    try expect(got.eq(want));
}
test "appendScalar: big A" {
    // Appending a one-byte scalar to a long string.
    const start_bytes = "a string so large that it must be heap-allocated";
    const want_bytes = "a string so large that it must be heap-allocatedA";

    const input = RocStr.init(start_bytes, start_bytes.len);
    const got = appendScalar(input, 'A');
    defer got.decref();

    const want = RocStr.init(want_bytes, want_bytes.len);
    defer want.decref();

    try expect(got.eq(want));
}
test "appendScalar: big 😀" {
    // Appending a four-byte scalar (0x1F600) to a long string.
    const start_bytes = "a string so large that it must be heap-allocated";
    const want_bytes = "a string so large that it must be heap-allocated😀";

    const input = RocStr.init(start_bytes, start_bytes.len);
    const got = appendScalar(input, 0x1F600);
    defer got.decref();

    const want = RocStr.init(want_bytes, want_bytes.len);
    defer want.decref();

    try expect(got.eq(want));
}
pub fn reserve(string: RocStr, spare: usize) callconv(.C) RocStr {
const old_length = string.len();
if (string.getCapacity() >= old_length + spare) {
@ -2849,27 +2383,6 @@ pub fn withCapacity(capacity: usize) callconv(.C) RocStr {
str.setLen(0);
return str;
}
/// C-ABI entry point exported as the `get_scalar_unsafe` string builtin.
///
/// Decodes the UTF-8 sequence that starts at byte `index` of `string` and
/// returns the decoded scalar together with the number of bytes it occupied.
/// Nothing is validated: an invalid lead byte or an invalid sequence hits
/// `unreachable`.
pub fn getScalarUnsafe(string: RocStr, index: usize) callconv(.C) extern struct { bytesParsed: usize, scalar: u32 } {
    const bytes = string.asSlice();

    // The lead byte alone determines how long the sequence is.
    const seq_len: usize = std.unicode.utf8ByteSequenceLength(bytes[index]) catch unreachable;
    const code_point: u21 = std.unicode.utf8Decode(bytes[index .. index + seq_len]) catch unreachable;

    return .{ .bytesParsed = seq_len, .scalar = code_point };
}
test "getScalarUnsafe" {
    // Decoding "A" at index 0 yields the same scalar as std's decoder and
    // consumes exactly one byte.
    const text = "A";
    const input = RocStr.init(text, text.len);
    const want: u32 = try std.unicode.utf8Decode("A");

    const got = getScalarUnsafe(input, 0);

    try expectEqual(got.scalar, want);
    try expectEqual(got.bytesParsed, 1);
}
pub fn strCloneTo(
string: RocStr,
ptr: [*]u8,

View file

@ -435,7 +435,8 @@ repeatHelp = \value, count, accum ->
## ```
reverse : List a -> List a
reverse = \list ->
reverseHelp list 0 (Num.subSaturated (List.len list) 1)
end = List.len list |> Num.subSaturated 1
reverseHelp (List.clone list) 0 end
reverseHelp = \list, left, right ->
if left < right then
@ -443,6 +444,9 @@ reverseHelp = \list, left, right ->
else
list
# Ensures that the list is unique (will re-use if already unique)
clone : List a -> List a
## Join the given lists together into one list.
## ```
## expect List.join [[1], [2, 3], [], [4, 5]] == [1, 2, 3, 4, 5]

View file

@ -48,6 +48,7 @@ interface Num
isLte,
isGt,
isGte,
isApproxEq,
sin,
cos,
tan,
@ -661,6 +662,22 @@ isLte : Num a, Num a -> Bool
## is [defined to be unordered](https://en.wikipedia.org/wiki/NaN#Comparison_with_NaN).)
isGte : Num a, Num a -> Bool
## Returns `Bool.true` if `value` is approximately equal to `refValue`.
##
## Reading the expression with the usual operator precedence, the result is
## `Bool.true` when either of these holds:
##
## - `value <= refValue` and `value >= refValue` (the two numbers compare as equal), or
## - `Num.absDiff value refValue <= atol + rtol * Num.abs refValue`
##
## The optional `rtol` (relative tolerance, default `0.00001`) and `atol`
## (absolute tolerance, default `0.00000001`) fields can be provided to change
## the threshold. The relative tolerance is scaled by the magnitude of
## `refValue` only, so swapping the two arguments can change the result.
##
## If either argument is [*NaN*](Num.isNaN), returns `Bool.false` no matter what. (*NaN*
## is [defined to be unordered](https://en.wikipedia.org/wiki/NaN#Comparison_with_NaN).)
isApproxEq : Frac a, Frac a, { rtol ? Frac a, atol ? Frac a } -> Bool
isApproxEq = \value, refValue, { rtol ? 0.00001, atol ? 0.00000001 } -> value
<= refValue
&& value
>= refValue
|| Num.absDiff value refValue
<= atol
+ rtol
* Num.abs refValue
## Returns `Bool.true` if the number is `0`, and `Bool.false` otherwise.
isZero : Num a -> Bool

View file

@ -1,90 +1,331 @@
## Roc strings are sequences of text values. This module includes functions for combining strings,
## as well as breaking them up into smaller units—most commonly [extended grapheme clusters](http://www.unicode.org/glossary/#extended_grapheme_cluster)
## (referred to in this module's documentation as "graphemes" rather than "characters" for clarity;
## "characters" can mean very different things in different languages).
## Strings represent text. For example, `"Hi!"` is a string.
##
## This module focuses on graphemes (as opposed to, say, Unicode code points or LATIN-1 bytes)
## because graphemes avoid common classes of bugs. Breaking strings up using code points often
## leads to bugs around things like emoji, where multiple code points combine to form a
## single rendered glyph. Graphemes avoid these bugs by treating multi-code-point things like
## emojis as indivisible units.
## This guide starts at a high level and works down to the in-memory representation of strings and their [performance characteristics](#performance). For reasons that will be explained later in this guide, some string operations are in the `Str` module while others (notably [capitalization](#capitalization), [code points](#code-points), [graphemes](#graphemes), and sorting) are in separate packages. There's also a list of recommendations for [when to use code points, graphemes, and UTF-8](#when-to-use).
##
## Because graphemes can have variable length (there's no upper limit on how many code points one
## grapheme can represent), it takes linear time to count the number of graphemes in a string,
## and also linear time to find an individual grapheme within a string by its position (or "index")
## among the string's other graphemes. The only way to get constant-time access to these is in a way
## that can result in bugs if the string contains multi-code-point things like emojis, which is why
## this module does not offer those.
## ## Syntax
##
## The most common way to represent strings is using quotation marks:
##
## ## Working with Unicode strings in Roc
##
## Unicode can represent text values which span multiple languages, symbols, and emoji.
## Here are some valid Roc strings:
## ```
## "Roc!"
## "鹏"
## "🕊"
## "Hello, World!"
## ```
## Every Unicode string is a sequence of [extended grapheme clusters](http://www.unicode.org/glossary/#extended_grapheme_cluster).
## An extended grapheme cluster represents what a person reading a string might
## call a "character" - like "A" or "ö" or "👩‍👩‍👦‍👦".
## Because the term "character" means different things in different areas of
## programming, and "extended grapheme cluster" is a mouthful, in Roc we use the
## term "grapheme" as a shorthand for the more precise "extended grapheme cluster."
##
## You can get the number of graphemes in a string by calling `Str.countGraphemes` on it:
## Using this syntax, the whole string must go on one line. You can write multiline strings using triple quotes:
##
## ```
## Str.countGraphemes "Roc!"
## Str.countGraphemes "折り紙"
## Str.countGraphemes "🕊"
## text =
## """
## In memory, this string will not have any spaces
## at its start. That's because the first line
## starts at the same indentation level as the
## opening quotation mark. Actually, none of these
## lines will be indented.
##
## However, this line will be indented!
## """
## ```
## > The `countGraphemes` function walks through the entire string to get its answer,
## > so if you want to check whether a string is empty, you'll get much better performance
## > by calling `Str.isEmpty myStr` instead of `Str.countGraphemes myStr == 0`.
##
## ### Escape sequences
## In triple-quoted strings, both the opening and closing `"""` must be at the same indentation level. Lines in the string begin at that indentation level; the spaces that indent the multiline string itself are not considered content.
##
## ### Interpolation
##
## *String interpolation* is syntax for inserting a string into another string.
##
## If you put a `\` in a Roc string literal, it begins an *escape sequence*.
## An escape sequence is a convenient way to insert certain strings into other strings.
## For example, suppose you write this Roc string:
## ```
## "I took the one less traveled by,\nAnd that has made all the difference."
## name = "Sam"
##
## "Hi, my name is $(name)!"
## ```
## The `"\n"` in the middle will insert a line break into this string. There are
## other ways of getting a line break in there, but `"\n"` is the most common.
##
## Another way you could insert a newline is by writing `\u(0A)` instead of `\n`.
## That would result in the same string, because the `\u` escape sequence inserts
## [Unicode code points](https://unicode.org/glossary/#code_point) directly into
## the string. The Unicode code point 10 is a newline, and 10 is `0A` in hexadecimal.
## `\u` escape sequences are always followed by a hexadecimal number inside `(` and `)`
## like this.
## This will evaluate to the string `"Hi, my name is Sam!"`
##
## As another example, `"R\u(6F)c"` is the same string as `"Roc"`, because
## `"\u(6F)"` corresponds to the Unicode code point for lowercase `o`. If you
## want to [spice things up a bit](https://en.wikipedia.org/wiki/Metal_umlaut),
## you can write `"R\u(F6)c"` as an alternative way to get the string `"Röc"`.
## You can put any expression you like inside the parentheses, as long as it's all on one line:
##
## Roc strings also support these escape sequences:
##
## * `\\` - an actual backslash (writing a single `\` always begins an escape sequence!)
## * `\"` - an actual quotation mark (writing a `"` without a `\` ends the string)
## * `\r` - [carriage return](https://en.wikipedia.org/wiki/Carriage_Return)
## * `\t` - [horizontal tab](https://en.wikipedia.org/wiki/Tab_key#Tab_characters)
## * `\v` - [vertical tab](https://en.wikipedia.org/wiki/Tab_key#Tab_characters)
##
## You can also use escape sequences to insert named strings into other strings, like so:
## ```
## name = "Lee"
## city = "Roctown"
## greeting = "Hello there, \(name)! Welcome to \(city)."
## colors = ["red", "green", "blue"]
##
## "The colors are $(colors |> Str.joinWith ", ")!"
## ```
## Here, `greeting` will become the string `"Hello there, Lee! Welcome to Roctown."`.
## This is known as [string interpolation](https://en.wikipedia.org/wiki/String_interpolation),
## and you can use it as many times as you like inside a string. The name
## between the parentheses must refer to a `Str` value that is currently in
## scope, and it must be a name - it can't be an arbitrary expression like a function call.
##
## Interpolation can be used in multiline strings, but the part inside the parentheses must still be on one line.
##
## ### Escapes
##
## There are a few special escape sequences in strings:
##
## * `\n` becomes a [newline](https://en.wikipedia.org/wiki/Newline)
## * `\r` becomes a [carriage return](https://en.wikipedia.org/wiki/Carriage_return#Computers)
## * `\t` becomes a [tab](https://en.wikipedia.org/wiki/Tab_key#Tab_characters)
## * `\"` becomes a normal `"` (this lets you write `"` inside a single-line string)
## * `\\` becomes a normal `\` (this lets you write `\` without it being treated as an escape)
## * `\$` becomes a normal `$` (this lets you write `$` followed by `(` without it being treated as [interpolation](#interpolation))
##
## These work in both single-line and multiline strings. We'll also discuss another escape later, for inserting [Unicode code points](#code-points) into a string.
##
## ### Single quote syntax
##
## Try putting `'👩'` into `roc repl`. You should see this:
##
## ```
## » '👩'
##
## 128105 : Int *
## ```
##
## The single-quote `'` syntax lets you represent a Unicode code point (discussed in the next section) in source code, in a way that renders as the actual text it represents rather than as a number literal. This lets you see what it looks like in the source code rather than looking at a number.
##
## At runtime, the single-quoted value will be treated the same as an ordinary number literal—in other words, `'👩'` is syntax sugar for writing `128105`. You can verify this in `roc repl`:
##
## ```
## » '👩' == 128105
##
## Bool.true : Bool
## ```
##
## Double quotes (`"`), on the other hand, are not type-compatible with integers—not only because strings can be empty (`""` is valid, but `''` is not) but also because there may be more than one code point involved in any given string!
##
## There are also some special escape sequences in single-quote strings:
##
## * `\n` becomes a [newline](https://en.wikipedia.org/wiki/Newline)
## * `\r` becomes a [carriage return](https://en.wikipedia.org/wiki/Carriage_return#Computers)
## * `\t` becomes a [tab](https://en.wikipedia.org/wiki/Tab_key#Tab_characters)
## * `\'` becomes a normal `'` (this lets you write `'` inside a single-quote string)
## * `\\` becomes a normal `\` (this lets you write `\` without it being treated as an escape)
##
## Most often this single-quote syntax is used when writing parsers; most Roc programs never use it at all.
##
## ## Unicode
##
## Roc strings represent text using [Unicode](https://unicode.org). This guide will provide only a basic overview of Unicode (the [Unicode glossary](http://www.unicode.org/glossary/) has over 500 entries in it), but it will include the most relevant differences between these concepts:
##
## * Code points
## * Graphemes
## * UTF-8
##
## It will also explain why some operations are included in Roc's builtin [Str](https://www.roc-lang.org/builtins/Str)
## module, and why others are in separate packages like [roc-lang/unicode](https://github.com/roc-lang/unicode).
##
## ### Graphemes
##
## Let's start with the following string:
##
## `"👩‍👩‍👦‍👦"`
##
## Some might call this a "character." After all, in a monospace font, it looks to be about the same width as the letter "A" or the punctuation mark "!"—both of which are commonly called "characters." Unfortunately, the term "character" in programming has changed meanings many times across the years and across programming languages, and today it's become a major source of confusion.
##
## Unicode uses the less ambiguous term [*grapheme*](https://www.unicode.org/glossary/#grapheme), which it defines as a "user-perceived character" (as opposed to one of the several historical ways the term "character" has been used in programming) or, alternatively, "A minimally distinctive unit of writing in the context of a particular writing system."
##
## By Unicode's definition, each of the following is an individual grapheme:
##
## * `a`
## * `鹏`
## * `👩‍👩‍👦‍👦`
##
## Note that although *grapheme* is less ambiguous than *character*, its definition is still open to interpretation. To address this, Unicode has formally specified [text segmentation rules](https://www.unicode.org/reports/tr29/) which define grapheme boundaries in precise technical terms. We won't get into those rules here, but since they can change with new Unicode releases, functions for working with graphemes are in the [roc-lang/unicode](https://github.com/roc-lang/unicode) package rather than in the builtin [`Str`](https://www.roc-lang.org/builtins/Str) module. This allows them to be updated without being blocked on a new release of the Roc language.
##
## ### Code Points
##
## Every Unicode text value can be broken down into [Unicode code points](http://www.unicode.org/glossary/#code_point), which are integers between `0` and `1_114_111` (`0x10FFFF` in hexadecimal) that describe components of the text. In memory, every Roc string is a sequence of these integers stored in a format called UTF-8, which will be discussed [later](#utf8).
##
## The string `"👩‍👩‍👦‍👦"` happens to be made up of these code points:
##
## ```
## [128105, 8205, 128105, 8205, 128102, 8205, 128102]
## ```
##
## From this we can see that:
##
## - One grapheme can be made up of multiple code points. In fact, there is no upper limit on how many code points can go into a single grapheme! (Some programming languages use the term "character" to refer to individual code points; this can be confusing for graphemes like 👩‍👩‍👦‍👦 because it visually looks like "one character" but no single code point can represent it.)
## - Sometimes code points repeat within an individual grapheme. Here, 128105 appears twice, as does 128102, and there's an 8205 in between each of the other code points.
##
## ### Combining Code Points
##
## The reason every other code point in 👩‍👩‍👦‍👦 is 8205 is that code point 8205 joins together other code points. This emoji, known as ["Family: Woman, Woman, Boy, Boy"](https://emojipedia.org/family-woman-woman-boy-boy), is made by combining several emoji using [zero-width joiners](https://emojipedia.org/zero-width-joiner)—which are represented by code point 8205 in memory, and which have no visual representation on their own.
##
## Here are those code points again, this time with comments about what they represent:
##
## ```
## [128105] # "👩"
## [8205] # (joiner)
## [128105] # "👩"
## [8205] # (joiner)
## [128102] # "👦"
## [8205] # (joiner)
## [128102] # "👦"
## ```
##
## One way to read this is "woman emoji joined to woman emoji joined to boy emoji joined to boy emoji." Without the joins, it would be:
##
## ```
## "👩👩👦👦"
## ```
##
## With the joins, however, it is instead:
##
## ```
## "👩‍👩‍👦‍👦"
## ```
##
## Even though 👩‍👩‍👦‍👦 is visually smaller when rendered, it takes up more than one and a half times as much memory as 👩👩👦👦 does! That's because it has all the same code points, plus the zero-width joiners in between them.
##
## ### String equality and normalization
##
## Besides emoji like 👩‍👩‍👦‍👦, another classic example of multiple code points being combined to render as one grapheme has to do with accent marks. Try putting these two strings into `roc repl`:
##
## ```
## "caf\u(e9)"
## "cafe\u(301)"
## ```
##
## The `\u(e9)` syntax is a way of inserting code points into string literals. In this case, it's the same as inserting the hexadecimal number `0xe9` as a code point onto the end of the string `"caf"`. Since Unicode code point `0xe9` happens to be `é`, the string `"caf\u(e9)"` ends up being identical in memory to the string `"café"`.
##
## We can verify this too:
##
## ```
## » "caf\u(e9)" == "café"
##
## Bool.true : Bool
## ```
##
## As it turns out, `"cafe\u(301)"` is another way to represent the same word. The Unicode code point 0x301 represents a ["combining acute accent"](https://unicodeplus.com/U+0301)—which essentially means that it will add an accent mark to whatever came before it. In this case, since `"cafe\u(301)"` has an `e` before the `"\u(301)"`, that `e` ends up with an accent mark on it and becomes `é`.
##
## Although these two strings get rendered identically to one another, they are different in memory because their code points are different! We can also confirm this in `roc repl`:
##
## ```
## » "caf\u(e9)" == "cafe\u(301)"
##
## Bool.false : Bool
## ```
##
## As you can imagine, this can be a source of bugs. Not only are they considered unequal, they also hash differently, meaning `"caf\u(e9)"` and `"cafe\u(301)"` can both be separate entries in the same [`Set`](https://www.roc-lang.org/builtins/Set).
##
## One way to prevent problems like these is to perform [Unicode normalization](https://www.unicode.org/reports/tr15/), a process which converts conceptually equivalent strings (like `"caf\u(e9)"` and `"cafe\u(301)"`) into one canonical in-memory representation. This makes equality checks on them pass, among other benefits.
##
## It would be technically possible for Roc to perform string normalization automatically on every equality check. Unfortunately, although some programs might want to treat `"caf\u(e9)"` and `"cafe\u(301)"` as equivalent, for other programs it might actually be important to be able to tell them apart. If these equality checks always passed, then there would be no way to tell them apart!
##
## As such, normalization must be performed explicitly when desired. Like graphemes, Unicode normalization rules can change with new releases of Unicode. As such, these functions are in separate packages instead of builtins (normalization is planned to be in [roc-lang/unicode](https://github.com/roc-lang/unicode) in the future, but it has not yet been implemented) so that updates to these functions based on new Unicode releases can happen without waiting on new releases of the Roc language.
##
## ### Capitalization
##
## We've already seen two examples of Unicode definitions that can change with new Unicode releases: graphemes and normalization. Another is capitalization; these rules can change with new Unicode releases (most often in the form of additions of new languages, but breaking changes to capitalization rules for existing languages are also possible), and so they are not included in builtin [`Str`](https://www.roc-lang.org/builtins/Str).
##
## This might seem particularly surprising, since capitalization functions are commonly included in standard libraries. However, it turns out that "capitalizing an arbitrary string" is impossible to do correctly without additional information.
##
## For example, what is the capitalized version of this string?
##
## ```
## "i"
## ```
##
## * In English, the correct answer is `"I"`.
## * In Turkish, the correct answer is `"İ"`.
##
## Similarly, the correct lowercased version of the string `"I"` is `"i"` in English and `"ı"` in Turkish.
##
## Turkish is not the only language to use this [dotless i](https://en.wikipedia.org/wiki/Dotless_I), and it's an example of how a function which capitalizes strings cannot give correct answers without the additional information of which language's capitalization rules should be used.
##
## Many languages defer to the operating system's [localization](https://en.wikipedia.org/wiki/Internationalization_and_localization) settings for this information. In that design, calling a program's capitalization function with an input string of `"i"` might give an answer of `"I"` on one machine and `"İ"` on a different machine, even though it was the same program running on both systems. Naturally, this can cause bugs—but more than that, writing tests to prevent bugs like this usually requires extra complexity compared to writing ordinary tests.
##
## In general, Roc programs should give the same answers for the same inputs even when run on different machines. There are exceptions to this (e.g. a program running out of system resources on one machine, while being able to make more progress on a machine that has more resources), but the operating system's language localization is not among them.
##
## For these reasons, capitalization functions are not in [`Str`](https://www.roc-lang.org/builtins/Str). There is a planned `roc-lang` package to handle use cases like capitalization and sorting—sorting can also vary by language as well as by things like country—but implementation work has not yet started on this package.
##
## ### UTF-8
##
## Earlier, we discussed how Unicode code points can be described as [`U32`](https://www.roc-lang.org/builtins/Num#U32) integers. However, many common code points are very low integers, and can fit into a `U8` instead of needing an entire `U32` to represent them in memory. UTF-8 takes advantage of this, using a variable-width encoding to represent code points in 1-4 bytes, which saves a lot of memory in the typical case—especially compared to [UTF-16](https://en.wikipedia.org/wiki/UTF-16), which always uses at least 2 bytes to represent each code point, or [UTF-32](https://en.wikipedia.org/wiki/UTF-32), which always uses the maximum 4 bytes.
##
## This guide won't cover all the details of UTF-8, but the basic idea is this:
##
## - If a code point is 127 or lower, UTF-8 stores it in 1 byte.
## - If it's between 128 and 2047, UTF-8 stores it in 2 bytes.
## - If it's between 2048 and 65535, UTF-8 stores it in 3 bytes.
## - If it's higher than that, UTF-8 stores it in 4 bytes.
##
## The specific [UTF-8 encoding](https://en.wikipedia.org/wiki/UTF-8#Encoding) of these bytes involves using 1 to 5 bits of each byte for metadata about multi-byte sequences.
##
## A valuable feature of UTF-8 is that it is backwards-compatible with the [ASCII](https://en.wikipedia.org/wiki/ASCII) encoding that was widely used for many years. ASCII existed before Unicode did, and only used the integers 0 to 127 to represent its equivalent of code points. The Unicode code points 0 to 127 represent the same semantic information as ASCII (e.g. the number 65 represents the letter "A" in both ASCII and in Unicode), and since UTF-8 represents code points 0 to 127 using one byte, all valid ASCII strings can be successfully parsed as UTF-8 without any need for conversion.
##
## Since many textual computer encodings—including [CSV](https://en.wikipedia.org/wiki/CSV), [XML](https://en.wikipedia.org/wiki/XML), and [JSON](https://en.wikipedia.org/wiki/JSON)—do not use any code points above 127 for their delimiters, it is often possible to write parsers for these formats using only `Str` functions which present UTF-8 as raw `U8` sequences, such as [`Str.walkUtf8`](https://www.roc-lang.org/builtins/Str#walkUtf8) and [`Str.toUtf8`](https://www.roc-lang.org/builtins/Str#toUtf8). In the typical case where these parsers do not need to parse out individual Unicode code points, they can get everything they need from `Str` UTF-8 functions without needing to depend on other packages.
##
## ### When to use code points, graphemes, and UTF-8
##
## Deciding when to use code points, graphemes, and UTF-8 can be nonobvious to say the least!
##
## The way Roc organizes the `Str` module and supporting packages is designed to help answer this question. Every situation is different, but the following rules of thumb are typical:
##
## * Most often, using `Str` values along with helper functions like [`split`](https://www.roc-lang.org/builtins/Str#split), [`joinWith`](https://www.roc-lang.org/builtins/Str#joinWith), and so on, is the best option.
## * If you are specifically implementing a parser, working in UTF-8 bytes is usually the best option. So functions like [`walkUtf8`](https://www.roc-lang.org/builtins/Str#walkUtf8), [`toUtf8`](https://www.roc-lang.org/builtins/Str#toUtf8), and so on. (Note that single-quote literals produce number literals, so ASCII-range literals like `'a'` give an integer literal that works with a UTF-8 `U8`.)
## * If you are implementing a Unicode library like [roc-lang/unicode](https://github.com/roc-lang/unicode), working in terms of code points will be unavoidable. Aside from basic readability considerations like `\u(...)` in string literals, if you have the option to avoid working in terms of code points, it is almost always correct to avoid them.
## * If it seems like a good idea to split a string into "characters" (graphemes), you should definitely stop and reconsider whether this is really the best design. Almost always, doing this is some combination of more error-prone or slower (usually both) than doing something else that does not require taking graphemes into consideration.
##
## For this reason (among others), grapheme functions live in [roc-lang/unicode](https://github.com/roc-lang/unicode) rather than in [`Str`](https://www.roc-lang.org/builtins/Str). They are more niche than they seem, so they should not be reached for all the time!
##
## ## Performance
##
## This section deals with how Roc strings are represented in memory, and their performance characteristics.
##
## A normal heap-allocated roc `Str` is represented on the stack as:
## - A "capacity" unsigned integer, which represents how many bytes are allocated on the heap to hold the string's contents.
## - A "length" unsigned integer, which represents how many of the "capacity" bytes are actually in use. (A `Str` can have more bytes allocated on the heap than are actually in use.)
## - The memory address of the first byte in the string's actual contents.
##
## Each of these three fields is the same size: 64 bits on a 64-bit system, and 32 bits on a 32-bit system. The actual contents of the string are stored in one contiguous sequence of bytes, encoded as UTF-8, often on the heap but sometimes elsewhere—more on this later. Empty strings do not have heap allocations, but an empty `Str` on a 64-bit system still takes up 24 bytes on the stack (due to its three 64-bit fields).
##
## ### Reference counting and opportunistic mutation
##
## Like lists, dictionaries, and sets, Roc strings are automatically reference-counted and can benefit from opportunistic in-place mutation. The reference count is stored on the heap immediately before the first byte of the string's contents, and it has the same size as a memory address. This means it can count so high that it's impossible to write a Roc program which overflows a reference count, because having that many simultaneous references (each of which is a memory address) would have exhausted the operating system's address space first.
##
## When the string's reference count is 1, functions like [`Str.concat`](https://www.roc-lang.org/builtins/Str#concat) and [`Str.replaceEach`](https://www.roc-lang.org/builtins/Str#replaceEach) mutate the string in-place rather than allocating a new string. This preserves semantic immutability because it is unobservable in terms of the operation's output; if the reference count is 1, it means that memory would have otherwise been deallocated immediately anyway, and it's more efficient to reuse it instead of deallocating it and then immediately making a new allocation.
##
## The contents of statically-known strings (today that means string literals) are stored in the readonly section of the binary, so they do not need heap allocations or reference counts. They are not eligible for in-place mutation, since mutating the readonly section of the binary would cause an operating system [access violation](https://en.wikipedia.org/wiki/Segmentation_fault).
##
## ### Small String Optimization
##
## Roc uses a "small string optimization" when representing certain strings in memory.
##
## If you have a sufficiently long string, it will be represented on the stack using 24 bytes on a 64-bit system or 12 bytes on a 32-bit system, plus however many bytes are in the string itself on the heap. However, if a string is shorter than the stack size for its system (so, a string of up to 23 bytes on a 64-bit system, and up to 11 bytes on a 32-bit system), then that string will be stored entirely on the stack rather than having a separate heap allocation at all.
##
## This can be much more memory-efficient! However, `List` does not have this optimization (it has some runtime cost, and in the case of `List` it's not anticipated to come up nearly as often), which means when converting a small string to `List U8` it can result in a heap allocation.
##
## Note that this optimization is based entirely on how many UTF-8 bytes the string takes up in memory. It doesn't matter how many [graphemes](#graphemes), [code points](#code-points) or anything else it has; the only factor that determines whether a particular string is eligible for the small string optimization is the number of UTF-8 bytes it takes up in memory!
##
## ### Seamless Slices
##
## Try putting this into `roc repl`:
##
## ```
## » "foo/bar/baz" |> Str.split "/"
##
## ["foo", "bar", "baz"] : List Str
## ```
##
## All of these strings are small enough that the [small string optimization](#small-string-optimization) will apply, so none of them will be allocated on the heap.
##
## Now let's suppose they were long enough that this optimization no longer applied:
##
## ```
## » "a much, much, much, much/longer/string compared to the last one!" |> Str.split "/"
##
## ["a much, much, much, much", "longer", "string compared to the last one!"] : List Str
## ```
##
## Here, the only strings small enough for the small string optimization are `"/"` and `"longer"`. They will be allocated on the stack.
##
## The first and last strings in the returned list `"a much, much, much, much"` and `"string compared to the last one!"` will not be allocated on the heap either. Instead, they will be *seamless slices*, which means they will share memory with the original input string.
##
## * `"a much, much, much, much"` will share the first 24 bytes of the original string.
## * `"string compared to the last one!"` will share the last 32 bytes of the original string.
##
## All of these strings are semantically immutable, so sharing these bytes is an implementation detail that should only affect performance. By design, there is no way at either compile time or runtime to tell whether a string is a seamless slice. This allows the optimization's behavior to change in the future without affecting Roc programs' semantic behavior.
##
## Seamless slices create additional references to the original string, which make it ineligible for opportunistic mutation (along with the slices themselves; slices are never eligible for mutation), and which also make it take longer before the original string can be deallocated. A case where this might be noticeable in terms of performance would be:
## 1. A function takes a very large string as an argument and returns a much smaller slice into that string.
## 2. The smaller slice is used for a long time in the program, whereas the much larger original string stops being used.
## 3. In this situation, it might have been better for total program memory usage (although not necessarily overall performance) if the original large string could have been deallocated sooner, even at the expense of having to copy the smaller string into a new allocation instead of reusing the bytes with a seamless slice.
##
## If a situation like this comes up, a slice can be turned into a separate string by using [`Str.concat`](https://www.roc-lang.org/builtins/Str#concat) to concatenate the slice onto an empty string (or one created with [`Str.withCapacity`](https://www.roc-lang.org/builtins/Str#withCapacity)).
##
## Currently, the only way to get seamless slices of strings is by calling certain `Str` functions which return them. In general, `Str` functions which accept a string and return a subset of that string tend to do this. [`Str.trim`](https://www.roc-lang.org/builtins/Str#trim) is another example of a function which returns a seamless slice.
interface Str
exposes [
Utf8Problem,
@ -94,9 +335,7 @@ interface Str
joinWith,
split,
repeat,
countGraphemes,
countUtf8Bytes,
startsWithScalar,
toUtf8,
fromUtf8,
fromUtf8Range,
@ -119,7 +358,6 @@ interface Str
toI16,
toU8,
toI8,
toScalars,
replaceEach,
replaceFirst,
replaceLast,
@ -129,12 +367,8 @@ interface Str
walkUtf8WithIndex,
reserve,
releaseExcessCapacity,
appendScalar,
walkScalars,
walkScalarsUntil,
withCapacity,
withPrefix,
graphemes,
contains,
]
imports []
@ -265,8 +499,7 @@ joinWith : List Str, Str -> Str
## Split a string around a separator.
##
## Passing `""` for the separator is not useful;
## it returns the original string wrapped in a [List]. To split a string
## into its individual [graphemes](https://stackoverflow.com/a/27331885/4200103), use `Str.graphemes`
## it returns the original string wrapped in a [List].
## ```
## expect Str.split "1,2,3" "," == ["1","2","3"]
## expect Str.split "1,2,3" "" == ["1,2,3"]
@ -285,78 +518,6 @@ split : Str, Str -> List Str
## ```
repeat : Str, Nat -> Str
## Counts the number of [extended grapheme clusters](http://www.unicode.org/glossary/#extended_grapheme_cluster)
## in the string.
##
## Note that the number of extended grapheme clusters can be different from the number
## of visual glyphs rendered! Consider the following examples:
## ```
## expect Str.countGraphemes "Roc" == 3
## expect Str.countGraphemes "👩‍👩‍👦‍👦" == 4
## expect Str.countGraphemes "🕊" == 1
## ```
## Note that "👩‍👩‍👦‍👦" takes up 4 graphemes (even though visually it appears as a single
## glyph) because under the hood it's represented using an emoji modifier sequence.
## In contrast, "🕊" only takes up 1 grapheme because under the hood it's represented
## using a single Unicode code point.
countGraphemes : Str -> Nat
## Split a string into its constituent graphemes.
##
## This function breaks a string into its individual [graphemes](https://stackoverflow.com/a/27331885/4200103),
## returning them as a list of strings. This is useful for working with text that
## contains complex characters, such as emojis.
##
## Examples:
## ```
## expect Str.graphemes "Roc" == ["R", "o", "c"]
## expect Str.graphemes "नमस्ते" == ["न", "म", "स्", "ते"]
## expect Str.graphemes "👩‍👩‍👦‍👦" == ["👩‍", "👩‍", "👦‍", "👦"]
## ```
##
## Note that the "👩‍👩‍👦‍👦" example consists of 4 grapheme clusters, although it visually
## appears as a single glyph. This is because it uses an emoji modifier sequence.
graphemes : Str -> List Str
## If the string begins with a [Unicode code point](http://www.unicode.org/glossary/#code_point)
## equal to the given [U32], returns [Bool.true]. Otherwise returns [Bool.false].
##
## If the given string is empty, or if the given [U32] is not a valid
## code point, returns [Bool.false].
## ```
## expect Str.startsWithScalar "鹏 means 'roc'" 40527 # "鹏" is Unicode scalar 40527
## expect !Str.startsWithScalar "9" 9 # the Unicode scalar for "9" is 57, not 9
## expect !Str.startsWithScalar "" 40527
## ```
##
## ## Performance Details
##
## This runs slightly faster than [Str.startsWith], so
## if you want to check whether a string begins with something that's representable
## in a single code point, you can use (for example) `Str.startsWithScalar '鹏'`
## instead of `Str.startsWith "鹏"`. ('鹏' evaluates to the [U32] value `40527`.)
## This will not work for graphemes which take up multiple code points, however;
## `Str.startsWithScalar '👩‍👩‍👦‍👦'` would be a compiler error because 👩‍👩‍👦‍👦 takes up
## multiple code points and cannot be represented as a single [U32].
## You'd need to use `Str.startsWithScalar "🕊"` instead.
startsWithScalar : Str, U32 -> Bool
## Returns a [List] of the [Unicode scalar values](https://unicode.org/glossary/#unicode_scalar_value)
## in the given string.
##
## (Roc strings contain only scalar values, not [surrogate code points](https://unicode.org/glossary/#surrogate_code_point),
## so this is equivalent to returning a list of the string's [code points](https://unicode.org/glossary/#code_point).)
## ```
## expect Str.toScalars "Roc" == [82, 111, 99]
## expect Str.toScalars "鹏" == [40527]
## expect Str.toScalars "சி" == [2970, 3007]
## expect Str.toScalars "🐦" == [128038]
## expect Str.toScalars "👩‍👩‍👦‍👦" == [128105, 8205, 128105, 8205, 128102, 8205, 128102]
## expect Str.toScalars "I ♥ Roc" == [73, 32, 9829, 32, 82, 111, 99]
## expect Str.toScalars "" == []
## ```
toScalars : Str -> List U32
## Returns a [List] of the string's [U8] UTF-8 [code units](https://unicode.org/glossary/#code_unit).
## (To split the string into a [List] of smaller [Str] values instead of [U8] values,
## see [Str.split].)
@ -907,80 +1068,6 @@ expect (walkUtf8 "鹏" [] List.append) == [233, 185, 143]
## Note: This will also convert seamless slices to regular strings.
releaseExcessCapacity : Str -> Str
## is UB when the scalar is invalid
appendScalarUnsafe : Str, U32 -> Str
## Append a [U32] Unicode scalar value to the end of the given string.
##
## Returns `Ok` with the extended string when the scalar passes `isValidScalar`,
## and `Err InvalidScalar` otherwise (the string is not modified in that case).
## ```
## expect Str.appendScalar "H" 105 == Ok "Hi"
## expect Str.appendScalar "😢" 0xabcdef == Err InvalidScalar
## ```
appendScalar : Str, U32 -> Result Str [InvalidScalar]
appendScalar = \string, scalar ->
# Validate before delegating: `appendScalarUnsafe` is documented as UB for invalid scalars.
if isValidScalar scalar then
Ok (appendScalarUnsafe string scalar)
else
Err InvalidScalar
## Returns `Bool.true` when `scalar` lies in `0..=0xD7FF` or `0xE000..=0x10FFFF`.
##
## The excluded gap `0xD800..=0xDFFF` is the Unicode surrogate range, and
## anything above `0x10FFFF` is beyond the last Unicode code point.
isValidScalar : U32 -> Bool
isValidScalar = \scalar ->
scalar <= 0xD7FF || (scalar >= 0xE000 && scalar <= 0x10FFFF)
getScalarUnsafe : Str, Nat -> { scalar : U32, bytesParsed : Nat }
## Walks over the Unicode scalar values (as [U32]) of the given [Str], from the
## start of the string to the end, calling `step` with the current state and
## each scalar. Returns the state produced by the last call to `step`, or `init`
## unchanged if the string is empty.
## ```
## f : List U32, U32 -> List U32
## f = \state, scalar -> List.append state scalar
## expect Str.walkScalars "ABC" [] f == [65, 66, 67]
## ```
walkScalars : Str, state, (state, U32 -> state) -> state
walkScalars = \string, init, step ->
# Begin at byte index 0 and stop once the index reaches the UTF-8 byte length.
walkScalarsHelp string init step 0 (Str.countUtf8Bytes string)
## Recursive worker for `walkScalars`.
##
## `index` is the current position in the string and `length` is the bound at
## which iteration stops (the caller passes `0` and `Str.countUtf8Bytes string`).
walkScalarsHelp : Str, state, (state, U32 -> state), Nat, Nat -> state
walkScalarsHelp = \string, state, step, index, length ->
if index < length then
# Presumably decodes the scalar starting at byte `index`; `bytesParsed` is
# how far to advance for the next one (implementation not visible here).
{ scalar, bytesParsed } = getScalarUnsafe string index
newState = step state scalar
walkScalarsHelp string newState step (Num.addWrap index bytesParsed) length
else
# Reached the end of the string: hand back the accumulated state.
state
## Walks over the Unicode scalar values (as [U32]) of the given [Str] and calls
## a function to update state for each, with the option to stop early.
##
## When `step` returns `Continue newState`, the walk proceeds to the next scalar
## with `newState`. When it returns `Break newState`, the walk stops immediately
## and `newState` is the result. If `step` never breaks, the result is the state
## after the last scalar.
## ```
## f : List U32, U32 -> [Break (List U32), Continue (List U32)]
## f = \state, scalar ->
## check = 66
## if scalar == check then
## Break [check]
## else
## Continue (List.append state scalar)
## expect Str.walkScalarsUntil "ABC" [] f == [66]
## expect Str.walkScalarsUntil "AxC" [] f == [65, 120, 67]
## ```
walkScalarsUntil : Str, state, (state, U32 -> [Break state, Continue state]) -> state
walkScalarsUntil = \string, init, step ->
# Begin at byte index 0 and stop once the index reaches the UTF-8 byte length.
walkScalarsUntilHelp string init step 0 (Str.countUtf8Bytes string)
## Recursive worker for `walkScalarsUntil`.
##
## `index` is the current position in the string and `length` is the bound at
## which iteration stops (the caller passes `0` and `Str.countUtf8Bytes string`).
walkScalarsUntilHelp : Str, state, (state, U32 -> [Break state, Continue state]), Nat, Nat -> state
walkScalarsUntilHelp = \string, state, step, index, length ->
if index < length then
# Presumably decodes the scalar starting at byte `index`; `bytesParsed` is
# how far to advance for the next one (implementation not visible here).
{ scalar, bytesParsed } = getScalarUnsafe string index
when step state scalar is
Continue newState ->
walkScalarsUntilHelp string newState step (Num.addWrap index bytesParsed) length
Break newState ->
# Early exit requested by `step`: skip the remaining scalars.
newState
else
# Reached the end of the string without a `Break`.
state
strToNum : Str -> { berrorcode : U8, aresult : Num * }
strToNumHelp : Str -> Result (Num a) [InvalidNumStr]

View file

@ -18,7 +18,7 @@ import Bool exposing [Bool]
## An opaque type with the `EncoderFormatting` and
## `DecoderFormatting` abilities.
Json := { fieldNameMapping : FieldNameMapping }
Json := {}
implements [
EncoderFormatting {
u8: encodeU8,
@ -64,21 +64,11 @@ Json := { fieldNameMapping : FieldNameMapping }
]
## Returns a JSON `Encoder` and `Decoder`
json = @Json { fieldNameMapping: Default }
json = @Json {}
## Returns a JSON `Encoder` and `Decoder` with configuration options
jsonWithOptions = \{ fieldNameMapping ? Default } ->
@Json { fieldNameMapping }
## Mapping between Roc record fields and JSON object names
FieldNameMapping : [
Default, # no transformation
SnakeCase, # snake_case
PascalCase, # PascalCase
KebabCase, # kabab-case
CamelCase, # camelCase
Custom (Str -> Str), # provide a custom formatting
]
jsonWithOptions = \{} ->
@Json {}
# TODO encode as JSON numbers as base 10 decimal digits
# e.g. the REPL `Num.toStr 12e42f64` gives
@ -146,14 +136,6 @@ encodeBool = \b ->
else
List.concat bytes (Str.toUtf8 "false")
# Test encode boolean
expect
input = [Bool.true, Bool.false]
actual = Encode.toBytes input json
expected = Str.toUtf8 "[true,false]"
actual == expected
encodeString = \str ->
Encode.custom \bytes, @Json {} ->
List.concat bytes (encodeStrBytes str)
@ -223,38 +205,10 @@ escapedByteToJson = \b ->
0x09 -> [0x5c, 'r'] # U+0009 Tab
_ -> [b]
expect escapedByteToJson '\n' == ['\\', 'n']
expect escapedByteToJson '\\' == ['\\', '\\']
expect escapedByteToJson '"' == ['\\', '"']
# Test encode small string
expect
input = "G'day"
actual = Encode.toBytes input json
expected = Str.toUtf8 "\"G'day\""
actual == expected
# Test encode large string
expect
input = "the quick brown fox jumps over the lazy dog"
actual = Encode.toBytes input json
expected = Str.toUtf8 "\"the quick brown fox jumps over the lazy dog\""
actual == expected
# Test encode with escapes e.g. "\r" encodes to "\\r"
expect
input = "the quick brown fox jumps over the lazy doga\r\nbc\\\"xz"
actual = Encode.toBytes input json
expected = Str.toUtf8 "\"the quick brown fox jumps over the lazy doga\\r\\nbc\\\\\\\"xz\""
actual == expected
encodeList = \lst, encodeElem ->
Encode.custom \bytes, @Json { fieldNameMapping } ->
Encode.custom \bytes, @Json {} ->
writeList = \{ buffer, elemsLeft }, elem ->
bufferWithElem = appendWith buffer (encodeElem elem) (@Json { fieldNameMapping })
bufferWithElem = appendWith buffer (encodeElem elem) (@Json {})
bufferWithSuffix =
if elemsLeft > 1 then
List.append bufferWithElem (Num.toU8 ',')
@ -268,27 +222,16 @@ encodeList = \lst, encodeElem ->
List.append withList (Num.toU8 ']')
# Test encode list of floats
expect
input : List F64
input = [-1, 0.00001, 1e12, 2.0e-2, 0.0003, 43]
actual = Encode.toBytes input json
expected = Str.toUtf8 "[-1,0.00001,1000000000000,0.02,0.0003,43]"
actual == expected
encodeRecord = \fields ->
Encode.custom \bytes, @Json { fieldNameMapping } ->
Encode.custom \bytes, @Json {} ->
writeRecord = \{ buffer, fieldsLeft }, { key, value } ->
fieldName = toObjectNameUsingMap key fieldNameMapping
fieldName = key
bufferWithKeyValue =
List.append buffer (Num.toU8 '"')
|> List.concat (Str.toUtf8 fieldName)
|> List.append (Num.toU8 '"')
|> List.append (Num.toU8 ':') # Note we need to encode using the json config here
|> appendWith value (@Json { fieldNameMapping })
|> appendWith value (@Json {})
bufferWithSuffix =
if fieldsLeft > 1 then
@ -303,52 +246,11 @@ encodeRecord = \fields ->
List.append bytesWithRecord (Num.toU8 '}')
# Test encode for a record with two strings ignoring whitespace
expect
input = { fruitCount: 2, ownerName: "Farmer Joe" }
encoder = jsonWithOptions { fieldNameMapping: PascalCase }
actual = Encode.toBytes input encoder
expected = Str.toUtf8 "{\"FruitCount\":2,\"OwnerName\":\"Farmer Joe\"}"
actual == expected
# Test encode of record with an array of strings and a boolean field
expect
input = { fruitFlavours: ["Apples", "Bananas", "Pears"], isFresh: Bool.true }
encoder = jsonWithOptions { fieldNameMapping: KebabCase }
actual = Encode.toBytes input encoder
expected = Str.toUtf8 "{\"fruit-flavours\":[\"Apples\",\"Bananas\",\"Pears\"],\"is-fresh\":true}"
actual == expected
# Test encode of record with a string and number field
expect
input = { firstSegment: "ab", secondSegment: 10u8 }
encoder = jsonWithOptions { fieldNameMapping: SnakeCase }
actual = Encode.toBytes input encoder
expected = Str.toUtf8 "{\"first_segment\":\"ab\",\"second_segment\":10}"
actual == expected
# Test encode of record of a record
expect
input = { outer: { inner: "a" }, other: { one: "b", two: 10u8 } }
encoder = jsonWithOptions { fieldNameMapping: Custom toYellingCase }
actual = Encode.toBytes input encoder
expected = Str.toUtf8 "{\"OTHER\":{\"ONE\":\"b\",\"TWO\":10},\"OUTER\":{\"INNER\":\"a\"}}"
actual == expected
toYellingCase = \str ->
Str.graphemes str
|> List.map toUppercase
|> Str.joinWith ""
encodeTuple = \elems ->
Encode.custom \bytes, @Json { fieldNameMapping } ->
Encode.custom \bytes, @Json {} ->
writeTuple = \{ buffer, elemsLeft }, elemEncoder ->
bufferWithElem =
appendWith buffer elemEncoder (@Json { fieldNameMapping })
appendWith buffer elemEncoder (@Json {})
bufferWithSuffix =
if elemsLeft > 1 then
@ -362,20 +264,11 @@ encodeTuple = \elems ->
{ buffer: bytesWithRecord } = List.walk elems { buffer: bytesHead, elemsLeft: List.len elems } writeTuple
List.append bytesWithRecord (Num.toU8 ']')
# Test encode of tuple
expect
input = ("The Answer is", 42)
actual = Encode.toBytes input json
expected = Str.toUtf8 "[\"The Answer is\",42]"
actual == expected
encodeTag = \name, payload ->
Encode.custom \bytes, @Json { fieldNameMapping } ->
Encode.custom \bytes, @Json {} ->
# Idea: encode `A v1 v2` as `{"A": [v1, v2]}`
writePayload = \{ buffer, itemsLeft }, encoder ->
bufferWithValue = appendWith buffer encoder (@Json { fieldNameMapping })
bufferWithValue = appendWith buffer encoder (@Json {})
bufferWithSuffix =
if itemsLeft > 1 then
List.append bufferWithValue (Num.toU8 ',')
@ -397,15 +290,6 @@ encodeTag = \name, payload ->
List.append bytesWithPayload (Num.toU8 ']')
|> List.append (Num.toU8 '}')
# A tag is encoded as `{"Name":[payload...]}`; the tag name itself is left
# untouched by the field name mapping
expect
    tag = TheAnswer "is" 42
    encoded = Encode.toBytes tag (jsonWithOptions { fieldNameMapping: KebabCase })
    encoded == Str.toUtf8 "{\"TheAnswer\":[\"is\",42]}"
decodeU8 = Decode.custom \bytes, @Json {} ->
{ taken, rest } = takeJsonNumber bytes
@ -1309,7 +1193,7 @@ expect
# JSON OBJECTS -----------------------------------------------------------------
decodeRecord = \initialState, stepField, finalizer -> Decode.custom \bytes, @Json { fieldNameMapping } ->
decodeRecord = \initialState, stepField, finalizer -> Decode.custom \bytes, @Json {} ->
# Recursively build up record from object field:value pairs
decodeFields = \recordState, bytesBeforeField ->
@ -1336,8 +1220,7 @@ decodeRecord = \initialState, stepField, finalizer -> Decode.custom \bytes, @Jso
# Decode the json value
{ val: updatedRecord, rest: bytesAfterValue } <-
(
fieldName =
fromObjectNameUsingMap objectName fieldNameMapping
fieldName = objectName
# Retrieve value decoder for the current field
when stepField recordState fieldName is
@ -1350,7 +1233,7 @@ decodeRecord = \initialState, stepField, finalizer -> Decode.custom \bytes, @Jso
Keep valueDecoder ->
# Decode the value using the decoder from the recordState
# Note we need to pass json config options recursively here
Decode.decodeWith valueBytes valueDecoder (@Json { fieldNameMapping })
Decode.decodeWith valueBytes valueDecoder (@Json {})
)
|> tryDecode
@ -1419,327 +1302,3 @@ ObjectState : [
AfterClosingBrace Nat,
InvalidObject,
]
# Decoding a record with PascalCase object names, ignoring whitespace between tokens
expect
    bytes = Str.toUtf8 " {\n\"FruitCount\"\t:2\n, \"OwnerName\": \"Farmer Joe\" } "
    decoded = Decode.fromBytesPartial bytes (jsonWithOptions { fieldNameMapping: PascalCase })
    decoded.result == Ok { fruitCount: 2, ownerName: "Farmer Joe" }

# Decoding a record with a list of strings and a boolean, using kebab-case object names
expect
    bytes = Str.toUtf8 "{\"fruit-flavours\": [\"Apples\",\"Bananas\",\"Pears\"], \"is-fresh\": true }"
    decoded = Decode.fromBytesPartial bytes (jsonWithOptions { fieldNameMapping: KebabCase })
    decoded.result == Ok { fruitFlavours: ["Apples", "Bananas", "Pears"], isFresh: Bool.true }

# Decoding a record with a string and a number field, using snake_case object names
expect
    bytes = Str.toUtf8 "{\"first_segment\":\"ab\",\"second_segment\":10}"
    decoded = Decode.fromBytesPartial bytes (jsonWithOptions { fieldNameMapping: SnakeCase })
    decoded.result == Ok { firstSegment: "ab", secondSegment: 10u8 }

# Decoding nested records: the custom mapping is applied to the inner records' fields too
expect
    bytes = Str.toUtf8 "{\"OUTER\":{\"INNER\":\"a\"},\"OTHER\":{\"ONE\":\"b\",\"TWO\":10}}"
    decoded = Decode.fromBytesPartial bytes (jsonWithOptions { fieldNameMapping: Custom fromYellingCase })
    decoded.result == Ok { outer: { inner: "a" }, other: { one: "b", two: 10u8 } }
# Custom field name mapping used by the tests: lower-cases every grapheme of
# the given name (only what `toLowercase` knows how to convert is changed).
fromYellingCase = \str ->
    graphemes = Str.graphemes str
    hushed = List.map graphemes toLowercase

    Str.joinWith hushed ""

expect fromYellingCase "YELLING" == "yelling"
# Complex example from IETF RFC 8259 (2017), held as the UTF-8 bytes of a JSON
# document written without any whitespace between tokens. Object names are
# PascalCase, and the `/` characters of the URL appear in their escaped form (`\/`).
complexExampleJson = Str.toUtf8 "{\"Image\":{\"Animated\":false,\"Height\":600,\"Ids\":[116,943,234,38793],\"Thumbnail\":{\"Height\":125,\"Url\":\"http:\\/\\/www.example.com\\/image\\/481989943\",\"Width\":100},\"Title\":\"View from 15th Floor\",\"Width\":800}}"
# The Roc record the complex example is expected to decode to (and encode from).
# Fields are listed alphabetically, mirroring the order of the object names in
# the example JSON document.
complexExampleRecord = {
    image: {
        animated: Bool.false,
        height: 600,
        ids: [116, 943, 234, 38793],
        thumbnail: {
            height: 125,
            url: "http://www.example.com/image/481989943",
            width: 100,
        },
        title: "View from 15th Floor",
        width: 800,
    },
}
# Decoding the complex example with PascalCase object names yields the example record
expect
    decoded = Decode.fromBytes complexExampleJson (jsonWithOptions { fieldNameMapping: PascalCase })
    decoded == Ok complexExampleRecord

# Encoding the example record with PascalCase object names yields the example bytes
expect
    encoded = Encode.toBytes complexExampleRecord (jsonWithOptions { fieldNameMapping: PascalCase })
    encoded == complexExampleJson
# Translate a JSON object name into a Roc field name according to the given
# `FieldNameMapping`. `Default` leaves the name unchanged and `Custom` hands it
# to the caller-supplied function.
fromObjectNameUsingMap : Str, FieldNameMapping -> Str
fromObjectNameUsingMap = \objectName, fieldNameMapping ->
    when fieldNameMapping is
        Custom transformation -> objectName |> transformation
        CamelCase -> objectName |> fromCamelCase
        KebabCase -> objectName |> fromKebabCase
        PascalCase -> objectName |> fromPascalCase
        SnakeCase -> objectName |> fromSnakeCase
        Default -> objectName
# Translate a Roc field name into a JSON object name according to the given
# `FieldNameMapping`. `Default` leaves the name unchanged and `Custom` hands it
# to the caller-supplied function.
toObjectNameUsingMap : Str, FieldNameMapping -> Str
toObjectNameUsingMap = \fieldName, fieldNameMapping ->
    when fieldNameMapping is
        Custom transformation -> fieldName |> transformation
        CamelCase -> fieldName |> toCamelCase
        KebabCase -> fieldName |> toKebabCase
        PascalCase -> fieldName |> toPascalCase
        SnakeCase -> fieldName |> toSnakeCase
        Default -> fieldName
# Turn a `snake_case` JSON object name into the matching Roc field name
fromSnakeCase = \objectName ->
    objectName |> snakeToCamel
# Turn a `PascalCase` JSON object name into the matching Roc field name
fromPascalCase = \objectName ->
    objectName |> pascalToCamel
# Turn a `kebab-case` JSON object name into the matching Roc field name
fromKebabCase = \objectName ->
    objectName |> kebabToCamel
# Turn a `camelCase` JSON object name into the matching Roc field name.
# Roc field names are already camelCase, so the name is returned as it is.
fromCamelCase = \objectName ->
    objectName
# Turn a `camelCase` Roc field name into a `snake_case` JSON object name
toSnakeCase = \fieldName ->
    fieldName |> camelToSnake
# Turn a `camelCase` Roc field name into a `PascalCase` JSON object name
toPascalCase = \fieldName ->
    fieldName |> camelToPascal
# Turn a `camelCase` Roc field name into a `kebab-case` JSON object name
toKebabCase = \fieldName ->
    fieldName |> camelToKebeb
# Turn a `camelCase` Roc field name into a `camelCase` JSON object name.
# Roc field names are already camelCase, so the name is returned as it is.
toCamelCase = \fieldName ->
    fieldName
# Convert `snake_case` to `camelCase`: split on "_", keep the first segment as
# it is and upper-case the first grapheme of every later segment.
snakeToCamel : Str -> Str
snakeToCamel = \str ->
    capitalizeUnlessFirst = \segment, index ->
        if index == 0 then segment else uppercaseFirst segment

    Str.split str "_"
    |> List.mapWithIndex capitalizeUnlessFirst
    |> Str.joinWith ""

expect snakeToCamel "snake_case_string" == "snakeCaseString"
# Convert `PascalCase` to `camelCase` by lower-casing the first grapheme.
# An empty string is returned unchanged.
pascalToCamel : Str -> Str
pascalToCamel = \str ->
    graphemes = Str.graphemes str

    when List.first graphemes is
        Ok initial ->
            graphemes
            |> List.set 0 (toLowercase initial)
            |> Str.joinWith ""

        Err _ -> str

expect pascalToCamel "PascalCaseString" == "pascalCaseString"
# Convert `kebab-case` to `camelCase`: split on "-", keep the first segment as
# it is and upper-case the first grapheme of every later segment.
kebabToCamel : Str -> Str
kebabToCamel = \str ->
    parts = Str.split str "-"

    when parts is
        [] -> str
        [head, ..] ->
            capitalizedTail =
                List.dropFirst parts 1
                |> List.map uppercaseFirst

            Str.joinWith (List.prepend capitalizedTail head) ""

expect kebabToCamel "kebab-case-string" == "kebabCaseString"
# Convert `camelCase` to `PascalCase` by upper-casing the first grapheme.
# An empty string is returned unchanged.
camelToPascal : Str -> Str
camelToPascal = \str ->
    graphemes = Str.graphemes str

    when graphemes is
        [] -> str
        [initial, ..] ->
            graphemes
            |> List.dropFirst 1
            |> List.prepend (toUppercase initial)
            |> Str.joinWith ""

expect camelToPascal "someCaseString" == "SomeCaseString"
# Convert `camelCase` to `kebab-case`. The string is split into graphemes,
# `camelToKebabHelp` rewrites them, and the rewritten graphemes are joined back
# into one string.
camelToKebeb : Str -> Str
camelToKebeb = \str ->
    graphemes = Str.graphemes str
    initialState = { taken: List.withCapacity (List.len graphemes), rest: graphemes }
    finalState = camelToKebabHelp initialState

    Str.joinWith finalState.taken ""
# Move every grapheme of `rest` onto the end of `taken`, replacing each
# upper-case grapheme with "-" followed by its lower-case form. The returned
# `rest` is always empty because every grapheme has been consumed.
camelToKebabHelp : { taken : List Str, rest : List Str } -> { taken : List Str, rest : List Str }
camelToKebabHelp = \{ taken, rest } ->
    step = \acc, grapheme ->
        if isUpperCase grapheme then
            List.concat acc ["-", toLowercase grapheme]
        else
            List.append acc grapheme

    { taken: List.walk rest taken step, rest: [] }

expect camelToKebeb "someCaseString" == "some-case-string"
# Convert `camelCase` to `snake_case`. The string is split into graphemes,
# `camelToSnakeHelp` rewrites them, and the rewritten graphemes are joined back
# into one string.
camelToSnake : Str -> Str
camelToSnake = \str ->
    graphemes = Str.graphemes str
    initialState = { taken: List.withCapacity (List.len graphemes), rest: graphemes }
    finalState = camelToSnakeHelp initialState

    Str.joinWith finalState.taken ""
# Consume `rest` one grapheme at a time, appending to `taken`. An upper-case
# grapheme is replaced by "_" followed by its lower-case form; anything else is
# copied over unchanged. Recursion stops once `rest` is empty.
camelToSnakeHelp : { taken : List Str, rest : List Str } -> { taken : List Str, rest : List Str }
camelToSnakeHelp = \{ taken, rest } ->
    when rest is
        [] -> { taken, rest }
        [head, ..] ->
            remaining = List.dropFirst rest 1
            extended =
                if isUpperCase head then
                    List.concat taken ["_", toLowercase head]
                else
                    List.append taken head

            camelToSnakeHelp { taken: extended, rest: remaining }

expect camelToSnake "someCaseString" == "some_case_string"
# Upper-case the first grapheme of a string and leave the rest untouched.
# An empty string is returned unchanged.
uppercaseFirst : Str -> Str
uppercaseFirst = \str ->
    graphemes = Str.graphemes str

    when graphemes is
        [initial, ..] ->
            Str.joinWith (List.set graphemes 0 (toUppercase initial)) ""

        _ -> str
# Upper-case a string that consists of exactly one ASCII letter `a`-`z`.
# Every other input (longer strings, non-letters, non-ASCII text, the empty
# string) is returned unchanged.
toUppercase : Str -> Str
toUppercase = \str ->
    when Str.toUtf8 str is
        [byte] if byte >= 'a' && byte <= 'z' ->
            # In ASCII each upper-case letter sits exactly 32 below its lower-case form
            Str.fromUtf8 [byte - 32]
            |> Result.withDefault str

        _ -> str
# Lower-case a string that consists of exactly one ASCII letter `A`-`Z`.
# Every other input (longer strings, non-letters, non-ASCII text, the empty
# string) is returned unchanged.
toLowercase : Str -> Str
toLowercase = \str ->
    when Str.toUtf8 str is
        [byte] if byte >= 'A' && byte <= 'Z' ->
            # In ASCII each lower-case letter sits exactly 32 above its upper-case form
            Str.fromUtf8 [byte + 32]
            |> Result.withDefault str

        _ -> str
# True only when the string is exactly one ASCII upper-case letter `A`-`Z`
isUpperCase : Str -> Bool
isUpperCase = \str ->
    when Str.toUtf8 str is
        [byte] -> byte >= 'A' && byte <= 'Z'
        _ -> Bool.false

View file

@ -341,13 +341,10 @@ pub const STR_COUNT_SEGMENTS: &str = "roc_builtins.str.count_segments";
// Symbol names for string builtins; every value here carries the `roc_builtins.str.` prefix.
// NOTE(review): "joinWith" and "str_split" do not follow the snake_case pattern of the
// neighbouring values — presumably they have to match the exported symbol names exactly;
// confirm against the export list before normalizing either one.
pub const STR_CONCAT: &str = "roc_builtins.str.concat";
pub const STR_JOIN_WITH: &str = "roc_builtins.str.joinWith";
pub const STR_SPLIT: &str = "roc_builtins.str.str_split";
pub const STR_TO_SCALARS: &str = "roc_builtins.str.to_scalars";
// NOTE(review): the identifier is spelled `GRAPEHEME` (sic) while the symbol string is
// spelled "grapheme"; renaming the constant would touch every user of it.
pub const STR_COUNT_GRAPEHEME_CLUSTERS: &str = "roc_builtins.str.count_grapheme_clusters";
pub const STR_COUNT_UTF8_BYTES: &str = "roc_builtins.str.count_utf8_bytes";
pub const STR_IS_EMPTY: &str = "roc_builtins.str.is_empty";
pub const STR_CAPACITY: &str = "roc_builtins.str.capacity";
pub const STR_STARTS_WITH: &str = "roc_builtins.str.starts_with";
pub const STR_STARTS_WITH_SCALAR: &str = "roc_builtins.str.starts_with_scalar";
pub const STR_ENDS_WITH: &str = "roc_builtins.str.ends_with";
pub const STR_NUMBER_OF_BYTES: &str = "roc_builtins.str.number_of_bytes";
// Built through `int_intrinsic!` rather than a plain string — presumably one symbol per
// integer type; see the macro definition to confirm.
pub const STR_FROM_INT: IntrinsicName = int_intrinsic!("roc_builtins.str.from_int");
@ -365,11 +362,8 @@ pub const STR_TRIM_START: &str = "roc_builtins.str.trim_start";
// More string builtin symbol names, all under the `roc_builtins.str.` prefix.
pub const STR_TRIM_END: &str = "roc_builtins.str.trim_end";
pub const STR_GET_UNSAFE: &str = "roc_builtins.str.get_unsafe";
pub const STR_RESERVE: &str = "roc_builtins.str.reserve";
pub const STR_APPEND_SCALAR: &str = "roc_builtins.str.append_scalar";
pub const STR_GET_SCALAR_UNSAFE: &str = "roc_builtins.str.get_scalar_unsafe";
pub const STR_CLONE_TO: &str = "roc_builtins.str.clone_to";
pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
pub const STR_GRAPHEMES: &str = "roc_builtins.str.graphemes";
pub const STR_ALLOCATION_PTR: &str = "roc_builtins.str.allocation_ptr";
pub const STR_RELEASE_EXCESS_CAPACITY: &str = "roc_builtins.str.release_excess_capacity";
@ -386,6 +380,7 @@ pub const LIST_CONCAT: &str = "roc_builtins.list.concat";
// Symbol names for list builtins, all under the `roc_builtins.list.` prefix.
pub const LIST_REPLACE: &str = "roc_builtins.list.replace";
pub const LIST_REPLACE_IN_PLACE: &str = "roc_builtins.list.replace_in_place";
pub const LIST_IS_UNIQUE: &str = "roc_builtins.list.is_unique";
pub const LIST_CLONE: &str = "roc_builtins.list.clone";
pub const LIST_PREPEND: &str = "roc_builtins.list.prepend";
pub const LIST_APPEND_UNSAFE: &str = "roc_builtins.list.append_unsafe";
pub const LIST_RESERVE: &str = "roc_builtins.list.reserve";