with_ascii_lowercased zig builtin

2025-08-04 12:18:19 +00:00 · 2025-01-08 21:22:33 +01:00 · 2025-01-08 21:22:33 +01:00 · 8403f1ea19
commit 8403f1ea19
parent 809fe23afd
36 changed files with 303 additions and 163 deletions
--- a/crates/compiler/builtins/bitcode/src/main.zig
+++ b/crates/compiler/builtins/bitcode/src/main.zig
@ -211,6 +211,7 @@ comptime {
    exportStrFn(str.withCapacityC, "with_capacity");
    exportStrFn(str.strAllocationPtr, "allocation_ptr");
    exportStrFn(str.strReleaseExcessCapacity, "release_excess_capacity");
+    exportStrFn(str.strWithAsciiLowercased, "with_ascii_lowercased");

    for (INTEGERS) |T| {
        str.exportFromInt(T, ROC_BUILTINS ++ "." ++ STR ++ ".from_int.");
--- a/crates/compiler/builtins/bitcode/src/str.zig
+++ b/crates/compiler/builtins/bitcode/src/str.zig
@ -2,6 +2,7 @@ const utils = @import("utils.zig");
 const RocList = @import("list.zig").RocList;
 const UpdateMode = utils.UpdateMode;
 const std = @import("std");
+const ascii = std.ascii;
 const mem = std.mem;
 const unicode = std.unicode;
 const testing = std.testing;
@ -374,7 +375,12 @@ pub const RocStr = extern struct {
            return 1;
        }

-        const ptr: [*]usize = @as([*]usize, @ptrCast(@alignCast(self.bytes)));
+        const data_ptr = if (self.isSeamlessSlice())
+            self.getAllocationPtr()
+        else
+            self.bytes;
+
+        const ptr: [*]usize = @as([*]usize, @ptrCast(@alignCast(data_ptr)));
        return (ptr - 1)[0];
    }

@ -611,16 +617,6 @@ fn initFromSmallStr(slice_bytes: [*]u8, len: usize, _: usize) RocStr {
    return RocStr.init(slice_bytes, len);
 }

-// The alloc_ptr must already be shifted to be ready for storing in a seamless slice.
-fn initFromBigStr(slice_bytes: [*]u8, len: usize, alloc_ptr: usize) RocStr {
-    // Here we can make seamless slices instead of copying to a new small str.
-    return RocStr{
-        .bytes = slice_bytes,
-        .length = len | SEAMLESS_SLICE_BIT,
-        .capacity_or_alloc_ptr = alloc_ptr,
-    };
-}
-
 fn strSplitOnHelp(array: [*]RocStr, string: RocStr, delimiter: RocStr) void {
    if (delimiter.len() == 0) {
        string.incref(1);
@ -1968,6 +1964,66 @@ fn countTrailingWhitespaceBytes(string: RocStr) usize {
    return byte_count;
 }

+// Str.with_ascii_lowercased
+//
+// Returns `string` with every ASCII uppercase byte replaced by its lowercase
+// counterpart; all other bytes (including non-ASCII UTF-8 bytes) are kept as-is.
+//
+// Consumes the caller's reference to `string`: a unique string is rewritten in
+// place and handed back, otherwise its bytes are copied into a fresh RocStr and
+// the reference to the shared original is released.
+pub fn strWithAsciiLowercased(string: RocStr) callconv(.C) RocStr {
+    var new_str = if (string.isUnique())
+        string
+    else blk: {
+        // Copy while we still hold our reference and only then release it, so
+        // the source bytes are never read after the reference was given up.
+        const copy = RocStr.fromSlice(string.asSlice());
+        string.decref();
+        break :blk copy;
+    };
+
+    // Take the length from the string we own from here on, not from the
+    // (possibly released) input.
+    const new_str_bytes = new_str.asU8ptrMut()[0..new_str.len()];
+    for (new_str_bytes) |*c| {
+        c.* = ascii.toLower(c.*);
+    }
+    return new_str;
+}
+
+test "withAsciiLowercased: small str" {
+    // The input must use the small-string representation for this case.
+    const input = RocStr.fromSlice("cOFFÉ");
+    try expect(input.isSmallStr());
+
+    // Only the ASCII letters change case; "É" stays untouched.
+    const want = RocStr.fromSlice("coffÉ");
+    defer want.decref();
+
+    const lowered = strWithAsciiLowercased(input);
+    defer lowered.decref();
+
+    try expect(lowered.isSmallStr());
+    try expect(lowered.eq(want));
+}
+
+test "withAsciiLowercased: non small str" {
+    const input = RocStr.fromSlice("cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ");
+    // strWithAsciiLowercased returns its argument itself when it is unique
+    // (presumably the case for this freshly built string), so this single
+    // decref also releases the result below.
+    defer input.decref();
+    try expect(!input.isSmallStr());
+
+    const want = RocStr.fromSlice("coffÉ coffÉ coffÉ coffÉ coffÉ coffÉ");
+    defer want.decref();
+
+    const lowered = strWithAsciiLowercased(input);
+
+    try expect(!lowered.isSmallStr());
+    try expect(lowered.eq(want));
+}
+
+test "withAsciiLowercased: seamless slice" {
+    // Slice off the first byte of a big string to obtain a seamless slice.
+    const backing = RocStr.fromSlice("cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ");
+    const input = substringUnsafeC(backing, 1, backing.len() - 1);
+    // strWithAsciiLowercased returns its argument itself when it is unique
+    // (presumably the case here), so this decref also releases the result.
+    defer input.decref();
+
+    try expect(input.isSeamlessSlice());
+
+    const want = RocStr.fromSlice("offÉ coffÉ coffÉ coffÉ coffÉ coffÉ");
+    defer want.decref();
+
+    const lowered = strWithAsciiLowercased(input);
+
+    try expect(!lowered.isSmallStr());
+    try expect(lowered.eq(want));
+}
+
 fn rcNone(_: ?[*]u8) callconv(.C) void {}

 fn decStr(ptr: ?[*]u8) callconv(.C) void {
--- a/crates/compiler/builtins/roc/Str.roc
+++ b/crates/compiler/builtins/roc/Str.roc
@ -369,6 +369,7 @@ module [
    contains,
    drop_prefix,
    drop_suffix,
+    with_ascii_lowercased,
 ]

 import Bool exposing [Bool]
@ -1092,3 +1093,18 @@ drop_suffix = |haystack, suffix|
        substring_unsafe(haystack, start, len)
    else
        haystack
+
+## Returns a version of the string with all [ASCII characters](https://en.wikipedia.org/wiki/ASCII) lowercased. Non-ASCII characters are left unmodified. For example:
+##
+## ```roc
+## expect "CAFÉ".with_ascii_lowercased() == "cafÉ"
+## ```
+##
+## This function is useful for things like [command-line options](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option) and [environment variables](https://en.wikipedia.org/wiki/Environment_variable), where you know in advance that you're dealing with a hardcoded string containing only ASCII characters. It has better performance than lowercasing operations which take Unicode into account.
+##
+## That said, strings received from user input can always contain non-ASCII Unicode characters, and lowercasing [Unicode](https://unicode.org) works differently in different languages. For example, the string `"I"` lowercases to `"i"` in English and to `"ı"` (a [dotless i](https://en.wikipedia.org/wiki/Dotless_I)) in Turkish. These rules can also change in each [Unicode release](https://www.unicode.org/releases/), so we have a separate [`unicode` package](https://github.com/roc-lang/unicode) for Unicode capitalization that can be upgraded independently from the language's builtins.
+##
+## To do a case-insensitive comparison of the ASCII characters in a string, use [`caseless_ascii_equals`](#caseless_ascii_equals).
+with_ascii_lowercased: Str -> Str
+
+# Only the ASCII letters are lowercased; the non-ASCII "É" is left unchanged.
+expect Str.with_ascii_lowercased "cOFFÉ" == "coffÉ"
--- a/crates/compiler/builtins/src/bitcode.rs
+++ b/crates/compiler/builtins/src/bitcode.rs
@ -358,6 +358,7 @@ pub const STR_CLONE_TO: &str = "roc_builtins.str.clone_to";
 pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
 pub const STR_ALLOCATION_PTR: &str = "roc_builtins.str.allocation_ptr";
 pub const STR_RELEASE_EXCESS_CAPACITY: &str = "roc_builtins.str.release_excess_capacity";
+pub const STR_WITH_ASCII_LOWERCASED: &str = "roc_builtins.str.with_ascii_lowercased";

 pub const LIST_MAP: &str = "roc_builtins.list.map";
 pub const LIST_MAP2: &str = "roc_builtins.list.map2";