with_ascii_lowercased zig builtin

This commit is contained in:
Norbert Hajagos 2025-01-08 21:22:33 +01:00
parent 809fe23afd
commit 8403f1ea19
No known key found for this signature in database
36 changed files with 303 additions and 163 deletions

View file

@ -211,6 +211,7 @@ comptime {
exportStrFn(str.withCapacityC, "with_capacity");
exportStrFn(str.strAllocationPtr, "allocation_ptr");
exportStrFn(str.strReleaseExcessCapacity, "release_excess_capacity");
exportStrFn(str.strWithAsciiLowercased, "with_ascii_lowercased");
for (INTEGERS) |T| {
str.exportFromInt(T, ROC_BUILTINS ++ "." ++ STR ++ ".from_int.");

View file

@ -2,6 +2,7 @@ const utils = @import("utils.zig");
const RocList = @import("list.zig").RocList;
const UpdateMode = utils.UpdateMode;
const std = @import("std");
const ascii = std.ascii;
const mem = std.mem;
const unicode = std.unicode;
const testing = std.testing;
@ -374,7 +375,12 @@ pub const RocStr = extern struct {
return 1;
}
const ptr: [*]usize = @as([*]usize, @ptrCast(@alignCast(self.bytes)));
const data_ptr = if (self.isSeamlessSlice())
self.getAllocationPtr()
else
self.bytes;
const ptr: [*]usize = @as([*]usize, @ptrCast(@alignCast(data_ptr)));
return (ptr - 1)[0];
}
@ -611,16 +617,6 @@ fn initFromSmallStr(slice_bytes: [*]u8, len: usize, _: usize) RocStr {
return RocStr.init(slice_bytes, len);
}
// Wraps `len` bytes starting at `slice_bytes` in a seamless slice rather than
// copying them into a new small str.
// `alloc_ptr` has to be pre-shifted into the form a seamless slice stores.
fn initFromBigStr(slice_bytes: [*]u8, len: usize, alloc_ptr: usize) RocStr {
    // Setting SEAMLESS_SLICE_BIT in the length is what marks the result as a slice.
    const tagged_len = len | SEAMLESS_SLICE_BIT;
    return .{
        .bytes = slice_bytes,
        .length = tagged_len,
        .capacity_or_alloc_ptr = alloc_ptr,
    };
}
fn strSplitOnHelp(array: [*]RocStr, string: RocStr, delimiter: RocStr) void {
if (delimiter.len() == 0) {
string.incref(1);
@ -1968,6 +1964,66 @@ fn countTrailingWhitespaceBytes(string: RocStr) usize {
return byte_count;
}
// Str.with_ascii_lowercased
//
// Returns `string` with every ASCII uppercase letter replaced by its lowercase
// counterpart. All other bytes are passed through `ascii.toLower` unchanged, so
// non-ASCII characters keep their original encoding.
//
// Takes ownership of `string`:
// - if it is unique, it is lowercased in place and returned as-is;
// - otherwise its bytes are copied into a fresh RocStr and this function's
//   reference to the shared original is released.
pub fn strWithAsciiLowercased(string: RocStr) callconv(.C) RocStr {
    var new_str = if (string.isUnique())
        string
    else blk: {
        // Copy while we still hold our reference and release it only afterwards,
        // so the source bytes are guaranteed to be alive during the copy.
        const copy = RocStr.fromSlice(string.asSlice());
        string.decref();
        break :blk copy;
    };

    // Take the length from the string we are about to mutate.
    const new_str_bytes = new_str.asU8ptrMut()[0..new_str.len()];
    for (new_str_bytes) |*c| {
        c.* = ascii.toLower(c.*);
    }
    return new_str;
}
test "withAsciiLowercased: small str" {
    // The input must be a small str for this case to be meaningful (asserted below).
    const input = RocStr.fromSlice("cOFFÉ");
    try expect(input.isSmallStr());

    // Only the ASCII letters are expected to change; `É` stays as it is.
    const want = RocStr.fromSlice("coffÉ");
    defer want.decref();

    const got = strWithAsciiLowercased(input);
    defer got.decref();

    try expect(got.isSmallStr());
    try expect(RocStr.eq(got, want));
}
test "withAsciiLowercased: non small str" {
    // Long enough that it is not stored as a small str (asserted below).
    const original = RocStr.fromSlice("cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ");
    try expect(!original.isSmallStr());

    const expected = RocStr.fromSlice("coffÉ coffÉ coffÉ coffÉ coffÉ coffÉ");
    defer expected.decref();

    // strWithAsciiLowercased takes ownership of its argument, so after this call
    // only the returned string is released, never `original` itself.
    const str_result = strWithAsciiLowercased(original);
    defer str_result.decref();

    try expect(!str_result.isSmallStr());
    try expect(str_result.eq(expected));
}
test "withAsciiLowercased: seamless slice" {
    const l = RocStr.fromSlice("cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ");
    // Dropping the first byte of a big string is expected to yield a seamless
    // slice (asserted below). NOTE(review): `l` is not released here, presumably
    // because substringUnsafeC takes ownership of it — confirm.
    const original = substringUnsafeC(l, 1, l.len() - 1);
    try expect(original.isSeamlessSlice());

    const expected = RocStr.fromSlice("offÉ coffÉ coffÉ coffÉ coffÉ coffÉ");
    defer expected.decref();

    // strWithAsciiLowercased takes ownership of its argument, so after this call
    // only the returned string is released, never `original` itself.
    const str_result = strWithAsciiLowercased(original);
    defer str_result.decref();

    try expect(!str_result.isSmallStr());
    try expect(str_result.eq(expected));
}
// No-op with the C calling convention: ignores the pointer it is given and does nothing.
// Presumably passed where a refcount callback is required but no work is needed — confirm at call sites.
fn rcNone(_: ?[*]u8) callconv(.C) void {}
fn decStr(ptr: ?[*]u8) callconv(.C) void {

View file

@ -369,6 +369,7 @@ module [
contains,
drop_prefix,
drop_suffix,
with_ascii_lowercased,
]
import Bool exposing [Bool]
@ -1092,3 +1093,18 @@ drop_suffix = |haystack, suffix|
substring_unsafe(haystack, start, len)
else
haystack
## Returns a version of the string with all [ASCII characters](https://en.wikipedia.org/wiki/ASCII) lowercased. Non-ASCII characters are left unmodified. For example:
##
## ```roc
## expect "CAFÉ".with_ascii_lowercased() == "cafÉ"
## ```
##
## This function is useful for things like [command-line options](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option) and [environment variables](https://en.wikipedia.org/wiki/Environment_variable) where you know in advance that you're dealing with a hardcoded string containing only ASCII characters. It has better performance than lowercasing operations which take Unicode into account.
##
## That said, strings received from user input can always contain non-ASCII Unicode characters, and lowercasing [Unicode](https://unicode.org) works differently in different languages. For example, the string `"I"` lowercases to `"i"` in English and to `"ı"` (a [dotless i](https://en.wikipedia.org/wiki/Dotless_I)) in Turkish. These rules can also change in each [Unicode release](https://www.unicode.org/releases/), so we have a separate [`unicode` package](https://github.com/roc-lang/unicode) for Unicode capitalization that can be upgraded independently from the language's builtins.
##
## To do a case-insensitive comparison of the ASCII characters in a string, use [`caseless_ascii_equals`](#caseless_ascii_equals).
with_ascii_lowercased: Str -> Str

# Only the ASCII letters are lowercased; the non-ASCII `É` is left unmodified.
expect Str.with_ascii_lowercased "cOFFÉ" == "coffÉ"

View file

@ -358,6 +358,7 @@ pub const STR_CLONE_TO: &str = "roc_builtins.str.clone_to";
pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
pub const STR_ALLOCATION_PTR: &str = "roc_builtins.str.allocation_ptr";
pub const STR_RELEASE_EXCESS_CAPACITY: &str = "roc_builtins.str.release_excess_capacity";
// Symbol name for the `Str.with_ascii_lowercased` builtin.
// NOTE(review): presumably has to match the name exported from the Zig builtins — confirm.
pub const STR_WITH_ASCII_LOWERCASED: &str = "roc_builtins.str.with_ascii_lowercased";
pub const LIST_MAP: &str = "roc_builtins.list.map";
pub const LIST_MAP2: &str = "roc_builtins.list.map2";