mirror of
https://github.com/roc-lang/roc.git
synced 2025-08-04 12:18:19 +00:00
Merge pull request #4364 from travisstaloch/str-graphemes
Str graphemes
This commit is contained in:
commit
f734cc11c2
11 changed files with 94 additions and 45 deletions
|
@ -435,7 +435,12 @@ pub fn build_c_host_native(
|
|||
_ => {
|
||||
command.args(&[
|
||||
shared_lib_path.to_str().unwrap(),
|
||||
&bitcode::get_builtins_host_obj_path(),
|
||||
// This line is commented out because
|
||||
// @bhansconnect: With the addition of Str.graphemes, always
|
||||
// linking the built-ins led to a surgical linker bug for
|
||||
// optimized builds. Disabling until it is needed for dev
|
||||
// builds.
|
||||
// &bitcode::get_builtins_host_obj_path(),
|
||||
"-fPIE",
|
||||
"-pie",
|
||||
"-lm",
|
||||
|
|
|
@ -145,6 +145,7 @@ comptime {
|
|||
exportStrFn(str.strTrimRight, "trim_right");
|
||||
exportStrFn(str.strCloneTo, "clone_to");
|
||||
exportStrFn(str.withCapacity, "with_capacity");
|
||||
exportStrFn(str.strGraphemes, "graphemes");
|
||||
|
||||
inline for (INTEGERS) |T| {
|
||||
str.exportFromInt(T, ROC_BUILTINS ++ "." ++ STR ++ ".from_int.");
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
const utils = @import("utils.zig");
|
||||
const RocList = @import("list.zig").RocList;
|
||||
const grapheme = @import("helpers/grapheme.zig");
|
||||
const UpdateMode = utils.UpdateMode;
|
||||
const std = @import("std");
|
||||
const mem = std.mem;
|
||||
|
@ -1212,7 +1213,6 @@ test "countSegments: string equals delimiter" {
|
|||
}
|
||||
|
||||
// Str.countGraphemeClusters
|
||||
const grapheme = @import("helpers/grapheme.zig");
|
||||
pub fn countGraphemeClusters(string: RocStr) callconv(.C) usize {
|
||||
if (string.isEmpty()) {
|
||||
return 0;
|
||||
|
@ -1248,59 +1248,76 @@ pub fn countGraphemeClusters(string: RocStr) callconv(.C) usize {
|
|||
return count;
|
||||
}
|
||||
|
||||
test "countGraphemeClusters: empty string" {
|
||||
const count = countGraphemeClusters(RocStr.empty());
|
||||
try expectEqual(count, 0);
|
||||
// Str.graphemes
|
||||
/// Splits `roc_str` into its grapheme clusters and returns them, in order, as a
/// `RocList` whose elements are `RocStr` values.
///
/// The list is sized up front with `countGraphemeClusters(roc_str)`, and every
/// element is built with `RocStr.fromSlice` from the bytes of exactly one
/// cluster. When the allocated list exposes no element storage (`elements`
/// yields null -- presumably the empty-string case, TODO confirm against
/// `RocList.allocate`), the list is returned untouched.
///
/// The input is assumed to be valid UTF-8; invalid input hits `unreachable`.
pub fn strGraphemes(roc_str: RocStr) callconv(.C) RocList {
    var break_state: ?grapheme.BoundClass = null;
    var opt_last_codepoint: ?u21 = null;
    var index: usize = 0;
    // Byte length of the cluster currently being accumulated. This has to be a
    // usize: one cluster may hold arbitrarily many combining code points, so a
    // u8 counter overflows as soon as a cluster grows past 255 bytes.
    var cluster_len: usize = 0;

    var result = RocList.allocate(@alignOf(RocStr), countGraphemeClusters(roc_str), @sizeOf(RocStr));
    const graphemes = result.elements(RocStr) orelse return result;

    // `slice` always starts at the first byte of the cluster being accumulated.
    var slice = roc_str.asSlice();
    var iter = (unicode.Utf8View.init(slice) catch unreachable).iterator();

    while (iter.nextCodepoint()) |cur_codepoint| {
        const cur_codepoint_len = unicode.utf8CodepointSequenceLength(cur_codepoint) catch unreachable;
        if (opt_last_codepoint) |last_codepoint| {
            const did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, &break_state);
            if (did_break) {
                // Everything accumulated before `cur_codepoint` forms one cluster:
                // emit it, then advance `slice` past it.
                graphemes[index] = RocStr.fromSlice(slice[0..cluster_len]);
                slice = slice[cluster_len..];
                index += 1;
                break_state = null;
                cluster_len = 0;
            }
        }
        cluster_len += cur_codepoint_len;
        opt_last_codepoint = cur_codepoint;
    }

    // Whatever remains in `slice` is the final cluster.
    graphemes[index] = RocStr.fromSlice(slice);
    return result;
}
|
||||
|
||||
test "countGraphemeClusters: ascii characters" {
|
||||
const bytes_arr = "abcd";
|
||||
const bytes_len = bytes_arr.len;
|
||||
const str = RocStr.init(bytes_arr, bytes_len);
|
||||
defer str.deinit();
|
||||
// these test both countGraphemeClusters() and strGraphemes()
|
||||
fn graphemesTest(input: []const u8, expected: []const []const u8) !void {
|
||||
const rocstr = RocStr.fromSlice(input);
|
||||
defer rocstr.deinit();
|
||||
const count = countGraphemeClusters(rocstr);
|
||||
try expectEqual(expected.len, count);
|
||||
|
||||
const count = countGraphemeClusters(str);
|
||||
try expectEqual(count, 4);
|
||||
const graphemes = strGraphemes(rocstr);
|
||||
defer graphemes.deinit(u8);
|
||||
if (input.len == 0) return; // empty string
|
||||
const elems = graphemes.elements(RocStr) orelse unreachable;
|
||||
for (expected) |g, i| {
|
||||
try std.testing.expectEqualStrings(g, elems[i].asSlice());
|
||||
}
|
||||
}
|
||||
|
||||
test "countGraphemeClusters: utf8 characters" {
|
||||
const bytes_arr = "ãxā";
|
||||
const bytes_len = bytes_arr.len;
|
||||
const str = RocStr.init(bytes_arr, bytes_len);
|
||||
defer str.deinit();
|
||||
|
||||
const count = countGraphemeClusters(str);
|
||||
try expectEqual(count, 3);
|
||||
test "graphemes: empty string" {
|
||||
try graphemesTest("", &.{});
|
||||
}
|
||||
|
||||
test "countGraphemeClusters: emojis" {
|
||||
const bytes_arr = "🤔🤔🤔";
|
||||
const bytes_len = bytes_arr.len;
|
||||
const str = RocStr.init(bytes_arr, bytes_len);
|
||||
defer str.deinit();
|
||||
|
||||
const count = countGraphemeClusters(str);
|
||||
try expectEqual(count, 3);
|
||||
test "graphemes: ascii characters" {
|
||||
try graphemesTest("abcd", &.{ "a", "b", "c", "d" });
|
||||
}
|
||||
|
||||
test "countGraphemeClusters: emojis and ut8 characters" {
|
||||
const bytes_arr = "🤔å🤔¥🤔ç";
|
||||
const bytes_len = bytes_arr.len;
|
||||
const str = RocStr.init(bytes_arr, bytes_len);
|
||||
defer str.deinit();
|
||||
|
||||
const count = countGraphemeClusters(str);
|
||||
try expectEqual(count, 6);
|
||||
test "graphemes: utf8 characters" {
|
||||
try graphemesTest("ãxā", &.{ "ã", "x", "ā" });
|
||||
}
|
||||
|
||||
test "countGraphemeClusters: emojis, ut8, and ascii characters" {
|
||||
const bytes_arr = "6🤔å🤔e¥🤔çpp";
|
||||
const bytes_len = bytes_arr.len;
|
||||
const str = RocStr.init(bytes_arr, bytes_len);
|
||||
defer str.deinit();
|
||||
test "graphemes: emojis" {
|
||||
try graphemesTest("🤔🤔🤔", &.{ "🤔", "🤔", "🤔" });
|
||||
}
|
||||
|
||||
const count = countGraphemeClusters(str);
|
||||
try expectEqual(count, 10);
|
||||
test "graphemes: emojis and ut8 characters" {
|
||||
try graphemesTest("🤔å🤔¥🤔ç", &.{ "🤔", "å", "🤔", "¥", "🤔", "ç" });
|
||||
}
|
||||
|
||||
test "graphemes: emojis, ut8, and ascii characters" {
|
||||
try graphemesTest("6🤔å🤔e¥🤔çpp", &.{ "6", "🤔", "å", "🤔", "e", "¥", "🤔", "ç", "p", "p" });
|
||||
}
|
||||
|
||||
pub fn countUtf8Bytes(string: RocStr) callconv(.C) usize {
|
||||
|
|
|
@ -45,6 +45,7 @@ interface Str
|
|||
walkScalarsUntil,
|
||||
withCapacity,
|
||||
withPrefix,
|
||||
graphemes,
|
||||
]
|
||||
imports [
|
||||
Bool.{ Bool, Eq },
|
||||
|
@ -180,6 +181,9 @@ repeat : Str, Nat -> Str
|
|||
## expect Str.countGraphemes "üïä" == 4
|
||||
countGraphemes : Str -> Nat
|
||||
|
||||
## Split a string into its constituent grapheme clusters
|
||||
graphemes : Str -> List Str
|
||||
|
||||
## If the string begins with a [Unicode code point](http://www.unicode.org/glossary/#code_point)
|
||||
## equal to the given [U32], return `Bool.true`. Otherwise return `Bool.false`.
|
||||
##
|
||||
|
|
|
@ -362,6 +362,7 @@ pub const STR_APPEND_SCALAR: &str = "roc_builtins.str.append_scalar";
|
|||
pub const STR_GET_SCALAR_UNSAFE: &str = "roc_builtins.str.get_scalar_unsafe";
|
||||
pub const STR_CLONE_TO: &str = "roc_builtins.str.clone_to";
|
||||
pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
|
||||
pub const STR_GRAPHEMES: &str = "roc_builtins.str.graphemes";
|
||||
|
||||
pub const LIST_MAP: &str = "roc_builtins.list.map";
|
||||
pub const LIST_MAP2: &str = "roc_builtins.list.map2";
|
||||
|
|
|
@ -125,6 +125,7 @@ map_symbol_to_lowlevel_and_arity! {
|
|||
StrToNum; STR_TO_NUM; 1,
|
||||
StrGetCapacity; STR_CAPACITY; 1,
|
||||
StrWithCapacity; STR_WITH_CAPACITY; 1,
|
||||
StrGraphemes; STR_GRAPHEMES; 1,
|
||||
|
||||
ListLen; LIST_LEN; 1,
|
||||
ListWithCapacity; LIST_WITH_CAPACITY; 1,
|
||||
|
|
|
@ -6049,6 +6049,20 @@ fn run_low_level<'a, 'ctx, 'env>(
|
|||
bitcode::STR_WITH_CAPACITY,
|
||||
)
|
||||
}
|
||||
StrGraphemes => {
|
||||
// Str.graphemes : Str -> List Str
|
||||
debug_assert_eq!(args.len(), 1);
|
||||
|
||||
let string = load_symbol(scope, &args[0]);
|
||||
|
||||
call_str_bitcode_fn(
|
||||
env,
|
||||
&[string],
|
||||
&[],
|
||||
BitcodeReturns::List,
|
||||
bitcode::STR_GRAPHEMES,
|
||||
)
|
||||
}
|
||||
ListLen => {
|
||||
// List.len : List * -> Nat
|
||||
debug_assert_eq!(args.len(), 1);
|
||||
|
|
|
@ -304,6 +304,7 @@ impl<'a> LowLevelCall<'a> {
|
|||
self.load_args_and_call_zig(backend, bitcode::STR_SUBSTRING_UNSAFE)
|
||||
}
|
||||
StrWithCapacity => self.load_args_and_call_zig(backend, bitcode::STR_WITH_CAPACITY),
|
||||
StrGraphemes => self.load_args_and_call_zig(backend, bitcode::STR_GRAPHEMES),
|
||||
|
||||
// List
|
||||
ListLen => match backend.storage.get(&self.arguments[0]) {
|
||||
|
|
|
@ -31,6 +31,7 @@ pub enum LowLevel {
|
|||
StrGetScalarUnsafe,
|
||||
StrGetCapacity,
|
||||
StrWithCapacity,
|
||||
StrGraphemes,
|
||||
ListLen,
|
||||
ListWithCapacity,
|
||||
ListReserve,
|
||||
|
@ -250,6 +251,7 @@ map_symbol_to_lowlevel! {
|
|||
StrToNum <= STR_TO_NUM,
|
||||
StrGetCapacity <= STR_CAPACITY,
|
||||
StrWithCapacity <= STR_WITH_CAPACITY,
|
||||
StrGraphemes <= STR_GRAPHEMES,
|
||||
ListLen <= LIST_LEN,
|
||||
ListGetCapacity <= LIST_CAPACITY,
|
||||
ListWithCapacity <= LIST_WITH_CAPACITY,
|
||||
|
|
|
@ -1318,6 +1318,7 @@ define_builtins! {
|
|||
52 STR_REPLACE_LAST: "replaceLast"
|
||||
53 STR_WITH_CAPACITY: "withCapacity"
|
||||
54 STR_WITH_PREFIX: "withPrefix"
|
||||
55 STR_GRAPHEMES: "graphemes"
|
||||
}
|
||||
6 LIST: "List" => {
|
||||
0 LIST_LIST: "List" exposed_apply_type=true // the List.List type alias
|
||||
|
|
|
@ -879,8 +879,10 @@ pub fn lowlevel_borrow_signature(arena: &Bump, op: LowLevel) -> &[bool] {
|
|||
// - other refcounted arguments are Borrowed
|
||||
match op {
|
||||
Unreachable => arena.alloc_slice_copy(&[irrelevant]),
|
||||
ListLen | StrIsEmpty | StrToScalars | StrCountGraphemes | StrCountUtf8Bytes
|
||||
| StrGetCapacity | ListGetCapacity => arena.alloc_slice_copy(&[borrowed]),
|
||||
ListLen | StrIsEmpty | StrToScalars | StrCountGraphemes | StrGraphemes
|
||||
| StrCountUtf8Bytes | StrGetCapacity | ListGetCapacity => {
|
||||
arena.alloc_slice_copy(&[borrowed])
|
||||
}
|
||||
ListWithCapacity | StrWithCapacity => arena.alloc_slice_copy(&[irrelevant]),
|
||||
ListReplaceUnsafe => arena.alloc_slice_copy(&[owned, irrelevant, irrelevant]),
|
||||
StrGetUnsafe | ListGetUnsafe => arena.alloc_slice_copy(&[borrowed, irrelevant]),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue