builtin(str): implement Str.graphemes

Signed-off-by: Prajwal S N <prajwalnadig21@gmail.com>
This commit is contained in:
Prajwal S N 2022-10-10 20:51:25 +05:30 committed by Travis Staloch
parent b6a282b0ce
commit df7e4eea7e
No known key found for this signature in database
GPG key ID: 9726F5C64475E635
10 changed files with 67 additions and 3 deletions

View file

@ -145,6 +145,7 @@ comptime {
exportStrFn(str.strTrimRight, "trim_right");
exportStrFn(str.strCloneTo, "clone_to");
exportStrFn(str.withCapacity, "with_capacity");
exportStrFn(str.strGraphemes, "str_graphemes");
inline for (INTEGERS) |T| {
str.exportFromInt(T, ROC_BUILTINS ++ "." ++ STR ++ ".from_int.");

View file

@ -1,5 +1,6 @@
const utils = @import("utils.zig");
const RocList = @import("list.zig").RocList;
const grapheme = @import("helpers/grapheme.zig");
const UpdateMode = utils.UpdateMode;
const std = @import("std");
const mem = std.mem;
@ -1212,7 +1213,6 @@ test "countSegments: string equals delimiter" {
}
// Str.countGraphemeClusters
const grapheme = @import("helpers/grapheme.zig");
pub fn countGraphemeClusters(string: RocStr) callconv(.C) usize {
if (string.isEmpty()) {
return 0;
@ -1303,6 +1303,43 @@ test "countGraphemeClusters: emojis, ut8, and ascii characters" {
try expectEqual(count, 10);
}
// Str.graphemes
/// Str.graphemes: splits `string` into its grapheme clusters and returns them
/// as a list of `RocStr`, in order of appearance.
///
/// The list is allocated with exactly `countGraphemeClusters(string)` element
/// slots, and each slot is filled with `RocStr.init` over the byte range of one
/// cluster. Cluster boundaries are found with `grapheme.isGraphemeBreak`, using
/// the same "reset the break state after every break" scheme as the counting
/// pass so that both passes agree on the number of clusters.
///
/// NOTE(review): `string` is not decref'd here; ownership of the input is
/// assumed to be handled by the caller — confirm against the call site.
pub fn strGraphemes(string: RocStr) callconv(.C) RocList {
    const grapheme_count = countGraphemeClusters(string);
    const list = RocList.allocate(@alignOf(RocStr), grapheme_count, @sizeOf(RocStr));

    // An empty string has zero graphemes, so the list has no element slot to
    // write to. Returning here avoids storing a "last grapheme" out of bounds.
    if (grapheme_count == 0) {
        return list;
    }

    const graphemes = @ptrCast([*]RocStr, @alignCast(@alignOf(RocStr), list.bytes));
    const bytes_ptr = string.asU8ptr();
    const bytes = bytes_ptr[0..string.len()];

    // Assumes the string's bytes are valid UTF-8 (hence `catch unreachable`).
    var iter = (unicode.Utf8View.init(bytes) catch unreachable).iterator();

    var grapheme_break_state: ?grapheme.BoundClass = null;
    var opt_last_codepoint: ?u21 = null;
    var list_index: usize = 0;
    // Byte offset at which the grapheme currently being accumulated begins.
    var start_index: usize = 0;
    // Byte offset of the code point currently being examined, i.e. the total
    // encoded length of every code point consumed before it.
    var str_index: usize = 0;

    while (iter.nextCodepoint()) |cur_codepoint| {
        if (opt_last_codepoint) |last_codepoint| {
            const did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, &grapheme_break_state);
            if (did_break) {
                // The finished grapheme spans [start_index, str_index): it ends
                // right before the current code point, which opens the next one.
                graphemes[list_index] = RocStr.init(bytes_ptr + start_index, str_index - start_index);
                list_index += 1;
                start_index = str_index;
                grapheme_break_state = null;
            }
        }

        // Advance past the current code point only after the break check, so
        // that `str_index` pointed at its first byte during the check above.
        str_index += unicode.utf8CodepointSequenceLength(cur_codepoint) catch unreachable;
        opt_last_codepoint = cur_codepoint;
    }

    // Append last grapheme: everything from the last break to the end of input.
    graphemes[list_index] = RocStr.init(bytes_ptr + start_index, str_index - start_index);

    return list;
}
/// Returns `string.len()` unchanged. Judging by the function name, this is the
/// string's length measured in UTF-8 bytes (not code points or graphemes).
pub fn countUtf8Bytes(string: RocStr) callconv(.C) usize {
    const byte_count: usize = string.len();
    return byte_count;
}

View file

@ -45,6 +45,7 @@ interface Str
walkScalarsUntil,
withCapacity,
withPrefix,
graphemes,
]
imports [
Bool.{ Bool, Eq },
@ -180,6 +181,9 @@ repeat : Str, Nat -> Str
## expect Str.countGraphemes "üïä" == 4
countGraphemes : Str -> Nat
## Split a string into its constituent grapheme clusters
graphemes : Str -> List Str
## If the string begins with a [Unicode code point](http://www.unicode.org/glossary/#code_point)
## equal to the given [U32], return `Bool.true`. Otherwise return `Bool.false`.
##

View file

@ -362,6 +362,7 @@ pub const STR_APPEND_SCALAR: &str = "roc_builtins.str.append_scalar";
pub const STR_GET_SCALAR_UNSAFE: &str = "roc_builtins.str.get_scalar_unsafe";
pub const STR_CLONE_TO: &str = "roc_builtins.str.clone_to";
pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
pub const STR_GRAPHEMES: &str = "roc_builtins.str.str_graphemes";
pub const LIST_MAP: &str = "roc_builtins.list.map";
pub const LIST_MAP2: &str = "roc_builtins.list.map2";