mirror of
https://github.com/roc-lang/roc.git
synced 2025-09-26 13:29:12 +00:00
Add a fast path for validating ASCII characters
From my quick testing, using the code that tests this function in the Zig standard library, the speedups are: small ASCII: 2.2x; small unicode: 0.9x; medium ASCII: 15x; medium unicode: 1.01x; medium mixed: 3.25x; large ASCII: 32.8x; large unicode: 1.03x; large mixed: 3.35x. Here "small" means less than 8 bytes and "large" means roughly greater than 512 bytes.
This commit is contained in:
parent
2023770ce7
commit
f6677f3c1f
1 changed files with 41 additions and 2 deletions
|
@ -1878,8 +1878,47 @@ fn errorToProblem(bytes: [*]u8, length: usize) struct { index: usize, problem: U
|
|||
}
|
||||
|
||||
/// Reports whether the `len` bytes starting at `ptr` form well-formed UTF-8.
///
/// ASCII-only stretches are checked eight bytes at a time by testing the high
/// bit of every byte in a 64-bit word. Non-ASCII bytes found by that scan are
/// validated one code point at a time, and whatever is left near the end of
/// the buffer is handed to `unicode.utf8ValidateSlice`.
///
/// An empty buffer (`len == 0`) is valid. No byte outside `ptr[0..len]` is read.
pub fn isValidUnicode(ptr: [*]u8, len: usize) callconv(.C) bool {
    const buf: []u8 = ptr[0..len];

    const size = @sizeOf(u64);
    // TODO: we should test changing the step on other platforms.
    // The general tradeoff is making extremely large strings potentially much faster
    // at the cost of small strings being slightly slower.
    const step = size;
    // A set bit in this mask marks a byte >= 0x80, i.e. a non-ASCII byte.
    const high_bits: u64 = 0x8080_8080_8080_8080;

    var i: usize = 0;
    // Invariant: i <= buf.len, so the subtraction cannot wrap.
    while (buf.len - i > step) {
        // Must be exactly 64 bits wide: `usize` is only 4 bytes on 32-bit targets.
        var word: u64 = undefined;
        @memcpy(@ptrCast([*]u8, &word), buf.ptr + i, size);
        if (word & high_bits == 0) {
            i += step;
            continue;
        }

        // The word holds at least one non-ASCII byte, so this scan stops
        // inside the word and therefore inside the buffer.
        while (buf[i] < 0b1000_0000) : (i += 1) {}

        while (i < buf.len and buf[i] >= 0b1000_0000) {
            // Fewer than four bytes remain: leave them to the bounds-checked
            // tail validation below instead of over-reading the buffer.
            if (buf.len - i < 4) break;

            // This forces prefetching, otherwise the loop can run at about half speed.
            var small_buf = [4]u8{ 0, 0, 0, 0 };
            @memcpy(&small_buf, buf.ptr + i, small_buf.len);
            // TODO: Should we always inline these function calls below?
            if (std.unicode.utf8ByteSequenceLength(small_buf[0])) |cp_len| {
                if (std.meta.isError(std.unicode.utf8Decode(small_buf[0..cp_len]))) {
                    return false;
                }
                i += cp_len;
            } else |_| {
                return false;
            }
        }
    }

    // Skip trailing ASCII; the length check comes first so `buf[i]` is never
    // evaluated at `i == buf.len` (this also covers the empty buffer).
    while (i < buf.len and buf[i] < 0b1000_0000) : (i += 1) {}
    if (i == buf.len) return true;

    return @call(.{ .modifier = always_inline }, unicode.utf8ValidateSlice, .{buf[i..]});
}
|
||||
|
||||
const Utf8DecodeError = error{
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue