Add a fast path for validating ASCII characters

From my quick testing using the code for testing this function in zig standard lib:

small ascii:     2.2x
small unicode:   0.9x
medium ascii:    15x
medium unicode:  1.01x
medium mix:      3.25x
large ascii:     32.8x
large unicode:   1.03x
large mixed:     3.35x

small being less than 8 bytes.
large being roughly greater than 512 bytes.
This commit is contained in:
Brendan Hansknecht 2023-03-14 17:31:32 -07:00
parent 2023770ce7
commit f6677f3c1f
No known key found for this signature in database
GPG key ID: 0EA784685083E75B

View file

@ -1878,8 +1878,47 @@ fn errorToProblem(bytes: [*]u8, length: usize) struct { index: usize, problem: U
}
/// Returns true when the `len` bytes starting at `ptr` form valid UTF-8.
///
/// Runs of ASCII are skipped one 64-bit word at a time; only bytes with the
/// high bit set go through the per-codepoint decoder. Anything that cannot be
/// handled without reading past the end of the input is left to
/// `unicode.utf8ValidateSlice`, so the result matches a plain call to that
/// function on the whole slice.
pub fn isValidUnicode(ptr: [*]u8, len: usize) callconv(.C) bool {
    const buf: []u8 = ptr[0..len];
    const size = @sizeOf(u64);
    // TODO: we should test changing the step on other platforms.
    // The general tradeoff is making extremely large strings potentially much faster
    // at the cost of small strings being slightly slower.
    const step = size;
    var i: usize = 0;
    while (i + step < buf.len) {
        // Must be u64 (not usize): `size` bytes are copied in and the mask
        // below is 64 bits wide, regardless of the target's pointer width.
        var bytes: u64 = 0;
        @memcpy(@ptrCast([*]u8, &bytes), @ptrCast([*]const u8, buf) + i, size);
        const unicode_bytes = bytes & 0x8080_8080_8080_8080;
        if (unicode_bytes == 0) {
            // All eight bytes are ASCII.
            i += step;
            continue;
        }

        // At least one of the eight bytes has its high bit set, so this scan
        // stops before leaving the word that was just inspected.
        while (buf[i] < 0b1000_0000) : (i += 1) {}

        while (buf[i] >= 0b1000_0000) {
            // The copy below reads four bytes. Stop early when fewer than
            // five remain: that keeps both the copy and the next `buf[i]`
            // in bounds, and the tail validation below finishes the job.
            if (i + 4 >= buf.len) break;

            // This forces prefetching, otherwise the loop can run at about half speed.
            var small_buf = [4]u8{ 0, 0, 0, 0 };
            @memcpy(&small_buf, @ptrCast([*]const u8, buf) + i, small_buf.len);
            // TODO: Should we always inline these function calls below?
            if (std.unicode.utf8ByteSequenceLength(small_buf[0])) |cp_len| {
                if (std.meta.isError(std.unicode.utf8Decode(small_buf[0..cp_len]))) {
                    return false;
                }
                i += cp_len;
            } else |_| {
                return false;
            }
        }
    }

    // Skip any remaining ASCII, checking the bound before indexing.
    while (i < buf.len and buf[i] < 0b1000_0000) : (i += 1) {}
    if (i == buf.len) return true;

    return @call(.{ .modifier = always_inline }, unicode.utf8ValidateSlice, .{buf[i..]});
}
const Utf8DecodeError = error{