mirror of
https://github.com/roc-lang/roc.git
synced 2025-08-03 03:42:17 +00:00
remove all compiler-rt and libc code ports from zig bitcode
This commit is contained in:
parent be06599bb6
commit 797ba64003
15 changed files with 0 additions and 1577 deletions
@ -1,478 +0,0 @@
const std = @import("std");
const builtin = @import("builtin");
const math = std.math;

// Eventually, we need to statically ingest compiler-rt and get it working with the surgical linker; then these should not be needed anymore.
// Until then, we are manually ingesting the used parts of compiler-rt here.
//
// Taken from
// https://github.com/ziglang/zig/tree/4976b58ab16069f8d3267b69ed030f29685c1abe/lib/compiler_rt/
// Thank you Zig Contributors!

// Libcalls that involve u128 on Windows x86-64 are expected by LLVM to use the
// calling convention of @Vector(2, u64), rather than what's standard.
pub const want_windows_v2u64_abi = builtin.os.tag == .windows and builtin.cpu.arch == .x86_64 and @import("builtin").object_format != .c;

const v2u64 = @Vector(2, u64);

// Export these as weak in case they are already linked in by something else.
comptime {
    if (!want_windows_v2u64_abi) {
        @export(__muloti4, .{ .name = "__muloti4", .linkage = .Weak });
        @export(__lshrti3, .{ .name = "__lshrti3", .linkage = .Weak });
        @export(__divti3, .{ .name = "__divti3", .linkage = .Weak });
        @export(__modti3, .{ .name = "__modti3", .linkage = .Weak });
        @export(__umodti3, .{ .name = "__umodti3", .linkage = .Weak });
        @export(__udivti3, .{ .name = "__udivti3", .linkage = .Weak });
        @export(__fixdfti, .{ .name = "__fixdfti", .linkage = .Weak });
        @export(__fixsfti, .{ .name = "__fixsfti", .linkage = .Weak });
        @export(__fixunsdfti, .{ .name = "__fixunsdfti", .linkage = .Weak });
        @export(__fixunssfti, .{ .name = "__fixunssfti", .linkage = .Weak });
    }
}

pub fn __muloti4(a: i128, b: i128, overflow: *c_int) callconv(.C) i128 {
    if (2 * @bitSizeOf(i128) <= @bitSizeOf(usize)) {
        return muloXi4_genericFast(i128, a, b, overflow);
    } else {
        return muloXi4_genericSmall(i128, a, b, overflow);
    }
}

pub fn __divti3(a: i128, b: i128) callconv(.C) i128 {
    return div(a, b);
}

fn __divti3_windows_x86_64(a: v2u64, b: v2u64) callconv(.C) v2u64 {
    return @as(v2u64, @bitCast(div(@as(i128, @bitCast(a)), @as(i128, @bitCast(b)))));
}

inline fn div(a: i128, b: i128) i128 {
    const s_a = a >> (128 - 1);
    const s_b = b >> (128 - 1);

    const an = (a ^ s_a) -% s_a;
    const bn = (b ^ s_b) -% s_b;

    const r = udivmod(u128, @as(u128, @bitCast(an)), @as(u128, @bitCast(bn)), null);
    const s = s_a ^ s_b;
    return (@as(i128, @bitCast(r)) ^ s) -% s;
}

pub fn __udivti3(a: u128, b: u128) callconv(.C) u128 {
    return udivmod(u128, a, b, null);
}

fn __udivti3_windows_x86_64(a: v2u64, b: v2u64) callconv(.C) v2u64 {
    return @as(v2u64, @bitCast(udivmod(u128, @as(u128, @bitCast(a)), @as(u128, @bitCast(b)), null)));
}

pub fn __umodti3(a: u128, b: u128) callconv(.C) u128 {
    var r: u128 = undefined;
    _ = udivmod(u128, a, b, &r);
    return r;
}

fn __umodti3_windows_x86_64(a: v2u64, b: v2u64) callconv(.C) v2u64 {
    var r: u128 = undefined;
    _ = udivmod(u128, @as(u128, @bitCast(a)), @as(u128, @bitCast(b)), &r);
    return @as(v2u64, @bitCast(r));
}

pub fn __modti3(a: i128, b: i128) callconv(.C) i128 {
    return mod(a, b);
}

fn __modti3_windows_x86_64(a: v2u64, b: v2u64) callconv(.C) v2u64 {
    return @as(v2u64, @bitCast(mod(@as(i128, @bitCast(a)), @as(i128, @bitCast(b)))));
}

inline fn mod(a: i128, b: i128) i128 {
    const s_a = a >> (128 - 1); // s = a < 0 ? -1 : 0
    const s_b = b >> (128 - 1); // s = b < 0 ? -1 : 0

    const an = (a ^ s_a) -% s_a; // negate if s == -1
    const bn = (b ^ s_b) -% s_b; // negate if s == -1

    var r: u128 = undefined;
    _ = udivmod(u128, @as(u128, @bitCast(an)), @as(u128, @bitCast(bn)), &r);
    return (@as(i128, @bitCast(r)) ^ s_a) -% s_a; // negate if s == -1
}

pub fn __fixdfti(a: f64) callconv(.C) i128 {
    return floatToInt(i128, a);
}

fn __fixdfti_windows_x86_64(a: f64) callconv(.C) v2u64 {
    return @as(v2u64, @bitCast(floatToInt(i128, a)));
}

pub fn __fixsfti(a: f32) callconv(.C) i128 {
    return floatToInt(i128, a);
}

fn __fixsfti_windows_x86_64(a: f32) callconv(.C) v2u64 {
    return @as(v2u64, @bitCast(floatToInt(i128, a)));
}

pub fn __fixunsdfti(a: f64) callconv(.C) u128 {
    return floatToInt(u128, a);
}

fn __fixunsdfti_windows_x86_64(a: f64) callconv(.C) v2u64 {
    return @as(v2u64, @bitCast(floatToInt(u128, a)));
}

pub fn __fixunssfti(a: f32) callconv(.C) u128 {
    return floatToInt(u128, a);
}

fn __fixunssfti_windows_x86_64(a: f32) callconv(.C) v2u64 {
    return @as(v2u64, @bitCast(floatToInt(u128, a)));
}

// mulo - multiplication with overflow detection
// * returns a *% b
// * sets overflow to 1 if a * b overflows, else 0
// - muloXi4_genericSmall as the default
// - muloXi4_genericFast when 2 * bitsize <= usize

inline fn muloXi4_genericSmall(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST {
    overflow.* = 0;
    const min = math.minInt(ST);
    var res: ST = a *% b;
    // Hacker's Delight, section "Overflow", subsection "Multiplication":
    // the case a = -2^{31}, b = -1 is a problem, because on some machines
    // a*b = -2^{31} with overflow. Then -2^{31}/-1 overflows and any result
    // is possible.
    // => check with a < 0 and b == -2^{31}
    if ((a < 0 and b == min) or (a != 0 and @divTrunc(res, a) != b))
        overflow.* = 1;
    return res;
}

inline fn muloXi4_genericFast(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST {
    overflow.* = 0;
    const EST = switch (ST) {
        i32 => i64,
        i64 => i128,
        i128 => i256,
        else => unreachable,
    };
    const min = math.minInt(ST);
    const max = math.maxInt(ST);
    var res: EST = @as(EST, a) * @as(EST, b);
    // invariant: -2^{bitwidth(EST)} < res < 2^{bitwidth(EST)-1}
    if (res < min or max < res)
        overflow.* = 1;
    return @as(ST, @truncate(res));
}
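As a quick illustration of the `__muloti4` contract described above (wrapped product in the return value, overflow reported through the out-parameter), a minimal test sketch — not part of the deleted file — could look like this:

test "__muloti4 reports overflow through the out-parameter" {
    var overflow: c_int = undefined;
    // 10 * 12 fits in i128: the exact product is returned and the flag is 0.
    try std.testing.expectEqual(@as(i128, 120), __muloti4(10, 12, &overflow));
    try std.testing.expectEqual(@as(c_int, 0), overflow);
    // maxInt(i128) * 2 overflows: the wrapped product is returned and the flag is 1.
    _ = __muloti4(math.maxInt(i128), 2, &overflow);
    try std.testing.expectEqual(@as(c_int, 1), overflow);
}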

const native_endian = builtin.cpu.arch.endian();
const low = switch (native_endian) {
    .Big => 1,
    .Little => 0,
};
const high = 1 - low;

pub fn udivmod(comptime DoubleInt: type, a: DoubleInt, b: DoubleInt, maybe_rem: ?*DoubleInt) DoubleInt {
    // @setRuntimeSafety(builtin.is_test);

    const double_int_bits = @typeInfo(DoubleInt).Int.bits;
    const single_int_bits = @divExact(double_int_bits, 2);
    const SingleInt = std.meta.Int(.unsigned, single_int_bits);
    const SignedDoubleInt = std.meta.Int(.signed, double_int_bits);
    const Log2SingleInt = std.math.Log2Int(SingleInt);

    const n = @as([2]SingleInt, @bitCast(a));
    const d = @as([2]SingleInt, @bitCast(b));
    var q: [2]SingleInt = undefined;
    var r: [2]SingleInt = undefined;
    var sr: c_uint = undefined;
    // special cases, X is unknown, K != 0
    if (n[high] == 0) {
        if (d[high] == 0) {
            // 0 X
            // ---
            // 0 X
            if (maybe_rem) |rem| {
                rem.* = n[low] % d[low];
            }
            return n[low] / d[low];
        }
        // 0 X
        // ---
        // K X
        if (maybe_rem) |rem| {
            rem.* = n[low];
        }
        return 0;
    }
    // n[high] != 0
    if (d[low] == 0) {
        if (d[high] == 0) {
            // K X
            // ---
            // 0 0
            if (maybe_rem) |rem| {
                rem.* = n[high] % d[low];
            }
            return n[high] / d[low];
        }
        // d[high] != 0
        if (n[low] == 0) {
            // K 0
            // ---
            // K 0
            if (maybe_rem) |rem| {
                r[high] = n[high] % d[high];
                r[low] = 0;
                rem.* = @as(DoubleInt, @bitCast(r));
            }
            return n[high] / d[high];
        }
        // K K
        // ---
        // K 0
        if ((d[high] & (d[high] - 1)) == 0) {
            // d is a power of 2
            if (maybe_rem) |rem| {
                r[low] = n[low];
                r[high] = n[high] & (d[high] - 1);
                rem.* = @as(DoubleInt, @bitCast(r));
            }
            return n[high] >> @as(Log2SingleInt, @intCast(@ctz(d[high])));
        }
        // K K
        // ---
        // K 0
        sr = @as(c_uint, @bitCast(@as(c_int, @clz(d[high])) - @as(c_int, @clz(n[high]))));
        // 0 <= sr <= single_int_bits - 2 or sr large
        if (sr > single_int_bits - 2) {
            if (maybe_rem) |rem| {
                rem.* = a;
            }
            return 0;
        }
        sr += 1;
        // 1 <= sr <= single_int_bits - 1
        // q.all = a << (double_int_bits - sr);
        q[low] = 0;
        q[high] = n[low] << @as(Log2SingleInt, @intCast(single_int_bits - sr));
        // r.all = a >> sr;
        r[high] = n[high] >> @as(Log2SingleInt, @intCast(sr));
        r[low] = (n[high] << @as(Log2SingleInt, @intCast(single_int_bits - sr))) | (n[low] >> @as(Log2SingleInt, @intCast(sr)));
    } else {
        // d[low] != 0
        if (d[high] == 0) {
            // K X
            // ---
            // 0 K
            if ((d[low] & (d[low] - 1)) == 0) {
                // d is a power of 2
                if (maybe_rem) |rem| {
                    rem.* = n[low] & (d[low] - 1);
                }
                if (d[low] == 1) {
                    return a;
                }
                sr = @ctz(d[low]);
                q[high] = n[high] >> @as(Log2SingleInt, @intCast(sr));
                q[low] = (n[high] << @as(Log2SingleInt, @intCast(single_int_bits - sr))) | (n[low] >> @as(Log2SingleInt, @intCast(sr)));
                return @as(DoubleInt, @bitCast(q));
            }
            // K X
            // ---
            // 0 K
            sr = 1 + single_int_bits + @as(c_uint, @clz(d[low])) - @as(c_uint, @clz(n[high]));
            // 2 <= sr <= double_int_bits - 1
            // q.all = a << (double_int_bits - sr);
            // r.all = a >> sr;
            if (sr == single_int_bits) {
                q[low] = 0;
                q[high] = n[low];
                r[high] = 0;
                r[low] = n[high];
            } else if (sr < single_int_bits) {
                // 2 <= sr <= single_int_bits - 1
                q[low] = 0;
                q[high] = n[low] << @as(Log2SingleInt, @intCast(single_int_bits - sr));
                r[high] = n[high] >> @as(Log2SingleInt, @intCast(sr));
                r[low] = (n[high] << @as(Log2SingleInt, @intCast(single_int_bits - sr))) | (n[low] >> @as(Log2SingleInt, @intCast(sr)));
            } else {
                // single_int_bits + 1 <= sr <= double_int_bits - 1
                q[low] = n[low] << @as(Log2SingleInt, @intCast(double_int_bits - sr));
                q[high] = (n[high] << @as(Log2SingleInt, @intCast(double_int_bits - sr))) | (n[low] >> @as(Log2SingleInt, @intCast(sr - single_int_bits)));
                r[high] = 0;
                r[low] = n[high] >> @as(Log2SingleInt, @intCast(sr - single_int_bits));
            }
        } else {
            // K X
            // ---
            // K K
            sr = @as(c_uint, @bitCast(@as(c_int, @clz(d[high])) - @as(c_int, @clz(n[high]))));
            // 0 <= sr <= single_int_bits - 1 or sr large
            if (sr > single_int_bits - 1) {
                if (maybe_rem) |rem| {
                    rem.* = a;
                }
                return 0;
            }
            sr += 1;
            // 1 <= sr <= single_int_bits
            // q.all = a << (double_int_bits - sr);
            // r.all = a >> sr;
            q[low] = 0;
            if (sr == single_int_bits) {
                q[high] = n[low];
                r[high] = 0;
                r[low] = n[high];
            } else {
                r[high] = n[high] >> @as(Log2SingleInt, @intCast(sr));
                r[low] = (n[high] << @as(Log2SingleInt, @intCast(single_int_bits - sr))) | (n[low] >> @as(Log2SingleInt, @intCast(sr)));
                q[high] = n[low] << @as(Log2SingleInt, @intCast(single_int_bits - sr));
            }
        }
    }
    // Not a special case
    // q and r are initialized with:
    // q.all = a << (double_int_bits - sr);
    // r.all = a >> sr;
    // 1 <= sr <= double_int_bits - 1
    var carry: u32 = 0;
    var r_all: DoubleInt = undefined;
    while (sr > 0) : (sr -= 1) {
        // r:q = ((r:q) << 1) | carry
        r[high] = (r[high] << 1) | (r[low] >> (single_int_bits - 1));
        r[low] = (r[low] << 1) | (q[high] >> (single_int_bits - 1));
        q[high] = (q[high] << 1) | (q[low] >> (single_int_bits - 1));
        q[low] = (q[low] << 1) | carry;
        // carry = 0;
        // if (r.all >= b) {
        //     r.all -= b;
        //     carry = 1;
        // }
        r_all = @as(DoubleInt, @bitCast(r));
        const s: SignedDoubleInt = @as(SignedDoubleInt, @bitCast(b -% r_all -% 1)) >> (double_int_bits - 1);
        carry = @as(u32, @intCast(s & 1));
        r_all -= b & @as(DoubleInt, @bitCast(s));
        r = @as([2]SingleInt, @bitCast(r_all));
    }
    const q_all = (@as(DoubleInt, @bitCast(q)) << 1) | carry;
    if (maybe_rem) |rem| {
        rem.* = r_all;
    }
    return q_all;
}
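Since `udivmod` produces both quotient and remainder from the shift-and-subtract loop above, a simple sanity check (a sketch, not from the deleted file) is the division identity a == q*b + r with r < b:

test "udivmod satisfies the division identity" {
    const a: u128 = 0xDEAD_BEEF_0123_4567_89AB_CDEF_FEDC_BA98;
    const b: u128 = 0x1_0000_0001;
    var r: u128 = undefined;
    const q = udivmod(u128, a, b, &r);
    try std.testing.expect(r < b);
    try std.testing.expectEqual(a, q *% b +% r);
}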

pub inline fn floatToInt(comptime I: type, a: anytype) I {
    const Log2Int = math.Log2Int;
    const Int = @import("std").meta.Int;
    const F = @TypeOf(a);
    const float_bits = @typeInfo(F).Float.bits;
    const int_bits = @typeInfo(I).Int.bits;
    const rep_t = Int(.unsigned, float_bits);
    const sig_bits = math.floatMantissaBits(F);
    const exp_bits = math.floatExponentBits(F);
    const fractional_bits = floatFractionalBits(F);

    // const implicit_bit = if (F != f80) (@as(rep_t, 1) << sig_bits) else 0;
    const implicit_bit = @as(rep_t, 1) << sig_bits;
    const max_exp = (1 << (exp_bits - 1));
    const exp_bias = max_exp - 1;
    const sig_mask = (@as(rep_t, 1) << sig_bits) - 1;

    // Break a into sign, exponent, significand
    const a_rep: rep_t = @as(rep_t, @bitCast(a));
    const negative = (a_rep >> (float_bits - 1)) != 0;
    const exponent = @as(i32, @intCast((a_rep << 1) >> (sig_bits + 1))) - exp_bias;
    const significand: rep_t = (a_rep & sig_mask) | implicit_bit;

    // If the exponent is negative, the result rounds to zero.
    if (exponent < 0) return 0;

    // If the value is too large for the integer type, saturate.
    switch (@typeInfo(I).Int.signedness) {
        .unsigned => {
            if (negative) return 0;
            if (@as(c_uint, @intCast(exponent)) >= @min(int_bits, max_exp)) return math.maxInt(I);
        },
        .signed => if (@as(c_uint, @intCast(exponent)) >= @min(int_bits - 1, max_exp)) {
            return if (negative) math.minInt(I) else math.maxInt(I);
        },
    }

    // If 0 <= exponent < sig_bits, right shift to get the result.
    // Otherwise, shift left.
    var result: I = undefined;
    if (exponent < fractional_bits) {
        result = @as(I, @intCast(significand >> @as(Log2Int(rep_t), @intCast(fractional_bits - exponent))));
    } else {
        result = @as(I, @intCast(significand)) << @as(Log2Int(I), @intCast(exponent - fractional_bits));
    }

    if ((@typeInfo(I).Int.signedness == .signed) and negative)
        return ~result +% 1;
    return result;
}
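The saturating behavior the comments above describe is easy to exercise through the exported wrappers; a small test sketch (assumed, not part of the original file):

test "floatToInt saturates instead of invoking UB" {
    // Negative inputs clamp to zero for unsigned targets.
    try std.testing.expectEqual(@as(u128, 0), __fixunssfti(-123.0));
    // Values beyond the integer range clamp to the extremes (1e40 > maxInt(i128) ~ 1.7e38).
    try std.testing.expectEqual(@as(i128, math.maxInt(i128)), __fixdfti(1.0e40));
    try std.testing.expectEqual(@as(i128, math.minInt(i128)), __fixdfti(-1.0e40));
}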

/// Returns the number of fractional bits in the mantissa of floating point type T.
pub inline fn floatFractionalBits(comptime T: type) comptime_int {
    comptime std.debug.assert(@typeInfo(T) == .Float);

    // Standard IEEE floats have an implicit 0.m or 1.m integer part.
    // f80 is special and has an explicitly stored bit in the MSB.
    // This function corresponds to `MANT_DIG - 1` from C.
    return switch (@typeInfo(T).Float.bits) {
        16 => 10,
        32 => 23,
        64 => 52,
        80 => 63,
        128 => 112,
        else => @compileError("unknown floating point type " ++ @typeName(T)),
    };
}

pub fn __lshrti3(a: i128, b: i32) callconv(.C) i128 {
    return lshrXi3(i128, a, b);
}

// Logical shift right: shift in 0 from left to right
// Precondition: 0 <= b < T.bit_count
inline fn lshrXi3(comptime T: type, a: T, b: i32) T {
    const word_t = HalveInt(T, false);
    const S = std.math.Log2Int(word_t.HalfT);

    const input = word_t{ .all = a };
    var output: word_t = undefined;

    if (b >= word_t.bits) {
        output.s.high = 0;
        output.s.low = input.s.high >> @as(S, @intCast(b - word_t.bits));
    } else if (b == 0) {
        return a;
    } else {
        output.s.high = input.s.high >> @as(S, @intCast(b));
        output.s.low = input.s.high << @as(S, @intCast(word_t.bits - b));
        output.s.low |= input.s.low >> @as(S, @intCast(b));
    }

    return output.all;
}

/// Gives access to the underlying bits as two equally sized lower and higher
/// signed or unsigned integers.
fn HalveInt(comptime T: type, comptime signed_half: bool) type {
    return extern union {
        pub const bits = @divExact(@typeInfo(T).Int.bits, 2);
        pub const HalfTU = std.meta.Int(.unsigned, bits);
        pub const HalfTS = std.meta.Int(.signed, bits);
        pub const HalfT = if (signed_half) HalfTS else HalfTU;

        all: T,
        s: if (native_endian == .Little)
            extern struct { low: HalfT, high: HalfT }
        else
            extern struct { high: HalfT, low: HalfT },
    };
}
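A quick check of the logical-shift semantics (zeros shifted in from the left even for a negative i128) — a test sketch, not part of the deleted file:

test "__lshrti3 shifts zeros in from the left" {
    const x: i128 = -1; // all 128 bits set
    // A logical (not arithmetic) shift by 127 leaves only the lowest bit.
    try std.testing.expectEqual(@as(i128, 1), __lshrti3(x, 127));
}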

@ -1,87 +0,0 @@
const std = @import("std");
const builtin = @import("builtin");
const arch = builtin.cpu.arch;
const musl = @import("libc/musl.zig");
const folly = @import("libc/folly.zig");
const cpuid = @import("libc/cpuid.zig");

comptime {
    // TODO: remove this workaround.
    // Our wasm LLVM pipeline always links in memcpy,
    // so our implementation would conflict with it.
    if (builtin.is_test) {
        // We don't need memcpy for tests because the tests are built with -lc.
    } else if (arch != .wasm32) {
        @export(memcpy, .{ .name = "memcpy", .linkage = .Strong });
    }
}

const Memcpy = *const fn (noalias [*]u8, noalias [*]const u8, len: usize) callconv(.C) [*]u8;

pub var memcpy_target: Memcpy = switch (arch) {
    .x86_64 => dispatch_memcpy,
    else => unreachable,
};

pub fn memcpy(noalias dest: [*]u8, noalias src: [*]const u8, len: usize) callconv(.C) [*]u8 {
    switch (builtin.os.tag) {
        .windows => {
            return musl.memcpy(dest, src, len);
        },
        else => switch (arch) {
            // x86_64 has a special optimized memcpy that can use avx2.
            .x86_64 => {
                return memcpy_target(dest, src, len);
            },
            else => {
                return musl.memcpy(dest, src, len);
            },
        },
    }
}

const MemcpyDecision = enum {
    uninitialized,
    folly_prefetchw,
    folly_prefetcht0,
    musl,
};

var memcpy_decision: MemcpyDecision = .uninitialized;

fn dispatch_memcpy(noalias dest: [*]u8, noalias src: [*]const u8, len: usize) callconv(.C) [*]u8 {
    switch (arch) {
        .x86_64 => {
            // TODO: Switch this to overwrite the memcpy_target pointer once the surgical linker can support it.
            // Then dispatch will just happen on the first call instead of every call.
            // if (cpuid.supports_avx2()) {
            //     if (cpuid.supports_prefetchw()) {
            //         memcpy_target = folly.memcpy_prefetchw;
            //     } else {
            //         memcpy_target = folly.memcpy_prefetcht0;
            //     }
            // } else {
            //     memcpy_target = musl.memcpy;
            // }
            // return memcpy_target(dest, src, len);
            switch (memcpy_decision) {
                .uninitialized => {
                    if (cpuid.supports_avx2()) {
                        if (cpuid.supports_prefetchw()) {
                            memcpy_decision = .folly_prefetchw;
                        } else {
                            memcpy_decision = .folly_prefetcht0;
                        }
                    } else {
                        memcpy_decision = .musl;
                    }
                    return dispatch_memcpy(dest, src, len);
                },
                .folly_prefetchw => return folly.memcpy_prefetchw(dest, src, len),
                .folly_prefetcht0 => return folly.memcpy_prefetcht0(dest, src, len),
                .musl => return musl.memcpy(dest, src, len),
            }
        },
        else => unreachable,
    }
}
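The TODO above describes replacing the per-call `switch` with one-time self-patching of `memcpy_target`. Assuming the surgical linker learned to relocate writable function pointers, the intended shape would be roughly the following sketch (with a hypothetical `pick_memcpy` helper):

fn pick_memcpy(noalias dest: [*]u8, noalias src: [*]const u8, len: usize) callconv(.C) [*]u8 {
    // Runs only on the first call: probe the CPU once, overwrite the global
    // pointer, then delegate. Later calls through memcpy_target jump straight
    // to the chosen implementation with no branch.
    // (A cast may be needed where .SysV and .C callconv spellings differ,
    // even though they are the same ABI on x86_64 SysV targets.)
    if (cpuid.supports_avx2()) {
        memcpy_target = if (cpuid.supports_prefetchw()) folly.memcpy_prefetchw else folly.memcpy_prefetcht0;
    } else {
        memcpy_target = musl.memcpy;
    }
    return memcpy_target(dest, src, len);
}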

@ -1,7 +0,0 @@
const builtin = @import("builtin");
const os = builtin.os;

pub const function_prefix = switch (os.tag) {
    .macos => "_",
    else => "",
};

@ -1,53 +0,0 @@
// Check if AVX2 is supported.
// Returns 1 if AVX2 is supported, 0 otherwise.
.global {[function_prefix]s}supports_avx2;
{[function_prefix]s}supports_avx2:
    // Save the EBX register.
    push %rbx

    // Call the CPUID instruction with the EAX register set to 7 and ECX set to 0.
    // This will get the CPUID information for the current CPU.
    mov $7, %eax
    mov $0, %ecx
    cpuid

    // The AVX2 feature flag is located in the EBX register at bit 5.
    bt $5, %ebx
    jc .avx2_supported

    // AVX2 is not supported.
    pop %rbx
    mov $0, %eax
    ret

.avx2_supported:
    pop %rbx
    mov $1, %eax
    ret

// Check if prefetchw is supported.
// Returns 1 if the prefetchw instruction is supported, 0 otherwise.
.global {[function_prefix]s}supports_prefetchw;
{[function_prefix]s}supports_prefetchw:
    // Save the EBX register.
    push %rbx

    // Call the CPUID instruction with the EAX register set to 0x80000001 and ECX set to 0.
    // This will get the CPUID information for the current CPU.
    mov $0x80000001, %eax
    mov $0, %ecx
    cpuid

    // The prefetchw feature flag is located in the ECX register at bit 8.
    bt $8, %ecx
    jc .prefetchw_supported

    // prefetchw is not supported.
    pop %rbx
    mov $0, %eax
    ret

.prefetchw_supported:
    pop %rbx
    mov $1, %eax
    ret
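The two routines above are thin CPUID wrappers: load a leaf (and subleaf) into EAX/ECX, execute `cpuid`, and test one feature bit. The Zig wrapper that follows explains why inline assembly was avoided (EBX is the PIC register on some targets, which makes it awkward in position-independent code); for comparison, a generic inline-assembly helper would look roughly like this — a sketch, not code from the repo:

const CpuidRegs = struct { eax: u32, ebx: u32, ecx: u32, edx: u32 };

fn cpuidLeaf(leaf: u32, subleaf: u32) CpuidRegs {
    var eax: u32 = undefined;
    var ebx: u32 = undefined;
    var ecx: u32 = undefined;
    var edx: u32 = undefined;
    asm volatile ("cpuid"
        : [eax] "={eax}" (eax),
          [ebx] "={ebx}" (ebx),
          [ecx] "={ecx}" (ecx),
          [edx] "={edx}" (edx),
        : [leaf] "{eax}" (leaf),
          [subleaf] "{ecx}" (subleaf),
    );
    return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
}

fn supportsAvx2() bool {
    // Leaf 7, subleaf 0: AVX2 is EBX bit 5 (prefetchw is leaf 0x80000001, ECX bit 8).
    return (cpuidLeaf(7, 0).ebx >> 5) & 1 == 1;
}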

@ -1,18 +0,0 @@
const std = @import("std");
const builtin = @import("builtin");
const arch = builtin.cpu.arch;
const function_prefix = @import("assembly_util.zig").function_prefix;

// I couldn't manage to define this in a PIE-friendly way with inline assembly.
// Instead, I am defining these as global assembly functions.
comptime {
    switch (arch) {
        .x86_64 => {
            asm (std.fmt.comptimePrint(@embedFile("cpuid.S"), .{ .function_prefix = function_prefix }));
        },
        else => unreachable,
    }
}

pub extern fn supports_avx2() bool;
pub extern fn supports_prefetchw() bool;
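The runtime probe above is only needed when the binary must run on unknown CPUs. When the target CPU is fixed at build time, the check can be resolved at compile time instead; a sketch using the standard library (assuming Zig's `std.Target.x86.featureSetHas`, which this file does not use):

const std = @import("std");
const builtin = @import("builtin");

// Resolved at compile time from the build target's CPU feature set.
pub const avx2_known_at_comptime = std.Target.x86.featureSetHas(builtin.cpu.features, .avx2);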

@ -1,2 +0,0 @@
pub const memcpy_prefetchw = @import("folly/memcpy.zig").__folly_memcpy_prefetchw;
pub const memcpy_prefetcht0 = @import("folly/memcpy.zig").__folly_memcpy_prefetcht0;
@ -1,437 +0,0 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * __folly_memcpy: An optimized memcpy implementation that uses prefetch and
 * AVX2 instructions.
 *
 * This implementation of memcpy acts as a memmove: while overlapping copies
 * are undefined in memcpy, in some implementations they're the same function and
 * legacy programs rely on this behavior.
 *
 * This implementation uses prefetch to avoid dtlb misses. This can
 * substantially reduce dtlb store misses in cases where the destination
 * location is absent from L1 cache and where the copy size is small enough
 * that the hardware prefetcher doesn't have a large impact.
 *
 * The number of branches is limited by the use of overlapping loads & stores.
 * This helps with copies where the source and destination cache lines are already
 * present in L1 because there are fewer instructions to execute and fewer
 * branches to potentially mispredict.
 *   e.g. to copy the last 4 <= n <= 7 bytes: copy the first & last 4 bytes (overlapped):
 *       movl        (%rsi), %r8d
 *       movl        -4(%rsi,%rdx), %r9d
 *       movl        %r8d, (%rdi)
 *       movl        %r9d, -4(%rdi,%rdx)
 *
 *
 * For sizes up to 256 all source data is first read into registers and then written:
 * - n <= 16: overlapping movs
 * - n <= 32: overlapping unaligned 16-byte SSE XMM load/stores
 * - n <= 256: overlapping unaligned 32-byte AVX YMM load/stores
 *
 * Large copies (> 256 bytes) use unaligned loads + aligned stores.
 * This is observed to always be faster than rep movsb, so the rep movsb
 * instruction is not used.
 * - The head & tail may be unaligned => they're always written using unaligned stores.
 *
 * If the copy size is humongous (> 32 KiB) and the source and destination are both
 * aligned, this memcpy will use non-temporal operations (AVX2). This can have
 * a substantial speedup for copies where data is absent from L1, but it
 * is significantly slower if the source and destination data were already
 * in L1. The use of non-temporal operations also has the effect that after
 * the copy is complete, the data will be moved out of L1, even if the data was
 * present before the copy started.
 *
 * For n > 256 and overlapping src & dst buffers (memmove):
 * - use unaligned loads + aligned stores, but not non-temporal stores
 * - for dst < src forward copy in 128 byte batches:
 *   - unaligned load the first 32 bytes & last 4 x 32 bytes
 *   - forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
 *   - unaligned store the first 32 bytes & last 4 x 32 bytes
 * - for dst > src backward copy in 128 byte batches:
 *   - unaligned load the first 4 x 32 bytes & last 32 bytes
 *   - backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
 *   - unaligned store the first 4 x 32 bytes & last 32 bytes
 *
 * @author Logan Evans <lpe@fb.com>
 */
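The overlapping load/store trick from the header comment translates directly to portable code. A minimal Zig sketch of the 4-to-7-byte case (a hypothetical helper, not part of the deleted file):

fn copyFourToSeven(dest: [*]u8, src: [*]const u8, n: usize) void {
    std.debug.assert(n >= 4 and n <= 7);
    var head: [4]u8 = undefined;
    var tail: [4]u8 = undefined;
    // Read everything first: the two 4-byte windows overlap when n < 8,
    // covering all n bytes with exactly two loads and two stores, no branches.
    @memcpy(&head, src[0..4]);
    @memcpy(&tail, src[n - 4 .. n]);
    @memcpy(dest[0..4], &head);
    @memcpy(dest[n - 4 .. n], &tail);
}

Because all reads happen before any writes, this also behaves as a memmove for overlapping buffers — exactly the property the comment claims for all copies up to 256 bytes.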

// .type {[function_prefix]s}__folly_memcpy_short_{[prefetch]s}, @function not supported by windows
{[function_prefix]s}__folly_memcpy_short_{[prefetch]s}:
    .cfi_startproc

.L_GE1_LE7_{[prefetch]s}:
    cmp $1, %rdx
    je .L_EQ1_{[prefetch]s}

    cmp $4, %rdx
    jae .L_GE4_LE7_{[prefetch]s}

.L_GE2_LE3_{[prefetch]s}:
    movw (%rsi), %r8w
    movw -2(%rsi,%rdx), %r9w
    movw %r8w, (%rdi)
    movw %r9w, -2(%rdi,%rdx)
    ret

    .balign 2
.L_EQ1_{[prefetch]s}:
    movb (%rsi), %r8b
    movb %r8b, (%rdi)
    ret

    // Aligning the target of a jump to an even address has a measurable
    // speedup in microbenchmarks.
    .balign 2
.L_GE4_LE7_{[prefetch]s}:
    movl (%rsi), %r8d
    movl -4(%rsi,%rdx), %r9d
    movl %r8d, (%rdi)
    movl %r9d, -4(%rdi,%rdx)
    ret

    .cfi_endproc
    // .size {[function_prefix]s}__folly_memcpy_short_{[prefetch]s}, .-{[function_prefix]s}__folly_memcpy_short_{[prefetch]s} not supported by windows

// memcpy is an alternative entrypoint into the function named __folly_memcpy.
// The compiler is able to call memcpy since the name is global while
// stacktraces will show __folly_memcpy since that is the name of the function.
// This is intended to aid in debugging by making it obvious which version of
// memcpy is being used.
    .balign 64
    .globl {[function_prefix]s}__folly_memcpy_{[prefetch]s}
    // .type {[function_prefix]s}__folly_memcpy_{[prefetch]s}, @function not supported by windows

{[function_prefix]s}__folly_memcpy_{[prefetch]s}:
    .cfi_startproc

    mov %rdi, %rax // return: $rdi

    test %rdx, %rdx
    je .L_EQ0_{[prefetch]s}

    {[prefetch]s} (%rdi)
    {[prefetch]s} -1(%rdi,%rdx)

    cmp $8, %rdx
    jb .L_GE1_LE7_{[prefetch]s}

.L_GE8_{[prefetch]s}:
    cmp $32, %rdx
    ja .L_GE33_{[prefetch]s}

.L_GE8_LE32_{[prefetch]s}:
    cmp $16, %rdx
    ja .L_GE17_LE32_{[prefetch]s}

.L_GE8_LE16_{[prefetch]s}:
    mov (%rsi), %r8
    mov -8(%rsi,%rdx), %r9
    mov %r8, (%rdi)
    mov %r9, -8(%rdi,%rdx)
.L_EQ0_{[prefetch]s}:
    ret

    .balign 2
.L_GE17_LE32_{[prefetch]s}:
    movdqu (%rsi), %xmm0
    movdqu -16(%rsi,%rdx), %xmm1
    movdqu %xmm0, (%rdi)
    movdqu %xmm1, -16(%rdi,%rdx)
    ret

    .balign 2
.L_GE193_LE256_{[prefetch]s}:
    vmovdqu %ymm3, 96(%rdi)
    vmovdqu %ymm4, -128(%rdi,%rdx)

.L_GE129_LE192_{[prefetch]s}:
    vmovdqu %ymm2, 64(%rdi)
    vmovdqu %ymm5, -96(%rdi,%rdx)

.L_GE65_LE128_{[prefetch]s}:
    vmovdqu %ymm1, 32(%rdi)
    vmovdqu %ymm6, -64(%rdi,%rdx)

.L_GE33_LE64_{[prefetch]s}:
    vmovdqu %ymm0, (%rdi)
    vmovdqu %ymm7, -32(%rdi,%rdx)

    vzeroupper
    ret

    .balign 2
.L_GE33_{[prefetch]s}:
    vmovdqu (%rsi), %ymm0
    vmovdqu -32(%rsi,%rdx), %ymm7

    cmp $64, %rdx
    jbe .L_GE33_LE64_{[prefetch]s}

    {[prefetch]s} 64(%rdi)

    vmovdqu 32(%rsi), %ymm1
    vmovdqu -64(%rsi,%rdx), %ymm6

    cmp $128, %rdx
    jbe .L_GE65_LE128_{[prefetch]s}

    {[prefetch]s} 128(%rdi)

    vmovdqu 64(%rsi), %ymm2
    vmovdqu -96(%rsi,%rdx), %ymm5

    cmp $192, %rdx
    jbe .L_GE129_LE192_{[prefetch]s}

    {[prefetch]s} 192(%rdi)

    vmovdqu 96(%rsi), %ymm3
    vmovdqu -128(%rsi,%rdx), %ymm4

    cmp $256, %rdx
    jbe .L_GE193_LE256_{[prefetch]s}

.L_GE257_{[prefetch]s}:
    {[prefetch]s} 256(%rdi)

    // Check if there is an overlap. If there is an overlap then the caller
    // has a bug since this is undefined behavior. However, for legacy
    // reasons this behavior is expected by some callers.
    //
    // All copies through 256 bytes will operate as a memmove since for
    // those sizes all reads are performed before any writes.
    //
    // This check uses the idea that there is an overlap if
    // (%rdi < (%rsi + %rdx)) && (%rsi < (%rdi + %rdx)),
    // or equivalently, there is no overlap if
    // ((%rsi + %rdx) <= %rdi) || ((%rdi + %rdx) <= %rsi).
    //
    // %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many
    // bytes remain to be copied.

    // (%rsi + %rdx <= %rdi) => no overlap
    lea (%rsi,%rdx), %r9
    cmp %rdi, %r9
    jbe .L_NO_OVERLAP_{[prefetch]s}

    // (%rdi + %rdx <= %rsi) => no overlap
    lea (%rdi,%rdx), %r8
    cmp %rsi, %r8
    // If no info is available in the branch predictor's cache, Intel CPUs assume
    // forward jumps are not taken. Use a forward jump as overlapping buffers
    // are unlikely.
    ja .L_OVERLAP_{[prefetch]s}

    .balign 2
.L_NO_OVERLAP_{[prefetch]s}:
    vmovdqu %ymm0, (%rdi)
    vmovdqu %ymm1, 32(%rdi)
    vmovdqu %ymm2, 64(%rdi)
    vmovdqu %ymm3, 96(%rdi)

    // Align %rdi to a 32 byte boundary.
    // %rcx = 128 - (%rdi & 31)
    mov $128, %rcx
    and $31, %rdi
    sub %rdi, %rcx

    lea (%rsi,%rcx), %rsi
    lea (%rax,%rcx), %rdi
    sub %rcx, %rdx

    // %r8 is the end condition for the loop.
    lea -128(%rsi,%rdx), %r8

    // This threshold is half of L1 cache on a Skylake machine, which means that
    // potentially all of L1 will be populated by this copy once it is executed
    // (dst and src are cached for temporal copies).
    // NON_TEMPORAL_STORE_THRESHOLD = $32768
    // cmp NON_TEMPORAL_STORE_THRESHOLD, %rdx
    cmp $32768, %rdx
    jae .L_NON_TEMPORAL_LOOP_{[prefetch]s}

    .balign 2
.L_ALIGNED_DST_LOOP_{[prefetch]s}:
    {[prefetch]s} 128(%rdi)
    {[prefetch]s} 192(%rdi)

    vmovdqu (%rsi), %ymm0
    vmovdqu 32(%rsi), %ymm1
    vmovdqu 64(%rsi), %ymm2
    vmovdqu 96(%rsi), %ymm3
    add $128, %rsi

    vmovdqa %ymm0, (%rdi)
    vmovdqa %ymm1, 32(%rdi)
    vmovdqa %ymm2, 64(%rdi)
    vmovdqa %ymm3, 96(%rdi)
    add $128, %rdi

    cmp %r8, %rsi
    jb .L_ALIGNED_DST_LOOP_{[prefetch]s}

.L_ALIGNED_DST_LOOP_END_{[prefetch]s}:
    sub %rsi, %r9
    mov %r9, %rdx

    vmovdqu %ymm4, -128(%rdi,%rdx)
    vmovdqu %ymm5, -96(%rdi,%rdx)
    vmovdqu %ymm6, -64(%rdi,%rdx)
    vmovdqu %ymm7, -32(%rdi,%rdx)

    vzeroupper
    ret

    .balign 2
.L_NON_TEMPORAL_LOOP_{[prefetch]s}:
    testb $31, %sil
    jne .L_ALIGNED_DST_LOOP_{[prefetch]s}
    // This is prefetching the source data unlike ALIGNED_DST_LOOP which
    // prefetches the destination data. This choice is again informed by
    // benchmarks. With a non-temporal store the entirety of the cache line
    // is being written so the previous data can be discarded without being
    // fetched.
    prefetchnta 128(%rsi)
    prefetchnta 196(%rsi)

    vmovntdqa (%rsi), %ymm0
    vmovntdqa 32(%rsi), %ymm1
    vmovntdqa 64(%rsi), %ymm2
    vmovntdqa 96(%rsi), %ymm3
    add $128, %rsi

    vmovntdq %ymm0, (%rdi)
    vmovntdq %ymm1, 32(%rdi)
    vmovntdq %ymm2, 64(%rdi)
    vmovntdq %ymm3, 96(%rdi)
    add $128, %rdi

    cmp %r8, %rsi
    jb .L_NON_TEMPORAL_LOOP_{[prefetch]s}

    sfence
    jmp .L_ALIGNED_DST_LOOP_END_{[prefetch]s}


.L_OVERLAP_{[prefetch]s}:
    .balign 2
    cmp %rdi, %rsi
    jb .L_OVERLAP_BWD_{[prefetch]s} // %rsi < %rdi => backward-copy
    je .L_RET_{[prefetch]s} // %rsi == %rdi => return, nothing to copy

    // Source & destination buffers overlap. Forward copy.

    vmovdqu (%rsi), %ymm8

    // Align %rdi to a 32 byte boundary.
    // %rcx = 32 - (%rdi & 31)
    mov $32, %rcx
    and $31, %rdi
    sub %rdi, %rcx

    lea (%rsi,%rcx), %rsi
    lea (%rax,%rcx), %rdi
    sub %rcx, %rdx

    // %r8 is the end condition for the loop.
    lea -128(%rsi,%rdx), %r8


.L_OVERLAP_FWD_ALIGNED_DST_LOOP_{[prefetch]s}:
    {[prefetch]s} 128(%rdi)
    {[prefetch]s} 192(%rdi)

    vmovdqu (%rsi), %ymm0
    vmovdqu 32(%rsi), %ymm1
    vmovdqu 64(%rsi), %ymm2
    vmovdqu 96(%rsi), %ymm3
    add $128, %rsi

    vmovdqa %ymm0, (%rdi)
    vmovdqa %ymm1, 32(%rdi)
    vmovdqa %ymm2, 64(%rdi)
    vmovdqa %ymm3, 96(%rdi)
    add $128, %rdi

    cmp %r8, %rsi
    jb .L_OVERLAP_FWD_ALIGNED_DST_LOOP_{[prefetch]s}

    sub %rsi, %r9
    mov %r9, %rdx

    vmovdqu %ymm4, -128(%rdi,%rdx)
    vmovdqu %ymm5, -96(%rdi,%rdx)
    vmovdqu %ymm6, -64(%rdi,%rdx)
    vmovdqu %ymm7, -32(%rdi,%rdx)
    vmovdqu %ymm8, (%rax) // %rax == the original (unaligned) %rdi

    vzeroupper

.L_RET_{[prefetch]s}:
    ret

.L_OVERLAP_BWD_{[prefetch]s}:
    // Save last 32 bytes.
    vmovdqu -32(%rsi, %rdx), %ymm8
    lea -32(%rdi, %rdx), %r9


    // %r8 is the end condition for the loop.
    lea 128(%rsi), %r8

    // Align %rdi + %rdx (destination end) to a 32 byte boundary.
    // %rcx = (%rdi + %rdx - 32) & 31
    mov %r9, %rcx
    and $31, %rcx
    // Set %rsi & %rdi to the end of the 32 byte aligned range.
    sub %rcx, %rdx
    add %rdx, %rsi
    add %rdx, %rdi


.L_OVERLAP_BWD_ALIGNED_DST_LOOP_{[prefetch]s}:
    {[prefetch]s} -128(%rdi)
    {[prefetch]s} -192(%rdi)

    vmovdqu -32(%rsi), %ymm4
    vmovdqu -64(%rsi), %ymm5
    vmovdqu -96(%rsi), %ymm6
    vmovdqu -128(%rsi), %ymm7
    sub $128, %rsi

    vmovdqa %ymm4, -32(%rdi)
    vmovdqa %ymm5, -64(%rdi)
    vmovdqa %ymm6, -96(%rdi)
    vmovdqa %ymm7, -128(%rdi)
    sub $128, %rdi

    cmp %r8, %rsi
    ja .L_OVERLAP_BWD_ALIGNED_DST_LOOP_{[prefetch]s}

    vmovdqu %ymm0, (%rax) // %rax == the original (unaligned) %rdi
    vmovdqu %ymm1, 32(%rax)
    vmovdqu %ymm2, 64(%rax)
    vmovdqu %ymm3, 96(%rax)
    vmovdqu %ymm8, (%r9)

    vzeroupper
    ret

    .cfi_endproc
    // .size {[function_prefix]s}__folly_memcpy_{[prefetch]s}, .-{[function_prefix]s}__folly_memcpy_{[prefetch]s} not supported by windows
@ -1,18 +0,0 @@
const std = @import("std");
const builtin = @import("builtin");
const arch = builtin.cpu.arch;
const function_prefix = @import("../assembly_util.zig").function_prefix;

comptime {
    switch (arch) {
        .x86_64 => {
            inline for ([_][]const u8{ "prefetchw", "prefetcht0" }) |prefetch| {
                asm (std.fmt.comptimePrint(@embedFile("memcpy-x86_64.S"), .{ .prefetch = prefetch, .function_prefix = function_prefix }));
            }
        },
        else => unreachable,
    }
}

pub extern fn __folly_memcpy_prefetchw(noalias dest: [*]u8, noalias src: [*]const u8, len: usize) callconv(.SysV) [*]u8;
pub extern fn __folly_memcpy_prefetcht0(noalias dest: [*]u8, noalias src: [*]const u8, len: usize) callconv(.SysV) [*]u8;
@ -1 +0,0 @@
pub const memcpy = @import("musl/memcpy.zig").memcpy;
@ -1,193 +0,0 @@
musl as a whole is licensed under the following standard MIT license:

----------------------------------------------------------------------
Copyright © 2005-2020 Rich Felker, et al.

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
----------------------------------------------------------------------

Authors/contributors include:

A. Wilcox
Ada Worcester
Alex Dowad
Alex Suykov
Alexander Monakov
Andre McCurdy
Andrew Kelley
Anthony G. Basile
Aric Belsito
Arvid Picciani
Bartosz Brachaczek
Benjamin Peterson
Bobby Bingham
Boris Brezillon
Brent Cook
Chris Spiegel
Clément Vasseur
Daniel Micay
Daniel Sabogal
Daurnimator
David Carlier
David Edelsohn
Denys Vlasenko
Dmitry Ivanov
Dmitry V. Levin
Drew DeVault
Emil Renner Berthing
Fangrui Song
Felix Fietkau
Felix Janda
Gianluca Anzolin
Hauke Mehrtens
He X
Hiltjo Posthuma
Isaac Dunham
Jaydeep Patil
Jens Gustedt
Jeremy Huntwork
Jo-Philipp Wich
Joakim Sindholt
John Spencer
Julien Ramseier
Justin Cormack
Kaarle Ritvanen
Khem Raj
Kylie McClain
Leah Neukirchen
Luca Barbato
Luka Perkov
M Farkas-Dyck (Strake)
Mahesh Bodapati
Markus Wichmann
Masanori Ogino
Michael Clark
Michael Forney
Mikhail Kremnyov
Natanael Copa
Nicholas J. Kain
orc
Pascal Cuoq
Patrick Oppenlander
Petr Hosek
Petr Skocik
Pierre Carrier
Reini Urban
Rich Felker
Richard Pennington
Ryan Fairfax
Samuel Holland
Segev Finer
Shiz
sin
Solar Designer
Stefan Kristiansson
Stefan O'Rear
Szabolcs Nagy
Timo Teräs
Trutz Behn
Valentin Ochs
Will Dietz
William Haddon
William Pitcock

Portions of this software are derived from third-party works licensed
under terms compatible with the above MIT license:

The TRE regular expression implementation (src/regex/reg* and
src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed
under a 2-clause BSD license (license text in the source files). The
included version has been heavily modified by Rich Felker in 2012, in
the interests of size, simplicity, and namespace cleanliness.

Much of the math library code (src/math/* and src/complex/*) is
Copyright © 1993,2004 Sun Microsystems or
Copyright © 2003-2011 David Schultz or
Copyright © 2003-2009 Steven G. Kargl or
Copyright © 2003-2009 Bruce D. Evans or
Copyright © 2008 Stephen L. Moshier or
Copyright © 2017-2018 Arm Limited
and labelled as such in comments in the individual source files. All
have been licensed under extremely permissive terms.

The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008
The Android Open Source Project and is licensed under a two-clause BSD
license. It was taken from Bionic libc, used on Android.

The AArch64 memcpy and memset code (src/string/aarch64/*) are
Copyright © 1999-2019, Arm Limited.

The implementation of DES for crypt (src/crypt/crypt_des.c) is
Copyright © 1994 David Burren. It is licensed under a BSD license.

The implementation of blowfish crypt (src/crypt/crypt_blowfish.c) was
originally written by Solar Designer and placed into the public
domain. The code also comes with a fallback permissive license for use
in jurisdictions that may not recognize the public domain.

The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011
Valentin Ochs and is licensed under an MIT-style license.

The x86_64 port was written by Nicholas J. Kain and is licensed under
the standard MIT terms.

The mips and microblaze ports were originally written by Richard
Pennington for use in the ellcc project. The original code was adapted
by Rich Felker for build system and code conventions during upstream
integration. It is licensed under the standard MIT terms.

The mips64 port was contributed by Imagination Technologies and is
licensed under the standard MIT terms.

The powerpc port was also originally written by Richard Pennington,
and later supplemented and integrated by John Spencer. It is licensed
under the standard MIT terms.

All other files which have no copyright comments are original works
produced specifically for use as part of this library, written either
by Rich Felker, the main author of the library, or by one or more
contributors listed above. Details on authorship of individual files
can be found in the git version control history of the project. The
omission of copyright and license comments in each file is in the
interest of source tree size.

In addition, permission is hereby granted for all public header files
(include/* and arch/*/bits/*) and crt files intended to be linked into
applications (crt/*, ldso/dlstart.c, and arch/*/crt_arch.h) to omit
the copyright notice and permission notice otherwise required by the
license, and to use these files without any requirement of
attribution. These files include substantial contributions from:

Bobby Bingham
John Spencer
Nicholas J. Kain
Rich Felker
Richard Pennington
Stefan Kristiansson
Szabolcs Nagy

all of whom have explicitly granted such permission.

This file previously contained text expressing a belief that most of
the files covered by the above exception were sufficiently trivial not
to be subject to copyright, resulting in confusion over whether it
negated the permissions granted in the license. In the spirit of
permissive licensing, and of not having licensing issues being an
obstacle to adoption, that text has been removed.
@ -1,2 +0,0 @@
These files all come from [musl libc](https://musl.libc.org/).
Roc directly uses a few of them instead of depending on musl libc in full.
@ -1,30 +0,0 @@
.global {[function_prefix]s}musl_memcpy
// Windows does not support the type directive.
// .type {[function_prefix]s}musl_memcpy,@function
{[function_prefix]s}musl_memcpy:
    push %esi
    push %edi
    mov 12(%esp),%edi
    mov 16(%esp),%esi
    mov 20(%esp),%ecx
    mov %edi,%eax
    cmp $4,%ecx
    jc 1f
    test $3,%edi
    jz 1f
2:  movsb
    dec %ecx
    test $3,%edi
    jnz 2b
1:  mov %ecx,%edx
    shr $2,%ecx
    rep
    movsl
    and $3,%edx
    jz 1f
2:  movsb
    dec %edx
    jnz 2b
1:  pop %edi
    pop %esi
    ret
@ -1,23 +0,0 @@
.global {[function_prefix]s}musl_memcpy
// Windows does not support the type directive.
// .type {[function_prefix]s}musl_memcpy,@function
{[function_prefix]s}musl_memcpy:
    mov %rdi,%rax
    cmp $8,%rdx
    jc 1f
    test $7,%edi
    jz 1f
2:  movsb
    dec %rdx
    test $7,%edi
    jnz 2b
1:  mov %rdx,%rcx
    shr $3,%rcx
    rep
    movsq
    and $7,%edx
    jz 1f
2:  movsb
    dec %edx
    jnz 2b
1:  ret
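Read as pseudocode, the x86_64 routine above copies single bytes until the destination is 8-byte aligned, moves the bulk with `rep movsq`, and finishes byte by byte. A rough Zig equivalent (a sketch with a hypothetical name, not code from the repo):

fn muslMemcpySketch(noalias dest: [*]u8, noalias src: [*]const u8, len: usize) [*]u8 {
    var d = dest;
    var s = src;
    var n = len;
    // Byte-copy until the destination is 8-byte aligned (skipped when n < 8).
    while (n >= 8 and @intFromPtr(d) % 8 != 0) : (n -= 1) {
        d[0] = s[0];
        d += 1;
        s += 1;
    }
    // Bulk copy in 8-byte words, like `rep movsq`.
    var i: usize = 0;
    while (i + 8 <= n) : (i += 8) {
        @memcpy(d[i .. i + 8], s[i .. i + 8]);
    }
    // Copy the remaining 0-7 tail bytes.
    while (i < n) : (i += 1) d[i] = s[i];
    return dest;
}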
@ -1,223 +0,0 @@
const std = @import("std");
const builtin = @import("builtin");
const arch = builtin.cpu.arch;
const function_prefix = @import("../assembly_util.zig").function_prefix;

comptime {
    switch (arch) {
        .x86_64 => {
            asm (std.fmt.comptimePrint(@embedFile("memcpy-x86_64.S"), .{ .function_prefix = function_prefix }));
        },
        .x86 => {
            asm (std.fmt.comptimePrint(@embedFile("memcpy-x86.S"), .{ .function_prefix = function_prefix }));
        },
        // TODO: add assembly implementations for other platforms.
        else => {},
    }
}

pub const memcpy =
    switch (builtin.os.tag) {
        .windows => fallback_memcpy,
        else => switch (arch) {
            .x86_64, .x86 => musl_memcpy,
            else => fallback_memcpy,
        },
    };

pub extern fn musl_memcpy(noalias dest: [*]u8, noalias src: [*]const u8, len: usize) callconv(.C) [*]u8;

// Note: this is written to only support little-endian targets.
// To support big-endian, `<<` and `>>` would need to be swapped.
pub fn fallback_memcpy(noalias dest: [*]u8, noalias src: [*]const u8, len: usize) callconv(.C) [*]u8 {
    var d = dest;
    var s = src;
    var n = len;
    switch (@min(n, @intFromPtr(s) % 4)) {
        1 => {
            d[0] = s[0];
            d += 1;
            s += 1;
            n -= 1;
        },
        2 => {
            d[0] = s[0];
            d[1] = s[1];
            d += 2;
            s += 2;
            n -= 2;
        },
        3 => {
            d[0] = s[0];
            d[1] = s[1];
            d[2] = s[2];
            d += 3;
            s += 3;
            n -= 3;
        },
        else => {},
    }

    if (@intFromPtr(d) % 4 == 0) {
        var d4 = @as([*]align(4) u8, @alignCast(d));
        var s4 = @as([*]align(4) const u8, @alignCast(s));
        while (n >= 16) : (n -= 16) {
            var d_u32 = @as([*]u32, @ptrCast(d4));
            var s_u32 = @as([*]const u32, @ptrCast(s4));
            d_u32[0] = s_u32[0];
            d_u32[1] = s_u32[1];
            d_u32[2] = s_u32[2];
            d_u32[3] = s_u32[3];

            d4 += 16;
            s4 += 16;
        }
        if (n & 8 != 0) {
            var d_u32 = @as([*]u32, @ptrCast(d4));
            var s_u32 = @as([*]const u32, @ptrCast(s4));
            d_u32[0] = s_u32[0];
            d_u32[1] = s_u32[1];

            d4 += 8;
            s4 += 8;
        }
        if (n & 4 != 0) {
            var d_u32 = @as([*]u32, @ptrCast(d4));
            var s_u32 = @as([*]const u32, @ptrCast(s4));
            d_u32[0] = s_u32[0];

            d4 += 4;
            s4 += 4;
        }
        d = d4;
        s = s4;
        if (n & 2 != 0) {
            d[0] = s[0];
            d += 1;
            s += 1;
            d[0] = s[0];
            d += 1;
            s += 1;
        }
        if (n & 1 != 0) {
            d[0] = s[0];
        }
        return dest;
    }
    if (n >= 32) {
        switch (@intFromPtr(d) % 4) {
            1 => {
                var w = @as([*]const u32, @ptrCast(@alignCast(s)))[0];
                d[0] = s[0];
                d += 1;
                s += 1;
                d[0] = s[0];
                d += 1;
                s += 1;
                d[0] = s[0];
                d += 1;
                s += 1;
                n -= 3;
                while (n >= 17) : (n -= 16) {
                    var d_u32 = @as([*]u32, @ptrCast(@alignCast(d)));
                    var s_u32 = @as([*]const u32, @ptrCast(@alignCast(s + 1)));
                    var x = s_u32[0];
                    d_u32[0] = (w >> 24) | (x << 8);
                    w = s_u32[1];
                    d_u32[1] = (x >> 24) | (w << 8);
                    x = s_u32[2];
                    d_u32[2] = (w >> 24) | (x << 8);
                    w = s_u32[3];
                    d_u32[3] = (x >> 24) | (w << 8);

                    d += 16;
                    s += 16;
                }
            },
            2 => {
                var w = @as([*]const u32, @ptrCast(@alignCast(s)))[0];
                d[0] = s[0];
                d += 1;
                s += 1;
                d[0] = s[0];
                d += 1;
                s += 1;
                n -= 2;
                while (n >= 18) : (n -= 16) {
                    var d_u32 = @as([*]u32, @ptrCast(@alignCast(d)));
                    var s_u32 = @as([*]const u32, @ptrCast(@alignCast(s + 2)));
                    var x = s_u32[0];
                    d_u32[0] = (w >> 16) | (x << 16);
                    w = s_u32[1];
                    d_u32[1] = (x >> 16) | (w << 16);
                    x = s_u32[2];
                    d_u32[2] = (w >> 16) | (x << 16);
                    w = s_u32[3];
                    d_u32[3] = (x >> 16) | (w << 16);

                    d += 16;
                    s += 16;
                }
            },
            3 => {
                var w = @as([*]const u32, @ptrCast(@alignCast(s)))[0];
                d[0] = s[0];
                d += 1;
                s += 1;
                n -= 1;
                while (n >= 19) : (n -= 16) {
                    var d_u32 = @as([*]u32, @ptrCast(@alignCast(d)));
                    var s_u32 = @as([*]const u32, @ptrCast(@alignCast(s + 3)));
                    var x = s_u32[0];
                    d_u32[0] = (w >> 8) | (x << 24);
                    w = s_u32[1];
                    d_u32[1] = (x >> 8) | (w << 24);
                    x = s_u32[2];
                    d_u32[2] = (w >> 8) | (x << 24);
                    w = s_u32[3];
                    d_u32[3] = (x >> 8) | (w << 24);

                    d += 16;
                    s += 16;
                }
            },
            else => unreachable,
        }
    }
    if (n & 16 != 0) {
        comptime var i = 0;
        inline while (i < 16) : (i += 1) {
            d[0] = s[0];
            d += 1;
            s += 1;
        }
    }
    if (n & 8 != 0) {
        comptime var i = 0;
        inline while (i < 8) : (i += 1) {
            d[0] = s[0];
            d += 1;
            s += 1;
        }
    }
    if (n & 4 != 0) {
        comptime var i = 0;
        inline while (i < 4) : (i += 1) {
            d[0] = s[0];
            d += 1;
            s += 1;
        }
    }
    if (n & 2 != 0) {
        d[0] = s[0];
        d += 1;
        s += 1;
        d[0] = s[0];
        d += 1;
        s += 1;
    }
    if (n & 1 != 0) {
        d[0] = s[0];
    }
    return dest;
}
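A small exercise of `fallback_memcpy` on deliberately misaligned pointers — a test sketch, not part of the deleted file:

test "fallback_memcpy handles misaligned source and destination" {
    var src: [64]u8 = undefined;
    for (&src, 0..) |*byte, i| byte.* = @intCast(i);
    var dst = [_]u8{0} ** 64;
    // Offsets 1 and 3 exercise the head/tail handling and, depending on the
    // arrays' addresses, the shifted word-at-a-time path described above.
    _ = fallback_memcpy(@as([*]u8, &dst) + 1, @as([*]const u8, &src) + 3, 40);
    try std.testing.expectEqualSlices(u8, src[3..43], dst[1..41]);
}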

@ -6,11 +6,6 @@ const expect = @import("expect.zig");
const panic_utils = @import("panic.zig");
const dbg_utils = @import("dbg.zig");

// comptime {
//     _ = @import("compiler_rt.zig");
//     _ = @import("libc.zig");
// }

const ROC_BUILTINS = "roc_builtins";
const NUM = "num";
const STR = "str";