mirror of
https://github.com/roc-lang/roc.git
synced 2025-08-02 19:32:17 +00:00
Write newlines after reading the file
This makes the prefetches much more effective, at the cost of one copy_nonoverlapping
This commit is contained in:
parent
061d2e5df6
commit
af863f54b9
1 changed files with 75 additions and 73 deletions
|
@ -12,6 +12,7 @@
|
||||||
use bumpalo::{self, Bump};
|
use bumpalo::{self, Bump};
|
||||||
use core::{
|
use core::{
|
||||||
alloc::Layout,
|
alloc::Layout,
|
||||||
|
mem::{align_of, MaybeUninit},
|
||||||
ptr::{self, NonNull},
|
ptr::{self, NonNull},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -82,7 +83,15 @@ impl<'a> Src64<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Safety: we got capacity by rounding up to the nearest 64B
|
// Safety: we got capacity by rounding up to the nearest 64B
|
||||||
let dest = unsafe { allocate_and_pad_with_newlines(arena, capacity)? }.as_ptr() as *mut u8;
|
let dest = unsafe { allocate_chunks(arena, capacity)? }.as_ptr() as *mut u8;
|
||||||
|
|
||||||
|
// Safety: `dest` has a length of `capacity`, which has been rounded up to a multiple of 64.
|
||||||
|
unsafe {
|
||||||
|
let trailing_newlines_needed = capacity - src_len;
|
||||||
|
|
||||||
|
// Start writing newlines right after the last of the bytes we got from the file.
|
||||||
|
write_newlines(dest.add(src_len), trailing_newlines_needed);
|
||||||
|
};
|
||||||
|
|
||||||
// Safety: we just allocated `dest` to have len >= src.len(), and they're both u8 arrays.
|
// Safety: we just allocated `dest` to have len >= src.len(), and they're both u8 arrays.
|
||||||
unsafe {
|
unsafe {
|
||||||
|
@ -128,7 +137,7 @@ impl<'a> Src64<'a> {
|
||||||
|
|
||||||
// Safety: round_up_to_nearest_u64 will give us a capacity that is
|
// Safety: round_up_to_nearest_u64 will give us a capacity that is
|
||||||
// at least 64, and also a multiple of 64.
|
// at least 64, and also a multiple of 64.
|
||||||
match unsafe { allocate_and_pad_with_newlines(arena, capacity) } {
|
match unsafe { allocate_chunks(arena, capacity) } {
|
||||||
Some(buf) => {
|
Some(buf) => {
|
||||||
// Read bytes equal to file_size into the arena allocation.
|
// Read bytes equal to file_size into the arena allocation.
|
||||||
//
|
//
|
||||||
|
@ -205,6 +214,46 @@ impl<'a> Src64<'a> {
|
||||||
return Err(FileErr::ReadErr);
|
return Err(FileErr::ReadErr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Before we write newlines to the last chunk, branchlessly prefetch the first four 64-byte chunks.
|
||||||
|
// We're about to have a cache miss due to loading the last chunk from main memory (DMA will have
|
||||||
|
// written it there without having gone through the CPU), and if we don't prefetch here, then we'll
|
||||||
|
// immediately get a second cache miss when we start traversing the loaded file. The prefetch means
|
||||||
|
// by the time we finish resolving the first cache miss on the last chunk, continuing with the first
|
||||||
|
// chunk(s) won't be a cache miss anymore because they'll already be in cache.
|
||||||
|
//
|
||||||
|
// We can do further prefetches in the actual tokenization loop.
|
||||||
|
{
|
||||||
|
// We know capacity >= 64, so this will never wrap.
|
||||||
|
let last_chunk_offset = capacity - 64;
|
||||||
|
|
||||||
|
// Prefetch the first 64-byte chunk.
|
||||||
|
prefetch_read(buf, 0);
|
||||||
|
|
||||||
|
// Prefetch the second 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
|
||||||
|
prefetch_read(buf, 64.min(last_chunk_offset));
|
||||||
|
|
||||||
|
// Prefetch the third 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
|
||||||
|
prefetch_read(buf, 128.min(last_chunk_offset));
|
||||||
|
|
||||||
|
// Prefetch the fourth 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
|
||||||
|
prefetch_read(buf, 192.min(last_chunk_offset));
|
||||||
|
|
||||||
|
// Further prefetching can happen in the tokenization loop. Now that we've prefetched the first four chunks,
|
||||||
|
// we should be able to prefetch the others in the loop with enough time before the tokenizer arrives there.
|
||||||
|
}
|
||||||
|
|
||||||
|
if capacity > file_size {
|
||||||
|
debug_assert!(capacity - file_size < 64);
|
||||||
|
debug_assert!(capacity - file_size > 0);
|
||||||
|
let trailing_newlines_needed = capacity - file_size;
|
||||||
|
|
||||||
|
// Safety: `buf` points to an allocation of `capacity` bytes, which has been rounded up to a multiple of 64.
|
||||||
|
unsafe {
|
||||||
|
// Start writing newlines right after the last of the bytes we got from the file.
|
||||||
|
write_newlines(buf.as_ptr().add(file_size), trailing_newlines_needed);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// Safety: bytes_ptr came from an allocation of `capacity` bytes, it's had
|
// Safety: bytes_ptr came from an allocation of `capacity` bytes, it's had
|
||||||
// newlines filled at the end, and `file_size` bytes written over the rest.
|
// newlines filled at the end, and `file_size` bytes written over the rest.
|
||||||
let bytes =
|
let bytes =
|
||||||
|
@ -225,7 +274,7 @@ fn round_up_to_nearest_64(num: usize) -> usize {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Safety: capacity must be a multiple of 64, and must be at least 64.
|
/// Safety: capacity must be a multiple of 64, and must be at least 64.
|
||||||
unsafe fn allocate_and_pad_with_newlines(arena: &Bump, capacity: usize) -> Option<NonNull<u8>> {
|
unsafe fn allocate_chunks(arena: &Bump, capacity: usize) -> Option<NonNull<u8>> {
|
||||||
// Compare capacity here instead of size because this file limit is based on what we can record row and line
|
// Compare capacity here instead of size because this file limit is based on what we can record row and line
|
||||||
// numbers for, and those can theoretically overflow on the trailing newlines we may have added.
|
// numbers for, and those can theoretically overflow on the trailing newlines we may have added.
|
||||||
// This distinction will most likely come up in practice zero times ever, but it could come up in fuzzing.
|
// This distinction will most likely come up in practice zero times ever, but it could come up in fuzzing.
|
||||||
|
@ -242,85 +291,60 @@ unsafe fn allocate_and_pad_with_newlines(arena: &Bump, capacity: usize) -> Optio
|
||||||
let layout = unsafe { Layout::from_size_align_unchecked(capacity, Src64::BYTES_ALIGNMENT) };
|
let layout = unsafe { Layout::from_size_align_unchecked(capacity, Src64::BYTES_ALIGNMENT) };
|
||||||
|
|
||||||
// We have to use alloc_layout here because we have stricter alignment requirements than normal slices.
|
// We have to use alloc_layout here because we have stricter alignment requirements than normal slices.
|
||||||
let buf_ptr: NonNull<u8> = arena.alloc_layout(layout);
|
Some(arena.alloc_layout(layout))
|
||||||
|
|
||||||
// Branchlessly prefetch the first three 64-byte chunks, and the last chunk. This prevents a double cache miss:
|
|
||||||
// first a cache miss when we write the newlines to the end, and then a second cache miss when we start
|
|
||||||
// working on the beginning of the allocation. We can do further prefetches in the actual tokenization loop.
|
|
||||||
{
|
|
||||||
// We know capacity >= 64, so this will never wrap.
|
|
||||||
let last_chunk_offset = capacity - 64;
|
|
||||||
|
|
||||||
// Prefetch the last 64-byte chunk. (We do this one first because we'll be writing newlines to it first.)
|
|
||||||
prefetch_readwrite(buf_ptr, last_chunk_offset);
|
|
||||||
|
|
||||||
// Prefetch the first 64-byte chunk. The rest of these only need reading, since we never write to them.
|
|
||||||
prefetch_read(buf_ptr, 0);
|
|
||||||
|
|
||||||
// Prefetch the second 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
|
|
||||||
prefetch_read(buf_ptr, 64.min(last_chunk_offset));
|
|
||||||
|
|
||||||
// Prefetch the third 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
|
|
||||||
prefetch_read(buf_ptr, 128.min(last_chunk_offset));
|
|
||||||
|
|
||||||
// Further prefetching can happen in the tokenization loop. Now that we've prefetched the first 3 pages,
|
|
||||||
// we should be able to prefetch the others in the loop with enough time before the tokenizer arrives there.
|
|
||||||
}
|
|
||||||
|
|
||||||
// Safety: `buf_ptr` has a length of `capacity`, which has been rounded up to a multiple of 64.
|
|
||||||
unsafe { fill_last_64_bytes_with_newlines(buf_ptr, capacity) };
|
|
||||||
|
|
||||||
Some(buf_ptr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This is branchless so there can't be mispredictions. We know the buffer's length is a multiple of 64,
|
/// This is branchless so there can't be mispredictions.
|
||||||
/// so we can just always do four SIMD writes and call it a day. (Eight if we don't have SIMD.)
|
|
||||||
///
|
///
|
||||||
/// Safety: this pointer must have an alignment of at least 64,
|
/// Safety: this pointer must have an alignment of at least 64,
|
||||||
/// and the length must be both at least 64 and also a multiple of 64.
|
/// and the length must be both at least 64 and also a multiple of 64.
|
||||||
unsafe fn fill_last_64_bytes_with_newlines(ptr: NonNull<u8>, len: usize) {
|
unsafe fn write_newlines(dest: *mut u8, len: usize) {
|
||||||
debug_assert_eq!(
|
debug_assert!(len <= 64);
|
||||||
ptr.as_ptr() as usize % 16,
|
|
||||||
0,
|
|
||||||
"The pointer's alignment must be at least 16."
|
|
||||||
);
|
|
||||||
debug_assert_eq!(len % 64, 0, "The buffer's length must be a multiple of 64.");
|
|
||||||
debug_assert!(len >= 64, "The buffer's length must be at least 64.");
|
|
||||||
|
|
||||||
// Safety: this function's docs note that it must be given a slice with at least 64 bytes in it.
|
|
||||||
let last_64_bytes = ptr.as_ptr().add(len - 64);
|
|
||||||
|
|
||||||
#[cfg(target_feature = "sse2")]
|
#[cfg(target_feature = "sse2")]
|
||||||
{
|
{
|
||||||
use core::arch::x86_64::{__m128i, _mm_set1_epi8, _mm_storeu_si128};
|
use core::arch::x86_64::{__m128i, _mm_set1_epi8, _mm_storeu_si128};
|
||||||
|
|
||||||
|
let mut buf: MaybeUninit<[__m128i; 4]> = MaybeUninit::uninit();
|
||||||
let newline = _mm_set1_epi8(b'\n' as i8);
|
let newline = _mm_set1_epi8(b'\n' as i8);
|
||||||
let ptr = last_64_bytes as *mut __m128i;
|
let ptr = buf.as_mut_ptr() as *mut __m128i;
|
||||||
|
|
||||||
|
debug_assert_eq!(ptr.align_offset(align_of::<__m128i>()), 0);
|
||||||
|
|
||||||
_mm_storeu_si128(ptr.add(0), newline);
|
_mm_storeu_si128(ptr.add(0), newline);
|
||||||
_mm_storeu_si128(ptr.add(1), newline);
|
_mm_storeu_si128(ptr.add(1), newline);
|
||||||
_mm_storeu_si128(ptr.add(2), newline);
|
_mm_storeu_si128(ptr.add(2), newline);
|
||||||
_mm_storeu_si128(ptr.add(3), newline);
|
_mm_storeu_si128(ptr.add(3), newline);
|
||||||
|
|
||||||
|
core::ptr::copy_nonoverlapping(ptr as *const u8, dest.as_ptr(), len);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(target_feature = "neon")]
|
#[cfg(target_feature = "neon")]
|
||||||
{
|
{
|
||||||
use core::arch::aarch64::{vdupq_n_s8, vst1q_s8};
|
use core::arch::aarch64::{int8x16_t, vdupq_n_s8, vst1q_s8};
|
||||||
|
|
||||||
|
let mut buf: MaybeUninit<[int8x16_t; 4]> = MaybeUninit::uninit();
|
||||||
let newline = vdupq_n_s8(b'\n' as i8);
|
let newline = vdupq_n_s8(b'\n' as i8);
|
||||||
let ptr = last_64_bytes as *mut i8;
|
let ptr = buf.as_mut_ptr() as *mut i8;
|
||||||
|
|
||||||
|
debug_assert_eq!(ptr.align_offset(align_of::<int8x16_t>()), 0);
|
||||||
|
|
||||||
vst1q_s8(ptr.add(0), newline);
|
vst1q_s8(ptr.add(0), newline);
|
||||||
vst1q_s8(ptr.add(16), newline);
|
vst1q_s8(ptr.add(16), newline);
|
||||||
vst1q_s8(ptr.add(32), newline);
|
vst1q_s8(ptr.add(32), newline);
|
||||||
vst1q_s8(ptr.add(48), newline);
|
vst1q_s8(ptr.add(48), newline);
|
||||||
|
|
||||||
|
core::ptr::copy_nonoverlapping(ptr as *const u8, dest, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(not(any(target_feature = "sse2", target_feature = "neon")))]
|
#[cfg(not(any(target_feature = "sse2", target_feature = "neon")))]
|
||||||
{
|
{
|
||||||
// We don't have access to SIMD, so do eight 64-bit writes instead of four 128-bit writes.
|
// We don't have access to SIMD, so do eight 64-bit writes instead of four 128-bit writes.
|
||||||
|
let mut buf: MaybeUninit<[u64; 8]> = MaybeUninit::uninit();
|
||||||
let newline_repeated = (b'\n' as u64) * 0x0101010101010101;
|
let newline_repeated = (b'\n' as u64) * 0x0101010101010101;
|
||||||
let ptr = last_64_bytes as *mut u64;
|
let ptr = buf.as_mut_ptr() as *mut u64;
|
||||||
|
|
||||||
|
debug_assert_eq!(ptr.align_offset(align_of::<u64>()), 0);
|
||||||
|
|
||||||
*ptr.add(0) = newline_repeated;
|
*ptr.add(0) = newline_repeated;
|
||||||
*ptr.add(1) = newline_repeated;
|
*ptr.add(1) = newline_repeated;
|
||||||
|
@ -330,6 +354,8 @@ unsafe fn fill_last_64_bytes_with_newlines(ptr: NonNull<u8>, len: usize) {
|
||||||
*ptr.add(5) = newline_repeated;
|
*ptr.add(5) = newline_repeated;
|
||||||
*ptr.add(6) = newline_repeated;
|
*ptr.add(6) = newline_repeated;
|
||||||
*ptr.add(7) = newline_repeated;
|
*ptr.add(7) = newline_repeated;
|
||||||
|
|
||||||
|
core::ptr::copy_nonoverlapping(ptr as *const u8, dest.as_ptr(), len);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -357,30 +383,6 @@ fn prefetch_read<T>(non_null_ptr: NonNull<T>, offset: usize) {
|
||||||
// If we're not on x64 or aarch64, just do nothing!
|
// If we're not on x64 or aarch64, just do nothing!
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn prefetch_readwrite<T>(non_null_ptr: NonNull<T>, offset: usize) {
|
|
||||||
// Use inline asm until this is stabilized:
|
|
||||||
// https://doc.rust-lang.org/std/intrinsics/fn.prefetch_write_data.html
|
|
||||||
|
|
||||||
#[cfg(target_arch = "x86_64")]
|
|
||||||
unsafe {
|
|
||||||
core::arch::asm!(
|
|
||||||
"prefetchw [{}]",
|
|
||||||
in(reg) non_null_ptr.as_ptr().add(offset)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(target_arch = "aarch64")]
|
|
||||||
unsafe {
|
|
||||||
core::arch::asm!(
|
|
||||||
"prfm PSTL1KEEP, [{}]",
|
|
||||||
in(reg) non_null_ptr.as_ptr().add(offset)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we're not on x64 or aarch64, just do nothing!
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod src64_tests {
|
mod src64_tests {
|
||||||
use super::{FileErr, Src64, MAX_ROC_SOURCE_FILE_SIZE};
|
use super::{FileErr, Src64, MAX_ROC_SOURCE_FILE_SIZE};
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue