diff --git a/crates/compiler/parse/src/src64.rs b/crates/compiler/parse/src/src64.rs
index 8ebed17324..24eb7f1e93 100644
--- a/crates/compiler/parse/src/src64.rs
+++ b/crates/compiler/parse/src/src64.rs
@@ -12,6 +12,7 @@ use bumpalo::{self, Bump};
 
 use core::{
     alloc::Layout,
+    mem::{align_of, MaybeUninit},
     ptr::{self, NonNull},
 };
 
@@ -82,7 +83,15 @@ impl<'a> Src64<'a> {
         }
 
         // Safety: we got capacity by rounding up to the nearest 64B
-        let dest = unsafe { allocate_and_pad_with_newlines(arena, capacity)? }.as_ptr() as *mut u8;
+        let dest = unsafe { allocate_chunks(arena, capacity)? }.as_ptr() as *mut u8;
+
+        // Safety: `dest` has a length of `capacity`, which has been rounded up to a multiple of 64.
+        unsafe {
+            let trailing_newlines_needed = capacity - src_len;
+
+            // Start writing newlines right after the last of the bytes we got from the file.
+            write_newlines(dest.add(src_len), trailing_newlines_needed);
+        };
 
         // Safety: we just allocated `dest` to have len >= src.len(), and they're both u8 arrays.
         unsafe {
@@ -128,7 +137,7 @@ impl<'a> Src64<'a> {
 
         // Safety: round_up_to_nearest_u64 will give us a capacity that is
         // at least 64, and also a multiple of 64.
-        match unsafe { allocate_and_pad_with_newlines(arena, capacity) } {
+        match unsafe { allocate_chunks(arena, capacity) } {
             Some(buf) => {
                 // Read bytes equal to file_size into the arena allocation.
                 //
@@ -205,6 +214,46 @@ impl<'a> Src64<'a> {
                     return Err(FileErr::ReadErr);
                 }
 
+                // Before we write newlines to the last chunk, branchlessly prefetch the first four 64-byte chunks.
+                // We're about to have a cache miss due to loading the last chunk from main memory (DMA will have
+                // written it there without having gone through the CPU), and if we don't prefetch here, then we'll
+                // immediately get a second cache miss when we start traversing the loaded file. The prefetch means
+                // by the time we finish resolving the first cache miss on the last chunk, continuing with the first
+                // chunk(s) won't be a cache miss anymore because they'll already be in cache.
+                //
+                // We can do further prefetches in the actual tokenization loop.
+                {
+                    // We know capacity >= 64, so this will never wrap.
+                    let last_chunk_offset = capacity - 64;
+
+                    // Prefetch the first 64-byte chunk.
+                    prefetch_read(buf, 0);
+
+                    // Prefetch the second 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
+                    prefetch_read(buf, 64.min(last_chunk_offset));
+
+                    // Prefetch the third 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
+                    prefetch_read(buf, 128.min(last_chunk_offset));
+
+                    // Prefetch the fourth 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
+                    prefetch_read(buf, 192.min(last_chunk_offset));
+
+                    // Further prefetching can happen in the tokenization loop. Now that we've prefetched the first four
+                    // chunks, we should be able to prefetch the others in the loop with enough time before the tokenizer arrives there.
+                }
+
+                if capacity > file_size {
+                    debug_assert!(capacity - file_size < 64);
+                    debug_assert!(capacity - file_size > 0);
+                    let trailing_newlines_needed = capacity - file_size;
+
+                    // Safety: `buf` has a length of `capacity`, which has been rounded up to a multiple of 64.
+                    unsafe {
+                        // Start writing newlines right after the last of the bytes we got from the file.
+                        write_newlines(buf.as_ptr().add(file_size), trailing_newlines_needed);
+                    };
+                }
+
                 // Safety: bytes_ptr came from an allocation of `capacity` bytes, it's had
                 // newlines filled at the end, and `file_size` bytes written over the rest.
                 let bytes =
@@ -225,7 +274,7 @@ fn round_up_to_nearest_64(num: usize) -> usize {
 }
 
 /// Safety: capacity must be a multiple of 64, and must be at least 64.
-unsafe fn allocate_and_pad_with_newlines(arena: &Bump, capacity: usize) -> Option<NonNull<u8>> {
+unsafe fn allocate_chunks(arena: &Bump, capacity: usize) -> Option<NonNull<u8>> {
     // Compare capacity here instead of size because this file limit is based on what we can record row and line
     // numbers for, and those can theoretically oveflow on the trailing newlines we may have added.
     // This distinction will most likely come up in practice zero times ever, but it could come up in fuzzing.
@@ -242,85 +291,60 @@ unsafe fn allocate_and_pad_with_newlines(arena: &Bump, capacity: usize) -> Optio
     let layout = unsafe { Layout::from_size_align_unchecked(capacity, Src64::BYTES_ALIGNMENT) };
 
     // We have to use alloc_layout here because we have stricter alignment requirements than normal slices.
-    let buf_ptr: NonNull<u8> = arena.alloc_layout(layout);
-
-    // Branchlessly prefetch the first three 64-byte chunks, and the last chunk. This prevents a double cache miss:
-    // first a cache miss when we write the newlines to the end, and then a second cache miss when we start
-    // working on the beginning of the allocation. We can do further prefetches in the actual tokenization loop.
-    {
-        // We know capacity >= 64, so this will never wrap.
-        let last_chunk_offset = capacity - 64;
-
-        // Prefetch the last 64-byte chunk. (We do this one first because we'll be writing newlines to it first.)
-        prefetch_readwrite(buf_ptr, last_chunk_offset);
-
-        // Prefetch the first 64-byte chunk. The rest of these only need reading, since we never write to them.
-        prefetch_read(buf_ptr, 0);
-
-        // Prefetch the second 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
-        prefetch_read(buf_ptr, 64.min(last_chunk_offset));
-
-        // Prefetch the third 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
-        prefetch_read(buf_ptr, 128.min(last_chunk_offset));
-
-        // Further prefetching can happen in the tokenization loop. Now that we've prefetched the first 3 pages,
-        // we should be able to prefetch the others in the loop with enough time before the tokenizer arrives there.
-    }
-
-    // Safety: `buf_ptr` has a length of `capacity`, which has been rounded up to a multiple of 64.
-    unsafe { fill_last_64_bytes_with_newlines(buf_ptr, capacity) };
-
-    Some(buf_ptr)
+    Some(arena.alloc_layout(layout))
 }
 
-/// This is branchless so there can't be mispredictions. We know the buffer's length is a multiple of 64,
-/// so we can just always do four SIMD writes and call it a day. (Eight if we don't have SIMD.)
+/// This is branchless so there can't be mispredictions.
 ///
-/// Safety: this pointer must have an alignment of at least 64,
-/// and the length must be both at least 64 and also a multiple of 64.
-unsafe fn fill_last_64_bytes_with_newlines(ptr: NonNull<u8>, len: usize) {
-    debug_assert_eq!(
-        ptr.as_ptr() as usize % 16,
-        0,
-        "The pointer's alignment must be at least 16."
-    );
-    debug_assert_eq!(len % 64, 0, "The buffer's length must be a multiple of 64.");
-    debug_assert!(len >= 64, "The buffer's length must be at least 64.");
-
-    // Safety: this function's docs note that it must be given a slice with at least 64 bytes in it.
-    let last_64_bytes = ptr.as_ptr().add(len - 64);
+/// Safety: `dest` must point to at least `len` bytes of writable memory,
+/// and `len` must be no more than 64.
+unsafe fn write_newlines(dest: *mut u8, len: usize) {
+    debug_assert!(len <= 64);
 
     #[cfg(target_feature = "sse2")]
     {
         use core::arch::x86_64::{__m128i, _mm_set1_epi8, _mm_storeu_si128};
 
+        let mut buf: MaybeUninit<[__m128i; 4]> = MaybeUninit::uninit();
         let newline = _mm_set1_epi8(b'\n' as i8);
-        let ptr = last_64_bytes as *mut __m128i;
+        let ptr = buf.as_mut_ptr() as *mut __m128i;
+
+        debug_assert_eq!(ptr.align_offset(align_of::<__m128i>()), 0);
 
         _mm_storeu_si128(ptr.add(0), newline);
         _mm_storeu_si128(ptr.add(1), newline);
         _mm_storeu_si128(ptr.add(2), newline);
         _mm_storeu_si128(ptr.add(3), newline);
+
+        core::ptr::copy_nonoverlapping(ptr as *const u8, dest, len);
     }
 
     #[cfg(target_feature = "neon")]
     {
-        use core::arch::aarch64::{vdupq_n_s8, vst1q_s8};
+        use core::arch::aarch64::{int8x16_t, vdupq_n_s8, vst1q_s8};
 
+        let mut buf: MaybeUninit<[int8x16_t; 4]> = MaybeUninit::uninit();
         let newline = vdupq_n_s8(b'\n' as i8);
-        let ptr = last_64_bytes as *mut i8;
+        let ptr = buf.as_mut_ptr() as *mut i8;
+
+        debug_assert_eq!(ptr.align_offset(align_of::<int8x16_t>()), 0);
 
         vst1q_s8(ptr.add(0), newline);
         vst1q_s8(ptr.add(16), newline);
         vst1q_s8(ptr.add(32), newline);
         vst1q_s8(ptr.add(48), newline);
+
+        core::ptr::copy_nonoverlapping(ptr as *const u8, dest, len);
     }
 
     #[cfg(not(any(target_feature = "sse2", target_feature = "neon")))]
     {
         // We don't have access to SIMD, so do eight 64-bit writes instead of four 128-bit writes.
+        let mut buf: MaybeUninit<[u64; 8]> = MaybeUninit::uninit();
         let newline_repeated = (b'\n' as u64) * 0x0101010101010101;
-        let ptr = last_64_bytes as *mut u64;
+        let ptr = buf.as_mut_ptr() as *mut u64;
+
+        debug_assert_eq!(ptr.align_offset(align_of::<u64>()), 0);
 
         *ptr.add(0) = newline_repeated;
         *ptr.add(1) = newline_repeated;
@@ -330,6 +354,8 @@ unsafe fn fill_last_64_bytes_with_newlines(ptr: NonNull<u8>, len: usize) {
         *ptr.add(5) = newline_repeated;
         *ptr.add(6) = newline_repeated;
         *ptr.add(7) = newline_repeated;
+
+        core::ptr::copy_nonoverlapping(ptr as *const u8, dest, len);
     }
 }
 
@@ -357,30 +383,6 @@ fn prefetch_read(non_null_ptr: NonNull<u8>, offset: usize) {
     // If we're not on x64 or aarch64, just do nothing!
 }
 
-#[inline(always)]
-fn prefetch_readwrite(non_null_ptr: NonNull<u8>, offset: usize) {
-    // Use inline asm until this is stabilized:
-    // https://doc.rust-lang.org/std/intrinsics/fn.prefetch_write_data.html
-
-    #[cfg(target_arch = "x86_64")]
-    unsafe {
-        core::arch::asm!(
-            "prefetchw [{}]",
-            in(reg) non_null_ptr.as_ptr().add(offset)
-        );
-    }
-
-    #[cfg(target_arch = "aarch64")]
-    unsafe {
-        core::arch::asm!(
-            "prfm PSTL1KEEP, [{}]",
-            in(reg) non_null_ptr.as_ptr().add(offset)
-        );
-    }
-
-    // If we're not on x64 or aarch64, just do nothing!
-}
-
 #[cfg(test)]
 mod src64_tests {
     use super::{FileErr, Src64, MAX_ROC_SOURCE_FILE_SIZE};
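
Reviewer note: the heart of this change is that `write_newlines` no longer assumes a full, 64-aligned final chunk. It builds the newline pattern in a scratch buffer and copies out only the `len <= 64` bytes the caller needs, which is what lets `from_file` skip the write entirely when the file size is already a multiple of 64. Here is a minimal portable sketch of that technique with the SIMD specializations left out; the function name is illustrative, not part of this patch:

```rust
/// Sketch only: write `len` newline bytes to `dest` without branching on `len`,
/// by filling a fixed 64-byte scratch buffer and then copying `len` bytes out.
///
/// Safety: `dest` must point to at least `len` bytes of writable memory,
/// and `len` must be no more than 64.
unsafe fn write_newlines_portable(dest: *mut u8, len: usize) {
    debug_assert!(len <= 64);

    // The optimizer typically lowers this constant fill to a few wide stores,
    // similar in spirit to the explicit SSE2/NEON stores in the real function.
    let scratch = [b'\n'; 64];

    // The copy length is data rather than a branch, so there is nothing to mispredict.
    core::ptr::copy_nonoverlapping(scratch.as_ptr(), dest, len);
}

fn main() {
    let mut chunk = [0u8; 64];
    // Pretend the first 5 bytes hold source code, then pad the remaining 59.
    unsafe { write_newlines_portable(chunk.as_mut_ptr().add(5), 59) };
    assert!(chunk[5..].iter().all(|&b| b == b'\n'));
}
```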
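Reviewer note: both call sites rely on the same arithmetic invariant: `capacity` comes from `round_up_to_nearest_64`, so the trailing-newline count always falls in `0..64`, exactly the range `write_newlines` accepts and the `debug_assert!`s in `from_file` spot-check. The rounding body below is the usual bit trick and is an assumption on my part, since this diff only shows the function's signature in a hunk header:

```rust
// Assumed implementation; only the signature appears in this diff.
fn round_up_to_nearest_64(num: usize) -> usize {
    (num + 63) & !63
}

fn main() {
    for len in [1usize, 63, 64, 65, 127, 4096] {
        let capacity = round_up_to_nearest_64(len);
        let trailing_newlines_needed = capacity - len;

        assert_eq!(capacity % 64, 0);
        // Mirrors `debug_assert!(capacity - file_size < 64)` in `from_file`;
        // the count is zero exactly when `len` is already a multiple of 64.
        assert!(trailing_newlines_needed < 64);
    }
}
```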