// Loads Roc source files (from strings or from files) into a structure which is
// guaranteed to have the following properties, all of which the SIMD parser requires:
// - 16B alignment
// - byte length is a multiple of 64
// - if the source bytes were not a multiple of 64, the extra space is filled with trailing newlines
//
// (Trailing newlines are the filler of choice because they are irrelevant to the parser.)
//
// It does this as efficiently as possible by using branchless SIMD to fill padding bytes,
// and reading the contents of the file directly into an arena in as few syscalls as possible.

use bumpalo::{self, Bump};
use core::{
    alloc::Layout,
    mem::{align_of, MaybeUninit},
    ptr::{self, NonNull},
};

#[cfg(not(test))]
/// Upper bound, in bytes, on the size of a Roc source buffer.
///
/// We store both line and column numbers as u16s, so the largest possible file you could open
/// would be every line having the longest possible column length, or u16::MAX * u16::MAX.
/// That product is 4_294_836_225, which is slightly below `u32::MAX`.
///
/// Note that `allocate_chunks` compares this limit against the *padded* capacity
/// (the size rounded up to a multiple of 64), not against the raw source length.
const MAX_ROC_SOURCE_FILE_SIZE: usize = u16::MAX as usize * u16::MAX as usize; // just under 4 GiB

// Test builds shrink the limit so the "file is too big" paths can be exercised
// with a tiny temporary file instead of a multi-gigabyte one.
#[cfg(test)]
const MAX_ROC_SOURCE_FILE_SIZE: usize = 1024; // small enough that we can create a tempfile to exercise this scenario

/// Source bytes that have been padded and aligned for the SIMD parser.
///
/// The bytes are borrowed for `'a`: they either live in the arena handed to one of the
/// constructors, or (when a string passed to `from_str` already satisfies the size and
/// alignment requirements) they are the caller's original string.
pub struct Src64<'a> {
    /// These bytes are guaranteed to have a 16B-aligned address (so the parser can do 128-bit SIMD on it).
    /// (The constructors in this file actually use 64B alignment, which is stricter.)
    /// This slice is guaranteed to have a length that's a multiple of 64B, because the parser iterates in
    /// chunks of 64B. (If extra bytes are needed to make it a multiple of 64B, we add trailing newlines
    /// because the parser ignores those.)
    bytes: &'a [u8],
}

/// The ways in which `Src64::from_file` can fail.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum FileErr {
    /// The file's reported size was 0 bytes.
    FileWasEmpty,
    /// The number of bytes actually read did not match the file's reported size
    /// (this also covers the OS read call itself failing).
    ReadErr,
    /// No buffer could be allocated for the file because, once padded, it is larger than
    /// the maximum supported source size. The payload is the file size (in bytes) that
    /// `from_file` determined for the file.
    FileWasTooBig(usize),
    /// Querying the opened file's metadata (to learn its size) failed.
    ErrReadingFileSize,
    /// The file could not be opened.
    FileOpenFailed,
}

impl<'a> Src64<'a> {
    const BYTES_ALIGNMENT: usize = 64;

    /// The underlying source bytes that originally came from a file or from a string.
    ///
    /// These bytes are guaranteed to have a 16B-aligned address (so the parser can do 128-bit SIMD on it).
    /// This slice is guaranteed to have a length that's a multiple of 64B, because the parser iterates in
    /// chunks of 64B. (If extra bytes are needed to make it a multiple of 64B, we add trailing newlines
    /// because the parser ignores those.)
    pub fn bytes(&self) -> &[u8] {
        self.bytes
    }

    pub fn len(&self) -> usize {
        self.bytes.len()
    }

    pub fn is_empty(&self) -> bool {
        self.bytes.is_empty()
    }

    /// Returns None if the given string exceeds the maximum size of a Roc source file.
    pub fn from_str(arena: &'a Bump, src: &'a str) -> Option<Src64<'a>> {
        let src_len = src.len();

        if src_len == 0 {
            return None;
        }

        let capacity = round_up_to_nearest_64(src_len);

        debug_assert_eq!(capacity % 64, 0);

        if capacity == src_len && src.as_ptr().align_offset(Self::BYTES_ALIGNMENT) == 0 {
            // If the string already happens to meet our capacity and alignment requirements, just return it.
            return Some(Self {
                bytes: src.as_bytes(),
            });
        }

        // Safety: we got capacity by rounding up to the nearest 64B
        let dest = unsafe { allocate_chunks(arena, capacity)? }.as_ptr();

        // Safety: `dest` has a length of `capacity`, which has been rounded up to a multiple of 64.
        unsafe {
            let trailing_newlines_needed = capacity - src_len;

            // Start writing newlines right after the last of the bytes we got from the file.
            write_newlines(dest.add(src_len), trailing_newlines_needed);
        };

        // Safety: we just allocated `dest` to have len >= src.len(), and they're both u8 arrays.
        unsafe {
            ptr::copy_nonoverlapping(src.as_bytes().as_ptr(), dest, src_len);
        }

        Some(Self {
            // Safety: all the bytes should now be initialized
            bytes: unsafe { core::slice::from_raw_parts_mut(dest, capacity) },
        })
    }

    #[cfg(any(unix, windows))] // This is not available on wasm32. We could make it work with WASI if desired.
    pub fn from_file(arena: &'a Bump, path: &std::path::Path) -> Result<Self, FileErr> {
        use core::ffi::c_void;

        let file = match std::fs::File::open(path) {
            Ok(file) => file,
            Err(_) => {
                return Err(FileErr::FileOpenFailed);
            }
        };

        let file_size = match file.metadata() {
            Ok(metadata) => {
                #[cfg(unix)]
                {
                    use std::os::unix::prelude::MetadataExt;

                    metadata.size() as usize
                }

                #[cfg(windows)]
                {
                    use std::os::windows::prelude::MetadataExt;

                    metadata.file_size() as usize
                }
            }
            Err(_io_err) => {
                return Err(FileErr::ErrReadingFileSize);
            }
        };

        if file_size == 0 {
            return Err(FileErr::FileWasEmpty);
        }

        let capacity = round_up_to_nearest_64(file_size);

        // Safety: round_up_to_nearest_u64 will give us a capacity that is
        // at least 64, and also a multiple of 64.
        match unsafe { allocate_chunks(arena, capacity) } {
            Some(buf) => {
                // Read bytes equal to file_size into the arena allocation.
                //
                // We use the native OS read() operation here to avoid UB; file.read_exact()
                // only reads into a slice, and constructing a slice with uninitialized
                // data is UB (per the slice::from_raw_parts docs). The allocation is uninitialized here,
                // and initializing it would be a waste of CPU cycles because we're about to overwrite
                // those bytes with bytes from the file anyway.
                let bytes_read = {
                    #[cfg(unix)]
                    unsafe {
                        use std::os::fd::AsRawFd;

                        // This extern lets us avoid an entire libc crate dependency.
                        extern "C" {
                            // https://linux.die.net/man/2/read
                            pub fn read(
                                fd: core::ffi::c_int,
                                buf: *mut c_void,
                                count: usize,
                            ) -> isize;
                        }

                        read(file.as_raw_fd(), buf.as_ptr() as *mut c_void, file_size) as usize
                    }

                    #[cfg(windows)]
                    unsafe {
                        use std::os::windows::io::AsRawHandle;

                        // This extern lets us avoid an entire winapi crate dependency.
                        extern "system" {
                            // https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-readfile
                            pub fn ReadFile(
                                hFile: *mut c_void,
                                lpBuffer: *mut c_void,
                                nNumberOfBytesToRead: u32,
                                lpNumberOfBytesRead: *mut u32,
                                lpOverlapped: *mut c_void, // this should be a pointer to a struct, but we always pass null.
                            ) -> i32;
                        }

                        let mut bytes_read = core::mem::MaybeUninit::uninit();

                        // We should have already errored out if file_size exceeded u32::MAX,
                        // due to our maximum source file size. This debug_assert! is here to
                        // make sure casting file_size to u32 is safe in the ReadFile call.
                        debug_assert!(MAX_ROC_SOURCE_FILE_SIZE <= u32::MAX as usize);

                        ReadFile(
                            file.as_raw_handle() as *mut c_void,
                            buf.as_ptr() as *mut c_void,
                            file_size as u32,
                            bytes_read.as_mut_ptr(),
                            core::ptr::null_mut(),
                        );

                        bytes_read.assume_init() as usize
                    }
                };

                // We can close the file now; we're done with it.
                drop(file);

                // It's crucial that we successfully read the entire file; otherwise, it would be unsafe
                // to make a slice out of it because we might not have overwritten the uninitialized
                // memory leading up to the newlines at the end!
                //
                // Note that on UNIX, bytes_read might be -1 if this was a file read error. This
                // condition will catch that too, since we know file_size won't be (-1isize as usize)
                // beacuse if it was, then this match would have taken the None branch due to
                // (-1isize as usize) exceeding our maximum file size.
                if bytes_read != file_size {
                    return Err(FileErr::ReadErr);
                }

                // Before we write newlines to the last chunk, branchlessly prefetch the first four 64-byte chunks.
                // We're about to have a cache miss due to loading the last chunk from main memory (DMA will have
                // written it there without having gone through the CPU), and if we don't prefetch here, then we'll
                // immediately get a second cache miss when we start traversing the loaded file. The prefetch means
                // by the time we finish resolving the first cache miss on the last chunk, continuing with the first
                // chunk(s) won't be a cache miss anymore because they'll already be in cache.
                //
                // We can do further prefetches in the actual tokenization loop.
                #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
                {
                    // We know capacity >= 64, so this will never wrap.
                    let last_chunk_offset = capacity - 64;

                    // Prefetch the first 64-byte chunk.
                    prefetch_read(buf, 0);

                    // Prefetch the second 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
                    prefetch_read(buf, 64.min(last_chunk_offset));

                    // Prefetch the third 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
                    prefetch_read(buf, 128.min(last_chunk_offset));

                    // Prefetch the fourth 64-byte chunk, using min() to branchlessly avoid prefetching an address we might not own.
                    prefetch_read(buf, 192.min(last_chunk_offset));

                    // Further prefetching can happen in the tokenization loop. Now that we've prefetched the first pages,
                    // we should be able to prefetch the others in the tokenization loop before it needs to read them.
                }

                // We may have coincidentally had a file size that was a multiple of 64, but if not,
                // we'll need to fill the allocation with trailing newlines so we aren't tokenizing
                // uninitialized memory.
                if capacity > file_size {
                    debug_assert!(capacity - file_size < 64);
                    let trailing_newlines_needed = capacity - file_size;

                    // Safety: `buf_ptr` has a length of `capacity`, which has been rounded up to a multiple of 64.
                    unsafe {
                        // Start writing newlines right after the last of the bytes we got from the file.
                        write_newlines(buf.as_ptr().add(file_size), trailing_newlines_needed);
                    };
                }

                // Safety: bytes_ptr came from an allocation of `capacity` bytes, it's had
                // newlines filled at the end, and `file_size` bytes written over the rest.
                let bytes = unsafe { core::slice::from_raw_parts_mut(buf.as_ptr(), capacity) };

                Ok(Self { bytes })
            }
            None => Err(FileErr::FileWasTooBig(file_size)),
        }
    }
}

/// Rounds `num` up to the closest multiple of 64 (values that are already a multiple
/// of 64, including 0, are returned unchanged).
///
/// The addition saturates instead of overflowing. For inputs within 63 of `usize::MAX`
/// the result is therefore `usize::MAX - 63`, which is *smaller* than the input; callers
/// rely on such a value being far beyond the maximum source file size and rejected anyway.
fn round_up_to_nearest_64(num: usize) -> usize {
    const CHUNK_SIZE: usize = 64;
    // All the bits below the chunk size; clearing them snaps a number down to a multiple of 64.
    const LOW_BITS: usize = CHUNK_SIZE - 1;

    // Bump past the next boundary (unless we're exactly on one), then snap back down to it.
    let bumped = num.saturating_add(LOW_BITS);

    bumped & !LOW_BITS
}

/// Allocates `capacity` uninitialized bytes in `arena`, aligned to `Src64::BYTES_ALIGNMENT`.
///
/// Returns `None` if `capacity` exceeds `MAX_ROC_SOURCE_FILE_SIZE`, or if a buffer of that
/// size and alignment cannot be described on this target (see `Layout::from_size_align`).
///
/// Safety: capacity must be a multiple of 64, and must be at least 64.
unsafe fn allocate_chunks(arena: &Bump, capacity: usize) -> Option<NonNull<u8>> {
    // Compare capacity here instead of size because this file limit is based on what we can record line and
    // column numbers for, and those can theoretically overflow on the trailing newlines we may have added.
    // This distinction will most likely come up in practice zero times ever, but it could come up in fuzzing.
    if capacity > MAX_ROC_SOURCE_FILE_SIZE {
        return None;
    }

    debug_assert!(capacity >= 64);
    debug_assert!(capacity % 64 == 0);

    // We align it to 64B so that it's on cache line boundaries on many CPUs, which makes prefetching simpler.
    //
    // This uses the checked constructor on purpose: on targets with a 32-bit usize, a capacity
    // close to the maximum source size is larger than isize::MAX, which would violate the
    // requirements of `from_size_align_unchecked`. The check is trivially cheap next to the
    // allocation itself.
    let layout = Layout::from_size_align(capacity, Src64::BYTES_ALIGNMENT).ok()?;

    // We have to use alloc_layout here because we have stricter alignment requirements than normal slices.
    Some(arena.alloc_layout(layout))
}

/// Writes `len` newline bytes (`b'\n'`) starting at `dest`.
///
/// A 64-byte scratch buffer of newlines is built on the stack without branching (using
/// 128-bit SIMD stores where available), and then its first `len` bytes are copied to `dest`.
///
/// Safety: `dest` must be valid for writes of `len` bytes, and `len` must be at most 64
/// (the size of the scratch buffer that gets copied from). `dest` does not need any
/// particular alignment.
unsafe fn write_newlines(dest: *mut u8, len: usize) {
    debug_assert!(len <= 64);

    // Each SIMD path is gated on the architecture as well as the feature, because the
    // intrinsics are imported from an architecture-specific module: `core::arch::x86_64`
    // doesn't exist on 32-bit x86 (which can have sse2), and `core::arch::aarch64` doesn't
    // exist on 32-bit ARM (which can have neon). Those targets use the fallback instead.
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    {
        use core::arch::x86_64::{__m128i, _mm_set1_epi8, _mm_storeu_si128};

        let mut buf: MaybeUninit<[__m128i; 4]> = MaybeUninit::uninit();
        let newline = _mm_set1_epi8(b'\n' as i8);
        let ptr = buf.as_mut_ptr() as *mut __m128i;

        debug_assert_eq!(ptr.align_offset(align_of::<__m128i>()), 0);

        // Four 16-byte stores fill all 64 bytes of the scratch buffer.
        _mm_storeu_si128(ptr.add(0), newline);
        _mm_storeu_si128(ptr.add(1), newline);
        _mm_storeu_si128(ptr.add(2), newline);
        _mm_storeu_si128(ptr.add(3), newline);

        core::ptr::copy_nonoverlapping(ptr as *const u8, dest, len);
    }

    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
    {
        use core::arch::aarch64::{int8x16_t, vdupq_n_s8, vst1q_s8};

        let mut buf: MaybeUninit<[int8x16_t; 4]> = MaybeUninit::uninit();
        let newline = vdupq_n_s8(b'\n' as i8);
        let ptr = buf.as_mut_ptr() as *mut i8;

        debug_assert_eq!(ptr.align_offset(align_of::<int8x16_t>()), 0);

        // `ptr` is a byte pointer here, so the four 16-byte stores are 16 bytes apart.
        vst1q_s8(ptr.add(0), newline);
        vst1q_s8(ptr.add(16), newline);
        vst1q_s8(ptr.add(32), newline);
        vst1q_s8(ptr.add(48), newline);

        core::ptr::copy_nonoverlapping(ptr as *const u8, dest, len);
    }

    #[cfg(not(any(
        all(target_arch = "x86_64", target_feature = "sse2"),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    {
        // We don't have access to SIMD, so do eight 64-bit writes instead of four 128-bit writes.
        let mut buf: MaybeUninit<[u64; 8]> = MaybeUninit::uninit();
        // Multiplying by 0x0101... copies the newline byte into all 8 bytes of the u64.
        let newline_repeated = (b'\n' as u64) * 0x0101010101010101;
        let ptr = buf.as_mut_ptr() as *mut u64;

        debug_assert_eq!(ptr.align_offset(align_of::<u64>()), 0);

        *ptr.add(0) = newline_repeated;
        *ptr.add(1) = newline_repeated;
        *ptr.add(2) = newline_repeated;
        *ptr.add(3) = newline_repeated;
        *ptr.add(4) = newline_repeated;
        *ptr.add(5) = newline_repeated;
        *ptr.add(6) = newline_repeated;
        *ptr.add(7) = newline_repeated;

        core::ptr::copy_nonoverlapping(ptr as *const u8, dest, len);
    }
}

/// Hints to the CPU that the memory `offset` elements past `non_null_ptr` is about to be read,
/// so it can start pulling it into cache.
///
/// Note that `offset` is counted in units of `T` (it is a byte offset when `T` is `u8`).
///
/// The target address is only ever handed to a prefetch instruction and never dereferenced,
/// so it is computed with wrapping pointer arithmetic. That keeps this safe function sound
/// even if a caller passes an offset that lands outside the pointer's allocation.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
#[inline(always)]
fn prefetch_read<T>(non_null_ptr: NonNull<T>, offset: usize) {
    // Use inline asm until this is stabilized:
    // https://doc.rust-lang.org/std/intrinsics/fn.prefetch_read_data.html

    // `wrapping_add` (unlike `add`) has no in-bounds requirement, so computing the address
    // can never be undefined behavior on its own.
    let target = non_null_ptr.as_ptr().wrapping_add(offset);

    #[cfg(target_arch = "x86_64")]
    unsafe {
        core::arch::asm!(
            "prefetcht0 [{}]",
            in(reg) target
        );
    }

    #[cfg(target_arch = "aarch64")]
    unsafe {
        core::arch::asm!(
            "prfm PLDL1KEEP, [{}]",
            in(reg) target
        );
    }
}

/// Tests for `Src64`. Most of them load the same contents through both `Src64::from_str`
/// and `Src64::from_file` (via a temp file) and check the two agree with the expected bytes.
///
/// Note that `MAX_ROC_SOURCE_FILE_SIZE` is only 1024 in test builds.
#[cfg(test)]
mod src64_tests {
    use super::{FileErr, Src64, MAX_ROC_SOURCE_FILE_SIZE};
    use bumpalo::Bump;
    use quickcheck::{quickcheck, Arbitrary, Gen};
    use std::fs::File;
    use std::io::Write;
    use tempfile::tempdir;

    /// Asserts that `Src64::from_str(contents)` yields the `Ok` bytes in `expected`,
    /// or `None` if `expected` is an `Err` (from_str has no error payload to compare).
    fn expect_from_str(arena: &Bump, contents: &str, expected: &Result<Vec<u8>, FileErr>) {
        match Src64::from_str(arena, contents) {
            Some(actual) => {
                assert_eq!(actual.len() % 64, 0);
                assert_eq!(
                    expected.as_ref().ok(),
                    Some(&actual.bytes().into()),
                    "Src64::from_str had unexpected output"
                )
            }
            None => {
                assert_eq!(
                    expected.as_ref().ok(),
                    None,
                    "Src64::from_str had unexpected output"
                )
            }
        }
    }

    /// Writes `contents` to a fresh temp file, loads it with `Src64::from_file`,
    /// and asserts the outcome (bytes or error) equals `expected`.
    fn expect_from_file(arena: &Bump, contents: &str, expected: &Result<Vec<u8>, FileErr>) {
        let dir = tempdir().expect("Failed to create temp dir");
        let file_path = dir.path().join("temp_file");

        // Write contents to the temp file
        {
            let mut file = File::create(&file_path).expect("Failed to create temp file");
            file.write_all(contents.as_bytes())
                .expect("Failed to write to temp file");
        }

        match Src64::from_file(arena, &file_path) {
            Ok(actual) => {
                assert_eq!(actual.len() % 64, 0);
                assert_eq!(
                    expected,
                    &Ok(actual.bytes().into()),
                    "Src64::from_file had unexpected output"
                )
            }
            Err(err) => assert_eq!(
                expected,
                &Err(err),
                "Src64::from_file had unexpected output"
            ),
        }
    }

    /// Runs both Src64::from_str and Src64::from_file on the given str, then
    /// asserts the output of both of those functions is equal to `expected`.
    /// (Since from_str returns an Option, we call .ok() on `expected` before comparing it.)
    fn expect_from(contents: &str, expected: Result<Vec<u8>, FileErr>) {
        let arena = Bump::new();

        expect_from_str(&arena, contents, &expected);
        expect_from_file(&arena, contents, &expected);
    }

    #[test]
    fn empty() {
        expect_from("", Err(FileErr::FileWasEmpty));
    }

    #[test]
    fn one_newline() {
        expect_from("\n", Ok([b'\n'; 64].into()));
    }

    #[test]
    fn one_byte() {
        expect_from(
            "x",
            Ok({
                // One source byte followed by 63 bytes of newline padding.
                let mut vec: Vec<u8> = [b'\n'; 64].as_mut_slice().into();

                vec[0] = b'x';

                vec
            }),
        );
    }

    #[test]
    fn two_bytes() {
        expect_from(
            "xy",
            Ok({
                // Two source bytes followed by 62 bytes of newline padding.
                let mut vec: Vec<u8> = [b'\n'; 64].as_mut_slice().into();

                vec[0] = b'x';
                vec[1] = b'y';

                vec
            }),
        );
    }

    #[test]
    fn max_file_size() {
        // The test-build maximum is a multiple of 64, so no padding is expected here.
        let bytes = [b'z'; MAX_ROC_SOURCE_FILE_SIZE];

        expect_from(
            core::str::from_utf8(bytes.as_slice()).unwrap(),
            Ok(bytes.into()),
        );
    }

    #[test]
    fn too_big() {
        let bytes = [b'z'; MAX_ROC_SOURCE_FILE_SIZE + 1];

        expect_from(
            core::str::from_utf8(bytes.as_slice()).unwrap(),
            Err(FileErr::FileWasTooBig(bytes.len())),
        );
    }

    /// Arbitrary file contents for the quickcheck property below: between 1 and
    /// `MAX_ROC_SOURCE_FILE_SIZE` random bytes (not necessarily valid UTF-8).
    #[derive(Debug, Clone)]
    struct FileBytes(Vec<u8>);

    impl Arbitrary for FileBytes {
        fn arbitrary(g: &mut Gen) -> Self {
            // Draw the length from the generator's randomness. (`g.size()` is a fixed
            // configuration value, not a random one, so deriving the length from it would
            // give every generated file the same length.)
            //
            // The length is kept in 1..=MAX_ROC_SOURCE_FILE_SIZE because the property below
            // expects loading to succeed, and empty files are rejected with FileWasEmpty.
            let len = 1 + usize::arbitrary(g) % MAX_ROC_SOURCE_FILE_SIZE;

            FileBytes((0..len).map(|_| u8::arbitrary(g)).collect())
        }
    }

    quickcheck! {
        /// Creates a tempfile containing arbitrary bytes, then reads it with Src64::from_file. Asserts that:
        /// - the returned Result<Src64> is Ok
        /// - its length is a multiple of 64
        /// - it's at least as long as the input bytes were
        /// - it starts_with the input bytes
        fn from_arb_file(bytes: FileBytes) -> bool {
            let FileBytes(bytes) = bytes;

            let dir = tempdir().expect("Failed to create temp dir");
            let file_path = dir.path().join("temp_file");

            // Write random bytes to the temp file
            {
                let mut file = File::create(&file_path).expect("Failed to create temp file");
                file.write_all(&bytes).expect("Failed to write to temp file");
            }

            let arena = Bump::new();

            match Src64::from_file(&arena, &file_path) {
                Ok(src64) => {
                    let len = src64.len();

                    len % 64 == 0 && len >= bytes.len() && src64.bytes().starts_with(&bytes)
                }
                Err(_) => false
            }
        }
    }
}