[red-knot]: Add a VendoredFileSystem implementation (#11863)

Co-authored-by: Micha Reiser <micha@reiser.io>
This commit is contained in:
Alex Waygood 2024-06-18 16:43:39 +01:00 committed by GitHub
parent f666d79cd7
commit 1d73d60bd3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 582 additions and 172 deletions

View file

@ -20,6 +20,11 @@ camino = { workspace = true }
countme = { workspace = true }
dashmap = { workspace = true }
filetime = { workspace = true }
itertools = { workspace = true }
salsa = { workspace = true }
tracing = { workspace = true }
rustc-hash = { workspace = true }
zip = { workspace = true }
[dev-dependencies]
once_cell = { workspace = true }

View file

@ -0,0 +1,69 @@
/// A number representing the revision of a file.
///
/// Two revisions that don't compare equal signify that the file has been modified.
/// Revisions aren't guaranteed to be monotonically increasing or in any specific order.
///
/// Possible revisions are:
/// * The last modification time of the file.
/// * The hash of the file's content.
/// * The revision as it comes from an external system, for example the LSP.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct FileRevision(u128);

impl FileRevision {
    /// Creates a revision from its raw 128-bit value.
    ///
    /// This is a `const fn`, like [`FileRevision::zero`], so that revisions with an
    /// arbitrary value can be created in constant contexts too.
    pub const fn new(value: u128) -> Self {
        Self(value)
    }

    /// Returns the revision whose raw value is `0`.
    pub const fn zero() -> Self {
        Self(0)
    }

    /// Returns the raw 128-bit value stored in this revision.
    #[must_use]
    pub const fn as_u128(self) -> u128 {
        self.0
    }
}

impl From<u128> for FileRevision {
    /// Wraps `value` unchanged.
    fn from(value: u128) -> Self {
        FileRevision(value)
    }
}

impl From<u64> for FileRevision {
    /// Widens `value` losslessly to 128 bits.
    fn from(value: u64) -> Self {
        FileRevision(u128::from(value))
    }
}
impl From<filetime::FileTime> for FileRevision {
fn from(value: filetime::FileTime) -> Self {
let seconds = value.seconds() as u128;
let seconds = seconds << 64;
let nanos = u128::from(value.nanoseconds());
FileRevision(seconds | nanos)
}
}
#[cfg(test)]
mod tests {
    use filetime::FileTime;

    use super::*;

    /// The seconds must be recoverable from the upper 64 bits of the revision
    /// and the nanoseconds from the lower 64 bits.
    #[test]
    fn revision_from_file_time() {
        let file_time = FileTime::now();
        let raw = FileRevision::from(file_time).as_u128();

        let upper = raw >> 64;
        let lower = raw & 0xFFFF_FFFF_FFFF_FFFF;

        assert_eq!(file_time.nanoseconds(), lower as u32);
        assert_eq!(file_time.seconds(), upper as i64);
    }
}

View file

@ -3,8 +3,8 @@ use std::ops::Deref;
use std::path::{Path, StripPrefixError};
use camino::{Utf8Path, Utf8PathBuf};
use filetime::FileTime;
use crate::file_revision::FileRevision;
pub use memory::MemoryFileSystem;
pub use os::OsFileSystem;
@ -514,55 +514,6 @@ impl Metadata {
}
}
/// Opaque revision number of a file.
///
/// Two revisions that compare unequal mean that the file has been modified.
/// No ordering is implied: revisions need not increase monotonically.
///
/// A revision may be derived from:
/// * The last modification time of the file.
/// * The hash of the file's content.
/// * A revision supplied by an external system, for example the LSP.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct FileRevision(u128);

impl FileRevision {
    /// Wraps a raw 128-bit value.
    pub fn new(value: u128) -> Self {
        FileRevision(value)
    }

    /// The revision whose raw value is `0`.
    pub const fn zero() -> Self {
        FileRevision(0)
    }

    /// Unwraps the raw 128-bit value.
    #[must_use]
    pub fn as_u128(self) -> u128 {
        let FileRevision(raw) = self;
        raw
    }
}

impl From<u128> for FileRevision {
    fn from(value: u128) -> Self {
        Self::new(value)
    }
}

impl From<u64> for FileRevision {
    fn from(value: u64) -> Self {
        Self::new(value.into())
    }
}
impl From<FileTime> for FileRevision {
fn from(value: FileTime) -> Self {
let seconds = value.seconds() as u128;
let seconds = seconds << 64;
let nanos = u128::from(value.nanoseconds());
FileRevision(seconds | nanos)
}
}
#[derive(Copy, Clone, Eq, PartialEq, Debug, Hash)]
pub enum FileType {
File,
@ -583,24 +534,3 @@ impl FileType {
matches!(self, FileType::Symlink)
}
}
#[cfg(test)]
mod tests {
    use filetime::FileTime;

    use crate::file_system::FileRevision;

    /// Checks that both halves of a revision built from a file time
    /// decode back to the original seconds and nanoseconds.
    #[test]
    fn revision_from_file_time() {
        let file_time = FileTime::now();
        let raw = FileRevision::from(file_time).as_u128();

        let upper = raw >> 64;
        let lower = raw & 0xFFFF_FFFF_FFFF_FFFF;

        assert_eq!(file_time.nanoseconds(), lower as u32);
        assert_eq!(file_time.seconds(), upper as i64);
    }
}

View file

@ -8,9 +8,11 @@ use crate::parsed::parsed_module;
use crate::source::{line_index, source_text};
use crate::vfs::{Vfs, VfsFile};
mod file_revision;
pub mod file_system;
pub mod parsed;
pub mod source;
pub mod vendored;
pub mod vfs;
/// A `dashmap::DashMap` whose hasher is built by `BuildHasherDefault<FxHasher>`.
pub(crate) type FxDashMap<K, V> = dashmap::DashMap<K, V, BuildHasherDefault<FxHasher>>;

View file

@ -73,7 +73,7 @@ mod tests {
use crate::file_system::FileSystemPath;
use crate::parsed::parsed_module;
use crate::tests::TestDb;
use crate::vfs::VendoredPath;
use crate::vendored::VendoredPath;
use crate::vfs::{system_path_to_file, vendored_path_to_file};
#[test]

View file

@ -0,0 +1,373 @@
use std::cell::RefCell;
use std::io::{self, Read};
use std::sync::{Mutex, MutexGuard};
use itertools::Itertools;
use zip::{read::ZipFile, ZipArchive};
use crate::file_revision::FileRevision;
pub use path::{VendoredPath, VendoredPathBuf};
pub mod path;
/// Module-local shorthand for `io::Result`.
type Result<T> = io::Result<T>;
/// File system that stores all content in a static zip archive
/// bundled as part of the Ruff binary.
///
/// "Files" in the `VendoredFileSystem` are read-only and immutable.
/// Directories are supported, but symlinks and hardlinks cannot exist.
#[derive(Debug)]
pub struct VendoredFileSystem {
    inner: VendoredFileSystemInner,
}

impl VendoredFileSystem {
    /// Creates a file system backed by the zip archive contained in `raw_bytes`.
    ///
    /// Returns an `Err` if the archive cannot be opened.
    pub fn new(raw_bytes: &'static [u8]) -> Result<Self> {
        Ok(Self {
            inner: VendoredFileSystemInner::new(raw_bytes)?,
        })
    }

    /// Returns `true` if `path` names a file or a directory in the archive.
    ///
    /// ## Panics:
    /// If `path` contains a component that is unsupported for vendored paths.
    pub fn exists(&self, path: &VendoredPath) -> bool {
        // A path exists exactly when metadata can be found for it, so reuse the
        // double lookup done by `metadata` instead of duplicating it here.
        self.metadata(path).is_some()
    }

    /// Returns the metadata of the file or directory at `path`,
    /// or `None` if the path does not exist in the archive.
    ///
    /// ## Panics:
    /// If `path` contains a component that is unsupported for vendored paths.
    pub fn metadata(&self, path: &VendoredPath) -> Option<Metadata> {
        let normalized = normalize_vendored_path(path);
        let inner_locked = self.inner.lock();

        // Must probe the zipfile twice, as "stdlib" and "stdlib/" are considered
        // different paths in a zip file, but we want to abstract over that difference here
        // so that paths relative to the `VendoredFileSystem`
        // work the same as other paths in Ruff.
        let mut archive = inner_locked.borrow_mut();
        if let Ok(zip_file) = archive.lookup_path(&normalized) {
            return Some(Metadata::from_zip_file(zip_file));
        }
        if let Ok(zip_file) = archive.lookup_path(&normalized.with_trailing_slash()) {
            return Some(Metadata::from_zip_file(zip_file));
        }

        None
    }

    /// Read the entire contents of the zip file at `path` into a string
    ///
    /// Returns an Err() if any of the following are true:
    /// - The path does not exist in the underlying zip archive
    /// - The path exists in the underlying zip archive, but represents a directory
    /// - The contents of the zip file at `path` contain invalid UTF-8
    ///
    /// ## Panics:
    /// If `path` contains a component that is unsupported for vendored paths.
    pub fn read(&self, path: &VendoredPath) -> Result<String> {
        // Normalize *before* taking the lock, as `metadata` does: normalization panics
        // on unsupported components, and a panic while the guard is held would poison
        // the mutex, turning every later access to the archive into a panic as well.
        let normalized = normalize_vendored_path(path);

        let inner_locked = self.inner.lock();
        let mut archive = inner_locked.borrow_mut();
        let mut zip_file = archive.lookup_path(&normalized)?;

        let mut buffer = String::new();
        zip_file.read_to_string(&mut buffer)?;
        Ok(buffer)
    }
}
/// The kind of entry a path in the vendored zip archive refers to.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum FileType {
    /// The path exists in the zip archive and represents a vendored file
    File,

    /// The path exists in the zip archive and represents a vendored directory of files
    Directory,
}

impl FileType {
    /// Returns `true` for [`FileType::File`].
    pub const fn is_file(self) -> bool {
        match self {
            Self::File => true,
            Self::Directory => false,
        }
    }

    /// Returns `true` for [`FileType::Directory`].
    pub const fn is_directory(self) -> bool {
        match self {
            Self::Directory => true,
            Self::File => false,
        }
    }
}
/// Kind and revision of an entry in the vendored zip archive.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Metadata {
    kind: FileType,
    revision: FileRevision,
}

impl Metadata {
    /// Builds the metadata for `zip_file`.
    ///
    /// The entry's CRC-32 value is used as its revision.
    fn from_zip_file(zip_file: ZipFile) -> Self {
        let revision = FileRevision::new(u128::from(zip_file.crc32()));
        let kind = if zip_file.is_dir() {
            FileType::Directory
        } else {
            FileType::File
        };

        Self { kind, revision }
    }

    /// Whether the entry is a file or a directory.
    pub fn kind(&self) -> FileType {
        let Self { kind, .. } = *self;
        kind
    }

    /// The revision of the entry.
    pub fn revision(&self) -> FileRevision {
        let Self { revision, .. } = *self;
        revision
    }
}
/// Owner of the zip archive; the archive is only reachable through [`Self::lock`].
#[derive(Debug)]
struct VendoredFileSystemInner(Mutex<RefCell<VendoredZipArchive>>);

/// Guard handed out by [`VendoredFileSystemInner::lock`].
type LockedZipArchive<'a> = MutexGuard<'a, RefCell<VendoredZipArchive>>;

impl VendoredFileSystemInner {
    /// Opens the zip archive stored in `raw_bytes` and wraps it for locked access.
    fn new(raw_bytes: &'static [u8]) -> Result<Self> {
        let archive = VendoredZipArchive::new(raw_bytes)?;
        Ok(Self(Mutex::new(RefCell::new(archive))))
    }

    /// Acquire a lock on the underlying zip archive.
    /// The call will block until it is able to acquire the lock.
    ///
    /// ## Panics:
    /// If the current thread already holds the lock,
    /// or if the mutex has been poisoned (the lock result is unwrapped).
    fn lock(&self) -> LockedZipArchive<'_> {
        let Self(mutex) = self;
        mutex.lock().unwrap()
    }
}
/// Newtype wrapper around a `ZipArchive` that reads from a static byte slice.
#[derive(Debug)]
struct VendoredZipArchive(ZipArchive<io::Cursor<&'static [u8]>>);

impl VendoredZipArchive {
    /// Opens `data` as a zip archive.
    fn new(data: &'static [u8]) -> Result<Self> {
        let archive = ZipArchive::new(io::Cursor::new(data))?;
        Ok(Self(archive))
    }

    /// Looks up the archive entry whose name is exactly `path`.
    ///
    /// A failed lookup is returned as an `io::Error`.
    fn lookup_path(&mut self, path: &NormalizedVendoredPath) -> Result<ZipFile> {
        let entry = self.0.by_name(path.as_str())?;
        Ok(entry)
    }
}
/// A path that has been normalized via the `normalize_vendored_path` function.
///
/// Trailing slashes are normalized away by `camino::Utf8PathBuf`s,
/// but trailing slashes are crucial for distinguishing between
/// files and directories inside zip archives.
#[derive(Debug, Clone, PartialEq, Eq)]
struct NormalizedVendoredPath(String);

impl NormalizedVendoredPath {
    /// Consumes the path and returns it with a `/` appended.
    ///
    /// In debug builds, asserts that the path does not already end with `/`.
    fn with_trailing_slash(self) -> Self {
        let Self(mut raw) = self;
        debug_assert!(!raw.ends_with('/'));
        raw.push('/');
        Self(raw)
    }

    /// Borrows the normalized path as a string slice.
    fn as_str(&self) -> &str {
        let Self(raw) = self;
        raw
    }
}
/// Normalizes the path by removing `.` and `..` components.
///
/// ## Panics:
/// If a path with an unsupported component for vendored paths is passed.
/// Unsupported components are path prefixes,
/// and path root directories appearing anywhere except at the start of the path.
fn normalize_vendored_path(path: &VendoredPath) -> NormalizedVendoredPath {
let mut normalized_parts = camino::Utf8PathBuf::new();
// Allow the `RootDir` component, but only if it is at the very start of the string.
let mut components = path.components().peekable();
if let Some(camino::Utf8Component::RootDir) = components.peek() {
components.next();
}
for component in components {
match component {
camino::Utf8Component::Normal(part) => normalized_parts.push(part),
camino::Utf8Component::CurDir => continue,
camino::Utf8Component::ParentDir => {
normalized_parts.pop();
}
unsupported => panic!("Unsupported component in a vendored path: {unsupported}"),
}
}
NormalizedVendoredPath(normalized_parts.into_iter().join("/"))
}
// Tests for `VendoredFileSystem`, run against a small zip archive built in memory.
#[cfg(test)]
mod tests {
use std::io::Write;
use once_cell::sync::Lazy;
use zip::{write::FileOptions, CompressionMethod, ZipWriter};
use super::*;
// Contents written to `stdlib/functools.pyi` in the mock archive.
const FUNCTOOLS_CONTENTS: &str = "def update_wrapper(): ...";
// Contents written to `stdlib/asyncio/tasks.pyi` in the mock archive.
const ASYNCIO_TASKS_CONTENTS: &str = "class Task: ...";
// Zip archive built once on first use. It contains, in this order:
// `stdlib/`, `stdlib/functools.pyi`, `stdlib/asyncio/` and `stdlib/asyncio/tasks.pyi`.
static MOCK_ZIP_ARCHIVE: Lazy<Box<[u8]>> = Lazy::new(|| {
let mut typeshed_buffer = Vec::new();
let typeshed = io::Cursor::new(&mut typeshed_buffer);
let options = FileOptions::default()
.compression_method(CompressionMethod::Zstd)
.unix_permissions(0o644);
// The writer lives in its own scope so that it is gone before
// `typeshed_buffer` is converted into the boxed slice below.
{
let mut archive = ZipWriter::new(typeshed);
archive.add_directory("stdlib/", options).unwrap();
archive.start_file("stdlib/functools.pyi", options).unwrap();
archive.write_all(FUNCTOOLS_CONTENTS.as_bytes()).unwrap();
archive.add_directory("stdlib/asyncio/", options).unwrap();
archive
.start_file("stdlib/asyncio/tasks.pyi", options)
.unwrap();
archive
.write_all(ASYNCIO_TASKS_CONTENTS.as_bytes())
.unwrap();
archive.finish().unwrap();
}
typeshed_buffer.into_boxed_slice()
});
// Creates a new `VendoredFileSystem` over the shared mock archive.
fn mock_typeshed() -> VendoredFileSystem {
VendoredFileSystem::new(&MOCK_ZIP_ARCHIVE).unwrap()
}
// Asserts that `dirname` exists, cannot be read as a file,
// and has metadata describing a directory.
fn test_directory(dirname: &str) {
let mock_typeshed = mock_typeshed();
let path = VendoredPath::new(dirname);
assert!(mock_typeshed.exists(path));
assert!(mock_typeshed.read(path).is_err());
let metadata = mock_typeshed.metadata(path).unwrap();
assert!(metadata.kind.is_directory());
}
#[test]
fn stdlib_dir_no_trailing_slash() {
test_directory("stdlib")
}
#[test]
fn stdlib_dir_trailing_slash() {
test_directory("stdlib/")
}
#[test]
fn asyncio_dir_no_trailing_slash() {
test_directory("stdlib/asyncio")
}
#[test]
fn asyncio_dir_trailing_slash() {
test_directory("stdlib/asyncio/")
}
#[test]
fn stdlib_dir_parent_components() {
test_directory("stdlib/asyncio/../../stdlib")
}
#[test]
fn asyncio_dir_odd_components() {
test_directory("./stdlib/asyncio/../asyncio/")
}
// Asserts that `path` is reported as missing by `exists` and `metadata`,
// and that `read` fails with an error whose message contains "file not found".
fn test_nonexistent_path(path: &str) {
let mock_typeshed = mock_typeshed();
let path = VendoredPath::new(path);
assert!(!mock_typeshed.exists(path));
assert!(mock_typeshed.metadata(path).is_none());
assert!(mock_typeshed
.read(path)
.is_err_and(|err| err.to_string().contains("file not found")));
}
#[test]
fn simple_nonexistent_path() {
test_nonexistent_path("foo")
}
#[test]
fn nonexistent_path_with_extension() {
test_nonexistent_path("foo.pyi")
}
#[test]
fn nonexistent_path_with_trailing_slash() {
test_nonexistent_path("foo/")
}
#[test]
fn nonexistent_path_with_fancy_components() {
test_nonexistent_path("./foo/../../../foo")
}
// Asserts that `path` exists and has metadata describing a file.
fn test_file(mock_typeshed: &VendoredFileSystem, path: &VendoredPath) {
assert!(mock_typeshed.exists(path));
let metadata = mock_typeshed.metadata(path).unwrap();
assert!(metadata.kind.is_file());
}
#[test]
fn functools_file_contents() {
let mock_typeshed = mock_typeshed();
let path = VendoredPath::new("stdlib/functools.pyi");
test_file(&mock_typeshed, path);
let functools_stub = mock_typeshed.read(path).unwrap();
assert_eq!(functools_stub.as_str(), FUNCTOOLS_CONTENTS);
// Test that using the RefCell doesn't mutate
// the internal state of the underlying zip archive incorrectly:
let functools_stub_again = mock_typeshed.read(path).unwrap();
assert_eq!(functools_stub_again.as_str(), FUNCTOOLS_CONTENTS);
}
#[test]
fn functools_file_other_path() {
test_file(
&mock_typeshed(),
VendoredPath::new("stdlib/../stdlib/../stdlib/functools.pyi"),
)
}
#[test]
fn asyncio_file_contents() {
let mock_typeshed = mock_typeshed();
let path = VendoredPath::new("stdlib/asyncio/tasks.pyi");
test_file(&mock_typeshed, path);
let asyncio_stub = mock_typeshed.read(path).unwrap();
assert_eq!(asyncio_stub.as_str(), ASYNCIO_TASKS_CONTENTS);
}
#[test]
fn asyncio_file_other_path() {
test_file(
&mock_typeshed(),
VendoredPath::new("./stdlib/asyncio/../asyncio/tasks.pyi"),
)
}
}

View file

@ -0,0 +1,95 @@
use std::ops::Deref;
use std::path;
use camino::{Utf8Components, Utf8Path, Utf8PathBuf};
/// A borrowed path into the vendored file system.
///
/// This is a `#[repr(transparent)]` wrapper around `Utf8Path`, which is what
/// allows `VendoredPath::new` to reinterpret a `&Utf8Path` as a `&VendoredPath`.
#[repr(transparent)]
#[derive(Debug, Eq, PartialEq, Hash)]
pub struct VendoredPath(Utf8Path);
impl VendoredPath {
/// Wraps anything that can be viewed as a `Utf8Path` as a `&VendoredPath`,
/// without copying the path.
pub fn new(path: &(impl AsRef<Utf8Path> + ?Sized)) -> &Self {
let path = path.as_ref();
// SAFETY: VendoredPath is marked as #[repr(transparent)] so the conversion from a
// *const Utf8Path to a *const VendoredPath is valid.
unsafe { &*(path as *const Utf8Path as *const VendoredPath) }
}
/// Copies this path into an owned `VendoredPathBuf`.
pub fn to_path_buf(&self) -> VendoredPathBuf {
VendoredPathBuf(self.0.to_path_buf())
}
/// Returns the path as a string slice.
pub fn as_str(&self) -> &str {
self.0.as_str()
}
/// Returns the path as a `std::path::Path`.
pub fn as_std_path(&self) -> &path::Path {
self.0.as_std_path()
}
/// Returns an iterator over the components of the path.
pub fn components(&self) -> Utf8Components {
self.0.components()
}
}
/// An owned path into the vendored file system, wrapping a `Utf8PathBuf`.
#[repr(transparent)]
#[derive(Debug, Eq, PartialEq, Clone, Hash)]
pub struct VendoredPathBuf(Utf8PathBuf);

impl Default for VendoredPathBuf {
    /// Returns an empty path.
    fn default() -> Self {
        Self(Utf8PathBuf::new())
    }
}

impl VendoredPathBuf {
    /// Creates an empty path.
    pub fn new() -> Self {
        Self::default()
    }

    /// Borrows this path as a `VendoredPath`.
    pub fn as_path(&self) -> &VendoredPath {
        VendoredPath::new(self.0.as_path())
    }
}
// Conversions that let owned paths, borrowed paths and plain strings be used
// wherever a `&VendoredPath` is expected.

impl AsRef<VendoredPath> for VendoredPathBuf {
    fn as_ref(&self) -> &VendoredPath {
        VendoredPath::new(&self.0)
    }
}

impl AsRef<VendoredPath> for VendoredPath {
    #[inline]
    fn as_ref(&self) -> &VendoredPath {
        self
    }
}

impl AsRef<VendoredPath> for str {
    #[inline]
    fn as_ref(&self) -> &VendoredPath {
        VendoredPath::new(self)
    }
}

impl AsRef<VendoredPath> for String {
    #[inline]
    fn as_ref(&self) -> &VendoredPath {
        VendoredPath::new(self.as_str())
    }
}

/// Views the vendored path as a `std::path::Path`.
impl AsRef<path::Path> for VendoredPath {
    #[inline]
    fn as_ref(&self) -> &path::Path {
        self.as_std_path()
    }
}

/// Makes the methods of `VendoredPath` available on `VendoredPathBuf`.
impl Deref for VendoredPathBuf {
    type Target = VendoredPath;

    fn deref(&self) -> &Self::Target {
        VendoredPath::new(&self.0)
    }
}

View file

@ -3,9 +3,12 @@ use std::sync::Arc;
use countme::Count;
use dashmap::mapref::entry::Entry;
pub use path::{VendoredPath, VendoredPathBuf, VfsPath};
pub use crate::vendored::{VendoredPath, VendoredPathBuf};
pub use path::VfsPath;
use crate::file_system::{FileRevision, FileSystemPath};
use crate::file_revision::FileRevision;
use crate::file_system::FileSystemPath;
use crate::vendored::VendoredFileSystem;
use crate::vfs::private::FileStatus;
use crate::{Db, FxDashMap};
@ -296,27 +299,44 @@ impl VfsFile {
}
}
#[derive(Default, Debug)]
#[derive(Debug)]
enum VendoredVfs {
#[default]
Real,
#[allow(unused)]
Real(VendoredFileSystem),
Stubbed(FxDashMap<VendoredPathBuf, String>),
}
impl Default for VendoredVfs {
fn default() -> Self {
Self::Stubbed(FxDashMap::default())
}
}
impl VendoredVfs {
fn revision(&self, path: &VendoredPath) -> Option<FileRevision> {
match self {
VendoredVfs::Real => todo!(),
VendoredVfs::Real(file_system) => file_system
.metadata(path)
.map(|metadata| metadata.revision()),
VendoredVfs::Stubbed(stubbed) => stubbed
.contains_key(&path.to_path_buf())
.then_some(FileRevision::new(1)),
}
}
fn read(&self, path: &VendoredPath) -> Option<String> {
fn read(&self, path: &VendoredPath) -> std::io::Result<String> {
match self {
VendoredVfs::Real => todo!(),
VendoredVfs::Stubbed(stubbed) => stubbed.get(&path.to_path_buf()).as_deref().cloned(),
VendoredVfs::Real(file_system) => file_system.read(path),
VendoredVfs::Stubbed(stubbed) => {
if let Some(contents) = stubbed.get(&path.to_path_buf()).as_deref().cloned() {
Ok(contents)
} else {
Err(std::io::Error::new(
std::io::ErrorKind::NotFound,
format!("Could not find file {path:?}"),
))
}
}
}
}
}
@ -336,7 +356,7 @@ mod private {
#[cfg(test)]
mod tests {
use crate::file_system::FileRevision;
use crate::file_revision::FileRevision;
use crate::tests::TestDb;
use crate::vfs::{system_path_to_file, vendored_path_to_file};

View file

@ -1,92 +1,5 @@
use std::ops::Deref;
use std::path::Path;
use camino::{Utf8Path, Utf8PathBuf};
use crate::file_system::{FileSystemPath, FileSystemPathBuf};
/// A borrowed path into the vendored file system.
///
/// This is a `#[repr(transparent)]` wrapper around `Utf8Path`, which is what
/// allows `VendoredPath::new` to reinterpret a `&Utf8Path` as a `&VendoredPath`.
#[repr(transparent)]
#[derive(Debug, Eq, PartialEq, Hash)]
pub struct VendoredPath(Utf8Path);
impl VendoredPath {
/// Wraps anything that can be viewed as a `Utf8Path` as a `&VendoredPath`,
/// without copying the path.
pub fn new(path: &(impl AsRef<Utf8Path> + ?Sized)) -> &Self {
let path = path.as_ref();
// SAFETY: VendoredPath is marked as #[repr(transparent)] so the conversion from a
// *const Utf8Path to a *const VendoredPath is valid.
unsafe { &*(path as *const Utf8Path as *const VendoredPath) }
}
/// Copies this path into an owned `VendoredPathBuf`.
pub fn to_path_buf(&self) -> VendoredPathBuf {
VendoredPathBuf(self.0.to_path_buf())
}
/// Returns the path as a string slice.
pub fn as_str(&self) -> &str {
self.0.as_str()
}
}
/// An owned path into the vendored file system, wrapping a `Utf8PathBuf`.
#[repr(transparent)]
#[derive(Debug, Eq, PartialEq, Clone, Hash)]
pub struct VendoredPathBuf(Utf8PathBuf);

impl Default for VendoredPathBuf {
    /// Returns an empty path.
    fn default() -> Self {
        Self(Utf8PathBuf::new())
    }
}

impl VendoredPathBuf {
    /// Creates an empty path.
    pub fn new() -> Self {
        Self::default()
    }

    /// Borrows this path as a `VendoredPath`.
    pub fn as_path(&self) -> &VendoredPath {
        VendoredPath::new(self.0.as_path())
    }
}
impl AsRef<VendoredPath> for VendoredPathBuf {
fn as_ref(&self) -> &VendoredPath {
self.as_path()
}
}
impl AsRef<VendoredPath> for VendoredPath {
#[inline]
fn as_ref(&self) -> &VendoredPath {
self
}
}
impl AsRef<VendoredPath> for str {
#[inline]
fn as_ref(&self) -> &VendoredPath {
VendoredPath::new(self)
}
}
impl AsRef<VendoredPath> for String {
#[inline]
fn as_ref(&self) -> &VendoredPath {
VendoredPath::new(self)
}
}
impl AsRef<Path> for VendoredPath {
#[inline]
fn as_ref(&self) -> &Path {
self.0.as_std_path()
}
}
impl Deref for VendoredPathBuf {
type Target = VendoredPath;
fn deref(&self) -> &Self::Target {
self.as_path()
}
}
use crate::vendored::path::{VendoredPath, VendoredPathBuf};
/// Path to a file.
///