ruff/crates/ruff_python_ast/src/source_code/mod.rs
Thomas de Zeeuw e3c12764f8
Only use a single cache file per Python package (#5117)
## Summary

This changes the caching design from one cache file per source file, to
one cache file per package. This greatly reduces the amount of cache
files that are opened and written, while maintaining roughly the same
(combined) size as bincode is very compact.

Below are some very much not scientific performance tests. It uses
projects/sources to check:

* small.py: single, 31 bytes Python file with 2 errors.
* test.py: single, 43k Python file with 8 errors.
* fastapi: FastAPI repo, 1134 files checked, 0 errors.

Source   | Before # files | After # files | Before size | After size
-------|-------|-------|-------|-------
small.py | 1              | 1             | 20 K        | 20 K
test.py  | 1              | 1             | 60 K        | 60 K
fastapi  | 1134           | 518           | 4.5 M       | 2.3 M

One question that might come up is why fastapi still has 518 cache files
and not 1? That is because this is using the existing package
resolution, which sees examples, docs, etc. as separate from the "main"
source code (in the fastapi directory in the repo). In this future it
might be worth consider switching to a one cache file per repo strategy.

This new design is not perfect and does have a number of known issues.
First, like the old design it doesn't remove the cache for a source file
that has been (re)moved until `ruff clean` is called.

Second, this currently uses a large mutex around the mutation of the
package cache (e.g. inserting result). This could be (or become) a
bottleneck. It's future work to test and improve this (if needed).

Third, currently the packages and opened and stored in a sequential
loop, this could be done parallel. This is also future work.


## Test Plan

Run `ruff check` (with caching enabled) twice on any Python source code
and it should produce the same results.
2023-06-19 17:46:13 +02:00

266 lines
6.9 KiB
Rust

use std::cmp::Ordering;
use std::fmt::{Debug, Formatter};
use std::sync::Arc;
use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::{ast, lexer, Mode, Parse, ParseError};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
pub use comment_ranges::{CommentRanges, CommentRangesBuilder};
pub use generator::Generator;
pub use indexer::Indexer;
pub use locator::Locator;
pub use stylist::{Quote, Stylist};
pub use crate::source_code::line_index::{LineIndex, OneIndexed};
mod comment_ranges;
mod generator;
mod indexer;
mod line_index;
mod locator;
mod stylist;
/// Run round-trip source code generation on a given Python code.
pub fn round_trip(code: &str, source_path: &str) -> Result<String, ParseError> {
let locator = Locator::new(code);
let python_ast = ast::Suite::parse(code, source_path)?;
let tokens: Vec<_> = lexer::lex(code, Mode::Module).collect();
let stylist = Stylist::from_tokens(&tokens, &locator);
let mut generator: Generator = (&stylist).into();
generator.unparse_suite(&python_ast);
Ok(generator.generate())
}
/// Gives access to the source code of a file and allows mapping between [`TextSize`] and [`SourceLocation`].
#[derive(Debug)]
pub struct SourceCode<'src, 'index> {
text: &'src str,
index: &'index LineIndex,
}
impl<'src, 'index> SourceCode<'src, 'index> {
pub fn new(content: &'src str, index: &'index LineIndex) -> Self {
Self {
text: content,
index,
}
}
/// Computes the one indexed row and column numbers for `offset`.
#[inline]
pub fn source_location(&self, offset: TextSize) -> SourceLocation {
self.index.source_location(offset, self.text)
}
#[inline]
pub fn line_index(&self, offset: TextSize) -> OneIndexed {
self.index.line_index(offset)
}
/// Take the source code up to the given [`TextSize`].
#[inline]
pub fn up_to(&self, offset: TextSize) -> &'src str {
&self.text[TextRange::up_to(offset)]
}
/// Take the source code after the given [`TextSize`].
#[inline]
pub fn after(&self, offset: TextSize) -> &'src str {
&self.text[usize::from(offset)..]
}
/// Take the source code between the given [`TextRange`].
pub fn slice(&self, range: TextRange) -> &'src str {
&self.text[range]
}
pub fn line_start(&self, line: OneIndexed) -> TextSize {
self.index.line_start(line, self.text)
}
pub fn line_end(&self, line: OneIndexed) -> TextSize {
self.index.line_end(line, self.text)
}
pub fn line_range(&self, line: OneIndexed) -> TextRange {
self.index.line_range(line, self.text)
}
/// Returns the source text of the line with the given index
#[inline]
pub fn line_text(&self, index: OneIndexed) -> &'src str {
let range = self.index.line_range(index, self.text);
&self.text[range]
}
/// Returns the source text
pub fn text(&self) -> &'src str {
self.text
}
/// Returns the number of lines
#[inline]
pub fn line_count(&self) -> usize {
self.index.line_count()
}
}
impl PartialEq<Self> for SourceCode<'_, '_> {
fn eq(&self, other: &Self) -> bool {
self.text == other.text
}
}
impl Eq for SourceCode<'_, '_> {}
/// A Builder for constructing a [`SourceFile`]
pub struct SourceFileBuilder {
name: Box<str>,
code: Box<str>,
index: Option<LineIndex>,
}
impl SourceFileBuilder {
/// Creates a new builder for a file named `name`.
pub fn new<Name: Into<Box<str>>, Code: Into<Box<str>>>(name: Name, code: Code) -> Self {
Self {
name: name.into(),
code: code.into(),
index: None,
}
}
#[must_use]
pub fn line_index(mut self, index: LineIndex) -> Self {
self.index = Some(index);
self
}
pub fn set_line_index(&mut self, index: LineIndex) {
self.index = Some(index);
}
/// Consumes `self` and returns the [`SourceFile`].
pub fn finish(self) -> SourceFile {
let index = if let Some(index) = self.index {
once_cell::sync::OnceCell::with_value(index)
} else {
once_cell::sync::OnceCell::new()
};
SourceFile {
inner: Arc::new(SourceFileInner {
name: self.name,
code: self.code,
line_index: index,
}),
}
}
}
/// A source file that is identified by its name. Optionally stores the source code and [`LineIndex`].
///
/// Cloning a [`SourceFile`] is cheap, because it only requires bumping a reference count.
#[derive(Clone, Eq, PartialEq)]
pub struct SourceFile {
inner: Arc<SourceFileInner>,
}
impl Debug for SourceFile {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SourceFile")
.field("name", &self.name())
.field("code", &self.source_text())
.finish()
}
}
impl SourceFile {
/// Returns the name of the source file (filename).
#[inline]
pub fn name(&self) -> &str {
&self.inner.name
}
#[inline]
pub fn slice(&self, range: TextRange) -> &str {
&self.source_text()[range]
}
pub fn to_source_code(&self) -> SourceCode {
SourceCode {
text: self.source_text(),
index: self.index(),
}
}
fn index(&self) -> &LineIndex {
self.inner
.line_index
.get_or_init(|| LineIndex::from_source_text(self.source_text()))
}
/// Returns the source code.
#[inline]
pub fn source_text(&self) -> &str {
&self.inner.code
}
}
impl PartialOrd for SourceFile {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for SourceFile {
fn cmp(&self, other: &Self) -> Ordering {
// Short circuit if these are the same source files
if Arc::ptr_eq(&self.inner, &other.inner) {
Ordering::Equal
} else {
self.inner.name.cmp(&other.inner.name)
}
}
}
struct SourceFileInner {
name: Box<str>,
code: Box<str>,
line_index: once_cell::sync::OnceCell<LineIndex>,
}
impl PartialEq for SourceFileInner {
fn eq(&self, other: &Self) -> bool {
self.name == other.name && self.code == other.code
}
}
impl Eq for SourceFileInner {}
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct SourceLocation {
pub row: OneIndexed,
pub column: OneIndexed,
}
impl Default for SourceLocation {
fn default() -> Self {
Self {
row: OneIndexed::MIN,
column: OneIndexed::MIN,
}
}
}
impl Debug for SourceLocation {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SourceLocation")
.field("row", &self.row.get())
.field("column", &self.column.get())
.finish()
}
}