From 3ce145c47669645c9031ec42ef9b339835e548c9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 15 Dec 2023 08:19:35 -0500 Subject: [PATCH] release: switch to Cargo's default (#9031) This sets `lto = "thin"` instead of using "fat" LTO, and sets `codegen-units = 16`. These are the defaults for Cargo's `release` profile, and I think it may give us faster iteration times, especially when benchmarking. The point of this PR is to see what kind of impact this has on benchmarks. It is expected that benchmarks may regress to some extent. I did some quick ad hoc experiments to quantify this change in compile times. Namely, I ran: cargo build --profile release -p ruff_cli Then I ran touch crates/ruff_python_formatter/src/expression/string/docstring.rs (because that's where I've been working lately) and re-ran cargo build --profile release -p ruff_cli This last command is what I timed, since it reflects how much time one has to wait between making a change and getting a compiled artifact. Here are my results: * With the status quo `release` profile, build takes 77s * With `release` but `lto = "thin"`, build takes 41s * With `release`, but `lto = false`, build takes 19s * With `release`, but `lto = false` **and** `codegen-units = 16`, build takes 7s * With `release`, but `lto = "thin"` **and** `codegen-units = 16`, build takes 16s (I believe this is the default `release` configuration) This PR represents the last option. It's not the fastest to compile, but it's nearly a whole minute faster! The idea is that with `codegen-units = 16`, we still make use of parallelism, but keep _some_ level of LTO on to try and regain what we lose by increasing the number of codegen units. 
--- CONTRIBUTING.md | 10 ++++----- Cargo.toml | 21 +++++++++++++++---- crates/ruff_python_parser/src/lexer/cursor.rs | 1 + 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2d4ecb9b44..d79da233eb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -556,10 +556,10 @@ examples. #### Linux -Install `perf` and build `ruff_benchmark` with the `release-debug` profile and then run it with perf +Install `perf` and build `ruff_benchmark` with the `profiling` profile and then run it with perf ```shell -cargo bench -p ruff_benchmark --no-run --profile=release-debug && perf record --call-graph dwarf -F 9999 cargo bench -p ruff_benchmark --profile=release-debug -- --profile-time=1 +cargo bench -p ruff_benchmark --no-run --profile=profiling && perf record --call-graph dwarf -F 9999 cargo bench -p ruff_benchmark --profile=profiling -- --profile-time=1 ``` You can also use the `ruff_dev` launcher to run `ruff check` multiple times on a repository to @@ -567,8 +567,8 @@ gather enough samples for a good flamegraph (change the 999, the sample rate, an of checks, to your liking) ```shell -cargo build --bin ruff_dev --profile=release-debug -perf record -g -F 999 target/release-debug/ruff_dev repeat --repeat 30 --exit-zero --no-cache path/to/cpython > /dev/null +cargo build --bin ruff_dev --profile=profiling +perf record -g -F 999 target/profiling/ruff_dev repeat --repeat 30 --exit-zero --no-cache path/to/cpython > /dev/null ``` Then convert the recorded profile @@ -598,7 +598,7 @@ cargo install cargo-instruments Then run the profiler with ```shell -cargo instruments -t time --bench linter --profile release-debug -p ruff_benchmark -- --profile-time=1 +cargo instruments -t time --bench linter --profile profiling -p ruff_benchmark -- --profile-time=1 ``` - `-t`: Specifies what to profile. 
Useful options are `time` to profile the wall time and `alloc` diff --git a/Cargo.toml b/Cargo.toml index 304162dba6..aad3c66446 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -88,7 +88,20 @@ rc_mutex = "warn" rest_pat_in_fully_bound_structs = "warn" [profile.release] -lto = "fat" +# Note that we set these explicitly, and these values +# were chosen based on a trade-off between compile times +# and runtime performance[1]. +# +# [1]: https://github.com/astral-sh/ruff/pull/9031 +lto = "thin" +codegen-units = 16 + +# Some crates don't change as much but benefit more from +# more expensive optimization passes, so we selectively +# decrease codegen-units in some cases. +[profile.release.package.ruff_python_parser] +codegen-units = 1 +[profile.release.package.ruff_python_ast] codegen-units = 1 [profile.dev.package.insta] @@ -102,8 +115,8 @@ opt-level = 3 [profile.dev.package.ruff_python_parser] opt-level = 1 -# Use the `--profile release-debug` flag to show symbols in release mode. -# e.g. `cargo build --profile release-debug` -[profile.release-debug] +# Use the `--profile profiling` flag to show symbols in release mode. +# e.g. `cargo build --profile profiling` +[profile.profiling] inherits = "release" debug = 1 diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs index 91c7d30c53..26f3bb8a5b 100644 --- a/crates/ruff_python_parser/src/lexer/cursor.rs +++ b/crates/ruff_python_parser/src/lexer/cursor.rs @@ -120,6 +120,7 @@ impl<'a> Cursor<'a> { } /// Eats symbols while predicate returns true or until the end of file is reached. + #[inline] pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { // It was tried making optimized version of this for eg. line comments, but // LLVM can inline all of this and compile it down to fast iteration over bytes.