mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-04 18:58:04 +00:00
Improve ruff_parse_simple to find UTF-8 violations (#5008)
Improves the `ruff_parse_simple` fuzz harness by adding checks for parsed locations to ensure they all lie on UTF-8 character boundaries. This will allow for faster identification of issues like #5004. This also adds additional details for Apple M1 users and clarifies the importance of using `init-fuzzer.sh` (thanks for the feedback, @jasikpark 🙂).
This commit is contained in:
parent
9db622afe1
commit
70e6c212d9
6 changed files with 57 additions and 1976 deletions
|
@ -1,2 +1,3 @@
|
|||
pub use crate::node::AstNode;
|
||||
pub use rustpython_ast::*;
|
||||
pub use rustpython_parser::*;
|
||||
|
|
|
@ -183,7 +183,7 @@ impl<'a> Generator<'a> {
|
|||
self.buffer
|
||||
}
|
||||
|
||||
pub(crate) fn unparse_suite<U>(&mut self, suite: &Suite<U>) {
|
||||
pub fn unparse_suite<U>(&mut self, suite: &Suite<U>) {
|
||||
for stmt in suite {
|
||||
self.unparse_stmt(stmt);
|
||||
}
|
||||
|
|
1
fuzz/.gitignore
vendored
1
fuzz/.gitignore
vendored
|
@ -1,2 +1,3 @@
|
|||
artifacts/
|
||||
corpus/ruff_fix_validity
|
||||
Cargo.lock
|
||||
|
|
1965
fuzz/Cargo.lock
generated
1965
fuzz/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -12,6 +12,8 @@ To use the fuzzers provided in this directory, start by invoking:
|
|||
|
||||
This will install [`cargo-fuzz`](https://github.com/rust-fuzz/cargo-fuzz) and optionally download a
|
||||
[dataset](https://zenodo.org/record/3628784) which improves the efficacy of the testing.
|
||||
**This step is necessary for initialising the corpus directory, as all fuzzers share a common
|
||||
corpus.**
|
||||
The dataset may take several hours to download and clean, so if you're just looking to try out the
|
||||
fuzzers, skip the dataset download, though be warned that some features simply cannot be tested
|
||||
without it (it is very unlikely for the fuzzer to generate valid Python code from "thin air").
|
||||
|
@ -22,6 +24,8 @@ Once you have initialised the fuzzers, you can then execute any fuzzer with:
|
|||
cargo fuzz run -s none name_of_fuzzer -- -timeout=1
|
||||
```
|
||||
|
||||
**Users on Apple M1 devices must use a nightly compiler and omit the `-s none` portion of this
|
||||
command, as this architecture does not support fuzzing without a sanitizer.**
|
||||
You can view the names of the available fuzzers with `cargo fuzz list`.
|
||||
For specific details about how each fuzzer works, please read this document in its entirety.
|
||||
|
||||
|
@ -74,6 +78,8 @@ itself, each harness is briefly described below.
|
|||
|
||||
This fuzz harness does not perform any "smart" testing of Ruff; it merely checks that the parsing
|
||||
and unparsing of a particular input (what would normally be a source code file) does not crash.
|
||||
It also attempts to verify that the locations of tokens and errors identified do not fall in the
|
||||
middle of a UTF-8 code point, which may cause downstream panics.
|
||||
While this is unlikely to find any issues on its own, it executes very quickly and covers a large
|
||||
and diverse code region that may speed up the generation of inputs and therefore make a more
|
||||
valuable corpus quickly.
|
||||
|
@ -95,11 +101,3 @@ This fuzz harness checks that fixes applied by Ruff do not introduce new errors
|
|||
[`ruff::test::test_snippet`](../crates/ruff/src/test.rs) testing utility.
|
||||
It is currently configured to use only the default settings, but may be extended in future versions to
|
||||
test non-default linter settings.
|
||||
|
||||
## Experimental settings
|
||||
|
||||
You can optionally use `--no-default-features --features libafl` to use the libafl fuzzer instead of
|
||||
libfuzzer.
|
||||
This fuzzer has experimental support, but can vastly improve fuzzer performance.
|
||||
If you are not already familiar with [LibAFL](https://github.com/AFLplusplus/LibAFL), this mode is
|
||||
not currently recommended.
|
||||
|
|
|
@ -4,13 +4,59 @@
|
|||
#![no_main]
|
||||
|
||||
use libfuzzer_sys::{fuzz_target, Corpus};
|
||||
use ruff_python_ast::source_code::round_trip;
|
||||
use ruff_python_ast::prelude::{lexer, Mode, Parse, ParseError, Suite};
|
||||
use ruff_python_ast::source_code::{Generator, Locator, Stylist};
|
||||
|
||||
fn do_fuzz(case: &[u8]) -> Corpus {
|
||||
let Ok(code) = std::str::from_utf8(case) else { return Corpus::Reject; };
|
||||
|
||||
// just round-trip it once to trigger both parse and unparse
|
||||
let _ = round_trip(code, "fuzzed-source.py");
|
||||
let locator = Locator::new(code);
|
||||
let python_ast = match Suite::parse(code, "fuzzed-source.py") {
|
||||
Ok(stmts) => stmts,
|
||||
Err(ParseError { offset, .. }) => {
|
||||
let offset = offset.to_usize();
|
||||
assert!(
|
||||
code.is_char_boundary(offset),
|
||||
"Invalid error location {} (not at char boundary)",
|
||||
offset
|
||||
);
|
||||
return Corpus::Keep;
|
||||
}
|
||||
};
|
||||
|
||||
let tokens: Vec<_> = lexer::lex(code, Mode::Module).collect();
|
||||
|
||||
for maybe_token in tokens.iter() {
|
||||
match maybe_token.as_ref() {
|
||||
Ok((_, range)) => {
|
||||
let start = range.start().to_usize();
|
||||
let end = range.end().to_usize();
|
||||
assert!(
|
||||
code.is_char_boundary(start),
|
||||
"Invalid start position {} (not at char boundary)",
|
||||
start
|
||||
);
|
||||
assert!(
|
||||
code.is_char_boundary(end),
|
||||
"Invalid end position {} (not at char boundary)",
|
||||
end
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
let offset = err.location.to_usize();
|
||||
assert!(
|
||||
code.is_char_boundary(offset),
|
||||
"Invalid error location {} (not at char boundary)",
|
||||
offset
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let stylist = Stylist::from_tokens(&tokens, &locator);
|
||||
let mut generator: Generator = (&stylist).into();
|
||||
generator.unparse_suite(&python_ast);
|
||||
|
||||
Corpus::Keep
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue