mirror of
https://github.com/astral-sh/ruff.git
synced 2025-10-01 14:21:53 +00:00
Improve ruff_parse_simple to find UTF-8 violations (#5008)
Improves the `ruff_parse_simple` fuzz harness by adding checks for parsed locations to ensure they all lie on UTF-8 character boundaries. This will allow for faster identification of issues like #5004. This also adds additional details for Apple M1 users and clarifies the importance of using `init-fuzzer.sh` (thanks for the feedback, @jasikpark 🙂).
This commit is contained in:
parent
9db622afe1
commit
70e6c212d9
6 changed files with 57 additions and 1976 deletions
|
@ -1,2 +1,3 @@
|
||||||
pub use crate::node::AstNode;
|
pub use crate::node::AstNode;
|
||||||
pub use rustpython_ast::*;
|
pub use rustpython_ast::*;
|
||||||
|
pub use rustpython_parser::*;
|
||||||
|
|
|
@ -183,7 +183,7 @@ impl<'a> Generator<'a> {
|
||||||
self.buffer
|
self.buffer
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn unparse_suite<U>(&mut self, suite: &Suite<U>) {
|
pub fn unparse_suite<U>(&mut self, suite: &Suite<U>) {
|
||||||
for stmt in suite {
|
for stmt in suite {
|
||||||
self.unparse_stmt(stmt);
|
self.unparse_stmt(stmt);
|
||||||
}
|
}
|
||||||
|
|
1
fuzz/.gitignore
vendored
1
fuzz/.gitignore
vendored
|
@ -1,2 +1,3 @@
|
||||||
artifacts/
|
artifacts/
|
||||||
corpus/ruff_fix_validity
|
corpus/ruff_fix_validity
|
||||||
|
Cargo.lock
|
||||||
|
|
1965
fuzz/Cargo.lock
generated
1965
fuzz/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -12,6 +12,8 @@ To use the fuzzers provided in this directory, start by invoking:
|
||||||
|
|
||||||
This will install [`cargo-fuzz`](https://github.com/rust-fuzz/cargo-fuzz) and optionally download a
|
This will install [`cargo-fuzz`](https://github.com/rust-fuzz/cargo-fuzz) and optionally download a
|
||||||
[dataset](https://zenodo.org/record/3628784) which improves the efficacy of the testing.
|
[dataset](https://zenodo.org/record/3628784) which improves the efficacy of the testing.
|
||||||
|
**This step is necessary for initialising the corpus directory, as all fuzzers share a common
|
||||||
|
corpus.**
|
||||||
The dataset may take several hours to download and clean, so if you're just looking to try out the
|
The dataset may take several hours to download and clean, so if you're just looking to try out the
|
||||||
fuzzers, skip the dataset download, though be warned that some features simply cannot be tested
|
fuzzers, skip the dataset download, though be warned that some features simply cannot be tested
|
||||||
without it (it is very unlikely that the fuzzer will generate valid Python code from "thin air").
|
without it (it is very unlikely that the fuzzer will generate valid Python code from "thin air").
|
||||||
|
@ -22,6 +24,8 @@ Once you have initialised the fuzzers, you can then execute any fuzzer with:
|
||||||
cargo fuzz run -s none name_of_fuzzer -- -timeout=1
|
cargo fuzz run -s none name_of_fuzzer -- -timeout=1
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Users of Apple M1 devices must use a nightly compiler and omit the `-s none` portion of this
|
||||||
|
command, as this architecture does not support fuzzing without a sanitizer.**
|
||||||
You can view the names of the available fuzzers with `cargo fuzz list`.
|
You can view the names of the available fuzzers with `cargo fuzz list`.
|
||||||
For specific details about how each fuzzer works, please read this document in its entirety.
|
For specific details about how each fuzzer works, please read this document in its entirety.
|
||||||
|
|
||||||
|
@ -74,6 +78,8 @@ itself, each harness is briefly described below.
|
||||||
|
|
||||||
This fuzz harness does not perform any "smart" testing of Ruff; it merely checks that the parsing
|
This fuzz harness does not perform any "smart" testing of Ruff; it merely checks that the parsing
|
||||||
and unparsing of a particular input (what would normally be a source code file) does not crash.
|
and unparsing of a particular input (what would normally be a source code file) does not crash.
|
||||||
|
It also attempts to verify that the locations of tokens and errors identified do not fall in the
|
||||||
|
middle of a UTF-8 code point, which may cause downstream panics.
|
||||||
While this is unlikely to find any issues on its own, it executes very quickly and covers a large
|
While this is unlikely to find any issues on its own, it executes very quickly and covers a large
|
||||||
and diverse code region that may speed up the generation of inputs and therefore make a more
|
and diverse code region that may speed up the generation of inputs and therefore make a more
|
||||||
valuable corpus quickly.
|
valuable corpus quickly.
|
||||||
|
@ -95,11 +101,3 @@ This fuzz harness checks that fixes applied by Ruff do not introduce new errors
|
||||||
[`ruff::test::test_snippet`](../crates/ruff/src/test.rs) testing utility.
|
[`ruff::test::test_snippet`](../crates/ruff/src/test.rs) testing utility.
|
||||||
It is currently only configured to use default settings, but may be extended in future versions to
|
It is currently only configured to use default settings, but may be extended in future versions to
|
||||||
test non-default linter settings.
|
test non-default linter settings.
|
||||||
|
|
||||||
## Experimental settings
|
|
||||||
|
|
||||||
You can optionally use `--no-default-features --features libafl` to use the libafl fuzzer instead of
|
|
||||||
libfuzzer.
|
|
||||||
This fuzzer has experimental support, but can vastly improve fuzzer performance.
|
|
||||||
If you are not already familiar with [LibAFL](https://github.com/AFLplusplus/LibAFL), this mode is
|
|
||||||
not currently recommended.
|
|
||||||
|
|
|
@ -4,13 +4,59 @@
|
||||||
#![no_main]
|
#![no_main]
|
||||||
|
|
||||||
use libfuzzer_sys::{fuzz_target, Corpus};
|
use libfuzzer_sys::{fuzz_target, Corpus};
|
||||||
use ruff_python_ast::source_code::round_trip;
|
use ruff_python_ast::prelude::{lexer, Mode, Parse, ParseError, Suite};
|
||||||
|
use ruff_python_ast::source_code::{Generator, Locator, Stylist};
|
||||||
|
|
||||||
fn do_fuzz(case: &[u8]) -> Corpus {
|
fn do_fuzz(case: &[u8]) -> Corpus {
|
||||||
let Ok(code) = std::str::from_utf8(case) else { return Corpus::Reject; };
|
let Ok(code) = std::str::from_utf8(case) else { return Corpus::Reject; };
|
||||||
|
|
||||||
// just round-trip it once to trigger both parse and unparse
|
// just round-trip it once to trigger both parse and unparse
|
||||||
let _ = round_trip(code, "fuzzed-source.py");
|
let locator = Locator::new(code);
|
||||||
|
let python_ast = match Suite::parse(code, "fuzzed-source.py") {
|
||||||
|
Ok(stmts) => stmts,
|
||||||
|
Err(ParseError { offset, .. }) => {
|
||||||
|
let offset = offset.to_usize();
|
||||||
|
assert!(
|
||||||
|
code.is_char_boundary(offset),
|
||||||
|
"Invalid error location {} (not at char boundary)",
|
||||||
|
offset
|
||||||
|
);
|
||||||
|
return Corpus::Keep;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let tokens: Vec<_> = lexer::lex(code, Mode::Module).collect();
|
||||||
|
|
||||||
|
for maybe_token in tokens.iter() {
|
||||||
|
match maybe_token.as_ref() {
|
||||||
|
Ok((_, range)) => {
|
||||||
|
let start = range.start().to_usize();
|
||||||
|
let end = range.end().to_usize();
|
||||||
|
assert!(
|
||||||
|
code.is_char_boundary(start),
|
||||||
|
"Invalid start position {} (not at char boundary)",
|
||||||
|
start
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
code.is_char_boundary(end),
|
||||||
|
"Invalid end position {} (not at char boundary)",
|
||||||
|
end
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
let offset = err.location.to_usize();
|
||||||
|
assert!(
|
||||||
|
code.is_char_boundary(offset),
|
||||||
|
"Invalid error location {} (not at char boundary)",
|
||||||
|
offset
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let stylist = Stylist::from_tokens(&tokens, &locator);
|
||||||
|
let mut generator: Generator = (&stylist).into();
|
||||||
|
generator.unparse_suite(&python_ast);
|
||||||
|
|
||||||
Corpus::Keep
|
Corpus::Keep
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue