Improve ruff_parse_simple to find UTF-8 violations (#5008)

Improves the `ruff_parse_simple` fuzz harness by adding checks for
parsed locations to ensure they all lie on UTF-8 character boundaries.
This will allow for faster identification of issues like #5004.

This also adds additional details for Apple M1 users and clarifies the
importance of using `init-fuzzer.sh` (thanks for the feedback,
@jasikpark 🙂).
Addison Crump, 2023-06-12 18:10:23 +02:00 (committed by GitHub)
parent 9db622afe1
commit 70e6c212d9
6 changed files with 57 additions and 1976 deletions


@@ -1,2 +1,3 @@
pub use crate::node::AstNode;
pub use rustpython_ast::*;
pub use rustpython_parser::*;


@@ -183,7 +183,7 @@ impl<'a> Generator<'a> {
         self.buffer
     }

-    pub(crate) fn unparse_suite<U>(&mut self, suite: &Suite<U>) {
+    pub fn unparse_suite<U>(&mut self, suite: &Suite<U>) {
         for stmt in suite {
             self.unparse_stmt(stmt);
         }

fuzz/.gitignore vendored (1 change)

@@ -1,2 +1,3 @@
 artifacts/
 corpus/ruff_fix_validity
+Cargo.lock

fuzz/Cargo.lock generated (1965 changes)

File diff suppressed because it is too large


@@ -12,6 +12,8 @@ To use the fuzzers provided in this directory, start by invoking:
 This will install [`cargo-fuzz`](https://github.com/rust-fuzz/cargo-fuzz) and optionally download a
 [dataset](https://zenodo.org/record/3628784) which improves the efficacy of the testing.
+**This step is necessary for initialising the corpus directory, as all fuzzers share a common
+corpus.**
 The dataset may take several hours to download and clean, so if you're just looking to try out the
 fuzzers, skip the dataset download, though be warned that some features simply cannot be tested
 without it (very unlikely for the fuzzer to generate valid python code from "thin air").
@@ -22,6 +24,8 @@ Once you have initialised the fuzzers, you can then execute any fuzzer with:
 ```
 cargo fuzz run -s none name_of_fuzzer -- -timeout=1
 ```
+**Users using Apple M1 devices must use a nightly compiler and omit the `-s none` portion of this
+command, as this architecture does not support fuzzing without a sanitizer.**
 You can view the names of the available fuzzers with `cargo fuzz list`.
 For specific details about how each fuzzer works, please read this document in its entirety.
@@ -74,6 +78,8 @@ itself, each harness is briefly described below.
 This fuzz harness does not perform any "smart" testing of Ruff; it merely checks that the parsing
 and unparsing of a particular input (what would normally be a source code file) does not crash.
+It also attempts to verify that the locations of tokens and errors identified do not fall in the
+middle of a UTF-8 code point, which may cause downstream panics.
 While this is unlikely to find any issues on its own, it executes very quickly and covers a large
 and diverse code region that may speed up the generation of inputs and therefore make a more
 valuable corpus quickly.
@@ -95,11 +101,3 @@ This fuzz harness checks that fixes applied by Ruff do not introduce new errors
 [`ruff::test::test_snippet`](../crates/ruff/src/test.rs) testing utility.
 It currently is only configured to use default settings, but may be extended in future versions to
 test non-default linter settings.
-
-## Experimental settings
-
-You can optionally use `--no-default-features --features libafl` to use the libafl fuzzer instead of
-libfuzzer.
-This fuzzer has experimental support, but can vastly improve fuzzer performance.
-If you are not already familiar with [LibAFL](https://github.com/AFLplusplus/LibAFL), this mode is
-not currently recommended.
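The UTF-8 boundary property that `ruff_parse_simple` now asserts can be illustrated with the Rust standard library alone. This sketch is not part of the harness; the source string is an arbitrary example:

```rust
fn main() {
    // "é" occupies two bytes (0xC3 0xA9) in UTF-8.
    let code = "x = \"é\"";
    // Byte offset 5 is the start of 'é', so it is a valid char boundary...
    assert!(code.is_char_boundary(5));
    // ...but offset 6 falls inside the two-byte sequence.
    assert!(!code.is_char_boundary(6));
    // Slicing at a non-boundary offset, e.g. &code[..6], would panic --
    // exactly the class of downstream failure the harness guards against.
}
```

If a parser ever reported offset 6 as a token or error location here, any later code that slices the source at that offset would panic, which is why the harness asserts every reported location is a boundary.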


@@ -4,13 +4,59 @@
 #![no_main]

 use libfuzzer_sys::{fuzz_target, Corpus};
-use ruff_python_ast::source_code::round_trip;
+use ruff_python_ast::prelude::{lexer, Mode, Parse, ParseError, Suite};
+use ruff_python_ast::source_code::{Generator, Locator, Stylist};

 fn do_fuzz(case: &[u8]) -> Corpus {
     let Ok(code) = std::str::from_utf8(case) else { return Corpus::Reject; };
-    // just round-trip it once to trigger both parse and unparse
-    let _ = round_trip(code, "fuzzed-source.py");
+
+    let locator = Locator::new(code);
+    let python_ast = match Suite::parse(code, "fuzzed-source.py") {
+        Ok(stmts) => stmts,
+        Err(ParseError { offset, .. }) => {
+            let offset = offset.to_usize();
+            assert!(
+                code.is_char_boundary(offset),
+                "Invalid error location {} (not at char boundary)",
+                offset
+            );
+            return Corpus::Keep;
+        }
+    };
+
+    let tokens: Vec<_> = lexer::lex(code, Mode::Module).collect();
+
+    for maybe_token in tokens.iter() {
+        match maybe_token.as_ref() {
+            Ok((_, range)) => {
+                let start = range.start().to_usize();
+                let end = range.end().to_usize();
+                assert!(
+                    code.is_char_boundary(start),
+                    "Invalid start position {} (not at char boundary)",
+                    start
+                );
+                assert!(
+                    code.is_char_boundary(end),
+                    "Invalid end position {} (not at char boundary)",
+                    end
+                );
+            }
+            Err(err) => {
+                let offset = err.location.to_usize();
+                assert!(
+                    code.is_char_boundary(offset),
+                    "Invalid error location {} (not at char boundary)",
+                    offset
+                );
+            }
+        }
+    }
+
+    let stylist = Stylist::from_tokens(&tokens, &locator);
+    let mut generator: Generator = (&stylist).into();
+    generator.unparse_suite(&python_ast);

     Corpus::Keep
 }
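The shape of the per-token checks above can be exercised outside the fuzzing setup with plain byte ranges. The source string and the (start, end) offsets below are hypothetical stand-ins for what the lexer would report, chosen so the input starts with a multi-byte character:

```rust
fn main() {
    // Two-byte 'λ' at byte offsets 0..2, then "= 1" and a newline.
    let code = "λ = 1\n";
    // Hypothetical (start, end) byte ranges standing in for lexer token ranges.
    let ranges = [(0usize, 2usize), (3, 4), (5, 6)];
    for (start, end) in ranges {
        // The same assertions the harness applies to every token range.
        assert!(code.is_char_boundary(start), "start {start} splits a char");
        assert!(code.is_char_boundary(end), "end {end} splits a char");
        // With both offsets on boundaries, slicing cannot panic.
        let _text = &code[start..end];
    }
}
```

A range such as (0, 1) would trip the first assertion, since byte 1 sits inside the two-byte encoding of 'λ'.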