[red-knot] Hand-written MDTest parser (#15926)

## Summary

Replaces our existing Markdown test parser with a fully hand-written
parser. I tried to fix this bug using the old approach and kept running
into problems. Eventually this seemed like the easier way. It's more
code (+50 lines, excluding the new test), but I hope it's relatively
straightforward to understand, compared to the complex interplay between
the byte-stream-manipulation and regex-parsing that we had before.

I did not really focus on performance, as the parsing time does not
dominate the test execution time, but this seems to be slightly faster
than what we had before (executing all MD tests; debug):

| Command | Mean [s] | Min [s] | Max [s] | Relative |
|:---|---:|---:|---:|---:|
| this branch | 2.775 ± 0.072 | 2.690 | 2.877 | 1.00 |
| `main` | 2.921 ± 0.034 | 2.865 | 2.967 | 1.05 ± 0.03 |

closes #15923

## Test Plan

One new regression test.
This commit is contained in:
David Peter 2025-02-04 14:01:53 +01:00 committed by GitHub
parent 15dd3b5ebd
commit 9a33924a65
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 207 additions and 95 deletions

View file

@ -1,8 +1,4 @@
use std::sync::LazyLock;
use anyhow::bail; use anyhow::bail;
use memchr::memchr2;
use regex::{Captures, Match, Regex};
use rustc_hash::FxHashSet; use rustc_hash::FxHashSet;
use ruff_index::{newtype_index, IndexVec}; use ruff_index::{newtype_index, IndexVec};
@ -155,27 +151,6 @@ pub(crate) struct EmbeddedFile<'s> {
pub(crate) md_offset: TextSize, pub(crate) md_offset: TextSize,
} }
/// Matches a sequence of `#` characters, followed by a title heading, followed by a newline.
static HEADER_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^(?<level>#+)[^\S\n]+(?<title>.+)[^\S\n]*\n").unwrap());
/// Matches a code block fenced by triple backticks, possibly with language and `key=val`
/// configuration items following the opening backticks (in the "tag string" of the code block).
static CODE_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"(?x)
^(?:
`(?<path>[^`\n]+)`[^\S\n]*:[^\S\n]*\n
\n?
)?
```(?<lang>(?-u:\w)+)?\x20*(?<config>\S.*)?\n
(?<code>[\s\S]*?)\n?
(?<end>```\n?|\z)
",
)
.unwrap()
});
#[derive(Debug)] #[derive(Debug)]
struct SectionStack(Vec<SectionId>); struct SectionStack(Vec<SectionId>);
@ -219,6 +194,12 @@ struct Parser<'s> {
/// The unparsed remainder of the Markdown source. /// The unparsed remainder of the Markdown source.
cursor: Cursor<'s>, cursor: Cursor<'s>,
// Number of consecutive empty lines.
preceding_blank_lines: usize,
// Explicitly specified path for the upcoming code block.
explicit_path: Option<&'s str>,
source: &'s str, source: &'s str,
source_len: TextSize, source_len: TextSize,
@ -247,6 +228,8 @@ impl<'s> Parser<'s> {
files: IndexVec::default(), files: IndexVec::default(),
unnamed_file_count: 0, unnamed_file_count: 0,
cursor: Cursor::new(source), cursor: Cursor::new(source),
preceding_blank_lines: 0,
explicit_path: None,
source_len: source.text_len(), source_len: source.text_len(),
stack: SectionStack::new(root_section_id), stack: SectionStack::new(root_section_id),
current_section_files: None, current_section_files: None,
@ -269,39 +252,132 @@ impl<'s> Parser<'s> {
} }
} }
fn parse_impl(&mut self) -> anyhow::Result<()> { fn skip_whitespace(&mut self) {
while let Some(position) = memchr2(b'`', b'#', self.cursor.as_bytes()) { self.cursor.eat_while(|c| c.is_whitespace() && c != '\n');
self.cursor.skip_bytes(position.saturating_sub(2)); }
// Code blocks and headers must start on a new line fn skip_to_beginning_of_next_line(&mut self) -> bool {
// and preceded by at least one blank line. if let Some(position) = memchr::memchr(b'\n', self.cursor.as_bytes()) {
if position == 0 && self.cursor.first() == '#' self.cursor.skip_bytes(position + 1);
|| position == 1 && self.cursor.eat_char('\n') true
|| self.cursor.eat_char('\n') && self.cursor.eat_char('\n') } else {
{ false
match self.cursor.first() { }
'#' => { }
if let Some(find) = HEADER_RE.find(self.cursor.as_str()) {
self.parse_header(find.as_str())?; fn consume_until(&mut self, mut end_predicate: impl FnMut(char) -> bool) -> Option<&'s str> {
self.cursor.skip_bytes(find.len()); let start = self.offset().to_usize();
continue;
while !self.cursor.is_eof() {
if end_predicate(self.cursor.first()) {
return Some(&self.source[start..self.offset().to_usize()]);
}
self.cursor.bump();
}
None
}
fn parse_impl(&mut self) -> anyhow::Result<()> {
const CODE_BLOCK_END: &[u8] = b"```";
while let Some(first) = self.cursor.bump() {
match first {
'#' => {
self.explicit_path = None;
self.preceding_blank_lines = 0;
// Determine header level (number of '#' characters)
let mut header_level = 1;
while self.cursor.eat_char('#') {
header_level += 1;
}
// Parse header title
if let Some(title) = self.consume_until(|c| c == '\n') {
let title = title.trim();
if !title.is_empty() {
self.process_header(header_level, title)?;
} }
} }
'`' => { }
if let Some(captures) = CODE_RE.captures(self.cursor.as_str()) { '`' => {
self.parse_code_block(&captures)?; if self.cursor.eat_char2('`', '`') {
self.cursor.skip_bytes(captures.get(0).unwrap().len()); // We saw the triple-backtick beginning of a code block.
continue;
if self.preceding_blank_lines < 1 && self.explicit_path.is_none() {
bail!("Code blocks must start on a new line and be preceded by at least one blank line.");
}
self.skip_whitespace();
// Parse the code block language specifier
let lang = self
.consume_until(|c| matches!(c, ' ' | '\n'))
.unwrap_or_default();
self.skip_whitespace();
if !self.cursor.eat_char('\n') {
bail!("Trailing code-block metadata is not supported. Only the code block language can be specified.");
}
if let Some(position) =
memchr::memmem::find(self.cursor.as_bytes(), CODE_BLOCK_END)
{
let mut code = &self.cursor.as_str()[..position];
self.cursor.skip_bytes(position + CODE_BLOCK_END.len());
if code.ends_with('\n') {
code = &code[..code.len() - '\n'.len_utf8()];
}
self.process_code_block(lang, code)?;
} else {
let code_block_start = self.cursor.token_len();
let line = self.source.count_lines(TextRange::up_to(code_block_start));
bail!("Unterminated code block at line {line}.");
}
self.explicit_path = None;
} else if self.preceding_blank_lines > 0 {
// This could be a line that specifies an explicit path for a Markdown code block (`module.py`:)
self.explicit_path = None;
if let Some(path) = self.consume_until(|c| matches!(c, '`' | '\n')) {
if self.cursor.eat_char('`') {
self.skip_whitespace();
if self.cursor.eat_char(':') {
self.explicit_path = Some(path);
}
}
}
}
self.preceding_blank_lines = 0;
}
'\n' => {
self.preceding_blank_lines += 1;
continue;
}
c => {
self.preceding_blank_lines = 0;
self.explicit_path = None;
if c.is_whitespace() {
self.skip_whitespace();
if self.cursor.eat_char('`')
&& self.cursor.eat_char('`')
&& self.cursor.eat_char('`')
{
bail!("Indented code blocks are not supported.");
} }
} }
_ => unreachable!(),
} }
} }
// Skip to the end of the line if !self.skip_to_beginning_of_next_line() {
if let Some(position) = memchr::memchr(b'\n', self.cursor.as_bytes()) {
self.cursor.skip_bytes(position + 1);
} else {
break; break;
} }
} }
@ -309,17 +385,7 @@ impl<'s> Parser<'s> {
Ok(()) Ok(())
} }
fn parse_header(&mut self, header: &'s str) -> anyhow::Result<()> { fn process_header(&mut self, header_level: usize, title: &'s str) -> anyhow::Result<()> {
let mut trimmed = header.trim();
let mut header_level = 0usize;
while let Some(rest) = trimmed.strip_prefix('#') {
header_level += 1;
trimmed = rest;
}
let title = trimmed.trim_start();
self.pop_sections_to_level(header_level); self.pop_sections_to_level(header_level);
let parent = self.stack.top(); let parent = self.stack.top();
@ -332,11 +398,11 @@ impl<'s> Parser<'s> {
}; };
if self.current_section_files.is_some() { if self.current_section_files.is_some() {
return Err(anyhow::anyhow!( bail!(
"Header '{}' not valid inside a test case; parent '{}' has code files.", "Header '{}' not valid inside a test case; parent '{}' has code files.",
section.title, section.title,
self.sections[parent].title, self.sections[parent].title,
)); );
} }
let section_id = self.sections.push(section); let section_id = self.sections.push(section);
@ -348,48 +414,27 @@ impl<'s> Parser<'s> {
Ok(()) Ok(())
} }
fn parse_code_block(&mut self, captures: &Captures<'s>) -> anyhow::Result<()> { fn process_code_block(&mut self, lang: &'s str, code: &'s str) -> anyhow::Result<()> {
// We never pop the implicit root section. // We never pop the implicit root section.
let section = self.stack.top(); let section = self.stack.top();
if captures.name("end").unwrap().is_empty() {
let code_block_start = self.cursor.token_len();
let line = self.source.count_lines(TextRange::up_to(code_block_start)) + 1;
return Err(anyhow::anyhow!("Unterminated code block at line {line}."));
}
if captures.name("config").is_some() {
return Err(anyhow::anyhow!("Trailing code-block metadata is not supported. Only the code block language can be specified."));
}
// CODE_RE can't match without matches for 'lang' and 'code'.
let lang = captures
.name("lang")
.as_ref()
.map(Match::as_str)
.unwrap_or_default();
let code = captures.name("code").unwrap().into();
if lang == "toml" { if lang == "toml" {
return self.parse_config(code); return self.process_config_block(code);
} }
let explicit_path = captures.name("path").map(|it| it.as_str()); if let Some(explicit_path) = self.explicit_path {
if let Some(explicit_path) = explicit_path {
if !lang.is_empty() if !lang.is_empty()
&& lang != "text" && lang != "text"
&& explicit_path.contains('.') && explicit_path.contains('.')
&& !explicit_path.ends_with(&format!(".{lang}")) && !explicit_path.ends_with(&format!(".{lang}"))
{ {
return Err(anyhow::anyhow!( bail!(
"File ending of test file path `{explicit_path}` does not match `lang={lang}` of code block" "File ending of test file path `{explicit_path}` does not match `lang={lang}` of code block"
)); );
} }
} }
let path = match explicit_path { let path = match self.explicit_path {
Some(path) => path.to_string(), Some(path) => path.to_string(),
None => { None => {
self.unnamed_file_count += 1; self.unnamed_file_count += 1;
@ -398,10 +443,10 @@ impl<'s> Parser<'s> {
"py" | "pyi" => format!("mdtest_snippet__{}.{lang}", self.unnamed_file_count), "py" | "pyi" => format!("mdtest_snippet__{}.{lang}", self.unnamed_file_count),
"" => format!("mdtest_snippet__{}.py", self.unnamed_file_count), "" => format!("mdtest_snippet__{}.py", self.unnamed_file_count),
_ => { _ => {
return Err(anyhow::anyhow!( bail!(
"Cannot generate name for `lang={}`: Unsupported extension", "Cannot generate name for `lang={}`: Unsupported extension",
lang lang
)) );
} }
} }
} }
@ -417,10 +462,10 @@ impl<'s> Parser<'s> {
if let Some(current_files) = &mut self.current_section_files { if let Some(current_files) = &mut self.current_section_files {
if !current_files.insert(path.clone()) { if !current_files.insert(path.clone()) {
return Err(anyhow::anyhow!( bail!(
"Test `{}` has duplicate files named `{path}`.", "Test `{}` has duplicate files named `{path}`.",
self.sections[section].title self.sections[section].title
)); );
}; };
} else { } else {
self.current_section_files = Some(FxHashSet::from_iter([path])); self.current_section_files = Some(FxHashSet::from_iter([path]));
@ -429,7 +474,7 @@ impl<'s> Parser<'s> {
Ok(()) Ok(())
} }
fn parse_config(&mut self, code: &str) -> anyhow::Result<()> { fn process_config_block(&mut self, code: &str) -> anyhow::Result<()> {
if self.current_section_has_config { if self.current_section_has_config {
bail!("Multiple TOML configuration blocks in the same section are not allowed."); bail!("Multiple TOML configuration blocks in the same section are not allowed.");
} }
@ -846,6 +891,42 @@ mod tests {
assert_eq!(err.to_string(), "Unterminated code block at line 10."); assert_eq!(err.to_string(), "Unterminated code block at line 10.");
} }
#[test]
fn header_start_at_beginning_of_line() {
let source = dedent(
"
# A test
# not a header
```py
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
assert_eq!(test.name(), "file.md - A test");
}
#[test]
fn code_blocks_must_not_be_indented() {
let source = dedent(
"
# A test?
```py
x = 1
```
",
);
super::parse("file.md", &source).expect_err("Indented code blocks are not supported.");
}
#[test] #[test]
fn no_header_inside_test() { fn no_header_inside_test() {
let source = dedent( let source = dedent(
@ -1170,6 +1251,25 @@ mod tests {
assert_eq!(file.code, "x = 1"); assert_eq!(file.code, "x = 1");
} }
#[test]
fn no_newline_between_prose_and_code() {
// Regression test for https://github.com/astral-sh/ruff/issues/15923
let source = dedent(
"
Some code:
No newline between prose and code:
```py
# A syntax error:
§
```
",
);
super::parse("file.md", &source).expect_err(
"Code blocks must start on a new line and be preceded by at least one blank line.",
);
}
#[test] #[test]
fn config_no_longer_allowed() { fn config_no_longer_allowed() {
let source = dedent( let source = dedent(

View file

@ -92,6 +92,18 @@ impl<'a> Cursor<'a> {
} }
} }
/// Eats the next two characters if they are `c1` and `c2`. Does not
/// consume any input otherwise, even if the first character matches.
pub fn eat_char2(&mut self, c1: char, c2: char) -> bool {
if self.first() == c1 && self.second() == c2 {
self.bump();
self.bump();
true
} else {
false
}
}
pub fn eat_char_back(&mut self, c: char) -> bool { pub fn eat_char_back(&mut self, c: char) -> bool {
if self.last() == c { if self.last() == c {
self.bump_back(); self.bump_back();