[red-knot] Enforce specifying paths for mdtest code blocks in a separate preceding line (#15890)

## Summary

Resolves #15695, rework of #15704.

This change modifies the Mdtests framework so that:

* Paths must now be specified in a separate preceding line:

	`````markdown
	`a.py`:

	```py
	x = 1
	```
	`````

If the path of a file conflicts with its `lang`, an error will be
thrown.

* Configs are no longer accepted. The pattern still take them into
account, however, to avoid "Unterminated code block" errors.
* Unnamed files are now assigned unique, `lang`-respecting paths
automatically.

Additionally, all legacy usages have been updated.

## Test Plan

Unit tests and Markdown tests.

---------

Co-authored-by: Carl Meyer <carl@astral.sh>
This commit is contained in:
InSync 2025-02-04 14:27:17 +07:00 committed by GitHub
parent 0529ad67d7
commit 11cfe2ea8a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
35 changed files with 967 additions and 286 deletions

View file

@ -3,7 +3,7 @@ use std::sync::LazyLock;
use anyhow::bail;
use memchr::memchr2;
use regex::{Captures, Match, Regex};
use rustc_hash::{FxHashMap, FxHashSet};
use rustc_hash::FxHashSet;
use ruff_index::{newtype_index, IndexVec};
use ruff_python_trivia::Cursor;
@ -147,7 +147,7 @@ struct EmbeddedFileId;
#[derive(Debug)]
pub(crate) struct EmbeddedFile<'s> {
section: SectionId,
pub(crate) path: &'s str,
pub(crate) path: String,
pub(crate) lang: &'s str,
pub(crate) code: &'s str,
@ -157,16 +157,20 @@ pub(crate) struct EmbeddedFile<'s> {
/// Matches a sequence of `#` characters, followed by a title heading, followed by a newline.
static HEADER_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^(?<level>#+)\s+(?<title>.+)\s*\n").unwrap());
LazyLock::new(|| Regex::new(r"^(?<level>#+)[^\S\n]+(?<title>.+)[^\S\n]*\n").unwrap());
/// Matches a code block fenced by triple backticks, possibly with language and `key=val`
/// configuration items following the opening backticks (in the "tag string" of the code block).
static CODE_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"(?x)
^```(?<lang>(?-u:\w)+)?(?<config>(?:\x20+\S+)*)\s*\n
(?<code>(?:.|\n)*?)\n?
(?<end>```|\z)
^(?:
`(?<path>[^`\n]+)`[^\S\n]*:[^\S\n]*\n
\n?
)?
```(?<lang>(?-u:\w)+)?\x20*(?<config>\S.*)?\n
(?<code>[\s\S]*?)\n?
(?<end>```\n?|\z)
",
)
.unwrap()
@ -210,6 +214,7 @@ struct Parser<'s> {
/// [`EmbeddedFile`]s of the final [`MarkdownTestSuite`].
files: IndexVec<EmbeddedFileId, EmbeddedFile<'s>>,
unnamed_file_count: usize,
/// The unparsed remainder of the Markdown source.
cursor: Cursor<'s>,
@ -221,7 +226,7 @@ struct Parser<'s> {
stack: SectionStack,
/// Names of embedded files in current active section.
current_section_files: Option<FxHashSet<&'s str>>,
current_section_files: Option<FxHashSet<String>>,
/// Whether or not the current section has a config block.
current_section_has_config: bool,
@ -240,6 +245,7 @@ impl<'s> Parser<'s> {
sections,
source,
files: IndexVec::default(),
unnamed_file_count: 0,
cursor: Cursor::new(source),
source_len: source.text_len(),
stack: SectionStack::new(root_section_id),
@ -265,10 +271,14 @@ impl<'s> Parser<'s> {
fn parse_impl(&mut self) -> anyhow::Result<()> {
while let Some(position) = memchr2(b'`', b'#', self.cursor.as_bytes()) {
self.cursor.skip_bytes(position.saturating_sub(1));
self.cursor.skip_bytes(position.saturating_sub(2));
// code blocks and headers must start on a new line.
if position == 0 || self.cursor.eat_char('\n') {
// Code blocks and headers must start on a new line
// and preceded by at least one blank line.
if position == 0 && self.cursor.first() == '#'
|| position == 1 && self.cursor.eat_char('\n')
|| self.cursor.eat_char('\n') && self.cursor.eat_char('\n')
{
match self.cursor.first() {
'#' => {
if let Some(find) = HEADER_RE.find(self.cursor.as_str()) {
@ -290,7 +300,7 @@ impl<'s> Parser<'s> {
// Skip to the end of the line
if let Some(position) = memchr::memchr(b'\n', self.cursor.as_bytes()) {
self.cursor.skip_bytes(position);
self.cursor.skip_bytes(position + 1);
} else {
break;
}
@ -349,26 +359,10 @@ impl<'s> Parser<'s> {
return Err(anyhow::anyhow!("Unterminated code block at line {line}."));
}
let mut config: FxHashMap<&'s str, &'s str> = FxHashMap::default();
if let Some(config_match) = captures.name("config") {
for item in config_match.as_str().split_whitespace() {
let mut parts = item.split('=');
let key = parts.next().unwrap();
let Some(val) = parts.next() else {
return Err(anyhow::anyhow!("Invalid config item `{}`.", item));
};
if parts.next().is_some() {
return Err(anyhow::anyhow!("Invalid config item `{}`.", item));
}
if config.insert(key, val).is_some() {
return Err(anyhow::anyhow!("Duplicate config item `{}`.", item));
}
}
if captures.name("config").is_some() {
return Err(anyhow::anyhow!("Trailing code-block metadata is not supported. Only the code block language can be specified."));
}
let path = config.get("path").copied().unwrap_or("test.py");
// CODE_RE can't match without matches for 'lang' and 'code'.
let lang = captures
.name("lang")
@ -381,26 +375,48 @@ impl<'s> Parser<'s> {
return self.parse_config(code);
}
let explicit_path = captures.name("path").map(|it| it.as_str());
if let Some(explicit_path) = explicit_path {
if !lang.is_empty()
&& lang != "text"
&& explicit_path.contains('.')
&& !explicit_path.ends_with(&format!(".{lang}"))
{
return Err(anyhow::anyhow!(
"File ending of test file path `{explicit_path}` does not match `lang={lang}` of code block"
));
}
}
let path = match explicit_path {
Some(path) => path.to_string(),
None => {
self.unnamed_file_count += 1;
match lang {
"py" | "pyi" => format!("mdtest_snippet__{}.{lang}", self.unnamed_file_count),
"" => format!("mdtest_snippet__{}.py", self.unnamed_file_count),
_ => {
return Err(anyhow::anyhow!(
"Cannot generate name for `lang={}`: Unsupported extension",
lang
))
}
}
}
};
self.files.push(EmbeddedFile {
path,
path: path.clone(),
section,
lang,
code,
md_offset: self.offset(),
});
if let Some(current_files) = &mut self.current_section_files {
if !current_files.insert(path) {
if path == "test.py" {
return Err(anyhow::anyhow!(
"Test `{}` has duplicate files named `{path}`. \
(This is the default filename; \
consider giving some files an explicit name with `path=...`.)",
self.sections[section].title
));
}
if !current_files.insert(path.clone()) {
return Err(anyhow::anyhow!(
"Test `{}` has duplicate files named `{path}`.",
self.sections[section].title
@ -473,7 +489,7 @@ mod tests {
panic!("expected one file");
};
assert_eq!(file.path, "test.py");
assert_eq!(file.path, "mdtest_snippet__1.py");
assert_eq!(file.lang, "py");
assert_eq!(file.code, "x = 1");
}
@ -498,7 +514,7 @@ mod tests {
panic!("expected one file");
};
assert_eq!(file.path, "test.py");
assert_eq!(file.path, "mdtest_snippet__1.py");
assert_eq!(file.lang, "py");
assert_eq!(file.code, "x = 1");
}
@ -518,22 +534,33 @@ mod tests {
```py
y = 2
```
# Three
```pyi
a: int
```
```pyi
b: str
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test1, test2] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected two tests");
let [test1, test2, test3] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected three tests");
};
assert_eq!(test1.name(), "file.md - One");
assert_eq!(test2.name(), "file.md - Two");
assert_eq!(test3.name(), "file.md - Three");
let [file] = test1.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "test.py");
assert_eq!(file.path, "mdtest_snippet__1.py");
assert_eq!(file.lang, "py");
assert_eq!(file.code, "x = 1");
@ -541,9 +568,21 @@ mod tests {
panic!("expected one file");
};
assert_eq!(file.path, "test.py");
assert_eq!(file.path, "mdtest_snippet__2.py");
assert_eq!(file.lang, "py");
assert_eq!(file.code, "y = 2");
let [file_1, file_2] = test3.files().collect::<Vec<_>>()[..] else {
panic!("expected two files");
};
assert_eq!(file_1.path, "mdtest_snippet__3.pyi");
assert_eq!(file_1.lang, "pyi");
assert_eq!(file_1.code, "a: int");
assert_eq!(file_2.path, "mdtest_snippet__4.pyi");
assert_eq!(file_2.lang, "pyi");
assert_eq!(file_2.code, "b: str");
}
#[test]
@ -552,11 +591,15 @@ mod tests {
"
# One
```py path=main.py
`main.py`:
```py
from foo import y
```
```py path=foo.py
`foo.py`:
```py
y = 2
```
@ -592,7 +635,7 @@ mod tests {
panic!("expected one file");
};
assert_eq!(file.path, "test.py");
assert_eq!(file.path, "mdtest_snippet__1.py");
assert_eq!(file.lang, "py");
assert_eq!(file.code, "y = 2");
}
@ -601,7 +644,9 @@ mod tests {
fn custom_file_path() {
let source = dedent(
"
```py path=foo.py
`foo.py`:
```py
x = 1
```
",
@ -685,6 +730,90 @@ mod tests {
assert_eq!(file.code, "x = 10");
}
#[test]
fn cannot_generate_name_for_lang() {
let source = dedent(
"
```json
{}
```
",
);
let err = super::parse("file.md", &source).expect_err("Should fail to parse");
assert_eq!(
err.to_string(),
"Cannot generate name for `lang=json`: Unsupported extension"
);
}
#[test]
fn mismatching_lang() {
let source = dedent(
"
`a.py`:
```pyi
x = 1
```
",
);
let err = super::parse("file.md", &source).expect_err("Should fail to parse");
assert_eq!(
err.to_string(),
"File ending of test file path `a.py` does not match `lang=pyi` of code block"
);
}
#[test]
fn files_with_no_extension_can_have_any_lang() {
let source = dedent(
"
`lorem`:
```foo
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "lorem");
assert_eq!(file.code, "x = 1");
}
#[test]
fn files_with_lang_text_can_have_any_paths() {
let source = dedent(
"
`lorem.yaml`:
```text
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "lorem.yaml");
assert_eq!(file.code, "x = 1");
}
#[test]
fn unterminated_code_block_1() {
let source = dedent(
@ -738,83 +867,319 @@ mod tests {
}
#[test]
fn invalid_config_item_no_equals() {
fn line_break_in_header_1() {
let source = dedent(
"
```py foo
#
Foo
```py
x = 1
```
",
);
let err = super::parse("file.md", &source).expect_err("Should fail to parse");
assert_eq!(err.to_string(), "Invalid config item `foo`.");
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(test.section.title, "file.md");
assert_eq!(file.path, "mdtest_snippet__1.py");
assert_eq!(file.code, "x = 1");
}
#[test]
fn invalid_config_item_too_many_equals() {
fn line_break_in_header_2() {
let source = dedent(
"
```py foo=bar=baz
x = 1
```
",
);
let err = super::parse("file.md", &source).expect_err("Should fail to parse");
assert_eq!(err.to_string(), "Invalid config item `foo=bar=baz`.");
}
# Foo
#[test]
fn invalid_config_item_duplicate() {
let source = dedent(
"
```py foo=bar foo=baz
##
Lorem
```py
x = 1
```
",
);
let err = super::parse("file.md", &source).expect_err("Should fail to parse");
assert_eq!(err.to_string(), "Duplicate config item `foo=baz`.");
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(test.section.title, "Foo");
assert_eq!(file.path, "mdtest_snippet__1.py");
assert_eq!(file.code, "x = 1");
}
#[test]
fn no_duplicate_name_files_in_test() {
let source = dedent(
"
`foo.py`:
```py
x = 1
```
`foo.py`:
```py
y = 2
```
",
);
let err = super::parse("file.md", &source).expect_err("Should fail to parse");
assert_eq!(
err.to_string(),
"Test `file.md` has duplicate files named `test.py`. \
(This is the default filename; consider giving some files an explicit name \
with `path=...`.)"
);
}
#[test]
fn no_duplicate_name_files_in_test_non_default() {
let source = dedent(
"
```py path=foo.py
x = 1
```
```py path=foo.py
y = 2
```
",
);
let err = super::parse("file.md", &source).expect_err("Should fail to parse");
assert_eq!(
err.to_string(),
"Test `file.md` has duplicate files named `foo.py`."
);
}
#[test]
fn no_duplicate_name_files_in_test_2() {
let source = dedent(
"
`mdtest_snippet__1.py`:
```py
x = 1
```
```py
y = 2
```
",
);
let err = super::parse("file.md", &source).expect_err("Should fail to parse");
assert_eq!(
err.to_string(),
"Test `file.md` has duplicate files named `mdtest_snippet__1.py`."
);
}
#[test]
fn separate_path() {
let source = dedent(
"
`foo.py`:
```py
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "foo.py");
assert_eq!(file.code, "x = 1");
}
#[test]
fn separate_path_whitespace_1() {
let source = dedent(
"
`foo.py` :
```py
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "foo.py");
assert_eq!(file.code, "x = 1");
}
#[test]
fn separate_path_whitespace_2() {
let source = dedent(
"
`foo.py`:
```py
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "foo.py");
assert_eq!(file.code, "x = 1");
}
#[test]
fn path_with_space() {
let source = dedent(
"
`foo bar.py`:
```py
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "foo bar.py");
assert_eq!(file.code, "x = 1");
}
#[test]
fn path_with_line_break() {
let source = dedent(
"
`foo
.py`:
```py
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "mdtest_snippet__1.py");
assert_eq!(file.code, "x = 1");
}
#[test]
fn path_with_backtick() {
let source = dedent(
"
`foo`bar.py`:
```py
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "mdtest_snippet__1.py");
assert_eq!(file.code, "x = 1");
}
#[test]
fn path_colon_on_next_line() {
let source = dedent(
"
`foo.py`
:
```py
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "mdtest_snippet__1.py");
assert_eq!(file.code, "x = 1");
}
#[test]
fn random_trailing_backtick_quoted() {
let source = dedent(
"
A long sentence that forces a line break
`int`:
```py
x = 1
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "mdtest_snippet__1.py");
assert_eq!(file.code, "x = 1");
}
#[test]
fn config_no_longer_allowed() {
let source = dedent(
"
```py foo=bar
x = 1
```
",
);
let err = super::parse("file.md", &source).expect_err("Should fail to parse");
assert_eq!(err.to_string(), "Trailing code-block metadata is not supported. Only the code block language can be specified.");
}
}