Detect and ignore Jupyter automagics (#8398)

## Summary

LangChain is attempting to use Ruff over their Jupyter notebooks
(https://github.com/langchain-ai/langchain/pull/12677/files), but
running into a bunch of syntax errors, the majority of which come from
our inability to recognize automagic.

If you run this in a cell:

```jupyter
pip install requests
```

Jupyter will automatically treat that as:

```jupyter
%pip install requests
```

We need to ignore cells that use these automagics, since the parser
doesn't understand them. (I guess we could support it in the parser, but
that seems much harder?). The good news is that AFAICT Jupyter doesn't
let you mix automagics with code, so by skipping these cells, we don't
miss out on analyzing any Python code.

## Test Plan

1. `cargo test`
2. Ran over LangChain and verified that there are no more errors
relating to `pip install` automagics.
This commit is contained in:
Charlie Marsh 2023-11-02 18:14:10 -07:00 committed by GitHub
parent 2ff1afb15c
commit f64c389654
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 154 additions and 6 deletions

View file

@ -0,0 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": ["pip install requests"]
}

View file

@ -0,0 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": ["x = 1\n", "pip install requests"]
}

View file

@ -0,0 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": ["pip install requests\n", "x = 1"]
}

View file

@ -0,0 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": ["pip install requests\n", "pip install requests"]
}

View file

@ -80,14 +80,126 @@ impl Cell {
// Ignore cells containing cell magic as they act on the entire cell
// as compared to line magic which acts on a single line.
!match source {
SourceValue::String(string) => string
.lines()
.any(|line| line.trim_start().starts_with("%%")),
SourceValue::StringArray(string_array) => string_array
.iter()
.any(|line| line.trim_start().starts_with("%%")),
SourceValue::String(string) => Self::is_magic_cell(string.lines()),
SourceValue::StringArray(string_array) => {
Self::is_magic_cell(string_array.iter().map(String::as_str))
}
}
}
/// Returns `true` if a cell should be ignored due to the use of cell magics.
fn is_magic_cell<'a>(lines: impl Iterator<Item = &'a str>) -> bool {
let mut lines = lines.peekable();
// Detect automatic line magics (automagic), which aren't supported by the parser. If a line
// magic uses automagic, Jupyter doesn't allow following it with non-magic lines anyway, so
// we aren't missing out on any valid Python code.
//
// For example, this is valid:
// ```jupyter
// cat /path/to/file
// cat /path/to/file
// ```
//
// But this is invalid:
// ```jupyter
// cat /path/to/file
// x = 1
// ```
//
// See: https://ipython.readthedocs.io/en/stable/interactive/magics.html
if lines
.peek()
.and_then(|line| line.split_whitespace().next())
.is_some_and(|token| {
matches!(
token,
"alias"
| "alias_magic"
| "autoawait"
| "autocall"
| "automagic"
| "bookmark"
| "cd"
| "code_wrap"
| "colors"
| "conda"
| "config"
| "debug"
| "dhist"
| "dirs"
| "doctest_mode"
| "edit"
| "env"
| "gui"
| "history"
| "killbgscripts"
| "load"
| "load_ext"
| "loadpy"
| "logoff"
| "logon"
| "logstart"
| "logstate"
| "logstop"
| "lsmagic"
| "macro"
| "magic"
| "mamba"
| "matplotlib"
| "micromamba"
| "notebook"
| "page"
| "pastebin"
| "pdb"
| "pdef"
| "pdoc"
| "pfile"
| "pinfo"
| "pinfo2"
| "pip"
| "popd"
| "pprint"
| "precision"
| "prun"
| "psearch"
| "psource"
| "pushd"
| "pwd"
| "pycat"
| "pylab"
| "quickref"
| "recall"
| "rehashx"
| "reload_ext"
| "rerun"
| "reset"
| "reset_selective"
| "run"
| "save"
| "sc"
| "set_env"
| "sx"
| "system"
| "tb"
| "time"
| "timeit"
| "unalias"
| "unload_ext"
| "who"
| "who_ls"
| "whos"
| "xdel"
| "xmode"
)
})
{
return true;
}
// Detect cell magics (which operate on multiple lines).
lines.any(|line| line.trim_start().starts_with("%%"))
}
}
/// An error that can occur while deserializing a Jupyter Notebook.
@ -481,6 +593,10 @@ mod tests {
#[test_case(Path::new("code_and_magic.json"), true; "code_and_magic")]
#[test_case(Path::new("only_code.json"), true; "only_code")]
#[test_case(Path::new("cell_magic.json"), false; "cell_magic")]
#[test_case(Path::new("automagic.json"), false; "automagic")]
#[test_case(Path::new("automagics.json"), false; "automagics")]
#[test_case(Path::new("automagic_before_code.json"), false; "automagic_before_code")]
#[test_case(Path::new("automagic_after_code.json"), true; "automagic_after_code")]
fn test_is_valid_code_cell(path: &Path, expected: bool) -> Result<()> {
/// Read a Jupyter cell from the `resources/test/fixtures/jupyter/cell` directory.
fn read_jupyter_cell(path: impl AsRef<Path>) -> Result<Cell> {