Refactor noqa directive parsing away from regex-based implementation (#5554)

## Summary

I'll write up a more detailed description tomorrow, but in short, this
PR removes our regex-based implementation in favor of "manual" parsing.

I tried a couple different implementations. In the benchmarks below:

- `Directive/Regex` is our implementation on `main`.
- `Directive/Find` just uses `text.find("noqa")`, which is insufficient,
since it doesn't cover case-insensitive variants like `NOQA` and doesn't
handle multiple `noqa` matches on a single line, like `# Here's a noqa
comment # noqa: F401`. But it serves as a baseline.
- `Directive/Memchr` uses three `memchr` iterative finders (one each for
`noqa`, `NOQA`, and `NoQA`).
- `Directive/AhoCorasick` is roughly the variant checked in here (all
three non-regex candidates are sketched below).
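
Roughly, the three non-regex candidates boil down to something like the
following. This is a simplified sketch (function names are illustrative, and
it relies only on the public `memchr::memmem` and `aho-corasick` APIs), not
the benchmark code itself; the benchmarked variants build their finders once
up front and walk every match rather than stopping at the first.

```rust
use aho_corasick::AhoCorasick;
use memchr::memmem;

/// `Directive/Find`: case-sensitive, first occurrence only.
fn find_noqa_naive(text: &str) -> Option<usize> {
    text.find("noqa")
}

/// `Directive/Memchr`: one substring finder per expected casing.
fn find_noqa_memchr(text: &str) -> Option<usize> {
    ["noqa", "NOQA", "NoQA"]
        .iter()
        .filter_map(|needle| memmem::find(text.as_bytes(), needle.as_bytes()))
        .min()
}

/// `Directive/AhoCorasick`: a single ASCII-case-insensitive automaton.
fn find_noqa_aho_corasick(text: &str) -> Option<usize> {
    let matcher = AhoCorasick::builder()
        .ascii_case_insensitive(true)
        .build(["noqa"])
        .unwrap();
    matcher.find(text).map(|m| m.start())
}
```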

The raw results:

```
Directive/Regex/# noqa: F401
                        time:   [273.69 ns 274.71 ns 276.03 ns]
                        change: [+1.4467% +1.8979% +2.4243%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 15 outliers among 100 measurements (15.00%)
  3 (3.00%) low mild
  8 (8.00%) high mild
  4 (4.00%) high severe
Directive/Find/# noqa: F401
                        time:   [66.972 ns 67.048 ns 67.132 ns]
                        change: [+2.8292% +2.9377% +3.0540%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 15 outliers among 100 measurements (15.00%)
  1 (1.00%) low severe
  3 (3.00%) low mild
  8 (8.00%) high mild
  3 (3.00%) high severe
Directive/AhoCorasick/# noqa: F401
                        time:   [76.922 ns 77.189 ns 77.536 ns]
                        change: [+0.4265% +0.6862% +0.9871%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 8 outliers among 100 measurements (8.00%)
  1 (1.00%) low mild
  3 (3.00%) high mild
  4 (4.00%) high severe
Directive/Memchr/# noqa: F401
                        time:   [62.627 ns 62.654 ns 62.679 ns]
                        change: [-0.1780% -0.0887% -0.0120%] (p = 0.03 < 0.05)
                        Change within noise threshold.
Found 11 outliers among 100 measurements (11.00%)
  1 (1.00%) low severe
  5 (5.00%) low mild
  3 (3.00%) high mild
  2 (2.00%) high severe
Directive/Regex/# noqa: F401, F841
                        time:   [321.83 ns 322.39 ns 322.93 ns]
                        change: [+8602.4% +8623.5% +8644.5%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 5 outliers among 100 measurements (5.00%)
  1 (1.00%) low severe
  2 (2.00%) low mild
  1 (1.00%) high mild
  1 (1.00%) high severe
Directive/Find/# noqa: F401, F841
                        time:   [78.618 ns 78.758 ns 78.896 ns]
                        change: [+1.6909% +1.8771% +2.0628%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 3 outliers among 100 measurements (3.00%)
  3 (3.00%) high mild
Directive/AhoCorasick/# noqa: F401, F841
                        time:   [87.739 ns 88.057 ns 88.468 ns]
                        change: [+0.1843% +0.4685% +0.7854%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 11 outliers among 100 measurements (11.00%)
  5 (5.00%) low mild
  3 (3.00%) high mild
  3 (3.00%) high severe
Directive/Memchr/# noqa: F401, F841
                        time:   [80.674 ns 80.774 ns 80.860 ns]
                        change: [-0.7343% -0.5633% -0.4031%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 14 outliers among 100 measurements (14.00%)
  4 (4.00%) low severe
  9 (9.00%) low mild
  1 (1.00%) high mild
Directive/Regex/# noqa  time:   [194.86 ns 195.93 ns 196.97 ns]
                        change: [+11973% +12039% +12103%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 6 outliers among 100 measurements (6.00%)
  5 (5.00%) low mild
  1 (1.00%) high mild
Directive/Find/# noqa   time:   [25.327 ns 25.354 ns 25.383 ns]
                        change: [+3.8524% +4.0267% +4.1845%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 9 outliers among 100 measurements (9.00%)
  6 (6.00%) high mild
  3 (3.00%) high severe
Directive/AhoCorasick/# noqa
                        time:   [34.267 ns 34.368 ns 34.481 ns]
                        change: [+0.5646% +0.8505% +1.1281%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 5 outliers among 100 measurements (5.00%)
  5 (5.00%) high mild
Directive/Memchr/# noqa time:   [21.770 ns 21.818 ns 21.874 ns]
                        change: [-0.0990% +0.1464% +0.4046%] (p = 0.26 > 0.05)
                        No change in performance detected.
Found 10 outliers among 100 measurements (10.00%)
  4 (4.00%) low mild
  4 (4.00%) high mild
  2 (2.00%) high severe
Directive/Regex/# type: ignore # noqa: E501
                        time:   [278.76 ns 279.69 ns 280.72 ns]
                        change: [+7449.4% +7469.8% +7490.5%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) low mild
  1 (1.00%) high mild
  1 (1.00%) high severe
Directive/Find/# type: ignore # noqa: E501
                        time:   [67.791 ns 67.976 ns 68.184 ns]
                        change: [+2.8321% +3.1735% +3.5418%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 6 outliers among 100 measurements (6.00%)
  5 (5.00%) high mild
  1 (1.00%) high severe
Directive/AhoCorasick/# type: ignore # noqa: E501
                        time:   [75.908 ns 76.055 ns 76.210 ns]
                        change: [+0.9269% +1.1427% +1.3955%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high severe
Directive/Memchr/# type: ignore # noqa: E501
                        time:   [72.549 ns 72.723 ns 72.957 ns]
                        change: [+1.5881% +1.9660% +2.3974%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 15 outliers among 100 measurements (15.00%)
  10 (10.00%) high mild
  5 (5.00%) high severe
Directive/Regex/# type: ignore # nosec
                        time:   [66.967 ns 67.075 ns 67.207 ns]
                        change: [+1713.0% +1715.8% +1718.9%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 10 outliers among 100 measurements (10.00%)
  1 (1.00%) low severe
  3 (3.00%) low mild
  2 (2.00%) high mild
  4 (4.00%) high severe
Directive/Find/# type: ignore # nosec
                        time:   [18.505 ns 18.548 ns 18.597 ns]
                        change: [+1.3520% +1.6976% +2.0333%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 4 outliers among 100 measurements (4.00%)
  4 (4.00%) high mild
Directive/AhoCorasick/# type: ignore # nosec
                        time:   [16.162 ns 16.206 ns 16.252 ns]
                        change: [+1.2919% +1.5587% +1.8430%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 4 outliers among 100 measurements (4.00%)
  3 (3.00%) high mild
  1 (1.00%) high severe
Directive/Memchr/# type: ignore # nosec
                        time:   [39.192 ns 39.233 ns 39.276 ns]
                        change: [+0.5164% +0.7456% +0.9790%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 13 outliers among 100 measurements (13.00%)
  2 (2.00%) low severe
  4 (4.00%) low mild
  3 (3.00%) high mild
  4 (4.00%) high severe
Directive/Regex/# some very long comment that # is interspersed with characters but # no directive
                        time:   [81.460 ns 81.578 ns 81.703 ns]
                        change: [+2093.3% +2098.8% +2104.2%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 4 outliers among 100 measurements (4.00%)
  2 (2.00%) low mild
  2 (2.00%) high mild
Directive/Find/# some very long comment that # is interspersed with characters but # no directive
                        time:   [26.284 ns 26.331 ns 26.387 ns]
                        change: [+0.7554% +1.1027% +1.3832%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 6 outliers among 100 measurements (6.00%)
  5 (5.00%) high mild
  1 (1.00%) high severe
Directive/AhoCorasick/# some very long comment that # is interspersed with characters but # no direc...
                        time:   [28.643 ns 28.714 ns 28.787 ns]
                        change: [+1.3774% +1.6780% +2.0028%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild
Directive/Memchr/# some very long comment that # is interspersed with characters but # no directive
                        time:   [55.766 ns 55.831 ns 55.897 ns]
                        change: [+1.5802% +1.7476% +1.9021%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) low mild
```
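
(These are Criterion microbenchmarks, hence the `time`/`change`/outlier
reporting. As a rough sketch of a benchmark with this shape, measuring just
the raw matcher rather than the crate-private `Directive::try_extract`:)

```rust
use std::hint::black_box;

use aho_corasick::AhoCorasick;
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};

fn bench_directive(c: &mut Criterion) {
    // Build the matcher once, as the crate does via a `Lazy` static.
    let matcher = AhoCorasick::builder()
        .ascii_case_insensitive(true)
        .build(["noqa"])
        .unwrap();
    let mut group = c.benchmark_group("Directive");
    for case in ["# noqa: F401", "# noqa", "# type: ignore # nosec"] {
        group.bench_function(BenchmarkId::new("AhoCorasick", case), |b| {
            b.iter(|| matcher.find(black_box(case)));
        });
    }
    group.finish();
}

criterion_group!(benches, bench_directive);
criterion_main!(benches);
```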

While memchr is faster than aho-corasick in some of the common cases
(like `# noqa: F401`), the latter is much faster (roughly 2x) when there
_isn't_ a match; see the last two cases. Since most comments _aren't_
`noqa` comments, this felt like the right tradeoff. Note that all of the
candidates are significantly faster than the regex version.

(I know I originally reported a 10x speedup, but I ended up improving
the regex version a bit in some prior PRs, so it got unintentionally
faster via some refactors.)

There's also one behavior change in here: we now allow variable
whitespace between the `#` and the `noqa`, e.g., `#noqa` (no space) in
addition to `# noqa`. Previously, we required exactly one space. This
closes #5177.
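
Concretely, each of the following now parses as a blanket `noqa` directive.
This is a test-style sketch (not part of the diff) against the
`Directive::try_extract` helper shown below:

```rust
#[test]
fn noqa_spacing_variants() {
    // Previously, only the exactly-one-space form `# noqa` was recognized.
    for source in ["# noqa", "#noqa", "#  noqa"] {
        assert!(Directive::try_extract(source, TextSize::default()).is_some());
    }
}
```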

Cargo.lock

@ -1811,6 +1811,7 @@ dependencies = [
name = "ruff"
version = "0.0.277"
dependencies = [
"aho-corasick 1.0.2",
"annotate-snippets 0.9.1",
"anyhow",
"bitflags 2.3.3",

crates/ruff/Cargo.toml

@ -27,6 +27,7 @@ ruff_rustpython = { path = "../ruff_rustpython" }
ruff_text_size = { workspace = true }
ruff_textwrap = { path = "../ruff_textwrap" }
aho-corasick = { version = "1.0.2" }
annotate-snippets = { version = "0.9.1", features = ["color"] }
anyhow = { workspace = true }
bitflags = { workspace = true }

crates/ruff/src/noqa.rs

@ -4,11 +4,11 @@ use std::fs;
use std::ops::Add;
use std::path::Path;
use aho_corasick::AhoCorasick;
use anyhow::Result;
use itertools::Itertools;
use log::warn;
use once_cell::sync::Lazy;
use regex::Regex;
use ruff_text_size::{TextLen, TextRange, TextSize};
use rustpython_parser::ast::Ranged;
@ -20,8 +20,10 @@ use crate::codes::NoqaCode;
use crate::registry::{AsRule, Rule, RuleSet};
use crate::rule_redirects::get_redirect_target;
static NOQA_LINE_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?P<noqa>(?i:# noqa)(?::\s?(?P<codes>[A-Z]+[0-9]+(?:[,\s]+[A-Z]+[0-9]+)*))?)")
static NOQA_MATCHER: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::builder()
.ascii_case_insensitive(true)
.build(["noqa"])
.unwrap()
});
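
(For illustration, not part of the diff: the `ascii_case_insensitive` flag is
what replaces the `(?i:...)` group from the old regex, so a single automaton
matches any casing of `noqa`.)

```rust
#[test]
fn matcher_matches_any_casing() {
    // Illustrative sketch mirroring the `NOQA_MATCHER` static above.
    let matcher = AhoCorasick::builder()
        .ascii_case_insensitive(true)
        .build(["noqa"])
        .unwrap();
    assert!(matcher.is_match("# noqa: F401"));
    assert!(matcher.is_match("# NOQA"));
    assert!(matcher.is_match("# NoQa: E501"));
    assert!(!matcher.is_match("# nolint"));
}
```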
@ -38,32 +40,104 @@ pub(crate) enum Directive<'a> {
impl<'a> Directive<'a> {
/// Extract the noqa `Directive` from a line of Python source code.
pub(crate) fn try_extract(text: &'a str, offset: TextSize) -> Option<Self> {
let caps = NOQA_LINE_REGEX.captures(text)?;
match (caps.name("noqa"), caps.name("codes")) {
(Some(noqa), Some(codes)) => {
let codes = codes
.as_str()
.split(|c: char| c.is_whitespace() || c == ',')
.map(str::trim)
.filter(|code| !code.is_empty())
.collect_vec();
if codes.is_empty() {
warn!("Expected rule codes on `noqa` directive: \"{text}\"");
}
let range = TextRange::at(
TextSize::try_from(noqa.start()).unwrap().add(offset),
noqa.as_str().text_len(),
);
Some(Self::Codes(Codes { range, codes }))
for mat in NOQA_MATCHER.find_iter(text) {
let noqa_literal_start = mat.start();
// Determine the start of the comment.
let mut comment_start = noqa_literal_start;
// Trim any whitespace between the `#` character and the `noqa` literal.
comment_start = text[..comment_start].trim_end().len();
// The next character has to be the `#` character.
if text[..comment_start]
.chars()
.last()
.map_or(false, |c| c != '#')
{
continue;
}
(Some(noqa), None) => {
let range = TextRange::at(
TextSize::try_from(noqa.start()).unwrap().add(offset),
noqa.as_str().text_len(),
);
Some(Self::All(All { range }))
}
_ => None,
comment_start -= '#'.len_utf8();
// If the next character is `:`, then it's a list of codes. Otherwise, it's a directive
// to ignore all rules.
let noqa_literal_end = mat.end();
return Some(
if text[noqa_literal_end..]
.chars()
.next()
.map_or(false, |c| c == ':')
{
// E.g., `# noqa: F401, F841`.
let mut codes_start = noqa_literal_end;
// Skip the `:` character.
codes_start += ':'.len_utf8();
// Skip any whitespace between the `:` and the codes.
codes_start += text[codes_start..]
.find(|c: char| !c.is_whitespace())
.unwrap_or(0);
// Extract the comma-separated list of codes.
let mut codes = vec![];
let mut codes_end = codes_start;
let mut leading_space = 0;
while let Some(code) = Directive::lex_code(&text[codes_end + leading_space..]) {
codes.push(code);
codes_end += leading_space;
codes_end += code.len();
// Codes can be comma- or whitespace-delimited. Compute the length of the
// delimiter, but only add it in the next iteration, once we find the next
// code.
if let Some(space_between) =
text[codes_end..].find(|c: char| !(c.is_whitespace() || c == ','))
{
leading_space = space_between;
} else {
break;
}
}
let range = TextRange::new(
TextSize::try_from(comment_start).unwrap(),
TextSize::try_from(codes_end).unwrap(),
);
Self::Codes(Codes {
range: range.add(offset),
codes,
})
} else {
// E.g., `# noqa`.
let range = TextRange::new(
TextSize::try_from(comment_start).unwrap(),
TextSize::try_from(noqa_literal_end).unwrap(),
);
Self::All(All {
range: range.add(offset),
})
},
);
}
None
}
/// Lex an individual rule code (e.g., `F401`).
fn lex_code(text: &str) -> Option<&str> {
// Extract, e.g., the `F` in `F401`.
let prefix = text.chars().take_while(char::is_ascii_uppercase).count();
// Extract, e.g., the `401` in `F401`.
let suffix = text[prefix..]
.chars()
.take_while(char::is_ascii_digit)
.count();
if prefix > 0 && suffix > 0 {
Some(&text[..prefix + suffix])
} else {
None
}
}
}
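
(As written above, `lex_code` accepts an ASCII-uppercase prefix followed by
ASCII digits and rejects anything else, which is why an input like
`# noqa: F401, unused-import, some other code` yields only `F401`; see the
`noqa_invalid_codes` test below. An illustrative check, exposition only,
since the helper is private:)

```rust
#[test]
fn lex_code_examples() {
    assert_eq!(Directive::lex_code("F401, F841"), Some("F401"));
    assert_eq!(Directive::lex_code("E501 # trailing comment"), Some("E501"));
    // No uppercase-then-digits prefix, so nothing is lexed.
    assert_eq!(Directive::lex_code("unused-import"), None);
}
```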
@ -488,7 +562,7 @@ impl NoqaMapping {
}
/// Returns the re-mapped position or `position` if no mapping exists.
pub fn resolve(&self, offset: TextSize) -> TextSize {
pub(crate) fn resolve(&self, offset: TextSize) -> TextSize {
let index = self.ranges.binary_search_by(|range| {
if range.end() < offset {
std::cmp::Ordering::Less
@ -506,7 +580,7 @@ impl NoqaMapping {
}
}
pub fn push_mapping(&mut self, range: TextRange) {
pub(crate) fn push_mapping(&mut self, range: TextRange) {
if let Some(last_range) = self.ranges.last_mut() {
// Strictly sorted insertion
if last_range.end() <= range.start() {
@ -634,6 +708,48 @@ mod tests {
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_all_leading_comment() {
let source = "# Some comment describing the noqa # noqa";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_code_leading_comment() {
let source = "# Some comment describing the noqa # noqa: F401";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_codes_leading_comment() {
let source = "# Some comment describing the noqa # noqa: F401, F841";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_all_trailing_comment() {
let source = "# noqa # Some comment describing the noqa";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_code_trailing_comment() {
let source = "# noqa: F401 # Some comment describing the noqa";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_codes_trailing_comment() {
let source = "# noqa: F401, F841 # Some comment describing the noqa";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_invalid_codes() {
let source = "# noqa: F401, unused-import, some other code";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn modification() {
let contents = "x = 1";


@ -0,0 +1,11 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
All(
All {
range: 35..41,
},
),
)


@ -1,5 +1,11 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)"
expression: "Directive::try_extract(source, TextSize::default())"
---
None
Some(
All(
All {
range: 0..7,
},
),
)


@ -1,5 +1,11 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)"
expression: "Directive::try_extract(source, TextSize::default())"
---
None
Some(
All(
All {
range: 0..5,
},
),
)


@ -0,0 +1,11 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
All(
All {
range: 0..6,
},
),
)


@ -0,0 +1,14 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
Codes(
Codes {
range: 35..47,
codes: [
"F401",
],
},
),
)


@ -1,5 +1,14 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)"
expression: "Directive::try_extract(source, TextSize::default())"
---
None
Some(
Codes(
Codes {
range: 0..13,
codes: [
"F401",
],
},
),
)


@ -1,5 +1,14 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)"
expression: "Directive::try_extract(source, TextSize::default())"
---
None
Some(
Codes(
Codes {
range: 0..10,
codes: [
"F401",
],
},
),
)


@ -0,0 +1,14 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
Codes(
Codes {
range: 0..12,
codes: [
"F401",
],
},
),
)


@ -0,0 +1,15 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
Codes(
Codes {
range: 35..53,
codes: [
"F401",
"F841",
],
},
),
)


@ -1,5 +1,15 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)"
expression: "Directive::try_extract(source, TextSize::default())"
---
None
Some(
Codes(
Codes {
range: 0..20,
codes: [
"F401",
"F841",
],
},
),
)


@ -1,5 +1,15 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)"
expression: "Directive::try_extract(source, TextSize::default())"
---
None
Some(
Codes(
Codes {
range: 0..15,
codes: [
"F401",
"F841",
],
},
),
)


@ -0,0 +1,15 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
Codes(
Codes {
range: 0..18,
codes: [
"F401",
"F841",
],
},
),
)


@ -0,0 +1,14 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
Codes(
Codes {
range: 0..12,
codes: [
"F401",
],
},
),
)

foo.py

@ -0,0 +1 @@
import os # noqa


@ -20,7 +20,7 @@ from asyncio.subprocess import PIPE, create_subprocess_exec
from contextlib import asynccontextmanager, nullcontext
from pathlib import Path
from signal import SIGINT, SIGTERM
from typing import TYPE_CHECKING, NamedTuple, Self
from typing import TYPE_CHECKING, NamedTuple, Self, TypeVar
if TYPE_CHECKING:
from collections.abc import AsyncIterator, Iterator, Sequence
@ -272,6 +272,9 @@ def read_projects_jsonl(projects_jsonl: Path) -> dict[tuple[str, str], Repositor
return repositories
T = TypeVar("T")
async def main(
*,
ruff1: Path,
@ -291,7 +294,7 @@ async def main(
# Otherwise doing 3k repositories can take >8GB RAM
semaphore = asyncio.Semaphore(50)
async def limited_parallelism(coroutine): # noqa: ANN
async def limited_parallelism(coroutine: T) -> T:
async with semaphore:
return await coroutine


@ -23,6 +23,3 @@ ignore = [
[tool.ruff.isort]
required-imports = ["from __future__ import annotations"]
[tool.ruff.pydocstyle]
convention = "pep257"