Refactor noqa directive parsing away from regex-based implementation (#5554)

## Summary

I'll write up a more detailed description tomorrow, but in short, this
PR removes our regex-based implementation in favor of "manual" parsing.

I tried a couple different implementations. In the benchmarks below:

- `Directive/Regex` is our implementation on `main`.
- `Directive/Find` just uses `text.find("noqa")`, which is insufficient,
since it doesn't cover case-insensitive variants like `NOQA`, and
doesn't handle multiple `noqa` matches in a single like, like ` # Here's
a noqa comment # noqa: F401`. But it's kind of a baseline.
- `Directive/Memchr` uses three `memchr` iterative finders (one for
`noqa`, `NOQA`, and `NoQA`).
- `Directive/AhoCorasick` is roughly the variant checked-in here.

The raw results:

```
Directive/Regex/# noqa: F401
                        time:   [273.69 ns 274.71 ns 276.03 ns]
                        change: [+1.4467% +1.8979% +2.4243%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 15 outliers among 100 measurements (15.00%)
  3 (3.00%) low mild
  8 (8.00%) high mild
  4 (4.00%) high severe
Directive/Find/# noqa: F401
                        time:   [66.972 ns 67.048 ns 67.132 ns]
                        change: [+2.8292% +2.9377% +3.0540%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 15 outliers among 100 measurements (15.00%)
  1 (1.00%) low severe
  3 (3.00%) low mild
  8 (8.00%) high mild
  3 (3.00%) high severe
Directive/AhoCorasick/# noqa: F401
                        time:   [76.922 ns 77.189 ns 77.536 ns]
                        change: [+0.4265% +0.6862% +0.9871%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 8 outliers among 100 measurements (8.00%)
  1 (1.00%) low mild
  3 (3.00%) high mild
  4 (4.00%) high severe
Directive/Memchr/# noqa: F401
                        time:   [62.627 ns 62.654 ns 62.679 ns]
                        change: [-0.1780% -0.0887% -0.0120%] (p = 0.03 < 0.05)
                        Change within noise threshold.
Found 11 outliers among 100 measurements (11.00%)
  1 (1.00%) low severe
  5 (5.00%) low mild
  3 (3.00%) high mild
  2 (2.00%) high severe
Directive/Regex/# noqa: F401, F841
                        time:   [321.83 ns 322.39 ns 322.93 ns]
                        change: [+8602.4% +8623.5% +8644.5%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 5 outliers among 100 measurements (5.00%)
  1 (1.00%) low severe
  2 (2.00%) low mild
  1 (1.00%) high mild
  1 (1.00%) high severe
Directive/Find/# noqa: F401, F841
                        time:   [78.618 ns 78.758 ns 78.896 ns]
                        change: [+1.6909% +1.8771% +2.0628%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 3 outliers among 100 measurements (3.00%)
  3 (3.00%) high mild
Directive/AhoCorasick/# noqa: F401, F841
                        time:   [87.739 ns 88.057 ns 88.468 ns]
                        change: [+0.1843% +0.4685% +0.7854%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 11 outliers among 100 measurements (11.00%)
  5 (5.00%) low mild
  3 (3.00%) high mild
  3 (3.00%) high severe
Directive/Memchr/# noqa: F401, F841
                        time:   [80.674 ns 80.774 ns 80.860 ns]
                        change: [-0.7343% -0.5633% -0.4031%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 14 outliers among 100 measurements (14.00%)
  4 (4.00%) low severe
  9 (9.00%) low mild
  1 (1.00%) high mild
Directive/Regex/# noqa  time:   [194.86 ns 195.93 ns 196.97 ns]
                        change: [+11973% +12039% +12103%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 6 outliers among 100 measurements (6.00%)
  5 (5.00%) low mild
  1 (1.00%) high mild
Directive/Find/# noqa   time:   [25.327 ns 25.354 ns 25.383 ns]
                        change: [+3.8524% +4.0267% +4.1845%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 9 outliers among 100 measurements (9.00%)
  6 (6.00%) high mild
  3 (3.00%) high severe
Directive/AhoCorasick/# noqa
                        time:   [34.267 ns 34.368 ns 34.481 ns]
                        change: [+0.5646% +0.8505% +1.1281%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 5 outliers among 100 measurements (5.00%)
  5 (5.00%) high mild
Directive/Memchr/# noqa time:   [21.770 ns 21.818 ns 21.874 ns]
                        change: [-0.0990% +0.1464% +0.4046%] (p = 0.26 > 0.05)
                        No change in performance detected.
Found 10 outliers among 100 measurements (10.00%)
  4 (4.00%) low mild
  4 (4.00%) high mild
  2 (2.00%) high severe
Directive/Regex/# type: ignore # noqa: E501
                        time:   [278.76 ns 279.69 ns 280.72 ns]
                        change: [+7449.4% +7469.8% +7490.5%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) low mild
  1 (1.00%) high mild
  1 (1.00%) high severe
Directive/Find/# type: ignore # noqa: E501
                        time:   [67.791 ns 67.976 ns 68.184 ns]
                        change: [+2.8321% +3.1735% +3.5418%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 6 outliers among 100 measurements (6.00%)
  5 (5.00%) high mild
  1 (1.00%) high severe
Directive/AhoCorasick/# type: ignore # noqa: E501
                        time:   [75.908 ns 76.055 ns 76.210 ns]
                        change: [+0.9269% +1.1427% +1.3955%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high severe
Directive/Memchr/# type: ignore # noqa: E501
                        time:   [72.549 ns 72.723 ns 72.957 ns]
                        change: [+1.5881% +1.9660% +2.3974%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 15 outliers among 100 measurements (15.00%)
  10 (10.00%) high mild
  5 (5.00%) high severe
Directive/Regex/# type: ignore # nosec
                        time:   [66.967 ns 67.075 ns 67.207 ns]
                        change: [+1713.0% +1715.8% +1718.9%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 10 outliers among 100 measurements (10.00%)
  1 (1.00%) low severe
  3 (3.00%) low mild
  2 (2.00%) high mild
  4 (4.00%) high severe
Directive/Find/# type: ignore # nosec
                        time:   [18.505 ns 18.548 ns 18.597 ns]
                        change: [+1.3520% +1.6976% +2.0333%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 4 outliers among 100 measurements (4.00%)
  4 (4.00%) high mild
Directive/AhoCorasick/# type: ignore # nosec
                        time:   [16.162 ns 16.206 ns 16.252 ns]
                        change: [+1.2919% +1.5587% +1.8430%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 4 outliers among 100 measurements (4.00%)
  3 (3.00%) high mild
  1 (1.00%) high severe
Directive/Memchr/# type: ignore # nosec
                        time:   [39.192 ns 39.233 ns 39.276 ns]
                        change: [+0.5164% +0.7456% +0.9790%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 13 outliers among 100 measurements (13.00%)
  2 (2.00%) low severe
  4 (4.00%) low mild
  3 (3.00%) high mild
  4 (4.00%) high severe
Directive/Regex/# some very long comment that # is interspersed with characters but # no directive
                        time:   [81.460 ns 81.578 ns 81.703 ns]
                        change: [+2093.3% +2098.8% +2104.2%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 4 outliers among 100 measurements (4.00%)
  2 (2.00%) low mild
  2 (2.00%) high mild
Directive/Find/# some very long comment that # is interspersed with characters but # no directive
                        time:   [26.284 ns 26.331 ns 26.387 ns]
                        change: [+0.7554% +1.1027% +1.3832%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 6 outliers among 100 measurements (6.00%)
  5 (5.00%) high mild
  1 (1.00%) high severe
Directive/AhoCorasick/# some very long comment that # is interspersed with characters but # no direc...
                        time:   [28.643 ns 28.714 ns 28.787 ns]
                        change: [+1.3774% +1.6780% +2.0028%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild
Directive/Memchr/# some very long comment that # is interspersed with characters but # no directive
                        time:   [55.766 ns 55.831 ns 55.897 ns]
                        change: [+1.5802% +1.7476% +1.9021%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) low mild
```

While memchr is faster than aho-corasick in some of the common cases
(like `# noqa: F401`), the latter is way, way faster when there _isn't_
a match (like 2x faster -- see the last two cases). Since most comments
_aren't_ `noqa` comments, this felt like the right tradeoff. Note that
all implementations are significantly faster than the regex version.

(I know I originally reported a 10x speedup, but I ended up improving
the regex version a bit in some prior PRs, so it got unintentionally
faster via some refactors.)

There's also one behavior change in here, which is that we now allow
variable spaces, e.g., `#noqa` or `# noqa`. Previously, we required
exactly one space. This thus closes #5177.
This commit is contained in:
Charlie Marsh 2023-07-06 12:03:10 -04:00 committed by GitHub
parent 87ca6171cf
commit cc822082a7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
19 changed files with 310 additions and 47 deletions

1
Cargo.lock generated
View file

@ -1811,6 +1811,7 @@ dependencies = [
name = "ruff" name = "ruff"
version = "0.0.277" version = "0.0.277"
dependencies = [ dependencies = [
"aho-corasick 1.0.2",
"annotate-snippets 0.9.1", "annotate-snippets 0.9.1",
"anyhow", "anyhow",
"bitflags 2.3.3", "bitflags 2.3.3",

View file

@ -27,6 +27,7 @@ ruff_rustpython = { path = "../ruff_rustpython" }
ruff_text_size = { workspace = true } ruff_text_size = { workspace = true }
ruff_textwrap = { path = "../ruff_textwrap" } ruff_textwrap = { path = "../ruff_textwrap" }
aho-corasick = { version = "1.0.2" }
annotate-snippets = { version = "0.9.1", features = ["color"] } annotate-snippets = { version = "0.9.1", features = ["color"] }
anyhow = { workspace = true } anyhow = { workspace = true }
bitflags = { workspace = true } bitflags = { workspace = true }

View file

@ -4,11 +4,11 @@ use std::fs;
use std::ops::Add; use std::ops::Add;
use std::path::Path; use std::path::Path;
use aho_corasick::AhoCorasick;
use anyhow::Result; use anyhow::Result;
use itertools::Itertools; use itertools::Itertools;
use log::warn; use log::warn;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use regex::Regex;
use ruff_text_size::{TextLen, TextRange, TextSize}; use ruff_text_size::{TextLen, TextRange, TextSize};
use rustpython_parser::ast::Ranged; use rustpython_parser::ast::Ranged;
@ -20,8 +20,10 @@ use crate::codes::NoqaCode;
use crate::registry::{AsRule, Rule, RuleSet}; use crate::registry::{AsRule, Rule, RuleSet};
use crate::rule_redirects::get_redirect_target; use crate::rule_redirects::get_redirect_target;
static NOQA_LINE_REGEX: Lazy<Regex> = Lazy::new(|| { static NOQA_MATCHER: Lazy<AhoCorasick> = Lazy::new(|| {
Regex::new(r"(?P<noqa>(?i:# noqa)(?::\s?(?P<codes>[A-Z]+[0-9]+(?:[,\s]+[A-Z]+[0-9]+)*))?)") AhoCorasick::builder()
.ascii_case_insensitive(true)
.build(["noqa"])
.unwrap() .unwrap()
}); });
@ -38,32 +40,104 @@ pub(crate) enum Directive<'a> {
impl<'a> Directive<'a> { impl<'a> Directive<'a> {
/// Extract the noqa `Directive` from a line of Python source code. /// Extract the noqa `Directive` from a line of Python source code.
pub(crate) fn try_extract(text: &'a str, offset: TextSize) -> Option<Self> { pub(crate) fn try_extract(text: &'a str, offset: TextSize) -> Option<Self> {
let caps = NOQA_LINE_REGEX.captures(text)?; for mat in NOQA_MATCHER.find_iter(text) {
match (caps.name("noqa"), caps.name("codes")) { let noqa_literal_start = mat.start();
(Some(noqa), Some(codes)) => {
let codes = codes // Determine the start of the comment.
.as_str() let mut comment_start = noqa_literal_start;
.split(|c: char| c.is_whitespace() || c == ',')
.map(str::trim) // Trim any whitespace between the `#` character and the `noqa` literal.
.filter(|code| !code.is_empty()) comment_start = text[..comment_start].trim_end().len();
.collect_vec();
if codes.is_empty() { // The next character has to be the `#` character.
warn!("Expected rule codes on `noqa` directive: \"{text}\""); if text[..comment_start]
} .chars()
let range = TextRange::at( .last()
TextSize::try_from(noqa.start()).unwrap().add(offset), .map_or(false, |c| c != '#')
noqa.as_str().text_len(), {
); continue;
Some(Self::Codes(Codes { range, codes }))
} }
(Some(noqa), None) => { comment_start -= '#'.len_utf8();
let range = TextRange::at(
TextSize::try_from(noqa.start()).unwrap().add(offset), // If the next character is `:`, then it's a list of codes. Otherwise, it's a directive
noqa.as_str().text_len(), // to ignore all rules.
); let noqa_literal_end = mat.end();
Some(Self::All(All { range })) return Some(
} if text[noqa_literal_end..]
_ => None, .chars()
.next()
.map_or(false, |c| c == ':')
{
// E.g., `# noqa: F401, F841`.
let mut codes_start = noqa_literal_end;
// Skip the `:` character.
codes_start += ':'.len_utf8();
// Skip any whitespace between the `:` and the codes.
codes_start += text[codes_start..]
.find(|c: char| !c.is_whitespace())
.unwrap_or(0);
// Extract the comma-separated list of codes.
let mut codes = vec![];
let mut codes_end = codes_start;
let mut leading_space = 0;
while let Some(code) = Directive::lex_code(&text[codes_end + leading_space..]) {
codes.push(code);
codes_end += leading_space;
codes_end += code.len();
// Codes can be comma- or whitespace-delimited. Compute the length of the
// delimiter, but only add it in the next iteration, once we find the next
// code.
if let Some(space_between) =
text[codes_end..].find(|c: char| !(c.is_whitespace() || c == ','))
{
leading_space = space_between;
} else {
break;
}
}
let range = TextRange::new(
TextSize::try_from(comment_start).unwrap(),
TextSize::try_from(codes_end).unwrap(),
);
Self::Codes(Codes {
range: range.add(offset),
codes,
})
} else {
// E.g., `# noqa`.
let range = TextRange::new(
TextSize::try_from(comment_start).unwrap(),
TextSize::try_from(noqa_literal_end).unwrap(),
);
Self::All(All {
range: range.add(offset),
})
},
);
}
None
}
/// Lex an individual rule code (e.g., `F401`).
fn lex_code(text: &str) -> Option<&str> {
// Extract, e.g., the `F` in `F401`.
let prefix = text.chars().take_while(char::is_ascii_uppercase).count();
// Extract, e.g., the `401` in `F401`.
let suffix = text[prefix..]
.chars()
.take_while(char::is_ascii_digit)
.count();
if prefix > 0 && suffix > 0 {
Some(&text[..prefix + suffix])
} else {
None
} }
} }
} }
@ -488,7 +562,7 @@ impl NoqaMapping {
} }
/// Returns the re-mapped position or `position` if no mapping exists. /// Returns the re-mapped position or `position` if no mapping exists.
pub fn resolve(&self, offset: TextSize) -> TextSize { pub(crate) fn resolve(&self, offset: TextSize) -> TextSize {
let index = self.ranges.binary_search_by(|range| { let index = self.ranges.binary_search_by(|range| {
if range.end() < offset { if range.end() < offset {
std::cmp::Ordering::Less std::cmp::Ordering::Less
@ -506,7 +580,7 @@ impl NoqaMapping {
} }
} }
pub fn push_mapping(&mut self, range: TextRange) { pub(crate) fn push_mapping(&mut self, range: TextRange) {
if let Some(last_range) = self.ranges.last_mut() { if let Some(last_range) = self.ranges.last_mut() {
// Strictly sorted insertion // Strictly sorted insertion
if last_range.end() <= range.start() { if last_range.end() <= range.start() {
@ -634,6 +708,48 @@ mod tests {
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default())); assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
} }
#[test]
fn noqa_all_leading_comment() {
let source = "# Some comment describing the noqa # noqa";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_code_leading_comment() {
let source = "# Some comment describing the noqa # noqa: F401";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_codes_leading_comment() {
let source = "# Some comment describing the noqa # noqa: F401, F841";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_all_trailing_comment() {
let source = "# noqa # Some comment describing the noqa";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_code_trailing_comment() {
let source = "# noqa: F401 # Some comment describing the noqa";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_codes_trailing_comment() {
let source = "# noqa: F401, F841 # Some comment describing the noqa";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test]
fn noqa_invalid_codes() {
let source = "# noqa: F401, unused-import, some other code";
assert_debug_snapshot!(Directive::try_extract(source, TextSize::default()));
}
#[test] #[test]
fn modification() { fn modification() {
let contents = "x = 1"; let contents = "x = 1";

View file

@ -0,0 +1,11 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
All(
All {
range: 35..41,
},
),
)

View file

@ -1,5 +1,11 @@
--- ---
source: crates/ruff/src/noqa.rs source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)" expression: "Directive::try_extract(source, TextSize::default())"
--- ---
None Some(
All(
All {
range: 0..7,
},
),
)

View file

@ -1,5 +1,11 @@
--- ---
source: crates/ruff/src/noqa.rs source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)" expression: "Directive::try_extract(source, TextSize::default())"
--- ---
None Some(
All(
All {
range: 0..5,
},
),
)

View file

@ -0,0 +1,11 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
All(
All {
range: 0..6,
},
),
)

View file

@ -0,0 +1,14 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
Codes(
Codes {
range: 35..47,
codes: [
"F401",
],
},
),
)

View file

@ -1,5 +1,14 @@
--- ---
source: crates/ruff/src/noqa.rs source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)" expression: "Directive::try_extract(source, TextSize::default())"
--- ---
None Some(
Codes(
Codes {
range: 0..13,
codes: [
"F401",
],
},
),
)

View file

@ -1,5 +1,14 @@
--- ---
source: crates/ruff/src/noqa.rs source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)" expression: "Directive::try_extract(source, TextSize::default())"
--- ---
None Some(
Codes(
Codes {
range: 0..10,
codes: [
"F401",
],
},
),
)

View file

@ -0,0 +1,14 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
Codes(
Codes {
range: 0..12,
codes: [
"F401",
],
},
),
)

View file

@ -0,0 +1,15 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
Codes(
Codes {
range: 35..53,
codes: [
"F401",
"F841",
],
},
),
)

View file

@ -1,5 +1,15 @@
--- ---
source: crates/ruff/src/noqa.rs source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)" expression: "Directive::try_extract(source, TextSize::default())"
--- ---
None Some(
Codes(
Codes {
range: 0..20,
codes: [
"F401",
"F841",
],
},
),
)

View file

@ -1,5 +1,15 @@
--- ---
source: crates/ruff/src/noqa.rs source: crates/ruff/src/noqa.rs
expression: "Directive::extract(range, &locator)" expression: "Directive::try_extract(source, TextSize::default())"
--- ---
None Some(
Codes(
Codes {
range: 0..15,
codes: [
"F401",
"F841",
],
},
),
)

View file

@ -0,0 +1,15 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
Codes(
Codes {
range: 0..18,
codes: [
"F401",
"F841",
],
},
),
)

View file

@ -0,0 +1,14 @@
---
source: crates/ruff/src/noqa.rs
expression: "Directive::try_extract(source, TextSize::default())"
---
Some(
Codes(
Codes {
range: 0..12,
codes: [
"F401",
],
},
),
)

1
foo.py Normal file
View file

@ -0,0 +1 @@
import os # noqa

View file

@ -20,7 +20,7 @@ from asyncio.subprocess import PIPE, create_subprocess_exec
from contextlib import asynccontextmanager, nullcontext from contextlib import asynccontextmanager, nullcontext
from pathlib import Path from pathlib import Path
from signal import SIGINT, SIGTERM from signal import SIGINT, SIGTERM
from typing import TYPE_CHECKING, NamedTuple, Self from typing import TYPE_CHECKING, NamedTuple, Self, TypeVar
if TYPE_CHECKING: if TYPE_CHECKING:
from collections.abc import AsyncIterator, Iterator, Sequence from collections.abc import AsyncIterator, Iterator, Sequence
@ -272,6 +272,9 @@ def read_projects_jsonl(projects_jsonl: Path) -> dict[tuple[str, str], Repositor
return repositories return repositories
T = TypeVar("T")
async def main( async def main(
*, *,
ruff1: Path, ruff1: Path,
@ -291,7 +294,7 @@ async def main(
# Otherwise doing 3k repositories can take >8GB RAM # Otherwise doing 3k repositories can take >8GB RAM
semaphore = asyncio.Semaphore(50) semaphore = asyncio.Semaphore(50)
async def limited_parallelism(coroutine): # noqa: ANN async def limited_parallelism(coroutine: T) -> T:
async with semaphore: async with semaphore:
return await coroutine return await coroutine

View file

@ -23,6 +23,3 @@ ignore = [
[tool.ruff.isort] [tool.ruff.isort]
required-imports = ["from __future__ import annotations"] required-imports = ["from __future__ import annotations"]
[tool.ruff.pydocstyle]
convention = "pep257"