mirror of
https://github.com/astral-sh/ruff.git
synced 2025-07-12 07:35:07 +00:00
Use `memchr` for string lexing (#9888)
## Summary

On `main`, string lexing consists of walking through the string character-by-character to search for the closing quote (with some nuance: we also need to skip escaped characters, and error if we see newlines in non-triple-quoted strings).

This PR rewrites `lex_string` to instead use `memchr` to search for the closing quote, which is significantly faster. On my machine, at least, the `globals.py` benchmark (which contains a lot of docstrings) gets 40% faster...

```text
lexer/numpy/globals.py  time:   [3.6410 µs 3.6496 µs 3.6585 µs]
                        thrpt:  [806.53 MiB/s 808.49 MiB/s 810.41 MiB/s]
                 change:
                        time:   [-40.413% -40.185% -39.984%] (p = 0.00 < 0.05)
                        thrpt:  [+66.623% +67.181% +67.822%]
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild

lexer/unicode/pypinyin.py
                        time:   [12.422 µs 12.445 µs 12.467 µs]
                        thrpt:  [337.03 MiB/s 337.65 MiB/s 338.27 MiB/s]
                 change:
                        time:   [-9.4213% -9.1930% -8.9586%] (p = 0.00 < 0.05)
                        thrpt:  [+9.8401% +10.124% +10.401%]
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) high mild
  2 (2.00%) high severe

lexer/pydantic/types.py time:   [107.45 µs 107.50 µs 107.56 µs]
                        thrpt:  [237.11 MiB/s 237.24 MiB/s 237.35 MiB/s]
                 change:
                        time:   [-4.0108% -3.7005% -3.3787%] (p = 0.00 < 0.05)
                        thrpt:  [+3.4968% +3.8427% +4.1784%]
                        Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
  2 (2.00%) high mild
  5 (5.00%) high severe

lexer/numpy/ctypeslib.py
                        time:   [46.123 µs 46.165 µs 46.208 µs]
                        thrpt:  [360.36 MiB/s 360.69 MiB/s 361.01 MiB/s]
                 change:
                        time:   [-19.313% -18.996% -18.710%] (p = 0.00 < 0.05)
                        thrpt:  [+23.016% +23.451% +23.935%]
                        Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  3 (3.00%) low mild
  1 (1.00%) high mild
  4 (4.00%) high severe

lexer/large/dataset.py  time:   [231.07 µs 231.19 µs 231.33 µs]
                        thrpt:  [175.87 MiB/s 175.97 MiB/s 176.06 MiB/s]
                 change:
                        time:   [-2.0437% -1.7663% -1.4922%] (p = 0.00 < 0.05)
                        thrpt:  [+1.5148% +1.7981% +2.0864%]
                        Performance has improved.
Found 10 outliers among 100 measurements (10.00%)
  5 (5.00%) high mild
  5 (5.00%) high severe
```
This commit is contained in:
parent
ad313b9089
commit
6fffde72e7
2 changed files with 100 additions and 35 deletions
|
@ -690,48 +690,65 @@ impl<'source> Lexer<'source> {
|
|||
|
||||
let value_start = self.offset();
|
||||
|
||||
let value_end = loop {
|
||||
match self.cursor.bump() {
|
||||
Some('\\') => {
|
||||
if self.cursor.eat_char('\r') {
|
||||
self.cursor.eat_char('\n');
|
||||
} else {
|
||||
self.cursor.bump();
|
||||
}
|
||||
}
|
||||
Some('\r' | '\n') if !triple_quoted => {
|
||||
let quote_byte = u8::try_from(quote).expect("char that fits in u8");
|
||||
let value_end = if triple_quoted {
|
||||
// For triple-quoted strings, scan until we find the closing quote (ignoring escaped
|
||||
// quotes) or the end of the file.
|
||||
loop {
|
||||
let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
|
||||
self.cursor.skip_to_end();
|
||||
|
||||
if let Some(fstring) = self.fstrings.current() {
|
||||
// When we are in an f-string, check whether the initial quote
|
||||
// matches with f-strings quotes and if it is, then this must be a
|
||||
// missing '}' token so raise the proper error.
|
||||
if fstring.quote_char() == quote && !fstring.is_triple_quoted() {
|
||||
if fstring.quote_char() == quote
|
||||
&& fstring.is_triple_quoted() == triple_quoted
|
||||
{
|
||||
return Err(LexicalError {
|
||||
error: LexicalErrorType::FStringError(
|
||||
FStringErrorType::UnclosedLbrace,
|
||||
),
|
||||
location: self.offset() - TextSize::new(1),
|
||||
location: self.cursor.text_len(),
|
||||
});
|
||||
}
|
||||
}
|
||||
return Err(LexicalError {
|
||||
error: LexicalErrorType::OtherError(
|
||||
"EOL while scanning string literal".to_owned(),
|
||||
),
|
||||
location: self.offset() - TextSize::new(1),
|
||||
error: LexicalErrorType::Eof,
|
||||
location: self.cursor.text_len(),
|
||||
});
|
||||
}
|
||||
Some(c) if c == quote => {
|
||||
if triple_quoted {
|
||||
if self.cursor.eat_char2(quote, quote) {
|
||||
break self.offset() - TextSize::new(3);
|
||||
}
|
||||
} else {
|
||||
break self.offset() - TextSize::new(1);
|
||||
}
|
||||
};
|
||||
|
||||
// Rare case: if there are an odd number of backslashes before the quote, then
|
||||
// the quote is escaped and we should continue scanning.
|
||||
let num_backslashes = self.cursor.rest().as_bytes()[..index]
|
||||
.iter()
|
||||
.rev()
|
||||
.take_while(|&&c| c == b'\\')
|
||||
.count();
|
||||
|
||||
// Advance the cursor past the quote and continue scanning.
|
||||
self.cursor.skip_bytes(index + 1);
|
||||
|
||||
// If the character is escaped, continue scanning.
|
||||
if num_backslashes % 2 == 1 {
|
||||
continue;
|
||||
}
|
||||
|
||||
Some(_) => {}
|
||||
None => {
|
||||
// Otherwise, if it's followed by two more quotes, then we're done.
|
||||
if self.cursor.eat_char2(quote, quote) {
|
||||
break self.offset() - TextSize::new(3);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// For non-triple-quoted strings, scan until we find the closing quote, but end early
|
||||
// if we encounter a newline or the end of the file.
|
||||
loop {
|
||||
let Some(index) =
|
||||
memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
|
||||
else {
|
||||
self.cursor.skip_to_end();
|
||||
|
||||
if let Some(fstring) = self.fstrings.current() {
|
||||
// When we are in an f-string, check whether the initial quote
|
||||
// matches with f-strings quotes and if it is, then this must be a
|
||||
|
@ -748,23 +765,66 @@ impl<'source> Lexer<'source> {
|
|||
}
|
||||
}
|
||||
return Err(LexicalError {
|
||||
error: if triple_quoted {
|
||||
LexicalErrorType::Eof
|
||||
} else {
|
||||
LexicalErrorType::StringError
|
||||
},
|
||||
error: LexicalErrorType::StringError,
|
||||
location: self.offset(),
|
||||
});
|
||||
};
|
||||
|
||||
// Rare case: if there are an odd number of backslashes before the quote, then
|
||||
// the quote is escaped and we should continue scanning.
|
||||
let num_backslashes = self.cursor.rest().as_bytes()[..index]
|
||||
.iter()
|
||||
.rev()
|
||||
.take_while(|&&c| c == b'\\')
|
||||
.count();
|
||||
|
||||
// Skip up to the current character.
|
||||
self.cursor.skip_bytes(index);
|
||||
let ch = self.cursor.bump();
|
||||
|
||||
// If the character is escaped, continue scanning.
|
||||
if num_backslashes % 2 == 1 {
|
||||
if ch == Some('\r') {
|
||||
self.cursor.eat_char('\n');
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
match ch {
|
||||
Some('\r' | '\n') => {
|
||||
if let Some(fstring) = self.fstrings.current() {
|
||||
// When we are in an f-string, check whether the initial quote
|
||||
// matches with f-strings quotes and if it is, then this must be a
|
||||
// missing '}' token so raise the proper error.
|
||||
if fstring.quote_char() == quote && !fstring.is_triple_quoted() {
|
||||
return Err(LexicalError {
|
||||
error: LexicalErrorType::FStringError(
|
||||
FStringErrorType::UnclosedLbrace,
|
||||
),
|
||||
location: self.offset() - TextSize::new(1),
|
||||
});
|
||||
}
|
||||
}
|
||||
return Err(LexicalError {
|
||||
error: LexicalErrorType::OtherError(
|
||||
"EOL while scanning string literal".to_owned(),
|
||||
),
|
||||
location: self.offset() - TextSize::new(1),
|
||||
});
|
||||
}
|
||||
Some(ch) if ch == quote => {
|
||||
break self.offset() - TextSize::new(1);
|
||||
}
|
||||
_ => unreachable!("memchr2 returned an index that is not a quote or a newline"),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let tok = Tok::String {
|
||||
Ok(Tok::String {
|
||||
value: self.source[TextRange::new(value_start, value_end)].to_string(),
|
||||
kind,
|
||||
triple_quoted,
|
||||
};
|
||||
Ok(tok)
|
||||
})
|
||||
}
|
||||
|
||||
// This is the main entry point. Call this function to retrieve the next token.
|
||||
|
|
|
@ -145,4 +145,9 @@ impl<'a> Cursor<'a> {
|
|||
|
||||
self.chars = self.chars.as_str()[count..].chars();
|
||||
}
|
||||
|
||||
/// Skips to the end of the input stream.
///
/// Replaces the cursor's character iterator with one over the empty string, so any
/// input that had not yet been consumed is discarded.
|
||||
pub(super) fn skip_to_end(&mut self) {
|
||||
self.chars = "".chars();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue