mirror of
https://github.com/astral-sh/ruff.git
synced 2025-09-30 13:51:37 +00:00
perf(parser): use faster string parser methods (#8227)
## Summary

This makes use of memchr and other methods to parse the strings (hopefully) faster. It might also be worth converting the `parse_fstring_middle` helper to use similar techniques, but I did not implement it in this PR.

## Test Plan

This was tested using the existing tests and passed all of them.
This commit is contained in:
parent
c39ea6ef05
commit
2f5734d1ac
1 changed files with 132 additions and 99 deletions
|
@ -6,9 +6,6 @@ use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
|
||||||
use crate::lexer::{LexicalError, LexicalErrorType};
|
use crate::lexer::{LexicalError, LexicalErrorType};
|
||||||
use crate::token::{StringKind, Tok};
|
use crate::token::{StringKind, Tok};
|
||||||
|
|
||||||
// unicode_name2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798
|
|
||||||
const MAX_UNICODE_NAME: usize = 88;
|
|
||||||
|
|
||||||
pub(crate) struct StringConstantWithRange {
|
pub(crate) struct StringConstantWithRange {
|
||||||
value: StringConstant,
|
value: StringConstant,
|
||||||
range: TextRange,
|
range: TextRange,
|
||||||
|
@ -57,7 +54,7 @@ impl StringType {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct StringParser<'a> {
|
struct StringParser<'a> {
|
||||||
chars: std::str::Chars<'a>,
|
rest: &'a str,
|
||||||
kind: StringKind,
|
kind: StringKind,
|
||||||
location: TextSize,
|
location: TextSize,
|
||||||
}
|
}
|
||||||
|
@ -65,22 +62,18 @@ struct StringParser<'a> {
|
||||||
impl<'a> StringParser<'a> {
|
impl<'a> StringParser<'a> {
|
||||||
fn new(source: &'a str, kind: StringKind, start: TextSize) -> Self {
|
fn new(source: &'a str, kind: StringKind, start: TextSize) -> Self {
|
||||||
Self {
|
Self {
|
||||||
chars: source.chars(),
|
rest: source,
|
||||||
kind,
|
kind,
|
||||||
location: start,
|
location: start,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn next_char(&mut self) -> Option<char> {
|
fn skip_bytes(&mut self, bytes: usize) -> &'a str {
|
||||||
let c = self.chars.next()?;
|
let skipped_str = &self.rest[..bytes];
|
||||||
self.location += c.text_len();
|
self.rest = &self.rest[bytes..];
|
||||||
Some(c)
|
self.location += skipped_str.text_len();
|
||||||
}
|
skipped_str
|
||||||
|
|
||||||
#[inline]
|
|
||||||
fn peek(&mut self) -> Option<char> {
|
|
||||||
self.chars.clone().next()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
@ -93,6 +86,34 @@ impl<'a> StringParser<'a> {
|
||||||
TextRange::new(start_location, self.location)
|
TextRange::new(start_location, self.location)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the next byte in the string, if there is one.
|
||||||
|
///
|
||||||
|
/// # Panics
|
||||||
|
///
|
||||||
|
/// When the next byte is a part of a multi-byte character.
|
||||||
|
#[inline]
|
||||||
|
fn next_byte(&mut self) -> Option<u8> {
|
||||||
|
self.rest.as_bytes().first().map(|&byte| {
|
||||||
|
self.rest = &self.rest[1..];
|
||||||
|
self.location += TextSize::new(1);
|
||||||
|
byte
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn next_char(&mut self) -> Option<char> {
|
||||||
|
self.rest.chars().next().map(|c| {
|
||||||
|
self.rest = &self.rest[c.len_utf8()..];
|
||||||
|
self.location += c.text_len();
|
||||||
|
c
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn peek_byte(&self) -> Option<u8> {
|
||||||
|
self.rest.as_bytes().first().copied()
|
||||||
|
}
|
||||||
|
|
||||||
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
|
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
|
||||||
let mut p: u32 = 0u32;
|
let mut p: u32 = 0u32;
|
||||||
let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos());
|
let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos());
|
||||||
|
@ -110,57 +131,58 @@ impl<'a> StringParser<'a> {
|
||||||
_ => std::char::from_u32(p).ok_or(unicode_error),
|
_ => std::char::from_u32(p).ok_or(unicode_error),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
fn parse_octet(&mut self, o: u8) -> char {
|
||||||
|
let mut radix_bytes = [o, 0, 0];
|
||||||
|
let mut len = 1;
|
||||||
|
|
||||||
fn parse_octet(&mut self, first: char) -> char {
|
while len < 3 {
|
||||||
let mut octet_content = String::new();
|
let Some(b'0'..=b'8') = self.peek_byte() else {
|
||||||
octet_content.push(first);
|
|
||||||
while octet_content.len() < 3 {
|
|
||||||
if let Some('0'..='7') = self.peek() {
|
|
||||||
octet_content.push(self.next_char().unwrap());
|
|
||||||
} else {
|
|
||||||
break;
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
|
radix_bytes[len] = self.next_byte().unwrap();
|
||||||
|
len += 1;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
let value = u32::from_str_radix(&octet_content, 8).unwrap();
|
// SAFETY: radix_bytes is always going to be in the ASCII range.
|
||||||
|
#[allow(unsafe_code)]
|
||||||
|
let radix_str = unsafe { std::str::from_utf8_unchecked(&radix_bytes[..len]) };
|
||||||
|
|
||||||
|
let value = u32::from_str_radix(radix_str, 8).unwrap();
|
||||||
char::from_u32(value).unwrap()
|
char::from_u32(value).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
|
fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
|
||||||
let start_pos = self.get_pos();
|
let start_pos = self.get_pos();
|
||||||
match self.next_char() {
|
|
||||||
Some('{') => {}
|
let Some('{') = self.next_char() else {
|
||||||
_ => return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)),
|
return Err(LexicalError::new(LexicalErrorType::StringError, start_pos));
|
||||||
}
|
};
|
||||||
|
|
||||||
let start_pos = self.get_pos();
|
let start_pos = self.get_pos();
|
||||||
let mut name = String::new();
|
let Some(close_idx) = self.rest.find('}') else {
|
||||||
loop {
|
|
||||||
match self.next_char() {
|
|
||||||
Some('}') => break,
|
|
||||||
Some(c) => name.push(c),
|
|
||||||
None => {
|
|
||||||
return Err(LexicalError::new(
|
return Err(LexicalError::new(
|
||||||
LexicalErrorType::StringError,
|
LexicalErrorType::StringError,
|
||||||
self.get_pos(),
|
self.get_pos(),
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if name.len() > MAX_UNICODE_NAME {
|
|
||||||
return Err(LexicalError::new(
|
|
||||||
LexicalErrorType::UnicodeError,
|
|
||||||
self.get_pos(),
|
|
||||||
));
|
));
|
||||||
}
|
};
|
||||||
|
|
||||||
unicode_names2::character(&name)
|
let name_and_ending = self.skip_bytes(close_idx + 1);
|
||||||
|
let name = &name_and_ending[..name_and_ending.len() - 1];
|
||||||
|
|
||||||
|
unicode_names2::character(name)
|
||||||
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
|
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_escaped_char(&mut self) -> Result<String, LexicalError> {
|
fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
|
||||||
match self.next_char() {
|
let Some(first_char) = self.next_char() else {
|
||||||
Some(c) => {
|
return Err(LexicalError {
|
||||||
let char = match c {
|
error: LexicalErrorType::StringError,
|
||||||
|
location: self.get_pos(),
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
let new_char = match first_char {
|
||||||
'\\' => '\\',
|
'\\' => '\\',
|
||||||
'\'' => '\'',
|
'\'' => '\'',
|
||||||
'\"' => '"',
|
'\"' => '"',
|
||||||
|
@ -171,21 +193,22 @@ impl<'a> StringParser<'a> {
|
||||||
'r' => '\r',
|
'r' => '\r',
|
||||||
't' => '\t',
|
't' => '\t',
|
||||||
'v' => '\x0b',
|
'v' => '\x0b',
|
||||||
o @ '0'..='7' => self.parse_octet(o),
|
o @ '0'..='7' => self.parse_octet(o as u8),
|
||||||
'x' => self.parse_unicode_literal(2)?,
|
'x' => self.parse_unicode_literal(2)?,
|
||||||
'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?,
|
'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?,
|
||||||
'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
|
'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
|
||||||
'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
|
'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
|
||||||
// Special cases where the escape sequence is not a single character
|
// Special cases where the escape sequence is not a single character
|
||||||
'\n' => return Ok(String::new()),
|
'\n' => return Ok(()),
|
||||||
'\r' => {
|
'\r' => {
|
||||||
if self.peek() == Some('\n') {
|
if self.peek_byte() == Some(b'\n') {
|
||||||
self.next_char();
|
self.next_byte();
|
||||||
}
|
}
|
||||||
return Ok(String::new());
|
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
c => {
|
_ => {
|
||||||
if self.kind.is_any_bytes() && !c.is_ascii() {
|
if self.kind.is_any_bytes() && !first_char.is_ascii() {
|
||||||
return Err(LexicalError {
|
return Err(LexicalError {
|
||||||
error: LexicalErrorType::OtherError(
|
error: LexicalErrorType::OtherError(
|
||||||
"bytes can only contain ASCII literal characters".to_owned(),
|
"bytes can only contain ASCII literal characters".to_owned(),
|
||||||
|
@ -193,16 +216,16 @@ impl<'a> StringParser<'a> {
|
||||||
location: self.get_pos(),
|
location: self.get_pos(),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
return Ok(format!("\\{c}"));
|
|
||||||
|
string.push('\\');
|
||||||
|
|
||||||
|
first_char
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
Ok(char.to_string())
|
|
||||||
}
|
string.push(new_char);
|
||||||
None => Err(LexicalError {
|
|
||||||
error: LexicalErrorType::StringError,
|
Ok(())
|
||||||
location: self.get_pos(),
|
|
||||||
}),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_fstring_middle(&mut self) -> Result<Expr, LexicalError> {
|
fn parse_fstring_middle(&mut self) -> Result<Expr, LexicalError> {
|
||||||
|
@ -230,8 +253,8 @@ impl<'a> StringParser<'a> {
|
||||||
// This is still an invalid escape sequence, but we don't want to
|
// This is still an invalid escape sequence, but we don't want to
|
||||||
// raise a syntax error as is done by the CPython parser. It might
|
// raise a syntax error as is done by the CPython parser. It might
|
||||||
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
|
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
|
||||||
'\\' if !self.kind.is_raw() && self.peek().is_some() => {
|
'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
|
||||||
value.push_str(&self.parse_escaped_char()?);
|
self.parse_escaped_char(&mut value)?;
|
||||||
}
|
}
|
||||||
// If there are any curly braces inside a `FStringMiddle` token,
|
// If there are any curly braces inside a `FStringMiddle` token,
|
||||||
// then they were escaped (i.e. `{{` or `}}`). This means that
|
// then they were escaped (i.e. `{{` or `}}`). This means that
|
||||||
|
@ -255,7 +278,7 @@ impl<'a> StringParser<'a> {
|
||||||
while let Some(ch) = self.next_char() {
|
while let Some(ch) = self.next_char() {
|
||||||
match ch {
|
match ch {
|
||||||
'\\' if !self.kind.is_raw() => {
|
'\\' if !self.kind.is_raw() => {
|
||||||
content.push_str(&self.parse_escaped_char()?);
|
self.parse_escaped_char(&mut content)?;
|
||||||
}
|
}
|
||||||
ch => {
|
ch => {
|
||||||
if !ch.is_ascii() {
|
if !ch.is_ascii() {
|
||||||
|
@ -278,16 +301,26 @@ impl<'a> StringParser<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_string(&mut self) -> Result<StringType, LexicalError> {
|
fn parse_string(&mut self) -> Result<StringType, LexicalError> {
|
||||||
let mut value = String::new();
|
|
||||||
let start_location = self.get_pos();
|
let start_location = self.get_pos();
|
||||||
while let Some(ch) = self.next_char() {
|
let mut value = String::new();
|
||||||
match ch {
|
|
||||||
'\\' if !self.kind.is_raw() => {
|
if self.kind.is_raw() {
|
||||||
value.push_str(&self.parse_escaped_char()?);
|
value.push_str(self.skip_bytes(self.rest.len()));
|
||||||
}
|
} else {
|
||||||
ch => value.push(ch),
|
loop {
|
||||||
|
let Some(escape_idx) = self.rest.find('\\') else {
|
||||||
|
value.push_str(self.skip_bytes(self.rest.len()));
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
|
let before_with_slash = self.skip_bytes(escape_idx + 1);
|
||||||
|
let before = &before_with_slash[..before_with_slash.len() - 1];
|
||||||
|
|
||||||
|
value.push_str(before);
|
||||||
|
self.parse_escaped_char(&mut value)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(StringType::Str(StringConstantWithRange {
|
Ok(StringType::Str(StringConstantWithRange {
|
||||||
value: StringConstant {
|
value: StringConstant {
|
||||||
value,
|
value,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue