perf(parser): use faster string parser methods (#8227)

## Summary

This makes use of memchr and other methods to parse strings
(hopefully) faster. It might also be worth converting the
`parse_fstring_middle` helper to use similar techniques, but that is not
implemented in this PR.

## Test Plan

This was verified with the existing test suite; all tests pass.
This commit is contained in:
Carter Snook 2023-10-28 17:50:54 -05:00 committed by GitHub
parent c39ea6ef05
commit 2f5734d1ac
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -6,9 +6,6 @@ use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
use crate::lexer::{LexicalError, LexicalErrorType}; use crate::lexer::{LexicalError, LexicalErrorType};
use crate::token::{StringKind, Tok}; use crate::token::{StringKind, Tok};
// unicode_name2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798
const MAX_UNICODE_NAME: usize = 88;
pub(crate) struct StringConstantWithRange { pub(crate) struct StringConstantWithRange {
value: StringConstant, value: StringConstant,
range: TextRange, range: TextRange,
@ -57,7 +54,7 @@ impl StringType {
} }
struct StringParser<'a> { struct StringParser<'a> {
chars: std::str::Chars<'a>, rest: &'a str,
kind: StringKind, kind: StringKind,
location: TextSize, location: TextSize,
} }
@ -65,22 +62,18 @@ struct StringParser<'a> {
impl<'a> StringParser<'a> { impl<'a> StringParser<'a> {
/// Creates a parser positioned at the beginning of `source`.
///
/// `start` is the location assigned to the first byte of `source`.
fn new(source: &'a str, kind: StringKind, start: TextSize) -> Self {
    let rest = source;
    let location = start;
    Self {
        rest,
        kind,
        location,
    }
}
#[inline] #[inline]
fn next_char(&mut self) -> Option<char> { fn skip_bytes(&mut self, bytes: usize) -> &'a str {
let c = self.chars.next()?; let skipped_str = &self.rest[..bytes];
self.location += c.text_len(); self.rest = &self.rest[bytes..];
Some(c) self.location += skipped_str.text_len();
} skipped_str
#[inline]
fn peek(&mut self) -> Option<char> {
self.chars.clone().next()
} }
#[inline] #[inline]
@ -93,6 +86,34 @@ impl<'a> StringParser<'a> {
TextRange::new(start_location, self.location) TextRange::new(start_location, self.location)
} }
/// Consumes and returns the next byte of the input, or `None` when the input is exhausted.
///
/// # Panics
///
/// When the next byte belongs to a multi-byte character, because the remaining
/// input would then no longer start on a character boundary.
#[inline]
fn next_byte(&mut self) -> Option<u8> {
    let byte = *self.rest.as_bytes().first()?;
    self.rest = &self.rest[1..];
    self.location += TextSize::new(1);
    Some(byte)
}
#[inline]
fn next_char(&mut self) -> Option<char> {
self.rest.chars().next().map(|c| {
self.rest = &self.rest[c.len_utf8()..];
self.location += c.text_len();
c
})
}
/// Returns the next byte of the input without consuming it.
#[inline]
fn peek_byte(&self) -> Option<u8> {
    self.rest.bytes().next()
}
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> { fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
let mut p: u32 = 0u32; let mut p: u32 = 0u32;
let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos()); let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos());
@ -110,57 +131,58 @@ impl<'a> StringParser<'a> {
_ => std::char::from_u32(p).ok_or(unicode_error), _ => std::char::from_u32(p).ok_or(unicode_error),
} }
} }
fn parse_octet(&mut self, o: u8) -> char {
let mut radix_bytes = [o, 0, 0];
let mut len = 1;
fn parse_octet(&mut self, first: char) -> char { while len < 3 {
let mut octet_content = String::new(); let Some(b'0'..=b'8') = self.peek_byte() else {
octet_content.push(first);
while octet_content.len() < 3 {
if let Some('0'..='7') = self.peek() {
octet_content.push(self.next_char().unwrap());
} else {
break; break;
};
radix_bytes[len] = self.next_byte().unwrap();
len += 1;
} }
}
let value = u32::from_str_radix(&octet_content, 8).unwrap(); // SAFETY: radix_bytes is always going to be in the ASCII range.
#[allow(unsafe_code)]
let radix_str = unsafe { std::str::from_utf8_unchecked(&radix_bytes[..len]) };
let value = u32::from_str_radix(radix_str, 8).unwrap();
char::from_u32(value).unwrap() char::from_u32(value).unwrap()
} }
fn parse_unicode_name(&mut self) -> Result<char, LexicalError> { fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
let start_pos = self.get_pos(); let start_pos = self.get_pos();
match self.next_char() {
Some('{') => {} let Some('{') = self.next_char() else {
_ => return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)), return Err(LexicalError::new(LexicalErrorType::StringError, start_pos));
} };
let start_pos = self.get_pos(); let start_pos = self.get_pos();
let mut name = String::new(); let Some(close_idx) = self.rest.find('}') else {
loop {
match self.next_char() {
Some('}') => break,
Some(c) => name.push(c),
None => {
return Err(LexicalError::new( return Err(LexicalError::new(
LexicalErrorType::StringError, LexicalErrorType::StringError,
self.get_pos(), self.get_pos(),
))
}
}
}
if name.len() > MAX_UNICODE_NAME {
return Err(LexicalError::new(
LexicalErrorType::UnicodeError,
self.get_pos(),
)); ));
} };
unicode_names2::character(&name) let name_and_ending = self.skip_bytes(close_idx + 1);
let name = &name_and_ending[..name_and_ending.len() - 1];
unicode_names2::character(name)
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
} }
fn parse_escaped_char(&mut self) -> Result<String, LexicalError> { fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
match self.next_char() { let Some(first_char) = self.next_char() else {
Some(c) => { return Err(LexicalError {
let char = match c { error: LexicalErrorType::StringError,
location: self.get_pos(),
});
};
let new_char = match first_char {
'\\' => '\\', '\\' => '\\',
'\'' => '\'', '\'' => '\'',
'\"' => '"', '\"' => '"',
@ -171,21 +193,22 @@ impl<'a> StringParser<'a> {
'r' => '\r', 'r' => '\r',
't' => '\t', 't' => '\t',
'v' => '\x0b', 'v' => '\x0b',
o @ '0'..='7' => self.parse_octet(o), o @ '0'..='7' => self.parse_octet(o as u8),
'x' => self.parse_unicode_literal(2)?, 'x' => self.parse_unicode_literal(2)?,
'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?, 'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?,
'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, 'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, 'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
// Special cases where the escape sequence is not a single character // Special cases where the escape sequence is not a single character
'\n' => return Ok(String::new()), '\n' => return Ok(()),
'\r' => { '\r' => {
if self.peek() == Some('\n') { if self.peek_byte() == Some(b'\n') {
self.next_char(); self.next_byte();
} }
return Ok(String::new());
return Ok(());
} }
c => { _ => {
if self.kind.is_any_bytes() && !c.is_ascii() { if self.kind.is_any_bytes() && !first_char.is_ascii() {
return Err(LexicalError { return Err(LexicalError {
error: LexicalErrorType::OtherError( error: LexicalErrorType::OtherError(
"bytes can only contain ASCII literal characters".to_owned(), "bytes can only contain ASCII literal characters".to_owned(),
@ -193,16 +216,16 @@ impl<'a> StringParser<'a> {
location: self.get_pos(), location: self.get_pos(),
}); });
} }
return Ok(format!("\\{c}"));
string.push('\\');
first_char
} }
}; };
Ok(char.to_string())
} string.push(new_char);
None => Err(LexicalError {
error: LexicalErrorType::StringError, Ok(())
location: self.get_pos(),
}),
}
} }
fn parse_fstring_middle(&mut self) -> Result<Expr, LexicalError> { fn parse_fstring_middle(&mut self) -> Result<Expr, LexicalError> {
@ -230,8 +253,8 @@ impl<'a> StringParser<'a> {
// This is still an invalid escape sequence, but we don't want to // This is still an invalid escape sequence, but we don't want to
// raise a syntax error as is done by the CPython parser. It might // raise a syntax error as is done by the CPython parser. It might
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
'\\' if !self.kind.is_raw() && self.peek().is_some() => { '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
value.push_str(&self.parse_escaped_char()?); self.parse_escaped_char(&mut value)?;
} }
// If there are any curly braces inside a `FStringMiddle` token, // If there are any curly braces inside a `FStringMiddle` token,
// then they were escaped (i.e. `{{` or `}}`). This means that // then they were escaped (i.e. `{{` or `}}`). This means that
@ -255,7 +278,7 @@ impl<'a> StringParser<'a> {
while let Some(ch) = self.next_char() { while let Some(ch) = self.next_char() {
match ch { match ch {
'\\' if !self.kind.is_raw() => { '\\' if !self.kind.is_raw() => {
content.push_str(&self.parse_escaped_char()?); self.parse_escaped_char(&mut content)?;
} }
ch => { ch => {
if !ch.is_ascii() { if !ch.is_ascii() {
@ -278,16 +301,26 @@ impl<'a> StringParser<'a> {
} }
fn parse_string(&mut self) -> Result<StringType, LexicalError> { fn parse_string(&mut self) -> Result<StringType, LexicalError> {
let mut value = String::new();
let start_location = self.get_pos(); let start_location = self.get_pos();
while let Some(ch) = self.next_char() { let mut value = String::new();
match ch {
'\\' if !self.kind.is_raw() => { if self.kind.is_raw() {
value.push_str(&self.parse_escaped_char()?); value.push_str(self.skip_bytes(self.rest.len()));
} } else {
ch => value.push(ch), loop {
let Some(escape_idx) = self.rest.find('\\') else {
value.push_str(self.skip_bytes(self.rest.len()));
break;
};
let before_with_slash = self.skip_bytes(escape_idx + 1);
let before = &before_with_slash[..before_with_slash.len() - 1];
value.push_str(before);
self.parse_escaped_char(&mut value)?;
} }
} }
Ok(StringType::Str(StringConstantWithRange { Ok(StringType::Str(StringConstantWithRange {
value: StringConstant { value: StringConstant {
value, value,