Use memchr to speedup newline search on x86 (#3985)

Micha Reiser 2023-04-26 21:15:47 +02:00 committed by GitHub
parent f3e6ddda62
commit e04ef42334
10 changed files with 147 additions and 114 deletions
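
In short: instead of the char-based `str::find(['\n', '\r'])` / `rfind(['\n', '\r'])` searches, newline scanning now goes through `memchr2`/`memrchr2` from the `memchr` crate, which dispatches to SIMD-accelerated byte search on x86. A minimal sketch of the idea (the helper names below are illustrative, not the ones used in the ruff crates):

    // Both helpers return the byte offset of the first '\n' or '\r'.
    // Searching raw bytes is sound because '\n' and '\r' can never occur
    // inside a multi-byte UTF-8 sequence.
    use memchr::memchr2;

    fn find_newline_std(text: &str) -> Option<usize> {
        text.find(['\n', '\r'])
    }

    fn find_newline_memchr(text: &str) -> Option<usize> {
        memchr2(b'\n', b'\r', text.as_bytes())
    }

    fn main() {
        let source = "x = 1\r\ny = 2\n";
        assert_eq!(find_newline_std(source), Some(5));
        assert_eq!(find_newline_memchr(source), Some(5));
    }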

Cargo.lock (generated)

@@ -2204,6 +2204,7 @@ dependencies = [
  "is-macro",
  "itertools",
  "log",
+ "memchr",
  "num-bigint",
  "num-traits",
  "once_cell",

@@ -234,11 +234,12 @@ fn top_of_file_insertion(body: &[Stmt], locator: &Locator, stylist: &Stylist) ->

 #[cfg(test)]
 mod tests {
     use anyhow::Result;
+    use ruff_python_ast::newlines::LineEnding;
     use ruff_text_size::TextSize;
     use rustpython_parser as parser;
     use rustpython_parser::lexer::LexResult;

-    use ruff_python_ast::source_code::{LineEnding, Locator, Stylist};
+    use ruff_python_ast::source_code::{Locator, Stylist};

     use crate::importer::{top_of_file_insertion, Insertion};

@@ -1,6 +1,7 @@
 use crate::fs::{relativize_path, relativize_path_to};
 use crate::message::{Emitter, EmitterContext, Message};
 use crate::registry::AsRule;
+use ruff_python_ast::source_code::SourceLocation;
 use serde::ser::SerializeSeq;
 use serde::{Serialize, Serializer};
 use serde_json::json;
@@ -56,6 +57,9 @@ impl Serialize for SerializedMessages<'_> {
         let mut s = serializer.serialize_seq(Some(self.messages.len()))?;

         for message in self.messages {
+            let start_location = message.compute_start_location();
+            let end_location = message.compute_end_location();
+
             let lines = if self.context.is_jupyter_notebook(message.filename()) {
                 // We can't give a reasonable location for the structured formats,
                 // so we show one that's clearly a fallback
@@ -64,8 +68,6 @@ impl Serialize for SerializedMessages<'_> {
                     "end": 1
                 })
             } else {
-                let start_location = message.compute_start_location();
-                let end_location = message.compute_end_location();
                 json!({
                     "begin": start_location.row,
                     "end": end_location.row
@@ -80,7 +82,7 @@ impl Serialize for SerializedMessages<'_> {
             let value = json!({
                 "description": format!("({}) {}", message.kind.rule().noqa_code(), message.kind.body),
                 "severity": "major",
-                "fingerprint": fingerprint(message),
+                "fingerprint": fingerprint(message, &start_location, &end_location),
                 "location": {
                     "path": path,
                     "lines": lines
@@ -95,10 +97,14 @@ impl Serialize for SerializedMessages<'_> {
 }

 /// Generate a unique fingerprint to identify a violation.
-fn fingerprint(message: &Message) -> String {
+fn fingerprint(
+    message: &Message,
+    start_location: &SourceLocation,
+    end_location: &SourceLocation,
+) -> String {
     let Message {
         kind,
-        range,
+        range: _,
         fix: _fix,
         file,
         noqa_offset: _,
@@ -107,7 +113,8 @@ fn fingerprint(message: &Message) -> String {
     let mut hasher = DefaultHasher::new();

     kind.rule().hash(&mut hasher);
-    range.hash(&mut hasher);
+    start_location.hash(&mut hasher);
+    end_location.hash(&mut hasher);
     file.name().hash(&mut hasher);

     format!("{:x}", hasher.finish())

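Two things change in the emitter above: the start and end `SourceLocation`s are now computed once per message and reused for both the `lines` field and the fingerprint, and the fingerprint hashes those locations rather than the byte `range`. A simplified sketch of the hashing scheme (field types reduced to plain integers; the real code hashes the rule, both locations, and the file name):

    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    #[derive(Hash)]
    struct SourceLocation {
        row: usize,
        column: usize,
    }

    fn fingerprint(rule: &str, start: &SourceLocation, end: &SourceLocation, file: &str) -> String {
        let mut hasher = DefaultHasher::new();
        rule.hash(&mut hasher);
        start.hash(&mut hasher);
        end.hash(&mut hasher);
        file.hash(&mut hasher);
        format!("{:x}", hasher.finish())
    }

    fn main() {
        let start = SourceLocation { row: 3, column: 1 };
        let end = SourceLocation { row: 3, column: 9 };
        println!("{}", fingerprint("F401", &start, &end, "app.py"));
    }
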
@@ -11,7 +11,8 @@ use regex::Regex;
 use ruff_text_size::{TextLen, TextRange, TextSize};

 use ruff_diagnostics::Diagnostic;
-use ruff_python_ast::source_code::{LineEnding, Locator};
+use ruff_python_ast::newlines::LineEnding;
+use ruff_python_ast::source_code::Locator;

 use crate::codes::NoqaCode;
 use crate::registry::{AsRule, Rule, RuleSet};
@@ -511,7 +512,8 @@ mod tests {
     use ruff_text_size::{TextRange, TextSize};

     use ruff_diagnostics::Diagnostic;
-    use ruff_python_ast::source_code::{LineEnding, Locator};
+    use ruff_python_ast::newlines::LineEnding;
+    use ruff_python_ast::source_code::Locator;

     use crate::noqa::{add_noqa_inner, NoqaMapping, NOQA_LINE_REGEX};
     use crate::rules::pycodestyle::rules::AmbiguousVariableName;

@@ -16,6 +16,7 @@ bitflags = { workspace = true }
 is-macro = { workspace = true }
 itertools = { workspace = true }
 log = { workspace = true }
+memchr = "2.5.0"
 num-bigint = { version = "0.4.3" }
 num-traits = { version = "0.2.15" }
 once_cell = { workspace = true }

@@ -1,3 +1,4 @@
+use memchr::{memchr2, memrchr2};
 use ruff_text_size::{TextLen, TextRange, TextSize};
 use std::iter::FusedIterator;
 use std::ops::Deref;
@@ -50,6 +51,30 @@ impl<'a> UniversalNewlineIterator<'a> {
     }
 }

+/// Finds the next newline character. Returns its position and the [`LineEnding`].
+#[inline]
+pub fn find_newline(text: &str) -> Option<(usize, LineEnding)> {
+    let bytes = text.as_bytes();
+    if let Some(position) = memchr2(b'\n', b'\r', bytes) {
+        // SAFETY: memchr guarantees to return valid positions
+        #[allow(unsafe_code)]
+        let newline_character = unsafe { *bytes.get_unchecked(position) };
+
+        let line_ending = match newline_character {
+            // Explicit branch for `\n` as this is the most likely path
+            b'\n' => LineEnding::Lf,
+            // '\r\n'
+            b'\r' if bytes.get(position.saturating_add(1)) == Some(&b'\n') => LineEnding::CrLf,
+            // '\r'
+            _ => LineEnding::Cr,
+        };
+
+        Some((position, line_ending))
+    } else {
+        None
+    }
+}
+
 impl<'a> Iterator for UniversalNewlineIterator<'a> {
     type Item = Line<'a>;
@@ -59,35 +84,25 @@ impl<'a> Iterator for UniversalNewlineIterator<'a> {
             return None;
         }

-        let line = match self.text.find(['\n', '\r']) {
-            // Non-last line
-            Some(line_end) => {
-                let offset: usize = match self.text.as_bytes()[line_end] {
-                    // Explicit branch for `\n` as this is the most likely path
-                    b'\n' => 1,
-                    // '\r\n'
-                    b'\r' if self.text.as_bytes().get(line_end + 1) == Some(&b'\n') => 2,
-                    // '\r'
-                    _ => 1,
-                };
-
-                let (text, remainder) = self.text.split_at(line_end + offset);
-
-                let line = Line {
-                    offset: self.offset,
-                    text,
-                };
-
-                self.text = remainder;
-                self.offset += text.text_len();
-
-                line
-            }
-            // Last line
-            None => Line {
-                offset: self.offset,
-                text: std::mem::take(&mut self.text),
-            },
-        };
+        let line = if let Some((newline_position, line_ending)) = find_newline(self.text) {
+            let (text, remainder) = self.text.split_at(newline_position + line_ending.len());
+
+            let line = Line {
+                offset: self.offset,
+                text,
+            };
+
+            self.text = remainder;
+            self.offset += text.text_len();
+
+            line
+        }
+        // Last line
+        else {
+            Line {
+                offset: self.offset,
+                text: std::mem::take(&mut self.text),
+            }
+        };

         Some(line)
@@ -116,7 +131,7 @@ impl DoubleEndedIterator for UniversalNewlineIterator<'_> {
         // Find the end of the previous line. The previous line is the text up to, but not including
         // the newline character.
-        let line = if let Some(line_end) = haystack.rfind(['\n', '\r']) {
+        let line = if let Some(line_end) = memrchr2(b'\n', b'\r', haystack.as_bytes()) {
             // '\n' or '\r' or '\r\n'
             let (remainder, line) = self.text.split_at(line_end + 1);
             self.text = remainder;
@@ -268,6 +283,58 @@ impl PartialEq<Line<'_>> for &str {
     }
 }

+/// The line ending style used in Python source code.
+/// See <https://docs.python.org/3/reference/lexical_analysis.html#physical-lines>
+#[derive(Debug, PartialEq, Eq, Copy, Clone)]
+pub enum LineEnding {
+    Lf,
+    Cr,
+    CrLf,
+}
+
+impl Default for LineEnding {
+    fn default() -> Self {
+        if cfg!(windows) {
+            LineEnding::CrLf
+        } else {
+            LineEnding::Lf
+        }
+    }
+}
+
+impl LineEnding {
+    pub const fn as_str(&self) -> &'static str {
+        match self {
+            LineEnding::Lf => "\n",
+            LineEnding::CrLf => "\r\n",
+            LineEnding::Cr => "\r",
+        }
+    }
+
+    #[allow(clippy::len_without_is_empty)]
+    pub const fn len(&self) -> usize {
+        match self {
+            LineEnding::Lf | LineEnding::Cr => 1,
+            LineEnding::CrLf => 2,
+        }
+    }
+
+    pub const fn text_len(&self) -> TextSize {
+        match self {
+            LineEnding::Lf | LineEnding::Cr => TextSize::new(1),
+            LineEnding::CrLf => TextSize::new(2),
+        }
+    }
+}
+
+impl Deref for LineEnding {
+    type Target = str;
+
+    fn deref(&self) -> &Self::Target {
+        self.as_str()
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::UniversalNewlineIterator;

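The `find_newline` helper added above reports both where the next line break starts and which `LineEnding` it is, so callers can step over the full terminator without re-inspecting bytes. A usage sketch, assuming the crate is consumed as `ruff_python_ast`:

    use ruff_python_ast::newlines::{find_newline, LineEnding};

    fn main() {
        assert_eq!(find_newline("a\nb"), Some((1, LineEnding::Lf)));
        assert_eq!(find_newline("a\r\nb"), Some((1, LineEnding::CrLf)));
        assert_eq!(find_newline("a\rb"), Some((1, LineEnding::Cr)));
        assert_eq!(find_newline("abc"), None);

        // Offset of the newline byte plus the terminator length gives the
        // start of the next line: "y = 2" begins at byte 7.
        let (offset, ending) = find_newline("x = 1\r\ny = 2").unwrap();
        assert_eq!(offset + ending.len(), 7);
    }
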
@@ -9,9 +9,10 @@ use rustpython_parser::ast::{
 };
 use rustpython_parser::ConversionFlag;

+use crate::newlines::LineEnding;
 use ruff_rustpython::vendor::{bytes, str};

-use crate::source_code::stylist::{Indentation, LineEnding, Quote, Stylist};
+use crate::source_code::stylist::{Indentation, Quote, Stylist};

 mod precedence {
     pub const ASSIGN: u8 = 3;
@@ -1256,9 +1257,10 @@ impl<'a> Generator<'a> {

 #[cfg(test)]
 mod tests {
+    use crate::newlines::LineEnding;
     use rustpython_parser as parser;

-    use crate::source_code::stylist::{Indentation, LineEnding, Quote};
+    use crate::source_code::stylist::{Indentation, Quote};
     use crate::source_code::Generator;

     fn round_trip(contents: &str) -> String {

@@ -1,6 +1,8 @@
 //! Struct used to efficiently slice source code at (row, column) Locations.

+use crate::newlines::find_newline;
 use crate::source_code::{LineIndex, OneIndexed, SourceCode, SourceLocation};
+use memchr::{memchr2, memrchr2};
 use once_cell::unsync::OnceCell;
 use ruff_text_size::{TextLen, TextRange, TextSize};
 use std::ops::Add;
@@ -68,7 +70,8 @@ impl<'a> Locator<'a> {
     /// ## Panics
     /// If `offset` is out of bounds.
     pub fn line_start(&self, offset: TextSize) -> TextSize {
-        if let Some(index) = self.contents[TextRange::up_to(offset)].rfind(['\n', '\r']) {
+        let bytes = self.contents[TextRange::up_to(offset)].as_bytes();
+        if let Some(index) = memrchr2(b'\n', b'\r', bytes) {
             // SAFETY: Safe because `index < offset`
             TextSize::try_from(index).unwrap().add(TextSize::from(1))
         } else {
@@ -101,19 +104,8 @@ impl<'a> Locator<'a> {
     /// If `offset` is passed the end of the content.
     pub fn full_line_end(&self, offset: TextSize) -> TextSize {
         let slice = &self.contents[usize::from(offset)..];
-        if let Some(index) = slice.find(['\n', '\r']) {
-            let bytes = slice.as_bytes();
-
-            // `\r\n`
-            let relative_offset = if bytes[index] == b'\r' && bytes.get(index + 1) == Some(&b'\n') {
-                TextSize::try_from(index + 2).unwrap()
-            }
-            // `\r` or `\n`
-            else {
-                TextSize::try_from(index + 1).unwrap()
-            };
-
-            offset.add(relative_offset)
+        if let Some((index, line_ending)) = find_newline(slice) {
+            offset + TextSize::try_from(index).unwrap() + line_ending.text_len()
         } else {
             self.contents.text_len()
         }
@@ -139,7 +131,7 @@ impl<'a> Locator<'a> {
     /// If `offset` is passed the end of the content.
     pub fn line_end(&self, offset: TextSize) -> TextSize {
         let slice = &self.contents[usize::from(offset)..];
-        if let Some(index) = slice.find(['\n', '\r']) {
+        if let Some(index) = memchr2(b'\n', b'\r', slice.as_bytes()) {
             offset + TextSize::try_from(index).unwrap()
         } else {
             self.contents.text_len()

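With `find_newline` in place, `line_end` stops at the newline byte itself while `full_line_end` advances past the whole terminator via `LineEnding::text_len`, so a CRLF line ending costs two bytes instead of one. A sketch of the resulting semantics (`Locator::new` is assumed here; only methods visible in the hunks above are exercised):

    use ruff_python_ast::source_code::Locator;
    use ruff_text_size::TextSize;

    fn main() {
        // "x = 1\r\n" occupies bytes 0..7; "y = 2" starts at byte 7.
        let locator = Locator::new("x = 1\r\ny = 2");

        // line_end excludes the terminator ...
        assert_eq!(locator.line_end(TextSize::new(0)), TextSize::new(5));
        // ... while full_line_end includes the full "\r\n".
        assert_eq!(locator.full_line_end(TextSize::new(0)), TextSize::new(7));
        // line_start scans backwards to just after the previous newline.
        assert_eq!(locator.line_start(TextSize::new(9)), TextSize::new(7));
    }
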
@@ -15,8 +15,7 @@ use rustpython_parser::{lexer, Mode, ParseError};
 use serde::{Deserialize, Serialize};
 use std::fmt::{Debug, Formatter};
 use std::sync::Arc;

-pub use stylist::{LineEnding, Stylist};
+pub use stylist::Stylist;

 /// Run round-trip source code generation on a given Python code.
 pub fn round_trip(code: &str, source_path: &str) -> Result<String, ParseError> {

@@ -7,6 +7,7 @@ use once_cell::unsync::OnceCell;
 use rustpython_parser::lexer::LexResult;
 use rustpython_parser::Tok;

+use crate::newlines::{find_newline, LineEnding};
 use ruff_rustpython::vendor;

 use crate::source_code::Locator;
@@ -29,9 +30,12 @@ impl<'a> Stylist<'a> {
     }

     pub fn line_ending(&'a self) -> LineEnding {
-        *self
-            .line_ending
-            .get_or_init(|| detect_line_ending(self.locator.contents()).unwrap_or_default())
+        *self.line_ending.get_or_init(|| {
+            let contents = self.locator.contents();
+            find_newline(contents)
+                .map(|(_, ending)| ending)
+                .unwrap_or_default()
+        })
     }

     pub fn from_tokens(tokens: &[LexResult], locator: &'a Locator<'a>) -> Self {
@@ -158,65 +162,13 @@ impl Deref for Indentation {
     }
 }

-/// The line ending style used in Python source code.
-/// See <https://docs.python.org/3/reference/lexical_analysis.html#physical-lines>
-#[derive(Debug, PartialEq, Eq, Copy, Clone)]
-pub enum LineEnding {
-    Lf,
-    Cr,
-    CrLf,
-}
-
-impl Default for LineEnding {
-    fn default() -> Self {
-        if cfg!(windows) {
-            LineEnding::CrLf
-        } else {
-            LineEnding::Lf
-        }
-    }
-}
-
-impl LineEnding {
-    pub const fn as_str(&self) -> &'static str {
-        match self {
-            LineEnding::CrLf => "\r\n",
-            LineEnding::Lf => "\n",
-            LineEnding::Cr => "\r",
-        }
-    }
-}
-
-impl Deref for LineEnding {
-    type Target = str;
-
-    fn deref(&self) -> &Self::Target {
-        self.as_str()
-    }
-}
-
-/// Detect the line ending style of the given contents.
-fn detect_line_ending(contents: &str) -> Option<LineEnding> {
-    if let Some(position) = contents.find(['\n', '\r']) {
-        let bytes = contents.as_bytes();
-        if bytes[position] == b'\n' {
-            Some(LineEnding::Lf)
-        } else if bytes.get(position.saturating_add(1)) == Some(&b'\n') {
-            Some(LineEnding::CrLf)
-        } else {
-            Some(LineEnding::Cr)
-        }
-    } else {
-        None
-    }
-}
-
 #[cfg(test)]
 mod tests {
+    use crate::newlines::{find_newline, LineEnding};
     use rustpython_parser::lexer::lex;
     use rustpython_parser::Mode;

-    use crate::source_code::stylist::{detect_line_ending, Indentation, LineEnding, Quote};
+    use crate::source_code::stylist::{Indentation, Quote};
     use crate::source_code::{Locator, Stylist};

     #[test]
@@ -354,15 +306,24 @@ a = "v"
     #[test]
     fn line_ending() {
         let contents = "x = 1";
-        assert_eq!(detect_line_ending(contents), None);
+        assert_eq!(find_newline(contents).map(|(_, ending)| ending), None);

         let contents = "x = 1\n";
-        assert_eq!(detect_line_ending(contents), Some(LineEnding::Lf));
+        assert_eq!(
+            find_newline(contents).map(|(_, ending)| ending),
+            Some(LineEnding::Lf)
+        );

         let contents = "x = 1\r";
-        assert_eq!(detect_line_ending(contents), Some(LineEnding::Cr));
+        assert_eq!(
+            find_newline(contents).map(|(_, ending)| ending),
+            Some(LineEnding::Cr)
+        );

         let contents = "x = 1\r\n";
-        assert_eq!(detect_line_ending(contents), Some(LineEnding::CrLf));
+        assert_eq!(
+            find_newline(contents).map(|(_, ending)| ending),
+            Some(LineEnding::CrLf)
+        );
     }
 }
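
Finally, `Stylist::line_ending` now defers detection to `find_newline` and caches the result in a `OnceCell`, falling back to the platform default for sources that contain no newline at all. A standalone sketch of that caching pattern (`LazyLineEnding` is an ad hoc stand-in for the cached field inside `Stylist`):

    use once_cell::unsync::OnceCell;
    use ruff_python_ast::newlines::{find_newline, LineEnding};

    struct LazyLineEnding<'a> {
        contents: &'a str,
        cell: OnceCell<LineEnding>,
    }

    impl<'a> LazyLineEnding<'a> {
        /// Detect once, cache, and fall back to the platform default
        /// when the source contains no newline at all.
        fn get(&self) -> LineEnding {
            *self.cell.get_or_init(|| {
                find_newline(self.contents)
                    .map(|(_, ending)| ending)
                    .unwrap_or_default()
            })
        }
    }

    fn main() {
        let lazy = LazyLineEnding {
            contents: "x = 1\r\ny = 2\r\n",
            cell: OnceCell::new(),
        };
        assert_eq!(lazy.get(), LineEnding::CrLf);
        // The second call reuses the cached value instead of rescanning.
        assert_eq!(lazy.get(), LineEnding::CrLf);
    }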