Avoid owned token data in sequence sorting (#11533)

## Summary

This PR updates the sequence sorting (`RUF022` and `RUF023`) to avoid
using the owned data from the string token. Instead, we will directly
use the reference to the data on the AST. This does introduce a lot of
lifetimes but that's required.

The main motivation for this is to allow removing the `lex_starts_at`
usage easily.

### Alternatives

1. Extract the raw string content (stripping the prefix and quotes)
using the `Locator` and use that for comparison
2. Build up an
[`IndexVec`](3e30962077/crates/ruff_index/src/vec.rs)
and use the newtype index in place of the string value itself. This also
does require lifetimes so we might as well just use the method in this
PR.

## Test Plan

`cargo insta test` and no ecosystem changes
This commit is contained in:
Dhruv Manilawala 2024-05-27 05:50:20 +05:30 committed by GitHub
parent b5d147d219
commit 99c400000a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 46 additions and 33 deletions

View file

@ -318,13 +318,13 @@ impl<'a> SortClassification<'a> {
// An instance of this struct encapsulates an analysis // An instance of this struct encapsulates an analysis
/// of a multiline Python tuple/list that represents an /// of a multiline Python tuple/list that represents an
/// `__all__`/`__slots__`/etc. definition or augmentation. /// `__all__`/`__slots__`/etc. definition or augmentation.
pub(super) struct MultilineStringSequenceValue { pub(super) struct MultilineStringSequenceValue<'a> {
items: Vec<StringSequenceItem>, items: Vec<StringSequenceItem<'a>>,
range: TextRange, range: TextRange,
ends_with_trailing_comma: bool, ends_with_trailing_comma: bool,
} }
impl MultilineStringSequenceValue { impl<'a> MultilineStringSequenceValue<'a> {
pub(super) fn len(&self) -> usize { pub(super) fn len(&self) -> usize {
self.items.len() self.items.len()
} }
@ -336,14 +336,15 @@ impl MultilineStringSequenceValue {
range: TextRange, range: TextRange,
kind: SequenceKind, kind: SequenceKind,
locator: &Locator, locator: &Locator,
) -> Option<MultilineStringSequenceValue> { string_items: &[&'a str],
) -> Option<MultilineStringSequenceValue<'a>> {
// Parse the multiline string sequence using the raw tokens. // Parse the multiline string sequence using the raw tokens.
// See the docs for `collect_string_sequence_lines()` for why we have to // See the docs for `collect_string_sequence_lines()` for why we have to
// use the raw tokens, rather than just the AST, to do this parsing. // use the raw tokens, rather than just the AST, to do this parsing.
// //
// Step (1). Start by collecting information on each line individually: // Step (1). Start by collecting information on each line individually:
let (lines, ends_with_trailing_comma) = let (lines, ends_with_trailing_comma) =
collect_string_sequence_lines(range, kind, locator)?; collect_string_sequence_lines(range, kind, locator, string_items)?;
// Step (2). Group lines together into sortable "items": // Step (2). Group lines together into sortable "items":
// - Any "item" contains a single element of the list/tuple // - Any "item" contains a single element of the list/tuple
@ -447,7 +448,7 @@ impl MultilineStringSequenceValue {
.map_or(true, |tok| tok.kind() != SimpleTokenKind::Comma); .map_or(true, |tok| tok.kind() != SimpleTokenKind::Comma);
self.items self.items
.sort_by(|a, b| sorting_style.compare(&a.value, &b.value)); .sort_by(|a, b| sorting_style.compare(a.value, b.value));
let joined_items = join_multiline_string_sequence_items( let joined_items = join_multiline_string_sequence_items(
&self.items, &self.items,
locator, locator,
@ -460,7 +461,7 @@ impl MultilineStringSequenceValue {
} }
} }
impl Ranged for MultilineStringSequenceValue { impl Ranged for MultilineStringSequenceValue<'_> {
fn range(&self) -> TextRange { fn range(&self) -> TextRange {
self.range self.range
} }
@ -484,11 +485,12 @@ impl Ranged for MultilineStringSequenceValue {
/// stage if we're to sort items without doing unnecessary /// stage if we're to sort items without doing unnecessary
/// brutality to the comments and pre-existing style choices /// brutality to the comments and pre-existing style choices
/// in the original source code. /// in the original source code.
fn collect_string_sequence_lines( fn collect_string_sequence_lines<'a>(
range: TextRange, range: TextRange,
kind: SequenceKind, kind: SequenceKind,
locator: &Locator, locator: &Locator,
) -> Option<(Vec<StringSequenceLine>, bool)> { string_items: &[&'a str],
) -> Option<(Vec<StringSequenceLine<'a>>, bool)> {
// These first two variables are used for keeping track of state // These first two variables are used for keeping track of state
// regarding the entirety of the string sequence... // regarding the entirety of the string sequence...
let mut ends_with_trailing_comma = false; let mut ends_with_trailing_comma = false;
@ -496,6 +498,8 @@ fn collect_string_sequence_lines(
// ... all state regarding a single line of a string sequence // ... all state regarding a single line of a string sequence
// is encapsulated in this variable // is encapsulated in this variable
let mut line_state = LineState::default(); let mut line_state = LineState::default();
// An iterator over the string values in the sequence.
let mut string_items_iter = string_items.iter();
// `lex_starts_at()` gives us absolute ranges rather than relative ranges, // `lex_starts_at()` gives us absolute ranges rather than relative ranges,
// but (surprisingly) we still need to pass in the slice of code we want it to lex, // but (surprisingly) we still need to pass in the slice of code we want it to lex,
@ -518,8 +522,11 @@ fn collect_string_sequence_lines(
Tok::Comment(_) => { Tok::Comment(_) => {
line_state.visit_comment_token(subrange); line_state.visit_comment_token(subrange);
} }
Tok::String { value, .. } => { Tok::String { .. } => {
line_state.visit_string_token(value, subrange); let Some(string_value) = string_items_iter.next() else {
unreachable!("Expected the number of string tokens to be equal to the number of string items in the sequence");
};
line_state.visit_string_token(string_value, subrange);
ends_with_trailing_comma = false; ends_with_trailing_comma = false;
} }
Tok::Comma => { Tok::Comma => {
@ -558,15 +565,15 @@ fn collect_string_sequence_lines(
/// `into_string_sequence_line()` is called, which consumes /// `into_string_sequence_line()` is called, which consumes
/// `self` and produces the classification for the line. /// `self` and produces the classification for the line.
#[derive(Debug, Default)] #[derive(Debug, Default)]
struct LineState { struct LineState<'a> {
first_item_in_line: Option<(Box<str>, TextRange)>, first_item_in_line: Option<(&'a str, TextRange)>,
following_items_in_line: Vec<(Box<str>, TextRange)>, following_items_in_line: Vec<(&'a str, TextRange)>,
comment_range_start: Option<TextSize>, comment_range_start: Option<TextSize>,
comment_in_line: Option<TextRange>, comment_in_line: Option<TextRange>,
} }
impl LineState { impl<'a> LineState<'a> {
fn visit_string_token(&mut self, token_value: Box<str>, token_range: TextRange) { fn visit_string_token(&mut self, token_value: &'a str, token_range: TextRange) {
if self.first_item_in_line.is_none() { if self.first_item_in_line.is_none() {
self.first_item_in_line = Some((token_value, token_range)); self.first_item_in_line = Some((token_value, token_range));
} else { } else {
@ -600,7 +607,7 @@ impl LineState {
} }
} }
fn into_string_sequence_line(self) -> StringSequenceLine { fn into_string_sequence_line(self) -> StringSequenceLine<'a> {
if let Some(first_item) = self.first_item_in_line { if let Some(first_item) = self.first_item_in_line {
StringSequenceLine::OneOrMoreItems(LineWithItems { StringSequenceLine::OneOrMoreItems(LineWithItems {
first_item, first_item,
@ -627,17 +634,17 @@ struct LineWithJustAComment(TextRange);
/// 1 element of the sequence. The line may contain > 1 element of the /// 1 element of the sequence. The line may contain > 1 element of the
/// sequence, and may also have a trailing comment after the element(s). /// sequence, and may also have a trailing comment after the element(s).
#[derive(Debug)] #[derive(Debug)]
struct LineWithItems { struct LineWithItems<'a> {
// For elements in the list, we keep track of the value of the // For elements in the list, we keep track of the value of the
// value of the element as well as the source-code range of the element. // value of the element as well as the source-code range of the element.
// (We need to know the actual value so that we can sort the items.) // (We need to know the actual value so that we can sort the items.)
first_item: (Box<str>, TextRange), first_item: (&'a str, TextRange),
following_items: Vec<(Box<str>, TextRange)>, following_items: Vec<(&'a str, TextRange)>,
// For comments, we only need to keep track of the source-code range. // For comments, we only need to keep track of the source-code range.
trailing_comment_range: Option<TextRange>, trailing_comment_range: Option<TextRange>,
} }
impl LineWithItems { impl LineWithItems<'_> {
fn num_items(&self) -> usize { fn num_items(&self) -> usize {
self.following_items.len() + 1 self.following_items.len() + 1
} }
@ -651,9 +658,9 @@ impl LineWithItems {
/// and may also have a trailing comment. /// and may also have a trailing comment.
/// - An entirely empty line. /// - An entirely empty line.
#[derive(Debug)] #[derive(Debug)]
enum StringSequenceLine { enum StringSequenceLine<'a> {
JustAComment(LineWithJustAComment), JustAComment(LineWithJustAComment),
OneOrMoreItems(LineWithItems), OneOrMoreItems(LineWithItems<'a>),
Empty, Empty,
} }
@ -667,11 +674,11 @@ enum StringSequenceLine {
/// Note that any comments following the last item are discarded here, /// Note that any comments following the last item are discarded here,
/// but that doesn't matter: we add them back in `into_sorted_source_code()` /// but that doesn't matter: we add them back in `into_sorted_source_code()`
/// as part of the `postlude` (see comments in that function) /// as part of the `postlude` (see comments in that function)
fn collect_string_sequence_items( fn collect_string_sequence_items<'a>(
lines: Vec<StringSequenceLine>, lines: Vec<StringSequenceLine<'a>>,
dunder_all_range: TextRange, dunder_all_range: TextRange,
locator: &Locator, locator: &Locator,
) -> Vec<StringSequenceItem> { ) -> Vec<StringSequenceItem<'a>> {
let mut all_items = Vec::with_capacity(match lines.as_slice() { let mut all_items = Vec::with_capacity(match lines.as_slice() {
[StringSequenceLine::OneOrMoreItems(single)] => single.num_items(), [StringSequenceLine::OneOrMoreItems(single)] => single.num_items(),
_ => lines.len(), _ => lines.len(),
@ -752,8 +759,8 @@ fn collect_string_sequence_items(
/// of `# comment1` does not form a contiguous range with the /// of `# comment1` does not form a contiguous range with the
/// source-code range of `"a"`. /// source-code range of `"a"`.
#[derive(Debug)] #[derive(Debug)]
struct StringSequenceItem { struct StringSequenceItem<'a> {
value: Box<str>, value: &'a str,
preceding_comment_ranges: Vec<TextRange>, preceding_comment_ranges: Vec<TextRange>,
element_range: TextRange, element_range: TextRange,
// total_range incorporates the ranges of preceding comments // total_range incorporates the ranges of preceding comments
@ -764,9 +771,9 @@ struct StringSequenceItem {
end_of_line_comments: Option<TextRange>, end_of_line_comments: Option<TextRange>,
} }
impl StringSequenceItem { impl<'a> StringSequenceItem<'a> {
fn new( fn new(
value: Box<str>, value: &'a str,
preceding_comment_ranges: Vec<TextRange>, preceding_comment_ranges: Vec<TextRange>,
element_range: TextRange, element_range: TextRange,
end_of_line_comments: Option<TextRange>, end_of_line_comments: Option<TextRange>,
@ -787,12 +794,12 @@ impl StringSequenceItem {
} }
} }
fn with_no_comments(value: Box<str>, element_range: TextRange) -> Self { fn with_no_comments(value: &'a str, element_range: TextRange) -> Self {
Self::new(value, vec![], element_range, None) Self::new(value, vec![], element_range, None)
} }
} }
impl Ranged for StringSequenceItem { impl Ranged for StringSequenceItem<'_> {
fn range(&self) -> TextRange { fn range(&self) -> TextRange {
self.total_range self.total_range
} }

View file

@ -212,7 +212,12 @@ fn create_fix(
// bare minimum of token-processing for single-line `__all__` // bare minimum of token-processing for single-line `__all__`
// definitions: // definitions:
if is_multiline { if is_multiline {
let value = MultilineStringSequenceValue::from_source_range(range, kind, locator)?; let value = MultilineStringSequenceValue::from_source_range(
range,
kind,
locator,
string_items,
)?;
assert_eq!(value.len(), elts.len()); assert_eq!(value.len(), elts.len());
value.into_sorted_source_code(SORTING_STYLE, locator, checker.stylist()) value.into_sorted_source_code(SORTING_STYLE, locator, checker.stylist())
} else { } else {

View file

@ -210,6 +210,7 @@ impl<'a> StringLiteralDisplay<'a> {
self.range(), self.range(),
*sequence_kind, *sequence_kind,
locator, locator,
elements,
)?; )?;
assert_eq!(analyzed_sequence.len(), self.elts.len()); assert_eq!(analyzed_sequence.len(), self.elts.len());
analyzed_sequence.into_sorted_source_code(SORTING_STYLE, locator, checker.stylist()) analyzed_sequence.into_sorted_source_code(SORTING_STYLE, locator, checker.stylist())