Add Comments data structure (#4641)

This commit is contained in:
Micha Reiser 2023-05-30 10:54:55 +02:00 committed by GitHub
parent 6146b75dd0
commit 84a5584888
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 698 additions and 0 deletions

View file

@ -0,0 +1,186 @@
use crate::comments::node_key::NodeRefEqualityKey;
use crate::comments::{CommentsMap, SourceComment};
use ruff_formatter::SourceCode;
use std::fmt::{Debug, Formatter};
/// Debug adapter for a [`SourceComment`] that resolves the comment's slice against the source
/// code so that the printed representation contains the comment's text.
pub(crate) struct DebugComment<'a> {
    comment: &'a SourceComment,
    source_code: SourceCode<'a>,
}

impl<'a> DebugComment<'a> {
    /// Pairs `comment` with the `source_code` used to look up its text.
    pub(super) fn new(comment: &'a SourceComment, source_code: SourceCode<'a>) -> Self {
        DebugComment {
            comment,
            source_code,
        }
    }
}

impl Debug for DebugComment<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let text = self.comment.slice.text(self.source_code);

        let mut builder = f.debug_struct("SourceComment");
        builder.field("text", &text);

        // The `formatted` flag is only printed in builds with debug assertions enabled.
        #[cfg(debug_assertions)]
        builder.field("formatted", &self.comment.formatted.get());

        builder.finish()
    }
}
/// Pretty-printed debug representation of [`Comments`].
///
/// Prints a map with one entry per node key; each value lists that node's leading, dangling,
/// and trailing comments. Entries appear in the order yielded by the map's `keys()` iterator.
pub(crate) struct DebugComments<'a> {
    comments: &'a CommentsMap<'a>,
    source_code: SourceCode<'a>,
}

impl<'a> DebugComments<'a> {
    /// Creates the debug adapter for `comments`, resolving comment text through `source_code`.
    pub(super) fn new(comments: &'a CommentsMap, source_code: SourceCode<'a>) -> Self {
        DebugComments {
            comments,
            source_code,
        }
    }
}

impl Debug for DebugComments<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Lazily pair every key with an adapter that prints that node's comments.
        let entries = self.comments.keys().map(|key| {
            let node_comments = DebugNodeComments {
                comments: self.comments,
                source_code: self.source_code,
                key: *key,
            };
            (key, node_comments)
        });

        f.debug_map().entries(entries).finish()
    }
}
/// Debug representation of the comments stored for a single node key, grouped into the
/// `"leading"`, `"dangling"`, and `"trailing"` parts (printed in that order).
struct DebugNodeComments<'a> {
    comments: &'a CommentsMap<'a>,
    source_code: SourceCode<'a>,
    key: NodeRefEqualityKey<'a>,
}

impl Debug for DebugNodeComments<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let source_code = self.source_code;

        let leading = DebugNodeCommentSlice {
            node_comments: self.comments.leading(&self.key),
            source_code,
        };
        let dangling = DebugNodeCommentSlice {
            node_comments: self.comments.dangling(&self.key),
            source_code,
        };
        let trailing = DebugNodeCommentSlice {
            node_comments: self.comments.trailing(&self.key),
            source_code,
        };

        let mut parts = f.debug_map();
        parts.entry(&"leading", &leading);
        parts.entry(&"dangling", &dangling);
        parts.entry(&"trailing", &trailing);
        parts.finish()
    }
}
/// Debug representation of a slice of comments that prints each comment through
/// [`SourceComment::debug`].
struct DebugNodeCommentSlice<'a> {
    node_comments: &'a [SourceComment],
    source_code: SourceCode<'a>,
}

impl Debug for DebugNodeCommentSlice<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let comments = self
            .node_comments
            .iter()
            .map(|comment| comment.debug(self.source_code));

        f.debug_list().entries(comments).finish()
    }
}
#[cfg(test)]
mod tests {
use crate::comments::map::MultiMap;
use crate::comments::node_key::NodeRefEqualityKey;
use crate::comments::{node_key, Comments, CommentsData};
use crate::comments::{CommentsMap, SourceComment};
use insta::assert_debug_snapshot;
use ruff_formatter::SourceCode;
use ruff_python_ast::node::AnyNode;
use ruff_python_ast::source_code;
use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::ast::{StmtBreak, StmtContinue};
use std::cell::Cell;
use std::rc::Rc;
/// Builds a `Comments` instance by hand (without parsing) and snapshots its debug output.
///
/// NOTE(review): `SourceComment::formatted` is declared under `#[cfg(debug_assertions)]`, so the
/// struct literals below presumably only compile when debug assertions are enabled — confirm
/// before running the tests in release mode.
#[test]
fn debug() {
// The nodes are constructed directly; their ranges are left at the default and are unrelated
// to the positions of the statements in `source`.
let continue_statement = AnyNode::from(StmtContinue {
range: TextRange::default(),
});
let break_statement = AnyNode::from(StmtBreak {
range: TextRange::default(),
});
let source = r#"# leading comment
continue; # trailing
# break leading
break;
"#;
let source_code = SourceCode::new(source);
let mut comments_map: CommentsMap = MultiMap::new();
// `# leading comment`: 17 bytes starting at offset 0.
comments_map.push_leading(
continue_statement.as_ref().into(),
SourceComment {
slice: source_code.slice(TextRange::at(TextSize::new(0), TextSize::new(17))),
formatted: Cell::new(false),
},
);
// `# trailing`: 10 bytes starting at offset 28 (18 bytes for the first line including its
// newline, plus 10 bytes for `continue; `).
comments_map.push_trailing(
continue_statement.as_ref().into(),
SourceComment {
slice: source_code.slice(TextRange::at(TextSize::new(28), TextSize::new(10))),
formatted: Cell::new(false),
},
);
// `# break leading`: 15 bytes starting at offset 39 (the start of the third line).
comments_map.push_leading(
break_statement.as_ref().into(),
SourceComment {
slice: source_code.slice(TextRange::at(TextSize::new(39), TextSize::new(15))),
formatted: Cell::new(false),
},
);
let comments = Comments {
data: Rc::new(CommentsData {
comments: comments_map,
}),
};
assert_debug_snapshot!(comments.debug(source_code));
}
}

View file

@ -209,6 +209,12 @@ impl<K: std::hash::Hash + Eq, V> MultiMap<K, V> {
}
}
/// Returns an iterator over the keys of this map's index.
pub fn keys(&self) -> Keys<'_, K> {
    let inner = self.index.keys();
    Keys { inner }
}
/// Returns the *leading* parts of `key` in insertion-order.
pub fn leading(&self, key: &K) -> &[V] {
match self.index.get(key) {
@ -759,6 +765,26 @@ impl PartIndex {
}
}
/// Iterator over the keys of a comments multi map.
///
/// Thin wrapper around the key iterator of the underlying hash map; `next` and `size_hint`
/// are forwarded to it unchanged.
pub struct Keys<'a, K> {
    inner: std::collections::hash_map::Keys<'a, K, Entry>,
}

impl<'a, K> Iterator for Keys<'a, K> {
    type Item = &'a K;

    fn next(&mut self) -> Option<Self::Item> {
        Iterator::next(&mut self.inner)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        Iterator::size_hint(&self.inner)
    }
}

impl<K> ExactSizeIterator for Keys<'_, K> {}

impl<K> FusedIterator for Keys<'_, K> {}
#[cfg(test)]
mod tests {
use crate::comments::map::MultiMap;

View file

@ -1,2 +1,274 @@
#![allow(unused, unreachable_pub)] // TODO(micha): Remove after using the new comments infrastructure in the formatter.
//! Types for extracting and representing comments of a syntax tree.
//!
//! Most programming languages support comments allowing programmers to document their programs.
//! Comments are different from other syntax because programming languages allow comments in almost any position,
//! giving programmers great flexibility on where they can write comments:
//!
//! ```javascript
//! /**
//! * Documentation comment
//! */
//! async /* comment */ function Test () // line comment
//! {/*inline*/}
//! ```
//!
//! This flexibility makes formatting comments challenging because:
//! * The formatter must consistently place comments so that re-formatting the output yields the same result,
//! and does not create invalid syntax (line comments).
//! * It is essential that formatters place comments close to the syntax the programmer intended to document.
//! However, the lack of rules regarding where comments are allowed and what syntax they document requires
//! the use of heuristics to infer the documented syntax.
//!
//! This module tries to strike a balance between placing comments as closely as possible to their source location
//! and reducing the complexity of formatting comments. It does so by associating comments per node rather than a token.
//! This greatly reduces the combinations of possible comment positions, but turns out to be, in practice,
//! sufficiently precise to keep comments close to their source location.
//!
//! Luckily, Python doesn't support inline comments, which simplifies the problem significantly.
//!
//! ## Node comments
//!
//! Comments are associated per node but get further distinguished on their location related to that node:
//!
//! ### Leading Comments
//!
//! A comment at the start of a node
//!
//! ```python
//! # Leading comment of the statement
//! print("test");
//!
//! [ # Leading comment of a
//! a
//! ];
//! ```
//!
//! ### Dangling Comments
//!
//! A comment that is neither at the start nor the end of a node.
//!
//! ```python
//! [
//! # I'm between two brackets. There are no nodes
//! ];
//! ```
//!
//! ### Trailing Comments
//!
//! A comment at the end of a node.
//!
//! ```python
//! [
//! a, # trailing comment of a
//! b, c
//! ];
//! ```
//!
//! ## Limitations
//! Limiting the placement of comments to leading, dangling, or trailing node comments reduces complexity inside the formatter,
//! but it also means that the positions where the formatter can place comments depend on the AST structure.
//!
//! For example, *`RustPython`* doesn't create a node for the `/` operator separating positional only arguments from the other arguments.
//!
//! ```python
//! def test(
//! a,
//! /, # The following arguments are positional or named arguments
//! b
//! ):
//! pass
//! ```
//!
//! Because *`RustPython`* doesn't create a Node for the `/` argument, it is impossible to associate any
//! comments with it. Meaning, the default behaviour is to associate the `# The following ...` comment
//! with the `b` argument, which is incorrect. This limitation can be worked around by implementing
//! a custom rule to associate comments for `/` as *dangling comments* of the `Arguments` node and then
//! implementing custom formatting inside of the arguments formatter.
//!
//! If the need ever arises to distinguish two *dangling comments* in the formatting logic,
//! it is possible to add an additional optional label to [`SourceComment`].
use std::cell::Cell;
use std::fmt::{Debug, Formatter};
use std::rc::Rc;
mod debug;
mod map;
mod node_key;
use crate::comments::debug::{DebugComment, DebugComments};
use crate::comments::map::MultiMap;
use crate::comments::node_key::NodeRefEqualityKey;
use ruff_formatter::{SourceCode, SourceCodeSlice};
use ruff_python_ast::node::AnyNodeRef;
/// A comment in the source document.
#[derive(Debug, Clone)]
pub(crate) struct SourceComment {
    /// The location of the comment in the source document.
    pub(super) slice: SourceCodeSlice,

    /// Whether the comment has been formatted or not.
    ///
    /// Only present in builds with debug assertions enabled.
    #[cfg(debug_assertions)]
    pub(super) formatted: Cell<bool>,
}

impl SourceComment {
    /// Returns the location of the comment in the original source code.
    /// Allows retrieving the text of the comment.
    pub(crate) fn slice(&self) -> &SourceCodeSlice {
        &self.slice
    }

    /// Marks the comment as formatted.
    ///
    /// This is a no-op when debug assertions are disabled because the `formatted` flag is not
    /// compiled into the struct in that configuration.
    // Visibility matches the debug-assertions variant below so that both configurations expose
    // the same API.
    #[cfg(not(debug_assertions))]
    #[inline(always)]
    pub(crate) fn mark_formatted(&self) {}

    /// Marks the comment as formatted.
    #[cfg(debug_assertions)]
    pub(crate) fn mark_formatted(&self) {
        self.formatted.set(true);
    }
}

impl SourceComment {
    /// Returns a nice debug representation that prints the source code for every comment (and not just the range).
    pub(crate) fn debug<'a>(&'a self, source_code: SourceCode<'a>) -> DebugComment<'a> {
        DebugComment::new(self, source_code)
    }
}
/// Multi map storing the [`SourceComment`]s per node, keyed by [`NodeRefEqualityKey`].
type CommentsMap<'a> = MultiMap<NodeRefEqualityKey<'a>, SourceComment>;
/// The comments of a syntax tree stored by node.
///
/// Cloning `comments` is cheap as it only involves bumping a reference counter.
#[derive(Clone, Default)]
pub(crate) struct Comments<'a> {
/// The implementation uses an [Rc] so that [Comments] has a lifetime independent from the [crate::Formatter].
/// Independent lifetimes are necessary to support the use case where a [formattable object](crate::Format)
/// iterates over all comments, and writes them into the [crate::Formatter] (mutably borrowing the [crate::Formatter] and in turn its context).
///
/// ```block
/// for leading in f.context().comments().leading_comments(node) {
///                ^
///                |- Borrows comments
///   write!(f, [comment(leading.piece.text())])?;
///          ^
///          |- Mutably borrows the formatter, state, context, and comments (if comments aren't cloned)
/// }
/// ```
///
/// The use of an `Rc` solves this problem because we can cheaply clone `comments` before iterating.
///
/// ```block
/// let comments = f.context().comments().clone();
/// for leading in comments.leading_comments(node) {
///     write!(f, [comment(leading.piece.text())])?;
/// }
/// ```
data: Rc<CommentsData<'a>>,
}
impl<'a> Comments<'a> {
    /// Returns `true` if the given `node` has any comments (delegates to the map's `has` lookup).
    #[inline]
    pub(crate) fn has_comments(&self, node: AnyNodeRef) -> bool {
        self.data.comments.has(&NodeRefEqualityKey::from_ref(node))
    }

    /// Returns `true` if the given `node` has any [leading comments](self#leading-comments).
    #[inline]
    pub(crate) fn has_leading_comments(&self, node: AnyNodeRef) -> bool {
        !self.leading_comments(node).is_empty()
    }

    /// Returns the `node`'s [leading comments](self#leading-comments).
    #[inline]
    pub(crate) fn leading_comments(&self, node: AnyNodeRef<'a>) -> &[SourceComment] {
        self.data
            .comments
            .leading(&NodeRefEqualityKey::from_ref(node))
    }

    /// Returns `true` if node has any [dangling comments](self#dangling-comments).
    #[inline]
    pub(crate) fn has_dangling_comments(&self, node: AnyNodeRef<'a>) -> bool {
        !self.dangling_comments(node).is_empty()
    }

    /// Returns the [dangling comments](self#dangling-comments) of `node`
    #[inline]
    pub(crate) fn dangling_comments(&self, node: AnyNodeRef<'a>) -> &[SourceComment] {
        self.data
            .comments
            .dangling(&NodeRefEqualityKey::from_ref(node))
    }

    /// Returns the `node`'s [trailing comments](self#trailing-comments).
    #[inline]
    pub(crate) fn trailing_comments(&self, node: AnyNodeRef<'a>) -> &[SourceComment] {
        self.data
            .comments
            .trailing(&NodeRefEqualityKey::from_ref(node))
    }

    /// Returns `true` if the given `node` has any [trailing comments](self#trailing-comments).
    #[inline]
    pub(crate) fn has_trailing_comments(&self, node: AnyNodeRef) -> bool {
        !self.trailing_comments(node).is_empty()
    }

    /// Returns an iterator over the [leading](self#leading-comments) and [trailing comments](self#trailing-comments) of `node`.
    ///
    /// All leading comments are yielded before the trailing comments.
    pub(crate) fn leading_trailing_comments(
        &self,
        node: AnyNodeRef<'a>,
    ) -> impl Iterator<Item = &SourceComment> {
        self.leading_comments(node)
            .iter()
            .chain(self.trailing_comments(node).iter())
    }

    /// Returns an iterator over the [leading](self#leading-comments), [dangling](self#dangling-comments), and [trailing](self#trailing-comments) comments of `node`.
    pub(crate) fn leading_dangling_trailing_comments(
        &self,
        node: AnyNodeRef<'a>,
    ) -> impl Iterator<Item = &SourceComment> {
        self.data
            .comments
            .parts(&NodeRefEqualityKey::from_ref(node))
    }

    /// Asserts that every comment has been marked as formatted.
    ///
    /// No-op when debug assertions are disabled.
    #[inline(always)]
    #[cfg(not(debug_assertions))]
    pub(crate) fn assert_formatted_all_comments(&self, _source_code: SourceCode) {}

    /// Asserts that every comment has been marked as formatted.
    ///
    /// # Panics
    ///
    /// Panics if any comment's `formatted` flag is unset. The panic message lists the debug
    /// representation (including the text) of every such comment.
    #[cfg(debug_assertions)]
    pub(crate) fn assert_formatted_all_comments(&self, source_code: SourceCode) {
        use std::fmt::Write;

        let mut output = String::new();
        let unformatted_comments = self
            .data
            .comments
            .all_parts()
            .filter(|c| !c.formatted.get());

        for comment in unformatted_comments {
            // Writing to a `String` never fails, so unwrapping cannot panic here.
            writeln!(output, "{:#?}", comment.debug(source_code)).unwrap();
        }

        assert!(
            output.is_empty(),
            "The following comments have not been formatted.\n{output}"
        );
    }

    /// Returns an object that implements [Debug] for nicely printing the [`Comments`].
    pub(crate) fn debug(&'a self, source_code: SourceCode<'a>) -> DebugComments<'a> {
        DebugComments::new(&self.data.comments, source_code)
    }
}
/// The data that [`Comments`] stores behind its `Rc`.
#[derive(Default)]
struct CommentsData<'a> {
/// The comments, stored per node.
comments: CommentsMap<'a>,
}

View file

@ -0,0 +1,175 @@
use ruff_python_ast::node::AnyNodeRef;
use std::fmt::{Debug, Formatter};
use std::hash::{Hash, Hasher};
use std::ptr::NonNull;
/// Used as key into the [`MultiMap`] storing the comments per node by [`Comments`].
///
/// Implements equality and hashing based on the address of the [`AnyNodeRef`] to get fast and cheap
/// hashing/equality comparison.
#[derive(Copy, Clone)]
pub(super) struct NodeRefEqualityKey<'a> {
node: AnyNodeRef<'a>,
}
impl<'a> NodeRefEqualityKey<'a> {
/// Creates a key for a node reference.
pub(super) const fn from_ref(node: AnyNodeRef<'a>) -> Self {
Self { node }
}
/// Returns the underlying node.
pub(super) fn node(&self) -> AnyNodeRef {
self.node
}
fn ptr(self) -> NonNull<()> {
match self.node {
AnyNodeRef::ModModule(node) => NonNull::from(node).cast(),
AnyNodeRef::ModInteractive(node) => NonNull::from(node).cast(),
AnyNodeRef::ModExpression(node) => NonNull::from(node).cast(),
AnyNodeRef::ModFunctionType(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtFunctionDef(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtAsyncFunctionDef(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtClassDef(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtReturn(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtDelete(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtAssign(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtAugAssign(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtAnnAssign(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtFor(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtAsyncFor(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtWhile(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtIf(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtWith(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtAsyncWith(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtMatch(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtRaise(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtTry(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtTryStar(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtAssert(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtImport(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtImportFrom(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtGlobal(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtNonlocal(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtExpr(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtPass(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtBreak(node) => NonNull::from(node).cast(),
AnyNodeRef::StmtContinue(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprBoolOp(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprNamedExpr(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprBinOp(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprUnaryOp(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprLambda(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprIfExp(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprDict(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprSet(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprListComp(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprSetComp(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprDictComp(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprGeneratorExp(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprAwait(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprYield(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprYieldFrom(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprCompare(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprCall(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprFormattedValue(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprJoinedStr(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprConstant(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprAttribute(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprSubscript(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprStarred(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprName(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprList(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprTuple(node) => NonNull::from(node).cast(),
AnyNodeRef::ExprSlice(node) => NonNull::from(node).cast(),
AnyNodeRef::ExcepthandlerExceptHandler(node) => NonNull::from(node).cast(),
AnyNodeRef::PatternMatchValue(node) => NonNull::from(node).cast(),
AnyNodeRef::PatternMatchSingleton(node) => NonNull::from(node).cast(),
AnyNodeRef::PatternMatchSequence(node) => NonNull::from(node).cast(),
AnyNodeRef::PatternMatchMapping(node) => NonNull::from(node).cast(),
AnyNodeRef::PatternMatchClass(node) => NonNull::from(node).cast(),
AnyNodeRef::PatternMatchStar(node) => NonNull::from(node).cast(),
AnyNodeRef::PatternMatchAs(node) => NonNull::from(node).cast(),
AnyNodeRef::PatternMatchOr(node) => NonNull::from(node).cast(),
AnyNodeRef::TypeIgnoreTypeIgnore(node) => NonNull::from(node).cast(),
AnyNodeRef::Comprehension(node) => NonNull::from(node).cast(),
AnyNodeRef::Arguments(node) => NonNull::from(node).cast(),
AnyNodeRef::Arg(node) => NonNull::from(node).cast(),
AnyNodeRef::Keyword(node) => NonNull::from(node).cast(),
AnyNodeRef::Alias(node) => NonNull::from(node).cast(),
AnyNodeRef::Withitem(node) => NonNull::from(node).cast(),
AnyNodeRef::MatchCase(node) => NonNull::from(node).cast(),
}
}
}
/// Formats the key exactly like the node it wraps.
impl Debug for NodeRefEqualityKey<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        Debug::fmt(&self.node, f)
    }
}
/// Two keys are equal when they point to the same node address.
impl PartialEq for NodeRefEqualityKey<'_> {
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.ptr(), other.ptr());
        lhs == rhs
    }
}

impl Eq for NodeRefEqualityKey<'_> {}

/// Hashes the node address, matching the address-based equality above.
impl Hash for NodeRefEqualityKey<'_> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        let address = self.ptr();
        address.hash(state);
    }
}
impl<'a> From<AnyNodeRef<'a>> for NodeRefEqualityKey<'a> {
fn from(value: AnyNodeRef<'a>) -> Self {
NodeRefEqualityKey::from_ref(value)
}
}
#[cfg(test)]
mod tests {
    use crate::comments::node_key::NodeRefEqualityKey;
    use ruff_python_ast::node::AnyNodeRef;
    use ruff_text_size::TextRange;
    use rustpython_parser::ast::StmtContinue;
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    /// Hashes `key` with a freshly created [`DefaultHasher`].
    fn hash(key: NodeRefEqualityKey) -> u64 {
        let mut hasher = DefaultHasher::default();
        key.hash(&mut hasher);
        hasher.finish()
    }

    /// Keys created from two references to the same node compare equal and hash identically.
    #[test]
    fn equality() {
        let statement = StmtContinue {
            range: TextRange::default(),
        };

        let first = NodeRefEqualityKey::from_ref(AnyNodeRef::from(&statement));
        let second = NodeRefEqualityKey::from_ref(AnyNodeRef::from(&statement));

        assert_eq!(first, second);
        assert_eq!(hash(first), hash(second));
    }

    /// Keys for structurally identical nodes stored at different addresses are distinct.
    #[test]
    fn inequality() {
        let statement = StmtContinue {
            range: TextRange::default(),
        };
        // The boxed clone has the same content but lives at a different address.
        let boxed_clone = Box::new(statement.clone());

        let stack_key = NodeRefEqualityKey::from_ref(AnyNodeRef::from(&statement));
        let heap_key = NodeRefEqualityKey::from_ref(AnyNodeRef::from(boxed_clone.as_ref()));

        assert_ne!(stack_key, heap_key);
        assert_ne!(hash(stack_key), hash(heap_key));
    }
}

View file

@ -0,0 +1,39 @@
---
source: crates/ruff_python_formatter/src/comments/debug.rs
expression: formatted
---
{
StmtContinue(
StmtContinue {
range: 0..0,
},
): {
"leading": [
SourceComment {
text: "# leading comment",
formatted: false,
},
],
"dangling": [],
"trailing": [
SourceComment {
text: "# trailing",
formatted: false,
},
],
},
StmtBreak(
StmtBreak {
range: 0..0,
},
): {
"leading": [
SourceComment {
text: "# break leading",
formatted: false,
},
],
"dangling": [],
"trailing": [],
},
}