Add Comments data structure (#4641)

2025-09-30 22:01:47 +00:00 · 2023-05-30 10:54:55 +02:00 · 2023-05-30 10:54:55 +02:00 · 84a5584888
commit 84a5584888
parent 6146b75dd0
5 changed files with 698 additions and 0 deletions
--- a/crates/ruff_python_formatter/src/comments/mod.rs
+++ b/crates/ruff_python_formatter/src/comments/mod.rs
@ -1,2 +1,274 @@
 #![allow(unused, unreachable_pub)] // TODO(micha): Remove after using the new comments infrastructure in the formatter.
+
+//! Types for extracting and representing comments of a syntax tree.
+//!
+//! Most programming languages support comments allowing programmers to document their programs.
+//! Comments are different from other syntax because programming languages allow comments in almost any position,
+//! giving programmers great flexibility on where they can write comments:
+//!
+//! ```javascript
+//! /**
+//!  * Documentation comment
+//!  */
+//! async /* comment */ function Test () // line comment
+//! {/*inline*/}
+//! ```
+//!
+//! This flexibility makes formatting comments challenging because:
+//! * The formatter must consistently place comments so that re-formatting the output yields the same result,
+//!   and does not create invalid syntax (line comments).
+//! * It is essential that formatters place comments close to the syntax the programmer intended to document.
+//!   However, the lack of rules regarding where comments are allowed and what syntax they document requires
+//!   the use of heuristics to infer the documented syntax.
+//!
+//! This module tries to strike a balance between placing comments as closely as possible to their source location
+//! and reducing the complexity of formatting comments. It does so by associating comments per node rather than a token.
+//! This greatly reduces the combinations of possible comment positions, but turns out to be, in practice,
+//! sufficiently precise to keep comments close to their source location.
+//!
+//! Luckily, Python doesn't support inline comments, which simplifies the problem significantly.
+//!
+//! ## Node comments
+//!
+//! Comments are associated per node but are further distinguished by their location relative to that node:
+//!
+//! ### Leading Comments
+//!
+//! A comment at the start of a node
+//!
+//! ```python
+//! # Leading comment of the statement
+//! print("test");
+//!
+//! [   # Leading comment of a
+//!     a
+//! ];
+//! ```
+//!
+//! ### Dangling Comments
+//!
+//! A comment that is neither at the start nor the end of a node.
+//!
+//! ```python
+//! [
+//!     # I'm between two brackets. There are no nodes
+//! ];
+//! ```
+//!
+//! ### Trailing Comments
+//!
+//! A comment at the end of a node.
+//!
+//! ```python
+//! [
+//!     a, # trailing comment of a
+//!     b, c
+//! ];
+//! ```
+//!
+//! ## Limitations
+//! Limiting the placement of comments to leading, dangling, or trailing node comments reduces complexity inside the formatter but means
+//! that where the formatter can place comments depends on the AST structure.
+//!
+//! For example, *`RustPython`* doesn't create a node for the `/` operator separating positional only arguments from the other arguments.
+//!
+//! ```python
+//! def test(
+//!     a,
+//!     /, # The following arguments are positional or named arguments
+//!     b
+//! ):
+//!     pass
+//! ```
+//!
+//! Because *`RustPython`* doesn't create a Node for the `/` argument, it is impossible to associate any
+//! comments with it. Meaning, the default behaviour is to associate the `# The following ...` comment
+//! with the `b` argument, which is incorrect. This limitation can be worked around by implementing
+//! a custom rule to associate comments for `/` as *dangling comments* of the `Arguments` node and then
+//! implement custom formatting inside of the arguments formatter.
+//!
+//! If the need ever arises to distinguish two *dangling comments* in the formatting logic,
+//! it is possible to add an additional optional label to [`SourceComment`].
+
+use std::cell::Cell;
+use std::fmt::{Debug, Formatter};
+use std::rc::Rc;
+
+mod debug;
 mod map;
+mod node_key;
+
+use crate::comments::debug::{DebugComment, DebugComments};
+use crate::comments::map::MultiMap;
+use crate::comments::node_key::NodeRefEqualityKey;
+use ruff_formatter::{SourceCode, SourceCodeSlice};
+use ruff_python_ast::node::AnyNodeRef;
+
+/// A comment in the source document.
+///
+/// Stores the comment's location rather than its text; the text can be retrieved through the slice.
+#[derive(Debug, Clone)]
+pub(crate) struct SourceComment {
+    /// The location of the comment in the source document.
+    pub(super) slice: SourceCodeSlice,
+
+    /// Whether the comment has been formatted or not.
+    ///
+    /// Only present in builds with `debug_assertions` enabled. Stored in a [`Cell`] so that the flag
+    /// can be set through a shared reference.
+    #[cfg(debug_assertions)]
+    pub(super) formatted: Cell<bool>,
+}
+
+impl SourceComment {
+    /// Returns the location of the comment in the original source code.
+    /// Allows retrieving the text of the comment.
+    pub(crate) fn slice(&self) -> &SourceCodeSlice {
+        &self.slice
+    }
+
+    /// Marks the comment as formatted.
+    ///
+    /// This is a no-op in builds without `debug_assertions` because the `formatted` flag
+    /// only exists in debug builds.
+    #[cfg(not(debug_assertions))]
+    #[inline(always)]
+    pub(crate) fn mark_formatted(&self) {}
+
+    /// Marks the comment as formatted.
+    ///
+    /// The flag is only tracked in builds with `debug_assertions` enabled.
+    #[cfg(debug_assertions)]
+    pub(crate) fn mark_formatted(&self) {
+        self.formatted.set(true);
+    }
+
+    /// Returns a nice debug representation that prints the source code for every comment (and not just the range).
+    pub(crate) fn debug<'a>(&'a self, source_code: SourceCode<'a>) -> DebugComment<'a> {
+        DebugComment::new(self, source_code)
+    }
+}
+/// Multi-map from a node (keyed by [`NodeRefEqualityKey`]) to the [`SourceComment`]s associated with it.
+type CommentsMap<'a> = MultiMap<NodeRefEqualityKey<'a>, SourceComment>;
+
+/// The comments of a syntax tree stored by node.
+///
+/// Cloning `comments` is cheap as it only involves bumping a reference counter.
+#[derive(Clone, Default)]
+pub(crate) struct Comments<'a> {
+    /// The implementation uses an [Rc] so that [Comments] has a lifetime independent from the [crate::Formatter].
+    /// Independent lifetimes are necessary to support the use case where a [formattable object](crate::Format)
+    /// iterates over all comments, and writes them into the [crate::Formatter] (mutably borrowing the [crate::Formatter] and in turn its context).
+    ///
+    /// ```block
+    /// for leading in f.context().comments().leading_comments(node) {
+    ///     ^
+    ///     |- Borrows comments
+    ///   write!(f, [comment(leading.piece.text())])?;
+    ///          ^
+    ///          |- Mutably borrows the formatter, state, context, and comments (if comments aren't cloned)
+    /// }
+    /// ```
+    ///
+    /// The use of an `Rc` solves this problem because we can cheaply clone `comments` before iterating.
+    ///
+    /// ```block
+    /// let comments = f.context().comments().clone();
+    /// for leading in comments.leading_comments(node) {
+    ///     write!(f, [comment(leading.piece.text())])?;
+    /// }
+    /// ```
+    data: Rc<CommentsData<'a>>,
+}
+
+impl<'a> Comments<'a> {
+    /// Returns `true` if the given `node` has any comments associated with it.
+    #[inline]
+    pub(crate) fn has_comments(&self, node: AnyNodeRef) -> bool {
+        self.data.comments.has(&NodeRefEqualityKey::from_ref(node))
+    }
+
+    /// Returns `true` if the given `node` has any [leading comments](self#leading-comments).
+    #[inline]
+    pub(crate) fn has_leading_comments(&self, node: AnyNodeRef) -> bool {
+        !self.leading_comments(node).is_empty()
+    }
+
+    /// Returns the `node`'s [leading comments](self#leading-comments).
+    #[inline]
+    pub(crate) fn leading_comments(&self, node: AnyNodeRef<'a>) -> &[SourceComment] {
+        self.data
+            .comments
+            .leading(&NodeRefEqualityKey::from_ref(node))
+    }
+
+    /// Returns `true` if the given `node` has any [dangling comments](self#dangling-comments).
+    #[inline]
+    pub(crate) fn has_dangling_comments(&self, node: AnyNodeRef<'a>) -> bool {
+        !self.dangling_comments(node).is_empty()
+    }
+
+    /// Returns the `node`'s [dangling comments](self#dangling-comments).
+    #[inline]
+    pub(crate) fn dangling_comments(&self, node: AnyNodeRef<'a>) -> &[SourceComment] {
+        self.data
+            .comments
+            .dangling(&NodeRefEqualityKey::from_ref(node))
+    }
+
+    /// Returns the `node`'s [trailing comments](self#trailing-comments).
+    #[inline]
+    pub(crate) fn trailing_comments(&self, node: AnyNodeRef<'a>) -> &[SourceComment] {
+        self.data
+            .comments
+            .trailing(&NodeRefEqualityKey::from_ref(node))
+    }
+
+    /// Returns `true` if the given `node` has any [trailing comments](self#trailing-comments).
+    #[inline]
+    pub(crate) fn has_trailing_comments(&self, node: AnyNodeRef) -> bool {
+        !self.trailing_comments(node).is_empty()
+    }
+
+    /// Returns an iterator over the [leading](self#leading-comments) and [trailing comments](self#trailing-comments) of `node`.
+    ///
+    /// The leading comments are yielded first, followed by the trailing comments.
+    pub(crate) fn leading_trailing_comments(
+        &self,
+        node: AnyNodeRef<'a>,
+    ) -> impl Iterator<Item = &SourceComment> {
+        self.leading_comments(node)
+            .iter()
+            .chain(self.trailing_comments(node).iter())
+    }
+
+    /// Returns an iterator over the [leading](self#leading-comments), [dangling](self#dangling-comments), and [trailing](self#trailing-comments) comments of `node`.
+    pub(crate) fn leading_dangling_trailing_comments(
+        &self,
+        node: AnyNodeRef<'a>,
+    ) -> impl Iterator<Item = &SourceComment> {
+        self.data
+            .comments
+            .parts(&NodeRefEqualityKey::from_ref(node))
+    }
+
+    /// Asserts that all comments have been marked as formatted.
+    ///
+    /// This is a no-op in builds without `debug_assertions` because the formatted state of a comment
+    /// is only tracked in debug builds.
+    #[inline(always)]
+    #[cfg(not(debug_assertions))]
+    pub(crate) fn assert_formatted_all_comments(&self, _source_code: SourceCode) {}
+
+    /// Asserts that all comments have been marked as formatted.
+    ///
+    /// # Panics
+    ///
+    /// Panics if any comment has not been marked as formatted. The panic message lists the debug
+    /// representation of every such comment.
+    #[cfg(debug_assertions)]
+    pub(crate) fn assert_formatted_all_comments(&self, source_code: SourceCode) {
+        use std::fmt::Write;
+
+        let mut output = String::new();
+        let unformatted_comments = self
+            .data
+            .comments
+            .all_parts()
+            .filter(|c| !c.formatted.get());
+
+        for comment in unformatted_comments {
+            writeln!(output, "{:#?}", comment.debug(source_code))
+                .expect("Writing to a string never fails");
+        }
+
+        assert!(
+            output.is_empty(),
+            "The following comments have not been formatted.\n{output}"
+        );
+    }
+
+    /// Returns an object that implements [Debug] for nicely printing the [`Comments`].
+    pub(crate) fn debug(&'a self, source_code: SourceCode<'a>) -> DebugComments<'a> {
+        DebugComments::new(&self.data.comments, source_code)
+    }
+}
+
+/// The comment data that [`Comments`] holds behind an `Rc`.
+#[derive(Default)]
+struct CommentsData<'a> {
+    /// The comments, stored per node.
+    comments: CommentsMap<'a>,
+}