feat(core): Add support for Python docstrings (#2038)

* feat(core): Add support for Python docstrings

* Remove unused dependency

* Revert "Remove unused dependency"

This reverts commit 5720b2eced.

* Fix for harper-ls

* Fix handling of multiline strings

* Fix merge artifact

* Formatting fix

* Do not pass quotes for linting

---------

Co-authored-by: Elijah Potter <me@elijahpotter.dev>
This commit is contained in:
Artem Golubin 2025-10-02 00:36:39 +04:00 committed by GitHub
parent 84a52e3988
commit 041d5a0b16
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 209 additions and 5 deletions

16
harper-python/Cargo.toml Normal file
View file

@ -0,0 +1,16 @@
[package]
name = "harper-python"
version = "0.66.0"
edition = "2024"
description = "The language checker for developers."
license = "Apache-2.0"
repository = "https://github.com/automattic/harper"
[dependencies]
harper-core = { path = "../harper-core", version = "0.66.0" }
harper-tree-sitter = { path = "../harper-tree-sitter", version = "0.66.0" }
tree-sitter-python = "0.25.0"
tree-sitter = "0.25.10"
[dev-dependencies]
paste = "1.0.15"

94
harper-python/src/lib.rs Normal file
View file

@ -0,0 +1,94 @@
use harper_core::parsers::{self, Parser, PlainEnglish};
use harper_core::{Token, TokenKind};
use harper_tree_sitter::TreeSitterMasker;
use tree_sitter::Node;
/// A parser for Python source code.
///
/// It wraps a `Mask` built from a `TreeSitterMasker` (configured with the
/// Python tree-sitter grammar) and the `PlainEnglish` parser.
pub struct PythonParser {
/// Used to grab the text nodes.
inner: parsers::Mask<TreeSitterMasker, PlainEnglish>,
}
impl PythonParser {
    /// Predicate handed to the tree-sitter masker: returns `true` for nodes
    /// whose text should be kept.
    ///
    /// A node is kept when either:
    /// - its kind contains `"comment"`, or
    /// - it is a `string_content` node whose enclosing string is a bare
    ///   expression statement recognised as a module-level, function/class,
    ///   or attribute docstring.
    fn node_condition(n: &Node) -> bool {
        let kind = n.kind();

        if kind.contains("comment") {
            return true;
        }

        // Only the content of a string (not its quotes) can be a docstring body.
        if kind != "string_content" {
            return false;
        }

        parent_is_expression_statement(n).is_some_and(|stmt| {
            is_module_level_docstring(&stmt)
                || is_fn_or_class_docstrings(&stmt)
                || is_attribute_docstring(&stmt)
        })
    }
}
impl Default for PythonParser {
    /// Builds a parser that masks the source with the Python tree-sitter
    /// grammar and `Self::node_condition`, then parses what remains with
    /// `PlainEnglish`.
    fn default() -> Self {
        let masker =
            TreeSitterMasker::new(tree_sitter_python::LANGUAGE.into(), Self::node_condition);
        let inner = parsers::Mask::new(masker, PlainEnglish);

        Self { inner }
    }
}
impl Parser for PythonParser {
    /// Parses `source` with the inner masked parser and then normalises the
    /// width stored in every `Space` token.
    ///
    /// - A space token that directly follows a `Newline` token gets width 0.
    ///   Continuation lines of multiline docstrings are indented to match the
    ///   surrounding code, and that indentation would otherwise trigger the
    ///   French spaces rule.
    /// - Any other space token has its width capped at 1.
    fn parse(&self, source: &[char]) -> Vec<Token> {
        let mut tokens = self.inner.parse(source);

        // Whether the token seen on the previous iteration was a newline.
        let mut follows_newline = false;

        for token in tokens.iter_mut() {
            if let TokenKind::Space(width) = &mut token.kind {
                *width = if follows_newline {
                    0
                } else {
                    // The width is unsigned, so capping at 1 is all that is needed.
                    (*width).min(1)
                };
            }

            follows_newline = matches!(token.kind, TokenKind::Newline(_));
        }

        tokens
    }
}
/// Returns the `expression_statement` node that wraps the `string` node
/// containing `node`, if the tree has exactly that shape.
///
/// Yields `None` when `node` has no parent, when the parent is not a `string`,
/// or when the string's parent is not an `expression_statement`.
fn parent_is_expression_statement<'a>(node: &Node<'a>) -> Option<Node<'a>> {
    let string_node = node.parent()?;
    if string_node.kind() != "string" {
        return None;
    }

    let statement = string_node.parent()?;
    (statement.kind() == "expression_statement").then_some(statement)
}
/// Returns `true` when the expression statement is a direct child of the
/// `module` node.
///
/// Only the parent's kind is inspected; the statement's position within the
/// module is not checked.
#[inline]
fn is_module_level_docstring(expr_stmt: &Node) -> bool {
    match expr_stmt.parent() {
        Some(parent) => parent.kind() == "module",
        None => false,
    }
}
/// Returns `true` when the expression statement sits in a `block` whose parent
/// is a `function_definition` or a `class_definition`.
///
/// Only the kinds of the two ancestors are inspected; the statement's position
/// within the block is not checked.
#[inline]
fn is_fn_or_class_docstrings(expr_stmt: &Node) -> bool {
    let Some(body) = expr_stmt.parent() else {
        return false;
    };
    if body.kind() != "block" {
        return false;
    }

    body.parent().is_some_and(|owner| {
        matches!(owner.kind(), "function_definition" | "class_definition")
    })
}
/// Returns `true` when the statement immediately preceding `expr_stmt` is an
/// `expression_statement` whose first child is an `assignment`.
///
/// This matches a string literal placed right after an attribute or variable
/// assignment.
#[inline]
fn is_attribute_docstring(expr_stmt: &Node) -> bool {
    let Some(previous) = expr_stmt.prev_sibling() else {
        return false;
    };
    if previous.kind() != "expression_statement" {
        return false;
    }

    previous
        .child(0)
        .is_some_and(|first| first.kind() == "assignment")
}

View file

@ -0,0 +1,41 @@
use harper_core::linting::{LintGroup, Linter};
use harper_core::spell::FstDictionary;
use harper_core::{Dialect, Document};
use harper_python::PythonParser;
/// Generates a `#[test]` function that lints a single Python fixture.
///
/// `$filename.$ext` names a file under `./test_sources/` that is embedded at
/// compile time, and `$correct_expected` is the exact number of lints the
/// curated American-dialect lint group must report for it. The generated test
/// also verifies that every token span resolves to content in the document
/// source.
macro_rules! create_test {
    ($filename:ident.$ext:ident, $correct_expected:expr) => {
        paste::paste! {
            #[test]
            fn [<lints_$ext _ $filename _correctly>]() {
                let source = include_str!(concat!(
                    "./test_sources/",
                    stringify!($filename),
                    ".",
                    stringify!($ext)
                ));

                let dict = FstDictionary::curated();
                let parser = PythonParser::default();
                let document = Document::new(&source, &parser, &dict);

                let mut linter = LintGroup::new_curated(dict, Dialect::American);
                let lints = linter.lint(&document);
                dbg!(&lints);
                assert_eq!(lints.len(), $correct_expected);

                // Every produced token must point at characters that exist in the source.
                for token in document.tokens() {
                    assert!(token.span.try_get_content(document.get_source()).is_some());
                }
            }
        }
    };
}
// One generated test per fixture in `./test_sources/`. The second argument is
// the exact number of lints the test asserts for that file.
// NOTE(review): the counts presumably equal the number of deliberate
// misspellings in each fixture's comments/docstrings — confirm against the
// fixture files when changing them.
create_test!(docstrings.py, 4);
create_test!(field_docstrings.py, 2);
create_test!(comments.py, 1);

View file

@ -0,0 +1,7 @@
# This is a camment.
header = "This is a haeder."
def main():
welcome_message = "Hellom World!"

View file

@ -0,0 +1,22 @@
"""Errors should never passs silently"""
def main():
"""Beautifull is better than ugly."""
class Main:
"""Explicit is better than implicet."""
def __init__(self):
"""Flat is bettter than nested."""
pass
def multiline_docstring(action_name: str):
"""Perform the specified action.
Available actions:
- stop
- start
- pause
"""

View file

@ -0,0 +1,5 @@
class Result:
output_path: str
"""The path to the autput file."""
status: str
"""The stotus of the job."""