feat(core): Add support for Python docstrings (#2038)

* feat(core): Add support for Python docstrings

* Remove unused dependency

* Revert "Remove unused dependency"

This reverts commit 5720b2eced.

* Fix for harper-ls

* Fix handling of multiline strings

* Fix merge artifact

* Formatting fix

* Do not pass quotes for linting

---------

Co-authored-by: Elijah Potter <me@elijahpotter.dev>
This commit is contained in:
Artem Golubin 2025-10-02 00:36:39 +04:00 committed by GitHub
parent 84a52e3988
commit 041d5a0b16
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 209 additions and 5 deletions

14
Cargo.lock generated
View file

@ -2302,6 +2302,7 @@ dependencies = [
"harper-ink",
"harper-literate-haskell",
"harper-pos-utils",
"harper-python",
"harper-stats",
"harper-typst",
"hashbrown 0.16.0",
@ -2336,7 +2337,6 @@ dependencies = [
"tree-sitter-lua",
"tree-sitter-nix",
"tree-sitter-php",
"tree-sitter-python",
"tree-sitter-ruby",
"tree-sitter-rust",
"tree-sitter-scala",
@ -2430,6 +2430,7 @@ dependencies = [
"harper-html",
"harper-ink",
"harper-literate-haskell",
"harper-python",
"harper-stats",
"harper-typst",
"indexmap",
@ -2464,6 +2465,17 @@ dependencies = [
"strum_macros 0.27.2",
]
[[package]]
name = "harper-python"
version = "0.66.0"
dependencies = [
"harper-core",
"harper-tree-sitter",
"paste",
"tree-sitter",
"tree-sitter-python",
]
[[package]]
name = "harper-stats"
version = "0.66.0"

View file

@ -1,5 +1,5 @@
[workspace]
members = ["harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst", "harper-stats", "harper-pos-utils", "harper-brill", "harper-ink"]
members = ["harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst", "harper-stats", "harper-pos-utils", "harper-brill", "harper-ink", "harper-python"]
resolver = "2"
# Comment out the below lines if you plan to use a debugger.

View file

@ -12,6 +12,7 @@ clap = { version = "4.5.48", features = ["derive", "string"] }
harper-stats = { path = "../harper-stats", version = "0.66.0" }
dirs = "6.0.0"
harper-literate-haskell = { path = "../harper-literate-haskell", version = "0.66.0" }
harper-python = { path = "../harper-python", version = "0.66.0" }
harper-core = { path = "../harper-core", version = "0.66.0" }
harper-pos-utils = { path = "../harper-pos-utils", version = "0.66.0", features = ["threaded"] }
harper-comments = { path = "../harper-comments", version = "0.66.0" }

View file

@ -24,6 +24,7 @@ use harper_ink::InkParser;
use harper_literate_haskell::LiterateHaskellParser;
#[cfg(feature = "training")]
use harper_pos_utils::{BrillChunker, BrillTagger, BurnChunkerCpu};
use harper_python::PythonParser;
use harper_stats::Stats;
use serde::Serialize;
@ -826,6 +827,7 @@ fn load_file(
)),
Some("org") => Box::new(OrgMode),
Some("typ") => Box::new(harper_typst::Typst),
Some("py") | Some("pyi") => Box::new(PythonParser::default()),
_ => {
if let Some(comment_parser) = CommentParser::new_from_filename(file, markdown_options) {
Box::new(comment_parser)

View file

@ -26,7 +26,6 @@ tree-sitter-kotlin-ng = "1.1.0"
tree-sitter-lua = "0.2.0"
tree-sitter-nix = "0.3.0"
tree-sitter-php = "0.24.2"
tree-sitter-python = "0.25.0"
tree-sitter-ruby = "0.23.1"
tree-sitter-rust = "0.24.0"
tree-sitter-scala = "0.24.0"

View file

@ -37,7 +37,6 @@ impl CommentParser {
"lua" => tree_sitter_lua::LANGUAGE,
"nix" => tree_sitter_nix::LANGUAGE,
"php" => tree_sitter_php::LANGUAGE_PHP,
"python" => tree_sitter_python::LANGUAGE,
"ruby" => tree_sitter_ruby::LANGUAGE,
"rust" => tree_sitter_rust::LANGUAGE,
"scala" => tree_sitter_scala::LANGUAGE,

View file

@ -1,6 +1,6 @@
use super::{Lint, LintKind, Linter, Suggestion};
use crate::Document;
use crate::TokenStringExt;
use crate::{Document, TokenKind};
#[derive(Debug, Default)]
pub struct NoFrenchSpaces;
@ -13,6 +13,9 @@ impl Linter for NoFrenchSpaces {
if let Some(space_idx) = sentence.iter_space_indices().next() {
let space = &sentence[space_idx];
if matches!(space.kind, TokenKind::Space(0)) {
continue;
}
if space_idx == 0 && space.span.len() != 1 {
output.push(Lint {
span: space.span,

View file

@ -14,6 +14,7 @@ harper-core = { path = "../harper-core", version = "0.66.0", features = ["concur
harper-comments = { path = "../harper-comments", version = "0.66.0" }
harper-typst = { path = "../harper-typst", version = "0.66.0" }
harper-html = { path = "../harper-html", version = "0.66.0" }
harper-python = { path = "../harper-python", version = "0.66.0" }
tower-lsp-server = "0.22.1"
tokio = { version = "1.47.1", features = ["fs", "rt", "rt-multi-thread", "macros", "io-std", "io-util", "net"] }
clap = { version = "4.5.48", features = ["derive"] }

View file

@ -22,6 +22,7 @@ use harper_core::{Dialect, DictWordMetadata, Document, IgnoredLints};
use harper_html::HtmlParser;
use harper_ink::InkParser;
use harper_literate_haskell::LiterateHaskellParser;
use harper_python::PythonParser;
use harper_stats::{Record, Stats};
use harper_typst::Typst;
use serde_json::Value;
@ -388,6 +389,7 @@ impl Backend {
"mail" | "plaintext" | "text" => Some(Box::new(PlainEnglish)),
"typst" => Some(Box::new(Typst)),
"org" => Some(Box::new(OrgMode)),
"python" => Some(Box::new(PythonParser::default())),
_ => None,
};

16
harper-python/Cargo.toml Normal file
View file

@ -0,0 +1,16 @@
# Manifest for the `harper-python` crate, which provides the Python parser
# used by `harper-cli` and `harper-ls`.
[package]
name = "harper-python"
version = "0.66.0"
edition = "2024"
description = "The language checker for developers."
license = "Apache-2.0"
repository = "https://github.com/automattic/harper"
[dependencies]
# Supplies the `Parser` trait, `Token`/`TokenKind`, and the `Mask`/`PlainEnglish` parsers.
harper-core = { path = "../harper-core", version = "0.66.0" }
# Supplies `TreeSitterMasker`, which selects the syntax-tree nodes to lint.
harper-tree-sitter = { path = "../harper-tree-sitter", version = "0.66.0" }
# The Python grammar (`tree_sitter_python::LANGUAGE`).
tree-sitter-python = "0.25.0"
# Needed for the `tree_sitter::Node` type inspected by the node predicate.
tree-sitter = "0.25.10"
[dev-dependencies]
# Used by the integration-test macro to assemble test function names.
paste = "1.0.15"

94
harper-python/src/lib.rs Normal file
View file

@ -0,0 +1,94 @@
use harper_core::parsers::{self, Parser, PlainEnglish};
use harper_core::{Token, TokenKind};
use harper_tree_sitter::TreeSitterMasker;
use tree_sitter::Node;
/// A parser for Python source code.
///
/// Only text found in comments and in docstrings (module, function, class and
/// attribute docstrings) is handed to the English parser; everything else in
/// the file is masked out.
pub struct PythonParser {
/// Used to grab the text nodes.
///
/// Wraps a tree-sitter based masker (which selects the nodes accepted by
/// `PythonParser::node_condition`) around the plain-English parser.
inner: parsers::Mask<TreeSitterMasker, PlainEnglish>,
}
impl PythonParser {
    /// Decides whether a syntax-tree node contains text that should be linted.
    ///
    /// A node is accepted when either:
    /// - its kind contains `"comment"`, or
    /// - it is the `string_content` of a string that is the sole expression of
    ///   an expression statement, and that statement is recognized as a
    ///   module-level, function/class, or attribute docstring.
    ///
    /// Only the `string_content` node is accepted for docstrings, so the
    /// surrounding quote characters are not passed on for linting.
    fn node_condition(n: &Node) -> bool {
        let kind = n.kind();

        if kind.contains("comment") {
            return true;
        }

        if kind != "string_content" {
            return false;
        }

        parent_is_expression_statement(n).is_some_and(|stmt| {
            is_module_level_docstring(&stmt)
                || is_fn_or_class_docstrings(&stmt)
                || is_attribute_docstring(&stmt)
        })
    }
}
impl Default for PythonParser {
    /// Builds a parser that masks Python source down to the nodes accepted by
    /// `PythonParser::node_condition` and parses what remains as plain English.
    fn default() -> Self {
        let masker =
            TreeSitterMasker::new(tree_sitter_python::LANGUAGE.into(), Self::node_condition);
        let inner = parsers::Mask::new(masker, PlainEnglish);

        Self { inner }
    }
}
impl Parser for PythonParser {
    /// Tokenizes the comments and docstrings of `source`, then normalizes the
    /// width recorded on every space token.
    ///
    /// A space token that directly follows a newline token gets width `0`;
    /// any other space token has its width capped at `1`.
    fn parse(&self, source: &[char]) -> Vec<Token> {
        let mut tokens = self.inner.parse(source);

        // Tracks whether the token handled in the previous iteration was a newline.
        let mut after_newline = false;

        for token in tokens.iter_mut() {
            if let TokenKind::Space(width) = &mut token.kind {
                *width = if after_newline {
                    // Continuation lines of a multiline docstring are indented to the
                    // surrounding code level. That indentation is not part of the prose,
                    // so it is zeroed to keep the French-spaces rule from firing on it.
                    0
                } else {
                    (*width).clamp(0, 1)
                };
            }

            after_newline = matches!(token.kind, TokenKind::Newline(_));
        }

        tokens
    }
}
/// Walks two levels up from `node` and returns the enclosing
/// `expression_statement`, if any.
///
/// Returns `Some` only when the direct parent of `node` is a `string` node and
/// that string's parent is an `expression_statement`; otherwise returns `None`.
fn parent_is_expression_statement<'a>(node: &Node<'a>) -> Option<Node<'a>> {
    let string_node = node.parent()?;
    if string_node.kind() != "string" {
        return None;
    }

    let statement = string_node.parent()?;
    (statement.kind() == "expression_statement").then_some(statement)
}
/// Returns `true` when the parent of `expr_stmt` is the `module` node.
///
/// Intended pattern: `(module . (expression_statement (string)))`.
// NOTE(review): the pattern above anchors the statement as the first child,
// but only the parent kind is checked here — confirm whether that is intended.
#[inline]
fn is_module_level_docstring(expr_stmt: &Node) -> bool {
    matches!(expr_stmt.parent(), Some(parent) if parent.kind() == "module")
}
/// Returns `true` when `expr_stmt` sits in the `block` of a
/// `function_definition` or `class_definition`.
///
/// Intended pattern:
/// `(class/func_definition body: (block . (expression_statement (string))))`.
// NOTE(review): the pattern above anchors the statement as the first child of
// the block, but only the ancestor kinds are checked here — confirm intent.
#[inline]
fn is_fn_or_class_docstrings(expr_stmt: &Node) -> bool {
    let Some(block) = expr_stmt.parent() else {
        return false;
    };
    if block.kind() != "block" {
        return false;
    }

    block.parent().is_some_and(|owner| {
        matches!(owner.kind(), "function_definition" | "class_definition")
    })
}
/// Returns `true` when the sibling immediately before `expr_stmt` is an
/// `expression_statement` whose first child is an `assignment`.
///
/// Pattern: `((expression_statement (assignment)) . (expression_statement (string)))`.
#[inline]
fn is_attribute_docstring(expr_stmt: &Node) -> bool {
    let Some(previous) = expr_stmt.prev_sibling() else {
        return false;
    };
    if previous.kind() != "expression_statement" {
        return false;
    }

    matches!(previous.child(0), Some(first) if first.kind() == "assignment")
}

View file

@ -0,0 +1,41 @@
use harper_core::linting::{LintGroup, Linter};
use harper_core::spell::FstDictionary;
use harper_core::{Dialect, Document};
use harper_python::PythonParser;
/// Generates a `#[test]` that lints one Python fixture and checks the lint count.
///
/// `$filename.$ext` names a file under `./test_sources/`, and
/// `$correct_expected` is the number of lints the curated American-English
/// lint group is expected to report for it. The generated test is named
/// `lints_<ext>_<filename>_correctly`.
macro_rules! create_test {
    ($filename:ident.$ext:ident, $correct_expected:expr) => {
        paste::paste! {
            #[test]
            fn [<lints_$ext _ $filename _correctly>]() {
                let source = include_str!(concat!(
                    "./test_sources/",
                    stringify!($filename),
                    ".",
                    stringify!($ext)
                ));

                let dict = FstDictionary::curated();
                let parser = PythonParser::default();
                let document = Document::new(source, &parser, &dict);

                let mut linter = LintGroup::new_curated(dict, Dialect::American);
                let lints = linter.lint(&document);
                dbg!(&lints);
                assert_eq!(lints.len(), $correct_expected);

                // Every token produced by the parser must map onto real characters
                // of the document source.
                let chars = document.get_source();
                for token in document.tokens() {
                    assert!(token.span.try_get_content(chars).is_some());
                }
            }
        }
    };
}

create_test!(docstrings.py, 4);
create_test!(field_docstrings.py, 2);
create_test!(comments.py, 1);

View file

@ -0,0 +1,7 @@
# This is a camment.
header = "This is a haeder."
def main():
welcome_message = "Hellom World!"

View file

@ -0,0 +1,22 @@
"""Errors should never passs silently"""
def main():
"""Beautifull is better than ugly."""
class Main:
"""Explicit is better than implicet."""
def __init__(self):
"""Flat is bettter than nested."""
pass
def multiline_docstring(action_name: str):
"""Perform the specified action.
Available actions:
- stop
- start
- pause
"""

View file

@ -0,0 +1,5 @@
class Result:
output_path: str
"""The path to the autput file."""
status: str
"""The stotus of the job."""