diff --git a/Cargo.lock b/Cargo.lock
index 20776b3c..81285b17 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2302,6 +2302,7 @@ dependencies = [
  "harper-ink",
  "harper-literate-haskell",
  "harper-pos-utils",
+ "harper-python",
  "harper-stats",
  "harper-typst",
  "hashbrown 0.16.0",
@@ -2336,7 +2337,6 @@ dependencies = [
  "tree-sitter-lua",
  "tree-sitter-nix",
  "tree-sitter-php",
- "tree-sitter-python",
  "tree-sitter-ruby",
  "tree-sitter-rust",
  "tree-sitter-scala",
@@ -2430,6 +2430,7 @@ dependencies = [
  "harper-html",
  "harper-ink",
  "harper-literate-haskell",
+ "harper-python",
  "harper-stats",
  "harper-typst",
  "indexmap",
@@ -2464,6 +2465,17 @@ dependencies = [
  "strum_macros 0.27.2",
 ]
 
+[[package]]
+name = "harper-python"
+version = "0.66.0"
+dependencies = [
+ "harper-core",
+ "harper-tree-sitter",
+ "paste",
+ "tree-sitter",
+ "tree-sitter-python",
+]
+
 [[package]]
 name = "harper-stats"
 version = "0.66.0"
diff --git a/Cargo.toml b/Cargo.toml
index bbc00cf6..9eab182d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst", "harper-stats", "harper-pos-utils", "harper-brill", "harper-ink"]
+members = ["harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst", "harper-stats", "harper-pos-utils", "harper-brill", "harper-ink", "harper-python"]
 resolver = "2"
 
 # Comment out the below lines if you plan to use a debugger.
diff --git a/harper-cli/Cargo.toml b/harper-cli/Cargo.toml
index 4f18667f..9bf96489 100644
--- a/harper-cli/Cargo.toml
+++ b/harper-cli/Cargo.toml
@@ -12,6 +12,7 @@ clap = { version = "4.5.48", features = ["derive", "string"] }
 harper-stats = { path = "../harper-stats", version = "0.66.0" }
 dirs = "6.0.0"
 harper-literate-haskell = { path = "../harper-literate-haskell", version = "0.66.0" }
+harper-python = { path = "../harper-python", version = "0.66.0" }
 harper-core = { path = "../harper-core", version = "0.66.0" }
 harper-pos-utils = { path = "../harper-pos-utils", version = "0.66.0", features = ["threaded"] }
 harper-comments = { path = "../harper-comments", version = "0.66.0" }
diff --git a/harper-cli/src/main.rs b/harper-cli/src/main.rs
index 2ce1540b..57dea687 100644
--- a/harper-cli/src/main.rs
+++ b/harper-cli/src/main.rs
@@ -24,6 +24,7 @@ use harper_ink::InkParser;
 use harper_literate_haskell::LiterateHaskellParser;
 #[cfg(feature = "training")]
 use harper_pos_utils::{BrillChunker, BrillTagger, BurnChunkerCpu};
+use harper_python::PythonParser;
 use harper_stats::Stats;
 use serde::Serialize;
 
@@ -826,6 +827,7 @@ fn load_file(
         )),
         Some("org") => Box::new(OrgMode),
         Some("typ") => Box::new(harper_typst::Typst),
+        Some("py") | Some("pyi") => Box::new(PythonParser::default()),
         _ => {
             if let Some(comment_parser) = CommentParser::new_from_filename(file, markdown_options) {
                 Box::new(comment_parser)
diff --git a/harper-comments/Cargo.toml b/harper-comments/Cargo.toml
index d362df81..9b5382cb 100644
--- a/harper-comments/Cargo.toml
+++ b/harper-comments/Cargo.toml
@@ -26,7 +26,6 @@ tree-sitter-kotlin-ng = "1.1.0"
 tree-sitter-lua = "0.2.0"
 tree-sitter-nix = "0.3.0"
 tree-sitter-php = "0.24.2"
-tree-sitter-python = "0.25.0"
 tree-sitter-ruby = "0.23.1"
 tree-sitter-rust = "0.24.0"
 tree-sitter-scala = "0.24.0"
diff --git a/harper-comments/src/comment_parser.rs b/harper-comments/src/comment_parser.rs
index 935f83b5..5b6e3b1d 100644
--- a/harper-comments/src/comment_parser.rs
+++ b/harper-comments/src/comment_parser.rs
@@ -37,7 +37,6 @@ impl CommentParser {
             "lua" => tree_sitter_lua::LANGUAGE,
             "nix" => tree_sitter_nix::LANGUAGE,
             "php" => tree_sitter_php::LANGUAGE_PHP,
-            "python" => tree_sitter_python::LANGUAGE,
             "ruby" => tree_sitter_ruby::LANGUAGE,
             "rust" => tree_sitter_rust::LANGUAGE,
             "scala" => tree_sitter_scala::LANGUAGE,
diff --git a/harper-core/src/linting/no_french_spaces.rs b/harper-core/src/linting/no_french_spaces.rs
index 1822fa3e..0ffde783 100644
--- a/harper-core/src/linting/no_french_spaces.rs
+++ b/harper-core/src/linting/no_french_spaces.rs
@@ -1,6 +1,6 @@
 use super::{Lint, LintKind, Linter, Suggestion};
-use crate::Document;
 use crate::TokenStringExt;
+use crate::{Document, TokenKind};
 
 #[derive(Debug, Default)]
 pub struct NoFrenchSpaces;
@@ -13,6 +13,9 @@ impl Linter for NoFrenchSpaces {
             if let Some(space_idx) = sentence.iter_space_indices().next() {
                 let space = &sentence[space_idx];
 
+                if matches!(space.kind, TokenKind::Space(0)) {
+                    continue;
+                }
                 if space_idx == 0 && space.span.len() != 1 {
                     output.push(Lint {
                         span: space.span,
diff --git a/harper-ls/Cargo.toml b/harper-ls/Cargo.toml
index 9809e0f9..326a675c 100644
--- a/harper-ls/Cargo.toml
+++ b/harper-ls/Cargo.toml
@@ -14,6 +14,7 @@ harper-core = { path = "../harper-core", version = "0.66.0", features = ["concur
 harper-comments = { path = "../harper-comments", version = "0.66.0" }
 harper-typst = { path = "../harper-typst", version = "0.66.0" }
 harper-html = { path = "../harper-html", version = "0.66.0" }
+harper-python = { path = "../harper-python", version = "0.66.0" }
 tower-lsp-server = "0.22.1"
 tokio = { version = "1.47.1", features = ["fs", "rt", "rt-multi-thread", "macros", "io-std", "io-util", "net"] }
 clap = { version = "4.5.48", features = ["derive"] }
diff --git a/harper-ls/src/backend.rs b/harper-ls/src/backend.rs
index f4bebcf9..a02e62db 100644
--- a/harper-ls/src/backend.rs
+++ b/harper-ls/src/backend.rs
@@ -22,6 +22,7 @@ use harper_core::{Dialect, DictWordMetadata, Document, IgnoredLints};
 use harper_html::HtmlParser;
 use harper_ink::InkParser;
 use harper_literate_haskell::LiterateHaskellParser;
+use harper_python::PythonParser;
 use harper_stats::{Record, Stats};
 use harper_typst::Typst;
 use serde_json::Value;
@@ -388,6 +389,7 @@ impl Backend {
             "mail" | "plaintext" | "text" => Some(Box::new(PlainEnglish)),
             "typst" => Some(Box::new(Typst)),
             "org" => Some(Box::new(OrgMode)),
+            "python" => Some(Box::new(PythonParser::default())),
             _ => None,
         };
 
diff --git a/harper-python/Cargo.toml b/harper-python/Cargo.toml
new file mode 100644
index 00000000..46e45f73
--- /dev/null
+++ b/harper-python/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "harper-python"
+version = "0.66.0"
+edition = "2024"
+description = "The language checker for developers."
+license = "Apache-2.0"
+repository = "https://github.com/automattic/harper"
+
+[dependencies]
+harper-core = { path = "../harper-core", version = "0.66.0" }
+harper-tree-sitter = { path = "../harper-tree-sitter", version = "0.66.0" }
+tree-sitter-python = "0.25.0"
+tree-sitter = "0.25.10"
+
+[dev-dependencies]
+paste = "1.0.15"
diff --git a/harper-python/src/lib.rs b/harper-python/src/lib.rs
new file mode 100644
index 00000000..ad4a6e54
--- /dev/null
+++ b/harper-python/src/lib.rs
@@ -0,0 +1,105 @@
+use harper_core::parsers::{self, Parser, PlainEnglish};
+use harper_core::{Token, TokenKind};
+use harper_tree_sitter::TreeSitterMasker;
+use tree_sitter::Node;
+
+/// A [`Parser`] for Python source that only exposes comments and docstrings.
+///
+/// Every other part of the file is masked out; what remains is parsed as [`PlainEnglish`].
+pub struct PythonParser {
+    /// Used to grab the text nodes.
+    inner: parsers::Mask<TreeSitterMasker, PlainEnglish>,
+}
+
+impl PythonParser {
+    /// Decides whether the text of syntax node `n` is kept for linting.
+    ///
+    /// Kept nodes are comments and the `string_content` of bare string statements that
+    /// look like module, function, class or attribute docstrings.
+    fn node_condition(n: &Node) -> bool {
+        if n.kind().contains("comment") {
+            return true;
+        }
+
+        n.kind() == "string_content"
+            && parent_is_expression_statement(n).is_some_and(|expr_stmt| {
+                is_module_level_docstring(&expr_stmt)
+                    || is_fn_or_class_docstrings(&expr_stmt)
+                    || is_attribute_docstring(&expr_stmt)
+            })
+    }
+}
+
+impl Default for PythonParser {
+    fn default() -> Self {
+        Self {
+            inner: parsers::Mask::new(
+                TreeSitterMasker::new(tree_sitter_python::LANGUAGE.into(), Self::node_condition),
+                PlainEnglish,
+            ),
+        }
+    }
+}
+
+impl Parser for PythonParser {
+    /// Tokenizes the masked `source`, then zeroes every space token that directly follows
+    /// a newline token and caps every other space token at a count of one.
+    fn parse(&self, source: &[char]) -> Vec<Token> {
+        let mut tokens = self.inner.parse(source);
+
+        let mut prev_kind: Option<&TokenKind> = None;
+
+        for token in &mut tokens {
+            if let TokenKind::Space(v) = &mut token.kind {
+                if matches!(prev_kind, Some(TokenKind::Newline(_))) {
+                    // Lines in multiline docstrings are indented with spaces to match the
+                    // current level. Zero them so the French spaces rule is not triggered.
+                    *v = 0;
+                } else {
+                    // Every other run of spaces is capped at a single space.
+                    *v = (*v).min(1);
+                }
+            }
+
+            prev_kind = Some(&token.kind);
+        }
+
+        tokens
+    }
+}
+
+/// Returns the `expression_statement` that directly wraps the `string` parent of `node`,
+/// or `None` when `node` is not nested that way.
+fn parent_is_expression_statement<'a>(node: &Node<'a>) -> Option<Node<'a>> {
+    node.parent()
+        .filter(|n| n.kind() == "string")
+        .and_then(|string_node| string_node.parent())
+        .filter(|n| n.kind() == "expression_statement")
+}
+
+/// Returns `true` when the string statement sits directly inside the `module` node.
+fn is_module_level_docstring(expr_stmt: &Node) -> bool {
+    // (module (expression_statement (string)))
+    expr_stmt.parent().is_some_and(|n| n.kind() == "module")
+}
+
+/// Returns `true` when the string statement sits directly inside the body block of a
+/// function or class definition.
+fn is_fn_or_class_docstrings(expr_stmt: &Node) -> bool {
+    // (class/func_definition body: (block (expression_statement (string))))
+    expr_stmt
+        .parent()
+        .filter(|n| n.kind() == "block")
+        .and_then(|n| n.parent())
+        .is_some_and(|n| n.kind() == "function_definition" || n.kind() == "class_definition")
+}
+
+/// Returns `true` when the string statement immediately follows an assignment statement.
+fn is_attribute_docstring(expr_stmt: &Node) -> bool {
+    // ((expression_statement (assignment)) . (expression_statement (string)))
+    expr_stmt
+        .prev_sibling()
+        .filter(|s| s.kind() == "expression_statement")
+        .and_then(|s| s.child(0))
+        .is_some_and(|c| c.kind() == "assignment")
+}
diff --git a/harper-python/tests/run_tests.rs b/harper-python/tests/run_tests.rs
new file mode 100644
index 00000000..cf58e065
--- /dev/null
+++ b/harper-python/tests/run_tests.rs
@@ -0,0 +1,41 @@
+use harper_core::linting::{LintGroup, Linter};
+use harper_core::spell::FstDictionary;
+use harper_core::{Dialect, Document};
+use harper_python::PythonParser;
+
+/// Creates a unit test checking Python source code parsing.
+macro_rules! create_test {
+    ($filename:ident.$ext:ident, $correct_expected:expr) => {
+        paste::paste! {
+            #[test]
+            fn [<lints_ $filename _ $ext _correctly>](){
+                 let source = include_str!(
+                    concat!(
+                        "./test_sources/",
+                        concat!(
+                        stringify!($filename), ".", stringify!($ext))
+                    )
+                 );
+
+                 let parser = PythonParser::default();
+                 let dict = FstDictionary::curated();
+                 let document = Document::new(&source, &parser, &dict);
+
+                 let mut linter = LintGroup::new_curated(dict, Dialect::American);
+                 let lints = linter.lint(&document);
+
+                 dbg!(&lints);
+                 assert_eq!(lints.len(), $correct_expected);
+
+                 // Make sure that all generated tokens span real characters
+                 for token in document.tokens(){
+                     assert!(token.span.try_get_content(document.get_source()).is_some());
+                 }
+            }
+        }
+    };
+}
+
+create_test!(docstrings.py, 4);
+create_test!(field_docstrings.py, 2);
+create_test!(comments.py, 1);
diff --git a/harper-python/tests/test_sources/comments.py b/harper-python/tests/test_sources/comments.py
new file mode 100644
index 00000000..a9a8fd08
--- /dev/null
+++ b/harper-python/tests/test_sources/comments.py
@@ -0,0 +1,7 @@
+
+# This is a camment.
+
+header = "This is a haeder."
+
+def main():
+    welcome_message = "Hellom World!"
\ No newline at end of file
diff --git a/harper-python/tests/test_sources/docstrings.py b/harper-python/tests/test_sources/docstrings.py
new file mode 100644
index 00000000..60b8e7a0
--- /dev/null
+++ b/harper-python/tests/test_sources/docstrings.py
@@ -0,0 +1,22 @@
+"""Errors should never passs silently"""
+def main():
+    """Beautifull is better than ugly."""
+
+
+class Main:
+    """Explicit is better than implicet."""
+
+    def __init__(self):
+        """Flat is bettter than nested."""
+        pass
+
+
+
+def multiline_docstring(action_name: str):
+    """Perform the specified action.
+
+    Available actions:
+    - stop
+    - start
+    - pause
+    """
\ No newline at end of file
diff --git a/harper-python/tests/test_sources/field_docstrings.py b/harper-python/tests/test_sources/field_docstrings.py
new file mode 100644
index 00000000..b08b2b68
--- /dev/null
+++ b/harper-python/tests/test_sources/field_docstrings.py
@@ -0,0 +1,5 @@
+class Result:
+    output_path: str
+    """The path to the autput file."""
+    status: str
+    """The stotus of the job."""