Added long_sentences and changed linter API

2025-08-04 18:48:02 +00:00 · 2024-01-20 19:43:59 -07:00 · 2024-01-20 19:43:59 -07:00 · c9227e2faa
commit c9227e2faa
parent 8f9bcbfecd
17 changed files with 162 additions and 30 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -619,6 +619,7 @@ dependencies = [
 "is-macro",
 "itertools 0.11.0",
 "once_cell",
+ "paste",
 "pulldown-cmark",
 "serde",
 "smallvec",
@@ -987,6 +988,12 @@ dependencies = [
 "windows-targets 0.48.5",
 ]

+[[package]]
+name = "paste"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
+
 [[package]]
 name = "percent-encoding"
 version = "2.3.1"
--- a/dictionary.dict
+++ b/dictionary.dict
@@ -10727,6 +10727,7 @@ Zest/M
 Zeus/M
 Maia/M
 Semele/M
+Chiron/M
 Katniss/M
 Everdeen/M
 Leto/M
--- a/harper-core/Cargo.toml
+++ b/harper-core/Cargo.toml
@@ -8,6 +8,7 @@ ahash = "0.8.7"
 is-macro = "0.3.0"
 itertools = "0.11.0"
 once_cell = "1.19.0"
+paste = "1.0.14"
 pulldown-cmark = "0.9.3"
 serde = { version = "1.0.190", features = ["derive"] }
 smallvec = "1.12.0"
--- a/harper-core/clippy.toml
+++ b/harper-core/clippy.toml
--- a/harper-core/src/document.rs
+++ b/harper-core/src/document.rs
@@ -4,10 +4,11 @@ use itertools::Itertools;

 use crate::{
    lex_to_end,
-    linting::Suggestion,
+    linting::{LintSet, Suggestion},
    parsing::lex_to_end_md,
+    run_lint_set,
    span::Span,
-    FatToken,
+    Dictionary, FatToken, Lint,
    Punctuation::{self},
    Token, TokenKind,
 };
@@ -48,6 +49,10 @@ impl Document {
        self.match_quotes();
    }

+    pub fn run_lint_set(&self, lint_set: &LintSet, dictionary: &Dictionary) -> Vec<Lint> {
+        run_lint_set(lint_set, self, dictionary)
+    }
+
    pub fn iter_quote_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter().enumerate().filter_map(|(idx, token)| {
            if let TokenKind::Punctuation(Punctuation::Quote(_)) = &token.kind {
--- a/harper-core/src/lib.rs
+++ b/harper-core/src/lib.rs
@@ -7,7 +7,8 @@ mod span;
 mod spell;

 pub use document::Document;
-pub use linting::all_linters;
+pub use linting::run_lint_set;
+pub use linting::LintSet;
 pub use linting::{Lint, LintKind, Suggestion};
 pub use parsing::{lex_to_end, lex_to_end_str};
 pub use parsing::{FatToken, Punctuation, Token, TokenKind};
--- a/harper-core/src/linting/lint.rs
+++ b/harper-core/src/linting/lint.rs
@@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize};

 use crate::{document::Document, span::Span, Dictionary};

-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct Lint {
    pub span: Span,
    pub lint_kind: LintKind,
@@ -13,13 +13,16 @@ pub struct Lint {
    pub message: String,
 }

-#[derive(Debug, Clone, Copy, Serialize, Deserialize, Is)]
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, Is, Default)]
 pub enum LintKind {
    Spelling,
    Capitalization,
    UnmatchedQuote,
    WrongQuotes,
    Repetition,
+    Readability,
+    #[default]
+    Miscellaneous,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, Is)]
--- a/harper-core/src/linting/lint_set.rs
+++ b/harper-core/src/linting/lint_set.rs
@@ -0,0 +1,80 @@
+use super::{
+    lint::Linter, long_sentences, repeated_words, sentence_capitalization, spell_check,
+    unclosed_quotes, wrong_quotes,
+};
+use paste::paste;
+
+use super::{
+    long_sentences::long_sentences, repeated_words::repeated_words,
+    sentence_capitalization::sentence_capitalization, spell_check::spell_check,
+    unclosed_quotes::unclosed_quotes, wrong_quotes::wrong_quotes,
+};
+
+#[derive(Debug, Clone)]
+pub struct LintSet {
+    pub(super) linters: Vec<Linter>,
+}
+
+impl LintSet {
+    pub fn new() -> Self {
+        Self {
+            linters: Vec::new(),
+        }
+    }
+}
+
+impl Default for LintSet {
+    fn default() -> Self {
+        Self::new()
+            .with_spell_check()
+            .with_repeated_words()
+            .with_long_sentences()
+            .with_unclosed_quotes()
+            .with_sentence_capitalization()
+    }
+}
+
+macro_rules! create_builder {
+    ($($linter:ident),*) => {
+        impl LintSet {
+            pub fn add_all(&mut self) -> &mut Self {
+                self.linters.extend_from_slice(&[
+                    $(
+                        $linter
+                    ),*
+                ]);
+
+                self
+            }
+
+            paste! {
+                $(
+                    #[doc = "Modifies self, adding the `" $linter "` linter to the set."]
+                    pub fn [<add_$linter>](&mut self) -> &mut Self{
+                        self.linters.push($linter);
+                        self
+                    }
+                )*
+            }
+
+            paste! {
+                $(
+                    #[doc = "Consumes self, adding the `" $linter "` linter to the set."]
+                    pub fn [<with_$linter>](mut self) -> Self{
+                        self.linters.push($linter);
+                        self
+                    }
+                )*
+            }
+        }
+    };
+}
+
+create_builder!(
+    spell_check,
+    sentence_capitalization,
+    unclosed_quotes,
+    wrong_quotes,
+    repeated_words,
+    long_sentences
+);
--- a/harper-core/src/linting/long_sentences.rs
+++ b/harper-core/src/linting/long_sentences.rs
@@ -0,0 +1,21 @@
+use crate::{parsing::TokenStringExt, Dictionary, Document, Lint, LintKind, Span};
+
+/// Detect and warn that the sentence is too long.
+pub fn long_sentences(document: &Document, _dictionary: &Dictionary) -> Vec<Lint> {
+    let mut output = Vec::new();
+
+    for sentence in document.sentences() {
+        let word_count = sentence.iter_words().count();
+
+        if word_count > 40 {
+            output.push(Lint {
+                span: Span::new(sentence[0].span.start, sentence.last().unwrap().span.end),
+                lint_kind: LintKind::Readability,
+                message: format!("This sentence is {} words long.", word_count),
+                ..Default::default()
+            })
+        }
+    }
+
+    output
+}
--- a/harper-core/src/linting/mod.rs
+++ b/harper-core/src/linting/mod.rs
@@ -1,4 +1,6 @@
 mod lint;
+mod lint_set;
+mod long_sentences;
 mod repeated_words;
 mod sentence_capitalization;
 mod spell_check;
@@ -6,23 +8,14 @@ mod unclosed_quotes;
 mod wrong_quotes;

 pub use lint::{Lint, LintKind, Suggestion};
+pub use lint_set::LintSet;

 use crate::{Dictionary, Document};

-use self::lint::Linter;
-
-pub fn all_linters(document: &Document, dictionary: &Dictionary) -> Vec<Lint> {
+pub fn run_lint_set(lint_set: &LintSet, document: &Document, dictionary: &Dictionary) -> Vec<Lint> {
    let mut lints = Vec::new();

-    let linters: [Linter; 5] = [
-        spell_check::spell_check,
-        sentence_capitalization::sentence_capitalization_lint,
-        unclosed_quotes::unclosed_quotes,
-        wrong_quotes::wrong_quotes,
-        repeated_words::repeated_words_lint,
-    ];
-
-    for linter in linters {
+    for linter in &lint_set.linters {
        lints.append(&mut linter(document, dictionary));
    }

--- a/harper-core/src/linting/repeated_words.rs
+++ b/harper-core/src/linting/repeated_words.rs
@@ -6,7 +6,7 @@ use crate::{
 };

 /// A linter that flags repeated words, such as "the the".
-pub fn repeated_words_lint(document: &Document, _dictionary: &Dictionary) -> Vec<Lint> {
+pub fn repeated_words(document: &Document, _dictionary: &Dictionary) -> Vec<Lint> {
    let mut lints = Vec::new();
    let set = create_match_set();

@@ -49,6 +49,7 @@ pub fn repeated_words_lint(document: &Document, _dictionary: &Dictionary) -> Vec
    lints
 }

+/// The set of words that can be considered for repetition checking.
 fn create_match_set() -> HashSet<Vec<char>> {
    let mut output = HashSet::default();

@@ -56,20 +57,41 @@ fn create_match_set() -> HashSet<Vec<char>> {
    output.insert(vec!['T', 'h', 'e']);
    output.insert(vec!['a']);
    output.insert(vec!['A']);
+    output.insert(vec!['a', 'n']);
+    output.insert(vec!['A', 'n']);
+    output.insert(vec!['i', 's']);
+    output.insert(vec!['I', 's']);
+    output.insert(vec!['w', 'i', 'l', 'l']);
+    output.insert(vec!['W', 'i', 'l', 'l']);
+    output.insert(vec!['l', 'i', 'k', 'e']);
+    output.insert(vec!['L', 'i', 'k', 'e']);
+    output.insert(vec!['t', 'h', 'a', 't']);
+    output.insert(vec!['T', 'h', 'a', 't']);
+    output.insert(vec!['w', 'h', 'a', 't']);
+    output.insert(vec!['W', 'h', 'a', 't']);
+    output.insert(vec!['w', 'h', 'i', 'c', 'h']);
+    output.insert(vec!['W', 'h', 'i', 'c', 'h']);
+    output.insert(vec!['b', 'e']);
+    output.insert(vec!['B', 'e']);
+    output.insert(vec!['a', 'n', 'd']);
+    output.insert(vec!['A', 'n', 'd']);
+    output.insert(vec!['I']);
+    output.insert(vec!['a', 't']);
+    output.insert(vec!['A', 't']);

    output
 }

 #[cfg(test)]
 mod tests {
-    use super::repeated_words_lint;
+    use super::repeated_words;
    use crate::{Dictionary, Document};

    #[test]
    fn catches_basic() {
        let dictionary = Dictionary::new();
        let test = Document::new("I wanted the the banana.", false);
-        let lints = repeated_words_lint(&test, dictionary);
+        let lints = repeated_words(&test, dictionary);
        assert!(lints.len() == 1);
    }
 }
--- a/harper-core/src/linting/sentence_capitalization.rs
+++ b/harper-core/src/linting/sentence_capitalization.rs
@@ -5,7 +5,7 @@ use crate::{document::Document, parsing::TokenStringExt, Dictionary, Lint, LintK
 use super::lint::Suggestion;

 /// A linter that checks to make sure the first word of each sentence is capitalized.
-pub fn sentence_capitalization_lint(document: &Document, _dictionary: &Dictionary) -> Vec<Lint> {
+pub fn sentence_capitalization(document: &Document, _dictionary: &Dictionary) -> Vec<Lint> {
    let mut lints = Vec::new();

    for sentence in document.sentences() {
--- a/harper-core/src/span.rs
+++ b/harper-core/src/span.rs
@@ -1,7 +1,7 @@
 use serde::{Deserialize, Serialize};

 /// A window in a [char].
-#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
 pub struct Span {
    pub start: usize,
    pub end: usize,
--- a/harper-core/src/spell/dictionary.rs
+++ b/harper-core/src/spell/dictionary.rs
@@ -2,7 +2,6 @@ use std::hash::Hasher;

 use ahash::{AHashSet, AHasher};
 use once_cell::sync::Lazy;
-use smallvec::SmallVec;

 use super::hunspell::{parse_default_attribute_list, parse_default_word_list};

--- a/harper-ls/src/diagnostics.rs
+++ b/harper-ls/src/diagnostics.rs
@@ -1,5 +1,5 @@
 use cached::proc_macro::cached;
-use harper_core::{all_linters, Dictionary, Document, Lint, Span, Suggestion};
+use harper_core::{Dictionary, Document, Lint, LintSet, Span, Suggestion};
 use std::collections::HashMap;
 use std::fs::read;
 use tower_lsp::jsonrpc::{ErrorCode, Result};
@@ -80,7 +80,7 @@ fn open_url(url: &Url) -> Result<String> {
 fn lint_string(text: String) -> Vec<Lint> {
    let document = Document::new(&text, true);
    let dictionary = Dictionary::new();
-    all_linters(&document, dictionary)
+    document.run_lint_set(&LintSet::default(), dictionary)
 }

 fn lint_to_diagnostic(lint: Lint, source: &[char]) -> Diagnostic {
--- a/harper-serve/src/main.rs
+++ b/harper-serve/src/main.rs
@@ -1,6 +1,6 @@
 #![allow(dead_code)]

-use harper_core::{all_linters, Dictionary, Document, FatToken, Lint, Span, Suggestion};
+use harper_core::{Dictionary, Document, FatToken, Lint, LintSet, Span, Suggestion};
 use std::net::SocketAddr;
 use tokio::time::Instant;
 use tracing::{info, Level};
@@ -92,8 +92,7 @@ async fn lint(Json(payload): Json<LintRequest>) -> (StatusCode, Json<LintRespons

    let dictionary = Dictionary::new();
    let document = Document::new(&text, true);
-
-    let lints = all_linters(&document, dictionary);
+    let lints = document.run_lint_set(&LintSet::default(), dictionary);

    (StatusCode::ACCEPTED, Json(LintResponse { lints }))
 }
--- a/harper-wasm/src/lib.rs
+++ b/harper-wasm/src/lib.rs
@@ -1,4 +1,4 @@
-use harper_core::{all_linters, Dictionary, Document};
+use harper_core::{Dictionary, Document, LintSet};
 use serde::Serialize;
 use wasm_bindgen::{prelude::wasm_bindgen, JsValue};

@@ -22,7 +22,7 @@ pub fn lint(text: String) -> Vec<JsValue> {
    let dictionary = Dictionary::new();
    let document = Document::new(&text, true);

-    let lints = all_linters(&document, dictionary);
+    let lints = document.run_lint_set(&LintSet::default(), dictionary);

    lints
        .into_iter()