test: add snapshots for linters (#1228)

* test: add snapshots for linters

* fix(core): update snapshots

---------

Co-authored-by: Elijah Potter <me@elijahpotter.dev>
This commit is contained in:
Michael Schmidt 2025-05-06 20:29:52 +02:00 committed by GitHub
parent 08927bdaae
commit 663c729e46
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 14954 additions and 91 deletions

View file

@ -0,0 +1,237 @@
//! This test creates snapshots of the reports of all linters.
//!
//! # Usage
//!
//! To add a new snapshot, simply add the document to `tests/text` and run this
//! test. It will automatically create a new snapshot in `tests/text/linters`.
//! To update an existing snapshot, also just run this test.
//!
//! Note: This test will fail if the snapshot files are not up to date. This
//! ensures that CI will fail if linters change their behavior.
use harper_core::{
Dialect, Document, FstDictionary,
linting::{LintGroup, Linter},
};
mod snapshot;
/// A 0-based line/column position inside a source string.
struct LinePos {
    /// 0-based index of the line
    pub line: usize,
    /// 0-based index of the column
    pub col: usize,
}

/// The lines of a source string together with the char offset at which each
/// line starts, enabling offset → line/column conversion.
struct Lines<'a> {
    lines: Vec<&'a str>,
    offsets: Vec<usize>,
}

impl<'a> Lines<'a> {
    /// Splits `source` on `'\n'` and records the starting char offset of
    /// every line.
    fn new(source: &'a str) -> Lines<'a> {
        let lines: Vec<&str> = source.split('\n').collect();
        let mut offsets = Vec::with_capacity(lines.len());
        let mut offset = 0;
        for line in &lines {
            offsets.push(offset);
            // +1 accounts for the '\n' separator removed by `split`.
            offset += line.chars().count() + 1;
        }
        Lines { lines, offsets }
    }

    /// Number of lines in the source.
    fn len(&self) -> usize {
        self.lines.len()
    }

    /// Converts a char offset into the source into a line/column pair.
    fn get_pos(&self, offset: usize) -> LinePos {
        // `Err(i)` means the offset falls strictly inside line `i - 1`.
        let line = match self.offsets.binary_search(&offset) {
            Ok(i) => i,
            Err(i) => i - 1,
        };
        LinePos {
            line,
            col: offset - self.offsets[line],
        }
    }
}

impl<'a> std::ops::Index<usize> for Lines<'a> {
    type Output = &'a str;

    fn index(&self, index: usize) -> &Self::Output {
        &self.lines[index]
    }
}
/// Renders a compiler-style report for the char span `start..end` of the
/// source behind `lines`: optional surrounding context lines, the offending
/// line(s) prefixed with 1-based line numbers, and a `^~~~`-style underline
/// that carries `message`.
fn print_error(lines: &Lines, start: usize, end: usize, message: &str) -> String {
    let mut out = String::new();
    // Appends one numbered source line, right-aligning the number to 6 columns.
    fn print_line(out: &mut String, line: &str, number: usize) {
        out.push_str(&format!("{number:>6} | {line}\n"));
    }
    // Characters treated as ending a sentence when deciding whether a
    // neighboring line still belongs to the same sentence as the span.
    fn is_sentence_boundary(c: char) -> bool {
        matches!(c, '.' | '?' | '!' | ':' | ';')
    }
    // Prints the line before the span, but only when it plausibly continues
    // into the span's sentence (no sentence boundary in between) and the span
    // does not already start deep into its own line.
    fn print_pre_line_context(
        out: &mut String,
        context_line: &str,
        number: usize,
        line: &str,
        start_col: usize,
    ) {
        if context_line.is_empty() {
            return;
        }
        if start_col > 40 {
            // that's enough context
            return;
        }
        // `context_line` is non-empty here, so `last()` cannot fail.
        let last_char = context_line.chars().last().unwrap();
        let mut chars_before = line.chars().take(start_col);
        if !is_sentence_boundary(last_char) && !chars_before.any(is_sentence_boundary) {
            print_line(out, context_line, number);
        }
    }
    // Mirror image of `print_pre_line_context` for the line after the span.
    fn print_post_line_context(
        out: &mut String,
        context_line: &str,
        number: usize,
        line: &str,
        end_col: usize,
    ) {
        if context_line.is_empty() {
            return;
        }
        if end_col < 40 {
            // that's enough context
            return;
        }
        let mut chars_after = line.chars().skip(end_col);
        if !chars_after.any(is_sentence_boundary) {
            print_line(out, context_line, number);
        }
    }
    // Draws the marker line under a printed source line: `^` at the first
    // highlighted column (`~` when continuing a multi-line span), `~` through
    // `end_col`, then the optional message.
    fn print_underline(
        out: &mut String,
        start_col: usize,
        end_col: usize,
        continuation: bool,
        message: &str,
    ) {
        out.push_str(" | ");
        for _ in 0..start_col {
            out.push(' ');
        }
        out.push(if continuation { '~' } else { '^' });
        for _ in 0..end_col.saturating_sub(start_col) {
            out.push('~');
        }
        if !message.is_empty() {
            out.push(' ');
            out.push_str(message);
        }
        out.push('\n');
    }
    // `end` is exclusive, so the last highlighted char sits at `end - 1`.
    let start = lines.get_pos(start);
    let end = lines.get_pos(end - 1);
    if start.line > 0 {
        print_pre_line_context(
            &mut out,
            lines[start.line - 1],
            start.line,
            lines[start.line],
            start.col,
        );
    }
    if start.line == end.line {
        // Single-line span: one source line plus its underline.
        print_line(&mut out, lines[start.line], start.line + 1);
        print_underline(&mut out, start.col, end.col, false, message);
    } else {
        // Multi-line span: every intermediate line is underlined to its full
        // char width; the message is attached only to the final underline.
        for i in start.line..end.line {
            let line = lines[i];
            print_line(&mut out, line, i + 1);
            print_underline(
                &mut out,
                if i == start.line { start.col } else { 0 },
                line.chars().count(),
                i != start.line,
                "",
            );
        }
        print_line(&mut out, lines[end.line], end.line + 1);
        print_underline(&mut out, 0, end.col, true, message);
    }
    if end.line + 1 < lines.len() {
        print_post_line_context(
            &mut out,
            lines[end.line + 1],
            end.line + 2,
            lines[end.line],
            end.col,
        );
    }
    out
}
/// Snapshot test: lints every document under `tests/text` with the curated
/// American-English lint group and renders each lint (kind, priority,
/// annotated source excerpt, suggestions) into a `.snap.yml` snapshot.
#[test]
fn test_most_lints() {
    snapshot::snapshot_all_text_files("linters", ".snap.yml", |source| {
        let dict = FstDictionary::curated();
        let document = Document::new_markdown_default(source, &dict);
        let mut linter = LintGroup::new_curated(dict, Dialect::American);

        // Sort by span so the snapshot order is stable across runs.
        let mut lints = linter.lint(&document);
        lints.sort_by_key(|lint| (lint.span.start, lint.span.end));

        // split the input document into lines
        let lines = Lines::new(source);

        let mut report = String::new();
        for lint in lints {
            report.push_str(&format!(
                "Lint: {:?} ({} priority)\n",
                lint.lint_kind, lint.priority
            ));

            // Indent the rendered excerpt so it forms a YAML block scalar.
            let rendered = print_error(&lines, lint.span.start, lint.span.end, &lint.message);
            report.push_str("Message: |\n");
            for line in rendered.lines() {
                report.push_str(" ");
                report.push_str(line);
                report.push('\n');
            }

            if !lint.suggestions.is_empty() {
                report.push_str("Suggest:\n");
                for suggestion in &lint.suggestions {
                    report.push_str(&format!(" - {}\n", suggestion));
                }
            }
            report.push_str("\n\n\n");
        }
        report
    });
}

View file

@ -58,37 +58,11 @@
//! - [`TokenKind::Space`], [`TokenKind::Newline`], and
//! [`TokenKind::ParagraphBreak`] are ignored.
//! - All other token kinds are denoted by their variant name.
use std::{borrow::Cow, path::PathBuf};
use std::borrow::Cow;
use harper_core::{Degree, Document, FstDictionary, TokenKind, WordMetadata};
/// Absolute path to this crate's `tests` directory, resolved at compile time
/// from `CARGO_MANIFEST_DIR`.
fn get_tests_dir() -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests")
}
/// Directory holding the plain-text input documents (`tests/text`).
fn get_text_dir() -> PathBuf {
    get_tests_dir().join("text")
}
/// Directory where the POS-tagger snapshots are stored (`tests/text/tagged`).
fn get_snapshot_dir() -> PathBuf {
    get_tests_dir().join("text/tagged")
}
/// Collects every `.txt` and `.md` file directly inside `tests/text`.
///
/// Panics if the directory cannot be read or an entry's metadata is
/// unavailable; unreadable directory entries are silently skipped.
fn get_text_files() -> Vec<PathBuf> {
    let mut files = vec![];
    for entry in std::fs::read_dir(get_text_dir())
        .unwrap()
        .filter_map(|f| f.ok())
        .filter(|f| f.metadata().unwrap().is_file())
    {
        let path = entry.path();
        // Extension compared as an owned lowercase-sensitive string; files
        // with no extension yield "" and are skipped by the match below.
        let ext = path
            .extension()
            .map(|e| e.to_string_lossy().to_string())
            .unwrap_or_default();
        if matches!(ext.as_str(), "txt" | "md") {
            files.push(entry.path());
        }
    }
    files
}
mod snapshot;
fn format_word_tag(word: &WordMetadata) -> String {
// These tags are inspired by the Penn Treebank POS tagset
@ -258,70 +232,31 @@ impl Formatter {
}
}
/// Tokenizes `source` as Markdown with the curated dictionary and renders
/// each token through the `Formatter`, pairing token text with its POS tag.
fn tag_text(source: &str) -> String {
    let dict = FstDictionary::curated();
    // Normalize Windows line endings before parsing so output is
    // platform-independent.
    let document = Document::new_markdown_default(&source.replace("\r\n", "\n"), &dict);
    let mut formatter = Formatter::new();
    for token in document.fat_string_tokens() {
        match token.kind {
            TokenKind::Space(_) => { /* ignore */ }
            TokenKind::ParagraphBreak => {
                // A paragraph break renders as a blank line (two newlines).
                formatter.new_line();
                formatter.new_line();
            }
            TokenKind::Newline(_) => {
                formatter.new_line();
            }
            kind => {
                // All other token kinds are emitted with their tag.
                let text = &token.content;
                let tag = format_tag(&kind);
                formatter.add(text, &tag);
            }
        }
    }
    formatter.finish()
}
/// Tags the document at `path` and compares the result against its stored
/// snapshot in `tests/text/tagged`.
///
/// On mismatch — or when no snapshot exists yet — the freshly generated
/// snapshot is written to disk and an error is returned, so the test fails
/// once and passes on the next run.
fn tag_file(path: &PathBuf) -> Result<(), Box<dyn std::error::Error>> {
    // Normalize Windows line endings so snapshots are platform-independent.
    let source = std::fs::read_to_string(path)?.replace("\r\n", "\n");
    let tagged = tag_text(source.trim_end());
    // compare with snapshot
    let snapshot_name = path.file_stem().unwrap().to_string_lossy().to_string() + ".md";
    let snapshot_file = get_snapshot_dir().join(snapshot_name);
    let has_snapshot = snapshot_file.exists();
    if has_snapshot {
        let snapshot = std::fs::read_to_string(&snapshot_file)?;
        if tagged == snapshot {
            return Ok(());
        }
    }
    // write snapshot
    std::fs::create_dir_all(get_snapshot_dir())?;
    std::fs::write(snapshot_file, tagged)?;
    Err(if has_snapshot {
        "Snapshot mismatches!"
    } else {
        "No snapshot!"
    }
    .into())
}
#[test]
fn test_pos_tagger() {
let mut errors = 0;
for file in get_text_files() {
println!("Processing {}", file.display());
if let Err(e) = tag_file(&file) {
eprintln!("Error processing {}: {}", file.display(), e);
errors += 1;
snapshot::snapshot_all_text_files("tagged", ".md", |source| {
let dict = FstDictionary::curated();
let document = Document::new_markdown_default(source, &dict);
let mut formatter = Formatter::new();
for token in document.fat_string_tokens() {
match token.kind {
TokenKind::Space(_) => { /* ignore */ }
TokenKind::ParagraphBreak => {
formatter.new_line();
formatter.new_line();
}
TokenKind::Newline(_) => {
formatter.new_line();
}
kind => {
let text = &token.content;
let tag = format_tag(&kind);
formatter.add(text, &tag);
}
}
}
}
if errors > 0 {
panic!("{} errors occurred while processing files", errors);
}
formatter.finish()
});
}

View file

@ -0,0 +1,81 @@
use std::path::{Path, PathBuf};
fn get_tests_dir() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests")
}
/// Directory holding the plain-text input documents (`tests/text`).
fn get_text_dir() -> PathBuf {
    get_tests_dir().join("text")
}
/// Collects every `.txt` and `.md` file directly inside `tests/text`.
///
/// Unreadable directory entries are skipped, and entries whose metadata
/// cannot be read are treated as non-files instead of panicking. Panics only
/// if `tests/text` itself cannot be read.
pub fn get_text_files() -> Vec<PathBuf> {
    let mut files = vec![];
    for entry in std::fs::read_dir(get_text_dir())
        .unwrap()
        .filter_map(|f| f.ok())
        // `metadata().unwrap()` could panic if a file disappears between
        // listing and stat'ing; treat unreadable metadata as "not a file".
        .filter(|f| f.metadata().map(|m| m.is_file()).unwrap_or(false))
    {
        let path = entry.path();
        // Compare the extension without allocating an intermediate String;
        // non-UTF-8 extensions can never equal "txt"/"md" anyway.
        if matches!(
            path.extension().and_then(|e| e.to_str()),
            Some("txt" | "md")
        ) {
            // Reuse `path` instead of calling `entry.path()` a second time.
            files.push(path);
        }
    }
    files
}
/// Runs `create_snapshot` over the contents of `text_file` (with line endings
/// normalized and trailing whitespace trimmed) and compares the result
/// against `snapshot_file`.
///
/// Returns `Ok(())` when the stored snapshot matches. Otherwise the freshly
/// generated snapshot is written to disk and an error is returned, so the
/// test fails once and passes on the next run.
fn tag_file(
    text_file: &Path,
    snapshot_file: &Path,
    create_snapshot: impl Fn(&str) -> String,
) -> Result<(), Box<dyn std::error::Error>> {
    // Normalize Windows line endings so snapshots are platform-independent.
    let source = std::fs::read_to_string(text_file)?.replace("\r\n", "\n");
    let generated = create_snapshot(source.trim_end());

    let had_snapshot = snapshot_file.exists();
    if had_snapshot && std::fs::read_to_string(snapshot_file)? == generated {
        return Ok(());
    }

    // Stale or missing: (over)write the snapshot and report the failure.
    std::fs::write(snapshot_file, generated)?;
    let reason = if had_snapshot {
        "Snapshot mismatches!"
    } else {
        "No snapshot!"
    };
    Err(reason.into())
}
/// Maps a text file to its snapshot path: `<snapshot_dir>/<stem><ext>`.
///
/// Panics if `text_file` has no file stem (e.g. an empty path).
fn get_snapshot_file(text_file: &Path, snapshot_dir: &Path, ext: &str) -> PathBuf {
    let stem = text_file.file_stem().unwrap().to_string_lossy();
    snapshot_dir.join(format!("{stem}{ext}"))
}
/// Runs `create_snapshot` over every text file under `tests/text`, comparing
/// each result against its snapshot in `tests/text/<out_dir>` (file name:
/// input stem + `snapshot_ext`).
///
/// Missing or stale snapshots are (re)written; the function panics at the end
/// if any file mismatched, so a failing run updates everything at once.
#[allow(dead_code)]
pub fn snapshot_all_text_files(
    out_dir: &str,
    snapshot_ext: &str,
    create_snapshot: impl Copy + Fn(&str) -> String,
) {
    let snapshot_dir = get_text_dir().join(out_dir);
    std::fs::create_dir_all(&snapshot_dir).expect("Failed to create snapshot directory");

    let mut failures = 0;
    for text_file in get_text_files() {
        println!("Processing {}", text_file.display());
        let snapshot_file = get_snapshot_file(&text_file, &snapshot_dir, snapshot_ext);
        match tag_file(&text_file, &snapshot_file, create_snapshot) {
            Ok(()) => {}
            Err(e) => {
                eprintln!("Error processing {}: {}", text_file.display(), e);
                failures += 1;
            }
        }
    }
    if failures > 0 {
        panic!("{} errors occurred while processing files", failures);
    }
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,635 @@
Lint: Readability (127 priority)
Message: |
8 | In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 | POST), also called grammatical tagging is the process of marking up a word in a
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 | text (corpus) as corresponding to a particular part of speech, based on both its
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 | definition and its context. A simplified form of this is commonly taught to
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 46 words long.
Lint: Spelling (63 priority)
Message: |
8 | In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or
| ^~~ Did you mean to spell “POS” this way?
9 | POST), also called grammatical tagging is the process of marking up a word in a
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
8 | In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or
| ^~~ Did you mean to spell “PoS” this way?
9 | POST), also called grammatical tagging is the process of marking up a word in a
Suggest:
- Replace with: “Poe”
- Replace with: “PBS”
- Replace with: “PMS”
Lint: Spelling (63 priority)
Message: |
15 | Once performed by hand, POS tagging is now done in the context of computational
| ^~~ Did you mean to spell “POS” this way?
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
17 | parts of speech, by a set of descriptive tags. POS-tagging algorithms fall into
| ^~~ Did you mean to spell “POS” this way?
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
| ^~ Did you mean to spell “E.” this way?
Suggest:
- Replace with: “E”
- Replace with: “Ea”
- Replace with: “Ed”
Lint: Spelling (63 priority)
Message: |
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
| ^~~~~~~ Did you mean to spell “Brill's” this way?
19 | first and most widely used English POS-taggers, employs rule-based algorithms.
Suggest:
- Replace with: “Brillo's”
- Replace with: “Bill's”
- Replace with: “Trill's”
Lint: Spelling (63 priority)
Message: |
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
19 | first and most widely used English POS-taggers, employs rule-based algorithms.
| ^~~ Did you mean to spell “POS” this way?
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Readability (127 priority)
Message: |
33 | as the more common plural noun. Grammatical context is one way to determine
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | this; semantic analysis can also be used to infer that "sailor" and "hatch"
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
35 | implicate "dogs" as 1) in the nautical context and 2) an action applied to the
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
36 | object "hatch" (in this context, "dogs" is a nautical term meaning "fastens (a
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
37 | watertight door) securely").
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 49 words long.
Lint: Spelling (63 priority)
Message: |
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
| ^~ Did you mean to spell “NN” this way?
Suggest:
- Replace with: “Nun”
- Replace with: “Non”
- Replace with: “N1”
Lint: Spelling (63 priority)
Message: |
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
| ^~~ Did you mean to spell “NNS” this way?
50 | for singular proper nouns (see the POS tags used in the Brown Corpus). Other
Suggest:
- Replace with: “NBS”
- Replace with: “NES”
- Replace with: “NS”
Lint: Spelling (63 priority)
Message: |
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
| ^~ Did you mean to spell “NP” this way?
50 | for singular proper nouns (see the POS tags used in the Brown Corpus). Other
Suggest:
- Replace with: “N”
- Replace with: “Nap”
- Replace with: “Nip”
Lint: Spelling (63 priority)
Message: |
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
50 | for singular proper nouns (see the POS tags used in the Brown Corpus). Other
| ^~~ Did you mean to spell “POS” this way?
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
55 | 150 separate parts of speech for English. Work on stochastic methods for tagging
56 | Koine Greek (DeRose 1990) has used over 1,000 parts of speech and found that
| ^~~~~ Did you mean to spell “Koine” this way?
Suggest:
- Replace with: “Kine”
- Replace with: “Kline”
- Replace with: “Kane”
Lint: Spelling (63 priority)
Message: |
55 | 150 separate parts of speech for English. Work on stochastic methods for tagging
56 | Koine Greek (DeRose 1990) has used over 1,000 parts of speech and found that
| ^~~~~~ Did you mean to spell “DeRose” this way?
Suggest:
- Replace with: “Depose”
- Replace with: “Defoe”
- Replace with: “Denise”
Lint: Spelling (63 priority)
Message: |
57 | about as many words were ambiguous in that language as in English. A
58 | morphosyntactic descriptor in the case of morphologically rich languages is
| ^~~~~~~~~~~~~~~ Did you mean to spell “morphosyntactic” this way?
Suggest:
- Replace with: “morphosyntax's”
- Replace with: “morphosyntax”
Lint: Spelling (63 priority)
Message: |
58 | morphosyntactic descriptor in the case of morphologically rich languages is
| ^~~~~~~~~~~~~~~ Did you mean “morphological”?
59 | commonly expressed using very short mnemonics, such as Ncmsan for Category=Noun,
Suggest:
- Replace with: “morphological”
Lint: Spelling (63 priority)
Message: |
59 | commonly expressed using very short mnemonics, such as Ncmsan for Category=Noun,
| ^~~~~~ Did you mean to spell “Ncmsan” this way?
60 | Type = common, Gender = masculine, Number = singular, Case = accusative, Animate
Suggest:
- Replace with: “Nissan”
- Replace with: “Nisan”
- Replace with: “Nolan”
Lint: Spelling (63 priority)
Message: |
63 | The most popular "tag set" for POS tagging for American English is probably the
| ^~~ Did you mean to spell “POS” this way?
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
63 | The most popular "tag set" for POS tagging for American English is probably the
64 | Penn tag set, developed in the Penn Treebank project. It is largely similar to
| ^~~~~~~~ Did you mean to spell “Treebank” this way?
Suggest:
- Replace with: “Freeman”
- Replace with: “Reembark”
- Replace with: “Debank”
Lint: Spelling (63 priority)
Message: |
69 | POS tagging work has been done in a variety of languages, and the set of POS
| ^~~ Did you mean to spell “POS” this way?
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
69 | POS tagging work has been done in a variety of languages, and the set of POS
| ^~~ Did you mean to spell “POS” this way?
70 | tags used varies greatly with language. Tags usually are designed to include
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
74 | Greek and Latin can be very large; tagging words in agglutinative languages such
| ^~~~~~~~~~~~~ Did you mean to spell “agglutinative” this way?
75 | as Inuit languages may be virtually impossible. At the other extreme, Petrov et
Suggest:
- Replace with: “agglutinate”
- Replace with: “agglutinating”
- Replace with: “agglutination”
Lint: Spelling (63 priority)
Message: |
75 | as Inuit languages may be virtually impossible. At the other extreme, Petrov et
| ^~~~~~ Did you mean to spell “Petrov” this way?
76 | al. have proposed a "universal" tag set, with 12 categories (for example, no
Suggest:
- Replace with: “Petrol”
- Replace with: “Pedro”
- Replace with: “Peron”
Lint: Spelling (63 priority)
Message: |
75 | as Inuit languages may be virtually impossible. At the other extreme, Petrov et
| ^~~
76 | al. have proposed a "universal" tag set, with 12 categories (for example, no
| ~~~ Did you mean “et al.”?
Suggest:
- Replace with: “et al.”
Lint: Spelling (63 priority)
Message: |
86 | The first major corpus of English for computer analysis was the Brown Corpus
87 | developed at Brown University by Henry Kučera and W. Nelson Francis, in the
| ^~~~~~ Did you mean to spell “Kučera” this way?
Suggest:
- Replace with: “Kara”
- Replace with: “Kendra”
- Replace with: “Keri”
Lint: Spelling (63 priority)
Message: |
87 | developed at Brown University by Henry Kučera and W. Nelson Francis, in the
| ^~ Did you mean to spell “W.” this way?
Suggest:
- Replace with: “We”
- Replace with: “WA”
- Replace with: “WC”
Lint: Spelling (63 priority)
Message: |
98 | and corrected by hand, and later users sent in errata so that by the late 70s
| ^ Did you mean to spell “s” this way?
99 | the tagging was nearly perfect (allowing for some cases on which even human
Suggest:
- Replace with: “sf”
- Replace with: “sh”
- Replace with: “so”
Lint: Spelling (63 priority)
Message: |
105 | later part-of-speech tagging systems, such as CLAWS and VOLSUNGA. However, by
| ^~~~~~~~ Did you mean to spell “VOLSUNGA” this way?
Lint: Readability (127 priority)
Message: |
110 | For some time, part-of-speech tagging was considered an inseparable part of
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
111 | natural language processing, because there are certain cases where the correct
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
112 | part of speech cannot be decided without understanding the semantics or even the
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
113 | pragmatics of the context. This is extremely expensive, especially because
| ~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 41 words long.
Lint: Spelling (63 priority)
Message: |
119 | In the mid-1980s, researchers in Europe began to use hidden Markov models (HMMs)
| ^~~~ Did you mean to spell “HMMs” this way?
120 | to disambiguate parts of speech, when working to tag the Lancaster-Oslo-Bergen
Suggest:
- Replace with: “Hams”
- Replace with: “Ha's”
- Replace with: “HMO's”
Lint: Spelling (63 priority)
Message: |
121 | Corpus of British English. HMMs involve counting cases (such as from the Brown
| ^~~~ Did you mean to spell “HMMs” this way?
Suggest:
- Replace with: “Hams”
- Replace with: “Ha's”
- Replace with: “HMO's”
Lint: Spelling (63 priority)
Message: |
129 | More advanced ("higher-order") HMMs learn the probabilities not only of pairs
| ^~~~ Did you mean to spell “HMMs” this way?
Suggest:
- Replace with: “Hams”
- Replace with: “Ha's”
- Replace with: “HMO's”
Lint: Readability (127 priority)
Message: |
141 | Eugene Charniak points out in Statistical techniques for natural language
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
142 | parsing (1997) that merely assigning the most common tag to each known word and
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
143 | the tag "proper noun" to all unknowns will approach 90% accuracy because many
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
144 | words are unambiguous, and many others only rarely represent their less-common
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
145 | parts of speech.
| ~~~~~~~~~~~~~~~~ This sentence is 50 words long.
Lint: Spelling (63 priority)
Message: |
141 | Eugene Charniak points out in Statistical techniques for natural language
| ^~~~~~~~ Did you mean to spell “Charniak” this way?
Suggest:
- Replace with: “Carnap”
- Replace with: “Chadian”
- Replace with: “Chadwick”
Lint: Readability (127 priority)
Message: |
148 | expensive since it enumerated all possibilities. It sometimes had to resort to
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
149 | backup methods when there were simply too many options (the Brown Corpus
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
150 | contains a case with 17 ambiguous words in a row, and there are words such as
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
151 | "still" that can represent as many as 7 distinct parts of speech.
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 44 words long.
Lint: WordChoice (63 priority)
Message: |
148 | expensive since it enumerated all possibilities. It sometimes had to resort to
149 | backup methods when there were simply too many options (the Brown Corpus
| ^~~~~~ This word should be a phrasal verb, not a compound noun.
Suggest:
- Replace with: “back up”
Lint: Spelling (63 priority)
Message: |
153 | HMMs underlie the functioning of stochastic taggers and are used in various
| ^~~~ Did you mean to spell “HMMs” this way?
Suggest:
- Replace with: “Hams”
- Replace with: “Ha's”
- Replace with: “HMO's”
Lint: Spelling (63 priority)
Message: |
159 | In 1987, Steven DeRose and Kenneth W. Church independently developed dynamic
| ^~~~~~ Did you mean to spell “DeRose” this way?
Suggest:
- Replace with: “Depose”
- Replace with: “Defoe”
- Replace with: “Denise”
Lint: Spelling (63 priority)
Message: |
159 | In 1987, Steven DeRose and Kenneth W. Church independently developed dynamic
| ^~ Did you mean to spell “W.” this way?
Suggest:
- Replace with: “We”
- Replace with: “WA”
- Replace with: “WC”
Lint: Spelling (63 priority)
Message: |
160 | programming algorithms to solve the same problem in vastly less time. Their
161 | methods were similar to the Viterbi algorithm known for some time in other
| ^~~~~~~ Did you mean to spell “Viterbi” this way?
Suggest:
- Replace with: “Vite's”
- Replace with: “Verdi”
- Replace with: “Vite”
Lint: Readability (127 priority)
Message: |
162 | fields. DeRose used a table of pairs, while Church used a table of triples and a
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
163 | method of estimating the values for triples that were rare or nonexistent in the
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
164 | Brown Corpus (an actual measurement of triple probabilities would require a much
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
165 | larger corpus). Both methods achieved an accuracy of over 95%. DeRose's 1990
| ~~~~~~~~~~~~~~~ This sentence is 43 words long.
Lint: Spelling (63 priority)
Message: |
162 | fields. DeRose used a table of pairs, while Church used a table of triples and a
| ^~~~~~ Did you mean to spell “DeRose” this way?
Suggest:
- Replace with: “Depose”
- Replace with: “Defoe”
- Replace with: “Denise”
Lint: Spelling (63 priority)
Message: |
165 | larger corpus). Both methods achieved an accuracy of over 95%. DeRose's 1990
| ^~~~~~~~ Did you mean to spell “DeRose's” this way?
166 | dissertation at Brown University included analyses of the specific error types,
Suggest:
- Replace with: “Defoe's”
- Replace with: “Denise's”
- Replace with: “Repose's”
Lint: Spelling (63 priority)
Message: |
173 | levels of linguistic analysis: syntax, morphology, semantics, and so on. CLAWS,
174 | DeRose's and Church's methods did fail for some of the known cases where
| ^~~~~~~~ Did you mean to spell “DeRose's” this way?
Suggest:
- Replace with: “Defoe's”
- Replace with: “Denise's”
- Replace with: “Repose's”
Lint: Readability (127 priority)
Message: |
175 | semantics is required, but those proved negligibly rare. This convinced many in
| ^~~~~~~~~~~~~~~~~~~~~~~~
176 | the field that part-of-speech tagging could usefully be separated from the other
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
177 | levels of processing; this, in turn, simplified the theory and practice of
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
178 | computerized language analysis and encouraged researchers to find ways to
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
179 | separate other pieces as well. Markov Models became the standard method for the
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 45 words long.
Lint: Spelling (63 priority)
Message: |
186 | "unsupervised" tagging. Unsupervised tagging techniques use an untagged corpus
| ^~~~~~~~ Did you mean to spell “untagged” this way?
187 | for their training data and produce the tagset by induction. That is, they
Suggest:
- Replace with: “untapped”
- Replace with: “untasted”
- Replace with: “unwaged”
Lint: Spelling (63 priority)
Message: |
186 | "unsupervised" tagging. Unsupervised tagging techniques use an untagged corpus
187 | for their training data and produce the tagset by induction. That is, they
| ^~~~~~ Did you mean to spell “tagset” this way?
Suggest:
- Replace with: “tablet”
- Replace with: “tagged”
- Replace with: “target”
Lint: Spelling (63 priority)
Message: |
200 | Some current major algorithms for part-of-speech tagging include the Viterbi
| ^~~~~~~ Did you mean to spell “Viterbi” this way?
201 | algorithm, Brill tagger, Constraint Grammar, and the Baum-Welch algorithm (also
Suggest:
- Replace with: “Vite's”
- Replace with: “Verdi”
- Replace with: “Vite”
Lint: Spelling (63 priority)
Message: |
201 | algorithm, Brill tagger, Constraint Grammar, and the Baum-Welch algorithm (also
| ^~~~~ Did you mean to spell “Welch” this way?
202 | known as the forward-backward algorithm). Hidden Markov model and visible Markov
Suggest:
- Replace with: “Welsh”
- Replace with: “Belch”
- Replace with: “Walsh”
Lint: Spelling (63 priority)
Message: |
203 | model taggers can both be implemented using the Viterbi algorithm. The
| ^~~~~~~ Did you mean to spell “Viterbi” this way?
Suggest:
- Replace with: “Vite's”
- Replace with: “Verdi”
- Replace with: “Vite”
Lint: Spelling (63 priority)
Message: |
207 | Many machine learning methods have also been applied to the problem of POS
| ^~~ Did you mean to spell “POS” this way?
208 | tagging. Methods such as SVM, maximum entropy classifier, perceptron, and
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
208 | tagging. Methods such as SVM, maximum entropy classifier, perceptron, and
| ^~~ Did you mean to spell “SVM” this way?
Suggest:
- Replace with: “Sim”
- Replace with: “Sam”
- Replace with: “SCM”
Lint: Spelling (63 priority)
Message: |
213 | Wiki. This comparison uses the Penn tag set on some of the Penn Treebank data,
| ^~~~~~~~ Did you mean to spell “Treebank” this way?
214 | so the results are directly comparable. However, many significant taggers are
Suggest:
- Replace with: “Freeman”
- Replace with: “Reembark”
- Replace with: “Debank”

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff