feat(brill): train and use Brill tagger (#1344)

Co-authored-by: hippietrail <hippietrail@users.noreply.github.com>
This commit is contained in:
Elijah Potter 2025-06-16 15:33:49 -06:00 committed by GitHub
parent e3e573520e
commit db89187c3f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
51 changed files with 51011 additions and 15273 deletions

119
Cargo.lock generated
View file

@ -509,6 +509,27 @@ dependencies = [
"parking_lot_core",
]
[[package]]
name = "derive_more"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05"
dependencies = [
"derive_more-impl",
]
[[package]]
name = "derive_more-impl"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
dependencies = [
"proc-macro2",
"quote",
"syn",
"unicode-xid",
]
[[package]]
name = "dirs"
version = "4.0.0"
@ -784,14 +805,25 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "half"
version = "2.4.1"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9"
dependencies = [
"cfg-if",
"crunchy",
]
[[package]]
name = "harper-brill"
version = "0.42.0"
dependencies = [
"harper-pos-utils",
"lazy_static",
"rs-conllu",
"serde",
"serde_json",
]
[[package]]
name = "harper-cli"
version = "0.1.0"
@ -803,6 +835,7 @@ dependencies = [
"harper-comments",
"harper-core",
"harper-literate-haskell",
"harper-pos-utils",
"harper-stats",
"harper-typst",
"hashbrown 0.15.4",
@ -854,6 +887,7 @@ dependencies = [
"criterion",
"foldhash",
"fst",
"harper-brill",
"hashbrown 0.15.4",
"is-macro",
"itertools 0.14.0",
@ -866,7 +900,7 @@ dependencies = [
"pulldown-cmark",
"quickcheck",
"quickcheck_macros",
"rand",
"rand 0.8.5",
"rayon",
"serde",
"serde_json",
@ -929,6 +963,20 @@ dependencies = [
"tracing-subscriber",
]
[[package]]
name = "harper-pos-utils"
version = "0.42.0"
dependencies = [
"hashbrown 0.15.4",
"is-macro",
"rand 0.9.1",
"rayon",
"rs-conllu",
"serde",
"strum",
"strum_macros",
]
[[package]]
name = "harper-stats"
version = "0.42.0"
@ -1569,9 +1617,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "oorandom"
version = "11.1.4"
version = "11.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9"
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
[[package]]
name = "open"
@ -1597,7 +1645,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2c1f9f56e534ac6a9b8a4600bdf0f530fb393b5f393e7b4d03489c3cf0c3f01"
dependencies = [
"num-traits",
"rand",
"rand 0.8.5",
"serde",
]
@ -1675,7 +1723,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared",
"rand",
"rand 0.8.5",
]
[[package]]
@ -1769,7 +1817,7 @@ checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
dependencies = [
"env_logger",
"log",
"rand",
"rand 0.8.5",
]
[[package]]
@ -1809,7 +1857,7 @@ checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d"
dependencies = [
"bytes",
"getrandom 0.2.15",
"rand",
"rand 0.8.5",
"ring",
"rustc-hash",
"rustls",
@ -1857,11 +1905,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
"rand_chacha 0.3.1",
"rand_core 0.6.4",
"serde",
]
[[package]]
name = "rand"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.3",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
@ -1869,7 +1927,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core 0.9.3",
]
[[package]]
@ -1882,6 +1950,15 @@ dependencies = [
"serde",
]
[[package]]
name = "rand_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
"getrandom 0.3.2",
]
[[package]]
name = "rayon"
version = "1.10.0"
@ -2023,6 +2100,18 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "rs-conllu"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6de5aecf17f8fff1b35d59a12e2b8c908cad4d67208805166483655554f9169"
dependencies = [
"clap",
"derive_more",
"thiserror 1.0.69",
"walkdir",
]
[[package]]
name = "rustc-demangle"
version = "0.1.24"
@ -2953,6 +3042,12 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c"
[[package]]
name = "unicode-xid"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "unscanny"
version = "0.1.0"

View file

@ -1,5 +1,5 @@
[workspace]
members = [ "harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst" , "harper-stats"]
members = [ "harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst" , "harper-stats", "harper-pos-utils", "harper-brill"]
resolver = "2"
# Comment out the below lines if you plan to use a debugger.

16
harper-brill/Cargo.toml Normal file
View file

@ -0,0 +1,16 @@
[package]
name = "harper-brill"
version = "0.42.0"
edition = "2024"
[dependencies]
harper-pos-utils = { path = "../harper-pos-utils/", version = "0.42.0" }
lazy_static = "1.5.0"
rs-conllu = "0.3.0"
serde = "1.0.219"
serde_json = "1.0.140"
[build-dependencies]
rs-conllu = "0.3.0"
serde = "1.0.219"
serde_json = "1.0.140"

32
harper-brill/src/lib.rs Normal file
View file

@ -0,0 +1,32 @@
use lazy_static::lazy_static;
use std::sync::Arc;
pub use harper_pos_utils::{BrillChunker, BrillTagger, Chunker, FreqDict, Tagger, UPOS};
const BRILL_TAGGER_SOURCE: &str = include_str!("../trained_tagger_model.json");
lazy_static! {
static ref BRILL_TAGGER: Arc<BrillTagger<FreqDict>> = Arc::new(uncached_brill_tagger());
}
fn uncached_brill_tagger() -> BrillTagger<FreqDict> {
serde_json::from_str(BRILL_TAGGER_SOURCE).unwrap()
}
pub fn brill_tagger() -> Arc<BrillTagger<FreqDict>> {
(*BRILL_TAGGER).clone()
}
const BRILL_CHUNKER_SOURCE: &str = include_str!("../trained_chunker_model.json");
lazy_static! {
static ref BRILL_CHUNKER: Arc<BrillChunker> = Arc::new(uncached_brill_chunker());
}
fn uncached_brill_chunker() -> BrillChunker {
serde_json::from_str(BRILL_CHUNKER_SOURCE).unwrap()
}
pub fn brill_chunker() -> Arc<BrillChunker> {
(*BRILL_CHUNKER).clone()
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -13,6 +13,7 @@ harper-stats = { path = "../harper-stats", version = "0.42.0" }
dirs = "6.0.0"
harper-literate-haskell = { path = "../harper-literate-haskell", version = "0.42.0" }
harper-core = { path = "../harper-core", version = "0.42.0" }
harper-pos-utils = { path = "../harper-pos-utils", version = "0.42.0", features = ["training", "threaded"] }
harper-comments = { path = "../harper-comments", version = "0.42.0" }
harper-typst = { path = "../harper-typst", version = "0.42.0" }
hashbrown = "0.15.4"

View file

@ -20,6 +20,7 @@ use harper_core::{
MutableDictionary, TokenKind, TokenStringExt, WordId, WordMetadata,
};
use harper_literate_haskell::LiterateHaskellParser;
use harper_pos_utils::{BrillChunker, BrillTagger};
use harper_stats::Stats;
use serde::Serialize;
@ -77,6 +78,28 @@ enum Args {
/// The document to mine words from.
file: PathBuf,
},
TrainBrillTagger {
#[arg(short, long, default_value = "1.0")]
candidate_selection_chance: f32,
/// The path to write the final JSON model file to.
output: PathBuf,
/// The number of epochs (and patch rules) to train.
epochs: usize,
/// Path to a `.conllu` dataset to train on.
#[arg(num_args = 1..)]
datasets: Vec<PathBuf>,
},
TrainBrillChunker {
#[arg(short, long, default_value = "1.0")]
candidate_selection_chance: f32,
/// The path to write the final JSON model file to.
output: PathBuf,
/// The number of epochs (and patch rules) to train.
epochs: usize,
/// Path to a `.conllu` dataset to train on.
#[arg(num_args = 1..)]
datasets: Vec<PathBuf>,
},
/// Print harper-core version.
CoreVersion,
/// Rename a flag in the dictionary and affixes.
@ -91,6 +114,8 @@ enum Args {
/// Emit a decompressed, line-separated list of the compounds in Harper's dictionary.
/// As long as there's either an open or hyphenated spelling.
Compounds,
/// Provided a sentence or phrase, emit a list of each noun phrase contained within.
NominalPhrases { input: String },
}
fn main() -> anyhow::Result<()> {
@ -380,6 +405,27 @@ fn main() -> anyhow::Result<()> {
println!("harper-core v{}", harper_core::core_version());
Ok(())
}
Args::TrainBrillTagger {
datasets: dataset,
epochs,
output,
candidate_selection_chance,
} => {
let tagger = BrillTagger::train(&dataset, epochs, candidate_selection_chance);
fs::write(output, serde_json::to_string_pretty(&tagger)?)?;
Ok(())
}
Args::TrainBrillChunker {
datasets,
epochs,
output,
candidate_selection_chance,
} => {
let chunker = BrillChunker::train(&datasets, epochs, candidate_selection_chance);
fs::write(output, serde_json::to_string_pretty(&chunker)?)?;
Ok(())
}
Args::RenameFlag { old, new, dir } => {
use serde_json::Value;
@ -547,6 +593,18 @@ fn main() -> anyhow::Result<()> {
println!("\nFound {} compound word groups", results.len());
Ok(())
}
Args::NominalPhrases { input } => {
let doc = Document::new_markdown_default_curated(&input);
for phrase in doc.iter_nominal_phrases() {
let s =
doc.get_span_content_str(&phrase.span().ok_or(anyhow!("Unable to get span"))?);
println!("{s}");
}
Ok(())
}
}
}
@ -562,6 +620,7 @@ fn load_file(
.map(|v| v.to_str().unwrap())
{
Some("md") => Box::new(Markdown::default()),
Some("lhs") => Box::new(LiterateHaskellParser::new_markdown(
MarkdownOptions::default(),
)),

View file

@ -31,6 +31,7 @@ foldhash = "0.1.5"
strum_macros = "0.27.1"
strum = "0.27.1"
ammonia = "4.1.0"
harper-brill = { path = "../harper-brill", version = "0.42.0" }
bitflags = { version = "2.9.1", features = ["serde"] }
[dev-dependencies]

View file

@ -2,6 +2,7 @@ use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;
use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
use paste::paste;
use crate::expr::{Expr, ExprExt, LongestMatchOf, Repeating, SequenceExpr};
@ -9,10 +10,8 @@ use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::word_metadata::AdjectiveData;
use crate::{
Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, NounData, Token, TokenKind,
TokenStringExt,
Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt,
};
use crate::{OrdinalSuffix, Span};
@ -140,107 +139,34 @@ impl Document {
self.condense_ellipsis();
self.condense_latin();
self.match_quotes();
self.articles_imply_nouns();
// annotate word metadata
let token_strings: Vec<_> = self
.tokens
.iter()
.filter(|t| !t.kind.is_whitespace())
.map(|t| self.get_span_content_str(&t.span))
.collect();
let token_tags = brill_tagger().tag_sentence(&token_strings);
let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);
let mut i = 0;
// Annotate word metadata
for token in self.tokens.iter_mut() {
if let TokenKind::Word(meta) = &mut token.kind {
let word_source = token.span.get_content(&self.source);
let found_meta = dictionary.get_word_metadata(word_source);
*meta = found_meta.cloned()
}
}
let mut found_meta = dictionary.get_word_metadata(word_source).cloned();
// refine and disambiguate word metadata
self.known_preposition();
self.articles_imply_not_verb();
}
fn uncached_article_expr() -> Lrc<SequenceExpr> {
Lrc::new(
SequenceExpr::default()
.then_determiner()
.then_whitespace()
.then(|t: &Token, _source: &[char]| t.kind.is_adjective() && t.kind.is_noun())
.then_whitespace()
.then_noun(),
)
}
thread_local! {static ARTICLE_EXPR: Lrc<SequenceExpr> = Document::uncached_article_expr()}
/// When a word that is either an adjective or a noun is sandwiched between an article and a noun,
/// it definitely is not a noun.
fn articles_imply_nouns(&mut self) {
let expr = Self::ARTICLE_EXPR.with(|v| v.clone());
for m in expr.iter_matches_in_doc(self).collect::<Vec<_>>() {
if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start + 2].kind {
metadata.noun = None;
metadata.verb = None;
}
}
}
/// A proposition-like word followed by a determiner or number is typically
/// really a preposition.
fn known_preposition(&mut self) {
fn create_expr() -> Lrc<SequenceExpr> {
Lrc::new(
SequenceExpr::default()
.then(WordSet::new(&["in", "at", "on", "to", "for", "by", "with"]))
.then_whitespace()
.then(|t: &Token, _source: &[char]| {
t.kind.is_determiner() || t.kind.is_number()
}),
)
}
thread_local! {static EXPR: Lrc<SequenceExpr> = create_expr()}
let expr = EXPR.with(|v| v.clone());
for m in expr.iter_matches_in_doc(self).collect::<Vec<_>>() {
if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start].kind {
metadata.noun = None;
metadata.pronoun = None;
metadata.verb = None;
metadata.adjective = None;
}
}
}
/// The first word after an article cannot be a verb.
fn articles_imply_not_verb(&mut self) {
fn create_pattern() -> Lrc<SequenceExpr> {
Lrc::new(
SequenceExpr::default()
.then(WordSet::new(&[
// articles
"a", "an", "the",
// Dependent genitive pronouns serve a similar role to articles.
// Unfortunately, some overlap with other pronoun forms. E.g.
// "I like her", "Something about her struck me as odd."
"my", "your", "thy", "thine", "his", /*"her",*/ "its", "our", "their",
"whose", // "no" is also a determiner
"no",
]))
.then_whitespace()
.then_verb(),
)
}
thread_local! {static EXPR: Lrc<SequenceExpr> = create_pattern()}
let expr = EXPR.with(|v| v.clone());
for m in expr.iter_matches_in_doc(self).collect::<Vec<_>>() {
if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.end - 1].kind {
if metadata.noun.is_none()
&& metadata.adjective.is_none()
&& metadata.adverb.is_none()
{
metadata.noun = Some(NounData::default());
metadata.adjective = Some(AdjectiveData::default());
if let Some(inner) = &mut found_meta {
inner.pos_tag = token_tags[i];
inner.np_member = Some(np_flags[i]);
}
metadata.verb = None;
*meta = found_meta;
i += 1;
} else if !token.kind.is_whitespace() {
i += 1;
}
}
}
@ -331,6 +257,40 @@ impl Document {
self.tokens.iter()
}
pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
fn is_np_member(t: &Token) -> bool {
t.kind
.as_word()
.and_then(|x| x.as_ref())
.and_then(|w| w.np_member)
.unwrap_or(false)
}
fn trim(slice: &[Token]) -> &[Token] {
let mut start = 0;
let mut end = slice.len();
while start < end && slice[start].kind.is_whitespace() {
start += 1;
}
while end > start && slice[end - 1].kind.is_whitespace() {
end -= 1;
}
&slice[start..end]
}
self.tokens
.as_slice()
.split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
.filter_map(|s| {
let s = trim(s);
if s.iter().any(is_np_member) {
Some(s)
} else {
None
}
})
}
/// Get an iterator over all the tokens contained in the document.
pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
self.tokens().map(|token| token.to_fat(&self.source))

View file

@ -12,13 +12,9 @@ pub(crate) fn is_content_word(tok: &Token, src: &[char]) -> bool {
};
tok.span.len() > 1
&& (meta.is_noun() || meta.is_adjective())
&& (meta.is_noun() || meta.is_adjective() || meta.is_verb() || meta.is_adverb())
&& !meta.determiner
&& (!meta.preposition || tok.span.get_content_string(src).to_lowercase() == "bar")
&& !meta.is_adverb()
&& !meta.is_conjunction()
&& !meta.is_pronoun()
&& !meta.is_auxiliary_verb()
}
pub(crate) fn predicate(closed: Option<&WordMetadata>, open: Option<&WordMetadata>) -> bool {

View file

@ -1,9 +1,15 @@
use harper_brill::UPOS;
use crate::expr::All;
use crate::expr::Expr;
use crate::expr::SequenceExpr;
use crate::patterns::NominalPhrase;
use crate::patterns::Pattern;
use crate::patterns::UPOSSet;
use crate::patterns::WordSet;
use crate::{
Token,
linting::{ExprLinter, Lint, LintKind, Suggestion},
patterns::WordSet,
};
pub struct ItsContraction {
@ -12,14 +18,22 @@ pub struct ItsContraction {
impl Default for ItsContraction {
fn default() -> Self {
let its = WordSet::new(&["its"]);
let verbs = WordSet::new(&["had", "been", "got"]);
let pattern = SequenceExpr::default()
.then(its)
let positive = SequenceExpr::default()
.t_aco("its")
.then_whitespace()
.then(verbs);
.then(UPOSSet::new(&[UPOS::VERB, UPOS::AUX]));
let exceptions = SequenceExpr::default()
.then_anything()
.then_anything()
.then(WordSet::new(&["own", "intended"]));
let inverted = SequenceExpr::default().if_not_then_step_one(exceptions);
let expr = All::new(vec![Box::new(positive), Box::new(inverted)]);
Self {
expr: Box::new(pattern),
expr: Box::new(expr),
}
}
}
@ -32,6 +46,13 @@ impl ExprLinter for ItsContraction {
fn match_to_lint(&self, toks: &[Token], source: &[char]) -> Option<Lint> {
let offender = toks.first()?;
let offender_chars = offender.span.get_content(source);
if !toks.get(2)?.kind.is_upos(UPOS::AUX)
&& NominalPhrase.matches(&toks[2..], source).is_some()
{
return None;
}
Some(Lint {
span: offender.span,
lint_kind: LintKind::WordChoice,
@ -39,7 +60,8 @@ impl ExprLinter for ItsContraction {
Suggestion::replace_with_match_case_str("it's", offender_chars),
Suggestion::replace_with_match_case_str("it has", offender_chars),
],
message: "Use `it's` (short for `it has`) here, not the possessive `its`.".to_owned(),
message: "Use `it's` (short for `it has` or `it is`) here, not the possessive `its`."
.to_owned(),
priority: 54,
})
}
@ -98,4 +120,13 @@ mod tests {
0,
);
}
#[test]
fn ignore_coroutine() {
assert_lint_count(
"Launch each task within its own child coroutine.",
ItsContraction::default(),
0,
);
}
}

View file

@ -54,8 +54,7 @@ impl ThenThan {
// TODO: This can be simplified or eliminated when the adjective improvements make it into the affix system.
fn is_comparative_adjective(tok: &Token, source: &[char]) -> bool {
tok.kind
.is_adjective()
(tok.kind.is_adjective() || tok.kind.is_adverb())
.then(|| tok.span.get_content(source))
.is_some_and(|src| {
// Regular comparative form?

View file

@ -13,6 +13,7 @@ mod indefinite_article;
mod inflection_of_be;
mod invert;
mod nominal_phrase;
mod upos_set;
mod whitespace_pattern;
mod within_edit_distance;
mod word;
@ -24,6 +25,7 @@ pub use indefinite_article::IndefiniteArticle;
pub use inflection_of_be::InflectionOfBe;
pub use invert::Invert;
pub use nominal_phrase::NominalPhrase;
pub use upos_set::UPOSSet;
pub use whitespace_pattern::WhitespacePattern;
pub use within_edit_distance::WithinEditDistance;
pub use word::Word;

View file

@ -0,0 +1,30 @@
use harper_brill::UPOS;
use smallvec::{SmallVec, ToSmallVec};
use crate::Token;
use super::Pattern;
pub struct UPOSSet {
allowed_tags: SmallVec<[UPOS; 10]>,
}
impl UPOSSet {
pub fn new(allowed: &[UPOS]) -> Self {
Self {
allowed_tags: allowed.to_smallvec(),
}
}
}
impl Pattern for UPOSSet {
fn matches(&self, tokens: &[Token], _source: &[char]) -> Option<usize> {
tokens.first()?.kind.as_word()?.as_ref().and_then(|w| {
if self.allowed_tags.contains(&(w.pos_tag?)) {
Some(1)
} else {
None
}
})
}
}

View file

@ -1,3 +1,4 @@
use harper_brill::UPOS;
use is_macro::Is;
use serde::{Deserialize, Serialize};
@ -447,4 +448,12 @@ impl TokenKind {
pub fn is_whitespace(&self) -> bool {
matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
}
pub fn is_upos(&self, upos: UPOS) -> bool {
let Some(Some(meta)) = self.as_word() else {
return false;
};
meta.pos_tag == Some(upos)
}
}

View file

@ -1,3 +1,4 @@
use harper_brill::UPOS;
use is_macro::Is;
use paste::paste;
use serde::{Deserialize, Serialize};
@ -32,6 +33,10 @@ pub struct WordMetadata {
pub common: bool,
#[serde(default = "default_none")]
pub derived_from: Option<WordId>,
/// Generated by a chunker
pub np_member: Option<bool>,
/// Generated by a POS tagger
pub pos_tag: Option<UPOS>,
}
/// Needed for `serde`
@ -120,6 +125,180 @@ impl WordMetadata {
preposition: self.preposition || other.preposition,
common: self.common || other.common,
derived_from: self.derived_from.or(other.derived_from),
pos_tag: self.pos_tag.or(other.pos_tag),
np_member: self.np_member.or(other.np_member),
}
}
/// Given a UPOS tag, discard any metadata that would disagree with the given POS tag.
/// For example, if the metadata suggests a word could either be a noun or an adjective, and we
/// provide a [`UPOS::NOUN`], this function will remove the adjective data.
///
/// Additionally, if the metadata does not currently declare the potential of the word to be
/// the specific POS, it becomes so. That means if we provide a [`UPOS::ADJ`] to the function
/// for a metadata whose `Self::adjective = None`, it will become `Some`.
pub fn enforce_pos_exclusivity(&mut self, pos: &UPOS) {
use UPOS::*;
match pos {
NOUN => {
if let Some(noun) = self.noun {
self.noun = Some(NounData {
is_proper: Some(false),
..noun
})
} else {
self.noun = Some(NounData {
is_proper: Some(false),
is_plural: None,
is_possessive: None,
})
}
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
PROPN => {
if let Some(noun) = self.noun {
self.noun = Some(NounData {
is_proper: Some(true),
..noun
})
} else {
self.noun = Some(NounData {
is_proper: Some(true),
is_plural: None,
is_possessive: None,
})
}
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
PRON => {
if self.pronoun.is_none() {
self.pronoun = Some(PronounData::default())
}
self.noun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
VERB => {
if let Some(verb) = self.verb {
self.verb = Some(VerbData {
is_auxiliary: Some(false),
..verb
})
} else {
self.verb = Some(VerbData {
is_auxiliary: Some(false),
..Default::default()
})
}
self.noun = None;
self.pronoun = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
AUX => {
if let Some(verb) = self.verb {
self.verb = Some(VerbData {
is_auxiliary: Some(true),
..verb
})
} else {
self.verb = Some(VerbData {
is_auxiliary: Some(true),
..Default::default()
})
}
self.noun = None;
self.pronoun = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
ADJ => {
if self.adjective.is_none() {
self.adjective = Some(AdjectiveData::default())
}
self.noun = None;
self.pronoun = None;
self.verb = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
ADV => {
if self.adverb.is_none() {
self.adverb = Some(AdverbData::default())
}
self.noun = None;
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
ADP => {
self.noun = None;
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = true;
}
DET => {
self.noun = None;
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.preposition = false;
self.determiner = true;
}
CCONJ | SCONJ => {
if self.conjunction.is_none() {
self.conjunction = Some(ConjunctionData::default())
}
self.noun = None;
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.determiner = false;
self.preposition = false;
}
_ => {}
}
}

View file

@ -47,6 +47,7 @@
//! - Determiners are denoted by `D`.
//! - Prepositions are denoted by `P`.
//! - Dialects are denoted by `Am`, `Br`, `Ca`, or `Au`.
//! - Noun phrase membership is denoted by `+`
//!
//! The tagger supports uncertainty, so a single word can be e.g. both a
//! noun and a verb. This is denoted by a `/` between the tags.
@ -146,6 +147,8 @@ fn format_word_tag(word: &WordMetadata) -> String {
}
});
add_switch(&mut tags, word.np_member, "+", "");
if tags.is_empty() {
String::from("W?")
} else {

View file

@ -85,7 +85,7 @@ create_test!(pr_452.md, 2, Dialect::American);
create_test!(hex_basic_clean.md, 0, Dialect::American);
create_test!(hex_basic_dirty.md, 1, Dialect::American);
create_test!(misc_closed_compound_clean.md, 0, Dialect::American);
create_test!(yogurt_british_clean.md, 0, Dialect::British);
create_test!(yogurt_british_clean.md, 1, Dialect::British);
// Make sure it doesn't panic
create_test!(lukas_homework.md, 3, Dialect::American);

View file

@ -456,6 +456,15 @@ Message: |
Lint: Capitalization (31 priority)
Message: |
226 | himself as he came, “Oh! the Duchess, the Duchess! Oh! wont she be savage if
| ^~~ This sentence does not start with a capital letter
Suggest:
- Replace with: “The”
Lint: Capitalization (31 priority)
Message: |
226 | himself as he came, “Oh! the Duchess, the Duchess! Oh! wont she be savage if

View file

@ -209,6 +209,24 @@ Suggest:
Lint: Capitalization (31 priority)
Message: |
340 | on the left, on the right, on the side, on the bottom.
| ^~ This sentence does not start with a capital letter
Suggest:
- Replace with: “On”
Lint: Capitalization (31 priority)
Message: |
342 | on a bus, on a train, on a plane, on a ferry, on a yacht.
| ^~ This sentence does not start with a capital letter
Suggest:
- Replace with: “On”
Lint: Miscellaneous (31 priority)
Message: |
343 | All of the responsibility is on him.

View file

@ -204,6 +204,16 @@ Message: |
Lint: WordChoice (63 priority)
Message: |
89 | third Class at the Expiration of the sixth Year, so that one third may be
| ^~~~~~ Did you mean the closed compound noun “maybe”?
90 | chosen every second Year; and when vacancies happen in the representation of
Suggest:
- Replace with: “maybe”
Lint: Readability (127 priority)
Message: |
96 | No Person shall be a Senator who shall not have attained to the Age of thirty
@ -1541,6 +1551,16 @@ Message: |
Lint: WordChoice (63 priority)
Message: |
658 | questioned. But neither the United States nor any State shall assume or pay any
659 | debt or obligation incurred in aid of insurrection or rebellion against the
| ^~~~~~~ Did you mean the closed compound noun “debtor”?
Suggest:
- Replace with: “debtor”
Lint: Spelling (63 priority)
Message: |
663 | ## Article. V.

View file

@ -1949,6 +1949,16 @@ Suggest:
Lint: WordChoice (63 priority)
Message: |
1531 | puppyish, convivial way, girls were swooning backward playfully into mens arms,
1532 | even into groups, knowing that some one would arrest their falls—but no one
| ^~~~~~~~ Did you mean the closed compound noun “someone”?
Suggest:
- Replace with: “someone”
Lint: Miscellaneous (31 priority)
Message: |
1531 | puppyish, convivial way, girls were swooning backward playfully into mens arms,
@ -6441,6 +6451,16 @@ Suggest:
Lint: WordChoice (63 priority)
Message: |
5181 | easier, surer way of finding out what he wanted to know. By half-past two he was
5182 | in West Egg, where he asked some one the way to Gatsbys house. So by that time
| ^~~~~~~~ Did you mean the closed compound noun “someone”?
Suggest:
- Replace with: “someone”
Lint: Miscellaneous (31 priority)
Message: |
5181 | easier, surer way of finding out what he wanted to know. By half-past two he was
@ -7123,6 +7143,16 @@ Suggest:
Lint: WordChoice (63 priority)
Message: |
5642 | message or a flower. Dimly I heard some one murmur “Blessed are the dead that
| ^~~~~~~~ Did you mean the closed compound noun “someone”?
5643 | the rain falls on,” and then the owl-eyed man said “Amen to that,” in a brave
Suggest:
- Replace with: “someone”
Lint: Miscellaneous (31 priority)
Message: |
5642 | message or a flower. Dimly I heard some one murmur “Blessed are the dead that
@ -7462,6 +7492,16 @@ Suggest:
Lint: WordChoice (54 priority)
Message: |
5814 | green breast of the new world. Its vanished trees, the trees that had made way
| ^~~ Use `it's` (short for `it has` or `it is`) here, not the possessive `its`.
Suggest:
- Replace with: “It's”
- Replace with: “It has”
Lint: Readability (127 priority)
Message: |
5814 | green breast of the new world. Its vanished trees, the trees that had made way

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -7,434 +7,434 @@
> -->
# Unlintable Unlintable
> Part - of - speech tagging
# Unlintable NSg/V/J . P . NSg/V NSg/V
# Unlintable NSg/V/J . P . NSg/V+ NSg/V
>
#
> In corpus linguistics , part - of - speech tagging ( POS tagging or PoS tagging or
# NPrSg/J/P NSg NSg . NSg/V/J . P . NSg/V NSg/V . NSg NSg/V NPrSg/C NSg NSg/V NPrSg/C
> POST ) , also called grammatical tagging is the process of marking up a word in a
# NPrSg/V/P . . W? V/J J NSg/V VL D NSg P NSg/V NSg/V/J/P D/P NSg P D/P
> text ( corpus ) as corresponding to a particular part of speech , based on both its
# NSg . NSg . NSg/R NSg/V/J P D/P NSg/J NSg/V/J P NSg/V . V/J J/P I/C ISg/D
> definition and its context . A simplified form of this is commonly taught to
# NSg V/C ISg/D NSg . D/P J NSg/V P I/D VL R V P
> school - age children , in the identification of words as nouns , verbs , adjectives ,
# NSg/V . NSg/V NPl . P D NSg P NPl/V NSg/R NPl/V . NPl/V . NPl/V .
> In corpus linguistics , part - of - speech tagging ( POS tagging or PoS tagging or
# NPrSg/J/P NSg+ NSg . NSg/V/J . P . NSg/V NSg/V . NSg+ NSg/V NPrSg/C NSg+ NSg/V NPrSg/C
> POST ) , also called grammatical tagging is the process of marking up a word in a
# NPrSg/V/P+ . . W? V/J J NSg/V VL D NSg/V P NSg/V NSg/V/J/P D/P NSg/V NPrSg/J/P D/P
> text ( corpus ) as corresponding to a particular part of speech , based on both its
# NSg/V . NSg+ . NSg/R NSg/V/J P D/P NSg/J NSg/V/J P NSg/V+ . V/J J/P I/C ISg/D+
> definition and its context . A simplified form of this is commonly taught to
# NSg V/C ISg/D+ NSg/V+ . D/P V/J NSg/V P I/D+ VL R V P
> school - age children , in the identification of words as nouns , verbs , adjectives ,
# NSg/V . NSg/V NPl . NPrSg/J/P D NSg P NPl/V+ NSg/R NPl/V . NPl/V+ . NPl/V .
> adverbs , etc.
# NPl/V . W?
>
#
> Once performed by hand , POS tagging is now done in the context of computational
# NSg/C V/J NSg/J/P NSg/V . NSg NSg/V VL NPrSg/V/J/C NSg/V/J P D NSg P J
> linguistics , using algorithms which associate discrete terms , as well as hidden
# NSg . V NPl I/C NSg/V/J J NPl/V . NSg/R NSg/V/J NSg/R V/J
> parts of speech , by a set of descriptive tags . POS - tagging algorithms fall into
# NPl/V P NSg/V . P D/P NPrSg/J P NSg/J NPl/V . NSg . NSg/V NPl NSg/V P
> two distinctive groups : rule - based and stochastic . E. Brill's tagger , one of the
# NSg NSg/J NPl/V . NSg/V . V/J V/C J . ? ? NSg . NSg/I/V/J P D
> first and most widely used English POS - taggers , employs rule - based algorithms .
# NSg/J V/C NSg/I/J R V/J NPrSg/V/J NSg . NPl . NPl/V NSg/V . V/J NPl .
> Once performed by hand , POS tagging is now done in the context of computational
# NSg/C V/J NSg/J/P NSg/V+ . NSg+ NSg/V VL NPrSg/V/J/C NSg/V/J NPrSg/J/P D NSg/V P J+
> linguistics , using algorithms which associate discrete terms , as well as hidden
# NSg+ . V NPl+ I/C+ NSg/V/J+ J NPl/V+ . NSg/R NSg/V/J NSg/R V/J
> parts of speech , by a set of descriptive tags . POS - tagging algorithms fall into
# NPl/V P NSg/V+ . NSg/J/P D/P NPrSg/V/J P NSg/J+ NPl/V+ . NSg+ . NSg/V NPl NSg/V P
> two distinctive groups : rule - based and stochastic . E. Brill's tagger , one of the
# NSg NSg/J NPl/V+ . NSg/V+ . V/J+ V/C+ J+ . ? ? NSg . NSg/I/V/J P D
> first and most widely used English POS - taggers , employs rule - based algorithms .
# NSg/V/J V/C NSg/I/J R V/J NPrSg/V/J+ NSg+ . NPl . NPl/V NSg/V+ . V/J NPl+ .
>
#
> Principle
# NSg/V
>
#
> Part - of - speech tagging is harder than just having a list of words and their
# NSg/V/J . P . NSg/V NSg/V VL J C/P V/J V D/P NSg P NPl/V V/C D
> parts of speech , because some words can represent more than one part of speech
# NPl P NSg/V . C/P I/J/R NPl/V NPrSg/VX V NPrSg/I/V/J C/P NSg/I/V/J NSg/V/J P NSg/V
> at different times , and because some parts of speech are complex . This is not
# NSg/P NSg/J NPl/V . V/C C/P I/J/R NPl/V P NSg/V V NSg/V/J . I/D VL NSg/C
> Part - of - speech tagging is harder than just having a list of words and their
# NSg/V/J . P . NSg/V NSg/V VL J C/P V/J V D/P NSg/V P NPl/V V/C D+
> parts of speech , because some words can represent more than one part of speech
# NPl/V P NSg/V+ . C/P I/J/R+ NPl/V+ NPrSg/VX V NPrSg/I/V/J C/P NSg/I/V/J NSg/V/J P NSg/V+
> at different times , and because some parts of speech are complex . This is not
# NSg/P NSg/J+ NPl/V+ . V/C C/P I/J/R NPl/V P NSg/V+ V+ NSg/V/J+ . I/D+ VL NSg/C
> rare — in natural languages ( as opposed to many artificial languages ) , a large
# NSg/V/J . NPrSg/J/P NSg/J NPl/V . NSg/R V/J P N/I/J/D J NPl/V . . D/P NSg/J
> percentage of word - forms are ambiguous . For example , even " dogs " , which is
# NSg P NSg/V . NPl/V V J . C/P NSg/V . NSg/V/J . NPl/V . . I/C VL
> usually thought of as just a plural noun , can also be a verb :
# R NSg/V P NSg/R V/J D/P NSg/J NSg/V . NPrSg/VX W? NSg/VX D/P NSg .
# NSg/V/J . NPrSg/J/P NSg/J NPl/V+ . NSg/R V/J P N/I/J/D J NPl/V+ . . D/P NSg/J
> percentage of word - forms are ambiguous . For example , even " dogs " , which is
# NSg P NSg/V+ . NPl/V+ V+ J+ . C/P NSg/V+ . NSg/V/J . NPl/V+ . . I/C+ VL
> usually thought of as just a plural noun , can also be a verb :
# R NSg/V P NSg/R V/J D/P+ NSg/J+ NSg/V+ . NPrSg/VX W? NSg/VX D/P NSg/V+ .
>
#
> The sailor dogs the hatch .
# D NSg NPl/V D NSg .
> The sailor dogs the hatch .
# D+ NSg NPl/V D NSg/V+ .
>
#
> Correct grammatical tagging will reflect that " dogs " is here used as a verb , not
# NSg/V/J J NSg/V NPrSg/VX V N/I/C/D . NPl/V . VL NSg/J/R V/J NSg/R D/P NSg . NSg/C
> as the more common plural noun . Grammatical context is one way to determine
# NSg/R D NPrSg/I/J NSg/V/J NSg/J NSg/V . J NSg/V VL NSg/I/V/J NSg/J P V
> this ; semantic analysis can also be used to infer that " sailor " and " hatch "
# I/D . NSg/J NSg NPrSg/VX W? NSg/VX V/J P J N/I/C/D . NSg . V/C . NSg/V .
> implicate " dogs " as 1 ) in the nautical context and 2 ) an action applied to the
# NSg/V . NPl/V . NSg/R # . P D J NSg/V V/C # . D/P NSg/J V/J P D
> object " hatch " ( in this context , " dogs " is a nautical term meaning " fastens ( a
# NSg . NSg/V . . P I/D NSg/V . . NPl/V . VL D/P J NSg/V/J NSg/V/J . V . D/P
> watertight door ) securely " ) .
# J NSg/V . R . . .
> Correct grammatical tagging will reflect that " dogs " is here used as a verb , not
# NSg/V/J+ J NSg/V NPrSg/VX V N/I/C/D+ . NPl/V+ . VL NSg/J/R V/J NSg/R D/P+ NSg/V+ . NSg/C
> as the more common plural noun . Grammatical context is one way to determine
# NSg/R D NPrSg/I/V/J NSg/V/J NSg/J NSg/V+ . J NSg/V+ VL NSg/I/V/J NSg/J+ P V
> this ; semantic analysis can also be used to infer that " sailor " and " hatch "
# I/D+ . NSg/J NSg+ NPrSg/VX W? NSg/VX V/J P J N/I/C/D+ . NSg+ . V/C . NSg/V .
> implicate " dogs " as 1 ) in the nautical context and 2 ) an action applied to the
# NSg/V . NPl/V . NSg/R # . NPrSg/J/P D+ J+ NSg/V+ V/C # . D/P NSg/V/J+ V/J P D
> object " hatch " ( in this context , " dogs " is a nautical term meaning " fastens ( a
# NSg/V+ . NSg/V . . NPrSg/J/P I/D+ NSg/V+ . . NPl/V+ . VL D/P J NSg/V/J+ NSg/V/J+ . V . D/P
> watertight door ) securely " ) .
# J NSg/V+ . R . . .
>
#
> Tag sets
# NSg/V NPl/V
> Tag sets
# NSg/V+ NPl/V
>
#
> Schools commonly teach that there are 9 parts of speech in English : noun , verb ,
# NPl/V R NSg/V N/I/C/D W? V # NPl/V P NSg/V NPrSg/J/P NPrSg/V/J . NSg/V . NSg/V .
> Schools commonly teach that there are 9 parts of speech in English : noun , verb ,
# NPl/V+ R NSg/V N/I/C/D + V # NPl/V P NSg/V+ NPrSg/J/P NPrSg/V/J . NSg/V+ . NSg/V+ .
> article , adjective , preposition , pronoun , adverb , conjunction , and interjection .
# NSg/V . NSg/V/J . NSg/V . NSg/V . NSg/V . NSg/V . V/C NSg .
# NSg/V+ . NSg/V/J+ . NSg/V . NSg/V+ . NSg/V+ . NSg/V+ . V/C NSg+ .
> However , there are clearly many more categories and sub - categories . For nouns ,
# C . W? V R N/I/J/D NPrSg/I/V/J NPl V/C NSg/V/P . NPl . C/P NPl/V .
> the plural , possessive , and singular forms can be distinguished . In many
# D NSg/J . NSg/J . V/C NSg/J NPl/V NPrSg/VX NSg/VX V/J . P N/I/J/D
> languages words are also marked for their " case " ( role as subject , object ,
# NPl/V NPl/V V W? V/J C/P D . NPrSg/V . . NSg NSg/R NSg/V/J . NSg/V .
> etc. ) , grammatical gender , and so on ; while verbs are marked for tense , aspect ,
# W? . . J NSg/V/J . V/C NSg/I/J/C J/P . NSg/V/C/P NPl/V V V/J C/P NSg/V/J . NSg/V .
> and other things . In some tagging systems , different inflections of the same
# V/C NSg/V/J NPl/V . NPrSg/J/P I/J/R NSg/V NPl . NSg/J NPl P D I/J
> root word will get different parts of speech , resulting in a large number of
# NPrSg/V NSg/V NPrSg/VX NSg/V NSg/J NPl/V P NSg/V . V P D/P NSg/J NSg/V/J P
> tags . For example , NN for singular common nouns , NNS for plural common nouns , NP
# NPl/V . C/P NSg/V . ? C/P NSg/J NSg/V/J NPl/V . ? C/P NSg/J NSg/V/J NPl/V . NPrSg
> for singular proper nouns ( see the POS tags used in the Brown Corpus ) . Other
# C/P NSg/J NSg/J NPl/V . NSg/V D NSg NPl/V V/J P D NPrSg/J NSg . . NSg/V/J
> tagging systems use a smaller number of tags and ignore fine differences or
# NSg/V NPl NSg/V D/P J NSg/V/J P NPl/V V/C V NSg/V/J NSg/V NPrSg/C
> model them as features somewhat independent from part - of - speech .
# NSg/V/J N/I NSg/R NPl/V NSg/I NSg/J P NSg/V/J . P . NSg/V .
# C . + V R N/I/J/D NPrSg/I/V/J NPl+ V/C NSg/V/P . NPl . C/P NPl/V .
> the plural , possessive , and singular forms can be distinguished . In many
# D NSg/J . NSg/J . V/C NSg/J NPl/V+ NPrSg/VX+ NSg/VX+ V/J+ . NPrSg/J/P N/I/J/D+
> languages words are also marked for their " case " ( role as subject , object ,
# NPl/V+ NPl/V+ V W? V/J C/P D+ . NPrSg/V+ . . NSg NSg/R NSg/V/J . NSg/V+ .
> etc. ) , grammatical gender , and so on ; while verbs are marked for tense , aspect ,
# + . . J+ NSg/V/J+ . V/C NSg/I/J/C J/P . NSg/V/C/P NPl/V+ V V/J C/P NSg/V/J . NSg/V+ .
> and other things . In some tagging systems , different inflections of the same
# V/C NSg/V/J+ NPl/V+ . NPrSg/J/P I/J/R+ NSg/V NPl+ . NSg/J NPl P D+ I/J+
> root word will get different parts of speech , resulting in a large number of
# NPrSg/V+ NSg/V+ NPrSg/VX NSg/V NSg/J NPl/V P NSg/V+ . V NPrSg/J/P D/P NSg/J NSg/V/J P+
> tags . For example , NN for singular common nouns , NNS for plural common nouns , NP
# NPl/V+ . C/P NSg/V+ . ? C/P NSg/J NSg/V/J+ NPl/V . ? C/P NSg/J NSg/V/J+ NPl/V . NPrSg
> for singular proper nouns ( see the POS tags used in the Brown Corpus ) . Other
# C/P NSg/J NSg/J NPl/V . NSg/V D+ NSg+ NPl/V+ V/J NPrSg/J/P D+ NPrSg/V/J+ NSg+ . . NSg/V/J
> tagging systems use a smaller number of tags and ignore fine differences or
# NSg/V NPl+ NSg/V D/P J NSg/V/J P NPl/V+ V/C V NSg/V/J NSg/V NPrSg/C
> model them as features somewhat independent from part - of - speech .
# NSg/V/J+ N/I+ NSg/R+ NPl/V+ NSg/I NSg/J P NSg/V/J . P . NSg/V+ .
>
#
> In part - of - speech tagging by computer , it is typical to distinguish from 50 to
# NPrSg/J/P NSg/V/J . P . NSg/V NSg/V NSg/J/P NSg/V . NPrSg/ISg VL NSg/J P V P # P
> 150 separate parts of speech for English . Work on stochastic methods for tagging
# # NSg/V/J NPl/V P NSg/V C/P NPrSg/V/J . NSg/V J/P J NPl/V C/P NSg/V
> In part - of - speech tagging by computer , it is typical to distinguish from 50 to
# NPrSg/J/P NSg/V/J . P . NSg/V NSg/V NSg/J/P NSg/V+ . NPrSg/ISg+ VL NSg/J P V P # P
> 150 separate parts of speech for English . Work on stochastic methods for tagging
# # NSg/V/J NPl/V P NSg/V C/P NPrSg/V/J+ . NSg/V J/P J NPl/V C/P NSg/V
> Koine Greek ( DeRose 1990 ) has used over 1 , 000 parts of speech and found that
# ? NPrSg/V/J . ? # . V V/J NSg/V/J/P # . # NPl/V P NSg/V V/C NSg/V N/I/C/D
> about as many words were ambiguous in that language as in English . A
# J/P NSg/R N/I/J/D NPl/V NSg/V J P N/I/C/D NSg/V NSg/R NPrSg/J/P NPrSg/V/J . D/P
> morphosyntactic descriptor in the case of morphologically rich languages is
# ? NSg P D NPrSg P ? NPrSg/V/J NPl/V VL
> commonly expressed using very short mnemonics , such as Ncmsan for Category = Noun ,
# R V/J V J NPrSg/V/J/P NPl . NSg/I NSg/R ? C/P NSg . NSg/V .
# ? NPrSg/V/J . ? # . V V/J NSg/V/J/P # . # NPl/V P NSg/V+ V/C NSg/V N/I/C/D
> about as many words were ambiguous in that language as in English . A
# J/P NSg/R N/I/J/D+ NPl/V+ NSg/V J NPrSg/J/P N/I/C/D+ NSg/V+ NSg/R NPrSg/J/P NPrSg/V/J+ . D/P
> morphosyntactic descriptor in the case of morphologically rich languages is
# ? NSg NPrSg/J/P D NPrSg/V P ? NPrSg/V/J NPl/V+ VL
> commonly expressed using very short mnemonics , such as Ncmsan for Category = Noun ,
# R V/J V J NPrSg/V/J/P+ NPl . NSg/I NSg/R ? C/P NSg . NSg/V+ .
> Type = common , Gender = masculine , Number = singular , Case = accusative , Animate
# NSg/V . NSg/V/J . NSg/V/J . NSg/J . NSg/V/J . NSg/J . NPrSg/V . NSg/J . V/J
> = no .
# . NPrSg/P .
>
#
> The most popular " tag set " for POS tagging for American English is probably the
# D NSg/I/J NSg/J . NSg/V NPrSg/V/J . C/P NSg NSg/V C/P NPrSg/J NPrSg/V/J VL R D
> Penn tag set , developed in the Penn Treebank project . It is largely similar to
# NPr NSg/V NPrSg/V/J . V/J P D NPr ? NSg/V . NPrSg/ISg VL R NSg/J P
> the earlier Brown Corpus and LOB Corpus tag sets , though much smaller . In
# D J NPrSg/V/J NSg V/C NSg/V NSg NSg/V NPl/V . V/C N/I/J J . NPrSg/J/P
> Europe , tag sets from the Eagles Guidelines see wide use and include versions
# NPr . NSg/V NPl/V P D NPl NPl NSg/V NSg/J NSg/V V/C NSg/V NPl/V
> The most popular " tag set " for POS tagging for American English is probably the
# D NSg/I/J NSg/J . NSg/V+ NPrSg/V/J . C/P NSg+ NSg/V C/P NPrSg/J NPrSg/V/J+ VL R D+
> Penn tag set , developed in the Penn Treebank project . It is largely similar to
# NPr+ NSg/V+ NPrSg/V/J . V/J NPrSg/J/P D+ NPr+ ? NSg/V+ . NPrSg/ISg+ VL R NSg/J P
> the earlier Brown Corpus and LOB Corpus tag sets , though much smaller . In
# D J NPrSg/V/J NSg V/C NSg/V NSg+ NSg/V+ NPl/V . V/C N/I/J+ J+ . NPrSg/J/P
> Europe , tag sets from the Eagles Guidelines see wide use and include versions
# NPr+ . NSg/V+ NPl/V P D+ NPl/V+ NPl+ NSg/V NSg/J NSg/V+ V/C NSg/V NPl/V
> for multiple languages .
# C/P NSg/J NPl/V .
# C/P NSg/J+ NPl/V+ .
>
#
> POS tagging work has been done in a variety of languages , and the set of POS
# NSg NSg/V NSg/V V NSg/V NSg/V/J P D/P NSg P NPl/V . V/C D NPrSg/J P NSg
> tags used varies greatly with language . Tags usually are designed to include
# NPl/V V/J NPl/V R P NSg/V . NPl/V R V V/J P NSg/V
> POS tagging work has been done in a variety of languages , and the set of POS
# NSg+ NSg/V NSg/V+ V NSg/V NSg/V/J NPrSg/J/P D/P NSg P NPl/V+ . V/C D NPrSg/V/J P NSg+
> tags used varies greatly with language . Tags usually are designed to include
# NPl/V+ V/J NPl/V R P NSg/V+ . NPl/V+ R V V/J P NSg/V
> overt morphological distinctions , although this leads to inconsistencies such as
# NSg/J J NPl . C I/D NPl/V P NPl NSg/I NSg/R
> case - marking for pronouns but not nouns in English , and much larger
# NPrSg/V . NSg/V C/P NPl/V NSg/C/P NSg/C NPl/V NPrSg/J/P NPrSg/V/J . V/C N/I/J J
> cross - language differences . The tag sets for heavily inflected languages such as
# NPrSg/V/J/P . NSg/V NSg/V . D NSg NPl/V C/P R V/J NPl/V NSg/I NSg/R
> Greek and Latin can be very large ; tagging words in agglutinative languages such
# NPrSg/V/J V/C NPrSg/J NPrSg/VX NSg/VX J NSg/J . NSg/V NPl/V NPrSg/J/P ? NPl/V NSg/I
> as Inuit languages may be virtually impossible . At the other extreme , Petrov et
# NSg/R NPrSg/J NPl/V NPrSg/VX NSg/VX R NSg/J . P D NSg/J NSg/J . ? ?
> al. have proposed a " universal " tag set , with 12 categories ( for example , no
# ? NSg/VX V/J D/P . NSg/J . NSg/V NPrSg/V/J . P # NPl . C/P NSg/V . NPrSg/P
> subtypes of nouns , verbs , punctuation , and so on ) . Whether a very small set of
# NPl P NPl/V . NPl/V . NSg . V/C NSg/I/J/C J/P . . I/C D/P J NPrSg/V/J NPrSg/V/J P
> very broad tags or a much larger set of more precise ones is preferable , depends
# J NSg/J NPl/V NPrSg/C D/P N/I/J J NPrSg/V/J P NPrSg/I/V/J V/J NPl/V VL W? . NPl/V
> on the purpose at hand . Automatic tagging is easier on smaller tag - sets .
# P D NSg NSg/P NSg/V . NSg/J NSg/V VL J J/P J NSg/V . NPl/V .
# NSg/J J+ NPl+ . C I/D+ NPl/V P NPl NSg/I NSg/R
> case - marking for pronouns but not nouns in English , and much larger
# NPrSg/V+ . NSg/V C/P NPl/V NSg/C/P NSg/C NPl/V NPrSg/J/P NPrSg/V/J+ . V/C N/I/J J
> cross - language differences . The tag sets for heavily inflected languages such as
# NPrSg/V/J/P+ . NSg/V+ NSg/V . D+ NSg/V+ NPl/V C/P R V/J NPl/V+ NSg/I NSg/R
> Greek and Latin can be very large ; tagging words in agglutinative languages such
# NPrSg/V/J V/C NPrSg/J NPrSg/VX NSg/VX J NSg/J . NSg/V NPl/V+ NPrSg/J/P ? NPl/V+ NSg/I
> as Inuit languages may be virtually impossible . At the other extreme , Petrov et
# NSg/R NPrSg/J NPl/V+ NPrSg/VX NSg/VX R+ NSg/J+ . NSg/P D+ NSg/V/J+ NSg/J . ? ?
> al. have proposed a " universal " tag set , with 12 categories ( for example , no
# ? NSg/VX V/J D/P . NSg/J . NSg/V+ NPrSg/V/J . P # NPl . C/P NSg/V+ . NPrSg/P
> subtypes of nouns , verbs , punctuation , and so on ) . Whether a very small set of
# NPl P NPl/V . NPl/V+ . NSg+ . V/C NSg/I/J/C J/P+ . . I/C D/P J NPrSg/V/J NPrSg/V/J P
> very broad tags or a much larger set of more precise ones is preferable , depends
# J NSg/J NPl/V NPrSg/C D/P N/I/J J NPrSg/V/J P NPrSg/I/V/J V/J NPl/V+ VL W? . NPl/V
> on the purpose at hand . Automatic tagging is easier on smaller tag - sets .
# J/P D+ NSg/V NSg/P NSg/V+ . NSg/J NSg/V VL J J/P J NSg/V+ . NPl/V+ .
>
#
> History
# NSg
>
#
> The Brown Corpus
# D NPrSg/J NSg
> The Brown Corpus
# D NPrSg/V/J+ NSg
>
#
> Research on part - of - speech tagging has been closely tied to corpus linguistics .
# NSg/V J/P NSg/V/J . P . NSg/V NSg/V V NSg/V R V/J P NSg NSg .
> The first major corpus of English for computer analysis was the Brown Corpus
# D NSg/J NPrSg/V/J NSg P NPrSg/V/J C/P NSg/V NSg V D NPrSg/J NSg
> developed at Brown University by Henry Kučera and W. Nelson Francis , in the
# V/J NSg/P NPrSg/V/J NSg NSg/J/P NPrSg ? V/C ? NPrSg NPr . P D
> mid - 1960s . It consists of about 1 , 000 , 000 words of running English prose text ,
# NSg/J/P . #d . NPrSg/ISg NPl/V P J/P # . # . # NPl/V P NSg/V/J/P NPrSg/V/J NSg/V NSg/V .
# NSg/V J/P NSg/V/J . P . NSg/V NSg/V V NSg/V R V/J P NSg NSg+ .
> The first major corpus of English for computer analysis was the Brown Corpus
# D NSg/V/J NPrSg/V/J NSg P NPrSg/V/J+ C/P NSg/V+ NSg+ V D NPrSg/V/J NSg
> developed at Brown University by Henry Kučera and W. Nelson Francis , in the
# V/J NSg/P NPrSg/V/J NSg NSg/J/P NPrSg+ ? V/C ? NPrSg+ NPr+ . NPrSg/J/P D
> mid - 1960s . It consists of about 1 , 000 , 000 words of running English prose text ,
# NSg/J/P+ . #d . NPrSg/ISg+ NPl/V P J/P # . # . # NPl/V P NSg/V/J/P NPrSg/V/J+ NSg/V NSg/V+ .
> made up of 500 samples from randomly chosen publications . Each sample is 2 , 000
# NSg/V NSg/V/J/P P # NPl/V P R V/J NPl . D NSg/V VL # . #
> or more words ( ending at the first sentence - end after 2 , 000 words , so that the
# NPrSg/C NPrSg/I/V/J NPl/V . NSg/V P D NSg/J NSg/V . NSg/V J/P # . # NPl/V . NSg/I/J/C N/I/C/D D
# NSg/V NSg/V/J/P P # NPl/V+ P R+ V/J NPl+ . D+ NSg/V+ VL # . #
> or more words ( ending at the first sentence - end after 2 , 000 words , so that the
# NPrSg/C NPrSg/I/V/J NPl/V+ . NSg/V NSg/P D NSg/V/J+ NSg/V+ . NSg/V J/P # . # NPl/V+ . NSg/I/J/C N/I/C/D D+
> corpus contains only complete sentences ) .
# NSg V W? NSg/V/J NPl/V . .
# NSg+ V W? NSg/V/J+ NPl/V+ . .
>
#
> The Brown Corpus was painstakingly " tagged " with part - of - speech markers over
# D NPrSg/J NSg V R . V/J . P NSg/V/J . P . NSg/V NPl/V NSg/V/J/P
> many years . A first approximation was done with a program by Greene and Rubin ,
# N/I/J/D NPl . D/P NSg/J NSg V NSg/V/J P D/P NPrSg NSg/J/P NPr V/C NPr .
> which consisted of a huge handmade list of what categories could co - occur at
# I/C V/J P D/P J NSg/J NSg/V P NSg/I NPl NSg/VX NPrSg/I/V . V NSg/P
> all . For example , article then noun can occur , but article then verb ( arguably )
# NSg/I/J/C . C/P NSg/V . NSg/V NSg/J/C NSg/V NPrSg/VX V . NSg/C/P NSg/V NSg/J/C NSg/V . R .
> cannot . The program got about 70 % correct . Its results were repeatedly reviewed
# NSg/V . D NPrSg V J/P # . NSg/V/J . ISg/D NPl NSg/V R V/J
> and corrected by hand , and later users sent in errata so that by the late 70 s
# V/C V/J NSg/J/P NSg/V . V/C J NPl NSg/V NPrSg/J/P NSg NSg/I/J/C N/I/C/D P D NSg/J # ?
> the tagging was nearly perfect ( allowing for some cases on which even human
# D NSg V R NSg/V/J . V C/P I/J/R NPl/V J/P I/C NSg/V/J NSg/V/J
> The Brown Corpus was painstakingly " tagged " with part - of - speech markers over
# D+ NPrSg/V/J NSg V R . V/J . P NSg/V/J . P . NSg/V NPl/V NSg/V/J/P
> many years . A first approximation was done with a program by Greene and Rubin ,
# N/I/J/D+ NPl+ . D/P+ NSg/V/J+ NSg+ V NSg/V/J P D/P NPrSg/V NSg/J/P NPr V/C NPr .
> which consisted of a huge handmade list of what categories could co - occur at
# I/C+ V/J P D/P J NSg/J NSg/V P NSg/I+ NPl+ NSg/VX NPrSg/I/V+ . V NSg/P+
> all . For example , article then noun can occur , but article then verb ( arguably )
# NSg/I/J/C . C/P NSg/V+ . NSg/V+ NSg/J/C NSg/V+ NPrSg/VX V . NSg/C/P NSg/V+ NSg/J/C NSg/V+ . R .
> cannot . The program got about 70 % correct . Its results were repeatedly reviewed
# NSg/V . D+ NPrSg/V+ V J/P # . NSg/V/J+ . ISg/D+ NPl/V+ NSg/V R V/J
> and corrected by hand , and later users sent in errata so that by the late 70 s
# V/C V/J NSg/J/P NSg/V+ . V/C J NPl+ NSg/V NPrSg/J/P NSg NSg/I/J/C N/I/C/D+ NSg/J/P D NSg/J # ?
> the tagging was nearly perfect ( allowing for some cases on which even human
# D NSg/V V R NSg/V/J . V C/P I/J/R NPl/V+ J/P I/C+ NSg/V/J NSg/V/J
> speakers might not agree ) .
# W? NSg/VX/J NSg/C V . .
# + NSg/VX/J NSg/C V . .
>
#
> This corpus has been used for innumerable studies of word - frequency and of
# I/D NSg V NSg/V V/J C/P J NPl/V P NSg/V . NSg V/C P
> part - of - speech and inspired the development of similar " tagged " corpora in many
# NSg/V/J . P . NSg/V V/C V/J D NSg P NSg/J . V/J . NPl P N/I/J/D
> other languages . Statistics derived by analyzing it formed the basis for most
# NSg/V/J NPl/V . NPl/V V/J NSg/J/P V NPrSg/ISg V/J D NSg C/P NSg/I/J
> later part - of - speech tagging systems , such as CLAWS and VOLSUNGA . However , by
# J NSg/V/J . P . NSg/V NSg/V NPl . NSg/I NSg/R NPl/V V/C ? . C . P
> this time ( 2005 ) it has been superseded by larger corpora such as the 100
# I/D NSg/V/J . # . NPrSg/ISg V NSg/V V/J NSg/J/P J NPl NSg/I NSg/R D #
> million word British National Corpus , even though larger corpora are rarely so
# N NSg/V NPrSg/J NSg/J NSg . NSg/V/J V/C J NPl V R NSg/I/J/C
> This corpus has been used for innumerable studies of word - frequency and of
# I/D+ NSg V NSg/V V/J C/P J NPl/V P NSg/V+ . NSg V/C P
> part - of - speech and inspired the development of similar " tagged " corpora in many
# NSg/V/J . P . NSg/V V/C V/J D NSg P NSg/J . V/J . NPl NPrSg/J/P N/I/J/D+
> other languages . Statistics derived by analyzing it formed the basis for most
# NSg/V/J+ NPl/V+ . NPl/V+ V/J NSg/J/P V NPrSg/ISg+ V/J D NSg C/P NSg/I/J
> later part - of - speech tagging systems , such as CLAWS and VOLSUNGA . However , by
# J NSg/V/J . P . NSg/V NSg/V NPl . NSg/I NSg/R NPl/V+ V/C ? . C . NSg/J/P
> this time ( 2005 ) it has been superseded by larger corpora such as the 100
# I/D+ NSg/V/J+ . # . NPrSg/ISg+ V NSg/V V/J NSg/J/P J NPl+ NSg/I NSg/R D #
> million word British National Corpus , even though larger corpora are rarely so
# N NSg/V+ NPrSg/J NSg/J+ NSg+ . NSg/V/J V/C J+ NPl+ V R NSg/I/J/C
> thoroughly curated .
# R V/J .
# R+ V/J+ .
>
#
> For some time , part - of - speech tagging was considered an inseparable part of
# C/P I/J/R NSg/V/J . NSg/V/J . P . NSg/V NSg/V V V/J D/P NSg/J NSg/V/J P
> natural language processing , because there are certain cases where the correct
# NSg/J NSg/V V . C/P W? V I/J NPl/V NSg/C D NSg/J
> natural language processing , because there are certain cases where the correct
# NSg/J+ NSg/V+ V+ . C/P + V I/J NPl/V+ NSg/C D NSg/V/J
> part of speech cannot be decided without understanding the semantics or even the
# NSg/V/J P NSg/V NSg/V NSg/VX NSg/V/J C/P NSg/V/J D NSg NPrSg/C NSg/V/J D
# NSg/V/J P NSg/V+ NSg/V NSg/VX NSg/V/J C/P NSg/V/J+ D+ NSg NPrSg/C NSg/V/J D
> pragmatics of the context . This is extremely expensive , especially because
# NPl P D NSg . I/D VL R J . R C/P
# NPl P D+ NSg/V+ . I/D+ VL R J . R C/P
> analyzing the higher levels is much harder when multiple part - of - speech
# V D J NPl/V VL N/I/J J NSg/I/C NSg/J NSg/V/J . P . NSg/V
> possibilities must be considered for each word .
# NPl NSg/V NSg/VX V/J C/P D NSg/V .
# V D+ J+ NPl/V+ VL N/I/J J NSg/I/C NSg/J NSg/V/J . P . NSg/V
> possibilities must be considered for each word .
# NPl NSg/V NSg/VX V/J C/P D+ NSg/V+ .
>
#
> Use of hidden Markov models
# NSg/V P V/J NPr NPl/V
# NSg/V P V/J NPr+ NPl/V
>
#
> In the mid - 1980s , researchers in Europe began to use hidden Markov models ( HMMs )
# P D NSg/J/P . #d . W? NPrSg/J/P NPr V P NSg/V V/J NPr NPl/V . ? .
> In the mid - 1980s , researchers in Europe began to use hidden Markov models ( HMMs )
# NPrSg/J/P D NSg/J/P . #d . W? NPrSg/J/P NPr+ V P NSg/V V/J NPr NPl/V+ . ? .
> to disambiguate parts of speech , when working to tag the Lancaster - Oslo - Bergen
# P V NPl/V P NSg/V . NSg/I/C V P NSg/V D NPr . NPr . NPr
> Corpus of British English . HMMs involve counting cases ( such as from the Brown
# NSg P NPrSg/J NPrSg/V/J . ? V V NPl/V . NSg/I NSg/R P D NPrSg/J
# P V NPl/V P NSg/V+ . NSg/I/C V P NSg/V D NPr . NPr+ . NPr
> Corpus of British English . HMMs involve counting cases ( such as from the Brown
# NSg P NPrSg/J+ NPrSg/V/J+ . ? V V NPl/V . NSg/I NSg/R P D+ NPrSg/V/J+
> Corpus ) and making a table of the probabilities of certain sequences . For
# NSg . V/C NSg/V D/P NSg P D NPl P I/J NPl/V . C/P
> example , once you've seen an article such as ' the ' , perhaps the next word is a
# NSg/V . NSg/C W? NSg/V D/P NSg NSg/I NSg/R . D . . NSg D NSg/J/P NSg/V VL D/P
> noun 40 % of the time , an adjective 40 % , and a number 20 % . Knowing this , a
# NSg # . P D NSg/J . D/P NSg/J # . . V/C D/P NSg/J # . . NSg/V/J/P I/D . D/P
> program can decide that " can " in " the can " is far more likely to be a noun than
# NPrSg NPrSg/VX V N/I/C/D . NPrSg/VX . NPrSg/J/P . D NPrSg . VL NSg/V/J NPrSg/I/V/J NSg/J P NSg/VX D/P NSg C/P
> a verb or a modal . The same method can , of course , be used to benefit from
# D/P NSg NPrSg/C D/P NSg/J . D I/J NSg/V NPrSg/VX . P NSg/V . NSg/VX V/J P NSg/V P
> knowledge about the following words .
# NSg/V J/P D NSg/J/P NPl/V .
# NSg+ . V/C NSg/V D/P NSg/V P D NPl P I/J+ NPl/V+ . C/P
> example , once you've seen an article such as ' the ' , perhaps the next word is a
# NSg/V+ . NSg/C W? NSg/V D/P NSg/V+ NSg/I NSg/R . D . . NSg D+ NSg/J/P+ NSg/V+ VL D/P
> noun 40 % of the time , an adjective 40 % , and a number 20 % . Knowing this , a
# NSg/V # . P D+ NSg/V/J+ . D/P+ NSg/V/J+ # . . V/C D/P+ NSg/V/J+ # . . NSg/V/J/P I/D+ . D/P+
> program can decide that " can " in " the can " is far more likely to be a noun than
# NPrSg/V+ NPrSg/VX V N/I/C/D+ . NPrSg/VX . NPrSg/J/P . D+ NPrSg/VX . VL NSg/V/J NPrSg/I/V/J NSg/J P NSg/VX D/P NSg/V C/P
> a verb or a modal . The same method can , of course , be used to benefit from
# D/P NSg/V NPrSg/C D/P+ NSg/J+ . D+ I/J+ NSg/V+ NPrSg/VX . P NSg/V+ . NSg/VX V/J P NSg/V P
> knowledge about the following words .
# NSg/V+ J/P D+ NSg/V/J/P+ NPl/V .
>
#
> More advanced ( " higher - order " ) HMMs learn the probabilities not only of pairs
# NPrSg/I/V/J V/J . . J . NSg/V . . ? NSg/V D NPl NSg/C W? P NPl/V
# NPrSg/I/V/J V/J . . J . NSg/V . . ? NSg/V D+ NPl+ NSg/C W? P NPl/V+
> but triples or even larger sequences . So , for example , if you've just seen a
# NSg/C/P NPl/V NPrSg/C NSg/V/J J NPl/V . NSg/I/J/C . C/P NSg/V . NSg/C W? V/J NSg/V D/P
> noun followed by a verb , the next item may be very likely a preposition ,
# NSg V/J P D/P NSg . D NSg/J/P NSg/V NPrSg/VX NSg/VX J NSg/J D/P NSg .
> article , or noun , but much less likely another verb .
# NSg/V . NPrSg/C NSg/V . NSg/C/P N/I/J V/J/C/P NSg/J I/D NSg/V .
# NSg/C/P NPl/V NPrSg/C NSg/V/J J NPl/V+ . NSg/I/J/C . C/P NSg/V+ . NSg/C W? V/J NSg/V D/P
> noun followed by a verb , the next item may be very likely a preposition ,
# NSg/V V/J NSg/J/P D/P+ NSg/V+ . D+ NSg/J/P+ NSg/V+ NPrSg/VX NSg/VX J NSg/J D/P NSg/V .
> article , or noun , but much less likely another verb .
# NSg/V+ . NPrSg/C NSg/V+ . NSg/C/P N/I/J V/J/C/P NSg/J+ I/D NSg/V .
>
#
> When several ambiguous words occur together , the possibilities multiply .
# NSg/I/C J/D J NPl/V V J . D NPl NSg/V .
> However , it is easy to enumerate every combination and to assign a relative
# C . NPrSg/ISg VL NSg/V/J P V D NSg V/C P NSg/V D/P NSg/J
> probability to each one , by multiplying together the probabilities of each
# NSg P D NSg/I/V/J . NSg/J/P V J D NPl P D
> choice in turn . The combination with the highest probability is then chosen . The
# NSg/J NPrSg/J/P NSg/V . D NSg P D W? NSg VL NSg/J/C V/J . D
> European group developed CLAWS , a tagging program that did exactly this and
# NSg/J NSg/V V/J NPl/V . D/P NSg NPrSg/V N/I/C/D V R I/D V/C
> achieved accuracy in the 93 95 % range .
# V/J NSg P D # . # . NSg/V .
> When several ambiguous words occur together , the possibilities multiply .
# NSg/I/C J/D J NPl/V+ V J . D+ NPl NSg/V+ .
> However , it is easy to enumerate every combination and to assign a relative
# C . NPrSg/ISg+ VL NSg/V/J P V D+ NSg+ V/C P NSg/V D/P NSg/J
> probability to each one , by multiplying together the probabilities of each
# NSg P D+ NSg/I/V/J+ . NSg/J/P V J D NPl P D+
> choice in turn . The combination with the highest probability is then chosen . The
# NSg/J+ NPrSg/J/P NSg/V . D NSg P D+ + NSg+ VL NSg/J/C+ V/J . D+
> European group developed CLAWS , a tagging program that did exactly this and
# NSg/J+ NSg/V+ V/J NPl/V+ . D/P NSg/V+ NPrSg/V+ N/I/C/D+ V R I/D+ V/C
> achieved accuracy in the 93 95 % range .
# V/J NSg+ NPrSg/J/P D # . # . NSg/V+ .
>
#
> Eugene Charniak points out in Statistical techniques for natural language
# NPr ? NPl/V NSg/V/J/R/P NPrSg/J/P J NPl C/P NSg/J NSg/V
> parsing ( 1997 ) that merely assigning the most common tag to each known word and
# V . # . N/I/C/D R V D NSg/I/J NSg/V/J NSg/V P D NSg/V/J NSg/V V/C
> the tag " proper noun " to all unknowns will approach 90 % accuracy because many
# D NSg . NSg/J NSg/V . P NSg/I/J/C NPl/V NPrSg/VX NSg/V # . NSg C/P N/I/J/D
> words are unambiguous , and many others only rarely represent their less - common
# NPl/V V J . V/C N/I/J/D NPl/V W? R V D J/C/P . NSg/V/J
# NPr+ ? NPl/V+ NSg/V/J/R/P NPrSg/J/P J NPl C/P NSg/J NSg/V+
> parsing ( 1997 ) that merely assigning the most common tag to each known word and
# V . # . N/I/C/D+ R V D NSg/I/J NSg/V/J NSg/V P D+ NSg/V/J NSg/V V/C
> the tag " proper noun " to all unknowns will approach 90 % accuracy because many
# D NSg/V+ . NSg/J NSg/V . P NSg/I/J/C+ NPl/V+ NPrSg/VX NSg/V # . NSg+ C/P N/I/J/D+
> words are unambiguous , and many others only rarely represent their less - common
# NPl/V+ V J . V/C N/I/J/D+ NPl/V+ W? R V D+ V/J/C/P . NSg/V/J
> parts of speech .
# NPl/V P NSg/V .
# NPl/V P NSg/V+ .
>
#
> CLAWS pioneered the field of HMM - based part of speech tagging but was quite
# NPl/V V/J D NSg P V . V/J NSg/V/J P NSg/V NSg/V NSg/C/P V NSg
> expensive since it enumerated all possibilities . It sometimes had to resort to
# J C/P NPrSg/ISg V/J NSg/I/J/C NPl . NPrSg/ISg R V P NSg/V P
> backup methods when there were simply too many options ( the Brown Corpus
# NSg/J NPl/V NSg/I/C W? NSg/V R W? N/I/J/D NPl/V . D NPrSg/J NSg
> contains a case with 17 ambiguous words in a row , and there are words such as
# V D/P NPrSg P # J NPl/V P D/P NSg . V/C W? V NPl/V NSg/I NSg/R
> " still " that can represent as many as 7 distinct parts of speech .
# . NSg/V/J . N/I/C/D NPrSg/VX V NSg/R N/I/J/D NSg/R # V/J NPl/V P NSg/V .
> CLAWS pioneered the field of HMM - based part of speech tagging but was quite
# NPl/V+ V/J D NSg/V P V . V/J NSg/V/J P NSg/V+ NSg/V NSg/C/P V NSg
> expensive since it enumerated all possibilities . It sometimes had to resort to
# J C/P NPrSg/ISg+ V/J NSg/I/J/C+ NPl+ . NPrSg/ISg+ R V P NSg/V P
> backup methods when there were simply too many options ( the Brown Corpus
# NSg/J NPl/V+ NSg/I/C + NSg/V R W? N/I/J/D+ NPl/V . D+ NPrSg/V/J+ NSg+
> contains a case with 17 ambiguous words in a row , and there are words such as
# V D/P NPrSg/V P # J NPl/V NPrSg/J/P D/P+ NSg/V+ . V/C + V NPl/V+ NSg/I NSg/R
> " still " that can represent as many as 7 distinct parts of speech .
# . NSg/V/J . N/I/C/D+ NPrSg/VX V NSg/R N/I/J/D NSg/R # V/J NPl/V P NSg/V+ .
>
#
> HMMs underlie the functioning of stochastic taggers and are used in various
# ? V D N/J P J NPl V/C V V/J NPrSg/J/P J
# ? V D V P J NPl V/C V V/J NPrSg/J/P J
> algorithms one of the most widely used being the bi - directional inference
# NPl NSg/I/V/J P D NSg/I/J R V/J NSg/V/C D NSg/J . NSg/J NSg
# NPl+ NSg/I/V/J P D NSg/I/J R V/J NSg/V/C D NSg/J . NSg/J NSg+
> algorithm .
# NSg .
# NSg+ .
>
#
> Dynamic programming methods
# NSg/J NSg/V NPl/V
# NSg/J+ NSg/V+ NPl/V
>
#
> In 1987 , Steven DeRose and Kenneth W. Church independently developed dynamic
# P # . NPr ? V/C NPr ? NPrSg/V R V/J NSg/J
> programming algorithms to solve the same problem in vastly less time . Their
# NSg/V NPl P NSg/V D I/J NSg/J NPrSg/J/P R V/J/C/P NSg/V/J . D
> methods were similar to the Viterbi algorithm known for some time in other
# NPl NSg/V NSg/J P D ? NSg NSg/V/J C/P I/J/R NSg/V/J NPrSg/J/P NSg/V/J
> fields . DeRose used a table of pairs , while Church used a table of triples and a
# NPrPl/V . ? V/J D/P NSg P NPl/V . NSg/V/C/P NPrSg/V V/J D/P NSg P NPl/V V/C D/P
> method of estimating the values for triples that were rare or nonexistent in the
# NSg P V D NPl C/P NPl/V N/I/C/D NSg/V NSg/V/J NPrSg/C NSg/J P D
> Brown Corpus ( an actual measurement of triple probabilities would require a much
# NPrSg/J NSg . D/P NSg/J NSg P NSg/V/J NPl NSg/VX NSg/V D/P N/I/J
> In 1987 , Steven DeRose and Kenneth W. Church independently developed dynamic
# NPrSg/J/P # . NPr+ ? V/C NPr+ ? NPrSg/V+ R V/J NSg/J
> programming algorithms to solve the same problem in vastly less time . Their
# NSg/V+ NPl+ P NSg/V D I/J NSg/J NPrSg/J/P R V/J/C/P NSg/V/J+ . D+
> methods were similar to the Viterbi algorithm known for some time in other
# NPl/V+ NSg/V NSg/J P D ? NSg NSg/V/J C/P I/J/R NSg/V/J+ NPrSg/J/P NSg/V/J+
> fields . DeRose used a table of pairs , while Church used a table of triples and a
# NPrPl/V+ . ? V/J D/P NSg/V P NPl/V+ . NSg/V/C/P NPrSg/V+ V/J D/P NSg/V P NPl/V V/C D/P
> method of estimating the values for triples that were rare or nonexistent in the
# NSg/V P V D NPl/V C/P NPl/V N/I/C/D+ NSg/V NSg/V/J NPrSg/C NSg/J NPrSg/J/P D+
> Brown Corpus ( an actual measurement of triple probabilities would require a much
# NPrSg/V/J+ NSg . D/P NSg/J NSg P NSg/V/J NPl+ NSg/VX NSg/V D/P N/I/J
> larger corpus ) . Both methods achieved an accuracy of over 95 % . DeRose's 1990
# J NSg . . I/C NPl/V V/J D/P NSg P NSg/V/J/P # . . ? #
> dissertation at Brown University included analyses of the specific error types ,
# NSg NSg/P NPrSg/V/J NSg V/J NSg/V P D NSg/J NSg/V NPl/V .
> probabilities , and other related data , and replicated his work for Greek , where
# NPl . V/C NSg/V/J J NSg . V/C V/J ISg/D NSg C/P NPrSg/V/J . NSg/C
> it proved similarly effective .
# NPrSg/ISg V/J R NSg/J .
# J NSg+ . . I/C NPl/V+ V/J D/P NSg P NSg/V/J/P # . . ? #
> dissertation at Brown University included analyses of the specific error types ,
# NSg+ NSg/P NPrSg/V/J NSg+ V/J NSg/V P D+ NSg/J+ NSg/V+ NPl/V+ .
> probabilities , and other related data , and replicated his work for Greek , where
# NPl+ . V/C NSg/V/J+ J+ NSg+ . V/C V/J ISg/D+ NSg/V C/P NPrSg/V/J . NSg/C
> it proved similarly effective .
# NPrSg/ISg+ V/J R+ NSg/J .
>
#
> These findings were surprisingly disruptive to the field of natural language
# I/D NSg NSg/V R J P D NSg P NSg/J NSg/V
# I/D+ NSg NSg/V R J P D NSg/V P NSg/J+ NSg/V+
> processing . The accuracy reported was higher than the typical accuracy of very
# V . D NSg V/J V J C/P D NSg/J NSg P J
> sophisticated algorithms that integrated part of speech choice with many higher
# V/J NPl N/I/C/D V/J NSg/V/J P NSg/V NSg/J P N/I/J/D J
> levels of linguistic analysis : syntax , morphology , semantics , and so on . CLAWS ,
# NPl/V P J NSg . NSg . NSg . NSg . V/C NSg/I/J/C J/P . NPl/V .
> DeRose's and Church's methods did fail for some of the known cases where
# ? V/C N$ NPl/V V NSg/V/J C/P I/J/R P D NSg/J NPl/V NSg/C
> semantics is required , but those proved negligibly rare . This convinced many in
# NSg VL V/J . NSg/C/P I/D V/J R NSg/V/J . I/D V/J N/I/J/D P
> the field that part - of - speech tagging could usefully be separated from the other
# D NSg N/I/C/D NSg/V/J . P . NSg/V NSg/V NSg/VX R NSg/VX V/J P D NSg/J
# V+ . D+ NSg+ V/J V J C/P D NSg/J NSg P J
> sophisticated algorithms that integrated part of speech choice with many higher
# V/J NPl+ N/I/C/D+ V/J NSg/V/J P NSg/V+ NSg/J P N/I/J/D J
> levels of linguistic analysis : syntax , morphology , semantics , and so on . CLAWS ,
# NPl/V P J NSg+ . NSg+ . NSg+ . NSg+ . V/C NSg/I/J/C+ J/P . NPl/V .
> DeRose's and Church's methods did fail for some of the known cases where
# ? V/C N$ NPl/V+ V NSg/V/J C/P I/J/R P D+ NSg/V/J+ NPl/V+ NSg/C
> semantics is required , but those proved negligibly rare . This convinced many in
# NSg+ VL V/J . NSg/C/P I/D+ V/J R+ NSg/V/J+ . I/D+ V/J N/I/J/D NPrSg/J/P
> the field that part - of - speech tagging could usefully be separated from the other
# D+ NSg/V+ N/I/C/D+ NSg/V/J . P . NSg/V NSg/V NSg/VX R NSg/VX V/J P D NSg/V/J
> levels of processing ; this , in turn , simplified the theory and practice of
# NPl/V P V . I/D . NPrSg/J/P NSg/V . V/J D NSg V/C NSg/V P
# NPl/V P V . I/D+ . NPrSg/J/P NSg/V . V/J D+ NSg V/C NSg/V P
> computerized language analysis and encouraged researchers to find ways to
# V/J NSg/V NSg V/C V/J W? P NSg/V NPl P
> separate other pieces as well . Markov Models became the standard method for the
# NSg/V/J NSg/V/J NPl/V NSg/R NSg/V/J . NPr NPl/V V D NSg/J NSg/V C/P D
> part - of - speech assignment .
# NSg/J . P . NSg/V NSg .
# V/J NSg/V+ NSg+ V/C V/J + P NSg/V NPl+ P
> separate other pieces as well . Markov Models became the standard method for the
# NSg/V/J NSg/V/J+ NPl/V+ NSg/R+ NSg/V/J . NPr NPl/V+ V D NSg/J NSg/V C/P D
> part - of - speech assignment .
# NSg/V/J . P . NSg/V+ NSg+ .
>
#
> Unsupervised taggers
# V/J NPl
# V/J+ NPl
>
#
> The methods already discussed involve working from a pre - existing corpus to
# D NPl W? V/J V V P D/P NSg/P . V NSg P
> learn tag probabilities . It is , however , also possible to bootstrap using
# NSg/V NSg/V NPl . NPrSg/ISg VL . C . W? NSg/J P NSg/V V
> The methods already discussed involve working from a pre - existing corpus to
# D+ NPl/V W? V/J V V P D/P NSg/V/P+ . V NSg P
> learn tag probabilities . It is , however , also possible to bootstrap using
# NSg/V NSg/V+ NPl+ . NPrSg/ISg+ VL . C . W? NSg/J P NSg/V V
> " unsupervised " tagging . Unsupervised tagging techniques use an untagged corpus
# . V/J . NSg/V . V/J NSg/V NPl NSg/V D/P ? NSg
> for their training data and produce the tagset by induction . That is , they
# C/P D NSg NSg V/C NSg/V D NSg NSg/J/P NSg . N/I/C/D VL . IPl
> observe patterns in word use , and derive part - of - speech categories themselves .
# NSg/V NPl/V NPrSg/J/P NSg/V NSg/V . V/C NSg/V NSg/V/J . P . NSg/V NPl I .
> For example , statistics readily reveal that " the " , " a " , and " an " occur in
# C/P NSg/V . NPl/V R NSg/V N/I/C/D . D . . . D/P . . V/C . D/P . V NPrSg/J/P
> similar contexts , while " eat " occurs in very different ones . With sufficient
# NSg/J NPl/V . NSg/V/C/P . NSg/V . V NPrSg/J/P J NSg/J NPl/V . P J
> iteration , similarity classes of words emerge that are remarkably similar to
# NSg . NSg NPl/V P NPl/V NSg/V N/I/C/D V R NSg/J P
# . V/J . NSg/V . V/J NSg/V NPl+ NSg/V D/P ? NSg
> for their training data and produce the tagset by induction . That is , they
# C/P D+ NSg/V+ NSg+ V/C NSg/V D NSg NSg/J/P+ NSg . N/I/C/D+ VL . IPl+
> observe patterns in word use , and derive part - of - speech categories themselves .
# NSg/V NPl/V+ NPrSg/J/P NSg/V+ NSg/V . V/C NSg/V NSg/V/J . P . NSg/V NPl+ I+ .
> For example , statistics readily reveal that " the " , " a " , and " an " occur in
# C/P NSg/V+ . NPl/V+ R NSg/V N/I/C/D+ . D . . . D/P . . V/C . D/P . V NPrSg/J/P
> similar contexts , while " eat " occurs in very different ones . With sufficient
# NSg/J+ NPl/V+ . NSg/V/C/P . NSg/V . V NPrSg/J/P J NSg/J+ NPl/V+ . P J+
> iteration , similarity classes of words emerge that are remarkably similar to
# NSg . NSg NPl/V P NPl/V+ NSg/V N/I/C/D+ V R NSg/J P
> those human linguists would expect ; and the differences themselves sometimes
# I/D NSg/V/J NPl NSg/VX V . V/C D NSg I R
> suggest valuable new insights .
# V NSg/J NSg/V/J NPl .
# I/D+ NSg/V/J NPl+ NSg/VX V . V/C D+ NSg/V+ I+ R
> suggest valuable new insights .
# V NSg/J+ NSg/V/J+ NPl+ .
>
#
> These two categories can be further subdivided into rule - based , stochastic , and
# I/D NSg NPl NPrSg/VX NSg/VX V/J V/J P NSg/V . V/J . J . V/C
> These two categories can be further subdivided into rule - based , stochastic , and
# I/D NSg+ NPl NPrSg/VX NSg/VX V/J V/J P NSg/V . V/J . J . V/C
> neural approaches .
# J NPl/V .
# J+ NPl/V+ .
>
#
> Other taggers and methods
# NSg/V/J NPl V/C NPl/V
> Other taggers and methods
# NSg/V/J+ NPl V/C NPl/V
>
#
> Some current major algorithms for part - of - speech tagging include the Viterbi
# I/J/R NSg/J NPrSg/V/J NPl C/P NSg/V/J . P . NSg/V NSg/V NSg/V D ?
> Some current major algorithms for part - of - speech tagging include the Viterbi
# I/J/R+ NSg/J NPrSg/V/J NPl C/P NSg/V/J . P . NSg/V NSg/V NSg/V D ?
> algorithm , Brill tagger , Constraint Grammar , and the Baum - Welch algorithm ( also
# NSg . NSg/J NSg . NSg NSg/V . V/C D NPr . ? NSg . W?
> known as the forward - backward algorithm ) . Hidden Markov model and visible Markov
# NSg/V/J NSg/R D NSg/J . NSg/J NSg . . V/J NPr NSg/V/J V/C J NPr
> model taggers can both be implemented using the Viterbi algorithm . The
# NSg/V/J NPl NPrSg/VX I/C NSg/VX V/J V D ? NSg . D
> rule - based Brill tagger is unusual in that it learns a set of rule patterns , and
# NSg . V/J NSg/J NSg VL NSg/J P N/I/C/D NPrSg/ISg NPl/V D/P NPrSg/J P NSg/V NPl/V . V/C
> then applies those patterns rather than optimizing a statistical quantity .
# NSg/J/C V I/D NPl/V NPrSg/V/J C/P V D/P J NSg .
# NSg . NSg/J NSg . NSg+ NSg/V+ . V/C D NPr . ? NSg . W?
> known as the forward - backward algorithm ) . Hidden Markov model and visible Markov
# NSg/V/J NSg/R D NSg/V/J . NSg/J NSg+ . . V/J NPr NSg/V/J+ V/C J NPr
> model taggers can both be implemented using the Viterbi algorithm . The
# NSg/V/J+ NPl NPrSg/VX I/C NSg/VX V/J V D+ ? NSg . D
> rule - based Brill tagger is unusual in that it learns a set of rule patterns , and
# NSg/V+ . V/J NSg/J NSg VL NSg/J NPrSg/J/P N/I/C/D NPrSg/ISg+ NPl/V D/P NPrSg/V/J P NSg/V+ NPl/V+ . V/C
> then applies those patterns rather than optimizing a statistical quantity .
# NSg/J/C V I/D+ NPl/V+ NPrSg/V/J C/P V D/P+ J+ NSg+ .
>
#
> Many machine learning methods have also been applied to the problem of POS
# N/I/J/D NSg/V V NPl/V NSg/VX W? NSg/V V/J P D NSg/J P NSg
> Many machine learning methods have also been applied to the problem of POS
# N/I/J/D+ NSg/V V+ NPl/V+ NSg/VX W? NSg/V V/J P D NSg/J P NSg+
> tagging . Methods such as SVM , maximum entropy classifier , perceptron , and
# NSg/V . NPl/V NSg/I NSg/R ? . NSg/J NSg NSg . N . V/C
# NSg/V+ . NPl/V+ NSg/I NSg/R ? . NSg/J NSg NSg . N . V/C
> nearest - neighbor have all been tried , and most can achieve accuracy above
# W? . NSg/V/J NSg/VX NSg/I/J/C NSg/V V/J . V/C NSg/I/J NPrSg/VX V NSg NSg/J/P
# W? . NSg/V/J NSg/VX NSg/I/J/C NSg/V V/J . V/C NSg/I/J NPrSg/VX V NSg+ NSg/J/P
> 95 % . [ citation needed ]
# # . . . NSg V/J .
# # . . . NSg+ V/J+ .
>
#
> A direct comparison of several methods is reported ( with references ) at the ACL
# D/P J NSg P J/D NPl/V VL V/J . P NPl/V . P D NSg
> Wiki . This comparison uses the Penn tag set on some of the Penn Treebank data ,
# NSg/V . I/D NSg NPl/V D NPr NSg/V NPrSg/V/J J/P I/J/R P D NPr ? NSg .
> A direct comparison of several methods is reported ( with references ) at the ACL
# D/P V/J NSg P J/D+ NPl/V+ VL V/J . P NPl/V+ . NSg/P D+ NSg+
> Wiki . This comparison uses the Penn tag set on some of the Penn Treebank data ,
# NSg/V+ . I/D+ NSg+ NPl/V D+ NPr+ NSg/V+ NPrSg/V/J J/P I/J/R P D+ NPr+ ? NSg+ .
> so the results are directly comparable . However , many significant taggers are
# NSg/I/J/C D NPl V R/C NSg/J . C . N/I/J/D NSg/J NPl V
> not included ( perhaps because of the labor involved in reconfiguring them for
# NSg/C V/J . NSg C/P P D NPrSg/Am/Au V/J NPrSg/J/P V N/I C/P
> this particular dataset ) . Thus , it should not be assumed that the results
# I/D NSg/J NSg . . NSg . NPrSg/ISg VX NSg/C NSg/VX V/J N/I/C/D D NPl
> reported here are the best that can be achieved with a given approach ; nor even
# V/J NSg/J/R V D NPrSg/J N/I/C/D NPrSg/VX NSg/VX V/J P D/P NSg/J/P NSg/V . NSg/C NSg/V/J
> the best that have been achieved with a given approach .
# D NPrSg/J N/I/C/D NSg/VX NSg/V V/J P D/P NSg/J/P NSg/V .
# NSg/I/J/C D+ NPl/V+ V R/C NSg/J+ . C . N/I/J/D NSg/J NPl V
> not included ( perhaps because of the labor involved in reconfiguring them for
# NSg/C V/J . NSg C/P P D+ NPrSg/V/Am/Au+ V/J NPrSg/J/P V N/I+ C/P
> this particular dataset ) . Thus , it should not be assumed that the results
# I/D+ NSg/J+ NSg . . NSg . NPrSg/ISg+ VX NSg/C NSg/VX V/J N/I/C/D D+ NPl/V+
> reported here are the best that can be achieved with a given approach ; nor even
# V/J NSg/J/R V D NPrSg/VX/J N/I/C/D+ NPrSg/VX NSg/VX V/J P D/P+ NSg/V/J/P+ NSg/V+ . NSg/C NSg/V/J
> the best that have been achieved with a given approach .
# D+ NPrSg/VX/J+ N/I/C/D+ NSg/VX NSg/V V/J P D/P+ NSg/V/J/P+ NSg/V+ .
>
#
> In 2014 , a paper reporting using the structure regularization method for
# P # . D/P NSg/J V V D NSg NSg NSg/V C/P
> part - of - speech tagging , achieving 97.36 % on a standard benchmark dataset .
# NSg/V/J . P . NSg/V NSg/V . V # . P D/P NSg/J NSg/V NSg .
> In 2014 , a paper reporting using the structure regularization method for
# NPrSg/J/P # . D/P+ NSg/V/J+ V V D+ NSg/V+ NSg NSg/V C/P
> part - of - speech tagging , achieving 97.36 % on a standard benchmark dataset .
# NSg/V/J . P . NSg/V NSg/V . V # . J/P D/P NSg/J+ NSg/V+ NSg .

View file

@ -2,25 +2,25 @@
# NSg/V
>
#
> This document contains example sentences with misspelled words that we want to test the spell checker on .
# I/D NSg/V V NSg/V NPl/V P V/J NPl/V N/I/C/D IPl NSg/V P NSg/V D NSg NSg/V J/P .
> This document contains example sentences with misspelled words that we want to test the spell checker on .
# I/D+ NSg/V V NSg/V+ NPl/V P V/J+ NPl/V+ N/I/C/D+ IPl+ NSg/V P NSg/V D NSg/V NSg/V J/P .
>
#
> Example Sentences
# NSg/V NPl/V
# NSg/V+ NPl/V
>
#
> My favourite color is blu .
# D NSg/J/Ca/Au/Br NSg/V/J/Am VL W? .
> I must defend my honour !
# ISg NSg/V NSg/V D NSg/Ca/Au/Br .
> I recognize that you recognise me .
# ISg V N/I/C/D IPl V/Au/Br NPrSg/ISg .
> I analyze how you infantilize me .
# ISg V NSg/C IPl V NPrSg/ISg .
> I analyse how you infantilise me .
# ISg V/Au/Br NSg/C IPl ? NPrSg/ISg .
> Careful , traveller !
# J . NSg/Ca/Au/Br .
> At the centre of the theatre I dropped a litre of coke .
# P D NSg/Ca/Au/Br P D NSg/Ca/Au/Br ISg V/J D/P NSg/Ca/Au/Br P NPrSg/V .
> My favourite color is blu .
# D+ NSg/V/J/Ca/Au/Br NSg/V/J/Am VL+ W? .
> I must defend my honour !
# ISg+ NSg/V NSg/V D+ NSg/V/Ca/Au/Br+ .
> I recognize that you recognise me .
# ISg+ V N/I/C/D IPl+ V/Au/Br NPrSg/ISg+ .
> I analyze how you infantilize me .
# ISg+ V NSg/C IPl+ V NPrSg/ISg+ .
> I analyse how you infantilise me .
# ISg+ V/Au/Br NSg/C IPl+ ? NPrSg/ISg+ .
> Careful , traveller !
# J . NSg/Ca/Au/Br+ .
> At the centre of the theatre I dropped a litre of coke .
# NSg/P D NSg/V/Ca/Au/Br P D+ NSg/Ca/Au/Br+ ISg+ V/J D/P NSg/Ca/Au/Br P NPrSg/V+ .

File diff suppressed because it is too large Load diff

View file

@ -1,74 +1,74 @@
> " This " and " that " are common and fulfill multiple purposes in everyday English .
# . I/D . V/C . N/I/C/D . V NSg/V/J V/C V NSg/J NPl/V NPrSg/J/P NSg/J NPrSg/V/J .
> As such , disambiguating them is necessary .
# NSg/R NSg/I . V N/I VL NSg/J .
> " This " and " that " are common and fulfill multiple purposes in everyday English .
# . I/D+ . V/C . N/I/C/D+ . V NSg/V/J V/C V NSg/J NPl/V NPrSg/J/P NSg/J+ NPrSg/V/J+ .
> As such , disambiguating them is necessary .
# NSg/R NSg/I . V N/I+ VL+ NSg/J .
>
#
> This document contains various sentences that use " this " , " that " , " these " , and
# I/D NSg/V V J NPl/V N/I/C/D NSg/V . I/D . . . N/I/C/D . . . I/D . . V/C
> " those " in different contexts with a lot of edge cases .
# . I/D . NPrSg/J/P NSg/J NPl/V P D/P NPrSg P NSg/V NPl/V .
> This document contains various sentences that use " this " , " that " , " these " , and
# I/D+ NSg/V V J NPl/V+ N/I/C/D+ NSg/V . I/D+ . . . N/I/C/D+ . . . I/D+ . . V/C
> " those " in different contexts with a lot of edge cases .
# . I/D . NPrSg/J/P NSg/J NPl/V P D/P NPrSg/V P NSg/V+ NPl/V+ .
>
#
> Examples
# NPl/V
# NPl/V+
>
#
> This triangle is nice .
# I/D NSg VL NPrSg/V/J .
> This is nice .
# I/D VL NPrSg/V/J .
> That triangle is nice .
# N/I/C/D NSg VL NPrSg/V/J .
> That is nice .
# N/I/C/D VL NPrSg/V/J .
> These triangles are nice .
# I/D NPl V NPrSg/V/J .
> These are nice .
# I/D V NPrSg/V/J .
> Those triangles are nice .
# I/D NPl V NPrSg/V/J .
> This triangle is nice .
# I/D+ NSg+ VL+ NPrSg/V/J+ .
> This is nice .
# I/D+ VL+ NPrSg/V/J+ .
> That triangle is nice .
# N/I/C/D+ NSg+ VL+ NPrSg/V/J+ .
> That is nice .
# N/I/C/D+ VL+ NPrSg/V/J+ .
> These triangles are nice .
# I/D+ NPl+ V+ NPrSg/V/J+ .
> These are nice .
# I/D+ V+ NPrSg/V/J+ .
> Those triangles are nice .
# I/D+ NPl+ V+ NPrSg/V/J+ .
> Those are nice .
# I/D V NPrSg/V/J .
# I/D+ V+ NPrSg/V/J .
>
#
> This massage is nice .
# I/D NSg/V VL NPrSg/V/J .
> That massage is nice .
# N/I/C/D NSg/V VL NPrSg/V/J .
> These massages are nice .
# I/D NPl/V V NPrSg/V/J .
> Those massages are nice .
# I/D NPl/V V NPrSg/V/J .
> This massages well .
# I/D NPl/V NSg/V/J .
> That massages well .
# N/I/C/D NPl/V NSg/V/J .
> These massage well .
# I/D NSg/V NSg/V/J .
> Those massage well .
# I/D NSg/V NSg/V/J .
> This massage is nice .
# I/D+ NSg/V+ VL+ NPrSg/V/J+ .
> That massage is nice .
# N/I/C/D NSg/V+ VL+ NPrSg/V/J+ .
> These massages are nice .
# I/D+ NPl/V+ V+ NPrSg/V/J+ .
> Those massages are nice .
# I/D+ NPl/V+ V+ NPrSg/V/J+ .
> This massages well .
# I/D+ NPl/V+ NSg/V/J+ .
> That massages well .
# N/I/C/D+ NPl/V+ NSg/V/J+ .
> These massage well .
# I/D+ NSg/V+ NSg/V/J+ .
> Those massage well .
# I/D+ NSg/V+ NSg/V/J+ .
>
#
> That could be a solution .
# N/I/C/D NSg/VX NSg/VX D/P NSg .
> Find all candidates that could be a solution .
# NSg/V NSg/I/J/C NPl/V N/I/C/D NSg/VX NSg/VX D/P NSg .
> That could be a solution .
# N/I/C/D+ NSg/VX NSg/VX D/P NSg .
> Find all candidates that could be a solution .
# NSg/V NSg/I/J/C+ NPl/V+ N/I/C/D+ NSg/VX NSg/VX D/P NSg+ .
>
#
> This is all that I have .
# I/D VL NSg/I/J/C N/I/C/D ISg NSg/VX .
> This is all that solutions can do .
# I/D VL NSg/I/J/C N/I/C/D NPl NPrSg/VX NSg/VX .
> That solution can do .
# N/I/C/D NSg NPrSg/VX NSg/VX .
> This is all that I have .
# I/D+ VL NSg/I/J/C N/I/C/D ISg+ NSg/VX+ .
> This is all that solutions can do .
# I/D+ VL NSg/I/J/C N/I/C/D NPl+ NPrSg/VX+ NSg/VX .
> That solution can do .
# N/I/C/D NSg+ NPrSg/VX+ NSg/VX .
>
#
> We can do this !
# IPl NPrSg/VX NSg/VX I/D .
> I can do this and that .
# ISg NPrSg/VX NSg/VX I/D V/C N/I/C/D .
> We can do this !
# IPl+ NPrSg/VX NSg/VX I/D+ .
> I can do this and that .
# ISg+ NPrSg/VX NSg/VX I/D V/C N/I/C/D+ .
>
#
> We unite to stand united in unity .
# IPl NSg/V P NSg/V V/J NPrSg/J/P NSg .
> We unite to stand united in unity .
# IPl+ NSg/V P NSg/V V/J NPrSg/J/P NSg+ .

View file

@ -0,0 +1,19 @@
[package]
name = "harper-pos-utils"
version = "0.42.0"
edition = "2024"

# Sorted alphabetically, per Cargo convention.
[dependencies]
hashbrown = { version = "0.15.3", features = ["serde"] }
is-macro = "0.3.7"
rand = { version = "0.9.1", optional = true }
rayon = { version = "1.10.0", optional = true }
rs-conllu = "0.3.0"
serde = { version = "1.0.219", features = ["derive"] }
strum = "0.27.1"
strum_macros = "0.27.1"

[features]
default = []
# Parallelize candidate-patch scoring during training via rayon.
threaded = ["dep:rayon"]
# Enable the training-only APIs (random candidate sampling via rand).
training = ["dep:rand"]

View file

@ -0,0 +1,270 @@
mod patch;
#[cfg(feature = "training")]
use std::path::Path;
#[cfg(feature = "training")]
use crate::word_counter::WordCounter;
use crate::{
UPOS,
chunker::{Chunker, upos_freq_dict::UPOSFreqDict},
};
use patch::Patch;
use serde::{Deserialize, Serialize};
/// A transformation-based ("Brill-style") noun-phrase chunker: a
/// [`UPOSFreqDict`] baseline whose output is refined by an ordered list of
/// learned correction patches.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BrillChunker {
    /// Baseline chunker consulted first for every token.
    base: UPOSFreqDict,
    /// Correction patches, applied in learned order after the baseline pass.
    patches: Vec<Patch>,
}
impl BrillChunker {
    /// Construct a chunker that starts from the given frequency-dict baseline
    /// and carries no correction patches yet.
    pub fn new(base: UPOSFreqDict) -> Self {
        Self {
            base,
            patches: Vec::new(),
        }
    }

    /// Run every learned patch over `np_states`, in the order the patches were
    /// learned. A patch flips a token's flag when the flag currently equals the
    /// patch's `from` value and the patch's criteria match at that position.
    /// Mutations made by earlier positions are visible to later ones.
    fn apply_patches(&self, sentence: &[String], tags: &[Option<UPOS>], np_states: &mut [bool]) {
        for patch in &self.patches {
            for idx in 0..sentence.len() {
                let current = np_states[idx];
                if current == patch.from && patch.criteria.fulfils(sentence, tags, np_states, idx) {
                    np_states[idx] = !current;
                }
            }
        }
    }
}
impl Chunker for BrillChunker {
    /// Produce a baseline guess from UPOS frequencies, then refine it with the
    /// learned correction patches.
    fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
        let mut flags = self.base.chunk_sentence(sentence, tags);
        self.apply_patches(sentence, tags, &mut flags);
        flags
    }
}
/// Per-sentence training example: (tokens, per-token UPOS tags,
/// per-token "is part of a noun phrase" flags).
#[cfg(feature = "training")]
type CandidateArgs = (Vec<String>, Vec<Option<UPOS>>, Vec<bool>);
#[cfg(feature = "training")]
impl BrillChunker {
    /// Replay only this chunker's patches on top of precomputed `base_flags`
    /// and count how many positions disagree with `correct_np_flags`.
    /// Used during candidate scoring so the baseline pass isn't recomputed.
    pub fn count_patch_errors(
        &self,
        sentence: &[String],
        tags: &[Option<UPOS>],
        base_flags: &[bool],
        correct_np_flags: &[bool],
    ) -> usize {
        let mut flags = base_flags.to_vec();
        self.apply_patches(sentence, tags, &mut flags);
        let mut loss = 0;
        for (a, b) in flags.into_iter().zip(correct_np_flags) {
            if a != *b {
                loss += 1;
            }
        }
        loss
    }
    /// Chunk `sentence` from scratch and count positions that disagree with
    /// `correct_np_flags`. Every word at a mismatched position is also recorded
    /// in `relevant_words`, so candidate patches can target frequent offenders.
    pub fn count_chunk_errors(
        &self,
        sentence: &[String],
        tags: &[Option<UPOS>],
        correct_np_flags: &[bool],
        relevant_words: &mut WordCounter,
    ) -> usize {
        let flags = self.chunk_sentence(sentence, tags);
        let mut loss = 0;
        for ((a, b), word) in flags.into_iter().zip(correct_np_flags).zip(sentence) {
            if a != *b {
                loss += 1;
                relevant_words.inc(word);
            }
        }
        loss
    }
    /// Run one training epoch: measure the current error rate, generate
    /// candidate patches from the most-mistagged words, score a random subset
    /// of them, and append the single best-scoring patch to `self.patches`.
    ///
    /// To speed up training, only try a subset of all possible candidates.
    /// How many to select is given by the `candidate_selection_chance`. A higher chance means a
    /// longer training time.
    fn epoch(&mut self, training_files: &[impl AsRef<Path>], candidate_selection_chance: f32) {
        use crate::conllu_utils::iter_sentences_in_conllu;
        use rs_conllu::Sentence;
        use std::time::Instant;
        // The chance is a probability; anything outside [0, 1] is a caller bug.
        assert!((0.0..=1.0).contains(&candidate_selection_chance));
        let mut total_tokens = 0;
        let mut error_counter = 0;
        let sentences: Vec<Sentence> = training_files
            .iter()
            .flat_map(iter_sentences_in_conllu)
            .collect();
        let mut sentences_flagged: Vec<CandidateArgs> = Vec::new();
        for sent in &sentences {
            use hashbrown::HashSet;
            use crate::chunker::np_extraction::locate_noun_phrases_in_sent;
            let mut toks: Vec<String> = Vec::new();
            let mut tags = Vec::new();
            for token in &sent.tokens {
                let form = token.form.clone();
                // CoNLL-U splits clitic contractions ("isn't" -> "is" + "n't");
                // fold them back onto the preceding token so the chunker sees
                // the same word forms it will receive at inference time.
                if let Some(last) = toks.last_mut() {
                    match form.as_str() {
                        "sn't" | "n't" | "'ll" | "'ve" | "'re" | "'d" | "'m" | "'s" => {
                            last.push_str(&form);
                            continue;
                        }
                        _ => {}
                    }
                }
                toks.push(form);
                tags.push(token.upos.and_then(UPOS::from_conllu));
            }
            // Gold noun-phrase membership, derived from dependency structure.
            let actual = locate_noun_phrases_in_sent(sent);
            let actual_flat = actual.into_iter().fold(HashSet::new(), |mut a, b| {
                a.extend(b.into_iter());
                a
            });
            // Convert the index set into a dense per-token boolean sequence.
            let mut actual_seq = Vec::new();
            for el in actual_flat {
                if el >= actual_seq.len() {
                    actual_seq.resize(el + 1, false);
                }
                actual_seq[el] = true;
            }
            // NOTE(review): when clitics were merged above, `toks`/`tags` are
            // shorter than `sent.tokens`, while `actual_seq` is indexed by
            // original token position — confirm the sequences stay aligned.
            sentences_flagged.push((toks, tags, actual_seq));
        }
        let mut relevant_words = WordCounter::default();
        for (tok_buf, tag_buf, flag_buf) in &sentences_flagged {
            total_tokens += tok_buf.len();
            error_counter += self.count_chunk_errors(
                tok_buf.as_slice(),
                tag_buf,
                flag_buf.as_slice(),
                &mut relevant_words,
            );
        }
        println!("=============");
        println!("Total tokens in training set: {}", total_tokens);
        println!("Tokens incorrectly flagged: {}", error_counter);
        println!(
            "Error rate: {}%",
            error_counter as f32 / total_tokens as f32 * 100.
        );
        // Before adding any patches, let's get a good base.
        // Caching the current chunker's output per sentence lets each candidate
        // be scored by replaying only its own (single) patch on top.
        let mut base_flags = Vec::new();
        for (toks, tags, _) in &sentences_flagged {
            base_flags.push(self.chunk_sentence(toks, tags));
        }
        let all_candidates = Patch::generate_candidate_patches(&relevant_words);
        // Randomly sample a fraction of the candidates to keep scoring cheap.
        let mut pruned_candidates: Vec<Patch> = rand::seq::IndexedRandom::choose_multiple(
            all_candidates.as_slice(),
            &mut rand::rng(),
            (all_candidates.len() as f32 * candidate_selection_chance) as usize,
        )
        .cloned()
        .collect();
        let start = Instant::now();
        // Sort candidates by error count, ascending: the best patch ends up first.
        // Parallel and serial paths must use the same key for identical results.
        #[cfg(feature = "threaded")]
        rayon::slice::ParallelSliceMut::par_sort_by_cached_key(
            pruned_candidates.as_mut_slice(),
            |candidate: &Patch| {
                self.score_candidate(candidate.clone(), &sentences_flagged, &base_flags)
            },
        );
        #[cfg(not(feature = "threaded"))]
        pruned_candidates.sort_by_cached_key(|candidate| {
            self.score_candidate(candidate.clone(), &sentences_flagged, &base_flags)
        });
        let duration = start.elapsed();
        let seconds = duration.as_secs();
        let millis = duration.subsec_millis();
        println!(
            "It took {} seconds and {} milliseconds to search through {} candidates at {} c/sec.",
            seconds,
            millis,
            pruned_candidates.len(),
            pruned_candidates.len() as f32 / seconds as f32
        );
        // Keep only the single best candidate per epoch (if any were sampled).
        if let Some(best) = pruned_candidates.first() {
            self.patches.push(best.clone());
        }
    }
    /// Score `candidate` as if it were the next learned patch: replay it on top
    /// of the cached `base_flags` for every sentence and total the
    /// disagreements with the gold flags. Lower is better.
    fn score_candidate(
        &self,
        candidate: Patch,
        sentences_flagged: &[CandidateArgs],
        base_flags: &[Vec<bool>],
    ) -> usize {
        // A throwaway chunker holding only this candidate; its baseline is
        // irrelevant because scoring starts from the precomputed `base_flags`.
        let mut tagger = BrillChunker::new(UPOSFreqDict::default());
        tagger.patches.push(candidate);
        let mut errors = 0;
        for ((toks, tags, flags), base) in sentences_flagged.iter().zip(base_flags.iter()) {
            errors += tagger.count_patch_errors(toks.as_slice(), tags.as_slice(), base, flags);
        }
        errors
    }
    /// Train a brand-new tagger on a `.conllu` dataset, provided via a path.
    /// This does not do _any_ error handling, and should not run in production.
    /// It should be used for training a model that _will_ be used in production.
    ///
    /// Each epoch appends at most one patch, so `epochs` bounds the number of
    /// learned patches.
    pub fn train(
        training_files: &[impl AsRef<Path>],
        epochs: usize,
        candidate_selection_chance: f32,
    ) -> Self {
        let mut freq_dict = UPOSFreqDict::default();
        for file in training_files {
            freq_dict.inc_from_conllu_file(file);
        }
        let mut chunker = Self::new(freq_dict);
        for _ in 0..epochs {
            chunker.epoch(training_files, candidate_selection_chance);
        }
        chunker
    }
}

View file

@ -0,0 +1,121 @@
use serde::{Deserialize, Serialize};
use crate::patch_criteria::PatchCriteria;
#[cfg(feature = "training")]
use crate::word_counter::WordCounter;
/// One learned Brill transformation: when a token's current noun-phrase flag
/// equals `from` and `criteria` matches at that position, the flag is flipped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Patch {
    /// The flag value this patch rewrites (the new value is its negation).
    pub from: bool,
    /// The contextual condition that must hold for the patch to fire.
    pub criteria: PatchCriteria,
}
#[cfg(feature = "training")]
impl Patch {
    /// Enumerate the candidate patch space for one training epoch:
    /// single-condition patches built from tag/position atoms, word-identity
    /// atoms drawn from the most frequently mistagged words, plus every
    /// (word atom, other atom) two-condition conjunction.
    pub fn generate_candidate_patches(relevant_words: &WordCounter) -> Vec<Self> {
        use crate::UPOS;
        use strum::IntoEnumIterator;
        // Only the most commonly mistagged words get word-specific patches.
        const TOP_N_WORDS: usize = 50;
        // Context window of relative token offsets a condition may inspect.
        const REL_POS: [isize; 7] = [-3, -2, -1, 0, 1, 2, 3];
        let mut atoms: Vec<(bool, PatchCriteria)> = Vec::new();
        for from in [false, true] {
            // "The token at offset `rel` carries tag `tag`."
            for rel in REL_POS {
                for tag in UPOS::iter() {
                    atoms.push((
                        from,
                        PatchCriteria::WordIsTaggedWith {
                            relative: rel,
                            is_tagged: tag,
                        },
                    ));
                }
            }
            // "Some token within `max_rel` positions carries tag `tag`."
            for max_rel in 1..=5 {
                for tag in UPOS::iter() {
                    atoms.push((
                        from,
                        PatchCriteria::AnyWordIsTaggedWith {
                            max_relative: max_rel,
                            is_tagged: tag,
                        },
                    ));
                }
            }
            // "The immediate neighbors are tagged `prev` and `post`."
            for prev in UPOS::iter() {
                for post in UPOS::iter() {
                    atoms.push((
                        from,
                        PatchCriteria::SandwichTaggedWith {
                            prev_word_tagged: prev,
                            post_word_tagged: post,
                        },
                    ));
                }
            }
            // "The token at offset `rel` is (or is not) currently flagged NP."
            for rel in REL_POS {
                for is_np in [false, true] {
                    atoms.push((
                        from,
                        PatchCriteria::NounPhraseAt {
                            is_np,
                            relative: rel,
                        },
                    ));
                }
            }
        }
        // Everything appended after this index is a word-identity atom.
        let tag_atom_count = atoms.len();
        let mut word_atoms: Vec<(bool, PatchCriteria)> = Vec::new();
        for from in [false, true] {
            for rel in REL_POS {
                for w in relevant_words.iter_top_n_words(TOP_N_WORDS) {
                    word_atoms.push((
                        from,
                        PatchCriteria::WordIs {
                            relative: rel,
                            word: w.clone(),
                        },
                    ));
                }
            }
        }
        atoms.extend(word_atoms);
        let total_atoms = atoms.len();
        let word_start = tag_atom_count;
        let word_atoms_ct = total_atoms - word_start;
        // Each word atom pairs with every other atom: n_word * (n_total - 1).
        let combos_ct = word_atoms_ct * total_atoms - word_atoms_ct;
        let mut patches = Vec::with_capacity(total_atoms + combos_ct);
        // Every atom stands alone as a single-condition patch...
        for (from, crit) in &atoms {
            patches.push(Self {
                from: *from,
                criteria: crit.clone(),
            });
        }
        // ...and every word atom also pairs with each other atom as a
        // two-condition conjunction; the pair's `from` comes from the word atom.
        for i in word_start..total_atoms {
            let (from_i, ref crit_i) = atoms[i];
            for (j, (_from_j, crit_j)) in atoms.iter().enumerate() {
                if i == j {
                    continue;
                }
                patches.push(Self {
                    from: from_i,
                    criteria: PatchCriteria::Combined {
                        a: Box::new(crit_i.clone()),
                        b: Box::new(crit_j.clone()),
                    },
                });
            }
        }
        patches
    }
}

View file

@ -0,0 +1,17 @@
use crate::UPOS;
mod brill_chunker;
#[cfg(feature = "training")]
mod np_extraction;
mod upos_freq_dict;
pub use brill_chunker::BrillChunker;
pub use upos_freq_dict::UPOSFreqDict;
/// An implementer of this trait is capable of identifying the noun phrases in a provided sentence.
pub trait Chunker {
    /// Iterate over the sentence, identifying the noun phrases contained within.
    /// A token marked `true` is a component of a noun phrase.
    /// A token marked `false` is not.
    ///
    /// `tags` is expected to be parallel to `sentence`: one optional UPOS per
    /// token. The returned vector mirrors that per-token layout.
    fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool>;
}

View file

@ -0,0 +1,106 @@
use std::collections::VecDeque;
use hashbrown::HashSet;
use rs_conllu::{Sentence, Token, TokenID, UPOS};
/// Extract the noun phrases of `sent` as sets of token indices, using the
/// treebank's dependency annotations: grow a phrase from each nominal head,
/// keep only contiguous spans, then pick a maximal non-overlapping subset.
pub fn locate_noun_phrases_in_sent(sent: &Sentence) -> Vec<HashSet<usize>> {
    let mut found_noun_phrases = Vec::new();
    for (i, token) in sent.tokens.iter().enumerate() {
        if token.upos.is_some_and(is_root_upos) {
            let noun_phrase = locate_noun_phrase_with_head_at(i, sent);
            found_noun_phrases.push(noun_phrase);
        }
    }
    // Dependency subtrees can be discontinuous; those aren't usable as chunks.
    found_noun_phrases.retain(is_contiguous);
    reduce_to_maximal_nonoverlapping(found_noun_phrases)
}
/// A set of token indices is contiguous iff its size equals the width of its
/// `[min, max]` range. The empty set is not considered contiguous.
fn is_contiguous(indices: &HashSet<usize>) -> bool {
    if indices.is_empty() {
        return false;
    }
    let (lo, hi) = indices
        .iter()
        .fold((usize::MAX, 0), |(lo, hi), &i| (lo.min(i), hi.max(i)));
    hi - lo + 1 == indices.len()
}
/// Greedily select a non-overlapping subset of phrases, giving the longest
/// phrases first pick (ties keep their original relative order — stable sort).
fn reduce_to_maximal_nonoverlapping(mut phrases: Vec<HashSet<usize>>) -> Vec<HashSet<usize>> {
    phrases.sort_by(|a, b| b.len().cmp(&a.len()));
    let mut kept: Vec<HashSet<usize>> = Vec::new();
    let mut claimed: HashSet<usize> = HashSet::new();
    for phrase in phrases {
        if phrase.is_disjoint(&claimed) {
            claimed.extend(&phrase);
            kept.push(phrase);
        }
    }
    kept
}
/// Collect the token indices of the noun phrase headed at `head_index` by a
/// breadth-first walk of the dependency tree, descending only through tokens
/// that can belong to a noun phrase (constituent deprels or nominal heads).
fn locate_noun_phrase_with_head_at(head_index: usize, sent: &Sentence) -> HashSet<usize> {
    let mut children = HashSet::new();
    let mut queue = VecDeque::new();
    queue.push_back(head_index);
    while let Some(c_i) = queue.pop_front() {
        // Skip already-visited nodes; also guards against cycles in malformed data.
        if children.contains(&c_i) {
            continue;
        }
        let tok = &sent.tokens[c_i];
        if is_noun_phrase_constituent(tok) || tok.upos.is_some_and(is_root_upos) {
            children.insert(c_i);
            queue.extend(get_children(sent, c_i));
        }
    }
    children
}
fn is_root_upos(upos: UPOS) -> bool {
use UPOS::*;
matches!(upos, NOUN | PROPN | PRON)
}
/// Get the indices of the children of a given node.
///
/// `of_node` is a zero-based index into `sent.tokens`; CoNLL-U head IDs are
/// one-based (ID 0 denotes the sentence root), hence the `- 1` adjustments.
fn get_children(sent: &Sentence, of_node: usize) -> Vec<usize> {
    let mut children = Vec::new();
    for (index, token) in sent.tokens.iter().enumerate() {
        // A node is never its own child.
        if index == of_node {
            continue;
        }
        if let Some(head) = token.head {
            let is_child = match head {
                // ID 0 is the artificial root, not a real token.
                TokenID::Single(i) => i != 0 && i - 1 == of_node,
                // NOTE(review): CoNLL-U range IDs ("1-2") are inclusive of the
                // end ID, so `start - 1..end - 1` excludes the range's last
                // token — confirm whether `..=` was intended (heads are rarely
                // ranges in practice).
                TokenID::Range(start, end) => (start - 1..end - 1).contains(&of_node),
                // Empty nodes (enhanced dependencies) are ignored.
                TokenID::Empty(_, _) => false,
            };
            if is_child {
                children.push(index)
            }
        }
    }
    children
}
fn is_noun_phrase_constituent(token: &Token) -> bool {
let Some(ref deprel) = token.deprel else {
return false;
};
matches!(
deprel.as_str(),
"det" | "amod" | "nummod" | "compound" | "fixed" | "flat" | "acl" | "aux:pass"
)
}

View file

@ -0,0 +1,71 @@
#[cfg(feature = "training")]
use std::path::Path;
use hashbrown::HashMap;
use serde::{Deserialize, Serialize};
use crate::UPOS;
use super::Chunker;
/// Tracks the number of times any given UPOS is associated with a noun phrase.
/// Used as the baseline for the chunker.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct UPOSFreqDict {
    /// The # of times each [`UPOS`] was not part of an NP subtracted from the number of times it
    /// was.
    /// A positive value therefore means the tag was seen inside noun phrases
    /// more often than outside them; a missing entry counts as zero.
    pub counts: HashMap<UPOS, isize>,
}
impl UPOSFreqDict {
    /// Whether the given tag has been observed inside noun phrases more often
    /// than outside them. Unseen tags are not considered likely components.
    pub fn is_likely_np_component(&self, upos: &UPOS) -> bool {
        self.counts.get(upos).is_some_and(|count| *count > 0)
    }
}
impl Chunker for UPOSFreqDict {
    /// Flag each position whose tag is a likely noun-phrase component.
    /// Untagged positions are never flagged; the raw words are unused.
    fn chunk_sentence(&self, _sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
        let mut flags = Vec::with_capacity(tags.len());

        for tag in tags {
            let flag = match tag {
                Some(t) => self.is_likely_np_component(t),
                None => false,
            };
            flags.push(flag);
        }

        flags
    }
}
#[cfg(feature = "training")]
impl UPOSFreqDict {
    /// Record one observation of `upos`: increment its score when it appeared
    /// inside a noun phrase, decrement it otherwise.
    pub fn inc_is_np(&mut self, upos: UPOS, is_np: bool) {
        // Bug fix: the previous version used `.or_insert(1)`, so a tag whose
        // *first* observation was outside an NP was still credited +1 instead
        // of -1, biasing rarely-seen tags toward being NP components. The
        // entry now starts at zero and every observation applies its delta.
        let delta = if is_np { 1 } else { -1 };
        *self.counts.entry(upos).or_insert(0) += delta;
    }

    /// Parse a `.conllu` file and use it to train a frequency dictionary.
    /// For error-handling purposes, this function should not be made accessible outside of training.
    pub fn inc_from_conllu_file(&mut self, path: impl AsRef<Path>) {
        use super::np_extraction::locate_noun_phrases_in_sent;
        use crate::conllu_utils::iter_sentences_in_conllu;

        for sent in iter_sentences_in_conllu(path) {
            use hashbrown::HashSet;

            // Flatten every noun phrase into one set of member indices.
            let noun_phrases = locate_noun_phrases_in_sent(&sent);
            let flat = noun_phrases.into_iter().fold(HashSet::new(), |mut a, b| {
                a.extend(b);
                a
            });

            // Count each taggable token as inside/outside an NP.
            for (i, token) in sent.tokens.iter().enumerate() {
                if let Some(upos) = token.upos.and_then(UPOS::from_conllu) {
                    self.inc_is_np(upos, flat.contains(&i))
                }
            }
        }
    }
}

View file

@ -0,0 +1,12 @@
use std::{fs::File, path::Path};
use rs_conllu::{Sentence, parse_file};
/// Iterate over the sentences of a `.conllu` file.
/// Panics on any I/O or parse error, so this must only be used during training.
pub fn iter_sentences_in_conllu(path: impl AsRef<Path>) -> impl Iterator<Item = Sentence> {
    let handle = File::open(path).unwrap();
    parse_file(handle).map(Result::unwrap)
}

View file

@ -0,0 +1,12 @@
// Crate layout: chunking (noun-phrase detection), the Brill POS tagger, and
// the shared UPOS tag set. Modules used only for model training are gated
// behind the `training` cargo feature.
mod chunker;
#[cfg(feature = "training")]
mod conllu_utils;
mod patch_criteria;
mod tagger;
mod upos;
#[cfg(feature = "training")]
mod word_counter;
pub use chunker::{BrillChunker, Chunker, UPOSFreqDict};
pub use tagger::{BrillTagger, FreqDict, FreqDictBuilder, Tagger};
pub use upos::{UPOS, UPOSIter};

View file

@ -0,0 +1,126 @@
use serde::{Deserialize, Serialize};
use crate::UPOS;
/// A condition evaluated at a token position; when it holds, the owning patch
/// may fire. Relative offsets are measured in tokens from the position under
/// test (negative = earlier in the sentence).
#[derive(Debug, Clone, Serialize, Deserialize, Hash, PartialEq, Eq)]
pub enum PatchCriteria {
    /// The token at the given relative offset carries the given tag.
    WordIsTaggedWith {
        /// Which token to inspect.
        relative: isize,
        is_tagged: UPOS,
    },
    /// Some token between here and `max_relative` carries the given tag.
    AnyWordIsTaggedWith {
        /// The farthest relative index to look
        max_relative: isize,
        is_tagged: UPOS,
    },
    /// The immediate previous and next tokens carry the given tags.
    SandwichTaggedWith {
        prev_word_tagged: UPOS,
        post_word_tagged: UPOS,
    },
    /// The token at the given relative offset matches `word`
    /// (ASCII case-insensitive; see `fulfils` for the exact comparison).
    WordIs {
        relative: isize,
        word: String,
    },
    /// Not applicable to the Brill Tagger, only the chunker
    NounPhraseAt {
        is_np: bool,
        relative: isize,
    },
    /// Both sub-criteria hold at the same position.
    Combined {
        a: Box<PatchCriteria>,
        b: Box<PatchCriteria>,
    },
}
impl PatchCriteria {
    /// Evaluate this criterion at `index` within a sentence.
    ///
    /// `tokens`, `tags`, and `np_flags` are parallel views of the sentence;
    /// `np_flags` may be empty when the caller has no noun-phrase information
    /// (as the tagger does). Positions that fall outside the sentence simply
    /// fail the check.
    pub fn fulfils(
        &self,
        tokens: &[String],
        tags: &[Option<UPOS>],
        np_flags: &[bool],
        index: usize,
    ) -> bool {
        match self {
            PatchCriteria::WordIsTaggedWith {
                relative,
                is_tagged,
            } => {
                let Some(index) = add(index, *relative) else {
                    return false;
                };
                tags.get(index)
                    .copied()
                    .flatten()
                    .is_some_and(|t| t == *is_tagged)
            }
            PatchCriteria::AnyWordIsTaggedWith {
                max_relative: relative,
                is_tagged,
            } => {
                let Some(farthest_index) = add(index, *relative) else {
                    return false;
                };
                // NOTE(review): this half-open range is asymmetric — for a
                // positive offset it includes `index` but excludes the
                // farthest position, while for a negative offset it includes
                // the farthest position but excludes `index`. Confirm this is
                // intentional before changing it: trained patches depend on
                // these exact semantics.
                (farthest_index.min(index)..farthest_index.max(index)).any(|i| {
                    tags.get(i)
                        .copied()
                        .flatten()
                        .is_some_and(|t| t == *is_tagged)
                })
            }
            PatchCriteria::SandwichTaggedWith {
                prev_word_tagged,
                post_word_tagged,
            } => {
                // No previous token exists at position 0; a missing next
                // token is handled by the `.get` below returning `None`.
                if index == 0 {
                    return false;
                }
                let prev_i = index - 1;
                let post_i = index + 1;
                tags.get(prev_i)
                    .copied()
                    .flatten()
                    .is_some_and(|t| t == *prev_word_tagged)
                    && tags
                        .get(post_i)
                        .copied()
                        .flatten()
                        .is_some_and(|t| t == *post_word_tagged)
            }
            Self::WordIs { relative, word } => {
                let Some(index) = add(index, *relative) else {
                    return false;
                };
                // NOTE(review): `zip` stops at the shorter string, so this is
                // effectively a case-insensitive *prefix* comparison rather
                // than full equality (e.g. "cat" would match "category").
                // Confirm whether that is intentional.
                tokens.get(index).is_some_and(|w| {
                    w.chars()
                        .zip(word.chars())
                        .all(|(a, b)| a.eq_ignore_ascii_case(&b))
                })
            }
            Self::NounPhraseAt { is_np, relative } => {
                let Some(index) = add(index, *relative) else {
                    return false;
                };
                np_flags.get(index).is_some_and(|f| *is_np == *f)
            }
            Self::Combined { a, b } => {
                a.fulfils(tokens, tags, np_flags, index) && b.fulfils(tokens, tags, np_flags, index)
            }
        }
    }
}
/// Offset `u` by the signed delta `i`, returning `None` when the result would
/// fall outside the `usize` range.
fn add(u: usize, i: isize) -> Option<usize> {
    // `checked_add_signed` handles both directions directly and replaces the
    // lossy `wrapping_abs() as u32 as usize` cast chain, which silently
    // truncated magnitudes above `u32::MAX` on 64-bit targets.
    u.checked_add_signed(i)
}

View file

@ -0,0 +1,281 @@
mod patch;
#[cfg(feature = "training")]
use std::path::Path;
use patch::Patch;
use serde::{Deserialize, Serialize};
#[cfg(feature = "training")]
use super::FreqDict;
#[cfg(feature = "training")]
use super::error_counter::{ErrorCounter, ErrorKind};
use crate::{Tagger, UPOS};
/// A Brill part-of-speech tagger: a base tagger whose raw output is refined
/// by an ordered list of learned transformation patches.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BrillTagger<B>
where
    B: Tagger,
{
    /// The baseline tagger whose output the patches correct.
    base: B,
    /// Learned transformations, applied in order after the base pass.
    patches: Vec<Patch>,
}
impl<B> BrillTagger<B>
where
    B: Tagger,
{
    /// Construct a tagger around `base` with no patches yet learned.
    pub fn new(base: B) -> Self {
        Self {
            base,
            patches: Vec::new(),
        }
    }

    /// Run every patch, in order, across the whole sentence, rewriting tags
    /// in place wherever a patch's source tag and criteria both match.
    fn apply_patches(&self, sentence: &[String], tags: &mut [Option<UPOS>]) {
        for patch in &self.patches {
            for index in 0..sentence.len() {
                // Untagged positions can never match a patch's source tag.
                let current = match tags.get(index).copied().flatten() {
                    Some(tag) => tag,
                    None => continue,
                };

                // No noun-phrase flags are available here, hence the empty slice.
                if current == patch.from && patch.criteria.fulfils(sentence, tags, &[], index) {
                    tags[index] = Some(patch.to);
                }
            }
        }
    }
}
impl<B> Tagger for BrillTagger<B>
where
    B: Tagger,
{
    /// Tag a sentence: run the base tagger, then refine its output with the
    /// learned patch set. Positions the tagger cannot decide are [`None`].
    fn tag_sentence(&self, sentence: &[String]) -> Vec<Option<UPOS>> {
        let mut result = self.base.tag_sentence(sentence);
        self.apply_patches(sentence, &mut result);
        result
    }
}
#[cfg(feature = "training")]
impl BrillTagger<FreqDict> {
    /// Tag a provided sentence with patches, providing the "correct" tags (from a dataset or
    /// other source), accumulating every mismatch into `errors`.
    pub fn locate_patch_errors(
        &self,
        sentence: &[String],
        correct_tags: &[Option<UPOS>],
        base_tags: &[Option<UPOS>],
        errors: &mut ErrorCounter,
    ) {
        // Work on a copy so the caller's cached base tags stay untouched.
        let mut base_tags = base_tags.to_vec();
        self.apply_patches(sentence, &mut base_tags);
        // Only positions tagged on both sides can count as errors; `None` on
        // either side is skipped.
        for ((tag, correct_tag), word) in base_tags.iter().zip(correct_tags.iter()).zip(sentence) {
            if let Some(tag) = tag {
                if let Some(correct_tag) = correct_tag {
                    if tag != correct_tag {
                        errors.inc(
                            ErrorKind {
                                was_tagged: *tag,
                                correct_tag: *correct_tag,
                            },
                            word.as_str(),
                        )
                    }
                }
            }
        }
    }

    /// Tag a provided sentence with the tagger, providing the "correct" tags (from a dataset or
    /// other source), returning the number of errors.
    pub fn locate_tag_errors(
        &self,
        sentence: &[String],
        correct_tags: &[Option<UPOS>],
    ) -> ErrorCounter {
        // Full pipeline: base tagger plus all patches learned so far.
        let tags = self.tag_sentence(sentence);
        let mut errors = ErrorCounter::new();
        // As above, only positions tagged on both sides can disagree.
        for ((tag, correct_tag), word) in tags.iter().zip(correct_tags.iter()).zip(sentence) {
            if let Some(tag) = tag {
                if let Some(correct_tag) = correct_tag {
                    if tag != correct_tag {
                        errors.inc(
                            ErrorKind {
                                was_tagged: *tag,
                                correct_tag: *correct_tag,
                            },
                            word.as_str(),
                        )
                    }
                }
            }
        }
        errors
    }

    /// To speed up training, only try a subset of all possible candidates.
    /// How many to select is given by the `candidate_selection_chance`. A higher chance means a
    /// longer training time.
    ///
    /// Each epoch learns at most one patch: the candidate that leaves the
    /// fewest errors across the whole training set.
    fn epoch(&mut self, training_files: &[impl AsRef<Path>], candidate_selection_chance: f32) {
        use crate::conllu_utils::iter_sentences_in_conllu;
        use rs_conllu::Sentence;
        use std::time::Instant;
        assert!((0.0..=1.0).contains(&candidate_selection_chance));
        let mut total_tokens = 0;
        let mut error_counter = ErrorCounter::new();
        let sentences: Vec<Sentence> = training_files
            .iter()
            .flat_map(iter_sentences_in_conllu)
            .collect();
        let mut sentences_tagged: Vec<(Vec<String>, Vec<Option<UPOS>>)> = Vec::new();
        // Re-tokenize: the dataset splits clitic contractions into separate
        // tokens, so glue the clitic back onto the preceding token. The merged
        // token keeps the *first* part's tag; the clitic's own tag is dropped.
        // NOTE(review): "sn't" looks unusual alongside "n't" — confirm it
        // matches how the training data actually splits negations.
        for sent in &sentences {
            let mut toks: Vec<String> = Vec::new();
            let mut tags = Vec::new();
            for token in &sent.tokens {
                let form = token.form.clone();
                if let Some(last) = toks.last_mut() {
                    match form.as_str() {
                        "sn't" | "n't" | "'ll" | "'ve" | "'re" | "'d" | "'m" | "'s" => {
                            last.push_str(&form);
                            continue;
                        }
                        _ => {}
                    }
                }
                toks.push(form);
                tags.push(token.upos.and_then(UPOS::from_conllu));
            }
            sentences_tagged.push((toks, tags));
        }
        // Measure the current error rate over the whole training set.
        for (tok_buf, tag_buf) in &sentences_tagged {
            total_tokens += tok_buf.len();
            error_counter
                .merge_from(self.locate_tag_errors(tok_buf.as_slice(), tag_buf.as_slice()));
        }
        println!("=============");
        println!("Total tokens in training set: {}", total_tokens);
        println!(
            "Tokens incorrectly tagged: {}",
            error_counter.total_errors()
        );
        println!(
            "Error rate: {}%",
            error_counter.total_errors() as f32 / total_tokens as f32 * 100.
        );
        // Before adding any patches, let's get a good base.
        let mut base_tags = Vec::new();
        for (toks, _) in &sentences_tagged {
            base_tags.push(self.tag_sentence(toks));
        }
        let all_candidates = Patch::generate_candidate_patches(&error_counter);
        // Randomly subsample the candidate pool; the selection chance trades
        // training time against the odds of finding the best patch.
        let mut pruned_candidates: Vec<Patch> = rand::seq::IndexedRandom::choose_multiple(
            all_candidates.as_slice(),
            &mut rand::rng(),
            (all_candidates.len() as f32 * candidate_selection_chance) as usize,
        )
        .cloned()
        .collect();
        let start = Instant::now();
        // Rank candidates by how many errors remain after applying each one
        // (in parallel when the `threaded` feature is enabled).
        #[cfg(feature = "threaded")]
        rayon::slice::ParallelSliceMut::par_sort_by_cached_key(
            pruned_candidates.as_mut_slice(),
            |candidate: &Patch| {
                self.score_candidate(candidate.clone(), &sentences_tagged, &base_tags)
            },
        );
        #[cfg(not(feature = "threaded"))]
        pruned_candidates.sort_by_cached_key(|candidate| {
            self.score_candidate(candidate.clone(), &sentences_tagged, &base_tags)
        });
        let duration = start.elapsed();
        let seconds = duration.as_secs();
        let millis = duration.subsec_millis();
        // NOTE(review): when the search takes under one second, `seconds` is
        // 0 and the printed c/sec rate is `inf` (f32 division by zero).
        println!(
            "It took {} seconds and {} milliseconds to search through {} candidates at {} c/sec.",
            seconds,
            millis,
            pruned_candidates.len(),
            pruned_candidates.len() as f32 / seconds as f32
        );
        // Keep only the single best-scoring candidate from this epoch.
        if let Some(best) = pruned_candidates.first() {
            self.patches.push(best.clone());
        }
    }

    /// Lower is better
    fn score_candidate(
        &self,
        candidate: Patch,
        sentences_tagged: &[(Vec<String>, Vec<Option<UPOS>>)],
        base_tags: &[Vec<Option<UPOS>>],
    ) -> usize {
        // The scratch tagger's base dictionary is empty, which is fine:
        // `locate_patch_errors` starts from the precomputed `base_tags`
        // rather than re-running a base tagging pass.
        let mut tagger = BrillTagger::new(FreqDict::default());
        tagger.patches.push(candidate);
        let mut candidate_errors = ErrorCounter::new();
        for ((toks, tags), base) in sentences_tagged.iter().zip(base_tags.iter()) {
            tagger.locate_patch_errors(
                toks.as_slice(),
                tags.as_slice(),
                base,
                &mut candidate_errors,
            );
        }
        candidate_errors.total_errors()
    }

    /// Train a brand-new tagger on a `.conllu` dataset, provided via a path.
    /// This does not do _any_ error handling, and should not run in production.
    /// It should be used for training a model that _will_ be used in production.
    pub fn train(
        training_files: &[impl AsRef<Path>],
        epochs: usize,
        candidate_selection_chance: f32,
    ) -> Self {
        use crate::FreqDictBuilder;
        // Build the baseline frequency dictionary from the full dataset.
        let mut freq_dict_builder = FreqDictBuilder::new();
        for file in training_files {
            freq_dict_builder.inc_from_conllu_file(file);
        }
        let freq_dict = freq_dict_builder.build();
        let mut tagger = Self::new(freq_dict);
        // Each epoch adds at most one patch, so `epochs` bounds the final
        // patch count.
        for _ in 0..epochs {
            tagger.epoch(training_files, candidate_selection_chance);
        }
        tagger
    }
}

View file

@ -0,0 +1,92 @@
#[cfg(feature = "training")]
use crate::tagger::error_counter::ErrorCounter;
use crate::{UPOS, patch_criteria::PatchCriteria};
#[cfg(feature = "training")]
use hashbrown::HashSet;
use serde::{Deserialize, Serialize};
/// A single learned transformation: retag a token from `from` to `to`
/// whenever `criteria` holds at that token's position.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Patch {
    /// The tag the token must currently carry for the patch to apply.
    pub from: UPOS,
    /// The tag to rewrite the token to.
    pub to: UPOS,
    /// The contextual condition that must hold at the token's position.
    pub criteria: PatchCriteria,
}
#[cfg(feature = "training")]
impl Patch {
    /// Given a list of tagging errors, generate a collection of candidate patches that _might_ fix
    /// them. Training involves determining which candidates actually work.
    ///
    /// For every observed (wrong tag, correct tag) pair, this emits every
    /// simple criterion, plus each simple criterion combined with a
    /// word-match against the ten words most associated with errors.
    pub fn generate_candidate_patches(error_counter: &ErrorCounter) -> Vec<Patch> {
        let mut candidates = Vec::new();
        for key in error_counter.error_counts.keys() {
            candidates.extend(Self::gen_simple_candidates().into_iter().map(|c| Patch {
                from: key.was_tagged,
                to: key.correct_tag,
                criteria: c,
            }));
            for c in &Self::gen_simple_candidates() {
                for word in error_counter.word_counts.iter_top_n_words(10) {
                    // NOTE(review): the asymmetric range `-3..3` includes -3
                    // but excludes +3 — confirm that is intentional.
                    for r in -3..3 {
                        candidates.push(Patch {
                            from: key.was_tagged,
                            to: key.correct_tag,
                            criteria: PatchCriteria::Combined {
                                a: Box::new(PatchCriteria::WordIs {
                                    relative: r,
                                    word: word.to_string(),
                                }),
                                b: Box::new(c.clone()),
                            },
                        })
                    }
                }
            }
        }
        candidates
    }

    /// Candidates to be tested against a dataset during training.
    ///
    /// Deduplicated via a `HashSet`, so the output order is unspecified.
    fn gen_simple_candidates() -> Vec<PatchCriteria> {
        use strum::IntoEnumIterator;
        let mut criteria = HashSet::new();
        for upos in UPOS::iter() {
            // Single-position tag checks within four tokens either side
            // (offset 0 inspects the token under test itself).
            for i in -4..=4 {
                criteria.insert(PatchCriteria::WordIsTaggedWith {
                    relative: i,
                    is_tagged: upos,
                });
            }
            // Windowed "any token tagged" checks over the same span.
            for i in -4..=4 {
                criteria.insert(PatchCriteria::AnyWordIsTaggedWith {
                    max_relative: i,
                    is_tagged: upos,
                });
            }
            // Pairwise contexts over every ordered pair of tags.
            for upos_b in UPOS::iter() {
                criteria.insert(PatchCriteria::SandwichTaggedWith {
                    prev_word_tagged: upos,
                    post_word_tagged: upos_b,
                });
                criteria.insert(PatchCriteria::Combined {
                    a: Box::new(PatchCriteria::WordIsTaggedWith {
                        relative: 1,
                        is_tagged: upos,
                    }),
                    b: Box::new(PatchCriteria::WordIsTaggedWith {
                        relative: -2,
                        is_tagged: upos_b,
                    }),
                });
            }
        }
        criteria.into_iter().collect()
    }
}

View file

@ -0,0 +1,52 @@
use hashbrown::HashMap;
use crate::{UPOS, word_counter::WordCounter};
/// One class of tagging mistake: what the tagger produced versus what the
/// dataset says it should have produced.
#[derive(Debug, Default, Clone, Hash, PartialEq, Eq)]
pub struct ErrorKind {
    /// The (incorrect) tag the tagger assigned.
    pub was_tagged: UPOS,
    /// The tag the dataset marks as correct.
    pub correct_tag: UPOS,
}
/// Aggregated tagging-error statistics gathered during training.
#[derive(Debug, Default)]
pub struct ErrorCounter {
    /// How many times each (wrong, correct) tag pair was observed.
    pub error_counts: HashMap<ErrorKind, usize>,
    /// The number of times a word is associated with an error.
    pub word_counts: WordCounter,
}
impl ErrorCounter {
    pub fn new() -> Self {
        Self::default()
    }

    /// Record one occurrence of the given error kind, crediting `word` as the
    /// token it occurred on.
    pub fn inc(&mut self, kind: ErrorKind, word: &str) {
        *self.error_counts.entry(kind).or_insert(0) += 1;
        self.word_counts.inc(word)
    }

    /// Absorb all counts from `other` into `self`.
    pub fn merge_from(&mut self, other: Self) {
        for (kind, count) in other.error_counts {
            *self.error_counts.entry(kind).or_insert(0) += count;
        }

        for (word, count) in other.word_counts.word_counts {
            *self.word_counts.word_counts.entry(word).or_insert(0) += count;
        }
    }

    /// The total number of recorded errors across all kinds.
    pub fn total_errors(&self) -> usize {
        self.error_counts.values().sum()
    }
}

View file

@ -0,0 +1,32 @@
use hashbrown::HashMap;
use serde::{Deserialize, Serialize};
use super::Tagger;
use crate::upos::UPOS;
/// A mapping between words (normalized to lowercase) and their most common UPOS tag.
/// Can be used as a minimally accurate [`Tagger`].
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
pub struct FreqDict {
    /// Lowercased word -> its single most frequent tag in the training data.
    pub mapping: HashMap<String, UPOS>,
}
impl FreqDict {
    /// Look up the most common tag for `word`, ignoring case.
    pub fn get(&self, word: &str) -> Option<UPOS> {
        self.mapping.get(word.to_lowercase().as_str()).copied()
    }
}
impl Tagger for FreqDict {
    /// Tag each word independently by dictionary lookup; words absent from
    /// the dictionary are tagged [`None`].
    fn tag_sentence(&self, sentence: &[String]) -> Vec<Option<UPOS>> {
        sentence.iter().map(|word| self.get(word)).collect()
    }
}

View file

@ -0,0 +1,99 @@
#[cfg(feature = "training")]
use std::path::Path;
use hashbrown::{Equivalent, HashMap};
use strum::IntoEnumIterator;
use crate::{UPOS, tagger::FreqDict};
/// A mapping between words and the frequency of each UPOS.
/// If an element is missing from the map, its count is assumed to be zero.
#[derive(Debug, Default)]
pub struct FreqDictBuilder {
    /// (lowercased word, tag) -> number of observations.
    mapping: HashMap<FreqDictBuilderKey, usize>,
}
impl FreqDictBuilder {
    pub fn new() -> Self {
        Default::default()
    }

    /// Record one observation of `word` tagged as `tag`.
    /// Words are normalized to lowercase before counting.
    pub fn inc(&mut self, word: &str, tag: &UPOS) {
        let word_lower = word.to_lowercase();
        // The `(&str, &UPOS)` tuple lookup (via the `Equivalent` impl below)
        // avoids constructing an owned key unless the entry is new.
        let counter = self.mapping.get_mut(&(word_lower.as_str(), tag));
        if let Some(counter) = counter {
            *counter += 1;
        } else {
            self.mapping.insert(
                FreqDictBuilderKey {
                    word: word_lower.to_string(),
                    pos: *tag,
                },
                1,
            );
        }
    }

    // Inefficient, but effective method that gets the most used POS for a word in the map.
    // Returns none if the word does not exist in the map.
    // Ties are broken in favor of the tag that comes first in `UPOS::iter()`.
    fn most_freq_pos(&self, word: &str) -> Option<UPOS> {
        let word_lower = word.to_lowercase();
        let mut max_found: Option<(UPOS, usize)> = None;
        for pos in UPOS::iter() {
            if let Some(count) = self.mapping.get(&(word_lower.as_str(), &pos)) {
                if let Some((_, max_count)) = max_found {
                    if *count > max_count {
                        max_found = Some((pos, *count))
                    }
                } else {
                    max_found = Some((pos, *count))
                }
            }
        }
        max_found.map(|v| v.0)
    }

    /// Parse a `.conllu` file and use it to train a frequency dictionary.
    /// For error-handling purposes, this function should not be made accessible outside of training.
    #[cfg(feature = "training")]
    pub fn inc_from_conllu_file(&mut self, path: impl AsRef<Path>) {
        use crate::conllu_utils::iter_sentences_in_conllu;
        for sent in iter_sentences_in_conllu(path) {
            for token in sent.tokens {
                if let Some(upos) = token.upos.and_then(UPOS::from_conllu) {
                    self.inc(&token.form, &upos)
                }
            }
        }
    }

    /// Collapse the raw (word, tag) counts into a [`FreqDict`] mapping each
    /// word to its single most frequent tag.
    pub fn build(self) -> FreqDict {
        let mut output = HashMap::new();
        for key in self.mapping.keys() {
            // Each word appears once per observed tag; resolve it only once.
            if output.contains_key(&key.word) {
                continue;
            }
            // The unwrap is safe: `key.word` came from the map itself, so at
            // least one tag count exists for it.
            output.insert(key.word.to_string(), self.most_freq_pos(&key.word).unwrap());
        }
        FreqDict { mapping: output }
    }
}
/// Composite map key pairing a (lowercased) word with one of its observed tags.
#[derive(Debug, Eq, PartialEq, Hash)]
struct FreqDictBuilderKey {
    word: String,
    pos: UPOS,
}
/// Lets a `(&str, &UPOS)` tuple be used for lookups without allocating an
/// owned `FreqDictBuilderKey` (hashbrown's borrowed-key `Equivalent`
/// mechanism). The tuple's derived hash matches the struct's derived hash,
/// since `String` hashes like `str` and both hash `UPOS` by value.
impl Equivalent<FreqDictBuilderKey> for (&str, &UPOS) {
    fn equivalent(&self, key: &FreqDictBuilderKey) -> bool {
        self.0 == key.word && *self.1 == key.pos
    }
}

View file

@ -0,0 +1,16 @@
mod brill_tagger;
#[cfg(feature = "training")]
mod error_counter;
mod freq_dict;
mod freq_dict_builder;
use crate::UPOS;
pub use brill_tagger::BrillTagger;
pub use freq_dict::FreqDict;
pub use freq_dict_builder::FreqDictBuilder;
/// An implementer of this trait is capable of assigning Part-of-Speech tags to a provided sentence.
pub trait Tagger {
    /// Tag each word of `sentence`, yielding `None` at any position where no
    /// tag can be determined. The output is parallel to the input.
    fn tag_sentence(&self, sentence: &[String]) -> Vec<Option<UPOS>>;
}

View file

@ -0,0 +1,68 @@
use is_macro::Is;
use serde::{Deserialize, Serialize};
use strum_macros::{AsRefStr, EnumIter};
/// Represents the universal parts of speech as outlined by [universaldependencies.org](https://universaldependencies.org/u/pos/index.html).
///
/// The `X` ("other") category from the UD tag set is omitted here; see
/// [`UPOS::from_conllu`], which maps it to `None`.
#[derive(
    Debug,
    Default,
    Hash,
    Eq,
    PartialEq,
    Clone,
    Copy,
    EnumIter,
    AsRefStr,
    Serialize,
    Deserialize,
    PartialOrd,
    Ord,
    Is,
)]
pub enum UPOS {
    ADJ,
    ADP,
    ADV,
    AUX,
    CCONJ,
    DET,
    INTJ,
    #[default]
    NOUN,
    NUM,
    PART,
    PRON,
    PROPN,
    PUNCT,
    SCONJ,
    SYM,
    VERB,
}
impl UPOS {
    /// Convert a tag from the `rs_conllu` tag set.
    /// Returns [`None`] for `rs_conllu::UPOS::X` ("other"), which this enum
    /// does not model.
    pub fn from_conllu(other: rs_conllu::UPOS) -> Option<Self> {
        Some(match other {
            rs_conllu::UPOS::ADJ => UPOS::ADJ,
            rs_conllu::UPOS::ADP => UPOS::ADP,
            rs_conllu::UPOS::ADV => UPOS::ADV,
            rs_conllu::UPOS::AUX => UPOS::AUX,
            rs_conllu::UPOS::CCONJ => UPOS::CCONJ,
            rs_conllu::UPOS::DET => UPOS::DET,
            rs_conllu::UPOS::INTJ => UPOS::INTJ,
            rs_conllu::UPOS::NOUN => UPOS::NOUN,
            rs_conllu::UPOS::NUM => UPOS::NUM,
            rs_conllu::UPOS::PART => UPOS::PART,
            rs_conllu::UPOS::PRON => UPOS::PRON,
            rs_conllu::UPOS::PROPN => UPOS::PROPN,
            rs_conllu::UPOS::PUNCT => UPOS::PUNCT,
            rs_conllu::UPOS::SCONJ => UPOS::SCONJ,
            rs_conllu::UPOS::SYM => UPOS::SYM,
            rs_conllu::UPOS::VERB => UPOS::VERB,
            rs_conllu::UPOS::X => return None,
        })
    }

    /// Whether this tag is a noun or proper noun. Pronouns do not count.
    pub fn is_nominal(&self) -> bool {
        matches!(self, Self::NOUN | Self::PROPN)
    }
}

View file

@ -0,0 +1,28 @@
use hashbrown::HashMap;
/// A simple word-frequency tally used during training.
#[derive(Debug, Default)]
pub struct WordCounter {
    /// The number of times a word is associated with an error.
    pub word_counts: HashMap<String, usize>,
}
impl WordCounter {
    pub fn new() -> Self {
        Self::default()
    }

    /// Increment the count for a particular word.
    pub fn inc(&mut self, word: &str) {
        // `entry_ref` avoids allocating an owned `String` unless the word is
        // new to the map.
        *self.word_counts.entry_ref(word).or_insert(0) += 1;
    }

    /// Get an iterator over the most frequent words associated with errors.
    pub fn iter_top_n_words(&self, n: usize) -> impl Iterator<Item = &String> {
        let mut ranked: Vec<(&String, &usize)> = self.word_counts.iter().collect();
        ranked.sort_unstable_by_key(|&(_, count)| std::cmp::Reverse(*count));
        ranked.into_iter().take(n).map(|(word, _)| word)
    }
}

View file

@ -1,368 +0,0 @@
{
"ignoredLints": "{\"context_hashes\":[11327540533206285101]}",
"useWebWorker": true,
"lintSettings": {
"ACoupleMore": null,
"ALongTime": null,
"ALotWorst": null,
"APart": null,
"AWholeEntire": null,
"AdjectiveOfA": null,
"AfterAWhile": null,
"AlzheimersDisease": null,
"AmazonNames": null,
"Americas": null,
"AmountsFor": null,
"AnA": null,
"AnAnother": null,
"AndIn": null,
"AndTheLike": null,
"AnotherAn": null,
"AnotherOnes": null,
"AnotherThings": null,
"Anybody": null,
"Anyhow": null,
"Anyone": null,
"Anywhere": null,
"AppleNames": null,
"AsFarBackAs": null,
"AsOfLate": null,
"AsWell": null,
"AskNoPreposition": null,
"AtFaceValue": null,
"Australia": null,
"AvoidAndAlso": null,
"AvoidCurses": null,
"AzureNames": null,
"BackInTheDay": null,
"Backplane": null,
"BadRap": null,
"BaitedBreath": null,
"BanTogether": null,
"BareInMind": null,
"BatedBreath": null,
"BeckAndCall": null,
"BeenThere": null,
"BestRegards": null,
"BlanketStatement": null,
"BoringWords": null,
"Brutality": null,
"ByAccident": null,
"CanBeSeen": null,
"Canada": null,
"CapitalizePersonalPronouns": null,
"CaseInPoint": null,
"CaseSensitive": null,
"ChangeOfTack": null,
"ChangeTack": null,
"ChangedTack": null,
"ChangesOfTack": null,
"ChangesTack": null,
"ChangingOfTack": null,
"ChangingTack": null,
"ChineseCommunistParty": null,
"ChockFull": null,
"ClientSide": null,
"CommaFixes": null,
"CompaniesProductsAndTrademarks": null,
"CompoundNouns": null,
"CondenseAllThe": null,
"Confident": null,
"CorrectNumberSuffix": null,
"Countries": null,
"CoursingThroughVeins": null,
"CurrencyPlacement": null,
"DampSquib": null,
"Dashes": null,
"DayAndAge": null,
"DayOneNames": null,
"DefiniteArticle": null,
"DefiniteArticles": null,
"Desktop": null,
"DespiteOf": null,
"Devops": null,
"Discuss": null,
"Discussed": null,
"Discusses": null,
"Discussing": null,
"DoNotWant": null,
"DotInitialisms": null,
"EachAndEveryOne": null,
"EllipsisLength": null,
"ElsePossessive": null,
"EludedTo": null,
"EnMasse": null,
"EverPresent": null,
"Everybody": null,
"Everyday": null,
"Everyone": null,
"Everywhere": null,
"Excellent": null,
"ExpandBecause": null,
"ExpandDependencies": null,
"ExpandDependency": null,
"ExpandMinimum": null,
"ExpandStandardInput": null,
"ExpandStandardOutput": null,
"ExpandTimeShorthands": null,
"ExpandWith": null,
"ExpandWithout": null,
"Expatriate": null,
"ExplanationMark": null,
"ExplanationMarks": null,
"ExplanationPoint": null,
"FaceFirst": null,
"FairBit": null,
"FarWorse": null,
"FastPaste": null,
"FatalOutcome": null,
"FetalPosition": null,
"FirstAidKit": null,
"ForALongTime": null,
"ForAWhile": null,
"ForAllIntentsAndPurposes": null,
"ForNoun": null,
"FreeRein": null,
"Freezing": null,
"FurtherAdo": null,
"Furthermore": null,
"GetRidOff": null,
"GetsRidOff": null,
"GettingRidOff": null,
"GildedAge": null,
"GoingTo": null,
"GoogleNames": null,
"GotRidOff": null,
"GottenRidOff": null,
"GuineaBissau": null,
"HadGone": null,
"HadOf": null,
"HadPassed": null,
"HalfAnHour": null,
"Haphazard": null,
"HasGone": null,
"HasPassed": null,
"HaveGone": null,
"HavePassed": null,
"HavingGone": null,
"HavingPassed": null,
"Hedging": null,
"Henceforth": null,
"Hereby": null,
"Holidays": null,
"HomeInOn": null,
"HomedInOn": null,
"HomesInOn": null,
"HomingInOn": null,
"HopHope": null,
"HowTo": null,
"However": null,
"HumanBeings": null,
"HumanLife": null,
"HungerPang": null,
"HyphenateNumberDay": null,
"IAm": null,
"InAWhile": null,
"InAndOfItself": null,
"InAnyWay": null,
"InCase": null,
"InDetail": null,
"InMoreDetail": null,
"InNeedOf": null,
"InOneFellSwoop": null,
"InThe": null,
"InflectedVerbAfterTo": null,
"Insofar": null,
"Instead": null,
"InsteadOf": null,
"Insurmountable": null,
"Intact": null,
"Into": null,
"InvestIn": null,
"InvestedIn": null,
"InvestingIn": null,
"InvestsIn": null,
"IsKnownFor": null,
"ItCan": null,
"ItsContraction": null,
"Itself": null,
"IveGotTo": null,
"JawDropping": null,
"JetpackNames": null,
"JustDeserts": null,
"KindOf": null,
"KindRegards": null,
"Koreas": null,
"Laptop": null,
"LastButNotLeast": null,
"LastDitch": null,
"LeftRightHand": null,
"LetAlone": null,
"LetsConfusion": null,
"LikeThePlague": null,
"Likewise": null,
"LinkingVerbs": null,
"LongSentences": null,
"Malaysia": null,
"MergeWords": null,
"MetaNames": null,
"MicrosoftNames": null,
"Middleware": null,
"Misunderstand": null,
"Misunderstood": null,
"Misuse": null,
"Misused": null,
"ModalOf": null,
"Monumentous": null,
"MostNumber": null,
"MuchAdo": null,
"MuchWorse": null,
"Multicore": null,
"Multimedia": null,
"MultipleSequentialPronouns": null,
"Multithreading": null,
"MutePoint": null,
"MyHouse": null,
"Myself": null,
"NailOnTheHead": null,
"NationalCapitals": null,
"NeedHelp": null,
"NerveRacking": null,
"NoOxfordComma": null,
"Nobody": null,
"NominalWants": null,
"Nonetheless": null,
"NotIn": null,
"NotTo": null,
"NotablePlaces": null,
"Nothing": null,
"Notwithstanding": null,
"NounInsteadOfVerb": null,
"Nowhere": null,
"NumberSuffixCapitalization": null,
"OceansAndSeas": null,
"OfCourse": null,
"OffTheCuff": null,
"OldWivesTale": null,
"OnSecondThought": null,
"OnTheSpurOfTheMoment": null,
"OnceInAWhile": null,
"OneAndTheSame": null,
"OpenCompounds": null,
"OpenTheLight": null,
"OperativeSystem": null,
"OperativeSystems": null,
"OutOfDate": null,
"Overall": null,
"Overclocking": null,
"Overload": null,
"Overnight": null,
"OxfordComma": null,
"Oxymorons": null,
"PeaceOfMind": null,
"PhrasalVerbAsCompoundNoun": null,
"PiggyBag": null,
"PiggyBagged": null,
"PiggyBagging": null,
"PiqueInterest": null,
"PocketCastsNames": null,
"PointIsMoot": null,
"PointsOfView": null,
"PortAuPrince": null,
"PortoNovo": null,
"PossessiveYour": null,
"Postpone": null,
"PrayingMantis": null,
"PronounContraction": null,
"PronounKnew": null,
"Proofread": null,
"ProperNouns": null,
"RapidFire": null,
"RealTrouper": null,
"Regardless": null,
"RepeatedWords": null,
"RifeWith": null,
"RoadMap": null,
"SameAs": null,
"SaveToSafe": null,
"ScantilyClad": null,
"SentenceCapitalization": null,
"ServerSide": null,
"SimpleGrammatical": null,
"SinceDuration": null,
"SneakingSuspicion": null,
"Somebody": null,
"Somehow": null,
"Someone": null,
"SomewhatSomething": null,
"Somewhere": null,
"SoonerOrLater": null,
"Spaces": null,
"SpecialAttention": null,
"SpellCheck": null,
"SpelledNumbers": null,
"SpokeTooSoon": null,
"Starving": null,
"StateOfTheArt": null,
"SufficeItToSay": null,
"SupposedTo": null,
"TakeItPersonally": null,
"TakeItSeriously": null,
"ThatChallenged": null,
"ThatThis": null,
"ThatWhich": null,
"TheAnother": null,
"TheHowWhy": null,
"TheMy": null,
"ThenThan": null,
"ThereIsAny": null,
"Therefore": null,
"Thereupon": null,
"ThoughtProcess": null,
"ThrowRubbish": null,
"TickingTimeClock": null,
"ToDoHyphen": null,
"ToTheMannerBorn": null,
"Towards": null,
"TrialAndError": null,
"TumblrNames": null,
"TurnForTheWorse": null,
"TurnItOff": null,
"USUniversities": null,
"UnclosedQuotes": null,
"Underclock": null,
"UnitedOrganizations": null,
"Unless": null,
"Upset": null,
"Upward": null,
"UseGenitive": null,
"WantBe": null,
"WasAloud": null,
"WaveFunction": null,
"WellBeing": null,
"WellKept": null,
"WhatHeLooksLike": null,
"WhatItLooksLike": null,
"WhatSheLooksLike": null,
"WhatTheyLookLike": null,
"Whereas": null,
"Whereupon": null,
"WhetYourAppetite": null,
"WholeEntire": null,
"WidelyAccepted": null,
"Widespread": null,
"WillContain": null,
"WinPrize": null,
"WordPressDotcom": null,
"WorldWarII": null,
"Worldwide": null,
"WorseAndWorse": null,
"WorseCaseScenario": null,
"WorseThan": null,
"WorstCaseScenario": null,
"WorstEver": null
},
"userDictionary": [],
"dialect": 0,
"delay": -1
}

View file

@ -0,0 +1,8 @@
---
title: Brill Tagging
---
Harper uses Brill tagging as a refinement step to a dictionary-based [POS tagging](https://en.wikipedia.org/wiki/Part-of-speech_tagging) approach.
This method retains low latency and high throughput without bundling a large, high-entropy language model.
While documentation on this site is sparse, initial development was accompanied by [a blog post](https://elijahpotter.dev/articles/transformation-based_learning), which can hopefully explain some of the more abstract details of the process.

View file

@ -189,6 +189,10 @@ export default defineConfig({
title: 'Local Statistics',
to: '/docs/contributors/local-stats',
},
{
title: 'Brill Tagging',
to: '/docs/contributors/brill',
},
{
title: 'FAQ',
to: '/docs/contributors/faq',