feat(brill): train and use Brill tagger (#1344)

Co-authored-by: hippietrail <hippietrail@users.noreply.github.com>
This commit is contained in:
Elijah Potter 2025-06-16 15:33:49 -06:00 committed by GitHub
parent e3e573520e
commit db89187c3f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
51 changed files with 51011 additions and 15273 deletions

119
Cargo.lock generated
View file

@ -509,6 +509,27 @@ dependencies = [
"parking_lot_core",
]
[[package]]
name = "derive_more"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05"
dependencies = [
"derive_more-impl",
]
[[package]]
name = "derive_more-impl"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
dependencies = [
"proc-macro2",
"quote",
"syn",
"unicode-xid",
]
[[package]]
name = "dirs"
version = "4.0.0"
@ -784,14 +805,25 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "half"
version = "2.4.1"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9"
dependencies = [
"cfg-if",
"crunchy",
]
[[package]]
name = "harper-brill"
version = "0.42.0"
dependencies = [
"harper-pos-utils",
"lazy_static",
"rs-conllu",
"serde",
"serde_json",
]
[[package]]
name = "harper-cli"
version = "0.1.0"
@ -803,6 +835,7 @@ dependencies = [
"harper-comments",
"harper-core",
"harper-literate-haskell",
"harper-pos-utils",
"harper-stats",
"harper-typst",
"hashbrown 0.15.4",
@ -854,6 +887,7 @@ dependencies = [
"criterion",
"foldhash",
"fst",
"harper-brill",
"hashbrown 0.15.4",
"is-macro",
"itertools 0.14.0",
@ -866,7 +900,7 @@ dependencies = [
"pulldown-cmark",
"quickcheck",
"quickcheck_macros",
"rand",
"rand 0.8.5",
"rayon",
"serde",
"serde_json",
@ -929,6 +963,20 @@ dependencies = [
"tracing-subscriber",
]
[[package]]
name = "harper-pos-utils"
version = "0.42.0"
dependencies = [
"hashbrown 0.15.4",
"is-macro",
"rand 0.9.1",
"rayon",
"rs-conllu",
"serde",
"strum",
"strum_macros",
]
[[package]]
name = "harper-stats"
version = "0.42.0"
@ -1569,9 +1617,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "oorandom"
version = "11.1.4"
version = "11.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9"
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
[[package]]
name = "open"
@ -1597,7 +1645,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2c1f9f56e534ac6a9b8a4600bdf0f530fb393b5f393e7b4d03489c3cf0c3f01"
dependencies = [
"num-traits",
"rand",
"rand 0.8.5",
"serde",
]
@ -1675,7 +1723,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared",
"rand",
"rand 0.8.5",
]
[[package]]
@ -1769,7 +1817,7 @@ checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
dependencies = [
"env_logger",
"log",
"rand",
"rand 0.8.5",
]
[[package]]
@ -1809,7 +1857,7 @@ checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d"
dependencies = [
"bytes",
"getrandom 0.2.15",
"rand",
"rand 0.8.5",
"ring",
"rustc-hash",
"rustls",
@ -1857,11 +1905,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
"rand_chacha 0.3.1",
"rand_core 0.6.4",
"serde",
]
[[package]]
name = "rand"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.3",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
@ -1869,7 +1927,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core 0.9.3",
]
[[package]]
@ -1882,6 +1950,15 @@ dependencies = [
"serde",
]
[[package]]
name = "rand_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
"getrandom 0.3.2",
]
[[package]]
name = "rayon"
version = "1.10.0"
@ -2023,6 +2100,18 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "rs-conllu"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6de5aecf17f8fff1b35d59a12e2b8c908cad4d67208805166483655554f9169"
dependencies = [
"clap",
"derive_more",
"thiserror 1.0.69",
"walkdir",
]
[[package]]
name = "rustc-demangle"
version = "0.1.24"
@ -2953,6 +3042,12 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c"
[[package]]
name = "unicode-xid"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "unscanny"
version = "0.1.0"

View file

@ -1,5 +1,5 @@
[workspace]
members = [ "harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst" , "harper-stats"]
members = [ "harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst" , "harper-stats", "harper-pos-utils", "harper-brill"]
resolver = "2"
# Comment out the below lines if you plan to use a debugger.

16
harper-brill/Cargo.toml Normal file
View file

@ -0,0 +1,16 @@
[package]
name = "harper-brill"
version = "0.42.0"
edition = "2024"
[dependencies]
harper-pos-utils = { path = "../harper-pos-utils/", version = "0.42.0" }
lazy_static = "1.5.0"
rs-conllu = "0.3.0"
serde = "1.0.219"
serde_json = "1.0.140"
[build-dependencies]
rs-conllu = "0.3.0"
serde = "1.0.219"
serde_json = "1.0.140"

32
harper-brill/src/lib.rs Normal file
View file

@ -0,0 +1,32 @@
use lazy_static::lazy_static;
use std::sync::Arc;
pub use harper_pos_utils::{BrillChunker, BrillTagger, Chunker, FreqDict, Tagger, UPOS};
const BRILL_TAGGER_SOURCE: &str = include_str!("../trained_tagger_model.json");
lazy_static! {
static ref BRILL_TAGGER: Arc<BrillTagger<FreqDict>> = Arc::new(uncached_brill_tagger());
}
fn uncached_brill_tagger() -> BrillTagger<FreqDict> {
serde_json::from_str(BRILL_TAGGER_SOURCE).unwrap()
}
pub fn brill_tagger() -> Arc<BrillTagger<FreqDict>> {
(*BRILL_TAGGER).clone()
}
const BRILL_CHUNKER_SOURCE: &str = include_str!("../trained_chunker_model.json");
lazy_static! {
static ref BRILL_CHUNKER: Arc<BrillChunker> = Arc::new(uncached_brill_chunker());
}
fn uncached_brill_chunker() -> BrillChunker {
serde_json::from_str(BRILL_CHUNKER_SOURCE).unwrap()
}
pub fn brill_chunker() -> Arc<BrillChunker> {
(*BRILL_CHUNKER).clone()
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -13,6 +13,7 @@ harper-stats = { path = "../harper-stats", version = "0.42.0" }
dirs = "6.0.0"
harper-literate-haskell = { path = "../harper-literate-haskell", version = "0.42.0" }
harper-core = { path = "../harper-core", version = "0.42.0" }
harper-pos-utils = { path = "../harper-pos-utils", version = "0.42.0", features = ["training", "threaded"] }
harper-comments = { path = "../harper-comments", version = "0.42.0" }
harper-typst = { path = "../harper-typst", version = "0.42.0" }
hashbrown = "0.15.4"

View file

@ -20,6 +20,7 @@ use harper_core::{
MutableDictionary, TokenKind, TokenStringExt, WordId, WordMetadata,
};
use harper_literate_haskell::LiterateHaskellParser;
use harper_pos_utils::{BrillChunker, BrillTagger};
use harper_stats::Stats;
use serde::Serialize;
@ -77,6 +78,28 @@ enum Args {
/// The document to mine words from.
file: PathBuf,
},
TrainBrillTagger {
#[arg(short, long, default_value = "1.0")]
candidate_selection_chance: f32,
/// The path to write the final JSON model file to.
output: PathBuf,
/// The number of epochs (and patch rules) to train.
epochs: usize,
/// Path to a `.conllu` dataset to train on.
#[arg(num_args = 1..)]
datasets: Vec<PathBuf>,
},
TrainBrillChunker {
#[arg(short, long, default_value = "1.0")]
candidate_selection_chance: f32,
/// The path to write the final JSON model file to.
output: PathBuf,
/// The number of epochs (and patch rules) to train.
epochs: usize,
/// Path to a `.conllu` dataset to train on.
#[arg(num_args = 1..)]
datasets: Vec<PathBuf>,
},
/// Print harper-core version.
CoreVersion,
/// Rename a flag in the dictionary and affixes.
@ -91,6 +114,8 @@ enum Args {
/// Emit a decompressed, line-separated list of the compounds in Harper's dictionary.
/// As long as there's either an open or hyphenated spelling.
Compounds,
/// Provided a sentence or phrase, emit a list of each noun phrase contained within.
NominalPhrases { input: String },
}
fn main() -> anyhow::Result<()> {
@ -380,6 +405,27 @@ fn main() -> anyhow::Result<()> {
println!("harper-core v{}", harper_core::core_version());
Ok(())
}
Args::TrainBrillTagger {
datasets: dataset,
epochs,
output,
candidate_selection_chance,
} => {
let tagger = BrillTagger::train(&dataset, epochs, candidate_selection_chance);
fs::write(output, serde_json::to_string_pretty(&tagger)?)?;
Ok(())
}
Args::TrainBrillChunker {
datasets,
epochs,
output,
candidate_selection_chance,
} => {
let chunker = BrillChunker::train(&datasets, epochs, candidate_selection_chance);
fs::write(output, serde_json::to_string_pretty(&chunker)?)?;
Ok(())
}
Args::RenameFlag { old, new, dir } => {
use serde_json::Value;
@ -547,6 +593,18 @@ fn main() -> anyhow::Result<()> {
println!("\nFound {} compound word groups", results.len());
Ok(())
}
Args::NominalPhrases { input } => {
let doc = Document::new_markdown_default_curated(&input);
for phrase in doc.iter_nominal_phrases() {
let s =
doc.get_span_content_str(&phrase.span().ok_or(anyhow!("Unable to get span"))?);
println!("{s}");
}
Ok(())
}
}
}
@ -562,6 +620,7 @@ fn load_file(
.map(|v| v.to_str().unwrap())
{
Some("md") => Box::new(Markdown::default()),
Some("lhs") => Box::new(LiterateHaskellParser::new_markdown(
MarkdownOptions::default(),
)),

View file

@ -31,6 +31,7 @@ foldhash = "0.1.5"
strum_macros = "0.27.1"
strum = "0.27.1"
ammonia = "4.1.0"
harper-brill = { path = "../harper-brill", version = "0.42.0" }
bitflags = { version = "2.9.1", features = ["serde"] }
[dev-dependencies]

View file

@ -2,6 +2,7 @@ use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;
use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
use paste::paste;
use crate::expr::{Expr, ExprExt, LongestMatchOf, Repeating, SequenceExpr};
@ -9,10 +10,8 @@ use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::word_metadata::AdjectiveData;
use crate::{
Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, NounData, Token, TokenKind,
TokenStringExt,
Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt,
};
use crate::{OrdinalSuffix, Span};
@ -140,107 +139,34 @@ impl Document {
self.condense_ellipsis();
self.condense_latin();
self.match_quotes();
self.articles_imply_nouns();
// annotate word metadata
let token_strings: Vec<_> = self
.tokens
.iter()
.filter(|t| !t.kind.is_whitespace())
.map(|t| self.get_span_content_str(&t.span))
.collect();
let token_tags = brill_tagger().tag_sentence(&token_strings);
let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);
let mut i = 0;
// Annotate word metadata
for token in self.tokens.iter_mut() {
if let TokenKind::Word(meta) = &mut token.kind {
let word_source = token.span.get_content(&self.source);
let found_meta = dictionary.get_word_metadata(word_source);
*meta = found_meta.cloned()
}
}
let mut found_meta = dictionary.get_word_metadata(word_source).cloned();
// refine and disambiguate word metadata
self.known_preposition();
self.articles_imply_not_verb();
}
fn uncached_article_expr() -> Lrc<SequenceExpr> {
Lrc::new(
SequenceExpr::default()
.then_determiner()
.then_whitespace()
.then(|t: &Token, _source: &[char]| t.kind.is_adjective() && t.kind.is_noun())
.then_whitespace()
.then_noun(),
)
}
thread_local! {static ARTICLE_EXPR: Lrc<SequenceExpr> = Document::uncached_article_expr()}
/// When a word that is either an adjective or a noun is sandwiched between an article and a noun,
/// it definitely is not a noun.
fn articles_imply_nouns(&mut self) {
let expr = Self::ARTICLE_EXPR.with(|v| v.clone());
for m in expr.iter_matches_in_doc(self).collect::<Vec<_>>() {
if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start + 2].kind {
metadata.noun = None;
metadata.verb = None;
}
}
}
/// A proposition-like word followed by a determiner or number is typically
/// really a preposition.
fn known_preposition(&mut self) {
fn create_expr() -> Lrc<SequenceExpr> {
Lrc::new(
SequenceExpr::default()
.then(WordSet::new(&["in", "at", "on", "to", "for", "by", "with"]))
.then_whitespace()
.then(|t: &Token, _source: &[char]| {
t.kind.is_determiner() || t.kind.is_number()
}),
)
}
thread_local! {static EXPR: Lrc<SequenceExpr> = create_expr()}
let expr = EXPR.with(|v| v.clone());
for m in expr.iter_matches_in_doc(self).collect::<Vec<_>>() {
if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start].kind {
metadata.noun = None;
metadata.pronoun = None;
metadata.verb = None;
metadata.adjective = None;
}
}
}
/// The first word after an article cannot be a verb.
fn articles_imply_not_verb(&mut self) {
fn create_pattern() -> Lrc<SequenceExpr> {
Lrc::new(
SequenceExpr::default()
.then(WordSet::new(&[
// articles
"a", "an", "the",
// Dependent genitive pronouns serve a similar role to articles.
// Unfortunately, some overlap with other pronoun forms. E.g.
// "I like her", "Something about her struck me as odd."
"my", "your", "thy", "thine", "his", /*"her",*/ "its", "our", "their",
"whose", // "no" is also a determiner
"no",
]))
.then_whitespace()
.then_verb(),
)
}
thread_local! {static EXPR: Lrc<SequenceExpr> = create_pattern()}
let expr = EXPR.with(|v| v.clone());
for m in expr.iter_matches_in_doc(self).collect::<Vec<_>>() {
if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.end - 1].kind {
if metadata.noun.is_none()
&& metadata.adjective.is_none()
&& metadata.adverb.is_none()
{
metadata.noun = Some(NounData::default());
metadata.adjective = Some(AdjectiveData::default());
if let Some(inner) = &mut found_meta {
inner.pos_tag = token_tags[i];
inner.np_member = Some(np_flags[i]);
}
metadata.verb = None;
*meta = found_meta;
i += 1;
} else if !token.kind.is_whitespace() {
i += 1;
}
}
}
@ -331,6 +257,40 @@ impl Document {
self.tokens.iter()
}
pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
fn is_np_member(t: &Token) -> bool {
t.kind
.as_word()
.and_then(|x| x.as_ref())
.and_then(|w| w.np_member)
.unwrap_or(false)
}
fn trim(slice: &[Token]) -> &[Token] {
let mut start = 0;
let mut end = slice.len();
while start < end && slice[start].kind.is_whitespace() {
start += 1;
}
while end > start && slice[end - 1].kind.is_whitespace() {
end -= 1;
}
&slice[start..end]
}
self.tokens
.as_slice()
.split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
.filter_map(|s| {
let s = trim(s);
if s.iter().any(is_np_member) {
Some(s)
} else {
None
}
})
}
/// Get an iterator over all the tokens contained in the document.
pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
self.tokens().map(|token| token.to_fat(&self.source))

View file

@ -12,13 +12,9 @@ pub(crate) fn is_content_word(tok: &Token, src: &[char]) -> bool {
};
tok.span.len() > 1
&& (meta.is_noun() || meta.is_adjective())
&& (meta.is_noun() || meta.is_adjective() || meta.is_verb() || meta.is_adverb())
&& !meta.determiner
&& (!meta.preposition || tok.span.get_content_string(src).to_lowercase() == "bar")
&& !meta.is_adverb()
&& !meta.is_conjunction()
&& !meta.is_pronoun()
&& !meta.is_auxiliary_verb()
}
pub(crate) fn predicate(closed: Option<&WordMetadata>, open: Option<&WordMetadata>) -> bool {

View file

@ -1,9 +1,15 @@
use harper_brill::UPOS;
use crate::expr::All;
use crate::expr::Expr;
use crate::expr::SequenceExpr;
use crate::patterns::NominalPhrase;
use crate::patterns::Pattern;
use crate::patterns::UPOSSet;
use crate::patterns::WordSet;
use crate::{
Token,
linting::{ExprLinter, Lint, LintKind, Suggestion},
patterns::WordSet,
};
pub struct ItsContraction {
@ -12,14 +18,22 @@ pub struct ItsContraction {
impl Default for ItsContraction {
fn default() -> Self {
let its = WordSet::new(&["its"]);
let verbs = WordSet::new(&["had", "been", "got"]);
let pattern = SequenceExpr::default()
.then(its)
let positive = SequenceExpr::default()
.t_aco("its")
.then_whitespace()
.then(verbs);
.then(UPOSSet::new(&[UPOS::VERB, UPOS::AUX]));
let exceptions = SequenceExpr::default()
.then_anything()
.then_anything()
.then(WordSet::new(&["own", "intended"]));
let inverted = SequenceExpr::default().if_not_then_step_one(exceptions);
let expr = All::new(vec![Box::new(positive), Box::new(inverted)]);
Self {
expr: Box::new(pattern),
expr: Box::new(expr),
}
}
}
@ -32,6 +46,13 @@ impl ExprLinter for ItsContraction {
fn match_to_lint(&self, toks: &[Token], source: &[char]) -> Option<Lint> {
let offender = toks.first()?;
let offender_chars = offender.span.get_content(source);
if !toks.get(2)?.kind.is_upos(UPOS::AUX)
&& NominalPhrase.matches(&toks[2..], source).is_some()
{
return None;
}
Some(Lint {
span: offender.span,
lint_kind: LintKind::WordChoice,
@ -39,7 +60,8 @@ impl ExprLinter for ItsContraction {
Suggestion::replace_with_match_case_str("it's", offender_chars),
Suggestion::replace_with_match_case_str("it has", offender_chars),
],
message: "Use `it's` (short for `it has`) here, not the possessive `its`.".to_owned(),
message: "Use `it's` (short for `it has` or `it is`) here, not the possessive `its`."
.to_owned(),
priority: 54,
})
}
@ -98,4 +120,13 @@ mod tests {
0,
);
}
#[test]
fn ignore_coroutine() {
assert_lint_count(
"Launch each task within its own child coroutine.",
ItsContraction::default(),
0,
);
}
}

View file

@ -54,8 +54,7 @@ impl ThenThan {
// TODO: This can be simplified or eliminated when the adjective improvements make it into the affix system.
fn is_comparative_adjective(tok: &Token, source: &[char]) -> bool {
tok.kind
.is_adjective()
(tok.kind.is_adjective() || tok.kind.is_adverb())
.then(|| tok.span.get_content(source))
.is_some_and(|src| {
// Regular comparative form?

View file

@ -13,6 +13,7 @@ mod indefinite_article;
mod inflection_of_be;
mod invert;
mod nominal_phrase;
mod upos_set;
mod whitespace_pattern;
mod within_edit_distance;
mod word;
@ -24,6 +25,7 @@ pub use indefinite_article::IndefiniteArticle;
pub use inflection_of_be::InflectionOfBe;
pub use invert::Invert;
pub use nominal_phrase::NominalPhrase;
pub use upos_set::UPOSSet;
pub use whitespace_pattern::WhitespacePattern;
pub use within_edit_distance::WithinEditDistance;
pub use word::Word;

View file

@ -0,0 +1,30 @@
use harper_brill::UPOS;
use smallvec::{SmallVec, ToSmallVec};
use crate::Token;
use super::Pattern;
pub struct UPOSSet {
allowed_tags: SmallVec<[UPOS; 10]>,
}
impl UPOSSet {
pub fn new(allowed: &[UPOS]) -> Self {
Self {
allowed_tags: allowed.to_smallvec(),
}
}
}
impl Pattern for UPOSSet {
fn matches(&self, tokens: &[Token], _source: &[char]) -> Option<usize> {
tokens.first()?.kind.as_word()?.as_ref().and_then(|w| {
if self.allowed_tags.contains(&(w.pos_tag?)) {
Some(1)
} else {
None
}
})
}
}

View file

@ -1,3 +1,4 @@
use harper_brill::UPOS;
use is_macro::Is;
use serde::{Deserialize, Serialize};
@ -447,4 +448,12 @@ impl TokenKind {
pub fn is_whitespace(&self) -> bool {
matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
}
pub fn is_upos(&self, upos: UPOS) -> bool {
let Some(Some(meta)) = self.as_word() else {
return false;
};
meta.pos_tag == Some(upos)
}
}

View file

@ -1,3 +1,4 @@
use harper_brill::UPOS;
use is_macro::Is;
use paste::paste;
use serde::{Deserialize, Serialize};
@ -32,6 +33,10 @@ pub struct WordMetadata {
pub common: bool,
#[serde(default = "default_none")]
pub derived_from: Option<WordId>,
/// Generated by a chunker
pub np_member: Option<bool>,
/// Generated by a POS tagger
pub pos_tag: Option<UPOS>,
}
/// Needed for `serde`
@ -120,6 +125,180 @@ impl WordMetadata {
preposition: self.preposition || other.preposition,
common: self.common || other.common,
derived_from: self.derived_from.or(other.derived_from),
pos_tag: self.pos_tag.or(other.pos_tag),
np_member: self.np_member.or(other.np_member),
}
}
/// Given a UPOS tag, discard any metadata that would disagree with the given POS tag.
/// For example, if the metadata suggests a word could either be a noun or an adjective, and we
/// provide a [`UPOS::NOUN`], this function will remove the adjective data.
///
/// Additionally, if the metadata does not currently declare the potential of the word to be
/// the specific POS, it becomes so. That means if we provide a [`UPOS::ADJ`] to the function
/// for a metadata whose `Self::adjective = None`, it will become `Some`.
pub fn enforce_pos_exclusivity(&mut self, pos: &UPOS) {
use UPOS::*;
match pos {
NOUN => {
if let Some(noun) = self.noun {
self.noun = Some(NounData {
is_proper: Some(false),
..noun
})
} else {
self.noun = Some(NounData {
is_proper: Some(false),
is_plural: None,
is_possessive: None,
})
}
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
PROPN => {
if let Some(noun) = self.noun {
self.noun = Some(NounData {
is_proper: Some(true),
..noun
})
} else {
self.noun = Some(NounData {
is_proper: Some(true),
is_plural: None,
is_possessive: None,
})
}
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
PRON => {
if self.pronoun.is_none() {
self.pronoun = Some(PronounData::default())
}
self.noun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
VERB => {
if let Some(verb) = self.verb {
self.verb = Some(VerbData {
is_auxiliary: Some(false),
..verb
})
} else {
self.verb = Some(VerbData {
is_auxiliary: Some(false),
..Default::default()
})
}
self.noun = None;
self.pronoun = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
AUX => {
if let Some(verb) = self.verb {
self.verb = Some(VerbData {
is_auxiliary: Some(true),
..verb
})
} else {
self.verb = Some(VerbData {
is_auxiliary: Some(true),
..Default::default()
})
}
self.noun = None;
self.pronoun = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
ADJ => {
if self.adjective.is_none() {
self.adjective = Some(AdjectiveData::default())
}
self.noun = None;
self.pronoun = None;
self.verb = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
ADV => {
if self.adverb.is_none() {
self.adverb = Some(AdverbData::default())
}
self.noun = None;
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.conjunction = None;
self.determiner = false;
self.preposition = false;
}
ADP => {
self.noun = None;
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.determiner = false;
self.preposition = true;
}
DET => {
self.noun = None;
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.preposition = false;
self.determiner = true;
}
CCONJ | SCONJ => {
if self.conjunction.is_none() {
self.conjunction = Some(ConjunctionData::default())
}
self.noun = None;
self.pronoun = None;
self.verb = None;
self.adjective = None;
self.adverb = None;
self.determiner = false;
self.preposition = false;
}
_ => {}
}
}

View file

@ -47,6 +47,7 @@
//! - Determiners are denoted by `D`.
//! - Prepositions are denoted by `P`.
//! - Dialects are denoted by `Am`, `Br`, `Ca`, or `Au`.
//! - Noun phrase membership is denoted by `+`
//!
//! The tagger supports uncertainty, so a single word can be e.g. both a
//! noun and a verb. This is denoted by a `/` between the tags.
@ -146,6 +147,8 @@ fn format_word_tag(word: &WordMetadata) -> String {
}
});
add_switch(&mut tags, word.np_member, "+", "");
if tags.is_empty() {
String::from("W?")
} else {

View file

@ -85,7 +85,7 @@ create_test!(pr_452.md, 2, Dialect::American);
create_test!(hex_basic_clean.md, 0, Dialect::American);
create_test!(hex_basic_dirty.md, 1, Dialect::American);
create_test!(misc_closed_compound_clean.md, 0, Dialect::American);
create_test!(yogurt_british_clean.md, 0, Dialect::British);
create_test!(yogurt_british_clean.md, 1, Dialect::British);
// Make sure it doesn't panic
create_test!(lukas_homework.md, 3, Dialect::American);

View file

@ -456,6 +456,15 @@ Message: |
Lint: Capitalization (31 priority)
Message: |
226 | himself as he came, “Oh! the Duchess, the Duchess! Oh! wont she be savage if
| ^~~ This sentence does not start with a capital letter
Suggest:
- Replace with: “The”
Lint: Capitalization (31 priority)
Message: |
226 | himself as he came, “Oh! the Duchess, the Duchess! Oh! wont she be savage if

View file

@ -209,6 +209,24 @@ Suggest:
Lint: Capitalization (31 priority)
Message: |
340 | on the left, on the right, on the side, on the bottom.
| ^~ This sentence does not start with a capital letter
Suggest:
- Replace with: “On”
Lint: Capitalization (31 priority)
Message: |
342 | on a bus, on a train, on a plane, on a ferry, on a yacht.
| ^~ This sentence does not start with a capital letter
Suggest:
- Replace with: “On”
Lint: Miscellaneous (31 priority)
Message: |
343 | All of the responsibility is on him.

View file

@ -204,6 +204,16 @@ Message: |
Lint: WordChoice (63 priority)
Message: |
89 | third Class at the Expiration of the sixth Year, so that one third may be
| ^~~~~~ Did you mean the closed compound noun “maybe”?
90 | chosen every second Year; and when vacancies happen in the representation of
Suggest:
- Replace with: “maybe”
Lint: Readability (127 priority)
Message: |
96 | No Person shall be a Senator who shall not have attained to the Age of thirty
@ -1541,6 +1551,16 @@ Message: |
Lint: WordChoice (63 priority)
Message: |
658 | questioned. But neither the United States nor any State shall assume or pay any
659 | debt or obligation incurred in aid of insurrection or rebellion against the
| ^~~~~~~ Did you mean the closed compound noun “debtor”?
Suggest:
- Replace with: “debtor”
Lint: Spelling (63 priority)
Message: |
663 | ## Article. V.

View file

@ -1949,6 +1949,16 @@ Suggest:
Lint: WordChoice (63 priority)
Message: |
1531 | puppyish, convivial way, girls were swooning backward playfully into mens arms,
1532 | even into groups, knowing that some one would arrest their falls—but no one
| ^~~~~~~~ Did you mean the closed compound noun “someone”?
Suggest:
- Replace with: “someone”
Lint: Miscellaneous (31 priority)
Message: |
1531 | puppyish, convivial way, girls were swooning backward playfully into mens arms,
@ -6441,6 +6451,16 @@ Suggest:
Lint: WordChoice (63 priority)
Message: |
5181 | easier, surer way of finding out what he wanted to know. By half-past two he was
5182 | in West Egg, where he asked some one the way to Gatsbys house. So by that time
| ^~~~~~~~ Did you mean the closed compound noun “someone”?
Suggest:
- Replace with: “someone”
Lint: Miscellaneous (31 priority)
Message: |
5181 | easier, surer way of finding out what he wanted to know. By half-past two he was
@ -7123,6 +7143,16 @@ Suggest:
Lint: WordChoice (63 priority)
Message: |
5642 | message or a flower. Dimly I heard some one murmur “Blessed are the dead that
| ^~~~~~~~ Did you mean the closed compound noun “someone”?
5643 | the rain falls on,” and then the owl-eyed man said “Amen to that,” in a brave
Suggest:
- Replace with: “someone”
Lint: Miscellaneous (31 priority)
Message: |
5642 | message or a flower. Dimly I heard some one murmur “Blessed are the dead that
@ -7462,6 +7492,16 @@ Suggest:
Lint: WordChoice (54 priority)
Message: |
5814 | green breast of the new world. Its vanished trees, the trees that had made way
| ^~~ Use `it's` (short for `it has` or `it is`) here, not the possessive `its`.
Suggest:
- Replace with: “It's”
- Replace with: “It has”
Lint: Readability (127 priority)
Message: |
5814 | green breast of the new world. Its vanished trees, the trees that had made way

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -7,434 +7,434 @@
> -->
# Unlintable Unlintable
> Part - of - speech tagging
# Unlintable NSg/V/J . P . NSg/V NSg/V
# Unlintable NSg/V/J . P . NSg/V+ NSg/V
>
#
> In corpus linguistics , part - of - speech tagging ( POS tagging or PoS tagging or
# NPrSg/J/P NSg NSg . NSg/V/J . P . NSg/V NSg/V . NSg NSg/V NPrSg/C NSg NSg/V NPrSg/C
> POST ) , also called grammatical tagging is the process of marking up a word in a
# NPrSg/V/P . . W? V/J J NSg/V VL D NSg P NSg/V NSg/V/J/P D/P NSg P D/P
> text ( corpus ) as corresponding to a particular part of speech , based on both its
# NSg . NSg . NSg/R NSg/V/J P D/P NSg/J NSg/V/J P NSg/V . V/J J/P I/C ISg/D
> definition and its context . A simplified form of this is commonly taught to
# NSg V/C ISg/D NSg . D/P J NSg/V P I/D VL R V P
> school - age children , in the identification of words as nouns , verbs , adjectives ,
# NSg/V . NSg/V NPl . P D NSg P NPl/V NSg/R NPl/V . NPl/V . NPl/V .
> In corpus linguistics , part - of - speech tagging ( POS tagging or PoS tagging or
# NPrSg/J/P NSg+ NSg . NSg/V/J . P . NSg/V NSg/V . NSg+ NSg/V NPrSg/C NSg+ NSg/V NPrSg/C
> POST ) , also called grammatical tagging is the process of marking up a word in a
# NPrSg/V/P+ . . W? V/J J NSg/V VL D NSg/V P NSg/V NSg/V/J/P D/P NSg/V NPrSg/J/P D/P
> text ( corpus ) as corresponding to a particular part of speech , based on both its
# NSg/V . NSg+ . NSg/R NSg/V/J P D/P NSg/J NSg/V/J P NSg/V+ . V/J J/P I/C ISg/D+
> definition and its context . A simplified form of this is commonly taught to
# NSg V/C ISg/D+ NSg/V+ . D/P V/J NSg/V P I/D+ VL R V P
> school - age children , in the identification of words as nouns , verbs , adjectives ,
# NSg/V . NSg/V NPl . NPrSg/J/P D NSg P NPl/V+ NSg/R NPl/V . NPl/V+ . NPl/V .
> adverbs , etc.
# NPl/V . W?
>
#
> Once performed by hand , POS tagging is now done in the context of computational
# NSg/C V/J NSg/J/P NSg/V . NSg NSg/V VL NPrSg/V/J/C NSg/V/J P D NSg P J
> linguistics , using algorithms which associate discrete terms , as well as hidden
# NSg . V NPl I/C NSg/V/J J NPl/V . NSg/R NSg/V/J NSg/R V/J
> parts of speech , by a set of descriptive tags . POS - tagging algorithms fall into
# NPl/V P NSg/V . P D/P NPrSg/J P NSg/J NPl/V . NSg . NSg/V NPl NSg/V P
> two distinctive groups : rule - based and stochastic . E. Brill's tagger , one of the
# NSg NSg/J NPl/V . NSg/V . V/J V/C J . ? ? NSg . NSg/I/V/J P D
> first and most widely used English POS - taggers , employs rule - based algorithms .
# NSg/J V/C NSg/I/J R V/J NPrSg/V/J NSg . NPl . NPl/V NSg/V . V/J NPl .
> Once performed by hand , POS tagging is now done in the context of computational
# NSg/C V/J NSg/J/P NSg/V+ . NSg+ NSg/V VL NPrSg/V/J/C NSg/V/J NPrSg/J/P D NSg/V P J+
> linguistics , using algorithms which associate discrete terms , as well as hidden
# NSg+ . V NPl+ I/C+ NSg/V/J+ J NPl/V+ . NSg/R NSg/V/J NSg/R V/J
> parts of speech , by a set of descriptive tags . POS - tagging algorithms fall into
# NPl/V P NSg/V+ . NSg/J/P D/P NPrSg/V/J P NSg/J+ NPl/V+ . NSg+ . NSg/V NPl NSg/V P
> two distinctive groups : rule - based and stochastic . E. Brill's tagger , one of the
# NSg NSg/J NPl/V+ . NSg/V+ . V/J+ V/C+ J+ . ? ? NSg . NSg/I/V/J P D
> first and most widely used English POS - taggers , employs rule - based algorithms .
# NSg/V/J V/C NSg/I/J R V/J NPrSg/V/J+ NSg+ . NPl . NPl/V NSg/V+ . V/J NPl+ .
>
#
> Principle
# NSg/V
>
#
> Part - of - speech tagging is harder than just having a list of words and their
# NSg/V/J . P . NSg/V NSg/V VL J C/P V/J V D/P NSg P NPl/V V/C D
> parts of speech , because some words can represent more than one part of speech
# NPl P NSg/V . C/P I/J/R NPl/V NPrSg/VX V NPrSg/I/V/J C/P NSg/I/V/J NSg/V/J P NSg/V
> at different times , and because some parts of speech are complex . This is not
# NSg/P NSg/J NPl/V . V/C C/P I/J/R NPl/V P NSg/V V NSg/V/J . I/D VL NSg/C
> Part - of - speech tagging is harder than just having a list of words and their
# NSg/V/J . P . NSg/V NSg/V VL J C/P V/J V D/P NSg/V P NPl/V V/C D+
> parts of speech , because some words can represent more than one part of speech
# NPl/V P NSg/V+ . C/P I/J/R+ NPl/V+ NPrSg/VX V NPrSg/I/V/J C/P NSg/I/V/J NSg/V/J P NSg/V+
> at different times , and because some parts of speech are complex . This is not
# NSg/P NSg/J+ NPl/V+ . V/C C/P I/J/R NPl/V P NSg/V+ V+ NSg/V/J+ . I/D+ VL NSg/C
> rare — in natural languages ( as opposed to many artificial languages ) , a large
# NSg/V/J . NPrSg/J/P NSg/J NPl/V . NSg/R V/J P N/I/J/D J NPl/V . . D/P NSg/J
> percentage of word - forms are ambiguous . For example , even " dogs " , which is
# NSg P NSg/V . NPl/V V J . C/P NSg/V . NSg/V/J . NPl/V . . I/C VL
> usually thought of as just a plural noun , can also be a verb :
# R NSg/V P NSg/R V/J D/P NSg/J NSg/V . NPrSg/VX W? NSg/VX D/P NSg .
# NSg/V/J . NPrSg/J/P NSg/J NPl/V+ . NSg/R V/J P N/I/J/D J NPl/V+ . . D/P NSg/J
> percentage of word - forms are ambiguous . For example , even " dogs " , which is
# NSg P NSg/V+ . NPl/V+ V+ J+ . C/P NSg/V+ . NSg/V/J . NPl/V+ . . I/C+ VL
> usually thought of as just a plural noun , can also be a verb :
# R NSg/V P NSg/R V/J D/P+ NSg/J+ NSg/V+ . NPrSg/VX W? NSg/VX D/P NSg/V+ .
>
#
> The sailor dogs the hatch .
# D NSg NPl/V D NSg .
> The sailor dogs the hatch .
# D+ NSg NPl/V D NSg/V+ .
>
#
> Correct grammatical tagging will reflect that " dogs " is here used as a verb , not
# NSg/V/J J NSg/V NPrSg/VX V N/I/C/D . NPl/V . VL NSg/J/R V/J NSg/R D/P NSg . NSg/C
> as the more common plural noun . Grammatical context is one way to determine
# NSg/R D NPrSg/I/J NSg/V/J NSg/J NSg/V . J NSg/V VL NSg/I/V/J NSg/J P V
> this ; semantic analysis can also be used to infer that " sailor " and " hatch "
# I/D . NSg/J NSg NPrSg/VX W? NSg/VX V/J P J N/I/C/D . NSg . V/C . NSg/V .
> implicate " dogs " as 1 ) in the nautical context and 2 ) an action applied to the
# NSg/V . NPl/V . NSg/R # . P D J NSg/V V/C # . D/P NSg/J V/J P D
> object " hatch " ( in this context , " dogs " is a nautical term meaning " fastens ( a
# NSg . NSg/V . . P I/D NSg/V . . NPl/V . VL D/P J NSg/V/J NSg/V/J . V . D/P
> watertight door ) securely " ) .
# J NSg/V . R . . .
> Correct grammatical tagging will reflect that " dogs " is here used as a verb , not
# NSg/V/J+ J NSg/V NPrSg/VX V N/I/C/D+ . NPl/V+ . VL NSg/J/R V/J NSg/R D/P+ NSg/V+ . NSg/C
> as the more common plural noun . Grammatical context is one way to determine
# NSg/R D NPrSg/I/V/J NSg/V/J NSg/J NSg/V+ . J NSg/V+ VL NSg/I/V/J NSg/J+ P V
> this ; semantic analysis can also be used to infer that " sailor " and " hatch "
# I/D+ . NSg/J NSg+ NPrSg/VX W? NSg/VX V/J P J N/I/C/D+ . NSg+ . V/C . NSg/V .
> implicate " dogs " as 1 ) in the nautical context and 2 ) an action applied to the
# NSg/V . NPl/V . NSg/R # . NPrSg/J/P D+ J+ NSg/V+ V/C # . D/P NSg/V/J+ V/J P D
> object " hatch " ( in this context , " dogs " is a nautical term meaning " fastens ( a
# NSg/V+ . NSg/V . . NPrSg/J/P I/D+ NSg/V+ . . NPl/V+ . VL D/P J NSg/V/J+ NSg/V/J+ . V . D/P
> watertight door ) securely " ) .
# J NSg/V+ . R . . .
>
#
> Tag sets
# NSg/V NPl/V
> Tag sets
# NSg/V+ NPl/V
>
#
> Schools commonly teach that there are 9 parts of speech in English : noun , verb ,
# NPl/V R NSg/V N/I/C/D W? V # NPl/V P NSg/V NPrSg/J/P NPrSg/V/J . NSg/V . NSg/V .
> Schools commonly teach that there are 9 parts of speech in English : noun , verb ,
# NPl/V+ R NSg/V N/I/C/D + V # NPl/V P NSg/V+ NPrSg/J/P NPrSg/V/J . NSg/V+ . NSg/V+ .
> article , adjective , preposition , pronoun , adverb , conjunction , and interjection .
# NSg/V . NSg/V/J . NSg/V . NSg/V . NSg/V . NSg/V . V/C NSg .
# NSg/V+ . NSg/V/J+ . NSg/V . NSg/V+ . NSg/V+ . NSg/V+ . V/C NSg+ .
> However , there are clearly many more categories and sub - categories . For nouns ,
# C . W? V R N/I/J/D NPrSg/I/V/J NPl V/C NSg/V/P . NPl . C/P NPl/V .
> the plural , possessive , and singular forms can be distinguished . In many
# D NSg/J . NSg/J . V/C NSg/J NPl/V NPrSg/VX NSg/VX V/J . P N/I/J/D
> languages words are also marked for their " case " ( role as subject , object ,
# NPl/V NPl/V V W? V/J C/P D . NPrSg/V . . NSg NSg/R NSg/V/J . NSg/V .
> etc. ) , grammatical gender , and so on ; while verbs are marked for tense , aspect ,
# W? . . J NSg/V/J . V/C NSg/I/J/C J/P . NSg/V/C/P NPl/V V V/J C/P NSg/V/J . NSg/V .
> and other things . In some tagging systems , different inflections of the same
# V/C NSg/V/J NPl/V . NPrSg/J/P I/J/R NSg/V NPl . NSg/J NPl P D I/J
> root word will get different parts of speech , resulting in a large number of
# NPrSg/V NSg/V NPrSg/VX NSg/V NSg/J NPl/V P NSg/V . V P D/P NSg/J NSg/V/J P
> tags . For example , NN for singular common nouns , NNS for plural common nouns , NP
# NPl/V . C/P NSg/V . ? C/P NSg/J NSg/V/J NPl/V . ? C/P NSg/J NSg/V/J NPl/V . NPrSg
> for singular proper nouns ( see the POS tags used in the Brown Corpus ) . Other
# C/P NSg/J NSg/J NPl/V . NSg/V D NSg NPl/V V/J P D NPrSg/J NSg . . NSg/V/J
> tagging systems use a smaller number of tags and ignore fine differences or
# NSg/V NPl NSg/V D/P J NSg/V/J P NPl/V V/C V NSg/V/J NSg/V NPrSg/C
> model them as features somewhat independent from part - of - speech .
# NSg/V/J N/I NSg/R NPl/V NSg/I NSg/J P NSg/V/J . P . NSg/V .
# C . + V R N/I/J/D NPrSg/I/V/J NPl+ V/C NSg/V/P . NPl . C/P NPl/V .
> the plural , possessive , and singular forms can be distinguished . In many
# D NSg/J . NSg/J . V/C NSg/J NPl/V+ NPrSg/VX+ NSg/VX+ V/J+ . NPrSg/J/P N/I/J/D+
> languages words are also marked for their " case " ( role as subject , object ,
# NPl/V+ NPl/V+ V W? V/J C/P D+ . NPrSg/V+ . . NSg NSg/R NSg/V/J . NSg/V+ .
> etc. ) , grammatical gender , and so on ; while verbs are marked for tense , aspect ,
# + . . J+ NSg/V/J+ . V/C NSg/I/J/C J/P . NSg/V/C/P NPl/V+ V V/J C/P NSg/V/J . NSg/V+ .
> and other things . In some tagging systems , different inflections of the same
# V/C NSg/V/J+ NPl/V+ . NPrSg/J/P I/J/R+ NSg/V NPl+ . NSg/J NPl P D+ I/J+
> root word will get different parts of speech , resulting in a large number of
# NPrSg/V+ NSg/V+ NPrSg/VX NSg/V NSg/J NPl/V P NSg/V+ . V NPrSg/J/P D/P NSg/J NSg/V/J P+
> tags . For example , NN for singular common nouns , NNS for plural common nouns , NP
# NPl/V+ . C/P NSg/V+ . ? C/P NSg/J NSg/V/J+ NPl/V . ? C/P NSg/J NSg/V/J+ NPl/V . NPrSg
> for singular proper nouns ( see the POS tags used in the Brown Corpus ) . Other
# C/P NSg/J NSg/J NPl/V . NSg/V D+ NSg+ NPl/V+ V/J NPrSg/J/P D+ NPrSg/V/J+ NSg+ . . NSg/V/J
> tagging systems use a smaller number of tags and ignore fine differences or
# NSg/V NPl+ NSg/V D/P J NSg/V/J P NPl/V+ V/C V NSg/V/J NSg/V NPrSg/C
> model them as features somewhat independent from part - of - speech .
# NSg/V/J+ N/I+ NSg/R+ NPl/V+ NSg/I NSg/J P NSg/V/J . P . NSg/V+ .
>
#
> In part - of - speech tagging by computer , it is typical to distinguish from 50 to
# NPrSg/J/P NSg/V/J . P . NSg/V NSg/V NSg/J/P NSg/V . NPrSg/ISg VL NSg/J P V P # P
> 150 separate parts of speech for English . Work on stochastic methods for tagging
# # NSg/V/J NPl/V P NSg/V C/P NPrSg/V/J . NSg/V J/P J NPl/V C/P NSg/V
> In part - of - speech tagging by computer , it is typical to distinguish from 50 to
# NPrSg/J/P NSg/V/J . P . NSg/V NSg/V NSg/J/P NSg/V+ . NPrSg/ISg+ VL NSg/J P V P # P
> 150 separate parts of speech for English . Work on stochastic methods for tagging
# # NSg/V/J NPl/V P NSg/V C/P NPrSg/V/J+ . NSg/V J/P J NPl/V C/P NSg/V
> Koine Greek ( DeRose 1990 ) has used over 1 , 000 parts of speech and found that
# ? NPrSg/V/J . ? # . V V/J NSg/V/J/P # . # NPl/V P NSg/V V/C NSg/V N/I/C/D
> about as many words were ambiguous in that language as in English . A
# J/P NSg/R N/I/J/D NPl/V NSg/V J P N/I/C/D NSg/V NSg/R NPrSg/J/P NPrSg/V/J . D/P
> morphosyntactic descriptor in the case of morphologically rich languages is
# ? NSg P D NPrSg P ? NPrSg/V/J NPl/V VL
> commonly expressed using very short mnemonics , such as Ncmsan for Category = Noun ,
# R V/J V J NPrSg/V/J/P NPl . NSg/I NSg/R ? C/P NSg . NSg/V .
# ? NPrSg/V/J . ? # . V V/J NSg/V/J/P # . # NPl/V P NSg/V+ V/C NSg/V N/I/C/D
> about as many words were ambiguous in that language as in English . A
# J/P NSg/R N/I/J/D+ NPl/V+ NSg/V J NPrSg/J/P N/I/C/D+ NSg/V+ NSg/R NPrSg/J/P NPrSg/V/J+ . D/P
> morphosyntactic descriptor in the case of morphologically rich languages is
# ? NSg NPrSg/J/P D NPrSg/V P ? NPrSg/V/J NPl/V+ VL
> commonly expressed using very short mnemonics , such as Ncmsan for Category = Noun ,
# R V/J V J NPrSg/V/J/P+ NPl . NSg/I NSg/R ? C/P NSg . NSg/V+ .
> Type = common , Gender = masculine , Number = singular , Case = accusative , Animate
# NSg/V . NSg/V/J . NSg/V/J . NSg/J . NSg/V/J . NSg/J . NPrSg/V . NSg/J . V/J
> = no .
# . NPrSg/P .
>
#
> The most popular " tag set " for POS tagging for American English is probably the
# D NSg/I/J NSg/J . NSg/V NPrSg/V/J . C/P NSg NSg/V C/P NPrSg/J NPrSg/V/J VL R D
> Penn tag set , developed in the Penn Treebank project . It is largely similar to
# NPr NSg/V NPrSg/V/J . V/J P D NPr ? NSg/V . NPrSg/ISg VL R NSg/J P
> the earlier Brown Corpus and LOB Corpus tag sets , though much smaller . In
# D J NPrSg/V/J NSg V/C NSg/V NSg NSg/V NPl/V . V/C N/I/J J . NPrSg/J/P
> Europe , tag sets from the Eagles Guidelines see wide use and include versions
# NPr . NSg/V NPl/V P D NPl NPl NSg/V NSg/J NSg/V V/C NSg/V NPl/V
> The most popular " tag set " for POS tagging for American English is probably the
# D NSg/I/J NSg/J . NSg/V+ NPrSg/V/J . C/P NSg+ NSg/V C/P NPrSg/J NPrSg/V/J+ VL R D+
> Penn tag set , developed in the Penn Treebank project . It is largely similar to
# NPr+ NSg/V+ NPrSg/V/J . V/J NPrSg/J/P D+ NPr+ ? NSg/V+ . NPrSg/ISg+ VL R NSg/J P
> the earlier Brown Corpus and LOB Corpus tag sets , though much smaller . In
# D J NPrSg/V/J NSg V/C NSg/V NSg+ NSg/V+ NPl/V . V/C N/I/J+ J+ . NPrSg/J/P
> Europe , tag sets from the Eagles Guidelines see wide use and include versions
# NPr+ . NSg/V+ NPl/V P D+ NPl/V+ NPl+ NSg/V NSg/J NSg/V+ V/C NSg/V NPl/V
> for multiple languages .
# C/P NSg/J NPl/V .
# C/P NSg/J+ NPl/V+ .
>
#
> POS tagging work has been done in a variety of languages , and the set of POS
# NSg NSg/V NSg/V V NSg/V NSg/V/J P D/P NSg P NPl/V . V/C D NPrSg/J P NSg
> tags used varies greatly with language . Tags usually are designed to include
# NPl/V V/J NPl/V R P NSg/V . NPl/V R V V/J P NSg/V
> POS tagging work has been done in a variety of languages , and the set of POS
# NSg+ NSg/V NSg/V+ V NSg/V NSg/V/J NPrSg/J/P D/P NSg P NPl/V+ . V/C D NPrSg/V/J P NSg+
> tags used varies greatly with language . Tags usually are designed to include
# NPl/V+ V/J NPl/V R P NSg/V+ . NPl/V+ R V V/J P NSg/V
> overt morphological distinctions , although this leads to inconsistencies such as
# NSg/J J NPl . C I/D NPl/V P NPl NSg/I NSg/R
> case - marking for pronouns but not nouns in English , and much larger
# NPrSg/V . NSg/V C/P NPl/V NSg/C/P NSg/C NPl/V NPrSg/J/P NPrSg/V/J . V/C N/I/J J
> cross - language differences . The tag sets for heavily inflected languages such as
# NPrSg/V/J/P . NSg/V NSg/V . D NSg NPl/V C/P R V/J NPl/V NSg/I NSg/R
> Greek and Latin can be very large ; tagging words in agglutinative languages such
# NPrSg/V/J V/C NPrSg/J NPrSg/VX NSg/VX J NSg/J . NSg/V NPl/V NPrSg/J/P ? NPl/V NSg/I
> as Inuit languages may be virtually impossible . At the other extreme , Petrov et
# NSg/R NPrSg/J NPl/V NPrSg/VX NSg/VX R NSg/J . P D NSg/J NSg/J . ? ?
> al. have proposed a " universal " tag set , with 12 categories ( for example , no
# ? NSg/VX V/J D/P . NSg/J . NSg/V NPrSg/V/J . P # NPl . C/P NSg/V . NPrSg/P
> subtypes of nouns , verbs , punctuation , and so on ) . Whether a very small set of
# NPl P NPl/V . NPl/V . NSg . V/C NSg/I/J/C J/P . . I/C D/P J NPrSg/V/J NPrSg/V/J P
> very broad tags or a much larger set of more precise ones is preferable , depends
# J NSg/J NPl/V NPrSg/C D/P N/I/J J NPrSg/V/J P NPrSg/I/V/J V/J NPl/V VL W? . NPl/V
> on the purpose at hand . Automatic tagging is easier on smaller tag - sets .
# P D NSg NSg/P NSg/V . NSg/J NSg/V VL J J/P J NSg/V . NPl/V .
# NSg/J J+ NPl+ . C I/D+ NPl/V P NPl NSg/I NSg/R
> case - marking for pronouns but not nouns in English , and much larger
# NPrSg/V+ . NSg/V C/P NPl/V NSg/C/P NSg/C NPl/V NPrSg/J/P NPrSg/V/J+ . V/C N/I/J J
> cross - language differences . The tag sets for heavily inflected languages such as
# NPrSg/V/J/P+ . NSg/V+ NSg/V . D+ NSg/V+ NPl/V C/P R V/J NPl/V+ NSg/I NSg/R
> Greek and Latin can be very large ; tagging words in agglutinative languages such
# NPrSg/V/J V/C NPrSg/J NPrSg/VX NSg/VX J NSg/J . NSg/V NPl/V+ NPrSg/J/P ? NPl/V+ NSg/I
> as Inuit languages may be virtually impossible . At the other extreme , Petrov et
# NSg/R NPrSg/J NPl/V+ NPrSg/VX NSg/VX R+ NSg/J+ . NSg/P D+ NSg/V/J+ NSg/J . ? ?
> al. have proposed a " universal " tag set , with 12 categories ( for example , no
# ? NSg/VX V/J D/P . NSg/J . NSg/V+ NPrSg/V/J . P # NPl . C/P NSg/V+ . NPrSg/P
> subtypes of nouns , verbs , punctuation , and so on ) . Whether a very small set of
# NPl P NPl/V . NPl/V+ . NSg+ . V/C NSg/I/J/C J/P+ . . I/C D/P J NPrSg/V/J NPrSg/V/J P
> very broad tags or a much larger set of more precise ones is preferable , depends
# J NSg/J NPl/V NPrSg/C D/P N/I/J J NPrSg/V/J P NPrSg/I/V/J V/J NPl/V+ VL W? . NPl/V
> on the purpose at hand . Automatic tagging is easier on smaller tag - sets .
# J/P D+ NSg/V NSg/P NSg/V+ . NSg/J NSg/V VL J J/P J NSg/V+ . NPl/V+ .
>
#
> History
# NSg
>
#
> The Brown Corpus
# D NPrSg/J NSg
> The Brown Corpus
# D NPrSg/V/J+ NSg
>
#
> Research on part - of - speech tagging has been closely tied to corpus linguistics .
# NSg/V J/P NSg/V/J . P . NSg/V NSg/V V NSg/V R V/J P NSg NSg .
> The first major corpus of English for computer analysis was the Brown Corpus
# D NSg/J NPrSg/V/J NSg P NPrSg/V/J C/P NSg/V NSg V D NPrSg/J NSg
> developed at Brown University by Henry Kučera and W. Nelson Francis , in the
# V/J NSg/P NPrSg/V/J NSg NSg/J/P NPrSg ? V/C ? NPrSg NPr . P D
> mid - 1960s . It consists of about 1 , 000 , 000 words of running English prose text ,
# NSg/J/P . #d . NPrSg/ISg NPl/V P J/P # . # . # NPl/V P NSg/V/J/P NPrSg/V/J NSg/V NSg/V .
# NSg/V J/P NSg/V/J . P . NSg/V NSg/V V NSg/V R V/J P NSg NSg+ .
> The first major corpus of English for computer analysis was the Brown Corpus
# D NSg/V/J NPrSg/V/J NSg P NPrSg/V/J+ C/P NSg/V+ NSg+ V D NPrSg/V/J NSg
> developed at Brown University by Henry Kučera and W. Nelson Francis , in the
# V/J NSg/P NPrSg/V/J NSg NSg/J/P NPrSg+ ? V/C ? NPrSg+ NPr+ . NPrSg/J/P D
> mid - 1960s . It consists of about 1 , 000 , 000 words of running English prose text ,
# NSg/J/P+ . #d . NPrSg/ISg+ NPl/V P J/P # . # . # NPl/V P NSg/V/J/P NPrSg/V/J+ NSg/V NSg/V+ .
> made up of 500 samples from randomly chosen publications . Each sample is 2 , 000
# NSg/V NSg/V/J/P P # NPl/V P R V/J NPl . D NSg/V VL # . #
> or more words ( ending at the first sentence - end after 2 , 000 words , so that the
# NPrSg/C NPrSg/I/V/J NPl/V . NSg/V P D NSg/J NSg/V . NSg/V J/P # . # NPl/V . NSg/I/J/C N/I/C/D D
# NSg/V NSg/V/J/P P # NPl/V+ P R+ V/J NPl+ . D+ NSg/V+ VL # . #
> or more words ( ending at the first sentence - end after 2 , 000 words , so that the
# NPrSg/C NPrSg/I/V/J NPl/V+ . NSg/V NSg/P D NSg/V/J+ NSg/V+ . NSg/V J/P # . # NPl/V+ . NSg/I/J/C N/I/C/D D+
> corpus contains only complete sentences ) .
# NSg V W? NSg/V/J NPl/V . .
# NSg+ V W? NSg/V/J+ NPl/V+ . .
>
#
> The Brown Corpus was painstakingly " tagged " with part - of - speech markers over
# D NPrSg/J NSg V R . V/J . P NSg/V/J . P . NSg/V NPl/V NSg/V/J/P
> many years . A first approximation was done with a program by Greene and Rubin ,
# N/I/J/D NPl . D/P NSg/J NSg V NSg/V/J P D/P NPrSg NSg/J/P NPr V/C NPr .
> which consisted of a huge handmade list of what categories could co - occur at
# I/C V/J P D/P J NSg/J NSg/V P NSg/I NPl NSg/VX NPrSg/I/V . V NSg/P
> all . For example , article then noun can occur , but article then verb ( arguably )
# NSg/I/J/C . C/P NSg/V . NSg/V NSg/J/C NSg/V NPrSg/VX V . NSg/C/P NSg/V NSg/J/C NSg/V . R .
> cannot . The program got about 70 % correct . Its results were repeatedly reviewed
# NSg/V . D NPrSg V J/P # . NSg/V/J . ISg/D NPl NSg/V R V/J
> and corrected by hand , and later users sent in errata so that by the late 70 s
# V/C V/J NSg/J/P NSg/V . V/C J NPl NSg/V NPrSg/J/P NSg NSg/I/J/C N/I/C/D P D NSg/J # ?
> the tagging was nearly perfect ( allowing for some cases on which even human
# D NSg V R NSg/V/J . V C/P I/J/R NPl/V J/P I/C NSg/V/J NSg/V/J
> The Brown Corpus was painstakingly " tagged " with part - of - speech markers over
# D+ NPrSg/V/J NSg V R . V/J . P NSg/V/J . P . NSg/V NPl/V NSg/V/J/P
> many years . A first approximation was done with a program by Greene and Rubin ,
# N/I/J/D+ NPl+ . D/P+ NSg/V/J+ NSg+ V NSg/V/J P D/P NPrSg/V NSg/J/P NPr V/C NPr .
> which consisted of a huge handmade list of what categories could co - occur at
# I/C+ V/J P D/P J NSg/J NSg/V P NSg/I+ NPl+ NSg/VX NPrSg/I/V+ . V NSg/P+
> all . For example , article then noun can occur , but article then verb ( arguably )
# NSg/I/J/C . C/P NSg/V+ . NSg/V+ NSg/J/C NSg/V+ NPrSg/VX V . NSg/C/P NSg/V+ NSg/J/C NSg/V+ . R .
> cannot . The program got about 70 % correct . Its results were repeatedly reviewed
# NSg/V . D+ NPrSg/V+ V J/P # . NSg/V/J+ . ISg/D+ NPl/V+ NSg/V R V/J
> and corrected by hand , and later users sent in errata so that by the late 70 s
# V/C V/J NSg/J/P NSg/V+ . V/C J NPl+ NSg/V NPrSg/J/P NSg NSg/I/J/C N/I/C/D+ NSg/J/P D NSg/J # ?
> the tagging was nearly perfect ( allowing for some cases on which even human
# D NSg/V V R NSg/V/J . V C/P I/J/R NPl/V+ J/P I/C+ NSg/V/J NSg/V/J
> speakers might not agree ) .
# W? NSg/VX/J NSg/C V . .
# + NSg/VX/J NSg/C V . .
>
#
> This corpus has been used for innumerable studies of word - frequency and of
# I/D NSg V NSg/V V/J C/P J NPl/V P NSg/V . NSg V/C P
> part - of - speech and inspired the development of similar " tagged " corpora in many
# NSg/V/J . P . NSg/V V/C V/J D NSg P NSg/J . V/J . NPl P N/I/J/D
> other languages . Statistics derived by analyzing it formed the basis for most
# NSg/V/J NPl/V . NPl/V V/J NSg/J/P V NPrSg/ISg V/J D NSg C/P NSg/I/J
> later part - of - speech tagging systems , such as CLAWS and VOLSUNGA . However , by
# J NSg/V/J . P . NSg/V NSg/V NPl . NSg/I NSg/R NPl/V V/C ? . C . P
> this time ( 2005 ) it has been superseded by larger corpora such as the 100
# I/D NSg/V/J . # . NPrSg/ISg V NSg/V V/J NSg/J/P J NPl NSg/I NSg/R D #
> million word British National Corpus , even though larger corpora are rarely so
# N NSg/V NPrSg/J NSg/J NSg . NSg/V/J V/C J NPl V R NSg/I/J/C
> This corpus has been used for innumerable studies of word - frequency and of
# I/D+ NSg V NSg/V V/J C/P J NPl/V P NSg/V+ . NSg V/C P
> part - of - speech and inspired the development of similar " tagged " corpora in many
# NSg/V/J . P . NSg/V V/C V/J D NSg P NSg/J . V/J . NPl NPrSg/J/P N/I/J/D+
> other languages . Statistics derived by analyzing it formed the basis for most
# NSg/V/J+ NPl/V+ . NPl/V+ V/J NSg/J/P V NPrSg/ISg+ V/J D NSg C/P NSg/I/J
> later part - of - speech tagging systems , such as CLAWS and VOLSUNGA . However , by
# J NSg/V/J . P . NSg/V NSg/V NPl . NSg/I NSg/R NPl/V+ V/C ? . C . NSg/J/P
> this time ( 2005 ) it has been superseded by larger corpora such as the 100
# I/D+ NSg/V/J+ . # . NPrSg/ISg+ V NSg/V V/J NSg/J/P J NPl+ NSg/I NSg/R D #
> million word British National Corpus , even though larger corpora are rarely so
# N NSg/V+ NPrSg/J NSg/J+ NSg+ . NSg/V/J V/C J+ NPl+ V R NSg/I/J/C
> thoroughly curated .
# R V/J .
# R+ V/J+ .
>
#
> For some time , part - of - speech tagging was considered an inseparable part of
# C/P I/J/R NSg/V/J . NSg/V/J . P . NSg/V NSg/V V V/J D/P NSg/J NSg/V/J P
> natural language processing , because there are certain cases where the correct
# NSg/J NSg/V V . C/P W? V I/J NPl/V NSg/C D NSg/J
> natural language processing , because there are certain cases where the correct
# NSg/J+ NSg/V+ V+ . C/P + V I/J NPl/V+ NSg/C D NSg/V/J
> part of speech cannot be decided without understanding the semantics or even the
# NSg/V/J P NSg/V NSg/V NSg/VX NSg/V/J C/P NSg/V/J D NSg NPrSg/C NSg/V/J D
# NSg/V/J P NSg/V+ NSg/V NSg/VX NSg/V/J C/P NSg/V/J+ D+ NSg NPrSg/C NSg/V/J D
> pragmatics of the context . This is extremely expensive , especially because
# NPl P D NSg . I/D VL R J . R C/P
# NPl P D+ NSg/V+ . I/D+ VL R J . R C/P
> analyzing the higher levels is much harder when multiple part - of - speech
# V D J NPl/V VL N/I/J J NSg/I/C NSg/J NSg/V/J . P . NSg/V
> possibilities must be considered for each word .
# NPl NSg/V NSg/VX V/J C/P D NSg/V .
# V D+ J+ NPl/V+ VL N/I/J J NSg/I/C NSg/J NSg/V/J . P . NSg/V
> possibilities must be considered for each word .
# NPl NSg/V NSg/VX V/J C/P D+ NSg/V+ .
>
#
> Use of hidden Markov models
# NSg/V P V/J NPr NPl/V
# NSg/V P V/J NPr+ NPl/V
>
#
> In the mid - 1980s , researchers in Europe began to use hidden Markov models ( HMMs )
# P D NSg/J/P . #d . W? NPrSg/J/P NPr V P NSg/V V/J NPr NPl/V . ? .
> In the mid - 1980s , researchers in Europe began to use hidden Markov models ( HMMs )
# NPrSg/J/P D NSg/J/P . #d . W? NPrSg/J/P NPr+ V P NSg/V V/J NPr NPl/V+ . ? .
> to disambiguate parts of speech , when working to tag the Lancaster - Oslo - Bergen
# P V NPl/V P NSg/V . NSg/I/C V P NSg/V D NPr . NPr . NPr
> Corpus of British English . HMMs involve counting cases ( such as from the Brown
# NSg P NPrSg/J NPrSg/V/J . ? V V NPl/V . NSg/I NSg/R P D NPrSg/J
# P V NPl/V P NSg/V+ . NSg/I/C V P NSg/V D NPr . NPr+ . NPr
> Corpus of British English . HMMs involve counting cases ( such as from the Brown
# NSg P NPrSg/J+ NPrSg/V/J+ . ? V V NPl/V . NSg/I NSg/R P D+ NPrSg/V/J+
> Corpus ) and making a table of the probabilities of certain sequences . For
# NSg . V/C NSg/V D/P NSg P D NPl P I/J NPl/V . C/P
> example , once you've seen an article such as ' the ' , perhaps the next word is a
# NSg/V . NSg/C W? NSg/V D/P NSg NSg/I NSg/R . D . . NSg D NSg/J/P NSg/V VL D/P
> noun 40 % of the time , an adjective 40 % , and a number 20 % . Knowing this , a
# NSg # . P D NSg/J . D/P NSg/J # . . V/C D/P NSg/J # . . NSg/V/J/P I/D . D/P
> program can decide that " can " in " the can " is far more likely to be a noun than
# NPrSg NPrSg/VX V N/I/C/D . NPrSg/VX . NPrSg/J/P . D NPrSg . VL NSg/V/J NPrSg/I/V/J NSg/J P NSg/VX D/P NSg C/P
> a verb or a modal . The same method can , of course , be used to benefit from
# D/P NSg NPrSg/C D/P NSg/J . D I/J NSg/V NPrSg/VX . P NSg/V . NSg/VX V/J P NSg/V P
> knowledge about the following words .
# NSg/V J/P D NSg/J/P NPl/V .
# NSg+ . V/C NSg/V D/P NSg/V P D NPl P I/J+ NPl/V+ . C/P
> example , once you've seen an article such as ' the ' , perhaps the next word is a
# NSg/V+ . NSg/C W? NSg/V D/P NSg/V+ NSg/I NSg/R . D . . NSg D+ NSg/J/P+ NSg/V+ VL D/P
> noun 40 % of the time , an adjective 40 % , and a number 20 % . Knowing this , a
# NSg/V # . P D+ NSg/V/J+ . D/P+ NSg/V/J+ # . . V/C D/P+ NSg/V/J+ # . . NSg/V/J/P I/D+ . D/P+
> program can decide that " can " in " the can " is far more likely to be a noun than
# NPrSg/V+ NPrSg/VX V N/I/C/D+ . NPrSg/VX . NPrSg/J/P . D+ NPrSg/VX . VL NSg/V/J NPrSg/I/V/J NSg/J P NSg/VX D/P NSg/V C/P
> a verb or a modal . The same method can , of course , be used to benefit from
# D/P NSg/V NPrSg/C D/P+ NSg/J+ . D+ I/J+ NSg/V+ NPrSg/VX . P NSg/V+ . NSg/VX V/J P NSg/V P
> knowledge about the following words .
# NSg/V+ J/P D+ NSg/V/J/P+ NPl/V .
>
#
> More advanced ( " higher - order " ) HMMs learn the probabilities not only of pairs
# NPrSg/I/V/J V/J . . J . NSg/V . . ? NSg/V D NPl NSg/C W? P NPl/V
# NPrSg/I/V/J V/J . . J . NSg/V . . ? NSg/V D+ NPl+ NSg/C W? P NPl/V+
> but triples or even larger sequences . So , for example , if you've just seen a
# NSg/C/P NPl/V NPrSg/C NSg/V/J J NPl/V . NSg/I/J/C . C/P NSg/V . NSg/C W? V/J NSg/V D/P
> noun followed by a verb , the next item may be very likely a preposition ,
# NSg V/J P D/P NSg . D NSg/J/P NSg/V NPrSg/VX NSg/VX J NSg/J D/P NSg .
> article , or noun , but much less likely another verb .
# NSg/V . NPrSg/C NSg/V . NSg/C/P N/I/J V/J/C/P NSg/J I/D NSg/V .
# NSg/C/P NPl/V NPrSg/C NSg/V/J J NPl/V+ . NSg/I/J/C . C/P NSg/V+ . NSg/C W? V/J NSg/V D/P
> noun followed by a verb , the next item may be very likely a preposition ,
# NSg/V V/J NSg/J/P D/P+ NSg/V+ . D+ NSg/J/P+ NSg/V+ NPrSg/VX NSg/VX J NSg/J D/P NSg/V .
> article , or noun , but much less likely another verb .
# NSg/V+ . NPrSg/C NSg/V+ . NSg/C/P N/I/J V/J/C/P NSg/J+ I/D NSg/V .
>
#
> When several ambiguous words occur together , the possibilities multiply .
# NSg/I/C J/D J NPl/V V J . D NPl NSg/V .
> However , it is easy to enumerate every combination and to assign a relative
# C . NPrSg/ISg VL NSg/V/J P V D NSg V/C P NSg/V D/P NSg/J
> probability to each one , by multiplying together the probabilities of each
# NSg P D NSg/I/V/J . NSg/J/P V J D NPl P D
> choice in turn . The combination with the highest probability is then chosen . The
# NSg/J NPrSg/J/P NSg/V . D NSg P D W? NSg VL NSg/J/C V/J . D
> European group developed CLAWS , a tagging program that did exactly this and
# NSg/J NSg/V V/J NPl/V . D/P NSg NPrSg/V N/I/C/D V R I/D V/C
> achieved accuracy in the 93 95 % range .
# V/J NSg P D # . # . NSg/V .
> When several ambiguous words occur together , the possibilities multiply .
# NSg/I/C J/D J NPl/V+ V J . D+ NPl NSg/V+ .
> However , it is easy to enumerate every combination and to assign a relative
# C . NPrSg/ISg+ VL NSg/V/J P V D+ NSg+ V/C P NSg/V D/P NSg/J
> probability to each one , by multiplying together the probabilities of each
# NSg P D+ NSg/I/V/J+ . NSg/J/P V J D NPl P D+
> choice in turn . The combination with the highest probability is then chosen . The
# NSg/J+ NPrSg/J/P NSg/V . D NSg P D+ + NSg+ VL NSg/J/C+ V/J . D+
> European group developed CLAWS , a tagging program that did exactly this and
# NSg/J+ NSg/V+ V/J NPl/V+ . D/P NSg/V+ NPrSg/V+ N/I/C/D+ V R I/D+ V/C
> achieved accuracy in the 93 95 % range .
# V/J NSg+ NPrSg/J/P D # . # . NSg/V+ .
>
#
> Eugene Charniak points out in Statistical techniques for natural language
# NPr ? NPl/V NSg/V/J/R/P NPrSg/J/P J NPl C/P NSg/J NSg/V
> parsing ( 1997 ) that merely assigning the most common tag to each known word and
# V . # . N/I/C/D R V D NSg/I/J NSg/V/J NSg/V P D NSg/V/J NSg/V V/C
> the tag " proper noun " to all unknowns will approach 90 % accuracy because many
# D NSg . NSg/J NSg/V . P NSg/I/J/C NPl/V NPrSg/VX NSg/V # . NSg C/P N/I/J/D
> words are unambiguous , and many others only rarely represent their less - common
# NPl/V V J . V/C N/I/J/D NPl/V W? R V D J/C/P . NSg/V/J
# NPr+ ? NPl/V+ NSg/V/J/R/P NPrSg/J/P J NPl C/P NSg/J NSg/V+
> parsing ( 1997 ) that merely assigning the most common tag to each known word and
# V . # . N/I/C/D+ R V D NSg/I/J NSg/V/J NSg/V P D+ NSg/V/J NSg/V V/C
> the tag " proper noun " to all unknowns will approach 90 % accuracy because many
# D NSg/V+ . NSg/J NSg/V . P NSg/I/J/C+ NPl/V+ NPrSg/VX NSg/V # . NSg+ C/P N/I/J/D+
> words are unambiguous , and many others only rarely represent their less - common
# NPl/V+ V J . V/C N/I/J/D+ NPl/V+ W? R V D+ V/J/C/P . NSg/V/J
> parts of speech .
# NPl/V P NSg/V .
# NPl/V P NSg/V+ .
>
#
> CLAWS pioneered the field of HMM - based part of speech tagging but was quite
# NPl/V V/J D NSg P V . V/J NSg/V/J P NSg/V NSg/V NSg/C/P V NSg
> expensive since it enumerated all possibilities . It sometimes had to resort to
# J C/P NPrSg/ISg V/J NSg/I/J/C NPl . NPrSg/ISg R V P NSg/V P
> backup methods when there were simply too many options ( the Brown Corpus
# NSg/J NPl/V NSg/I/C W? NSg/V R W? N/I/J/D NPl/V . D NPrSg/J NSg
> contains a case with 17 ambiguous words in a row , and there are words such as
# V D/P NPrSg P # J NPl/V P D/P NSg . V/C W? V NPl/V NSg/I NSg/R
> " still " that can represent as many as 7 distinct parts of speech .
# . NSg/V/J . N/I/C/D NPrSg/VX V NSg/R N/I/J/D NSg/R # V/J NPl/V P NSg/V .
> CLAWS pioneered the field of HMM - based part of speech tagging but was quite
# NPl/V+ V/J D NSg/V P V . V/J NSg/V/J P NSg/V+ NSg/V NSg/C/P V NSg
> expensive since it enumerated all possibilities . It sometimes had to resort to
# J C/P NPrSg/ISg+ V/J NSg/I/J/C+ NPl+ . NPrSg/ISg+ R V P NSg/V P
> backup methods when there were simply too many options ( the Brown Corpus
# NSg/J NPl/V+ NSg/I/C + NSg/V R W? N/I/J/D+ NPl/V . D+ NPrSg/V/J+ NSg+
> contains a case with 17 ambiguous words in a row , and there are words such as
# V D/P NPrSg/V P # J NPl/V NPrSg/J/P D/P+ NSg/V+ . V/C + V NPl/V+ NSg/I NSg/R
> " still " that can represent as many as 7 distinct parts of speech .
# . NSg/V/J . N/I/C/D+ NPrSg/VX V NSg/R N/I/J/D NSg/R # V/J NPl/V P NSg/V+ .
>
#
> HMMs underlie the functioning of stochastic taggers and are used in various
# ? V D N/J P J NPl V/C V V/J NPrSg/J/P J
# ? V D V P J NPl V/C V V/J NPrSg/J/P J
> algorithms one of the most widely used being the bi - directional inference
# NPl NSg/I/V/J P D NSg/I/J R V/J NSg/V/C D NSg/J . NSg/J NSg
# NPl+ NSg/I/V/J P D NSg/I/J R V/J NSg/V/C D NSg/J . NSg/J NSg+
> algorithm .
# NSg .
# NSg+ .
>
#
> Dynamic programming methods
# NSg/J NSg/V NPl/V
# NSg/J+ NSg/V+ NPl/V
>
#
> In 1987 , Steven DeRose and Kenneth W. Church independently developed dynamic
# P # . NPr ? V/C NPr ? NPrSg/V R V/J NSg/J
> programming algorithms to solve the same problem in vastly less time . Their
# NSg/V NPl P NSg/V D I/J NSg/J NPrSg/J/P R V/J/C/P NSg/V/J . D
> methods were similar to the Viterbi algorithm known for some time in other
# NPl NSg/V NSg/J P D ? NSg NSg/V/J C/P I/J/R NSg/V/J NPrSg/J/P NSg/V/J
> fields . DeRose used a table of pairs , while Church used a table of triples and a
# NPrPl/V . ? V/J D/P NSg P NPl/V . NSg/V/C/P NPrSg/V V/J D/P NSg P NPl/V V/C D/P
> method of estimating the values for triples that were rare or nonexistent in the
# NSg P V D NPl C/P NPl/V N/I/C/D NSg/V NSg/V/J NPrSg/C NSg/J P D
> Brown Corpus ( an actual measurement of triple probabilities would require a much
# NPrSg/J NSg . D/P NSg/J NSg P NSg/V/J NPl NSg/VX NSg/V D/P N/I/J
> In 1987 , Steven DeRose and Kenneth W. Church independently developed dynamic
# NPrSg/J/P # . NPr+ ? V/C NPr+ ? NPrSg/V+ R V/J NSg/J
> programming algorithms to solve the same problem in vastly less time . Their
# NSg/V+ NPl+ P NSg/V D I/J NSg/J NPrSg/J/P R V/J/C/P NSg/V/J+ . D+
> methods were similar to the Viterbi algorithm known for some time in other
# NPl/V+ NSg/V NSg/J P D ? NSg NSg/V/J C/P I/J/R NSg/V/J+ NPrSg/J/P NSg/V/J+
> fields . DeRose used a table of pairs , while Church used a table of triples and a
# NPrPl/V+ . ? V/J D/P NSg/V P NPl/V+ . NSg/V/C/P NPrSg/V+ V/J D/P NSg/V P NPl/V V/C D/P
> method of estimating the values for triples that were rare or nonexistent in the
# NSg/V P V D NPl/V C/P NPl/V N/I/C/D+ NSg/V NSg/V/J NPrSg/C NSg/J NPrSg/J/P D+
> Brown Corpus ( an actual measurement of triple probabilities would require a much
# NPrSg/V/J+ NSg . D/P NSg/J NSg P NSg/V/J NPl+ NSg/VX NSg/V D/P N/I/J
> larger corpus ) . Both methods achieved an accuracy of over 95 % . DeRose's 1990
# J NSg . . I/C NPl/V V/J D/P NSg P NSg/V/J/P # . . ? #
> dissertation at Brown University included analyses of the specific error types ,
# NSg NSg/P NPrSg/V/J NSg V/J NSg/V P D NSg/J NSg/V NPl/V .
> probabilities , and other related data , and replicated his work for Greek , where
# NPl . V/C NSg/V/J J NSg . V/C V/J ISg/D NSg C/P NPrSg/V/J . NSg/C
> it proved similarly effective .
# NPrSg/ISg V/J R NSg/J .
# J NSg+ . . I/C NPl/V+ V/J D/P NSg P NSg/V/J/P # . . ? #
> dissertation at Brown University included analyses of the specific error types ,
# NSg+ NSg/P NPrSg/V/J NSg+ V/J NSg/V P D+ NSg/J+ NSg/V+ NPl/V+ .
> probabilities , and other related data , and replicated his work for Greek , where
# NPl+ . V/C NSg/V/J+ J+ NSg+ . V/C V/J ISg/D+ NSg/V C/P NPrSg/V/J . NSg/C
> it proved similarly effective .
# NPrSg/ISg+ V/J R+ NSg/J .
>
#
> These findings were surprisingly disruptive to the field of natural language
# I/D NSg NSg/V R J P D NSg P NSg/J NSg/V
# I/D+ NSg NSg/V R J P D NSg/V P NSg/J+ NSg/V+
> processing . The accuracy reported was higher than the typical accuracy of very
# V . D NSg V/J V J C/P D NSg/J NSg P J
> sophisticated algorithms that integrated part of speech choice with many higher
# V/J NPl N/I/C/D V/J NSg/V/J P NSg/V NSg/J P N/I/J/D J
> levels of linguistic analysis : syntax , morphology , semantics , and so on . CLAWS ,
# NPl/V P J NSg . NSg . NSg . NSg . V/C NSg/I/J/C J/P . NPl/V .
> DeRose's and Church's methods did fail for some of the known cases where
# ? V/C N$ NPl/V V NSg/V/J C/P I/J/R P D NSg/J NPl/V NSg/C
> semantics is required , but those proved negligibly rare . This convinced many in
# NSg VL V/J . NSg/C/P I/D V/J R NSg/V/J . I/D V/J N/I/J/D P
> the field that part - of - speech tagging could usefully be separated from the other
# D NSg N/I/C/D NSg/V/J . P . NSg/V NSg/V NSg/VX R NSg/VX V/J P D NSg/J
# V+ . D+ NSg+ V/J V J C/P D NSg/J NSg P J
> sophisticated algorithms that integrated part of speech choice with many higher
# V/J NPl+ N/I/C/D+ V/J NSg/V/J P NSg/V+ NSg/J P N/I/J/D J
> levels of linguistic analysis : syntax , morphology , semantics , and so on . CLAWS ,
# NPl/V P J NSg+ . NSg+ . NSg+ . NSg+ . V/C NSg/I/J/C+ J/P . NPl/V .
> DeRose's and Church's methods did fail for some of the known cases where
# ? V/C N$ NPl/V+ V NSg/V/J C/P I/J/R P D+ NSg/V/J+ NPl/V+ NSg/C
> semantics is required , but those proved negligibly rare . This convinced many in
# NSg+ VL V/J . NSg/C/P I/D+ V/J R+ NSg/V/J+ . I/D+ V/J N/I/J/D NPrSg/J/P
> the field that part - of - speech tagging could usefully be separated from the other
# D+ NSg/V+ N/I/C/D+ NSg/V/J . P . NSg/V NSg/V NSg/VX R NSg/VX V/J P D NSg/V/J
> levels of processing ; this , in turn , simplified the theory and practice of
# NPl/V P V . I/D . NPrSg/J/P NSg/V . V/J D NSg V/C NSg/V P
# NPl/V P V . I/D+ . NPrSg/J/P NSg/V . V/J D+ NSg V/C NSg/V P
> computerized language analysis and encouraged researchers to find ways to
# V/J NSg/V NSg V/C V/J W? P NSg/V NPl P
> separate other pieces as well . Markov Models became the standard method for the
# NSg/V/J NSg/V/J NPl/V NSg/R NSg/V/J . NPr NPl/V V D NSg/J NSg/V C/P D
> part - of - speech assignment .
# NSg/J . P . NSg/V NSg .
# V/J NSg/V+ NSg+ V/C V/J + P NSg/V NPl+ P
> separate other pieces as well . Markov Models became the standard method for the
# NSg/V/J NSg/V/J+ NPl/V+ NSg/R+ NSg/V/J . NPr NPl/V+ V D NSg/J NSg/V C/P D
> part - of - speech assignment .
# NSg/V/J . P . NSg/V+ NSg+ .
>
#
> Unsupervised taggers
# V/J NPl
# V/J+ NPl
>
#
> The methods already discussed involve working from a pre - existing corpus to
# D NPl W? V/J V V P D/P NSg/P . V NSg P
> learn tag probabilities . It is , however , also possible to bootstrap using
# NSg/V NSg/V NPl . NPrSg/ISg VL . C . W? NSg/J P NSg/V V
> The methods already discussed involve working from a pre - existing corpus to
# D+ NPl/V W? V/J V V P D/P NSg/V/P+ . V NSg P
> learn tag probabilities . It is , however , also possible to bootstrap using
# NSg/V NSg/V+ NPl+ . NPrSg/ISg+ VL . C . W? NSg/J P NSg/V V
> " unsupervised " tagging . Unsupervised tagging techniques use an untagged corpus
# . V/J . NSg/V . V/J NSg/V NPl NSg/V D/P ? NSg
> for their training data and produce the tagset by induction . That is , they
# C/P D NSg NSg V/C NSg/V D NSg NSg/J/P NSg . N/I/C/D VL . IPl
> observe patterns in word use , and derive part - of - speech categories themselves .
# NSg/V NPl/V NPrSg/J/P NSg/V NSg/V . V/C NSg/V NSg/V/J . P . NSg/V NPl I .
> For example , statistics readily reveal that " the " , " a " , and " an " occur in
# C/P NSg/V . NPl/V R NSg/V N/I/C/D . D . . . D/P . . V/C . D/P . V NPrSg/J/P
> similar contexts , while " eat " occurs in very different ones . With sufficient
# NSg/J NPl/V . NSg/V/C/P . NSg/V . V NPrSg/J/P J NSg/J NPl/V . P J
> iteration , similarity classes of words emerge that are remarkably similar to
# NSg . NSg NPl/V P NPl/V NSg/V N/I/C/D V R NSg/J P
# . V/J . NSg/V . V/J NSg/V NPl+ NSg/V D/P ? NSg
> for their training data and produce the tagset by induction . That is , they
# C/P D+ NSg/V+ NSg+ V/C NSg/V D NSg NSg/J/P+ NSg . N/I/C/D+ VL . IPl+
> observe patterns in word use , and derive part - of - speech categories themselves .
# NSg/V NPl/V+ NPrSg/J/P NSg/V+ NSg/V . V/C NSg/V NSg/V/J . P . NSg/V NPl+ I+ .
> For example , statistics readily reveal that " the " , " a " , and " an " occur in
# C/P NSg/V+ . NPl/V+ R NSg/V N/I/C/D+ . D . . . D/P . . V/C . D/P . V NPrSg/J/P
> similar contexts , while " eat " occurs in very different ones . With sufficient
# NSg/J+ NPl/V+ . NSg/V/C/P . NSg/V . V NPrSg/J/P J NSg/J+ NPl/V+ . P J+
> iteration , similarity classes of words emerge that are remarkably similar to
# NSg . NSg NPl/V P NPl/V+ NSg/V N/I/C/D+ V R NSg/J P
> those human linguists would expect ; and the differences themselves sometimes
# I/D NSg/V/J NPl NSg/VX V . V/C D NSg I R
> suggest valuable new insights .
# V NSg/J NSg/V/J NPl .
# I/D+ NSg/V/J NPl+ NSg/VX V . V/C D+ NSg/V+ I+ R
> suggest valuable new insights .
# V NSg/J+ NSg/V/J+ NPl+ .
>
#
> These two categories can be further subdivided into rule - based , stochastic , and
# I/D NSg NPl NPrSg/VX NSg/VX V/J V/J P NSg/V . V/J . J . V/C
> These two categories can be further subdivided into rule - based , stochastic , and
# I/D NSg+ NPl NPrSg/VX NSg/VX V/J V/J P NSg/V . V/J . J . V/C
> neural approaches .
# J NPl/V .
# J+ NPl/V+ .
>
#
> Other taggers and methods
# NSg/V/J NPl V/C NPl/V
> Other taggers and methods
# NSg/V/J+ NPl V/C NPl/V
>
#
> Some current major algorithms for part - of - speech tagging include the Viterbi
# I/J/R NSg/J NPrSg/V/J NPl C/P NSg/V/J . P . NSg/V NSg/V NSg/V D ?
> Some current major algorithms for part - of - speech tagging include the Viterbi
# I/J/R+ NSg/J NPrSg/V/J NPl C/P NSg/V/J . P . NSg/V NSg/V NSg/V D ?
> algorithm , Brill tagger , Constraint Grammar , and the Baum - Welch algorithm ( also
# NSg . NSg/J NSg . NSg NSg/V . V/C D NPr . ? NSg . W?
> known as the forward - backward algorithm ) . Hidden Markov model and visible Markov
# NSg/V/J NSg/R D NSg/J . NSg/J NSg . . V/J NPr NSg/V/J V/C J NPr
> model taggers can both be implemented using the Viterbi algorithm . The
# NSg/V/J NPl NPrSg/VX I/C NSg/VX V/J V D ? NSg . D
> rule - based Brill tagger is unusual in that it learns a set of rule patterns , and
# NSg . V/J NSg/J NSg VL NSg/J P N/I/C/D NPrSg/ISg NPl/V D/P NPrSg/J P NSg/V NPl/V . V/C
> then applies those patterns rather than optimizing a statistical quantity .
# NSg/J/C V I/D NPl/V NPrSg/V/J C/P V D/P J NSg .
# NSg . NSg/J NSg . NSg+ NSg/V+ . V/C D NPr . ? NSg . W?
> known as the forward - backward algorithm ) . Hidden Markov model and visible Markov
# NSg/V/J NSg/R D NSg/V/J . NSg/J NSg+ . . V/J NPr NSg/V/J+ V/C J NPr
> model taggers can both be implemented using the Viterbi algorithm . The
# NSg/V/J+ NPl NPrSg/VX I/C NSg/VX V/J V D+ ? NSg . D
> rule - based Brill tagger is unusual in that it learns a set of rule patterns , and
# NSg/V+ . V/J NSg/J NSg VL NSg/J NPrSg/J/P N/I/C/D NPrSg/ISg+ NPl/V D/P NPrSg/V/J P NSg/V+ NPl/V+ . V/C
> then applies those patterns rather than optimizing a statistical quantity .
# NSg/J/C V I/D+ NPl/V+ NPrSg/V/J C/P V D/P+ J+ NSg+ .
>
#
> Many machine learning methods have also been applied to the problem of POS
# N/I/J/D NSg/V V NPl/V NSg/VX W? NSg/V V/J P D NSg/J P NSg
> Many machine learning methods have also been applied to the problem of POS
# N/I/J/D+ NSg/V V+ NPl/V+ NSg/VX W? NSg/V V/J P D NSg/J P NSg+
> tagging . Methods such as SVM , maximum entropy classifier , perceptron , and
# NSg/V . NPl/V NSg/I NSg/R ? . NSg/J NSg NSg . N . V/C
# NSg/V+ . NPl/V+ NSg/I NSg/R ? . NSg/J NSg NSg . N . V/C
> nearest - neighbor have all been tried , and most can achieve accuracy above
# W? . NSg/V/J NSg/VX NSg/I/J/C NSg/V V/J . V/C NSg/I/J NPrSg/VX V NSg NSg/J/P
# W? . NSg/V/J NSg/VX NSg/I/J/C NSg/V V/J . V/C NSg/I/J NPrSg/VX V NSg+ NSg/J/P
> 95 % . [ citation needed ]
# # . . . NSg V/J .
# # . . . NSg+ V/J+ .
>
#
> A direct comparison of several methods is reported ( with references ) at the ACL
# D/P J NSg P J/D NPl/V VL V/J . P NPl/V . P D NSg
> Wiki . This comparison uses the Penn tag set on some of the Penn Treebank data ,
# NSg/V . I/D NSg NPl/V D NPr NSg/V NPrSg/V/J J/P I/J/R P D NPr ? NSg .
> A direct comparison of several methods is reported ( with references ) at the ACL
# D/P V/J NSg P J/D+ NPl/V+ VL V/J . P NPl/V+ . NSg/P D+ NSg+
> Wiki . This comparison uses the Penn tag set on some of the Penn Treebank data ,
# NSg/V+ . I/D+ NSg+ NPl/V D+ NPr+ NSg/V+ NPrSg/V/J J/P I/J/R P D+ NPr+ ? NSg+ .
> so the results are directly comparable . However , many significant taggers are
# NSg/I/J/C D NPl V R/C NSg/J . C . N/I/J/D NSg/J NPl V
> not included ( perhaps because of the labor involved in reconfiguring them for
# NSg/C V/J . NSg C/P P D NPrSg/Am/Au V/J NPrSg/J/P V N/I C/P
> this particular dataset ) . Thus , it should not be assumed that the results
# I/D NSg/J NSg . . NSg . NPrSg/ISg VX NSg/C NSg/VX V/J N/I/C/D D NPl
> reported here are the best that can be achieved with a given approach ; nor even
# V/J NSg/J/R V D NPrSg/J N/I/C/D NPrSg/VX NSg/VX V/J P D/P NSg/J/P NSg/V . NSg/C NSg/V/J
> the best that have been achieved with a given approach .
# D NPrSg/J N/I/C/D NSg/VX NSg/V V/J P D/P NSg/J/P NSg/V .
# NSg/I/J/C D+ NPl/V+ V R/C NSg/J+ . C . N/I/J/D NSg/J NPl V
> not included ( perhaps because of the labor involved in reconfiguring them for
# NSg/C V/J . NSg C/P P D+ NPrSg/V/Am/Au+ V/J NPrSg/J/P V N/I+ C/P
> this particular dataset ) . Thus , it should not be assumed that the results
# I/D+ NSg/J+ NSg . . NSg . NPrSg/ISg+ VX NSg/C NSg/VX V/J N/I/C/D D+ NPl/V+
> reported here are the best that can be achieved with a given approach ; nor even
# V/J NSg/J/R V D NPrSg/VX/J N/I/C/D+ NPrSg/VX NSg/VX V/J P D/P+ NSg/V/J/P+ NSg/V+ . NSg/C NSg/V/J
> the best that have been achieved with a given approach .
# D+ NPrSg/VX/J+ N/I/C/D+ NSg/VX NSg/V V/J P D/P+ NSg/V/J/P+ NSg/V+ .
>
#
> In 2014 , a paper reporting using the structure regularization method for
# P # . D/P NSg/J V V D NSg NSg NSg/V C/P
> part - of - speech tagging , achieving 97.36 % on a standard benchmark dataset .
# NSg/V/J . P . NSg/V NSg/V . V # . P D/P NSg/J NSg/V NSg .
> In 2014 , a paper reporting using the structure regularization method for
# NPrSg/J/P # . D/P+ NSg/V/J+ V V D+ NSg/V+ NSg NSg/V C/P
> part - of - speech tagging , achieving 97.36 % on a standard benchmark dataset .
# NSg/V/J . P . NSg/V NSg/V . V # . J/P D/P NSg/J+ NSg/V+ NSg .

View file

@ -2,25 +2,25 @@
# NSg/V
>
#
> This document contains example sentences with misspelled words that we want to test the spell checker on .
# I/D NSg/V V NSg/V NPl/V P V/J NPl/V N/I/C/D IPl NSg/V P NSg/V D NSg NSg/V J/P .
> This document contains example sentences with misspelled words that we want to test the spell checker on .
# I/D+ NSg/V V NSg/V+ NPl/V P V/J+ NPl/V+ N/I/C/D+ IPl+ NSg/V P NSg/V D NSg/V NSg/V J/P .
>
#
> Example Sentences
# NSg/V NPl/V
# NSg/V+ NPl/V
>
#
> My favourite color is blu .
# D NSg/J/Ca/Au/Br NSg/V/J/Am VL W? .
> I must defend my honour !
# ISg NSg/V NSg/V D NSg/Ca/Au/Br .
> I recognize that you recognise me .
# ISg V N/I/C/D IPl V/Au/Br NPrSg/ISg .
> I analyze how you infantilize me .
# ISg V NSg/C IPl V NPrSg/ISg .
> I analyse how you infantilise me .
# ISg V/Au/Br NSg/C IPl ? NPrSg/ISg .
> Careful , traveller !
# J . NSg/Ca/Au/Br .
> At the centre of the theatre I dropped a litre of coke .
# P D NSg/Ca/Au/Br P D NSg/Ca/Au/Br ISg V/J D/P NSg/Ca/Au/Br P NPrSg/V .
> My favourite color is blu .
# D+ NSg/V/J/Ca/Au/Br NSg/V/J/Am VL+ W? .
> I must defend my honour !
# ISg+ NSg/V NSg/V D+ NSg/V/Ca/Au/Br+ .
> I recognize that you recognise me .
# ISg+ V N/I/C/D IPl+ V/Au/Br NPrSg/ISg+ .
> I analyze how you infantilize me .
# ISg+ V NSg/C IPl+ V NPrSg/ISg+ .
> I analyse how you infantilise me .
# ISg+ V/Au/Br NSg/C IPl+ ? NPrSg/ISg+ .
> Careful , traveller !
# J . NSg/Ca/Au/Br+ .
> At the centre of the theatre I dropped a litre of coke .
# NSg/P D NSg/V/Ca/Au/Br P D+ NSg/Ca/Au/Br+ ISg+ V/J D/P NSg/Ca/Au/Br P NPrSg/V+ .

File diff suppressed because it is too large Load diff

View file

@ -1,74 +1,74 @@
> " This " and " that " are common and fulfill multiple purposes in everyday English .
# . I/D . V/C . N/I/C/D . V NSg/V/J V/C V NSg/J NPl/V NPrSg/J/P NSg/J NPrSg/V/J .
> As such , disambiguating them is necessary .
# NSg/R NSg/I . V N/I VL NSg/J .
> " This " and " that " are common and fulfill multiple purposes in everyday English .
# . I/D+ . V/C . N/I/C/D+ . V NSg/V/J V/C V NSg/J NPl/V NPrSg/J/P NSg/J+ NPrSg/V/J+ .
> As such , disambiguating them is necessary .
# NSg/R NSg/I . V N/I+ VL+ NSg/J .
>
#
> This document contains various sentences that use " this " , " that " , " these " , and
# I/D NSg/V V J NPl/V N/I/C/D NSg/V . I/D . . . N/I/C/D . . . I/D . . V/C
> " those " in different contexts with a lot of edge cases .
# . I/D . NPrSg/J/P NSg/J NPl/V P D/P NPrSg P NSg/V NPl/V .
> This document contains various sentences that use " this " , " that " , " these " , and
# I/D+ NSg/V V J NPl/V+ N/I/C/D+ NSg/V . I/D+ . . . N/I/C/D+ . . . I/D+ . . V/C
> " those " in different contexts with a lot of edge cases .
# . I/D . NPrSg/J/P NSg/J NPl/V P D/P NPrSg/V P NSg/V+ NPl/V+ .
>
#
> Examples
# NPl/V
# NPl/V+
>
#
> This triangle is nice .
# I/D NSg VL NPrSg/V/J .
> This is nice .
# I/D VL NPrSg/V/J .
> That triangle is nice .
# N/I/C/D NSg VL NPrSg/V/J .
> That is nice .
# N/I/C/D VL NPrSg/V/J .
> These triangles are nice .
# I/D NPl V NPrSg/V/J .
> These are nice .
# I/D V NPrSg/V/J .
> Those triangles are nice .
# I/D NPl V NPrSg/V/J .
> This triangle is nice .
# I/D+ NSg+ VL+ NPrSg/V/J+ .
> This is nice .
# I/D+ VL+ NPrSg/V/J+ .
> That triangle is nice .
# N/I/C/D+ NSg+ VL+ NPrSg/V/J+ .
> That is nice .
# N/I/C/D+ VL+ NPrSg/V/J+ .
> These triangles are nice .
# I/D+ NPl+ V+ NPrSg/V/J+ .
> These are nice .
# I/D+ V+ NPrSg/V/J+ .
> Those triangles are nice .
# I/D+ NPl+ V+ NPrSg/V/J+ .
> Those are nice .
# I/D V NPrSg/V/J .
# I/D+ V+ NPrSg/V/J .
>
#
> This massage is nice .
# I/D NSg/V VL NPrSg/V/J .
> That massage is nice .
# N/I/C/D NSg/V VL NPrSg/V/J .
> These massages are nice .
# I/D NPl/V V NPrSg/V/J .
> Those massages are nice .
# I/D NPl/V V NPrSg/V/J .
> This massages well .
# I/D NPl/V NSg/V/J .
> That massages well .
# N/I/C/D NPl/V NSg/V/J .
> These massage well .
# I/D NSg/V NSg/V/J .
> Those massage well .
# I/D NSg/V NSg/V/J .
> This massage is nice .
# I/D+ NSg/V+ VL+ NPrSg/V/J+ .
> That massage is nice .
# N/I/C/D NSg/V+ VL+ NPrSg/V/J+ .
> These massages are nice .
# I/D+ NPl/V+ V+ NPrSg/V/J+ .
> Those massages are nice .
# I/D+ NPl/V+ V+ NPrSg/V/J+ .
> This massages well .
# I/D+ NPl/V+ NSg/V/J+ .
> That massages well .
# N/I/C/D+ NPl/V+ NSg/V/J+ .
> These massage well .
# I/D+ NSg/V+ NSg/V/J+ .
> Those massage well .
# I/D+ NSg/V+ NSg/V/J+ .
>
#
> That could be a solution .
# N/I/C/D NSg/VX NSg/VX D/P NSg .
> Find all candidates that could be a solution .
# NSg/V NSg/I/J/C NPl/V N/I/C/D NSg/VX NSg/VX D/P NSg .
> That could be a solution .
# N/I/C/D+ NSg/VX NSg/VX D/P NSg .
> Find all candidates that could be a solution .
# NSg/V NSg/I/J/C+ NPl/V+ N/I/C/D+ NSg/VX NSg/VX D/P NSg+ .
>
#
> This is all that I have .
# I/D VL NSg/I/J/C N/I/C/D ISg NSg/VX .
> This is all that solutions can do .
# I/D VL NSg/I/J/C N/I/C/D NPl NPrSg/VX NSg/VX .
> That solution can do .
# N/I/C/D NSg NPrSg/VX NSg/VX .
> This is all that I have .
# I/D+ VL NSg/I/J/C N/I/C/D ISg+ NSg/VX+ .
> This is all that solutions can do .
# I/D+ VL NSg/I/J/C N/I/C/D NPl+ NPrSg/VX+ NSg/VX .
> That solution can do .
# N/I/C/D NSg+ NPrSg/VX+ NSg/VX .
>
#
> We can do this !
# IPl NPrSg/VX NSg/VX I/D .
> I can do this and that .
# ISg NPrSg/VX NSg/VX I/D V/C N/I/C/D .
> We can do this !
# IPl+ NPrSg/VX NSg/VX I/D+ .
> I can do this and that .
# ISg+ NPrSg/VX NSg/VX I/D V/C N/I/C/D+ .
>
#
> We unite to stand united in unity .
# IPl NSg/V P NSg/V V/J NPrSg/J/P NSg .
> We unite to stand united in unity .
# IPl+ NSg/V P NSg/V V/J NPrSg/J/P NSg+ .

View file

@ -0,0 +1,19 @@
[package]
name = "harper-pos-utils"
version = "0.42.0"
edition = "2024"

# Sorted alphabetically, per Cargo convention.
[dependencies]
hashbrown = { version = "0.15.3", features = ["serde"] }
is-macro = "0.3.7"
rand = { version = "0.9.1", optional = true }
rayon = { version = "1.10.0", optional = true }
rs-conllu = "0.3.0"
serde = { version = "1.0.219", features = ["derive"] }
strum = "0.27.1"
strum_macros = "0.27.1"

[features]
default = []
# Parallelize candidate-patch scoring during training via rayon.
threaded = ["dep:rayon"]
# Enable the training-only APIs (random candidate sampling via rand).
training = ["dep:rand"]

View file

@ -0,0 +1,270 @@
mod patch;
#[cfg(feature = "training")]
use std::path::Path;
#[cfg(feature = "training")]
use crate::word_counter::WordCounter;
use crate::{
UPOS,
chunker::{Chunker, upos_freq_dict::UPOSFreqDict},
};
use patch::Patch;
use serde::{Deserialize, Serialize};
/// A transformation-based ("Brill-style") noun-phrase chunker: a
/// [`UPOSFreqDict`] baseline whose output is refined by an ordered list of
/// learned correction patches.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BrillChunker {
    /// Baseline chunker consulted first for every token.
    base: UPOSFreqDict,
    /// Correction patches, applied in learned order after the baseline pass.
    patches: Vec<Patch>,
}
impl BrillChunker {
    /// Construct a chunker that starts from the given frequency-dict baseline
    /// and carries no correction patches yet.
    pub fn new(base: UPOSFreqDict) -> Self {
        Self {
            base,
            patches: Vec::new(),
        }
    }

    /// Run every learned patch over `np_states`, in the order the patches were
    /// learned. A patch flips a token's flag when the flag currently equals the
    /// patch's `from` value and the patch's criteria match at that position.
    /// Mutations made by earlier positions are visible to later ones.
    fn apply_patches(&self, sentence: &[String], tags: &[Option<UPOS>], np_states: &mut [bool]) {
        for patch in &self.patches {
            for idx in 0..sentence.len() {
                let current = np_states[idx];
                if current == patch.from && patch.criteria.fulfils(sentence, tags, np_states, idx) {
                    np_states[idx] = !current;
                }
            }
        }
    }
}
impl Chunker for BrillChunker {
    /// Produce a baseline guess from UPOS frequencies, then refine it with the
    /// learned correction patches.
    fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
        let mut flags = self.base.chunk_sentence(sentence, tags);
        self.apply_patches(sentence, tags, &mut flags);
        flags
    }
}
/// Per-sentence training example: (tokens, per-token UPOS tags,
/// per-token "is part of a noun phrase" flags).
#[cfg(feature = "training")]
type CandidateArgs = (Vec<String>, Vec<Option<UPOS>>, Vec<bool>);
#[cfg(feature = "training")]
impl BrillChunker {
    /// Replay only this chunker's patches on top of precomputed `base_flags`
    /// and count how many positions disagree with `correct_np_flags`.
    /// Used during candidate scoring so the baseline pass isn't recomputed.
    pub fn count_patch_errors(
        &self,
        sentence: &[String],
        tags: &[Option<UPOS>],
        base_flags: &[bool],
        correct_np_flags: &[bool],
    ) -> usize {
        let mut flags = base_flags.to_vec();
        self.apply_patches(sentence, tags, &mut flags);
        let mut loss = 0;
        for (a, b) in flags.into_iter().zip(correct_np_flags) {
            if a != *b {
                loss += 1;
            }
        }
        loss
    }
    /// Chunk `sentence` from scratch and count positions that disagree with
    /// `correct_np_flags`. Every word at a mismatched position is also recorded
    /// in `relevant_words`, so candidate patches can target frequent offenders.
    pub fn count_chunk_errors(
        &self,
        sentence: &[String],
        tags: &[Option<UPOS>],
        correct_np_flags: &[bool],
        relevant_words: &mut WordCounter,
    ) -> usize {
        let flags = self.chunk_sentence(sentence, tags);
        let mut loss = 0;
        for ((a, b), word) in flags.into_iter().zip(correct_np_flags).zip(sentence) {
            if a != *b {
                loss += 1;
                relevant_words.inc(word);
            }
        }
        loss
    }
    /// Run one training epoch: measure the current error rate, generate
    /// candidate patches from the most-mistagged words, score a random subset
    /// of them, and append the single best-scoring patch to `self.patches`.
    ///
    /// To speed up training, only try a subset of all possible candidates.
    /// How many to select is given by the `candidate_selection_chance`. A higher chance means a
    /// longer training time.
    fn epoch(&mut self, training_files: &[impl AsRef<Path>], candidate_selection_chance: f32) {
        use crate::conllu_utils::iter_sentences_in_conllu;
        use rs_conllu::Sentence;
        use std::time::Instant;
        // The chance is a probability; anything outside [0, 1] is a caller bug.
        assert!((0.0..=1.0).contains(&candidate_selection_chance));
        let mut total_tokens = 0;
        let mut error_counter = 0;
        let sentences: Vec<Sentence> = training_files
            .iter()
            .flat_map(iter_sentences_in_conllu)
            .collect();
        let mut sentences_flagged: Vec<CandidateArgs> = Vec::new();
        for sent in &sentences {
            use hashbrown::HashSet;
            use crate::chunker::np_extraction::locate_noun_phrases_in_sent;
            let mut toks: Vec<String> = Vec::new();
            let mut tags = Vec::new();
            for token in &sent.tokens {
                let form = token.form.clone();
                // CoNLL-U splits clitic contractions ("isn't" -> "is" + "n't");
                // fold them back onto the preceding token so the chunker sees
                // the same word forms it will receive at inference time.
                if let Some(last) = toks.last_mut() {
                    match form.as_str() {
                        "sn't" | "n't" | "'ll" | "'ve" | "'re" | "'d" | "'m" | "'s" => {
                            last.push_str(&form);
                            continue;
                        }
                        _ => {}
                    }
                }
                toks.push(form);
                tags.push(token.upos.and_then(UPOS::from_conllu));
            }
            // Gold noun-phrase membership, derived from dependency structure.
            let actual = locate_noun_phrases_in_sent(sent);
            let actual_flat = actual.into_iter().fold(HashSet::new(), |mut a, b| {
                a.extend(b.into_iter());
                a
            });
            // Convert the index set into a dense per-token boolean sequence.
            let mut actual_seq = Vec::new();
            for el in actual_flat {
                if el >= actual_seq.len() {
                    actual_seq.resize(el + 1, false);
                }
                actual_seq[el] = true;
            }
            // NOTE(review): when clitics were merged above, `toks`/`tags` are
            // shorter than `sent.tokens`, while `actual_seq` is indexed by
            // original token position — confirm the sequences stay aligned.
            sentences_flagged.push((toks, tags, actual_seq));
        }
        let mut relevant_words = WordCounter::default();
        for (tok_buf, tag_buf, flag_buf) in &sentences_flagged {
            total_tokens += tok_buf.len();
            error_counter += self.count_chunk_errors(
                tok_buf.as_slice(),
                tag_buf,
                flag_buf.as_slice(),
                &mut relevant_words,
            );
        }
        println!("=============");
        println!("Total tokens in training set: {}", total_tokens);
        println!("Tokens incorrectly flagged: {}", error_counter);
        println!(
            "Error rate: {}%",
            error_counter as f32 / total_tokens as f32 * 100.
        );
        // Before adding any patches, let's get a good base.
        // Caching the current chunker's output per sentence lets each candidate
        // be scored by replaying only its own (single) patch on top.
        let mut base_flags = Vec::new();
        for (toks, tags, _) in &sentences_flagged {
            base_flags.push(self.chunk_sentence(toks, tags));
        }
        let all_candidates = Patch::generate_candidate_patches(&relevant_words);
        // Randomly sample a fraction of the candidates to keep scoring cheap.
        let mut pruned_candidates: Vec<Patch> = rand::seq::IndexedRandom::choose_multiple(
            all_candidates.as_slice(),
            &mut rand::rng(),
            (all_candidates.len() as f32 * candidate_selection_chance) as usize,
        )
        .cloned()
        .collect();
        let start = Instant::now();
        // Sort candidates by error count, ascending: the best patch ends up first.
        // Parallel and serial paths must use the same key for identical results.
        #[cfg(feature = "threaded")]
        rayon::slice::ParallelSliceMut::par_sort_by_cached_key(
            pruned_candidates.as_mut_slice(),
            |candidate: &Patch| {
                self.score_candidate(candidate.clone(), &sentences_flagged, &base_flags)
            },
        );
        #[cfg(not(feature = "threaded"))]
        pruned_candidates.sort_by_cached_key(|candidate| {
            self.score_candidate(candidate.clone(), &sentences_flagged, &base_flags)
        });
        let duration = start.elapsed();
        let seconds = duration.as_secs();
        let millis = duration.subsec_millis();
        println!(
            "It took {} seconds and {} milliseconds to search through {} candidates at {} c/sec.",
            seconds,
            millis,
            pruned_candidates.len(),
            pruned_candidates.len() as f32 / seconds as f32
        );
        // Keep only the single best candidate per epoch (if any were sampled).
        if let Some(best) = pruned_candidates.first() {
            self.patches.push(best.clone());
        }
    }
    /// Score `candidate` as if it were the next learned patch: replay it on top
    /// of the cached `base_flags` for every sentence and total the
    /// disagreements with the gold flags. Lower is better.
    fn score_candidate(
        &self,
        candidate: Patch,
        sentences_flagged: &[CandidateArgs],
        base_flags: &[Vec<bool>],
    ) -> usize {
        // A throwaway chunker holding only this candidate; its baseline is
        // irrelevant because scoring starts from the precomputed `base_flags`.
        let mut tagger = BrillChunker::new(UPOSFreqDict::default());
        tagger.patches.push(candidate);
        let mut errors = 0;
        for ((toks, tags, flags), base) in sentences_flagged.iter().zip(base_flags.iter()) {
            errors += tagger.count_patch_errors(toks.as_slice(), tags.as_slice(), base, flags);
        }
        errors
    }
    /// Train a brand-new tagger on a `.conllu` dataset, provided via a path.
    /// This does not do _any_ error handling, and should not run in production.
    /// It should be used for training a model that _will_ be used in production.
    ///
    /// Each epoch appends at most one patch, so `epochs` bounds the number of
    /// learned patches.
    pub fn train(
        training_files: &[impl AsRef<Path>],
        epochs: usize,
        candidate_selection_chance: f32,
    ) -> Self {
        let mut freq_dict = UPOSFreqDict::default();
        for file in training_files {
            freq_dict.inc_from_conllu_file(file);
        }
        let mut chunker = Self::new(freq_dict);
        for _ in 0..epochs {
            chunker.epoch(training_files, candidate_selection_chance);
        }
        chunker
    }
}

View file

@ -0,0 +1,121 @@
use serde::{Deserialize, Serialize};
use crate::patch_criteria::PatchCriteria;
#[cfg(feature = "training")]
use crate::word_counter::WordCounter;
/// One learned Brill transformation: when a token's current noun-phrase flag
/// equals `from` and `criteria` matches at that position, the flag is flipped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Patch {
    /// The flag value this patch rewrites (the new value is its negation).
    pub from: bool,
    /// The contextual condition that must hold for the patch to fire.
    pub criteria: PatchCriteria,
}
#[cfg(feature = "training")]
impl Patch {
    /// Enumerate the candidate patch space for one training epoch:
    /// single-condition patches built from tag/position atoms, word-identity
    /// atoms drawn from the most frequently mistagged words, plus every
    /// (word atom, other atom) two-condition conjunction.
    pub fn generate_candidate_patches(relevant_words: &WordCounter) -> Vec<Self> {
        use crate::UPOS;
        use strum::IntoEnumIterator;
        // Only the most commonly mistagged words get word-specific patches.
        const TOP_N_WORDS: usize = 50;
        // Context window of relative token offsets a condition may inspect.
        const REL_POS: [isize; 7] = [-3, -2, -1, 0, 1, 2, 3];
        let mut atoms: Vec<(bool, PatchCriteria)> = Vec::new();
        for from in [false, true] {
            // "The token at offset `rel` carries tag `tag`."
            for rel in REL_POS {
                for tag in UPOS::iter() {
                    atoms.push((
                        from,
                        PatchCriteria::WordIsTaggedWith {
                            relative: rel,
                            is_tagged: tag,
                        },
                    ));
                }
            }
            // "Some token within `max_rel` positions carries tag `tag`."
            for max_rel in 1..=5 {
                for tag in UPOS::iter() {
                    atoms.push((
                        from,
                        PatchCriteria::AnyWordIsTaggedWith {
                            max_relative: max_rel,
                            is_tagged: tag,
                        },
                    ));
                }
            }
            // "The immediate neighbors are tagged `prev` and `post`."
            for prev in UPOS::iter() {
                for post in UPOS::iter() {
                    atoms.push((
                        from,
                        PatchCriteria::SandwichTaggedWith {
                            prev_word_tagged: prev,
                            post_word_tagged: post,
                        },
                    ));
                }
            }
            // "The token at offset `rel` is (or is not) currently flagged NP."
            for rel in REL_POS {
                for is_np in [false, true] {
                    atoms.push((
                        from,
                        PatchCriteria::NounPhraseAt {
                            is_np,
                            relative: rel,
                        },
                    ));
                }
            }
        }
        // Everything appended after this index is a word-identity atom.
        let tag_atom_count = atoms.len();
        let mut word_atoms: Vec<(bool, PatchCriteria)> = Vec::new();
        for from in [false, true] {
            for rel in REL_POS {
                for w in relevant_words.iter_top_n_words(TOP_N_WORDS) {
                    word_atoms.push((
                        from,
                        PatchCriteria::WordIs {
                            relative: rel,
                            word: w.clone(),
                        },
                    ));
                }
            }
        }
        atoms.extend(word_atoms);
        let total_atoms = atoms.len();
        let word_start = tag_atom_count;
        let word_atoms_ct = total_atoms - word_start;
        // Each word atom pairs with every other atom: n_word * (n_total - 1).
        let combos_ct = word_atoms_ct * total_atoms - word_atoms_ct;
        let mut patches = Vec::with_capacity(total_atoms + combos_ct);
        // Every atom stands alone as a single-condition patch...
        for (from, crit) in &atoms {
            patches.push(Self {
                from: *from,
                criteria: crit.clone(),
            });
        }
        // ...and every word atom also pairs with each other atom as a
        // two-condition conjunction; the pair's `from` comes from the word atom.
        for i in word_start..total_atoms {
            let (from_i, ref crit_i) = atoms[i];
            for (j, (_from_j, crit_j)) in atoms.iter().enumerate() {
                if i == j {
                    continue;
                }
                patches.push(Self {
                    from: from_i,
                    criteria: PatchCriteria::Combined {
                        a: Box::new(crit_i.clone()),
                        b: Box::new(crit_j.clone()),
                    },
                });
            }
        }
        patches
    }
}

View file

@ -0,0 +1,17 @@
use crate::UPOS;
mod brill_chunker;
#[cfg(feature = "training")]
mod np_extraction;
mod upos_freq_dict;
pub use brill_chunker::BrillChunker;
pub use upos_freq_dict::UPOSFreqDict;
/// An implementer of this trait is capable of identifying the noun phrases in a provided sentence.
pub trait Chunker {
    /// Iterate over the sentence, identifying the noun phrases contained within.
    /// A token marked `true` is a component of a noun phrase.
    /// A token marked `false` is not.
    ///
    /// `tags` is expected to be parallel to `sentence`: one optional UPOS per
    /// token. The returned vector mirrors that per-token layout.
    fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool>;
}

View file

@ -0,0 +1,106 @@
use std::collections::VecDeque;
use hashbrown::HashSet;
use rs_conllu::{Sentence, Token, TokenID, UPOS};
/// Extract the noun phrases of `sent` as sets of token indices, using the
/// treebank's dependency annotations: grow a phrase from each nominal head,
/// keep only contiguous spans, then pick a maximal non-overlapping subset.
pub fn locate_noun_phrases_in_sent(sent: &Sentence) -> Vec<HashSet<usize>> {
    let mut found_noun_phrases = Vec::new();
    for (i, token) in sent.tokens.iter().enumerate() {
        if token.upos.is_some_and(is_root_upos) {
            let noun_phrase = locate_noun_phrase_with_head_at(i, sent);
            found_noun_phrases.push(noun_phrase);
        }
    }
    // Dependency subtrees can be discontinuous; those aren't usable as chunks.
    found_noun_phrases.retain(is_contiguous);
    reduce_to_maximal_nonoverlapping(found_noun_phrases)
}
/// A set of token indices is contiguous iff its size equals the width of its
/// `[min, max]` range. The empty set is not considered contiguous.
fn is_contiguous(indices: &HashSet<usize>) -> bool {
    if indices.is_empty() {
        return false;
    }
    let (lo, hi) = indices
        .iter()
        .fold((usize::MAX, 0), |(lo, hi), &i| (lo.min(i), hi.max(i)));
    hi - lo + 1 == indices.len()
}
/// Greedily select a non-overlapping subset of phrases, giving the longest
/// phrases first pick (ties keep their original relative order — stable sort).
fn reduce_to_maximal_nonoverlapping(mut phrases: Vec<HashSet<usize>>) -> Vec<HashSet<usize>> {
    phrases.sort_by(|a, b| b.len().cmp(&a.len()));
    let mut kept: Vec<HashSet<usize>> = Vec::new();
    let mut claimed: HashSet<usize> = HashSet::new();
    for phrase in phrases {
        if phrase.is_disjoint(&claimed) {
            claimed.extend(&phrase);
            kept.push(phrase);
        }
    }
    kept
}
/// Collect the token indices of the noun phrase headed at `head_index` by a
/// breadth-first walk of the dependency tree, descending only through tokens
/// that can belong to a noun phrase (constituent deprels or nominal heads).
fn locate_noun_phrase_with_head_at(head_index: usize, sent: &Sentence) -> HashSet<usize> {
    let mut children = HashSet::new();
    let mut queue = VecDeque::new();
    queue.push_back(head_index);
    while let Some(c_i) = queue.pop_front() {
        // Skip already-visited nodes; also guards against cycles in malformed data.
        if children.contains(&c_i) {
            continue;
        }
        let tok = &sent.tokens[c_i];
        if is_noun_phrase_constituent(tok) || tok.upos.is_some_and(is_root_upos) {
            children.insert(c_i);
            queue.extend(get_children(sent, c_i));
        }
    }
    children
}
fn is_root_upos(upos: UPOS) -> bool {
use UPOS::*;
matches!(upos, NOUN | PROPN | PRON)
}
/// Get the indices of the children of a given node.
///
/// `of_node` is a zero-based index into `sent.tokens`; CoNLL-U head IDs are
/// one-based (ID 0 denotes the sentence root), hence the `- 1` adjustments.
fn get_children(sent: &Sentence, of_node: usize) -> Vec<usize> {
    let mut children = Vec::new();
    for (index, token) in sent.tokens.iter().enumerate() {
        // A node is never its own child.
        if index == of_node {
            continue;
        }
        if let Some(head) = token.head {
            let is_child = match head {
                // ID 0 is the artificial root, not a real token.
                TokenID::Single(i) => i != 0 && i - 1 == of_node,
                // NOTE(review): CoNLL-U range IDs ("1-2") are inclusive of the
                // end ID, so `start - 1..end - 1` excludes the range's last
                // token — confirm whether `..=` was intended (heads are rarely
                // ranges in practice).
                TokenID::Range(start, end) => (start - 1..end - 1).contains(&of_node),
                // Empty nodes (enhanced dependencies) are ignored.
                TokenID::Empty(_, _) => false,
            };
            if is_child {
                children.push(index)
            }
        }
    }
    children
}
fn is_noun_phrase_constituent(token: &Token) -> bool {
let Some(ref deprel) = token.deprel else {
return false;
};
matches!(
deprel.as_str(),
"det" | "amod" | "nummod" | "compound" | "fixed" | "flat" | "acl" | "aux:pass"
)
}

View file

@ -0,0 +1,71 @@
#[cfg(feature = "training")]
use std::path::Path;
use hashbrown::HashMap;
use serde::{Deserialize, Serialize};
use crate::UPOS;
use super::Chunker;
/// Tracks the number of times any given UPOS is associated with a noun phrase.
/// Used as the baseline for the chunker.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct UPOSFreqDict {
    /// The # of times each [`UPOS`] was not part of an NP subtracted from the number of times it
    /// was.
    /// A positive value therefore means the tag was seen inside noun phrases
    /// more often than outside them; a missing entry counts as zero.
    pub counts: HashMap<UPOS, isize>,
}
impl UPOSFreqDict {
    /// Whether the given tag has been observed inside noun phrases more often
    /// than outside them. Unseen tags are not considered likely components.
    pub fn is_likely_np_component(&self, upos: &UPOS) -> bool {
        self.counts.get(upos).is_some_and(|count| *count > 0)
    }
}
impl Chunker for UPOSFreqDict {
    /// Flag each position whose tag is a likely noun-phrase component.
    /// Untagged positions are never flagged; the raw words are unused.
    fn chunk_sentence(&self, _sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
        let mut flags = Vec::with_capacity(tags.len());

        for tag in tags {
            let flag = match tag {
                Some(t) => self.is_likely_np_component(t),
                None => false,
            };
            flags.push(flag);
        }

        flags
    }
}
#[cfg(feature = "training")]
impl UPOSFreqDict {
    /// Record one observation of `upos`: increment its score when it appeared
    /// inside a noun phrase, decrement it otherwise.
    pub fn inc_is_np(&mut self, upos: UPOS, is_np: bool) {
        // Bug fix: the previous version used `.or_insert(1)`, so a tag whose
        // *first* observation was outside an NP was still credited +1 instead
        // of -1, biasing rarely-seen tags toward being NP components. The
        // entry now starts at zero and every observation applies its delta.
        let delta = if is_np { 1 } else { -1 };
        *self.counts.entry(upos).or_insert(0) += delta;
    }

    /// Parse a `.conllu` file and use it to train a frequency dictionary.
    /// For error-handling purposes, this function should not be made accessible outside of training.
    pub fn inc_from_conllu_file(&mut self, path: impl AsRef<Path>) {
        use super::np_extraction::locate_noun_phrases_in_sent;
        use crate::conllu_utils::iter_sentences_in_conllu;

        for sent in iter_sentences_in_conllu(path) {
            use hashbrown::HashSet;

            // Flatten every noun phrase into one set of member indices.
            let noun_phrases = locate_noun_phrases_in_sent(&sent);
            let flat = noun_phrases.into_iter().fold(HashSet::new(), |mut a, b| {
                a.extend(b);
                a
            });

            // Count each taggable token as inside/outside an NP.
            for (i, token) in sent.tokens.iter().enumerate() {
                if let Some(upos) = token.upos.and_then(UPOS::from_conllu) {
                    self.inc_is_np(upos, flat.contains(&i))
                }
            }
        }
    }
}

View file

@ -0,0 +1,12 @@
use std::{fs::File, path::Path};
use rs_conllu::{Sentence, parse_file};
/// Iterate over the sentences of a `.conllu` file.
/// Panics on any I/O or parse error, so this must only be used during training.
pub fn iter_sentences_in_conllu(path: impl AsRef<Path>) -> impl Iterator<Item = Sentence> {
    let handle = File::open(path).unwrap();
    parse_file(handle).map(Result::unwrap)
}

View file

@ -0,0 +1,12 @@
// Crate layout: chunking (noun-phrase detection), the Brill POS tagger, and
// the shared UPOS tag set. Modules used only for model training are gated
// behind the `training` cargo feature.
mod chunker;
#[cfg(feature = "training")]
mod conllu_utils;
mod patch_criteria;
mod tagger;
mod upos;
#[cfg(feature = "training")]
mod word_counter;
pub use chunker::{BrillChunker, Chunker, UPOSFreqDict};
pub use tagger::{BrillTagger, FreqDict, FreqDictBuilder, Tagger};
pub use upos::{UPOS, UPOSIter};

View file

@ -0,0 +1,126 @@
use serde::{Deserialize, Serialize};
use crate::UPOS;
/// A condition evaluated at a token position; when it holds, the owning patch
/// may fire. Relative offsets are measured in tokens from the position under
/// test (negative = earlier in the sentence).
#[derive(Debug, Clone, Serialize, Deserialize, Hash, PartialEq, Eq)]
pub enum PatchCriteria {
    /// The token at the given relative offset carries the given tag.
    WordIsTaggedWith {
        /// Which token to inspect.
        relative: isize,
        is_tagged: UPOS,
    },
    /// Some token between here and `max_relative` carries the given tag.
    AnyWordIsTaggedWith {
        /// The farthest relative index to look
        max_relative: isize,
        is_tagged: UPOS,
    },
    /// The immediate previous and next tokens carry the given tags.
    SandwichTaggedWith {
        prev_word_tagged: UPOS,
        post_word_tagged: UPOS,
    },
    /// The token at the given relative offset matches `word`
    /// (ASCII case-insensitive; see `fulfils` for the exact comparison).
    WordIs {
        relative: isize,
        word: String,
    },
    /// Not applicable to the Brill Tagger, only the chunker
    NounPhraseAt {
        is_np: bool,
        relative: isize,
    },
    /// Both sub-criteria hold at the same position.
    Combined {
        a: Box<PatchCriteria>,
        b: Box<PatchCriteria>,
    },
}
impl PatchCriteria {
    /// Evaluate this criterion at `index` within a sentence.
    ///
    /// `tokens`, `tags`, and `np_flags` are parallel views of the sentence;
    /// `np_flags` may be empty when the caller has no noun-phrase information
    /// (as the tagger does). Positions that fall outside the sentence simply
    /// fail the check.
    pub fn fulfils(
        &self,
        tokens: &[String],
        tags: &[Option<UPOS>],
        np_flags: &[bool],
        index: usize,
    ) -> bool {
        match self {
            PatchCriteria::WordIsTaggedWith {
                relative,
                is_tagged,
            } => {
                let Some(index) = add(index, *relative) else {
                    return false;
                };
                tags.get(index)
                    .copied()
                    .flatten()
                    .is_some_and(|t| t == *is_tagged)
            }
            PatchCriteria::AnyWordIsTaggedWith {
                max_relative: relative,
                is_tagged,
            } => {
                let Some(farthest_index) = add(index, *relative) else {
                    return false;
                };
                // NOTE(review): this half-open range is asymmetric — for a
                // positive offset it includes `index` but excludes the
                // farthest position, while for a negative offset it includes
                // the farthest position but excludes `index`. Confirm this is
                // intentional before changing it: trained patches depend on
                // these exact semantics.
                (farthest_index.min(index)..farthest_index.max(index)).any(|i| {
                    tags.get(i)
                        .copied()
                        .flatten()
                        .is_some_and(|t| t == *is_tagged)
                })
            }
            PatchCriteria::SandwichTaggedWith {
                prev_word_tagged,
                post_word_tagged,
            } => {
                // No previous token exists at position 0; a missing next
                // token is handled by the `.get` below returning `None`.
                if index == 0 {
                    return false;
                }
                let prev_i = index - 1;
                let post_i = index + 1;
                tags.get(prev_i)
                    .copied()
                    .flatten()
                    .is_some_and(|t| t == *prev_word_tagged)
                    && tags
                        .get(post_i)
                        .copied()
                        .flatten()
                        .is_some_and(|t| t == *post_word_tagged)
            }
            Self::WordIs { relative, word } => {
                let Some(index) = add(index, *relative) else {
                    return false;
                };
                // NOTE(review): `zip` stops at the shorter string, so this is
                // effectively a case-insensitive *prefix* comparison rather
                // than full equality (e.g. "cat" would match "category").
                // Confirm whether that is intentional.
                tokens.get(index).is_some_and(|w| {
                    w.chars()
                        .zip(word.chars())
                        .all(|(a, b)| a.eq_ignore_ascii_case(&b))
                })
            }
            Self::NounPhraseAt { is_np, relative } => {
                let Some(index) = add(index, *relative) else {
                    return false;
                };
                np_flags.get(index).is_some_and(|f| *is_np == *f)
            }
            Self::Combined { a, b } => {
                a.fulfils(tokens, tags, np_flags, index) && b.fulfils(tokens, tags, np_flags, index)
            }
        }
    }
}
/// Offset `u` by the signed delta `i`, returning `None` when the result would
/// fall outside the `usize` range.
fn add(u: usize, i: isize) -> Option<usize> {
    // `checked_add_signed` handles both directions directly and replaces the
    // lossy `wrapping_abs() as u32 as usize` cast chain, which silently
    // truncated magnitudes above `u32::MAX` on 64-bit targets.
    u.checked_add_signed(i)
}

View file

@ -0,0 +1,281 @@
mod patch;
#[cfg(feature = "training")]
use std::path::Path;
use patch::Patch;
use serde::{Deserialize, Serialize};
#[cfg(feature = "training")]
use super::FreqDict;
#[cfg(feature = "training")]
use super::error_counter::{ErrorCounter, ErrorKind};
use crate::{Tagger, UPOS};
/// A Brill part-of-speech tagger: a base tagger whose raw output is refined
/// by an ordered list of learned transformation patches.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BrillTagger<B>
where
    B: Tagger,
{
    /// The baseline tagger whose output the patches correct.
    base: B,
    /// Learned transformations, applied in order after the base pass.
    patches: Vec<Patch>,
}
impl<B> BrillTagger<B>
where
    B: Tagger,
{
    /// Construct a tagger around `base` with no patches yet learned.
    pub fn new(base: B) -> Self {
        Self {
            base,
            patches: Vec::new(),
        }
    }

    /// Run every patch, in order, across the whole sentence, rewriting tags
    /// in place wherever a patch's source tag and criteria both match.
    fn apply_patches(&self, sentence: &[String], tags: &mut [Option<UPOS>]) {
        for patch in &self.patches {
            for index in 0..sentence.len() {
                // Untagged positions can never match a patch's source tag.
                let current = match tags.get(index).copied().flatten() {
                    Some(tag) => tag,
                    None => continue,
                };

                // No noun-phrase flags are available here, hence the empty slice.
                if current == patch.from && patch.criteria.fulfils(sentence, tags, &[], index) {
                    tags[index] = Some(patch.to);
                }
            }
        }
    }
}
impl<B> Tagger for BrillTagger<B>
where
    B: Tagger,
{
    /// Tag a sentence: run the base tagger, then refine its output with the
    /// learned patch set. Positions the tagger cannot decide are [`None`].
    fn tag_sentence(&self, sentence: &[String]) -> Vec<Option<UPOS>> {
        let mut result = self.base.tag_sentence(sentence);
        self.apply_patches(sentence, &mut result);
        result
    }
}
#[cfg(feature = "training")]
impl BrillTagger<FreqDict> {
    /// Tag a provided sentence with patches, providing the "correct" tags (from a dataset or
    /// other source), accumulating every mismatch into `errors`.
    pub fn locate_patch_errors(
        &self,
        sentence: &[String],
        correct_tags: &[Option<UPOS>],
        base_tags: &[Option<UPOS>],
        errors: &mut ErrorCounter,
    ) {
        // Work on a copy so the caller's cached base tags stay untouched.
        let mut base_tags = base_tags.to_vec();
        self.apply_patches(sentence, &mut base_tags);
        // Only positions tagged on both sides can count as errors; `None` on
        // either side is skipped.
        for ((tag, correct_tag), word) in base_tags.iter().zip(correct_tags.iter()).zip(sentence) {
            if let Some(tag) = tag {
                if let Some(correct_tag) = correct_tag {
                    if tag != correct_tag {
                        errors.inc(
                            ErrorKind {
                                was_tagged: *tag,
                                correct_tag: *correct_tag,
                            },
                            word.as_str(),
                        )
                    }
                }
            }
        }
    }

    /// Tag a provided sentence with the tagger, providing the "correct" tags (from a dataset or
    /// other source), returning the number of errors.
    pub fn locate_tag_errors(
        &self,
        sentence: &[String],
        correct_tags: &[Option<UPOS>],
    ) -> ErrorCounter {
        // Full pipeline: base tagger plus all patches learned so far.
        let tags = self.tag_sentence(sentence);
        let mut errors = ErrorCounter::new();
        // As above, only positions tagged on both sides can disagree.
        for ((tag, correct_tag), word) in tags.iter().zip(correct_tags.iter()).zip(sentence) {
            if let Some(tag) = tag {
                if let Some(correct_tag) = correct_tag {
                    if tag != correct_tag {
                        errors.inc(
                            ErrorKind {
                                was_tagged: *tag,
                                correct_tag: *correct_tag,
                            },
                            word.as_str(),
                        )
                    }
                }
            }
        }
        errors
    }

    /// To speed up training, only try a subset of all possible candidates.
    /// How many to select is given by the `candidate_selection_chance`. A higher chance means a
    /// longer training time.
    ///
    /// Each epoch learns at most one patch: the candidate that leaves the
    /// fewest errors across the whole training set.
    fn epoch(&mut self, training_files: &[impl AsRef<Path>], candidate_selection_chance: f32) {
        use crate::conllu_utils::iter_sentences_in_conllu;
        use rs_conllu::Sentence;
        use std::time::Instant;
        assert!((0.0..=1.0).contains(&candidate_selection_chance));
        let mut total_tokens = 0;
        let mut error_counter = ErrorCounter::new();
        let sentences: Vec<Sentence> = training_files
            .iter()
            .flat_map(iter_sentences_in_conllu)
            .collect();
        let mut sentences_tagged: Vec<(Vec<String>, Vec<Option<UPOS>>)> = Vec::new();
        // Re-tokenize: the dataset splits clitic contractions into separate
        // tokens, so glue the clitic back onto the preceding token. The merged
        // token keeps the *first* part's tag; the clitic's own tag is dropped.
        // NOTE(review): "sn't" looks unusual alongside "n't" — confirm it
        // matches how the training data actually splits negations.
        for sent in &sentences {
            let mut toks: Vec<String> = Vec::new();
            let mut tags = Vec::new();
            for token in &sent.tokens {
                let form = token.form.clone();
                if let Some(last) = toks.last_mut() {
                    match form.as_str() {
                        "sn't" | "n't" | "'ll" | "'ve" | "'re" | "'d" | "'m" | "'s" => {
                            last.push_str(&form);
                            continue;
                        }
                        _ => {}
                    }
                }
                toks.push(form);
                tags.push(token.upos.and_then(UPOS::from_conllu));
            }
            sentences_tagged.push((toks, tags));
        }
        // Measure the current error rate over the whole training set.
        for (tok_buf, tag_buf) in &sentences_tagged {
            total_tokens += tok_buf.len();
            error_counter
                .merge_from(self.locate_tag_errors(tok_buf.as_slice(), tag_buf.as_slice()));
        }
        println!("=============");
        println!("Total tokens in training set: {}", total_tokens);
        println!(
            "Tokens incorrectly tagged: {}",
            error_counter.total_errors()
        );
        println!(
            "Error rate: {}%",
            error_counter.total_errors() as f32 / total_tokens as f32 * 100.
        );
        // Before adding any patches, let's get a good base.
        let mut base_tags = Vec::new();
        for (toks, _) in &sentences_tagged {
            base_tags.push(self.tag_sentence(toks));
        }
        let all_candidates = Patch::generate_candidate_patches(&error_counter);
        // Randomly subsample the candidate pool; the selection chance trades
        // training time against the odds of finding the best patch.
        let mut pruned_candidates: Vec<Patch> = rand::seq::IndexedRandom::choose_multiple(
            all_candidates.as_slice(),
            &mut rand::rng(),
            (all_candidates.len() as f32 * candidate_selection_chance) as usize,
        )
        .cloned()
        .collect();
        let start = Instant::now();
        // Rank candidates by how many errors remain after applying each one
        // (in parallel when the `threaded` feature is enabled).
        #[cfg(feature = "threaded")]
        rayon::slice::ParallelSliceMut::par_sort_by_cached_key(
            pruned_candidates.as_mut_slice(),
            |candidate: &Patch| {
                self.score_candidate(candidate.clone(), &sentences_tagged, &base_tags)
            },
        );
        #[cfg(not(feature = "threaded"))]
        pruned_candidates.sort_by_cached_key(|candidate| {
            self.score_candidate(candidate.clone(), &sentences_tagged, &base_tags)
        });
        let duration = start.elapsed();
        let seconds = duration.as_secs();
        let millis = duration.subsec_millis();
        // NOTE(review): when the search takes under one second, `seconds` is
        // 0 and the printed c/sec rate is `inf` (f32 division by zero).
        println!(
            "It took {} seconds and {} milliseconds to search through {} candidates at {} c/sec.",
            seconds,
            millis,
            pruned_candidates.len(),
            pruned_candidates.len() as f32 / seconds as f32
        );
        // Keep only the single best-scoring candidate from this epoch.
        if let Some(best) = pruned_candidates.first() {
            self.patches.push(best.clone());
        }
    }

    /// Lower is better
    fn score_candidate(
        &self,
        candidate: Patch,
        sentences_tagged: &[(Vec<String>, Vec<Option<UPOS>>)],
        base_tags: &[Vec<Option<UPOS>>],
    ) -> usize {
        // The scratch tagger's base dictionary is empty, which is fine:
        // `locate_patch_errors` starts from the precomputed `base_tags`
        // rather than re-running a base tagging pass.
        let mut tagger = BrillTagger::new(FreqDict::default());
        tagger.patches.push(candidate);
        let mut candidate_errors = ErrorCounter::new();
        for ((toks, tags), base) in sentences_tagged.iter().zip(base_tags.iter()) {
            tagger.locate_patch_errors(
                toks.as_slice(),
                tags.as_slice(),
                base,
                &mut candidate_errors,
            );
        }
        candidate_errors.total_errors()
    }

    /// Train a brand-new tagger on a `.conllu` dataset, provided via a path.
    /// This does not do _any_ error handling, and should not run in production.
    /// It should be used for training a model that _will_ be used in production.
    pub fn train(
        training_files: &[impl AsRef<Path>],
        epochs: usize,
        candidate_selection_chance: f32,
    ) -> Self {
        use crate::FreqDictBuilder;
        // Build the baseline frequency dictionary from the full dataset.
        let mut freq_dict_builder = FreqDictBuilder::new();
        for file in training_files {
            freq_dict_builder.inc_from_conllu_file(file);
        }
        let freq_dict = freq_dict_builder.build();
        let mut tagger = Self::new(freq_dict);
        // Each epoch adds at most one patch, so `epochs` bounds the final
        // patch count.
        for _ in 0..epochs {
            tagger.epoch(training_files, candidate_selection_chance);
        }
        tagger
    }
}

View file

@ -0,0 +1,92 @@
#[cfg(feature = "training")]
use crate::tagger::error_counter::ErrorCounter;
use crate::{UPOS, patch_criteria::PatchCriteria};
#[cfg(feature = "training")]
use hashbrown::HashSet;
use serde::{Deserialize, Serialize};
/// A single learned transformation: retag a token from `from` to `to`
/// whenever `criteria` holds at that token's position.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Patch {
    /// The tag the token must currently carry for the patch to apply.
    pub from: UPOS,
    /// The tag to rewrite the token to.
    pub to: UPOS,
    /// The contextual condition that must hold at the token's position.
    pub criteria: PatchCriteria,
}
#[cfg(feature = "training")]
impl Patch {
    /// Given a list of tagging errors, generate a collection of candidate patches that _might_ fix
    /// them. Training involves determining which candidates actually work.
    ///
    /// For every observed (wrong tag, correct tag) pair, this emits every
    /// simple criterion, plus each simple criterion combined with a
    /// word-match against the ten words most associated with errors.
    pub fn generate_candidate_patches(error_counter: &ErrorCounter) -> Vec<Patch> {
        let mut candidates = Vec::new();
        for key in error_counter.error_counts.keys() {
            candidates.extend(Self::gen_simple_candidates().into_iter().map(|c| Patch {
                from: key.was_tagged,
                to: key.correct_tag,
                criteria: c,
            }));
            for c in &Self::gen_simple_candidates() {
                for word in error_counter.word_counts.iter_top_n_words(10) {
                    // NOTE(review): the asymmetric range `-3..3` includes -3
                    // but excludes +3 — confirm that is intentional.
                    for r in -3..3 {
                        candidates.push(Patch {
                            from: key.was_tagged,
                            to: key.correct_tag,
                            criteria: PatchCriteria::Combined {
                                a: Box::new(PatchCriteria::WordIs {
                                    relative: r,
                                    word: word.to_string(),
                                }),
                                b: Box::new(c.clone()),
                            },
                        })
                    }
                }
            }
        }
        candidates
    }

    /// Candidates to be tested against a dataset during training.
    ///
    /// Deduplicated via a `HashSet`, so the output order is unspecified.
    fn gen_simple_candidates() -> Vec<PatchCriteria> {
        use strum::IntoEnumIterator;
        let mut criteria = HashSet::new();
        for upos in UPOS::iter() {
            // Single-position tag checks within four tokens either side
            // (offset 0 inspects the token under test itself).
            for i in -4..=4 {
                criteria.insert(PatchCriteria::WordIsTaggedWith {
                    relative: i,
                    is_tagged: upos,
                });
            }
            // Windowed "any token tagged" checks over the same span.
            for i in -4..=4 {
                criteria.insert(PatchCriteria::AnyWordIsTaggedWith {
                    max_relative: i,
                    is_tagged: upos,
                });
            }
            // Pairwise contexts over every ordered pair of tags.
            for upos_b in UPOS::iter() {
                criteria.insert(PatchCriteria::SandwichTaggedWith {
                    prev_word_tagged: upos,
                    post_word_tagged: upos_b,
                });
                criteria.insert(PatchCriteria::Combined {
                    a: Box::new(PatchCriteria::WordIsTaggedWith {
                        relative: 1,
                        is_tagged: upos,
                    }),
                    b: Box::new(PatchCriteria::WordIsTaggedWith {
                        relative: -2,
                        is_tagged: upos_b,
                    }),
                });
            }
        }
        criteria.into_iter().collect()
    }
}

View file

@ -0,0 +1,52 @@
use hashbrown::HashMap;
use crate::{UPOS, word_counter::WordCounter};
/// One class of tagging mistake: what the tagger produced versus what the
/// dataset says it should have produced.
#[derive(Debug, Default, Clone, Hash, PartialEq, Eq)]
pub struct ErrorKind {
    /// The (incorrect) tag the tagger assigned.
    pub was_tagged: UPOS,
    /// The tag the dataset marks as correct.
    pub correct_tag: UPOS,
}
/// Aggregated tagging-error statistics gathered during training.
#[derive(Debug, Default)]
pub struct ErrorCounter {
    /// How many times each (wrong, correct) tag pair was observed.
    pub error_counts: HashMap<ErrorKind, usize>,
    /// The number of times a word is associated with an error.
    pub word_counts: WordCounter,
}
impl ErrorCounter {
    pub fn new() -> Self {
        Self::default()
    }

    /// Record one occurrence of the given error kind, crediting `word` as the
    /// token it occurred on.
    pub fn inc(&mut self, kind: ErrorKind, word: &str) {
        *self.error_counts.entry(kind).or_insert(0) += 1;
        self.word_counts.inc(word)
    }

    /// Absorb all counts from `other` into `self`.
    pub fn merge_from(&mut self, other: Self) {
        for (kind, count) in other.error_counts {
            *self.error_counts.entry(kind).or_insert(0) += count;
        }

        for (word, count) in other.word_counts.word_counts {
            *self.word_counts.word_counts.entry(word).or_insert(0) += count;
        }
    }

    /// The total number of recorded errors across all kinds.
    pub fn total_errors(&self) -> usize {
        self.error_counts.values().sum()
    }
}

View file

@ -0,0 +1,32 @@
use hashbrown::HashMap;
use serde::{Deserialize, Serialize};
use super::Tagger;
use crate::upos::UPOS;
/// A mapping between words (normalized to lowercase) and their most common UPOS tag.
/// Can be used as a minimally accurate [`Tagger`].
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
pub struct FreqDict {
    /// Lowercased word -> its single most frequent tag in the training data.
    pub mapping: HashMap<String, UPOS>,
}
impl FreqDict {
    /// Look up the most common tag for `word`, ignoring case.
    pub fn get(&self, word: &str) -> Option<UPOS> {
        self.mapping.get(word.to_lowercase().as_str()).copied()
    }
}
impl Tagger for FreqDict {
    /// Tag each word independently by dictionary lookup; words absent from
    /// the dictionary are tagged [`None`].
    fn tag_sentence(&self, sentence: &[String]) -> Vec<Option<UPOS>> {
        sentence.iter().map(|word| self.get(word)).collect()
    }
}

View file

@ -0,0 +1,99 @@
#[cfg(feature = "training")]
use std::path::Path;
use hashbrown::{Equivalent, HashMap};
use strum::IntoEnumIterator;
use crate::{UPOS, tagger::FreqDict};
/// A mapping between words and the frequency of each UPOS.
/// If an element is missing from the map, its count is assumed to be zero.
#[derive(Debug, Default)]
pub struct FreqDictBuilder {
    /// (lowercased word, tag) -> number of observations.
    mapping: HashMap<FreqDictBuilderKey, usize>,
}
impl FreqDictBuilder {
    pub fn new() -> Self {
        Default::default()
    }

    /// Record one observation of `word` tagged as `tag`.
    /// Words are normalized to lowercase before counting.
    pub fn inc(&mut self, word: &str, tag: &UPOS) {
        let word_lower = word.to_lowercase();
        // The `(&str, &UPOS)` tuple lookup (via the `Equivalent` impl below)
        // avoids constructing an owned key unless the entry is new.
        let counter = self.mapping.get_mut(&(word_lower.as_str(), tag));
        if let Some(counter) = counter {
            *counter += 1;
        } else {
            self.mapping.insert(
                FreqDictBuilderKey {
                    word: word_lower.to_string(),
                    pos: *tag,
                },
                1,
            );
        }
    }

    // Inefficient, but effective method that gets the most used POS for a word in the map.
    // Returns none if the word does not exist in the map.
    // Ties are broken in favor of the tag that comes first in `UPOS::iter()`.
    fn most_freq_pos(&self, word: &str) -> Option<UPOS> {
        let word_lower = word.to_lowercase();
        let mut max_found: Option<(UPOS, usize)> = None;
        for pos in UPOS::iter() {
            if let Some(count) = self.mapping.get(&(word_lower.as_str(), &pos)) {
                if let Some((_, max_count)) = max_found {
                    if *count > max_count {
                        max_found = Some((pos, *count))
                    }
                } else {
                    max_found = Some((pos, *count))
                }
            }
        }
        max_found.map(|v| v.0)
    }

    /// Parse a `.conllu` file and use it to train a frequency dictionary.
    /// For error-handling purposes, this function should not be made accessible outside of training.
    #[cfg(feature = "training")]
    pub fn inc_from_conllu_file(&mut self, path: impl AsRef<Path>) {
        use crate::conllu_utils::iter_sentences_in_conllu;
        for sent in iter_sentences_in_conllu(path) {
            for token in sent.tokens {
                if let Some(upos) = token.upos.and_then(UPOS::from_conllu) {
                    self.inc(&token.form, &upos)
                }
            }
        }
    }

    /// Collapse the raw (word, tag) counts into a [`FreqDict`] mapping each
    /// word to its single most frequent tag.
    pub fn build(self) -> FreqDict {
        let mut output = HashMap::new();
        for key in self.mapping.keys() {
            // Each word appears once per observed tag; resolve it only once.
            if output.contains_key(&key.word) {
                continue;
            }
            // The unwrap is safe: `key.word` came from the map itself, so at
            // least one tag count exists for it.
            output.insert(key.word.to_string(), self.most_freq_pos(&key.word).unwrap());
        }
        FreqDict { mapping: output }
    }
}
/// Composite map key pairing a (lowercased) word with one of its observed tags.
#[derive(Debug, Eq, PartialEq, Hash)]
struct FreqDictBuilderKey {
    word: String,
    pos: UPOS,
}
/// Lets a `(&str, &UPOS)` tuple be used for lookups without allocating an
/// owned `FreqDictBuilderKey` (hashbrown's borrowed-key `Equivalent`
/// mechanism). The tuple's derived hash matches the struct's derived hash,
/// since `String` hashes like `str` and both hash `UPOS` by value.
impl Equivalent<FreqDictBuilderKey> for (&str, &UPOS) {
    fn equivalent(&self, key: &FreqDictBuilderKey) -> bool {
        self.0 == key.word && *self.1 == key.pos
    }
}

View file

@ -0,0 +1,16 @@
mod brill_tagger;
#[cfg(feature = "training")]
mod error_counter;
mod freq_dict;
mod freq_dict_builder;
use crate::UPOS;
pub use brill_tagger::BrillTagger;
pub use freq_dict::FreqDict;
pub use freq_dict_builder::FreqDictBuilder;
/// An implementer of this trait is capable of assigning Part-of-Speech tags to a provided sentence.
pub trait Tagger {
    /// Tag each word of `sentence`, yielding `None` at any position where no
    /// tag can be determined. The output is parallel to the input.
    fn tag_sentence(&self, sentence: &[String]) -> Vec<Option<UPOS>>;
}

View file

@ -0,0 +1,68 @@
use is_macro::Is;
use serde::{Deserialize, Serialize};
use strum_macros::{AsRefStr, EnumIter};
/// Represents the universal parts of speech as outlined by [universaldependencies.org](https://universaldependencies.org/u/pos/index.html).
///
/// The `X` ("other") category from the UD tag set is omitted here; see
/// [`UPOS::from_conllu`], which maps it to `None`.
#[derive(
    Debug,
    Default,
    Hash,
    Eq,
    PartialEq,
    Clone,
    Copy,
    EnumIter,
    AsRefStr,
    Serialize,
    Deserialize,
    PartialOrd,
    Ord,
    Is,
)]
pub enum UPOS {
    ADJ,
    ADP,
    ADV,
    AUX,
    CCONJ,
    DET,
    INTJ,
    #[default]
    NOUN,
    NUM,
    PART,
    PRON,
    PROPN,
    PUNCT,
    SCONJ,
    SYM,
    VERB,
}
impl UPOS {
    /// Convert a tag from the `rs_conllu` tag set.
    /// Returns [`None`] for `rs_conllu::UPOS::X` ("other"), which this enum
    /// does not model.
    pub fn from_conllu(other: rs_conllu::UPOS) -> Option<Self> {
        Some(match other {
            rs_conllu::UPOS::ADJ => UPOS::ADJ,
            rs_conllu::UPOS::ADP => UPOS::ADP,
            rs_conllu::UPOS::ADV => UPOS::ADV,
            rs_conllu::UPOS::AUX => UPOS::AUX,
            rs_conllu::UPOS::CCONJ => UPOS::CCONJ,
            rs_conllu::UPOS::DET => UPOS::DET,
            rs_conllu::UPOS::INTJ => UPOS::INTJ,
            rs_conllu::UPOS::NOUN => UPOS::NOUN,
            rs_conllu::UPOS::NUM => UPOS::NUM,
            rs_conllu::UPOS::PART => UPOS::PART,
            rs_conllu::UPOS::PRON => UPOS::PRON,
            rs_conllu::UPOS::PROPN => UPOS::PROPN,
            rs_conllu::UPOS::PUNCT => UPOS::PUNCT,
            rs_conllu::UPOS::SCONJ => UPOS::SCONJ,
            rs_conllu::UPOS::SYM => UPOS::SYM,
            rs_conllu::UPOS::VERB => UPOS::VERB,
            rs_conllu::UPOS::X => return None,
        })
    }

    /// Whether this tag is a noun or proper noun. Pronouns do not count.
    pub fn is_nominal(&self) -> bool {
        matches!(self, Self::NOUN | Self::PROPN)
    }
}

View file

@ -0,0 +1,28 @@
use hashbrown::HashMap;
/// A simple word-frequency tally used during training.
#[derive(Debug, Default)]
pub struct WordCounter {
    /// The number of times a word is associated with an error.
    pub word_counts: HashMap<String, usize>,
}
impl WordCounter {
    pub fn new() -> Self {
        Self::default()
    }

    /// Increment the count for a particular word.
    pub fn inc(&mut self, word: &str) {
        // `entry_ref` avoids allocating an owned `String` unless the word is
        // new to the map.
        *self.word_counts.entry_ref(word).or_insert(0) += 1;
    }

    /// Get an iterator over the most frequent words associated with errors.
    pub fn iter_top_n_words(&self, n: usize) -> impl Iterator<Item = &String> {
        let mut ranked: Vec<(&String, &usize)> = self.word_counts.iter().collect();
        ranked.sort_unstable_by_key(|&(_, count)| std::cmp::Reverse(*count));
        ranked.into_iter().take(n).map(|(word, _)| word)
    }
}

View file

@ -1,368 +0,0 @@
{
"ignoredLints": "{\"context_hashes\":[11327540533206285101]}",
"useWebWorker": true,
"lintSettings": {
"ACoupleMore": null,
"ALongTime": null,
"ALotWorst": null,
"APart": null,
"AWholeEntire": null,
"AdjectiveOfA": null,
"AfterAWhile": null,
"AlzheimersDisease": null,
"AmazonNames": null,
"Americas": null,
"AmountsFor": null,
"AnA": null,
"AnAnother": null,
"AndIn": null,
"AndTheLike": null,
"AnotherAn": null,
"AnotherOnes": null,
"AnotherThings": null,
"Anybody": null,
"Anyhow": null,
"Anyone": null,
"Anywhere": null,
"AppleNames": null,
"AsFarBackAs": null,
"AsOfLate": null,
"AsWell": null,
"AskNoPreposition": null,
"AtFaceValue": null,
"Australia": null,
"AvoidAndAlso": null,
"AvoidCurses": null,
"AzureNames": null,
"BackInTheDay": null,
"Backplane": null,
"BadRap": null,
"BaitedBreath": null,
"BanTogether": null,
"BareInMind": null,
"BatedBreath": null,
"BeckAndCall": null,
"BeenThere": null,
"BestRegards": null,
"BlanketStatement": null,
"BoringWords": null,
"Brutality": null,
"ByAccident": null,
"CanBeSeen": null,
"Canada": null,
"CapitalizePersonalPronouns": null,
"CaseInPoint": null,
"CaseSensitive": null,
"ChangeOfTack": null,
"ChangeTack": null,
"ChangedTack": null,
"ChangesOfTack": null,
"ChangesTack": null,
"ChangingOfTack": null,
"ChangingTack": null,
"ChineseCommunistParty": null,
"ChockFull": null,
"ClientSide": null,
"CommaFixes": null,
"CompaniesProductsAndTrademarks": null,
"CompoundNouns": null,
"CondenseAllThe": null,
"Confident": null,
"CorrectNumberSuffix": null,
"Countries": null,
"CoursingThroughVeins": null,
"CurrencyPlacement": null,
"DampSquib": null,
"Dashes": null,
"DayAndAge": null,
"DayOneNames": null,
"DefiniteArticle": null,
"DefiniteArticles": null,
"Desktop": null,
"DespiteOf": null,
"Devops": null,
"Discuss": null,
"Discussed": null,
"Discusses": null,
"Discussing": null,
"DoNotWant": null,
"DotInitialisms": null,
"EachAndEveryOne": null,
"EllipsisLength": null,
"ElsePossessive": null,
"EludedTo": null,
"EnMasse": null,
"EverPresent": null,
"Everybody": null,
"Everyday": null,
"Everyone": null,
"Everywhere": null,
"Excellent": null,
"ExpandBecause": null,
"ExpandDependencies": null,
"ExpandDependency": null,
"ExpandMinimum": null,
"ExpandStandardInput": null,
"ExpandStandardOutput": null,
"ExpandTimeShorthands": null,
"ExpandWith": null,
"ExpandWithout": null,
"Expatriate": null,
"ExplanationMark": null,
"ExplanationMarks": null,
"ExplanationPoint": null,
"FaceFirst": null,
"FairBit": null,
"FarWorse": null,
"FastPaste": null,
"FatalOutcome": null,
"FetalPosition": null,
"FirstAidKit": null,
"ForALongTime": null,
"ForAWhile": null,
"ForAllIntentsAndPurposes": null,
"ForNoun": null,
"FreeRein": null,
"Freezing": null,
"FurtherAdo": null,
"Furthermore": null,
"GetRidOff": null,
"GetsRidOff": null,
"GettingRidOff": null,
"GildedAge": null,
"GoingTo": null,
"GoogleNames": null,
"GotRidOff": null,
"GottenRidOff": null,
"GuineaBissau": null,
"HadGone": null,
"HadOf": null,
"HadPassed": null,
"HalfAnHour": null,
"Haphazard": null,
"HasGone": null,
"HasPassed": null,
"HaveGone": null,
"HavePassed": null,
"HavingGone": null,
"HavingPassed": null,
"Hedging": null,
"Henceforth": null,
"Hereby": null,
"Holidays": null,
"HomeInOn": null,
"HomedInOn": null,
"HomesInOn": null,
"HomingInOn": null,
"HopHope": null,
"HowTo": null,
"However": null,
"HumanBeings": null,
"HumanLife": null,
"HungerPang": null,
"HyphenateNumberDay": null,
"IAm": null,
"InAWhile": null,
"InAndOfItself": null,
"InAnyWay": null,
"InCase": null,
"InDetail": null,
"InMoreDetail": null,
"InNeedOf": null,
"InOneFellSwoop": null,
"InThe": null,
"InflectedVerbAfterTo": null,
"Insofar": null,
"Instead": null,
"InsteadOf": null,
"Insurmountable": null,
"Intact": null,
"Into": null,
"InvestIn": null,
"InvestedIn": null,
"InvestingIn": null,
"InvestsIn": null,
"IsKnownFor": null,
"ItCan": null,
"ItsContraction": null,
"Itself": null,
"IveGotTo": null,
"JawDropping": null,
"JetpackNames": null,
"JustDeserts": null,
"KindOf": null,
"KindRegards": null,
"Koreas": null,
"Laptop": null,
"LastButNotLeast": null,
"LastDitch": null,
"LeftRightHand": null,
"LetAlone": null,
"LetsConfusion": null,
"LikeThePlague": null,
"Likewise": null,
"LinkingVerbs": null,
"LongSentences": null,
"Malaysia": null,
"MergeWords": null,
"MetaNames": null,
"MicrosoftNames": null,
"Middleware": null,
"Misunderstand": null,
"Misunderstood": null,
"Misuse": null,
"Misused": null,
"ModalOf": null,
"Monumentous": null,
"MostNumber": null,
"MuchAdo": null,
"MuchWorse": null,
"Multicore": null,
"Multimedia": null,
"MultipleSequentialPronouns": null,
"Multithreading": null,
"MutePoint": null,
"MyHouse": null,
"Myself": null,
"NailOnTheHead": null,
"NationalCapitals": null,
"NeedHelp": null,
"NerveRacking": null,
"NoOxfordComma": null,
"Nobody": null,
"NominalWants": null,
"Nonetheless": null,
"NotIn": null,
"NotTo": null,
"NotablePlaces": null,
"Nothing": null,
"Notwithstanding": null,
"NounInsteadOfVerb": null,
"Nowhere": null,
"NumberSuffixCapitalization": null,
"OceansAndSeas": null,
"OfCourse": null,
"OffTheCuff": null,
"OldWivesTale": null,
"OnSecondThought": null,
"OnTheSpurOfTheMoment": null,
"OnceInAWhile": null,
"OneAndTheSame": null,
"OpenCompounds": null,
"OpenTheLight": null,
"OperativeSystem": null,
"OperativeSystems": null,
"OutOfDate": null,
"Overall": null,
"Overclocking": null,
"Overload": null,
"Overnight": null,
"OxfordComma": null,
"Oxymorons": null,
"PeaceOfMind": null,
"PhrasalVerbAsCompoundNoun": null,
"PiggyBag": null,
"PiggyBagged": null,
"PiggyBagging": null,
"PiqueInterest": null,
"PocketCastsNames": null,
"PointIsMoot": null,
"PointsOfView": null,
"PortAuPrince": null,
"PortoNovo": null,
"PossessiveYour": null,
"Postpone": null,
"PrayingMantis": null,
"PronounContraction": null,
"PronounKnew": null,
"Proofread": null,
"ProperNouns": null,
"RapidFire": null,
"RealTrouper": null,
"Regardless": null,
"RepeatedWords": null,
"RifeWith": null,
"RoadMap": null,
"SameAs": null,
"SaveToSafe": null,
"ScantilyClad": null,
"SentenceCapitalization": null,
"ServerSide": null,
"SimpleGrammatical": null,
"SinceDuration": null,
"SneakingSuspicion": null,
"Somebody": null,
"Somehow": null,
"Someone": null,
"SomewhatSomething": null,
"Somewhere": null,
"SoonerOrLater": null,
"Spaces": null,
"SpecialAttention": null,
"SpellCheck": null,
"SpelledNumbers": null,
"SpokeTooSoon": null,
"Starving": null,
"StateOfTheArt": null,
"SufficeItToSay": null,
"SupposedTo": null,
"TakeItPersonally": null,
"TakeItSeriously": null,
"ThatChallenged": null,
"ThatThis": null,
"ThatWhich": null,
"TheAnother": null,
"TheHowWhy": null,
"TheMy": null,
"ThenThan": null,
"ThereIsAny": null,
"Therefore": null,
"Thereupon": null,
"ThoughtProcess": null,
"ThrowRubbish": null,
"TickingTimeClock": null,
"ToDoHyphen": null,
"ToTheMannerBorn": null,
"Towards": null,
"TrialAndError": null,
"TumblrNames": null,
"TurnForTheWorse": null,
"TurnItOff": null,
"USUniversities": null,
"UnclosedQuotes": null,
"Underclock": null,
"UnitedOrganizations": null,
"Unless": null,
"Upset": null,
"Upward": null,
"UseGenitive": null,
"WantBe": null,
"WasAloud": null,
"WaveFunction": null,
"WellBeing": null,
"WellKept": null,
"WhatHeLooksLike": null,
"WhatItLooksLike": null,
"WhatSheLooksLike": null,
"WhatTheyLookLike": null,
"Whereas": null,
"Whereupon": null,
"WhetYourAppetite": null,
"WholeEntire": null,
"WidelyAccepted": null,
"Widespread": null,
"WillContain": null,
"WinPrize": null,
"WordPressDotcom": null,
"WorldWarII": null,
"Worldwide": null,
"WorseAndWorse": null,
"WorseCaseScenario": null,
"WorseThan": null,
"WorstCaseScenario": null,
"WorstEver": null
},
"userDictionary": [],
"dialect": 0,
"delay": -1
}

View file

@ -0,0 +1,8 @@
---
title: Brill Tagging
---
Harper uses Brill tagging as a refinement step to a dictionary-based [POS tagging](https://en.wikipedia.org/wiki/Part-of-speech_tagging) approach.
This method retains low latency and high throughput without bundling a large, high-entropy language model.
While documentation on this site is sparse, initial development was accompanied by [a blog post](https://elijahpotter.dev/articles/transformation-based_learning), which can hopefully explain some of the more abstract details of the process.

View file

@ -189,6 +189,10 @@ export default defineConfig({
title: 'Local Statistics',
to: '/docs/contributors/local-stats',
},
{
title: 'Brill Tagging',
to: '/docs/contributors/brill',
},
{
title: 'FAQ',
to: '/docs/contributors/faq',