mirror of
https://github.com/Automattic/harper.git
synced 2025-07-07 13:05:01 +00:00
feat(brill): train and use Brill tagger (#1344)
Co-authored-by: hippietrail <hippietrail@users.noreply.github.com>
This commit is contained in:
parent
e3e573520e
commit
db89187c3f
51 changed files with 51011 additions and 15273 deletions
119
Cargo.lock
generated
119
Cargo.lock
generated
|
@ -509,6 +509,27 @@ dependencies = [
|
|||
"parking_lot_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05"
|
||||
dependencies = [
|
||||
"derive_more-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more-impl"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs"
|
||||
version = "4.0.0"
|
||||
|
@ -784,14 +805,25 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
|
|||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "2.4.1"
|
||||
version = "2.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
|
||||
checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crunchy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harper-brill"
|
||||
version = "0.42.0"
|
||||
dependencies = [
|
||||
"harper-pos-utils",
|
||||
"lazy_static",
|
||||
"rs-conllu",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harper-cli"
|
||||
version = "0.1.0"
|
||||
|
@ -803,6 +835,7 @@ dependencies = [
|
|||
"harper-comments",
|
||||
"harper-core",
|
||||
"harper-literate-haskell",
|
||||
"harper-pos-utils",
|
||||
"harper-stats",
|
||||
"harper-typst",
|
||||
"hashbrown 0.15.4",
|
||||
|
@ -854,6 +887,7 @@ dependencies = [
|
|||
"criterion",
|
||||
"foldhash",
|
||||
"fst",
|
||||
"harper-brill",
|
||||
"hashbrown 0.15.4",
|
||||
"is-macro",
|
||||
"itertools 0.14.0",
|
||||
|
@ -866,7 +900,7 @@ dependencies = [
|
|||
"pulldown-cmark",
|
||||
"quickcheck",
|
||||
"quickcheck_macros",
|
||||
"rand",
|
||||
"rand 0.8.5",
|
||||
"rayon",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
@ -929,6 +963,20 @@ dependencies = [
|
|||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harper-pos-utils"
|
||||
version = "0.42.0"
|
||||
dependencies = [
|
||||
"hashbrown 0.15.4",
|
||||
"is-macro",
|
||||
"rand 0.9.1",
|
||||
"rayon",
|
||||
"rs-conllu",
|
||||
"serde",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harper-stats"
|
||||
version = "0.42.0"
|
||||
|
@ -1569,9 +1617,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
|||
|
||||
[[package]]
|
||||
name = "oorandom"
|
||||
version = "11.1.4"
|
||||
version = "11.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9"
|
||||
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
|
||||
|
||||
[[package]]
|
||||
name = "open"
|
||||
|
@ -1597,7 +1645,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "e2c1f9f56e534ac6a9b8a4600bdf0f530fb393b5f393e7b4d03489c3cf0c3f01"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"rand",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
]
|
||||
|
||||
|
@ -1675,7 +1723,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
|
||||
dependencies = [
|
||||
"phf_shared",
|
||||
"rand",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1769,7 +1817,7 @@ checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
|
|||
dependencies = [
|
||||
"env_logger",
|
||||
"log",
|
||||
"rand",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1809,7 +1857,7 @@ checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d"
|
|||
dependencies = [
|
||||
"bytes",
|
||||
"getrandom 0.2.15",
|
||||
"rand",
|
||||
"rand 0.8.5",
|
||||
"ring",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
|
@ -1857,11 +1905,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
"rand_chacha 0.3.1",
|
||||
"rand_core 0.6.4",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
|
||||
dependencies = [
|
||||
"rand_chacha 0.9.0",
|
||||
"rand_core 0.9.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
|
@ -1869,7 +1927,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.9.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1882,6 +1950,15 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
|
||||
dependencies = [
|
||||
"getrandom 0.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.10.0"
|
||||
|
@ -2023,6 +2100,18 @@ dependencies = [
|
|||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rs-conllu"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6de5aecf17f8fff1b35d59a12e2b8c908cad4d67208805166483655554f9169"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"derive_more",
|
||||
"thiserror 1.0.69",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-demangle"
|
||||
version = "0.1.24"
|
||||
|
@ -2953,6 +3042,12 @@ version = "0.2.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
|
||||
|
||||
[[package]]
|
||||
name = "unscanny"
|
||||
version = "0.1.0"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
[workspace]
|
||||
members = [ "harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst" , "harper-stats"]
|
||||
members = [ "harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst" , "harper-stats", "harper-pos-utils", "harper-brill"]
|
||||
resolver = "2"
|
||||
|
||||
# Comment out the below lines if you plan to use a debugger.
|
||||
|
|
16
harper-brill/Cargo.toml
Normal file
16
harper-brill/Cargo.toml
Normal file
|
@ -0,0 +1,16 @@
|
|||
[package]
|
||||
name = "harper-brill"
|
||||
version = "0.42.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
harper-pos-utils = { path = "../harper-pos-utils/", version = "0.42.0" }
|
||||
lazy_static = "1.5.0"
|
||||
rs-conllu = "0.3.0"
|
||||
serde = "1.0.219"
|
||||
serde_json = "1.0.140"
|
||||
|
||||
[build-dependencies]
|
||||
rs-conllu = "0.3.0"
|
||||
serde = "1.0.219"
|
||||
serde_json = "1.0.140"
|
32
harper-brill/src/lib.rs
Normal file
32
harper-brill/src/lib.rs
Normal file
|
@ -0,0 +1,32 @@
|
|||
use lazy_static::lazy_static;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub use harper_pos_utils::{BrillChunker, BrillTagger, Chunker, FreqDict, Tagger, UPOS};
|
||||
|
||||
const BRILL_TAGGER_SOURCE: &str = include_str!("../trained_tagger_model.json");
|
||||
|
||||
lazy_static! {
|
||||
static ref BRILL_TAGGER: Arc<BrillTagger<FreqDict>> = Arc::new(uncached_brill_tagger());
|
||||
}
|
||||
|
||||
fn uncached_brill_tagger() -> BrillTagger<FreqDict> {
|
||||
serde_json::from_str(BRILL_TAGGER_SOURCE).unwrap()
|
||||
}
|
||||
|
||||
pub fn brill_tagger() -> Arc<BrillTagger<FreqDict>> {
|
||||
(*BRILL_TAGGER).clone()
|
||||
}
|
||||
|
||||
const BRILL_CHUNKER_SOURCE: &str = include_str!("../trained_chunker_model.json");
|
||||
|
||||
lazy_static! {
|
||||
static ref BRILL_CHUNKER: Arc<BrillChunker> = Arc::new(uncached_brill_chunker());
|
||||
}
|
||||
|
||||
fn uncached_brill_chunker() -> BrillChunker {
|
||||
serde_json::from_str(BRILL_CHUNKER_SOURCE).unwrap()
|
||||
}
|
||||
|
||||
pub fn brill_chunker() -> Arc<BrillChunker> {
|
||||
(*BRILL_CHUNKER).clone()
|
||||
}
|
3724
harper-brill/trained_chunker_model.json
Normal file
3724
harper-brill/trained_chunker_model.json
Normal file
File diff suppressed because it is too large
Load diff
30448
harper-brill/trained_tagger_model.json
Normal file
30448
harper-brill/trained_tagger_model.json
Normal file
File diff suppressed because it is too large
Load diff
|
@ -13,6 +13,7 @@ harper-stats = { path = "../harper-stats", version = "0.42.0" }
|
|||
dirs = "6.0.0"
|
||||
harper-literate-haskell = { path = "../harper-literate-haskell", version = "0.42.0" }
|
||||
harper-core = { path = "../harper-core", version = "0.42.0" }
|
||||
harper-pos-utils = { path = "../harper-pos-utils", version = "0.42.0", features = ["training", "threaded"] }
|
||||
harper-comments = { path = "../harper-comments", version = "0.42.0" }
|
||||
harper-typst = { path = "../harper-typst", version = "0.42.0" }
|
||||
hashbrown = "0.15.4"
|
||||
|
|
|
@ -20,6 +20,7 @@ use harper_core::{
|
|||
MutableDictionary, TokenKind, TokenStringExt, WordId, WordMetadata,
|
||||
};
|
||||
use harper_literate_haskell::LiterateHaskellParser;
|
||||
use harper_pos_utils::{BrillChunker, BrillTagger};
|
||||
use harper_stats::Stats;
|
||||
use serde::Serialize;
|
||||
|
||||
|
@ -77,6 +78,28 @@ enum Args {
|
|||
/// The document to mine words from.
|
||||
file: PathBuf,
|
||||
},
|
||||
TrainBrillTagger {
|
||||
#[arg(short, long, default_value = "1.0")]
|
||||
candidate_selection_chance: f32,
|
||||
/// The path to write the final JSON model file to.
|
||||
output: PathBuf,
|
||||
/// The number of epochs (and patch rules) to train.
|
||||
epochs: usize,
|
||||
/// Path to a `.conllu` dataset to train on.
|
||||
#[arg(num_args = 1..)]
|
||||
datasets: Vec<PathBuf>,
|
||||
},
|
||||
TrainBrillChunker {
|
||||
#[arg(short, long, default_value = "1.0")]
|
||||
candidate_selection_chance: f32,
|
||||
/// The path to write the final JSON model file to.
|
||||
output: PathBuf,
|
||||
/// The number of epochs (and patch rules) to train.
|
||||
epochs: usize,
|
||||
/// Path to a `.conllu` dataset to train on.
|
||||
#[arg(num_args = 1..)]
|
||||
datasets: Vec<PathBuf>,
|
||||
},
|
||||
/// Print harper-core version.
|
||||
CoreVersion,
|
||||
/// Rename a flag in the dictionary and affixes.
|
||||
|
@ -91,6 +114,8 @@ enum Args {
|
|||
/// Emit a decompressed, line-separated list of the compounds in Harper's dictionary.
|
||||
/// As long as there's either an open or hyphenated spelling.
|
||||
Compounds,
|
||||
/// Provided a sentence or phrase, emit a list of each noun phrase contained within.
|
||||
NominalPhrases { input: String },
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
|
@ -380,6 +405,27 @@ fn main() -> anyhow::Result<()> {
|
|||
println!("harper-core v{}", harper_core::core_version());
|
||||
Ok(())
|
||||
}
|
||||
Args::TrainBrillTagger {
|
||||
datasets: dataset,
|
||||
epochs,
|
||||
output,
|
||||
candidate_selection_chance,
|
||||
} => {
|
||||
let tagger = BrillTagger::train(&dataset, epochs, candidate_selection_chance);
|
||||
fs::write(output, serde_json::to_string_pretty(&tagger)?)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Args::TrainBrillChunker {
|
||||
datasets,
|
||||
epochs,
|
||||
output,
|
||||
candidate_selection_chance,
|
||||
} => {
|
||||
let chunker = BrillChunker::train(&datasets, epochs, candidate_selection_chance);
|
||||
fs::write(output, serde_json::to_string_pretty(&chunker)?)?;
|
||||
Ok(())
|
||||
}
|
||||
Args::RenameFlag { old, new, dir } => {
|
||||
use serde_json::Value;
|
||||
|
||||
|
@ -547,6 +593,18 @@ fn main() -> anyhow::Result<()> {
|
|||
println!("\nFound {} compound word groups", results.len());
|
||||
Ok(())
|
||||
}
|
||||
Args::NominalPhrases { input } => {
|
||||
let doc = Document::new_markdown_default_curated(&input);
|
||||
|
||||
for phrase in doc.iter_nominal_phrases() {
|
||||
let s =
|
||||
doc.get_span_content_str(&phrase.span().ok_or(anyhow!("Unable to get span"))?);
|
||||
|
||||
println!("{s}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -562,6 +620,7 @@ fn load_file(
|
|||
.map(|v| v.to_str().unwrap())
|
||||
{
|
||||
Some("md") => Box::new(Markdown::default()),
|
||||
|
||||
Some("lhs") => Box::new(LiterateHaskellParser::new_markdown(
|
||||
MarkdownOptions::default(),
|
||||
)),
|
||||
|
|
|
@ -31,6 +31,7 @@ foldhash = "0.1.5"
|
|||
strum_macros = "0.27.1"
|
||||
strum = "0.27.1"
|
||||
ammonia = "4.1.0"
|
||||
harper-brill = { path = "../harper-brill", version = "0.42.0" }
|
||||
bitflags = { version = "2.9.1", features = ["serde"] }
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
@ -2,6 +2,7 @@ use std::cmp::Ordering;
|
|||
use std::collections::VecDeque;
|
||||
use std::fmt::Display;
|
||||
|
||||
use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
|
||||
use paste::paste;
|
||||
|
||||
use crate::expr::{Expr, ExprExt, LongestMatchOf, Repeating, SequenceExpr};
|
||||
|
@ -9,10 +10,8 @@ use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
|
|||
use crate::patterns::WordSet;
|
||||
use crate::punctuation::Punctuation;
|
||||
use crate::vec_ext::VecExt;
|
||||
use crate::word_metadata::AdjectiveData;
|
||||
use crate::{
|
||||
Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, NounData, Token, TokenKind,
|
||||
TokenStringExt,
|
||||
Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt,
|
||||
};
|
||||
use crate::{OrdinalSuffix, Span};
|
||||
|
||||
|
@ -140,107 +139,34 @@ impl Document {
|
|||
self.condense_ellipsis();
|
||||
self.condense_latin();
|
||||
self.match_quotes();
|
||||
self.articles_imply_nouns();
|
||||
|
||||
// annotate word metadata
|
||||
let token_strings: Vec<_> = self
|
||||
.tokens
|
||||
.iter()
|
||||
.filter(|t| !t.kind.is_whitespace())
|
||||
.map(|t| self.get_span_content_str(&t.span))
|
||||
.collect();
|
||||
|
||||
let token_tags = brill_tagger().tag_sentence(&token_strings);
|
||||
let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);
|
||||
|
||||
let mut i = 0;
|
||||
|
||||
// Annotate word metadata
|
||||
for token in self.tokens.iter_mut() {
|
||||
if let TokenKind::Word(meta) = &mut token.kind {
|
||||
let word_source = token.span.get_content(&self.source);
|
||||
let found_meta = dictionary.get_word_metadata(word_source);
|
||||
*meta = found_meta.cloned()
|
||||
}
|
||||
}
|
||||
let mut found_meta = dictionary.get_word_metadata(word_source).cloned();
|
||||
|
||||
// refine and disambiguate word metadata
|
||||
self.known_preposition();
|
||||
self.articles_imply_not_verb();
|
||||
}
|
||||
|
||||
fn uncached_article_expr() -> Lrc<SequenceExpr> {
|
||||
Lrc::new(
|
||||
SequenceExpr::default()
|
||||
.then_determiner()
|
||||
.then_whitespace()
|
||||
.then(|t: &Token, _source: &[char]| t.kind.is_adjective() && t.kind.is_noun())
|
||||
.then_whitespace()
|
||||
.then_noun(),
|
||||
)
|
||||
}
|
||||
|
||||
thread_local! {static ARTICLE_EXPR: Lrc<SequenceExpr> = Document::uncached_article_expr()}
|
||||
|
||||
/// When a word that is either an adjective or a noun is sandwiched between an article and a noun,
|
||||
/// it definitely is not a noun.
|
||||
fn articles_imply_nouns(&mut self) {
|
||||
let expr = Self::ARTICLE_EXPR.with(|v| v.clone());
|
||||
|
||||
for m in expr.iter_matches_in_doc(self).collect::<Vec<_>>() {
|
||||
if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start + 2].kind {
|
||||
metadata.noun = None;
|
||||
metadata.verb = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A proposition-like word followed by a determiner or number is typically
|
||||
/// really a preposition.
|
||||
fn known_preposition(&mut self) {
|
||||
fn create_expr() -> Lrc<SequenceExpr> {
|
||||
Lrc::new(
|
||||
SequenceExpr::default()
|
||||
.then(WordSet::new(&["in", "at", "on", "to", "for", "by", "with"]))
|
||||
.then_whitespace()
|
||||
.then(|t: &Token, _source: &[char]| {
|
||||
t.kind.is_determiner() || t.kind.is_number()
|
||||
}),
|
||||
)
|
||||
}
|
||||
thread_local! {static EXPR: Lrc<SequenceExpr> = create_expr()}
|
||||
|
||||
let expr = EXPR.with(|v| v.clone());
|
||||
|
||||
for m in expr.iter_matches_in_doc(self).collect::<Vec<_>>() {
|
||||
if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start].kind {
|
||||
metadata.noun = None;
|
||||
metadata.pronoun = None;
|
||||
metadata.verb = None;
|
||||
metadata.adjective = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The first word after an article cannot be a verb.
|
||||
fn articles_imply_not_verb(&mut self) {
|
||||
fn create_pattern() -> Lrc<SequenceExpr> {
|
||||
Lrc::new(
|
||||
SequenceExpr::default()
|
||||
.then(WordSet::new(&[
|
||||
// articles
|
||||
"a", "an", "the",
|
||||
// Dependent genitive pronouns serve a similar role to articles.
|
||||
// Unfortunately, some overlap with other pronoun forms. E.g.
|
||||
// "I like her", "Something about her struck me as odd."
|
||||
"my", "your", "thy", "thine", "his", /*"her",*/ "its", "our", "their",
|
||||
"whose", // "no" is also a determiner
|
||||
"no",
|
||||
]))
|
||||
.then_whitespace()
|
||||
.then_verb(),
|
||||
)
|
||||
}
|
||||
thread_local! {static EXPR: Lrc<SequenceExpr> = create_pattern()}
|
||||
let expr = EXPR.with(|v| v.clone());
|
||||
|
||||
for m in expr.iter_matches_in_doc(self).collect::<Vec<_>>() {
|
||||
if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.end - 1].kind {
|
||||
if metadata.noun.is_none()
|
||||
&& metadata.adjective.is_none()
|
||||
&& metadata.adverb.is_none()
|
||||
{
|
||||
metadata.noun = Some(NounData::default());
|
||||
metadata.adjective = Some(AdjectiveData::default());
|
||||
if let Some(inner) = &mut found_meta {
|
||||
inner.pos_tag = token_tags[i];
|
||||
inner.np_member = Some(np_flags[i]);
|
||||
}
|
||||
metadata.verb = None;
|
||||
|
||||
*meta = found_meta;
|
||||
i += 1;
|
||||
} else if !token.kind.is_whitespace() {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -331,6 +257,40 @@ impl Document {
|
|||
self.tokens.iter()
|
||||
}
|
||||
|
||||
pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
|
||||
fn is_np_member(t: &Token) -> bool {
|
||||
t.kind
|
||||
.as_word()
|
||||
.and_then(|x| x.as_ref())
|
||||
.and_then(|w| w.np_member)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn trim(slice: &[Token]) -> &[Token] {
|
||||
let mut start = 0;
|
||||
let mut end = slice.len();
|
||||
while start < end && slice[start].kind.is_whitespace() {
|
||||
start += 1;
|
||||
}
|
||||
while end > start && slice[end - 1].kind.is_whitespace() {
|
||||
end -= 1;
|
||||
}
|
||||
&slice[start..end]
|
||||
}
|
||||
|
||||
self.tokens
|
||||
.as_slice()
|
||||
.split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
|
||||
.filter_map(|s| {
|
||||
let s = trim(s);
|
||||
if s.iter().any(is_np_member) {
|
||||
Some(s)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Get an iterator over all the tokens contained in the document.
|
||||
pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
|
||||
self.tokens().map(|token| token.to_fat(&self.source))
|
||||
|
|
|
@ -12,13 +12,9 @@ pub(crate) fn is_content_word(tok: &Token, src: &[char]) -> bool {
|
|||
};
|
||||
|
||||
tok.span.len() > 1
|
||||
&& (meta.is_noun() || meta.is_adjective())
|
||||
&& (meta.is_noun() || meta.is_adjective() || meta.is_verb() || meta.is_adverb())
|
||||
&& !meta.determiner
|
||||
&& (!meta.preposition || tok.span.get_content_string(src).to_lowercase() == "bar")
|
||||
&& !meta.is_adverb()
|
||||
&& !meta.is_conjunction()
|
||||
&& !meta.is_pronoun()
|
||||
&& !meta.is_auxiliary_verb()
|
||||
}
|
||||
|
||||
pub(crate) fn predicate(closed: Option<&WordMetadata>, open: Option<&WordMetadata>) -> bool {
|
||||
|
|
|
@ -1,9 +1,15 @@
|
|||
use harper_brill::UPOS;
|
||||
|
||||
use crate::expr::All;
|
||||
use crate::expr::Expr;
|
||||
use crate::expr::SequenceExpr;
|
||||
use crate::patterns::NominalPhrase;
|
||||
use crate::patterns::Pattern;
|
||||
use crate::patterns::UPOSSet;
|
||||
use crate::patterns::WordSet;
|
||||
use crate::{
|
||||
Token,
|
||||
linting::{ExprLinter, Lint, LintKind, Suggestion},
|
||||
patterns::WordSet,
|
||||
};
|
||||
|
||||
pub struct ItsContraction {
|
||||
|
@ -12,14 +18,22 @@ pub struct ItsContraction {
|
|||
|
||||
impl Default for ItsContraction {
|
||||
fn default() -> Self {
|
||||
let its = WordSet::new(&["its"]);
|
||||
let verbs = WordSet::new(&["had", "been", "got"]);
|
||||
let pattern = SequenceExpr::default()
|
||||
.then(its)
|
||||
let positive = SequenceExpr::default()
|
||||
.t_aco("its")
|
||||
.then_whitespace()
|
||||
.then(verbs);
|
||||
.then(UPOSSet::new(&[UPOS::VERB, UPOS::AUX]));
|
||||
|
||||
let exceptions = SequenceExpr::default()
|
||||
.then_anything()
|
||||
.then_anything()
|
||||
.then(WordSet::new(&["own", "intended"]));
|
||||
|
||||
let inverted = SequenceExpr::default().if_not_then_step_one(exceptions);
|
||||
|
||||
let expr = All::new(vec![Box::new(positive), Box::new(inverted)]);
|
||||
|
||||
Self {
|
||||
expr: Box::new(pattern),
|
||||
expr: Box::new(expr),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -32,6 +46,13 @@ impl ExprLinter for ItsContraction {
|
|||
fn match_to_lint(&self, toks: &[Token], source: &[char]) -> Option<Lint> {
|
||||
let offender = toks.first()?;
|
||||
let offender_chars = offender.span.get_content(source);
|
||||
|
||||
if !toks.get(2)?.kind.is_upos(UPOS::AUX)
|
||||
&& NominalPhrase.matches(&toks[2..], source).is_some()
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(Lint {
|
||||
span: offender.span,
|
||||
lint_kind: LintKind::WordChoice,
|
||||
|
@ -39,7 +60,8 @@ impl ExprLinter for ItsContraction {
|
|||
Suggestion::replace_with_match_case_str("it's", offender_chars),
|
||||
Suggestion::replace_with_match_case_str("it has", offender_chars),
|
||||
],
|
||||
message: "Use `it's` (short for `it has`) here, not the possessive `its`.".to_owned(),
|
||||
message: "Use `it's` (short for `it has` or `it is`) here, not the possessive `its`."
|
||||
.to_owned(),
|
||||
priority: 54,
|
||||
})
|
||||
}
|
||||
|
@ -98,4 +120,13 @@ mod tests {
|
|||
0,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ignore_coroutine() {
|
||||
assert_lint_count(
|
||||
"Launch each task within its own child coroutine.",
|
||||
ItsContraction::default(),
|
||||
0,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -54,8 +54,7 @@ impl ThenThan {
|
|||
|
||||
// TODO: This can be simplified or eliminated when the adjective improvements make it into the affix system.
|
||||
fn is_comparative_adjective(tok: &Token, source: &[char]) -> bool {
|
||||
tok.kind
|
||||
.is_adjective()
|
||||
(tok.kind.is_adjective() || tok.kind.is_adverb())
|
||||
.then(|| tok.span.get_content(source))
|
||||
.is_some_and(|src| {
|
||||
// Regular comparative form?
|
||||
|
|
|
@ -13,6 +13,7 @@ mod indefinite_article;
|
|||
mod inflection_of_be;
|
||||
mod invert;
|
||||
mod nominal_phrase;
|
||||
mod upos_set;
|
||||
mod whitespace_pattern;
|
||||
mod within_edit_distance;
|
||||
mod word;
|
||||
|
@ -24,6 +25,7 @@ pub use indefinite_article::IndefiniteArticle;
|
|||
pub use inflection_of_be::InflectionOfBe;
|
||||
pub use invert::Invert;
|
||||
pub use nominal_phrase::NominalPhrase;
|
||||
pub use upos_set::UPOSSet;
|
||||
pub use whitespace_pattern::WhitespacePattern;
|
||||
pub use within_edit_distance::WithinEditDistance;
|
||||
pub use word::Word;
|
||||
|
|
30
harper-core/src/patterns/upos_set.rs
Normal file
30
harper-core/src/patterns/upos_set.rs
Normal file
|
@ -0,0 +1,30 @@
|
|||
use harper_brill::UPOS;
|
||||
use smallvec::{SmallVec, ToSmallVec};
|
||||
|
||||
use crate::Token;
|
||||
|
||||
use super::Pattern;
|
||||
|
||||
pub struct UPOSSet {
|
||||
allowed_tags: SmallVec<[UPOS; 10]>,
|
||||
}
|
||||
|
||||
impl UPOSSet {
|
||||
pub fn new(allowed: &[UPOS]) -> Self {
|
||||
Self {
|
||||
allowed_tags: allowed.to_smallvec(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Pattern for UPOSSet {
|
||||
fn matches(&self, tokens: &[Token], _source: &[char]) -> Option<usize> {
|
||||
tokens.first()?.kind.as_word()?.as_ref().and_then(|w| {
|
||||
if self.allowed_tags.contains(&(w.pos_tag?)) {
|
||||
Some(1)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
|
@ -1,3 +1,4 @@
|
|||
use harper_brill::UPOS;
|
||||
use is_macro::Is;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
|
@ -447,4 +448,12 @@ impl TokenKind {
|
|||
pub fn is_whitespace(&self) -> bool {
|
||||
matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
|
||||
}
|
||||
|
||||
pub fn is_upos(&self, upos: UPOS) -> bool {
|
||||
let Some(Some(meta)) = self.as_word() else {
|
||||
return false;
|
||||
};
|
||||
|
||||
meta.pos_tag == Some(upos)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
use harper_brill::UPOS;
|
||||
use is_macro::Is;
|
||||
use paste::paste;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
@ -32,6 +33,10 @@ pub struct WordMetadata {
|
|||
pub common: bool,
|
||||
#[serde(default = "default_none")]
|
||||
pub derived_from: Option<WordId>,
|
||||
/// Generated by a chunker
|
||||
pub np_member: Option<bool>,
|
||||
/// Generated by a POS tagger
|
||||
pub pos_tag: Option<UPOS>,
|
||||
}
|
||||
|
||||
/// Needed for `serde`
|
||||
|
@ -120,6 +125,180 @@ impl WordMetadata {
|
|||
preposition: self.preposition || other.preposition,
|
||||
common: self.common || other.common,
|
||||
derived_from: self.derived_from.or(other.derived_from),
|
||||
pos_tag: self.pos_tag.or(other.pos_tag),
|
||||
np_member: self.np_member.or(other.np_member),
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a UPOS tag, discard any metadata that would disagree with the given POS tag.
|
||||
/// For example, if the metadata suggests a word could either be a noun or an adjective, and we
|
||||
/// provide a [`UPOS::NOUN`], this function will remove the adjective data.
|
||||
///
|
||||
/// Additionally, if the metadata does not currently declare the potential of the word to be
|
||||
/// the specific POS, it becomes so. That means if we provide a [`UPOS::ADJ`] to the function
|
||||
/// for a metadata whose `Self::adjective = None`, it will become `Some`.
|
||||
pub fn enforce_pos_exclusivity(&mut self, pos: &UPOS) {
|
||||
use UPOS::*;
|
||||
match pos {
|
||||
NOUN => {
|
||||
if let Some(noun) = self.noun {
|
||||
self.noun = Some(NounData {
|
||||
is_proper: Some(false),
|
||||
..noun
|
||||
})
|
||||
} else {
|
||||
self.noun = Some(NounData {
|
||||
is_proper: Some(false),
|
||||
is_plural: None,
|
||||
is_possessive: None,
|
||||
})
|
||||
}
|
||||
|
||||
self.pronoun = None;
|
||||
self.verb = None;
|
||||
self.adjective = None;
|
||||
self.adverb = None;
|
||||
self.conjunction = None;
|
||||
self.determiner = false;
|
||||
self.preposition = false;
|
||||
}
|
||||
PROPN => {
|
||||
if let Some(noun) = self.noun {
|
||||
self.noun = Some(NounData {
|
||||
is_proper: Some(true),
|
||||
..noun
|
||||
})
|
||||
} else {
|
||||
self.noun = Some(NounData {
|
||||
is_proper: Some(true),
|
||||
is_plural: None,
|
||||
is_possessive: None,
|
||||
})
|
||||
}
|
||||
|
||||
self.pronoun = None;
|
||||
self.verb = None;
|
||||
self.adjective = None;
|
||||
self.adverb = None;
|
||||
self.conjunction = None;
|
||||
self.determiner = false;
|
||||
self.preposition = false;
|
||||
}
|
||||
PRON => {
|
||||
if self.pronoun.is_none() {
|
||||
self.pronoun = Some(PronounData::default())
|
||||
}
|
||||
|
||||
self.noun = None;
|
||||
self.verb = None;
|
||||
self.adjective = None;
|
||||
self.adverb = None;
|
||||
self.conjunction = None;
|
||||
self.determiner = false;
|
||||
self.preposition = false;
|
||||
}
|
||||
VERB => {
|
||||
if let Some(verb) = self.verb {
|
||||
self.verb = Some(VerbData {
|
||||
is_auxiliary: Some(false),
|
||||
..verb
|
||||
})
|
||||
} else {
|
||||
self.verb = Some(VerbData {
|
||||
is_auxiliary: Some(false),
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
|
||||
self.noun = None;
|
||||
self.pronoun = None;
|
||||
self.adjective = None;
|
||||
self.adverb = None;
|
||||
self.conjunction = None;
|
||||
self.determiner = false;
|
||||
self.preposition = false;
|
||||
}
|
||||
AUX => {
|
||||
if let Some(verb) = self.verb {
|
||||
self.verb = Some(VerbData {
|
||||
is_auxiliary: Some(true),
|
||||
..verb
|
||||
})
|
||||
} else {
|
||||
self.verb = Some(VerbData {
|
||||
is_auxiliary: Some(true),
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
|
||||
self.noun = None;
|
||||
self.pronoun = None;
|
||||
self.adjective = None;
|
||||
self.adverb = None;
|
||||
self.conjunction = None;
|
||||
self.determiner = false;
|
||||
self.preposition = false;
|
||||
}
|
||||
ADJ => {
|
||||
if self.adjective.is_none() {
|
||||
self.adjective = Some(AdjectiveData::default())
|
||||
}
|
||||
|
||||
self.noun = None;
|
||||
self.pronoun = None;
|
||||
self.verb = None;
|
||||
self.adverb = None;
|
||||
self.conjunction = None;
|
||||
self.determiner = false;
|
||||
self.preposition = false;
|
||||
}
|
||||
ADV => {
|
||||
if self.adverb.is_none() {
|
||||
self.adverb = Some(AdverbData::default())
|
||||
}
|
||||
|
||||
self.noun = None;
|
||||
self.pronoun = None;
|
||||
self.verb = None;
|
||||
self.adjective = None;
|
||||
self.conjunction = None;
|
||||
self.determiner = false;
|
||||
self.preposition = false;
|
||||
}
|
||||
ADP => {
|
||||
self.noun = None;
|
||||
self.pronoun = None;
|
||||
self.verb = None;
|
||||
self.adjective = None;
|
||||
self.adverb = None;
|
||||
self.conjunction = None;
|
||||
self.determiner = false;
|
||||
self.preposition = true;
|
||||
}
|
||||
DET => {
|
||||
self.noun = None;
|
||||
self.pronoun = None;
|
||||
self.verb = None;
|
||||
self.adjective = None;
|
||||
self.adverb = None;
|
||||
self.conjunction = None;
|
||||
self.preposition = false;
|
||||
self.determiner = true;
|
||||
}
|
||||
CCONJ | SCONJ => {
|
||||
if self.conjunction.is_none() {
|
||||
self.conjunction = Some(ConjunctionData::default())
|
||||
}
|
||||
|
||||
self.noun = None;
|
||||
self.pronoun = None;
|
||||
self.verb = None;
|
||||
self.adjective = None;
|
||||
self.adverb = None;
|
||||
self.determiner = false;
|
||||
self.preposition = false;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -47,6 +47,7 @@
|
|||
//! - Determiners are denoted by `D`.
|
||||
//! - Prepositions are denoted by `P`.
|
||||
//! - Dialects are denoted by `Am`, `Br`, `Ca`, or `Au`.
|
||||
//! - Noun phrase membership is denoted by `+`
|
||||
//!
|
||||
//! The tagger supports uncertainty, so a single word can be e.g. both a
|
||||
//! noun and a verb. This is denoted by a `/` between the tags.
|
||||
|
@ -146,6 +147,8 @@ fn format_word_tag(word: &WordMetadata) -> String {
|
|||
}
|
||||
});
|
||||
|
||||
add_switch(&mut tags, word.np_member, "+", "");
|
||||
|
||||
if tags.is_empty() {
|
||||
String::from("W?")
|
||||
} else {
|
||||
|
|
|
@ -85,7 +85,7 @@ create_test!(pr_452.md, 2, Dialect::American);
|
|||
create_test!(hex_basic_clean.md, 0, Dialect::American);
|
||||
create_test!(hex_basic_dirty.md, 1, Dialect::American);
|
||||
create_test!(misc_closed_compound_clean.md, 0, Dialect::American);
|
||||
create_test!(yogurt_british_clean.md, 0, Dialect::British);
|
||||
create_test!(yogurt_british_clean.md, 1, Dialect::British);
|
||||
|
||||
// Make sure it doesn't panic
|
||||
create_test!(lukas_homework.md, 3, Dialect::American);
|
||||
|
|
|
@ -456,6 +456,15 @@ Message: |
|
|||
|
||||
|
||||
|
||||
Lint: Capitalization (31 priority)
|
||||
Message: |
|
||||
226 | himself as he came, “Oh! the Duchess, the Duchess! Oh! won’t she be savage if
|
||||
| ^~~ This sentence does not start with a capital letter
|
||||
Suggest:
|
||||
- Replace with: “The”
|
||||
|
||||
|
||||
|
||||
Lint: Capitalization (31 priority)
|
||||
Message: |
|
||||
226 | himself as he came, “Oh! the Duchess, the Duchess! Oh! won’t she be savage if
|
||||
|
|
|
@ -209,6 +209,24 @@ Suggest:
|
|||
|
||||
|
||||
|
||||
Lint: Capitalization (31 priority)
|
||||
Message: |
|
||||
340 | on the left, on the right, on the side, on the bottom.
|
||||
| ^~ This sentence does not start with a capital letter
|
||||
Suggest:
|
||||
- Replace with: “On”
|
||||
|
||||
|
||||
|
||||
Lint: Capitalization (31 priority)
|
||||
Message: |
|
||||
342 | on a bus, on a train, on a plane, on a ferry, on a yacht.
|
||||
| ^~ This sentence does not start with a capital letter
|
||||
Suggest:
|
||||
- Replace with: “On”
|
||||
|
||||
|
||||
|
||||
Lint: Miscellaneous (31 priority)
|
||||
Message: |
|
||||
343 | All of the responsibility is on him.
|
||||
|
|
|
@ -204,6 +204,16 @@ Message: |
|
|||
|
||||
|
||||
|
||||
Lint: WordChoice (63 priority)
|
||||
Message: |
|
||||
89 | third Class at the Expiration of the sixth Year, so that one third may be
|
||||
| ^~~~~~ Did you mean the closed compound noun “maybe”?
|
||||
90 | chosen every second Year; and when vacancies happen in the representation of
|
||||
Suggest:
|
||||
- Replace with: “maybe”
|
||||
|
||||
|
||||
|
||||
Lint: Readability (127 priority)
|
||||
Message: |
|
||||
96 | No Person shall be a Senator who shall not have attained to the Age of thirty
|
||||
|
@ -1541,6 +1551,16 @@ Message: |
|
|||
|
||||
|
||||
|
||||
Lint: WordChoice (63 priority)
|
||||
Message: |
|
||||
658 | questioned. But neither the United States nor any State shall assume or pay any
|
||||
659 | debt or obligation incurred in aid of insurrection or rebellion against the
|
||||
| ^~~~~~~ Did you mean the closed compound noun “debtor”?
|
||||
Suggest:
|
||||
- Replace with: “debtor”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
663 | ## Article. V.
|
||||
|
|
|
@ -1949,6 +1949,16 @@ Suggest:
|
|||
|
||||
|
||||
|
||||
Lint: WordChoice (63 priority)
|
||||
Message: |
|
||||
1531 | puppyish, convivial way, girls were swooning backward playfully into men’s arms,
|
||||
1532 | even into groups, knowing that some one would arrest their falls—but no one
|
||||
| ^~~~~~~~ Did you mean the closed compound noun “someone”?
|
||||
Suggest:
|
||||
- Replace with: “someone”
|
||||
|
||||
|
||||
|
||||
Lint: Miscellaneous (31 priority)
|
||||
Message: |
|
||||
1531 | puppyish, convivial way, girls were swooning backward playfully into men’s arms,
|
||||
|
@ -6441,6 +6451,16 @@ Suggest:
|
|||
|
||||
|
||||
|
||||
Lint: WordChoice (63 priority)
|
||||
Message: |
|
||||
5181 | easier, surer way of finding out what he wanted to know. By half-past two he was
|
||||
5182 | in West Egg, where he asked some one the way to Gatsby’s house. So by that time
|
||||
| ^~~~~~~~ Did you mean the closed compound noun “someone”?
|
||||
Suggest:
|
||||
- Replace with: “someone”
|
||||
|
||||
|
||||
|
||||
Lint: Miscellaneous (31 priority)
|
||||
Message: |
|
||||
5181 | easier, surer way of finding out what he wanted to know. By half-past two he was
|
||||
|
@ -7123,6 +7143,16 @@ Suggest:
|
|||
|
||||
|
||||
|
||||
Lint: WordChoice (63 priority)
|
||||
Message: |
|
||||
5642 | message or a flower. Dimly I heard some one murmur “Blessed are the dead that
|
||||
| ^~~~~~~~ Did you mean the closed compound noun “someone”?
|
||||
5643 | the rain falls on,” and then the owl-eyed man said “Amen to that,” in a brave
|
||||
Suggest:
|
||||
- Replace with: “someone”
|
||||
|
||||
|
||||
|
||||
Lint: Miscellaneous (31 priority)
|
||||
Message: |
|
||||
5642 | message or a flower. Dimly I heard some one murmur “Blessed are the dead that
|
||||
|
@ -7462,6 +7492,16 @@ Suggest:
|
|||
|
||||
|
||||
|
||||
Lint: WordChoice (54 priority)
|
||||
Message: |
|
||||
5814 | green breast of the new world. Its vanished trees, the trees that had made way
|
||||
| ^~~ Use `it's` (short for `it has` or `it is`) here, not the possessive `its`.
|
||||
Suggest:
|
||||
- Replace with: “It's”
|
||||
- Replace with: “It has”
|
||||
|
||||
|
||||
|
||||
Lint: Readability (127 priority)
|
||||
Message: |
|
||||
5814 | green breast of the new world. Its vanished trees, the trees that had made way
|
||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -7,434 +7,434 @@
|
|||
> -->
|
||||
# Unlintable Unlintable
|
||||
> Part - of - speech tagging
|
||||
# Unlintable NSg/V/J . P . NSg/V NSg/V
|
||||
# Unlintable NSg/V/J . P . NSg/V+ NSg/V
|
||||
>
|
||||
#
|
||||
> In corpus linguistics , part - of - speech tagging ( POS tagging or PoS tagging or
|
||||
# NPrSg/J/P NSg NSg . NSg/V/J . P . NSg/V NSg/V . NSg NSg/V NPrSg/C NSg NSg/V NPrSg/C
|
||||
> POST ) , also called grammatical tagging is the process of marking up a word in a
|
||||
# NPrSg/V/P . . W? V/J J NSg/V VL D NSg P NSg/V NSg/V/J/P D/P NSg P D/P
|
||||
> text ( corpus ) as corresponding to a particular part of speech , based on both its
|
||||
# NSg . NSg . NSg/R NSg/V/J P D/P NSg/J NSg/V/J P NSg/V . V/J J/P I/C ISg/D
|
||||
> definition and its context . A simplified form of this is commonly taught to
|
||||
# NSg V/C ISg/D NSg . D/P J NSg/V P I/D VL R V P
|
||||
> school - age children , in the identification of words as nouns , verbs , adjectives ,
|
||||
# NSg/V . NSg/V NPl . P D NSg P NPl/V NSg/R NPl/V . NPl/V . NPl/V .
|
||||
> In corpus linguistics , part - of - speech tagging ( POS tagging or PoS tagging or
|
||||
# NPrSg/J/P NSg+ NSg . NSg/V/J . P . NSg/V NSg/V . NSg+ NSg/V NPrSg/C NSg+ NSg/V NPrSg/C
|
||||
> POST ) , also called grammatical tagging is the process of marking up a word in a
|
||||
# NPrSg/V/P+ . . W? V/J J NSg/V VL D NSg/V P NSg/V NSg/V/J/P D/P NSg/V NPrSg/J/P D/P
|
||||
> text ( corpus ) as corresponding to a particular part of speech , based on both its
|
||||
# NSg/V . NSg+ . NSg/R NSg/V/J P D/P NSg/J NSg/V/J P NSg/V+ . V/J J/P I/C ISg/D+
|
||||
> definition and its context . A simplified form of this is commonly taught to
|
||||
# NSg V/C ISg/D+ NSg/V+ . D/P V/J NSg/V P I/D+ VL R V P
|
||||
> school - age children , in the identification of words as nouns , verbs , adjectives ,
|
||||
# NSg/V . NSg/V NPl . NPrSg/J/P D NSg P NPl/V+ NSg/R NPl/V . NPl/V+ . NPl/V .
|
||||
> adverbs , etc.
|
||||
# NPl/V . W?
|
||||
>
|
||||
#
|
||||
> Once performed by hand , POS tagging is now done in the context of computational
|
||||
# NSg/C V/J NSg/J/P NSg/V . NSg NSg/V VL NPrSg/V/J/C NSg/V/J P D NSg P J
|
||||
> linguistics , using algorithms which associate discrete terms , as well as hidden
|
||||
# NSg . V NPl I/C NSg/V/J J NPl/V . NSg/R NSg/V/J NSg/R V/J
|
||||
> parts of speech , by a set of descriptive tags . POS - tagging algorithms fall into
|
||||
# NPl/V P NSg/V . P D/P NPrSg/J P NSg/J NPl/V . NSg . NSg/V NPl NSg/V P
|
||||
> two distinctive groups : rule - based and stochastic . E. Brill's tagger , one of the
|
||||
# NSg NSg/J NPl/V . NSg/V . V/J V/C J . ? ? NSg . NSg/I/V/J P D
|
||||
> first and most widely used English POS - taggers , employs rule - based algorithms .
|
||||
# NSg/J V/C NSg/I/J R V/J NPrSg/V/J NSg . NPl . NPl/V NSg/V . V/J NPl .
|
||||
> Once performed by hand , POS tagging is now done in the context of computational
|
||||
# NSg/C V/J NSg/J/P NSg/V+ . NSg+ NSg/V VL NPrSg/V/J/C NSg/V/J NPrSg/J/P D NSg/V P J+
|
||||
> linguistics , using algorithms which associate discrete terms , as well as hidden
|
||||
# NSg+ . V NPl+ I/C+ NSg/V/J+ J NPl/V+ . NSg/R NSg/V/J NSg/R V/J
|
||||
> parts of speech , by a set of descriptive tags . POS - tagging algorithms fall into
|
||||
# NPl/V P NSg/V+ . NSg/J/P D/P NPrSg/V/J P NSg/J+ NPl/V+ . NSg+ . NSg/V NPl NSg/V P
|
||||
> two distinctive groups : rule - based and stochastic . E. Brill's tagger , one of the
|
||||
# NSg NSg/J NPl/V+ . NSg/V+ . V/J+ V/C+ J+ . ? ? NSg . NSg/I/V/J P D
|
||||
> first and most widely used English POS - taggers , employs rule - based algorithms .
|
||||
# NSg/V/J V/C NSg/I/J R V/J NPrSg/V/J+ NSg+ . NPl . NPl/V NSg/V+ . V/J NPl+ .
|
||||
>
|
||||
#
|
||||
> Principle
|
||||
# NSg/V
|
||||
>
|
||||
#
|
||||
> Part - of - speech tagging is harder than just having a list of words and their
|
||||
# NSg/V/J . P . NSg/V NSg/V VL J C/P V/J V D/P NSg P NPl/V V/C D
|
||||
> parts of speech , because some words can represent more than one part of speech
|
||||
# NPl P NSg/V . C/P I/J/R NPl/V NPrSg/VX V NPrSg/I/V/J C/P NSg/I/V/J NSg/V/J P NSg/V
|
||||
> at different times , and because some parts of speech are complex . This is not
|
||||
# NSg/P NSg/J NPl/V . V/C C/P I/J/R NPl/V P NSg/V V NSg/V/J . I/D VL NSg/C
|
||||
> Part - of - speech tagging is harder than just having a list of words and their
|
||||
# NSg/V/J . P . NSg/V NSg/V VL J C/P V/J V D/P NSg/V P NPl/V V/C D+
|
||||
> parts of speech , because some words can represent more than one part of speech
|
||||
# NPl/V P NSg/V+ . C/P I/J/R+ NPl/V+ NPrSg/VX V NPrSg/I/V/J C/P NSg/I/V/J NSg/V/J P NSg/V+
|
||||
> at different times , and because some parts of speech are complex . This is not
|
||||
# NSg/P NSg/J+ NPl/V+ . V/C C/P I/J/R NPl/V P NSg/V+ V+ NSg/V/J+ . I/D+ VL NSg/C
|
||||
> rare — in natural languages ( as opposed to many artificial languages ) , a large
|
||||
# NSg/V/J . NPrSg/J/P NSg/J NPl/V . NSg/R V/J P N/I/J/D J NPl/V . . D/P NSg/J
|
||||
> percentage of word - forms are ambiguous . For example , even " dogs " , which is
|
||||
# NSg P NSg/V . NPl/V V J . C/P NSg/V . NSg/V/J . NPl/V . . I/C VL
|
||||
> usually thought of as just a plural noun , can also be a verb :
|
||||
# R NSg/V P NSg/R V/J D/P NSg/J NSg/V . NPrSg/VX W? NSg/VX D/P NSg .
|
||||
# NSg/V/J . NPrSg/J/P NSg/J NPl/V+ . NSg/R V/J P N/I/J/D J NPl/V+ . . D/P NSg/J
|
||||
> percentage of word - forms are ambiguous . For example , even " dogs " , which is
|
||||
# NSg P NSg/V+ . NPl/V+ V+ J+ . C/P NSg/V+ . NSg/V/J . NPl/V+ . . I/C+ VL
|
||||
> usually thought of as just a plural noun , can also be a verb :
|
||||
# R NSg/V P NSg/R V/J D/P+ NSg/J+ NSg/V+ . NPrSg/VX W? NSg/VX D/P NSg/V+ .
|
||||
>
|
||||
#
|
||||
> The sailor dogs the hatch .
|
||||
# D NSg NPl/V D NSg .
|
||||
> The sailor dogs the hatch .
|
||||
# D+ NSg NPl/V D NSg/V+ .
|
||||
>
|
||||
#
|
||||
> Correct grammatical tagging will reflect that " dogs " is here used as a verb , not
|
||||
# NSg/V/J J NSg/V NPrSg/VX V N/I/C/D . NPl/V . VL NSg/J/R V/J NSg/R D/P NSg . NSg/C
|
||||
> as the more common plural noun . Grammatical context is one way to determine
|
||||
# NSg/R D NPrSg/I/J NSg/V/J NSg/J NSg/V . J NSg/V VL NSg/I/V/J NSg/J P V
|
||||
> this ; semantic analysis can also be used to infer that " sailor " and " hatch "
|
||||
# I/D . NSg/J NSg NPrSg/VX W? NSg/VX V/J P J N/I/C/D . NSg . V/C . NSg/V .
|
||||
> implicate " dogs " as 1 ) in the nautical context and 2 ) an action applied to the
|
||||
# NSg/V . NPl/V . NSg/R # . P D J NSg/V V/C # . D/P NSg/J V/J P D
|
||||
> object " hatch " ( in this context , " dogs " is a nautical term meaning " fastens ( a
|
||||
# NSg . NSg/V . . P I/D NSg/V . . NPl/V . VL D/P J NSg/V/J NSg/V/J . V . D/P
|
||||
> watertight door ) securely " ) .
|
||||
# J NSg/V . R . . .
|
||||
> Correct grammatical tagging will reflect that " dogs " is here used as a verb , not
|
||||
# NSg/V/J+ J NSg/V NPrSg/VX V N/I/C/D+ . NPl/V+ . VL NSg/J/R V/J NSg/R D/P+ NSg/V+ . NSg/C
|
||||
> as the more common plural noun . Grammatical context is one way to determine
|
||||
# NSg/R D NPrSg/I/V/J NSg/V/J NSg/J NSg/V+ . J NSg/V+ VL NSg/I/V/J NSg/J+ P V
|
||||
> this ; semantic analysis can also be used to infer that " sailor " and " hatch "
|
||||
# I/D+ . NSg/J NSg+ NPrSg/VX W? NSg/VX V/J P J N/I/C/D+ . NSg+ . V/C . NSg/V .
|
||||
> implicate " dogs " as 1 ) in the nautical context and 2 ) an action applied to the
|
||||
# NSg/V . NPl/V . NSg/R # . NPrSg/J/P D+ J+ NSg/V+ V/C # . D/P NSg/V/J+ V/J P D
|
||||
> object " hatch " ( in this context , " dogs " is a nautical term meaning " fastens ( a
|
||||
# NSg/V+ . NSg/V . . NPrSg/J/P I/D+ NSg/V+ . . NPl/V+ . VL D/P J NSg/V/J+ NSg/V/J+ . V . D/P
|
||||
> watertight door ) securely " ) .
|
||||
# J NSg/V+ . R . . .
|
||||
>
|
||||
#
|
||||
> Tag sets
|
||||
# NSg/V NPl/V
|
||||
> Tag sets
|
||||
# NSg/V+ NPl/V
|
||||
>
|
||||
#
|
||||
> Schools commonly teach that there are 9 parts of speech in English : noun , verb ,
|
||||
# NPl/V R NSg/V N/I/C/D W? V # NPl/V P NSg/V NPrSg/J/P NPrSg/V/J . NSg/V . NSg/V .
|
||||
> Schools commonly teach that there are 9 parts of speech in English : noun , verb ,
|
||||
# NPl/V+ R NSg/V N/I/C/D + V # NPl/V P NSg/V+ NPrSg/J/P NPrSg/V/J . NSg/V+ . NSg/V+ .
|
||||
> article , adjective , preposition , pronoun , adverb , conjunction , and interjection .
|
||||
# NSg/V . NSg/V/J . NSg/V . NSg/V . NSg/V . NSg/V . V/C NSg .
|
||||
# NSg/V+ . NSg/V/J+ . NSg/V . NSg/V+ . NSg/V+ . NSg/V+ . V/C NSg+ .
|
||||
> However , there are clearly many more categories and sub - categories . For nouns ,
|
||||
# C . W? V R N/I/J/D NPrSg/I/V/J NPl V/C NSg/V/P . NPl . C/P NPl/V .
|
||||
> the plural , possessive , and singular forms can be distinguished . In many
|
||||
# D NSg/J . NSg/J . V/C NSg/J NPl/V NPrSg/VX NSg/VX V/J . P N/I/J/D
|
||||
> languages words are also marked for their " case " ( role as subject , object ,
|
||||
# NPl/V NPl/V V W? V/J C/P D . NPrSg/V . . NSg NSg/R NSg/V/J . NSg/V .
|
||||
> etc. ) , grammatical gender , and so on ; while verbs are marked for tense , aspect ,
|
||||
# W? . . J NSg/V/J . V/C NSg/I/J/C J/P . NSg/V/C/P NPl/V V V/J C/P NSg/V/J . NSg/V .
|
||||
> and other things . In some tagging systems , different inflections of the same
|
||||
# V/C NSg/V/J NPl/V . NPrSg/J/P I/J/R NSg/V NPl . NSg/J NPl P D I/J
|
||||
> root word will get different parts of speech , resulting in a large number of
|
||||
# NPrSg/V NSg/V NPrSg/VX NSg/V NSg/J NPl/V P NSg/V . V P D/P NSg/J NSg/V/J P
|
||||
> tags . For example , NN for singular common nouns , NNS for plural common nouns , NP
|
||||
# NPl/V . C/P NSg/V . ? C/P NSg/J NSg/V/J NPl/V . ? C/P NSg/J NSg/V/J NPl/V . NPrSg
|
||||
> for singular proper nouns ( see the POS tags used in the Brown Corpus ) . Other
|
||||
# C/P NSg/J NSg/J NPl/V . NSg/V D NSg NPl/V V/J P D NPrSg/J NSg . . NSg/V/J
|
||||
> tagging systems use a smaller number of tags and ignore fine differences or
|
||||
# NSg/V NPl NSg/V D/P J NSg/V/J P NPl/V V/C V NSg/V/J NSg/V NPrSg/C
|
||||
> model them as features somewhat independent from part - of - speech .
|
||||
# NSg/V/J N/I NSg/R NPl/V NSg/I NSg/J P NSg/V/J . P . NSg/V .
|
||||
# C . + V R N/I/J/D NPrSg/I/V/J NPl+ V/C NSg/V/P . NPl . C/P NPl/V .
|
||||
> the plural , possessive , and singular forms can be distinguished . In many
|
||||
# D NSg/J . NSg/J . V/C NSg/J NPl/V+ NPrSg/VX+ NSg/VX+ V/J+ . NPrSg/J/P N/I/J/D+
|
||||
> languages words are also marked for their " case " ( role as subject , object ,
|
||||
# NPl/V+ NPl/V+ V W? V/J C/P D+ . NPrSg/V+ . . NSg NSg/R NSg/V/J . NSg/V+ .
|
||||
> etc. ) , grammatical gender , and so on ; while verbs are marked for tense , aspect ,
|
||||
# + . . J+ NSg/V/J+ . V/C NSg/I/J/C J/P . NSg/V/C/P NPl/V+ V V/J C/P NSg/V/J . NSg/V+ .
|
||||
> and other things . In some tagging systems , different inflections of the same
|
||||
# V/C NSg/V/J+ NPl/V+ . NPrSg/J/P I/J/R+ NSg/V NPl+ . NSg/J NPl P D+ I/J+
|
||||
> root word will get different parts of speech , resulting in a large number of
|
||||
# NPrSg/V+ NSg/V+ NPrSg/VX NSg/V NSg/J NPl/V P NSg/V+ . V NPrSg/J/P D/P NSg/J NSg/V/J P+
|
||||
> tags . For example , NN for singular common nouns , NNS for plural common nouns , NP
|
||||
# NPl/V+ . C/P NSg/V+ . ? C/P NSg/J NSg/V/J+ NPl/V . ? C/P NSg/J NSg/V/J+ NPl/V . NPrSg
|
||||
> for singular proper nouns ( see the POS tags used in the Brown Corpus ) . Other
|
||||
# C/P NSg/J NSg/J NPl/V . NSg/V D+ NSg+ NPl/V+ V/J NPrSg/J/P D+ NPrSg/V/J+ NSg+ . . NSg/V/J
|
||||
> tagging systems use a smaller number of tags and ignore fine differences or
|
||||
# NSg/V NPl+ NSg/V D/P J NSg/V/J P NPl/V+ V/C V NSg/V/J NSg/V NPrSg/C
|
||||
> model them as features somewhat independent from part - of - speech .
|
||||
# NSg/V/J+ N/I+ NSg/R+ NPl/V+ NSg/I NSg/J P NSg/V/J . P . NSg/V+ .
|
||||
>
|
||||
#
|
||||
> In part - of - speech tagging by computer , it is typical to distinguish from 50 to
|
||||
# NPrSg/J/P NSg/V/J . P . NSg/V NSg/V NSg/J/P NSg/V . NPrSg/ISg VL NSg/J P V P # P
|
||||
> 150 separate parts of speech for English . Work on stochastic methods for tagging
|
||||
# # NSg/V/J NPl/V P NSg/V C/P NPrSg/V/J . NSg/V J/P J NPl/V C/P NSg/V
|
||||
> In part - of - speech tagging by computer , it is typical to distinguish from 50 to
|
||||
# NPrSg/J/P NSg/V/J . P . NSg/V NSg/V NSg/J/P NSg/V+ . NPrSg/ISg+ VL NSg/J P V P # P
|
||||
> 150 separate parts of speech for English . Work on stochastic methods for tagging
|
||||
# # NSg/V/J NPl/V P NSg/V C/P NPrSg/V/J+ . NSg/V J/P J NPl/V C/P NSg/V
|
||||
> Koine Greek ( DeRose 1990 ) has used over 1 , 000 parts of speech and found that
|
||||
# ? NPrSg/V/J . ? # . V V/J NSg/V/J/P # . # NPl/V P NSg/V V/C NSg/V N/I/C/D
|
||||
> about as many words were ambiguous in that language as in English . A
|
||||
# J/P NSg/R N/I/J/D NPl/V NSg/V J P N/I/C/D NSg/V NSg/R NPrSg/J/P NPrSg/V/J . D/P
|
||||
> morphosyntactic descriptor in the case of morphologically rich languages is
|
||||
# ? NSg P D NPrSg P ? NPrSg/V/J NPl/V VL
|
||||
> commonly expressed using very short mnemonics , such as Ncmsan for Category = Noun ,
|
||||
# R V/J V J NPrSg/V/J/P NPl . NSg/I NSg/R ? C/P NSg . NSg/V .
|
||||
# ? NPrSg/V/J . ? # . V V/J NSg/V/J/P # . # NPl/V P NSg/V+ V/C NSg/V N/I/C/D
|
||||
> about as many words were ambiguous in that language as in English . A
|
||||
# J/P NSg/R N/I/J/D+ NPl/V+ NSg/V J NPrSg/J/P N/I/C/D+ NSg/V+ NSg/R NPrSg/J/P NPrSg/V/J+ . D/P
|
||||
> morphosyntactic descriptor in the case of morphologically rich languages is
|
||||
# ? NSg NPrSg/J/P D NPrSg/V P ? NPrSg/V/J NPl/V+ VL
|
||||
> commonly expressed using very short mnemonics , such as Ncmsan for Category = Noun ,
|
||||
# R V/J V J NPrSg/V/J/P+ NPl . NSg/I NSg/R ? C/P NSg . NSg/V+ .
|
||||
> Type = common , Gender = masculine , Number = singular , Case = accusative , Animate
|
||||
# NSg/V . NSg/V/J . NSg/V/J . NSg/J . NSg/V/J . NSg/J . NPrSg/V . NSg/J . V/J
|
||||
> = no .
|
||||
# . NPrSg/P .
|
||||
>
|
||||
#
|
||||
> The most popular " tag set " for POS tagging for American English is probably the
|
||||
# D NSg/I/J NSg/J . NSg/V NPrSg/V/J . C/P NSg NSg/V C/P NPrSg/J NPrSg/V/J VL R D
|
||||
> Penn tag set , developed in the Penn Treebank project . It is largely similar to
|
||||
# NPr NSg/V NPrSg/V/J . V/J P D NPr ? NSg/V . NPrSg/ISg VL R NSg/J P
|
||||
> the earlier Brown Corpus and LOB Corpus tag sets , though much smaller . In
|
||||
# D J NPrSg/V/J NSg V/C NSg/V NSg NSg/V NPl/V . V/C N/I/J J . NPrSg/J/P
|
||||
> Europe , tag sets from the Eagles Guidelines see wide use and include versions
|
||||
# NPr . NSg/V NPl/V P D NPl NPl NSg/V NSg/J NSg/V V/C NSg/V NPl/V
|
||||
> The most popular " tag set " for POS tagging for American English is probably the
|
||||
# D NSg/I/J NSg/J . NSg/V+ NPrSg/V/J . C/P NSg+ NSg/V C/P NPrSg/J NPrSg/V/J+ VL R D+
|
||||
> Penn tag set , developed in the Penn Treebank project . It is largely similar to
|
||||
# NPr+ NSg/V+ NPrSg/V/J . V/J NPrSg/J/P D+ NPr+ ? NSg/V+ . NPrSg/ISg+ VL R NSg/J P
|
||||
> the earlier Brown Corpus and LOB Corpus tag sets , though much smaller . In
|
||||
# D J NPrSg/V/J NSg V/C NSg/V NSg+ NSg/V+ NPl/V . V/C N/I/J+ J+ . NPrSg/J/P
|
||||
> Europe , tag sets from the Eagles Guidelines see wide use and include versions
|
||||
# NPr+ . NSg/V+ NPl/V P D+ NPl/V+ NPl+ NSg/V NSg/J NSg/V+ V/C NSg/V NPl/V
|
||||
> for multiple languages .
|
||||
# C/P NSg/J NPl/V .
|
||||
# C/P NSg/J+ NPl/V+ .
|
||||
>
|
||||
#
|
||||
> POS tagging work has been done in a variety of languages , and the set of POS
|
||||
# NSg NSg/V NSg/V V NSg/V NSg/V/J P D/P NSg P NPl/V . V/C D NPrSg/J P NSg
|
||||
> tags used varies greatly with language . Tags usually are designed to include
|
||||
# NPl/V V/J NPl/V R P NSg/V . NPl/V R V V/J P NSg/V
|
||||
> POS tagging work has been done in a variety of languages , and the set of POS
|
||||
# NSg+ NSg/V NSg/V+ V NSg/V NSg/V/J NPrSg/J/P D/P NSg P NPl/V+ . V/C D NPrSg/V/J P NSg+
|
||||
> tags used varies greatly with language . Tags usually are designed to include
|
||||
# NPl/V+ V/J NPl/V R P NSg/V+ . NPl/V+ R V V/J P NSg/V
|
||||
> overt morphological distinctions , although this leads to inconsistencies such as
|
||||
# NSg/J J NPl . C I/D NPl/V P NPl NSg/I NSg/R
|
||||
> case - marking for pronouns but not nouns in English , and much larger
|
||||
# NPrSg/V . NSg/V C/P NPl/V NSg/C/P NSg/C NPl/V NPrSg/J/P NPrSg/V/J . V/C N/I/J J
|
||||
> cross - language differences . The tag sets for heavily inflected languages such as
|
||||
# NPrSg/V/J/P . NSg/V NSg/V . D NSg NPl/V C/P R V/J NPl/V NSg/I NSg/R
|
||||
> Greek and Latin can be very large ; tagging words in agglutinative languages such
|
||||
# NPrSg/V/J V/C NPrSg/J NPrSg/VX NSg/VX J NSg/J . NSg/V NPl/V NPrSg/J/P ? NPl/V NSg/I
|
||||
> as Inuit languages may be virtually impossible . At the other extreme , Petrov et
|
||||
# NSg/R NPrSg/J NPl/V NPrSg/VX NSg/VX R NSg/J . P D NSg/J NSg/J . ? ?
|
||||
> al. have proposed a " universal " tag set , with 12 categories ( for example , no
|
||||
# ? NSg/VX V/J D/P . NSg/J . NSg/V NPrSg/V/J . P # NPl . C/P NSg/V . NPrSg/P
|
||||
> subtypes of nouns , verbs , punctuation , and so on ) . Whether a very small set of
|
||||
# NPl P NPl/V . NPl/V . NSg . V/C NSg/I/J/C J/P . . I/C D/P J NPrSg/V/J NPrSg/V/J P
|
||||
> very broad tags or a much larger set of more precise ones is preferable , depends
|
||||
# J NSg/J NPl/V NPrSg/C D/P N/I/J J NPrSg/V/J P NPrSg/I/V/J V/J NPl/V VL W? . NPl/V
|
||||
> on the purpose at hand . Automatic tagging is easier on smaller tag - sets .
|
||||
# P D NSg NSg/P NSg/V . NSg/J NSg/V VL J J/P J NSg/V . NPl/V .
|
||||
# NSg/J J+ NPl+ . C I/D+ NPl/V P NPl NSg/I NSg/R
|
||||
> case - marking for pronouns but not nouns in English , and much larger
|
||||
# NPrSg/V+ . NSg/V C/P NPl/V NSg/C/P NSg/C NPl/V NPrSg/J/P NPrSg/V/J+ . V/C N/I/J J
|
||||
> cross - language differences . The tag sets for heavily inflected languages such as
|
||||
# NPrSg/V/J/P+ . NSg/V+ NSg/V . D+ NSg/V+ NPl/V C/P R V/J NPl/V+ NSg/I NSg/R
|
||||
> Greek and Latin can be very large ; tagging words in agglutinative languages such
|
||||
# NPrSg/V/J V/C NPrSg/J NPrSg/VX NSg/VX J NSg/J . NSg/V NPl/V+ NPrSg/J/P ? NPl/V+ NSg/I
|
||||
> as Inuit languages may be virtually impossible . At the other extreme , Petrov et
|
||||
# NSg/R NPrSg/J NPl/V+ NPrSg/VX NSg/VX R+ NSg/J+ . NSg/P D+ NSg/V/J+ NSg/J . ? ?
|
||||
> al. have proposed a " universal " tag set , with 12 categories ( for example , no
|
||||
# ? NSg/VX V/J D/P . NSg/J . NSg/V+ NPrSg/V/J . P # NPl . C/P NSg/V+ . NPrSg/P
|
||||
> subtypes of nouns , verbs , punctuation , and so on ) . Whether a very small set of
|
||||
# NPl P NPl/V . NPl/V+ . NSg+ . V/C NSg/I/J/C J/P+ . . I/C D/P J NPrSg/V/J NPrSg/V/J P
|
||||
> very broad tags or a much larger set of more precise ones is preferable , depends
|
||||
# J NSg/J NPl/V NPrSg/C D/P N/I/J J NPrSg/V/J P NPrSg/I/V/J V/J NPl/V+ VL W? . NPl/V
|
||||
> on the purpose at hand . Automatic tagging is easier on smaller tag - sets .
|
||||
# J/P D+ NSg/V NSg/P NSg/V+ . NSg/J NSg/V VL J J/P J NSg/V+ . NPl/V+ .
|
||||
>
|
||||
#
|
||||
> History
|
||||
# NSg
|
||||
>
|
||||
#
|
||||
> The Brown Corpus
|
||||
# D NPrSg/J NSg
|
||||
> The Brown Corpus
|
||||
# D NPrSg/V/J+ NSg
|
||||
>
|
||||
#
|
||||
> Research on part - of - speech tagging has been closely tied to corpus linguistics .
|
||||
# NSg/V J/P NSg/V/J . P . NSg/V NSg/V V NSg/V R V/J P NSg NSg .
|
||||
> The first major corpus of English for computer analysis was the Brown Corpus
|
||||
# D NSg/J NPrSg/V/J NSg P NPrSg/V/J C/P NSg/V NSg V D NPrSg/J NSg
|
||||
> developed at Brown University by Henry Kučera and W. Nelson Francis , in the
|
||||
# V/J NSg/P NPrSg/V/J NSg NSg/J/P NPrSg ? V/C ? NPrSg NPr . P D
|
||||
> mid - 1960s . It consists of about 1 , 000 , 000 words of running English prose text ,
|
||||
# NSg/J/P . #d . NPrSg/ISg NPl/V P J/P # . # . # NPl/V P NSg/V/J/P NPrSg/V/J NSg/V NSg/V .
|
||||
# NSg/V J/P NSg/V/J . P . NSg/V NSg/V V NSg/V R V/J P NSg NSg+ .
|
||||
> The first major corpus of English for computer analysis was the Brown Corpus
|
||||
# D NSg/V/J NPrSg/V/J NSg P NPrSg/V/J+ C/P NSg/V+ NSg+ V D NPrSg/V/J NSg
|
||||
> developed at Brown University by Henry Kučera and W. Nelson Francis , in the
|
||||
# V/J NSg/P NPrSg/V/J NSg NSg/J/P NPrSg+ ? V/C ? NPrSg+ NPr+ . NPrSg/J/P D
|
||||
> mid - 1960s . It consists of about 1 , 000 , 000 words of running English prose text ,
|
||||
# NSg/J/P+ . #d . NPrSg/ISg+ NPl/V P J/P # . # . # NPl/V P NSg/V/J/P NPrSg/V/J+ NSg/V NSg/V+ .
|
||||
> made up of 500 samples from randomly chosen publications . Each sample is 2 , 000
|
||||
# NSg/V NSg/V/J/P P # NPl/V P R V/J NPl . D NSg/V VL # . #
|
||||
> or more words ( ending at the first sentence - end after 2 , 000 words , so that the
|
||||
# NPrSg/C NPrSg/I/V/J NPl/V . NSg/V P D NSg/J NSg/V . NSg/V J/P # . # NPl/V . NSg/I/J/C N/I/C/D D
|
||||
# NSg/V NSg/V/J/P P # NPl/V+ P R+ V/J NPl+ . D+ NSg/V+ VL # . #
|
||||
> or more words ( ending at the first sentence - end after 2 , 000 words , so that the
|
||||
# NPrSg/C NPrSg/I/V/J NPl/V+ . NSg/V NSg/P D NSg/V/J+ NSg/V+ . NSg/V J/P # . # NPl/V+ . NSg/I/J/C N/I/C/D D+
|
||||
> corpus contains only complete sentences ) .
|
||||
# NSg V W? NSg/V/J NPl/V . .
|
||||
# NSg+ V W? NSg/V/J+ NPl/V+ . .
|
||||
>
|
||||
#
|
||||
> The Brown Corpus was painstakingly " tagged " with part - of - speech markers over
|
||||
# D NPrSg/J NSg V R . V/J . P NSg/V/J . P . NSg/V NPl/V NSg/V/J/P
|
||||
> many years . A first approximation was done with a program by Greene and Rubin ,
|
||||
# N/I/J/D NPl . D/P NSg/J NSg V NSg/V/J P D/P NPrSg NSg/J/P NPr V/C NPr .
|
||||
> which consisted of a huge handmade list of what categories could co - occur at
|
||||
# I/C V/J P D/P J NSg/J NSg/V P NSg/I NPl NSg/VX NPrSg/I/V . V NSg/P
|
||||
> all . For example , article then noun can occur , but article then verb ( arguably )
|
||||
# NSg/I/J/C . C/P NSg/V . NSg/V NSg/J/C NSg/V NPrSg/VX V . NSg/C/P NSg/V NSg/J/C NSg/V . R .
|
||||
> cannot . The program got about 70 % correct . Its results were repeatedly reviewed
|
||||
# NSg/V . D NPrSg V J/P # . NSg/V/J . ISg/D NPl NSg/V R V/J
|
||||
> and corrected by hand , and later users sent in errata so that by the late 70 s
|
||||
# V/C V/J NSg/J/P NSg/V . V/C J NPl NSg/V NPrSg/J/P NSg NSg/I/J/C N/I/C/D P D NSg/J # ?
|
||||
> the tagging was nearly perfect ( allowing for some cases on which even human
|
||||
# D NSg V R NSg/V/J . V C/P I/J/R NPl/V J/P I/C NSg/V/J NSg/V/J
|
||||
> The Brown Corpus was painstakingly " tagged " with part - of - speech markers over
|
||||
# D+ NPrSg/V/J NSg V R . V/J . P NSg/V/J . P . NSg/V NPl/V NSg/V/J/P
|
||||
> many years . A first approximation was done with a program by Greene and Rubin ,
|
||||
# N/I/J/D+ NPl+ . D/P+ NSg/V/J+ NSg+ V NSg/V/J P D/P NPrSg/V NSg/J/P NPr V/C NPr .
|
||||
> which consisted of a huge handmade list of what categories could co - occur at
|
||||
# I/C+ V/J P D/P J NSg/J NSg/V P NSg/I+ NPl+ NSg/VX NPrSg/I/V+ . V NSg/P+
|
||||
> all . For example , article then noun can occur , but article then verb ( arguably )
|
||||
# NSg/I/J/C . C/P NSg/V+ . NSg/V+ NSg/J/C NSg/V+ NPrSg/VX V . NSg/C/P NSg/V+ NSg/J/C NSg/V+ . R .
|
||||
> cannot . The program got about 70 % correct . Its results were repeatedly reviewed
|
||||
# NSg/V . D+ NPrSg/V+ V J/P # . NSg/V/J+ . ISg/D+ NPl/V+ NSg/V R V/J
|
||||
> and corrected by hand , and later users sent in errata so that by the late 70 s
|
||||
# V/C V/J NSg/J/P NSg/V+ . V/C J NPl+ NSg/V NPrSg/J/P NSg NSg/I/J/C N/I/C/D+ NSg/J/P D NSg/J # ?
|
||||
> the tagging was nearly perfect ( allowing for some cases on which even human
|
||||
# D NSg/V V R NSg/V/J . V C/P I/J/R NPl/V+ J/P I/C+ NSg/V/J NSg/V/J
|
||||
> speakers might not agree ) .
|
||||
# W? NSg/VX/J NSg/C V . .
|
||||
# + NSg/VX/J NSg/C V . .
|
||||
>
|
||||
#
|
||||
> This corpus has been used for innumerable studies of word - frequency and of
|
||||
# I/D NSg V NSg/V V/J C/P J NPl/V P NSg/V . NSg V/C P
|
||||
> part - of - speech and inspired the development of similar " tagged " corpora in many
|
||||
# NSg/V/J . P . NSg/V V/C V/J D NSg P NSg/J . V/J . NPl P N/I/J/D
|
||||
> other languages . Statistics derived by analyzing it formed the basis for most
|
||||
# NSg/V/J NPl/V . NPl/V V/J NSg/J/P V NPrSg/ISg V/J D NSg C/P NSg/I/J
|
||||
> later part - of - speech tagging systems , such as CLAWS and VOLSUNGA . However , by
|
||||
# J NSg/V/J . P . NSg/V NSg/V NPl . NSg/I NSg/R NPl/V V/C ? . C . P
|
||||
> this time ( 2005 ) it has been superseded by larger corpora such as the 100
|
||||
# I/D NSg/V/J . # . NPrSg/ISg V NSg/V V/J NSg/J/P J NPl NSg/I NSg/R D #
|
||||
> million word British National Corpus , even though larger corpora are rarely so
|
||||
# N NSg/V NPrSg/J NSg/J NSg . NSg/V/J V/C J NPl V R NSg/I/J/C
|
||||
> This corpus has been used for innumerable studies of word - frequency and of
|
||||
# I/D+ NSg V NSg/V V/J C/P J NPl/V P NSg/V+ . NSg V/C P
|
||||
> part - of - speech and inspired the development of similar " tagged " corpora in many
|
||||
# NSg/V/J . P . NSg/V V/C V/J D NSg P NSg/J . V/J . NPl NPrSg/J/P N/I/J/D+
|
||||
> other languages . Statistics derived by analyzing it formed the basis for most
|
||||
# NSg/V/J+ NPl/V+ . NPl/V+ V/J NSg/J/P V NPrSg/ISg+ V/J D NSg C/P NSg/I/J
|
||||
> later part - of - speech tagging systems , such as CLAWS and VOLSUNGA . However , by
|
||||
# J NSg/V/J . P . NSg/V NSg/V NPl . NSg/I NSg/R NPl/V+ V/C ? . C . NSg/J/P
|
||||
> this time ( 2005 ) it has been superseded by larger corpora such as the 100
|
||||
# I/D+ NSg/V/J+ . # . NPrSg/ISg+ V NSg/V V/J NSg/J/P J NPl+ NSg/I NSg/R D #
|
||||
> million word British National Corpus , even though larger corpora are rarely so
|
||||
# N NSg/V+ NPrSg/J NSg/J+ NSg+ . NSg/V/J V/C J+ NPl+ V R NSg/I/J/C
|
||||
> thoroughly curated .
|
||||
# R V/J .
|
||||
# R+ V/J+ .
|
||||
>
|
||||
#
|
||||
> For some time , part - of - speech tagging was considered an inseparable part of
|
||||
# C/P I/J/R NSg/V/J . NSg/V/J . P . NSg/V NSg/V V V/J D/P NSg/J NSg/V/J P
|
||||
> natural language processing , because there are certain cases where the correct
|
||||
# NSg/J NSg/V V . C/P W? V I/J NPl/V NSg/C D NSg/J
|
||||
> natural language processing , because there are certain cases where the correct
|
||||
# NSg/J+ NSg/V+ V+ . C/P + V I/J NPl/V+ NSg/C D NSg/V/J
|
||||
> part of speech cannot be decided without understanding the semantics or even the
|
||||
# NSg/V/J P NSg/V NSg/V NSg/VX NSg/V/J C/P NSg/V/J D NSg NPrSg/C NSg/V/J D
|
||||
# NSg/V/J P NSg/V+ NSg/V NSg/VX NSg/V/J C/P NSg/V/J+ D+ NSg NPrSg/C NSg/V/J D
|
||||
> pragmatics of the context . This is extremely expensive , especially because
|
||||
# NPl P D NSg . I/D VL R J . R C/P
|
||||
# NPl P D+ NSg/V+ . I/D+ VL R J . R C/P
|
||||
> analyzing the higher levels is much harder when multiple part - of - speech
|
||||
# V D J NPl/V VL N/I/J J NSg/I/C NSg/J NSg/V/J . P . NSg/V
|
||||
> possibilities must be considered for each word .
|
||||
# NPl NSg/V NSg/VX V/J C/P D NSg/V .
|
||||
# V D+ J+ NPl/V+ VL N/I/J J NSg/I/C NSg/J NSg/V/J . P . NSg/V
|
||||
> possibilities must be considered for each word .
|
||||
# NPl NSg/V NSg/VX V/J C/P D+ NSg/V+ .
|
||||
>
|
||||
#
|
||||
> Use of hidden Markov models
|
||||
# NSg/V P V/J NPr NPl/V
|
||||
# NSg/V P V/J NPr+ NPl/V
|
||||
>
|
||||
#
|
||||
> In the mid - 1980s , researchers in Europe began to use hidden Markov models ( HMMs )
|
||||
# P D NSg/J/P . #d . W? NPrSg/J/P NPr V P NSg/V V/J NPr NPl/V . ? .
|
||||
> In the mid - 1980s , researchers in Europe began to use hidden Markov models ( HMMs )
|
||||
# NPrSg/J/P D NSg/J/P . #d . W? NPrSg/J/P NPr+ V P NSg/V V/J NPr NPl/V+ . ? .
|
||||
> to disambiguate parts of speech , when working to tag the Lancaster - Oslo - Bergen
|
||||
# P V NPl/V P NSg/V . NSg/I/C V P NSg/V D NPr . NPr . NPr
|
||||
> Corpus of British English . HMMs involve counting cases ( such as from the Brown
|
||||
# NSg P NPrSg/J NPrSg/V/J . ? V V NPl/V . NSg/I NSg/R P D NPrSg/J
|
||||
# P V NPl/V P NSg/V+ . NSg/I/C V P NSg/V D NPr . NPr+ . NPr
|
||||
> Corpus of British English . HMMs involve counting cases ( such as from the Brown
|
||||
# NSg P NPrSg/J+ NPrSg/V/J+ . ? V V NPl/V . NSg/I NSg/R P D+ NPrSg/V/J+
|
||||
> Corpus ) and making a table of the probabilities of certain sequences . For
|
||||
# NSg . V/C NSg/V D/P NSg P D NPl P I/J NPl/V . C/P
|
||||
> example , once you've seen an article such as ' the ' , perhaps the next word is a
|
||||
# NSg/V . NSg/C W? NSg/V D/P NSg NSg/I NSg/R . D . . NSg D NSg/J/P NSg/V VL D/P
|
||||
> noun 40 % of the time , an adjective 40 % , and a number 20 % . Knowing this , a
|
||||
# NSg # . P D NSg/J . D/P NSg/J # . . V/C D/P NSg/J # . . NSg/V/J/P I/D . D/P
|
||||
> program can decide that " can " in " the can " is far more likely to be a noun than
|
||||
# NPrSg NPrSg/VX V N/I/C/D . NPrSg/VX . NPrSg/J/P . D NPrSg . VL NSg/V/J NPrSg/I/V/J NSg/J P NSg/VX D/P NSg C/P
|
||||
> a verb or a modal . The same method can , of course , be used to benefit from
|
||||
# D/P NSg NPrSg/C D/P NSg/J . D I/J NSg/V NPrSg/VX . P NSg/V . NSg/VX V/J P NSg/V P
|
||||
> knowledge about the following words .
|
||||
# NSg/V J/P D NSg/J/P NPl/V .
|
||||
# NSg+ . V/C NSg/V D/P NSg/V P D NPl P I/J+ NPl/V+ . C/P
|
||||
> example , once you've seen an article such as ' the ' , perhaps the next word is a
|
||||
# NSg/V+ . NSg/C W? NSg/V D/P NSg/V+ NSg/I NSg/R . D . . NSg D+ NSg/J/P+ NSg/V+ VL D/P
|
||||
> noun 40 % of the time , an adjective 40 % , and a number 20 % . Knowing this , a
|
||||
# NSg/V # . P D+ NSg/V/J+ . D/P+ NSg/V/J+ # . . V/C D/P+ NSg/V/J+ # . . NSg/V/J/P I/D+ . D/P+
|
||||
> program can decide that " can " in " the can " is far more likely to be a noun than
|
||||
# NPrSg/V+ NPrSg/VX V N/I/C/D+ . NPrSg/VX . NPrSg/J/P . D+ NPrSg/VX . VL NSg/V/J NPrSg/I/V/J NSg/J P NSg/VX D/P NSg/V C/P
|
||||
> a verb or a modal . The same method can , of course , be used to benefit from
|
||||
# D/P NSg/V NPrSg/C D/P+ NSg/J+ . D+ I/J+ NSg/V+ NPrSg/VX . P NSg/V+ . NSg/VX V/J P NSg/V P
|
||||
> knowledge about the following words .
|
||||
# NSg/V+ J/P D+ NSg/V/J/P+ NPl/V .
|
||||
>
|
||||
#
|
||||
> More advanced ( " higher - order " ) HMMs learn the probabilities not only of pairs
|
||||
# NPrSg/I/V/J V/J . . J . NSg/V . . ? NSg/V D NPl NSg/C W? P NPl/V
|
||||
# NPrSg/I/V/J V/J . . J . NSg/V . . ? NSg/V D+ NPl+ NSg/C W? P NPl/V+
|
||||
> but triples or even larger sequences . So , for example , if you've just seen a
|
||||
# NSg/C/P NPl/V NPrSg/C NSg/V/J J NPl/V . NSg/I/J/C . C/P NSg/V . NSg/C W? V/J NSg/V D/P
|
||||
> noun followed by a verb , the next item may be very likely a preposition ,
|
||||
# NSg V/J P D/P NSg . D NSg/J/P NSg/V NPrSg/VX NSg/VX J NSg/J D/P NSg .
|
||||
> article , or noun , but much less likely another verb .
|
||||
# NSg/V . NPrSg/C NSg/V . NSg/C/P N/I/J V/J/C/P NSg/J I/D NSg/V .
|
||||
# NSg/C/P NPl/V NPrSg/C NSg/V/J J NPl/V+ . NSg/I/J/C . C/P NSg/V+ . NSg/C W? V/J NSg/V D/P
|
||||
> noun followed by a verb , the next item may be very likely a preposition ,
|
||||
# NSg/V V/J NSg/J/P D/P+ NSg/V+ . D+ NSg/J/P+ NSg/V+ NPrSg/VX NSg/VX J NSg/J D/P NSg/V .
|
||||
> article , or noun , but much less likely another verb .
|
||||
# NSg/V+ . NPrSg/C NSg/V+ . NSg/C/P N/I/J V/J/C/P NSg/J+ I/D NSg/V .
|
||||
>
|
||||
#
|
||||
> When several ambiguous words occur together , the possibilities multiply .
|
||||
# NSg/I/C J/D J NPl/V V J . D NPl NSg/V .
|
||||
> However , it is easy to enumerate every combination and to assign a relative
|
||||
# C . NPrSg/ISg VL NSg/V/J P V D NSg V/C P NSg/V D/P NSg/J
|
||||
> probability to each one , by multiplying together the probabilities of each
|
||||
# NSg P D NSg/I/V/J . NSg/J/P V J D NPl P D
|
||||
> choice in turn . The combination with the highest probability is then chosen . The
|
||||
# NSg/J NPrSg/J/P NSg/V . D NSg P D W? NSg VL NSg/J/C V/J . D
|
||||
> European group developed CLAWS , a tagging program that did exactly this and
|
||||
# NSg/J NSg/V V/J NPl/V . D/P NSg NPrSg/V N/I/C/D V R I/D V/C
|
||||
> achieved accuracy in the 93 – 95 % range .
|
||||
# V/J NSg P D # . # . NSg/V .
|
||||
> When several ambiguous words occur together , the possibilities multiply .
|
||||
# NSg/I/C J/D J NPl/V+ V J . D+ NPl NSg/V+ .
|
||||
> However , it is easy to enumerate every combination and to assign a relative
|
||||
# C . NPrSg/ISg+ VL NSg/V/J P V D+ NSg+ V/C P NSg/V D/P NSg/J
|
||||
> probability to each one , by multiplying together the probabilities of each
|
||||
# NSg P D+ NSg/I/V/J+ . NSg/J/P V J D NPl P D+
|
||||
> choice in turn . The combination with the highest probability is then chosen . The
|
||||
# NSg/J+ NPrSg/J/P NSg/V . D NSg P D+ + NSg+ VL NSg/J/C+ V/J . D+
|
||||
> European group developed CLAWS , a tagging program that did exactly this and
|
||||
# NSg/J+ NSg/V+ V/J NPl/V+ . D/P NSg/V+ NPrSg/V+ N/I/C/D+ V R I/D+ V/C
|
||||
> achieved accuracy in the 93 – 95 % range .
|
||||
# V/J NSg+ NPrSg/J/P D # . # . NSg/V+ .
|
||||
>
|
||||
#
|
||||
> Eugene Charniak points out in Statistical techniques for natural language
|
||||
# NPr ? NPl/V NSg/V/J/R/P NPrSg/J/P J NPl C/P NSg/J NSg/V
|
||||
> parsing ( 1997 ) that merely assigning the most common tag to each known word and
|
||||
# V . # . N/I/C/D R V D NSg/I/J NSg/V/J NSg/V P D NSg/V/J NSg/V V/C
|
||||
> the tag " proper noun " to all unknowns will approach 90 % accuracy because many
|
||||
# D NSg . NSg/J NSg/V . P NSg/I/J/C NPl/V NPrSg/VX NSg/V # . NSg C/P N/I/J/D
|
||||
> words are unambiguous , and many others only rarely represent their less - common
|
||||
# NPl/V V J . V/C N/I/J/D NPl/V W? R V D J/C/P . NSg/V/J
|
||||
# NPr+ ? NPl/V+ NSg/V/J/R/P NPrSg/J/P J NPl C/P NSg/J NSg/V+
|
||||
> parsing ( 1997 ) that merely assigning the most common tag to each known word and
|
||||
# V . # . N/I/C/D+ R V D NSg/I/J NSg/V/J NSg/V P D+ NSg/V/J NSg/V V/C
|
||||
> the tag " proper noun " to all unknowns will approach 90 % accuracy because many
|
||||
# D NSg/V+ . NSg/J NSg/V . P NSg/I/J/C+ NPl/V+ NPrSg/VX NSg/V # . NSg+ C/P N/I/J/D+
|
||||
> words are unambiguous , and many others only rarely represent their less - common
|
||||
# NPl/V+ V J . V/C N/I/J/D+ NPl/V+ W? R V D+ V/J/C/P . NSg/V/J
|
||||
> parts of speech .
|
||||
# NPl/V P NSg/V .
|
||||
# NPl/V P NSg/V+ .
|
||||
>
|
||||
#
|
||||
> CLAWS pioneered the field of HMM - based part of speech tagging but was quite
|
||||
# NPl/V V/J D NSg P V . V/J NSg/V/J P NSg/V NSg/V NSg/C/P V NSg
|
||||
> expensive since it enumerated all possibilities . It sometimes had to resort to
|
||||
# J C/P NPrSg/ISg V/J NSg/I/J/C NPl . NPrSg/ISg R V P NSg/V P
|
||||
> backup methods when there were simply too many options ( the Brown Corpus
|
||||
# NSg/J NPl/V NSg/I/C W? NSg/V R W? N/I/J/D NPl/V . D NPrSg/J NSg
|
||||
> contains a case with 17 ambiguous words in a row , and there are words such as
|
||||
# V D/P NPrSg P # J NPl/V P D/P NSg . V/C W? V NPl/V NSg/I NSg/R
|
||||
> " still " that can represent as many as 7 distinct parts of speech .
|
||||
# . NSg/V/J . N/I/C/D NPrSg/VX V NSg/R N/I/J/D NSg/R # V/J NPl/V P NSg/V .
|
||||
> CLAWS pioneered the field of HMM - based part of speech tagging but was quite
|
||||
# NPl/V+ V/J D NSg/V P V . V/J NSg/V/J P NSg/V+ NSg/V NSg/C/P V NSg
|
||||
> expensive since it enumerated all possibilities . It sometimes had to resort to
|
||||
# J C/P NPrSg/ISg+ V/J NSg/I/J/C+ NPl+ . NPrSg/ISg+ R V P NSg/V P
|
||||
> backup methods when there were simply too many options ( the Brown Corpus
|
||||
# NSg/J NPl/V+ NSg/I/C + NSg/V R W? N/I/J/D+ NPl/V . D+ NPrSg/V/J+ NSg+
|
||||
> contains a case with 17 ambiguous words in a row , and there are words such as
|
||||
# V D/P NPrSg/V P # J NPl/V NPrSg/J/P D/P+ NSg/V+ . V/C + V NPl/V+ NSg/I NSg/R
|
||||
> " still " that can represent as many as 7 distinct parts of speech .
|
||||
# . NSg/V/J . N/I/C/D+ NPrSg/VX V NSg/R N/I/J/D NSg/R # V/J NPl/V P NSg/V+ .
|
||||
>
|
||||
#
|
||||
> HMMs underlie the functioning of stochastic taggers and are used in various
|
||||
# ? V D N/J P J NPl V/C V V/J NPrSg/J/P J
|
||||
# ? V D V P J NPl V/C V V/J NPrSg/J/P J
|
||||
> algorithms one of the most widely used being the bi - directional inference
|
||||
# NPl NSg/I/V/J P D NSg/I/J R V/J NSg/V/C D NSg/J . NSg/J NSg
|
||||
# NPl+ NSg/I/V/J P D NSg/I/J R V/J NSg/V/C D NSg/J . NSg/J NSg+
|
||||
> algorithm .
|
||||
# NSg .
|
||||
# NSg+ .
|
||||
>
|
||||
#
|
||||
> Dynamic programming methods
|
||||
# NSg/J NSg/V NPl/V
|
||||
# NSg/J+ NSg/V+ NPl/V
|
||||
>
|
||||
#
|
||||
> In 1987 , Steven DeRose and Kenneth W. Church independently developed dynamic
|
||||
# P # . NPr ? V/C NPr ? NPrSg/V R V/J NSg/J
|
||||
> programming algorithms to solve the same problem in vastly less time . Their
|
||||
# NSg/V NPl P NSg/V D I/J NSg/J NPrSg/J/P R V/J/C/P NSg/V/J . D
|
||||
> methods were similar to the Viterbi algorithm known for some time in other
|
||||
# NPl NSg/V NSg/J P D ? NSg NSg/V/J C/P I/J/R NSg/V/J NPrSg/J/P NSg/V/J
|
||||
> fields . DeRose used a table of pairs , while Church used a table of triples and a
|
||||
# NPrPl/V . ? V/J D/P NSg P NPl/V . NSg/V/C/P NPrSg/V V/J D/P NSg P NPl/V V/C D/P
|
||||
> method of estimating the values for triples that were rare or nonexistent in the
|
||||
# NSg P V D NPl C/P NPl/V N/I/C/D NSg/V NSg/V/J NPrSg/C NSg/J P D
|
||||
> Brown Corpus ( an actual measurement of triple probabilities would require a much
|
||||
# NPrSg/J NSg . D/P NSg/J NSg P NSg/V/J NPl NSg/VX NSg/V D/P N/I/J
|
||||
> In 1987 , Steven DeRose and Kenneth W. Church independently developed dynamic
|
||||
# NPrSg/J/P # . NPr+ ? V/C NPr+ ? NPrSg/V+ R V/J NSg/J
|
||||
> programming algorithms to solve the same problem in vastly less time . Their
|
||||
# NSg/V+ NPl+ P NSg/V D I/J NSg/J NPrSg/J/P R V/J/C/P NSg/V/J+ . D+
|
||||
> methods were similar to the Viterbi algorithm known for some time in other
|
||||
# NPl/V+ NSg/V NSg/J P D ? NSg NSg/V/J C/P I/J/R NSg/V/J+ NPrSg/J/P NSg/V/J+
|
||||
> fields . DeRose used a table of pairs , while Church used a table of triples and a
|
||||
# NPrPl/V+ . ? V/J D/P NSg/V P NPl/V+ . NSg/V/C/P NPrSg/V+ V/J D/P NSg/V P NPl/V V/C D/P
|
||||
> method of estimating the values for triples that were rare or nonexistent in the
|
||||
# NSg/V P V D NPl/V C/P NPl/V N/I/C/D+ NSg/V NSg/V/J NPrSg/C NSg/J NPrSg/J/P D+
|
||||
> Brown Corpus ( an actual measurement of triple probabilities would require a much
|
||||
# NPrSg/V/J+ NSg . D/P NSg/J NSg P NSg/V/J NPl+ NSg/VX NSg/V D/P N/I/J
|
||||
> larger corpus ) . Both methods achieved an accuracy of over 95 % . DeRose's 1990
|
||||
# J NSg . . I/C NPl/V V/J D/P NSg P NSg/V/J/P # . . ? #
|
||||
> dissertation at Brown University included analyses of the specific error types ,
|
||||
# NSg NSg/P NPrSg/V/J NSg V/J NSg/V P D NSg/J NSg/V NPl/V .
|
||||
> probabilities , and other related data , and replicated his work for Greek , where
|
||||
# NPl . V/C NSg/V/J J NSg . V/C V/J ISg/D NSg C/P NPrSg/V/J . NSg/C
|
||||
> it proved similarly effective .
|
||||
# NPrSg/ISg V/J R NSg/J .
|
||||
# J NSg+ . . I/C NPl/V+ V/J D/P NSg P NSg/V/J/P # . . ? #
|
||||
> dissertation at Brown University included analyses of the specific error types ,
|
||||
# NSg+ NSg/P NPrSg/V/J NSg+ V/J NSg/V P D+ NSg/J+ NSg/V+ NPl/V+ .
|
||||
> probabilities , and other related data , and replicated his work for Greek , where
|
||||
# NPl+ . V/C NSg/V/J+ J+ NSg+ . V/C V/J ISg/D+ NSg/V C/P NPrSg/V/J . NSg/C
|
||||
> it proved similarly effective .
|
||||
# NPrSg/ISg+ V/J R+ NSg/J .
|
||||
>
|
||||
#
|
||||
> These findings were surprisingly disruptive to the field of natural language
|
||||
# I/D NSg NSg/V R J P D NSg P NSg/J NSg/V
|
||||
# I/D+ NSg NSg/V R J P D NSg/V P NSg/J+ NSg/V+
|
||||
> processing . The accuracy reported was higher than the typical accuracy of very
|
||||
# V . D NSg V/J V J C/P D NSg/J NSg P J
|
||||
> sophisticated algorithms that integrated part of speech choice with many higher
|
||||
# V/J NPl N/I/C/D V/J NSg/V/J P NSg/V NSg/J P N/I/J/D J
|
||||
> levels of linguistic analysis : syntax , morphology , semantics , and so on . CLAWS ,
|
||||
# NPl/V P J NSg . NSg . NSg . NSg . V/C NSg/I/J/C J/P . NPl/V .
|
||||
> DeRose's and Church's methods did fail for some of the known cases where
|
||||
# ? V/C N$ NPl/V V NSg/V/J C/P I/J/R P D NSg/J NPl/V NSg/C
|
||||
> semantics is required , but those proved negligibly rare . This convinced many in
|
||||
# NSg VL V/J . NSg/C/P I/D V/J R NSg/V/J . I/D V/J N/I/J/D P
|
||||
> the field that part - of - speech tagging could usefully be separated from the other
|
||||
# D NSg N/I/C/D NSg/V/J . P . NSg/V NSg/V NSg/VX R NSg/VX V/J P D NSg/J
|
||||
# V+ . D+ NSg+ V/J V J C/P D NSg/J NSg P J
|
||||
> sophisticated algorithms that integrated part of speech choice with many higher
|
||||
# V/J NPl+ N/I/C/D+ V/J NSg/V/J P NSg/V+ NSg/J P N/I/J/D J
|
||||
> levels of linguistic analysis : syntax , morphology , semantics , and so on . CLAWS ,
|
||||
# NPl/V P J NSg+ . NSg+ . NSg+ . NSg+ . V/C NSg/I/J/C+ J/P . NPl/V .
|
||||
> DeRose's and Church's methods did fail for some of the known cases where
|
||||
# ? V/C N$ NPl/V+ V NSg/V/J C/P I/J/R P D+ NSg/V/J+ NPl/V+ NSg/C
|
||||
> semantics is required , but those proved negligibly rare . This convinced many in
|
||||
# NSg+ VL V/J . NSg/C/P I/D+ V/J R+ NSg/V/J+ . I/D+ V/J N/I/J/D NPrSg/J/P
|
||||
> the field that part - of - speech tagging could usefully be separated from the other
|
||||
# D+ NSg/V+ N/I/C/D+ NSg/V/J . P . NSg/V NSg/V NSg/VX R NSg/VX V/J P D NSg/V/J
|
||||
> levels of processing ; this , in turn , simplified the theory and practice of
|
||||
# NPl/V P V . I/D . NPrSg/J/P NSg/V . V/J D NSg V/C NSg/V P
|
||||
# NPl/V P V . I/D+ . NPrSg/J/P NSg/V . V/J D+ NSg V/C NSg/V P
|
||||
> computerized language analysis and encouraged researchers to find ways to
|
||||
# V/J NSg/V NSg V/C V/J W? P NSg/V NPl P
|
||||
> separate other pieces as well . Markov Models became the standard method for the
|
||||
# NSg/V/J NSg/V/J NPl/V NSg/R NSg/V/J . NPr NPl/V V D NSg/J NSg/V C/P D
|
||||
> part - of - speech assignment .
|
||||
# NSg/J . P . NSg/V NSg .
|
||||
# V/J NSg/V+ NSg+ V/C V/J + P NSg/V NPl+ P
|
||||
> separate other pieces as well . Markov Models became the standard method for the
|
||||
# NSg/V/J NSg/V/J+ NPl/V+ NSg/R+ NSg/V/J . NPr NPl/V+ V D NSg/J NSg/V C/P D
|
||||
> part - of - speech assignment .
|
||||
# NSg/V/J . P . NSg/V+ NSg+ .
|
||||
>
|
||||
#
|
||||
> Unsupervised taggers
|
||||
# V/J NPl
|
||||
# V/J+ NPl
|
||||
>
|
||||
#
|
||||
> The methods already discussed involve working from a pre - existing corpus to
|
||||
# D NPl W? V/J V V P D/P NSg/P . V NSg P
|
||||
> learn tag probabilities . It is , however , also possible to bootstrap using
|
||||
# NSg/V NSg/V NPl . NPrSg/ISg VL . C . W? NSg/J P NSg/V V
|
||||
> The methods already discussed involve working from a pre - existing corpus to
|
||||
# D+ NPl/V W? V/J V V P D/P NSg/V/P+ . V NSg P
|
||||
> learn tag probabilities . It is , however , also possible to bootstrap using
|
||||
# NSg/V NSg/V+ NPl+ . NPrSg/ISg+ VL . C . W? NSg/J P NSg/V V
|
||||
> " unsupervised " tagging . Unsupervised tagging techniques use an untagged corpus
|
||||
# . V/J . NSg/V . V/J NSg/V NPl NSg/V D/P ? NSg
|
||||
> for their training data and produce the tagset by induction . That is , they
|
||||
# C/P D NSg NSg V/C NSg/V D NSg NSg/J/P NSg . N/I/C/D VL . IPl
|
||||
> observe patterns in word use , and derive part - of - speech categories themselves .
|
||||
# NSg/V NPl/V NPrSg/J/P NSg/V NSg/V . V/C NSg/V NSg/V/J . P . NSg/V NPl I .
|
||||
> For example , statistics readily reveal that " the " , " a " , and " an " occur in
|
||||
# C/P NSg/V . NPl/V R NSg/V N/I/C/D . D . . . D/P . . V/C . D/P . V NPrSg/J/P
|
||||
> similar contexts , while " eat " occurs in very different ones . With sufficient
|
||||
# NSg/J NPl/V . NSg/V/C/P . NSg/V . V NPrSg/J/P J NSg/J NPl/V . P J
|
||||
> iteration , similarity classes of words emerge that are remarkably similar to
|
||||
# NSg . NSg NPl/V P NPl/V NSg/V N/I/C/D V R NSg/J P
|
||||
# . V/J . NSg/V . V/J NSg/V NPl+ NSg/V D/P ? NSg
|
||||
> for their training data and produce the tagset by induction . That is , they
|
||||
# C/P D+ NSg/V+ NSg+ V/C NSg/V D NSg NSg/J/P+ NSg . N/I/C/D+ VL . IPl+
|
||||
> observe patterns in word use , and derive part - of - speech categories themselves .
|
||||
# NSg/V NPl/V+ NPrSg/J/P NSg/V+ NSg/V . V/C NSg/V NSg/V/J . P . NSg/V NPl+ I+ .
|
||||
> For example , statistics readily reveal that " the " , " a " , and " an " occur in
|
||||
# C/P NSg/V+ . NPl/V+ R NSg/V N/I/C/D+ . D . . . D/P . . V/C . D/P . V NPrSg/J/P
|
||||
> similar contexts , while " eat " occurs in very different ones . With sufficient
|
||||
# NSg/J+ NPl/V+ . NSg/V/C/P . NSg/V . V NPrSg/J/P J NSg/J+ NPl/V+ . P J+
|
||||
> iteration , similarity classes of words emerge that are remarkably similar to
|
||||
# NSg . NSg NPl/V P NPl/V+ NSg/V N/I/C/D+ V R NSg/J P
|
||||
> those human linguists would expect ; and the differences themselves sometimes
|
||||
# I/D NSg/V/J NPl NSg/VX V . V/C D NSg I R
|
||||
> suggest valuable new insights .
|
||||
# V NSg/J NSg/V/J NPl .
|
||||
# I/D+ NSg/V/J NPl+ NSg/VX V . V/C D+ NSg/V+ I+ R
|
||||
> suggest valuable new insights .
|
||||
# V NSg/J+ NSg/V/J+ NPl+ .
|
||||
>
|
||||
#
|
||||
> These two categories can be further subdivided into rule - based , stochastic , and
|
||||
# I/D NSg NPl NPrSg/VX NSg/VX V/J V/J P NSg/V . V/J . J . V/C
|
||||
> These two categories can be further subdivided into rule - based , stochastic , and
|
||||
# I/D NSg+ NPl NPrSg/VX NSg/VX V/J V/J P NSg/V . V/J . J . V/C
|
||||
> neural approaches .
|
||||
# J NPl/V .
|
||||
# J+ NPl/V+ .
|
||||
>
|
||||
#
|
||||
> Other taggers and methods
|
||||
# NSg/V/J NPl V/C NPl/V
|
||||
> Other taggers and methods
|
||||
# NSg/V/J+ NPl V/C NPl/V
|
||||
>
|
||||
#
|
||||
> Some current major algorithms for part - of - speech tagging include the Viterbi
|
||||
# I/J/R NSg/J NPrSg/V/J NPl C/P NSg/V/J . P . NSg/V NSg/V NSg/V D ?
|
||||
> Some current major algorithms for part - of - speech tagging include the Viterbi
|
||||
# I/J/R+ NSg/J NPrSg/V/J NPl C/P NSg/V/J . P . NSg/V NSg/V NSg/V D ?
|
||||
> algorithm , Brill tagger , Constraint Grammar , and the Baum - Welch algorithm ( also
|
||||
# NSg . NSg/J NSg . NSg NSg/V . V/C D NPr . ? NSg . W?
|
||||
> known as the forward - backward algorithm ) . Hidden Markov model and visible Markov
|
||||
# NSg/V/J NSg/R D NSg/J . NSg/J NSg . . V/J NPr NSg/V/J V/C J NPr
|
||||
> model taggers can both be implemented using the Viterbi algorithm . The
|
||||
# NSg/V/J NPl NPrSg/VX I/C NSg/VX V/J V D ? NSg . D
|
||||
> rule - based Brill tagger is unusual in that it learns a set of rule patterns , and
|
||||
# NSg . V/J NSg/J NSg VL NSg/J P N/I/C/D NPrSg/ISg NPl/V D/P NPrSg/J P NSg/V NPl/V . V/C
|
||||
> then applies those patterns rather than optimizing a statistical quantity .
|
||||
# NSg/J/C V I/D NPl/V NPrSg/V/J C/P V D/P J NSg .
|
||||
# NSg . NSg/J NSg . NSg+ NSg/V+ . V/C D NPr . ? NSg . W?
|
||||
> known as the forward - backward algorithm ) . Hidden Markov model and visible Markov
|
||||
# NSg/V/J NSg/R D NSg/V/J . NSg/J NSg+ . . V/J NPr NSg/V/J+ V/C J NPr
|
||||
> model taggers can both be implemented using the Viterbi algorithm . The
|
||||
# NSg/V/J+ NPl NPrSg/VX I/C NSg/VX V/J V D+ ? NSg . D
|
||||
> rule - based Brill tagger is unusual in that it learns a set of rule patterns , and
|
||||
# NSg/V+ . V/J NSg/J NSg VL NSg/J NPrSg/J/P N/I/C/D NPrSg/ISg+ NPl/V D/P NPrSg/V/J P NSg/V+ NPl/V+ . V/C
|
||||
> then applies those patterns rather than optimizing a statistical quantity .
|
||||
# NSg/J/C V I/D+ NPl/V+ NPrSg/V/J C/P V D/P+ J+ NSg+ .
|
||||
>
|
||||
#
|
||||
> Many machine learning methods have also been applied to the problem of POS
|
||||
# N/I/J/D NSg/V V NPl/V NSg/VX W? NSg/V V/J P D NSg/J P NSg
|
||||
> Many machine learning methods have also been applied to the problem of POS
|
||||
# N/I/J/D+ NSg/V V+ NPl/V+ NSg/VX W? NSg/V V/J P D NSg/J P NSg+
|
||||
> tagging . Methods such as SVM , maximum entropy classifier , perceptron , and
|
||||
# NSg/V . NPl/V NSg/I NSg/R ? . NSg/J NSg NSg . N . V/C
|
||||
# NSg/V+ . NPl/V+ NSg/I NSg/R ? . NSg/J NSg NSg . N . V/C
|
||||
> nearest - neighbor have all been tried , and most can achieve accuracy above
|
||||
# W? . NSg/V/J NSg/VX NSg/I/J/C NSg/V V/J . V/C NSg/I/J NPrSg/VX V NSg NSg/J/P
|
||||
# W? . NSg/V/J NSg/VX NSg/I/J/C NSg/V V/J . V/C NSg/I/J NPrSg/VX V NSg+ NSg/J/P
|
||||
> 95 % . [ citation needed ]
|
||||
# # . . . NSg V/J .
|
||||
# # . . . NSg+ V/J+ .
|
||||
>
|
||||
#
|
||||
> A direct comparison of several methods is reported ( with references ) at the ACL
|
||||
# D/P J NSg P J/D NPl/V VL V/J . P NPl/V . P D NSg
|
||||
> Wiki . This comparison uses the Penn tag set on some of the Penn Treebank data ,
|
||||
# NSg/V . I/D NSg NPl/V D NPr NSg/V NPrSg/V/J J/P I/J/R P D NPr ? NSg .
|
||||
> A direct comparison of several methods is reported ( with references ) at the ACL
|
||||
# D/P V/J NSg P J/D+ NPl/V+ VL V/J . P NPl/V+ . NSg/P D+ NSg+
|
||||
> Wiki . This comparison uses the Penn tag set on some of the Penn Treebank data ,
|
||||
# NSg/V+ . I/D+ NSg+ NPl/V D+ NPr+ NSg/V+ NPrSg/V/J J/P I/J/R P D+ NPr+ ? NSg+ .
|
||||
> so the results are directly comparable . However , many significant taggers are
|
||||
# NSg/I/J/C D NPl V R/C NSg/J . C . N/I/J/D NSg/J NPl V
|
||||
> not included ( perhaps because of the labor involved in reconfiguring them for
|
||||
# NSg/C V/J . NSg C/P P D NPrSg/Am/Au V/J NPrSg/J/P V N/I C/P
|
||||
> this particular dataset ) . Thus , it should not be assumed that the results
|
||||
# I/D NSg/J NSg . . NSg . NPrSg/ISg VX NSg/C NSg/VX V/J N/I/C/D D NPl
|
||||
> reported here are the best that can be achieved with a given approach ; nor even
|
||||
# V/J NSg/J/R V D NPrSg/J N/I/C/D NPrSg/VX NSg/VX V/J P D/P NSg/J/P NSg/V . NSg/C NSg/V/J
|
||||
> the best that have been achieved with a given approach .
|
||||
# D NPrSg/J N/I/C/D NSg/VX NSg/V V/J P D/P NSg/J/P NSg/V .
|
||||
# NSg/I/J/C D+ NPl/V+ V R/C NSg/J+ . C . N/I/J/D NSg/J NPl V
|
||||
> not included ( perhaps because of the labor involved in reconfiguring them for
|
||||
# NSg/C V/J . NSg C/P P D+ NPrSg/V/Am/Au+ V/J NPrSg/J/P V N/I+ C/P
|
||||
> this particular dataset ) . Thus , it should not be assumed that the results
|
||||
# I/D+ NSg/J+ NSg . . NSg . NPrSg/ISg+ VX NSg/C NSg/VX V/J N/I/C/D D+ NPl/V+
|
||||
> reported here are the best that can be achieved with a given approach ; nor even
|
||||
# V/J NSg/J/R V D NPrSg/VX/J N/I/C/D+ NPrSg/VX NSg/VX V/J P D/P+ NSg/V/J/P+ NSg/V+ . NSg/C NSg/V/J
|
||||
> the best that have been achieved with a given approach .
|
||||
# D+ NPrSg/VX/J+ N/I/C/D+ NSg/VX NSg/V V/J P D/P+ NSg/V/J/P+ NSg/V+ .
|
||||
>
|
||||
#
|
||||
> In 2014 , a paper reporting using the structure regularization method for
|
||||
# P # . D/P NSg/J V V D NSg NSg NSg/V C/P
|
||||
> part - of - speech tagging , achieving 97.36 % on a standard benchmark dataset .
|
||||
# NSg/V/J . P . NSg/V NSg/V . V # . P D/P NSg/J NSg/V NSg .
|
||||
> In 2014 , a paper reporting using the structure regularization method for
|
||||
# NPrSg/J/P # . D/P+ NSg/V/J+ V V D+ NSg/V+ NSg NSg/V C/P
|
||||
> part - of - speech tagging , achieving 97.36 % on a standard benchmark dataset .
|
||||
# NSg/V/J . P . NSg/V NSg/V . V # . J/P D/P NSg/J+ NSg/V+ NSg .
|
||||
|
|
|
@ -2,25 +2,25 @@
|
|||
# NSg/V
|
||||
>
|
||||
#
|
||||
> This document contains example sentences with misspelled words that we want to test the spell checker on .
|
||||
# I/D NSg/V V NSg/V NPl/V P V/J NPl/V N/I/C/D IPl NSg/V P NSg/V D NSg NSg/V J/P .
|
||||
> This document contains example sentences with misspelled words that we want to test the spell checker on .
|
||||
# I/D+ NSg/V V NSg/V+ NPl/V P V/J+ NPl/V+ N/I/C/D+ IPl+ NSg/V P NSg/V D NSg/V NSg/V J/P .
|
||||
>
|
||||
#
|
||||
> Example Sentences
|
||||
# NSg/V NPl/V
|
||||
# NSg/V+ NPl/V
|
||||
>
|
||||
#
|
||||
> My favourite color is blu .
|
||||
# D NSg/J/Ca/Au/Br NSg/V/J/Am VL W? .
|
||||
> I must defend my honour !
|
||||
# ISg NSg/V NSg/V D NSg/Ca/Au/Br .
|
||||
> I recognize that you recognise me .
|
||||
# ISg V N/I/C/D IPl V/Au/Br NPrSg/ISg .
|
||||
> I analyze how you infantilize me .
|
||||
# ISg V NSg/C IPl V NPrSg/ISg .
|
||||
> I analyse how you infantilise me .
|
||||
# ISg V/Au/Br NSg/C IPl ? NPrSg/ISg .
|
||||
> Careful , traveller !
|
||||
# J . NSg/Ca/Au/Br .
|
||||
> At the centre of the theatre I dropped a litre of coke .
|
||||
# P D NSg/Ca/Au/Br P D NSg/Ca/Au/Br ISg V/J D/P NSg/Ca/Au/Br P NPrSg/V .
|
||||
> My favourite color is blu .
|
||||
# D+ NSg/V/J/Ca/Au/Br NSg/V/J/Am VL+ W? .
|
||||
> I must defend my honour !
|
||||
# ISg+ NSg/V NSg/V D+ NSg/V/Ca/Au/Br+ .
|
||||
> I recognize that you recognise me .
|
||||
# ISg+ V N/I/C/D IPl+ V/Au/Br NPrSg/ISg+ .
|
||||
> I analyze how you infantilize me .
|
||||
# ISg+ V NSg/C IPl+ V NPrSg/ISg+ .
|
||||
> I analyse how you infantilise me .
|
||||
# ISg+ V/Au/Br NSg/C IPl+ ? NPrSg/ISg+ .
|
||||
> Careful , traveller !
|
||||
# J . NSg/Ca/Au/Br+ .
|
||||
> At the centre of the theatre I dropped a litre of coke .
|
||||
# NSg/P D NSg/V/Ca/Au/Br P D+ NSg/Ca/Au/Br+ ISg+ V/J D/P NSg/Ca/Au/Br P NPrSg/V+ .
|
||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,74 +1,74 @@
|
|||
> " This " and " that " are common and fulfill multiple purposes in everyday English .
|
||||
# . I/D . V/C . N/I/C/D . V NSg/V/J V/C V NSg/J NPl/V NPrSg/J/P NSg/J NPrSg/V/J .
|
||||
> As such , disambiguating them is necessary .
|
||||
# NSg/R NSg/I . V N/I VL NSg/J .
|
||||
> " This " and " that " are common and fulfill multiple purposes in everyday English .
|
||||
# . I/D+ . V/C . N/I/C/D+ . V NSg/V/J V/C V NSg/J NPl/V NPrSg/J/P NSg/J+ NPrSg/V/J+ .
|
||||
> As such , disambiguating them is necessary .
|
||||
# NSg/R NSg/I . V N/I+ VL+ NSg/J .
|
||||
>
|
||||
#
|
||||
> This document contains various sentences that use " this " , " that " , " these " , and
|
||||
# I/D NSg/V V J NPl/V N/I/C/D NSg/V . I/D . . . N/I/C/D . . . I/D . . V/C
|
||||
> " those " in different contexts with a lot of edge cases .
|
||||
# . I/D . NPrSg/J/P NSg/J NPl/V P D/P NPrSg P NSg/V NPl/V .
|
||||
> This document contains various sentences that use " this " , " that " , " these " , and
|
||||
# I/D+ NSg/V V J NPl/V+ N/I/C/D+ NSg/V . I/D+ . . . N/I/C/D+ . . . I/D+ . . V/C
|
||||
> " those " in different contexts with a lot of edge cases .
|
||||
# . I/D . NPrSg/J/P NSg/J NPl/V P D/P NPrSg/V P NSg/V+ NPl/V+ .
|
||||
>
|
||||
#
|
||||
> Examples
|
||||
# NPl/V
|
||||
# NPl/V+
|
||||
>
|
||||
#
|
||||
> This triangle is nice .
|
||||
# I/D NSg VL NPrSg/V/J .
|
||||
> This is nice .
|
||||
# I/D VL NPrSg/V/J .
|
||||
> That triangle is nice .
|
||||
# N/I/C/D NSg VL NPrSg/V/J .
|
||||
> That is nice .
|
||||
# N/I/C/D VL NPrSg/V/J .
|
||||
> These triangles are nice .
|
||||
# I/D NPl V NPrSg/V/J .
|
||||
> These are nice .
|
||||
# I/D V NPrSg/V/J .
|
||||
> Those triangles are nice .
|
||||
# I/D NPl V NPrSg/V/J .
|
||||
> This triangle is nice .
|
||||
# I/D+ NSg+ VL+ NPrSg/V/J+ .
|
||||
> This is nice .
|
||||
# I/D+ VL+ NPrSg/V/J+ .
|
||||
> That triangle is nice .
|
||||
# N/I/C/D+ NSg+ VL+ NPrSg/V/J+ .
|
||||
> That is nice .
|
||||
# N/I/C/D+ VL+ NPrSg/V/J+ .
|
||||
> These triangles are nice .
|
||||
# I/D+ NPl+ V+ NPrSg/V/J+ .
|
||||
> These are nice .
|
||||
# I/D+ V+ NPrSg/V/J+ .
|
||||
> Those triangles are nice .
|
||||
# I/D+ NPl+ V+ NPrSg/V/J+ .
|
||||
> Those are nice .
|
||||
# I/D V NPrSg/V/J .
|
||||
# I/D+ V+ NPrSg/V/J .
|
||||
>
|
||||
#
|
||||
> This massage is nice .
|
||||
# I/D NSg/V VL NPrSg/V/J .
|
||||
> That massage is nice .
|
||||
# N/I/C/D NSg/V VL NPrSg/V/J .
|
||||
> These massages are nice .
|
||||
# I/D NPl/V V NPrSg/V/J .
|
||||
> Those massages are nice .
|
||||
# I/D NPl/V V NPrSg/V/J .
|
||||
> This massages well .
|
||||
# I/D NPl/V NSg/V/J .
|
||||
> That massages well .
|
||||
# N/I/C/D NPl/V NSg/V/J .
|
||||
> These massage well .
|
||||
# I/D NSg/V NSg/V/J .
|
||||
> Those massage well .
|
||||
# I/D NSg/V NSg/V/J .
|
||||
> This massage is nice .
|
||||
# I/D+ NSg/V+ VL+ NPrSg/V/J+ .
|
||||
> That massage is nice .
|
||||
# N/I/C/D NSg/V+ VL+ NPrSg/V/J+ .
|
||||
> These massages are nice .
|
||||
# I/D+ NPl/V+ V+ NPrSg/V/J+ .
|
||||
> Those massages are nice .
|
||||
# I/D+ NPl/V+ V+ NPrSg/V/J+ .
|
||||
> This massages well .
|
||||
# I/D+ NPl/V+ NSg/V/J+ .
|
||||
> That massages well .
|
||||
# N/I/C/D+ NPl/V+ NSg/V/J+ .
|
||||
> These massage well .
|
||||
# I/D+ NSg/V+ NSg/V/J+ .
|
||||
> Those massage well .
|
||||
# I/D+ NSg/V+ NSg/V/J+ .
|
||||
>
|
||||
#
|
||||
> That could be a solution .
|
||||
# N/I/C/D NSg/VX NSg/VX D/P NSg .
|
||||
> Find all candidates that could be a solution .
|
||||
# NSg/V NSg/I/J/C NPl/V N/I/C/D NSg/VX NSg/VX D/P NSg .
|
||||
> That could be a solution .
|
||||
# N/I/C/D+ NSg/VX NSg/VX D/P NSg .
|
||||
> Find all candidates that could be a solution .
|
||||
# NSg/V NSg/I/J/C+ NPl/V+ N/I/C/D+ NSg/VX NSg/VX D/P NSg+ .
|
||||
>
|
||||
#
|
||||
> This is all that I have .
|
||||
# I/D VL NSg/I/J/C N/I/C/D ISg NSg/VX .
|
||||
> This is all that solutions can do .
|
||||
# I/D VL NSg/I/J/C N/I/C/D NPl NPrSg/VX NSg/VX .
|
||||
> That solution can do .
|
||||
# N/I/C/D NSg NPrSg/VX NSg/VX .
|
||||
> This is all that I have .
|
||||
# I/D+ VL NSg/I/J/C N/I/C/D ISg+ NSg/VX+ .
|
||||
> This is all that solutions can do .
|
||||
# I/D+ VL NSg/I/J/C N/I/C/D NPl+ NPrSg/VX+ NSg/VX .
|
||||
> That solution can do .
|
||||
# N/I/C/D NSg+ NPrSg/VX+ NSg/VX .
|
||||
>
|
||||
#
|
||||
> We can do this !
|
||||
# IPl NPrSg/VX NSg/VX I/D .
|
||||
> I can do this and that .
|
||||
# ISg NPrSg/VX NSg/VX I/D V/C N/I/C/D .
|
||||
> We can do this !
|
||||
# IPl+ NPrSg/VX NSg/VX I/D+ .
|
||||
> I can do this and that .
|
||||
# ISg+ NPrSg/VX NSg/VX I/D V/C N/I/C/D+ .
|
||||
>
|
||||
#
|
||||
> We unite to stand united in unity .
|
||||
# IPl NSg/V P NSg/V V/J NPrSg/J/P NSg .
|
||||
> We unite to stand united in unity .
|
||||
# IPl+ NSg/V P NSg/V V/J NPrSg/J/P NSg+ .
|
||||
|
|
19
harper-pos-utils/Cargo.toml
Normal file
19
harper-pos-utils/Cargo.toml
Normal file
|
@ -0,0 +1,19 @@
|
|||
[package]
|
||||
name = "harper-pos-utils"
|
||||
version = "0.42.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
rs-conllu = "0.3.0"
|
||||
hashbrown = { version = "0.15.3", features = ["serde"] }
|
||||
strum = "0.27.1"
|
||||
strum_macros = "0.27.1"
|
||||
serde = { version = "1.0.219", features = ["derive"] }
|
||||
is-macro = "0.3.7"
|
||||
rayon = { version = "1.10.0", optional = true }
|
||||
rand = { version = "0.9.1", optional = true }
|
||||
|
||||
[features]
|
||||
default = []
|
||||
threaded = ["dep:rayon"]
|
||||
training = ["dep:rand"]
|
270
harper-pos-utils/src/chunker/brill_chunker/mod.rs
Normal file
270
harper-pos-utils/src/chunker/brill_chunker/mod.rs
Normal file
|
@ -0,0 +1,270 @@
|
|||
mod patch;
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
use std::path::Path;
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
use crate::word_counter::WordCounter;
|
||||
use crate::{
|
||||
UPOS,
|
||||
chunker::{Chunker, upos_freq_dict::UPOSFreqDict},
|
||||
};
|
||||
|
||||
use patch::Patch;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct BrillChunker {
|
||||
base: UPOSFreqDict,
|
||||
patches: Vec<Patch>,
|
||||
}
|
||||
|
||||
impl BrillChunker {
|
||||
pub fn new(base: UPOSFreqDict) -> Self {
|
||||
Self {
|
||||
base,
|
||||
patches: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn apply_patches(&self, sentence: &[String], tags: &[Option<UPOS>], np_states: &mut [bool]) {
|
||||
for patch in &self.patches {
|
||||
for i in 0..sentence.len() {
|
||||
if patch.from == np_states[i]
|
||||
&& patch.criteria.fulfils(sentence, tags, np_states, i)
|
||||
{
|
||||
np_states[i] = !np_states[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Chunker for BrillChunker {
|
||||
fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
|
||||
let mut initial_pass = self.base.chunk_sentence(sentence, tags);
|
||||
|
||||
self.apply_patches(sentence, tags, &mut initial_pass);
|
||||
|
||||
initial_pass
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
type CandidateArgs = (Vec<String>, Vec<Option<UPOS>>, Vec<bool>);
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
impl BrillChunker {
|
||||
/// Tag a provided sentence with the tagger, providing the "correct" tags (from a dataset or
|
||||
/// other source), returning the number of errors.
|
||||
pub fn count_patch_errors(
|
||||
&self,
|
||||
sentence: &[String],
|
||||
tags: &[Option<UPOS>],
|
||||
base_flags: &[bool],
|
||||
correct_np_flags: &[bool],
|
||||
) -> usize {
|
||||
let mut flags = base_flags.to_vec();
|
||||
self.apply_patches(sentence, tags, &mut flags);
|
||||
|
||||
let mut loss = 0;
|
||||
for (a, b) in flags.into_iter().zip(correct_np_flags) {
|
||||
if a != *b {
|
||||
loss += 1;
|
||||
}
|
||||
}
|
||||
|
||||
loss
|
||||
}
|
||||
|
||||
/// Tag a provided sentence with the tagger, providing the "correct" tags (from a dataset or
|
||||
/// other source), returning the number of errors.
|
||||
pub fn count_chunk_errors(
|
||||
&self,
|
||||
sentence: &[String],
|
||||
tags: &[Option<UPOS>],
|
||||
correct_np_flags: &[bool],
|
||||
relevant_words: &mut WordCounter,
|
||||
) -> usize {
|
||||
let flags = self.chunk_sentence(sentence, tags);
|
||||
|
||||
let mut loss = 0;
|
||||
for ((a, b), word) in flags.into_iter().zip(correct_np_flags).zip(sentence) {
|
||||
if a != *b {
|
||||
loss += 1;
|
||||
relevant_words.inc(word);
|
||||
}
|
||||
}
|
||||
|
||||
loss
|
||||
}
|
||||
|
||||
/// To speed up training, only try a subset of all possible candidates.
|
||||
/// How many to select is given by the `candidate_selection_chance`. A higher chance means a
|
||||
/// longer training time.
|
||||
fn epoch(&mut self, training_files: &[impl AsRef<Path>], candidate_selection_chance: f32) {
|
||||
use crate::conllu_utils::iter_sentences_in_conllu;
|
||||
use rs_conllu::Sentence;
|
||||
use std::time::Instant;
|
||||
|
||||
assert!((0.0..=1.0).contains(&candidate_selection_chance));
|
||||
|
||||
let mut total_tokens = 0;
|
||||
let mut error_counter = 0;
|
||||
|
||||
let sentences: Vec<Sentence> = training_files
|
||||
.iter()
|
||||
.flat_map(iter_sentences_in_conllu)
|
||||
.collect();
|
||||
let mut sentences_flagged: Vec<CandidateArgs> = Vec::new();
|
||||
|
||||
for sent in &sentences {
|
||||
use hashbrown::HashSet;
|
||||
|
||||
use crate::chunker::np_extraction::locate_noun_phrases_in_sent;
|
||||
|
||||
let mut toks: Vec<String> = Vec::new();
|
||||
let mut tags = Vec::new();
|
||||
|
||||
for token in &sent.tokens {
|
||||
let form = token.form.clone();
|
||||
if let Some(last) = toks.last_mut() {
|
||||
match form.as_str() {
|
||||
"sn't" | "n't" | "'ll" | "'ve" | "'re" | "'d" | "'m" | "'s" => {
|
||||
last.push_str(&form);
|
||||
continue;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
toks.push(form);
|
||||
tags.push(token.upos.and_then(UPOS::from_conllu));
|
||||
}
|
||||
|
||||
let actual = locate_noun_phrases_in_sent(sent);
|
||||
let actual_flat = actual.into_iter().fold(HashSet::new(), |mut a, b| {
|
||||
a.extend(b.into_iter());
|
||||
a
|
||||
});
|
||||
|
||||
let mut actual_seq = Vec::new();
|
||||
|
||||
for el in actual_flat {
|
||||
if el >= actual_seq.len() {
|
||||
actual_seq.resize(el + 1, false);
|
||||
}
|
||||
actual_seq[el] = true;
|
||||
}
|
||||
|
||||
sentences_flagged.push((toks, tags, actual_seq));
|
||||
}
|
||||
|
||||
let mut relevant_words = WordCounter::default();
|
||||
|
||||
for (tok_buf, tag_buf, flag_buf) in &sentences_flagged {
|
||||
total_tokens += tok_buf.len();
|
||||
error_counter += self.count_chunk_errors(
|
||||
tok_buf.as_slice(),
|
||||
tag_buf,
|
||||
flag_buf.as_slice(),
|
||||
&mut relevant_words,
|
||||
);
|
||||
}
|
||||
|
||||
println!("=============");
|
||||
println!("Total tokens in training set: {}", total_tokens);
|
||||
println!("Tokens incorrectly flagged: {}", error_counter);
|
||||
println!(
|
||||
"Error rate: {}%",
|
||||
error_counter as f32 / total_tokens as f32 * 100.
|
||||
);
|
||||
|
||||
// Before adding any patches, let's get a good base.
|
||||
let mut base_flags = Vec::new();
|
||||
for (toks, tags, _) in &sentences_flagged {
|
||||
base_flags.push(self.chunk_sentence(toks, tags));
|
||||
}
|
||||
|
||||
let all_candidates = Patch::generate_candidate_patches(&relevant_words);
|
||||
let mut pruned_candidates: Vec<Patch> = rand::seq::IndexedRandom::choose_multiple(
|
||||
all_candidates.as_slice(),
|
||||
&mut rand::rng(),
|
||||
(all_candidates.len() as f32 * candidate_selection_chance) as usize,
|
||||
)
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
#[cfg(feature = "threaded")]
|
||||
rayon::slice::ParallelSliceMut::par_sort_by_cached_key(
|
||||
pruned_candidates.as_mut_slice(),
|
||||
|candidate: &Patch| {
|
||||
self.score_candidate(candidate.clone(), &sentences_flagged, &base_flags)
|
||||
},
|
||||
);
|
||||
|
||||
#[cfg(not(feature = "threaded"))]
|
||||
pruned_candidates.sort_by_cached_key(|candidate| {
|
||||
self.score_candidate(candidate.clone(), &sentences_flagged, &base_flags)
|
||||
});
|
||||
|
||||
let duration = start.elapsed();
|
||||
let seconds = duration.as_secs();
|
||||
let millis = duration.subsec_millis();
|
||||
|
||||
println!(
|
||||
"It took {} seconds and {} milliseconds to search through {} candidates at {} c/sec.",
|
||||
seconds,
|
||||
millis,
|
||||
pruned_candidates.len(),
|
||||
pruned_candidates.len() as f32 / seconds as f32
|
||||
);
|
||||
|
||||
if let Some(best) = pruned_candidates.first() {
|
||||
self.patches.push(best.clone());
|
||||
}
|
||||
}
|
||||
|
||||
/// Lower is better
|
||||
fn score_candidate(
|
||||
&self,
|
||||
candidate: Patch,
|
||||
sentences_flagged: &[CandidateArgs],
|
||||
base_flags: &[Vec<bool>],
|
||||
) -> usize {
|
||||
let mut tagger = BrillChunker::new(UPOSFreqDict::default());
|
||||
tagger.patches.push(candidate);
|
||||
|
||||
let mut errors = 0;
|
||||
|
||||
for ((toks, tags, flags), base) in sentences_flagged.iter().zip(base_flags.iter()) {
|
||||
errors += tagger.count_patch_errors(toks.as_slice(), tags.as_slice(), base, flags);
|
||||
}
|
||||
|
||||
errors
|
||||
}
|
||||
|
||||
/// Train a brand-new tagger on a `.conllu` dataset, provided via a path.
|
||||
/// This does not do _any_ error handling, and should not run in production.
|
||||
/// It should be used for training a model that _will_ be used in production.
|
||||
pub fn train(
|
||||
training_files: &[impl AsRef<Path>],
|
||||
epochs: usize,
|
||||
candidate_selection_chance: f32,
|
||||
) -> Self {
|
||||
let mut freq_dict = UPOSFreqDict::default();
|
||||
|
||||
for file in training_files {
|
||||
freq_dict.inc_from_conllu_file(file);
|
||||
}
|
||||
|
||||
let mut chunker = Self::new(freq_dict);
|
||||
|
||||
for _ in 0..epochs {
|
||||
chunker.epoch(training_files, candidate_selection_chance);
|
||||
}
|
||||
|
||||
chunker
|
||||
}
|
||||
}
|
121
harper-pos-utils/src/chunker/brill_chunker/patch.rs
Normal file
121
harper-pos-utils/src/chunker/brill_chunker/patch.rs
Normal file
|
@ -0,0 +1,121 @@
|
|||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::patch_criteria::PatchCriteria;
|
||||
#[cfg(feature = "training")]
|
||||
use crate::word_counter::WordCounter;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Patch {
|
||||
pub from: bool,
|
||||
pub criteria: PatchCriteria,
|
||||
}
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
impl Patch {
|
||||
pub fn generate_candidate_patches(relevant_words: &WordCounter) -> Vec<Self> {
|
||||
use crate::UPOS;
|
||||
use strum::IntoEnumIterator;
|
||||
|
||||
const TOP_N_WORDS: usize = 50;
|
||||
const REL_POS: [isize; 7] = [-3, -2, -1, 0, 1, 2, 3];
|
||||
|
||||
let mut atoms: Vec<(bool, PatchCriteria)> = Vec::new();
|
||||
|
||||
for from in [false, true] {
|
||||
for rel in REL_POS {
|
||||
for tag in UPOS::iter() {
|
||||
atoms.push((
|
||||
from,
|
||||
PatchCriteria::WordIsTaggedWith {
|
||||
relative: rel,
|
||||
is_tagged: tag,
|
||||
},
|
||||
));
|
||||
}
|
||||
}
|
||||
for max_rel in 1..=5 {
|
||||
for tag in UPOS::iter() {
|
||||
atoms.push((
|
||||
from,
|
||||
PatchCriteria::AnyWordIsTaggedWith {
|
||||
max_relative: max_rel,
|
||||
is_tagged: tag,
|
||||
},
|
||||
));
|
||||
}
|
||||
}
|
||||
for prev in UPOS::iter() {
|
||||
for post in UPOS::iter() {
|
||||
atoms.push((
|
||||
from,
|
||||
PatchCriteria::SandwichTaggedWith {
|
||||
prev_word_tagged: prev,
|
||||
post_word_tagged: post,
|
||||
},
|
||||
));
|
||||
}
|
||||
}
|
||||
for rel in REL_POS {
|
||||
for is_np in [false, true] {
|
||||
atoms.push((
|
||||
from,
|
||||
PatchCriteria::NounPhraseAt {
|
||||
is_np,
|
||||
relative: rel,
|
||||
},
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let tag_atom_count = atoms.len();
|
||||
|
||||
let mut word_atoms: Vec<(bool, PatchCriteria)> = Vec::new();
|
||||
for from in [false, true] {
|
||||
for rel in REL_POS {
|
||||
for w in relevant_words.iter_top_n_words(TOP_N_WORDS) {
|
||||
word_atoms.push((
|
||||
from,
|
||||
PatchCriteria::WordIs {
|
||||
relative: rel,
|
||||
word: w.clone(),
|
||||
},
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
atoms.extend(word_atoms);
|
||||
|
||||
let total_atoms = atoms.len();
|
||||
let word_start = tag_atom_count;
|
||||
let word_atoms_ct = total_atoms - word_start;
|
||||
let combos_ct = word_atoms_ct * total_atoms - word_atoms_ct;
|
||||
let mut patches = Vec::with_capacity(total_atoms + combos_ct);
|
||||
|
||||
for (from, crit) in &atoms {
|
||||
patches.push(Self {
|
||||
from: *from,
|
||||
criteria: crit.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
for i in word_start..total_atoms {
|
||||
let (from_i, ref crit_i) = atoms[i];
|
||||
for (j, (_from_j, crit_j)) in atoms.iter().enumerate() {
|
||||
if i == j {
|
||||
continue;
|
||||
}
|
||||
patches.push(Self {
|
||||
from: from_i,
|
||||
criteria: PatchCriteria::Combined {
|
||||
a: Box::new(crit_i.clone()),
|
||||
b: Box::new(crit_j.clone()),
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
patches
|
||||
}
|
||||
}
|
17
harper-pos-utils/src/chunker/mod.rs
Normal file
17
harper-pos-utils/src/chunker/mod.rs
Normal file
|
@ -0,0 +1,17 @@
|
|||
use crate::UPOS;
|
||||
|
||||
mod brill_chunker;
|
||||
#[cfg(feature = "training")]
|
||||
mod np_extraction;
|
||||
mod upos_freq_dict;
|
||||
|
||||
pub use brill_chunker::BrillChunker;
|
||||
pub use upos_freq_dict::UPOSFreqDict;
|
||||
|
||||
/// An implementer of this trait is capable of identifying the noun phrases in a provided sentence.
|
||||
pub trait Chunker {
|
||||
/// Iterate over the sentence, identifying the noun phrases contained within.
|
||||
/// A token marked `true` is a component of a noun phrase.
|
||||
/// A token marked `false` is not.
|
||||
fn chunk_sentence(&self, sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool>;
|
||||
}
|
106
harper-pos-utils/src/chunker/np_extraction.rs
Normal file
106
harper-pos-utils/src/chunker/np_extraction.rs
Normal file
|
@ -0,0 +1,106 @@
|
|||
use std::collections::VecDeque;
|
||||
|
||||
use hashbrown::HashSet;
|
||||
use rs_conllu::{Sentence, Token, TokenID, UPOS};
|
||||
|
||||
pub fn locate_noun_phrases_in_sent(sent: &Sentence) -> Vec<HashSet<usize>> {
|
||||
let mut found_noun_phrases = Vec::new();
|
||||
|
||||
for (i, token) in sent.tokens.iter().enumerate() {
|
||||
if token.upos.is_some_and(is_root_upos) {
|
||||
let noun_phrase = locate_noun_phrase_with_head_at(i, sent);
|
||||
|
||||
found_noun_phrases.push(noun_phrase);
|
||||
}
|
||||
}
|
||||
|
||||
found_noun_phrases.retain(is_contiguous);
|
||||
|
||||
reduce_to_maximal_nonoverlapping(found_noun_phrases)
|
||||
}
|
||||
|
||||
fn is_contiguous(indices: &HashSet<usize>) -> bool {
|
||||
if indices.is_empty() {
|
||||
return false;
|
||||
}
|
||||
let lo = *indices.iter().min().unwrap();
|
||||
let hi = *indices.iter().max().unwrap();
|
||||
hi - lo + 1 == indices.len()
|
||||
}
|
||||
|
||||
fn reduce_to_maximal_nonoverlapping(mut phrases: Vec<HashSet<usize>>) -> Vec<HashSet<usize>> {
|
||||
phrases.sort_by_key(|s| usize::MAX - s.len());
|
||||
let mut selected = Vec::new();
|
||||
let mut occupied = HashSet::new();
|
||||
|
||||
for p in phrases {
|
||||
if p.is_disjoint(&occupied) {
|
||||
occupied.extend(&p);
|
||||
selected.push(p);
|
||||
}
|
||||
}
|
||||
|
||||
selected
|
||||
}
|
||||
|
||||
fn locate_noun_phrase_with_head_at(head_index: usize, sent: &Sentence) -> HashSet<usize> {
|
||||
let mut children = HashSet::new();
|
||||
let mut queue = VecDeque::new();
|
||||
queue.push_back(head_index);
|
||||
|
||||
while let Some(c_i) = queue.pop_front() {
|
||||
if children.contains(&c_i) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let tok = &sent.tokens[c_i];
|
||||
|
||||
if is_noun_phrase_constituent(tok) || tok.upos.is_some_and(is_root_upos) {
|
||||
children.insert(c_i);
|
||||
queue.extend(get_children(sent, c_i));
|
||||
}
|
||||
}
|
||||
|
||||
children
|
||||
}
|
||||
|
||||
fn is_root_upos(upos: UPOS) -> bool {
|
||||
use UPOS::*;
|
||||
matches!(upos, NOUN | PROPN | PRON)
|
||||
}
|
||||
|
||||
/// Get the indices of the children of a given node.
|
||||
fn get_children(sent: &Sentence, of_node: usize) -> Vec<usize> {
|
||||
let mut children = Vec::new();
|
||||
|
||||
for (index, token) in sent.tokens.iter().enumerate() {
|
||||
if index == of_node {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(head) = token.head {
|
||||
let is_child = match head {
|
||||
TokenID::Single(i) => i != 0 && i - 1 == of_node,
|
||||
TokenID::Range(start, end) => (start - 1..end - 1).contains(&of_node),
|
||||
TokenID::Empty(_, _) => false,
|
||||
};
|
||||
|
||||
if is_child {
|
||||
children.push(index)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
children
|
||||
}
|
||||
|
||||
fn is_noun_phrase_constituent(token: &Token) -> bool {
|
||||
let Some(ref deprel) = token.deprel else {
|
||||
return false;
|
||||
};
|
||||
|
||||
matches!(
|
||||
deprel.as_str(),
|
||||
"det" | "amod" | "nummod" | "compound" | "fixed" | "flat" | "acl" | "aux:pass"
|
||||
)
|
||||
}
|
71
harper-pos-utils/src/chunker/upos_freq_dict.rs
Normal file
71
harper-pos-utils/src/chunker/upos_freq_dict.rs
Normal file
|
@ -0,0 +1,71 @@
|
|||
#[cfg(feature = "training")]
|
||||
use std::path::Path;
|
||||
|
||||
use hashbrown::HashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::UPOS;
|
||||
|
||||
use super::Chunker;
|
||||
|
||||
/// Tracks the number of times any given UPOS is associated with a noun phrase.
|
||||
/// Used as the baseline for the chunker.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct UPOSFreqDict {
|
||||
/// The # of times each [`UPOS`] was not part of an NP subtracted from the number of times it
|
||||
/// was.
|
||||
pub counts: HashMap<UPOS, isize>,
|
||||
}
|
||||
|
||||
impl UPOSFreqDict {
|
||||
pub fn is_likely_np_component(&self, upos: &UPOS) -> bool {
|
||||
self.counts.get(upos).cloned().unwrap_or_default() > 0
|
||||
}
|
||||
}
|
||||
|
||||
impl Chunker for UPOSFreqDict {
|
||||
fn chunk_sentence(&self, _sentence: &[String], tags: &[Option<UPOS>]) -> Vec<bool> {
|
||||
tags.iter()
|
||||
.map(|t| {
|
||||
t.as_ref()
|
||||
.map(|t| self.is_likely_np_component(t))
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
impl UPOSFreqDict {
|
||||
/// Increment the count for a particular lint kind.
|
||||
pub fn inc_is_np(&mut self, upos: UPOS, is_np: bool) {
|
||||
self.counts
|
||||
.entry(upos)
|
||||
.and_modify(|counter| *counter += if is_np { 1 } else { -1 })
|
||||
.or_insert(1);
|
||||
}
|
||||
|
||||
/// Parse a `.conllu` file and use it to train a frequency dictionary.
|
||||
/// For error-handling purposes, this function should not be made accessible outside of training.
|
||||
pub fn inc_from_conllu_file(&mut self, path: impl AsRef<Path>) {
|
||||
use super::np_extraction::locate_noun_phrases_in_sent;
|
||||
use crate::conllu_utils::iter_sentences_in_conllu;
|
||||
|
||||
for sent in iter_sentences_in_conllu(path) {
|
||||
use hashbrown::HashSet;
|
||||
|
||||
let noun_phrases = locate_noun_phrases_in_sent(&sent);
|
||||
|
||||
let flat = noun_phrases.into_iter().fold(HashSet::new(), |mut a, b| {
|
||||
a.extend(b);
|
||||
a
|
||||
});
|
||||
|
||||
for (i, token) in sent.tokens.iter().enumerate() {
|
||||
if let Some(upos) = token.upos.and_then(UPOS::from_conllu) {
|
||||
self.inc_is_np(upos, flat.contains(&i))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
12
harper-pos-utils/src/conllu_utils.rs
Normal file
12
harper-pos-utils/src/conllu_utils.rs
Normal file
|
@ -0,0 +1,12 @@
|
|||
use std::{fs::File, path::Path};
|
||||
|
||||
use rs_conllu::{Sentence, parse_file};
|
||||
|
||||
/// Produce an iterator over the sentences in a `.conllu` file.
|
||||
/// Will panic on error, so this should not be used outside of training.
|
||||
pub fn iter_sentences_in_conllu(path: impl AsRef<Path>) -> impl Iterator<Item = Sentence> {
|
||||
let file = File::open(path).unwrap();
|
||||
let doc = parse_file(file);
|
||||
|
||||
doc.map(|v| v.unwrap())
|
||||
}
|
12
harper-pos-utils/src/lib.rs
Normal file
12
harper-pos-utils/src/lib.rs
Normal file
|
@ -0,0 +1,12 @@
|
|||
mod chunker;
|
||||
#[cfg(feature = "training")]
|
||||
mod conllu_utils;
|
||||
mod patch_criteria;
|
||||
mod tagger;
|
||||
mod upos;
|
||||
#[cfg(feature = "training")]
|
||||
mod word_counter;
|
||||
|
||||
pub use chunker::{BrillChunker, Chunker, UPOSFreqDict};
|
||||
pub use tagger::{BrillTagger, FreqDict, FreqDictBuilder, Tagger};
|
||||
pub use upos::{UPOS, UPOSIter};
|
126
harper-pos-utils/src/patch_criteria.rs
Normal file
126
harper-pos-utils/src/patch_criteria.rs
Normal file
|
@ -0,0 +1,126 @@
|
|||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::UPOS;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Hash, PartialEq, Eq)]
|
||||
pub enum PatchCriteria {
|
||||
WordIsTaggedWith {
|
||||
/// Which token to inspect.
|
||||
relative: isize,
|
||||
is_tagged: UPOS,
|
||||
},
|
||||
AnyWordIsTaggedWith {
|
||||
/// The farthest relative index to look
|
||||
max_relative: isize,
|
||||
is_tagged: UPOS,
|
||||
},
|
||||
SandwichTaggedWith {
|
||||
prev_word_tagged: UPOS,
|
||||
post_word_tagged: UPOS,
|
||||
},
|
||||
WordIs {
|
||||
relative: isize,
|
||||
word: String,
|
||||
},
|
||||
/// Not applicable to the Brill Tagger, only the chunker
|
||||
NounPhraseAt {
|
||||
is_np: bool,
|
||||
relative: isize,
|
||||
},
|
||||
Combined {
|
||||
a: Box<PatchCriteria>,
|
||||
b: Box<PatchCriteria>,
|
||||
},
|
||||
}
|
||||
|
||||
impl PatchCriteria {
|
||||
pub fn fulfils(
|
||||
&self,
|
||||
tokens: &[String],
|
||||
tags: &[Option<UPOS>],
|
||||
np_flags: &[bool],
|
||||
index: usize,
|
||||
) -> bool {
|
||||
match self {
|
||||
PatchCriteria::WordIsTaggedWith {
|
||||
relative,
|
||||
is_tagged,
|
||||
} => {
|
||||
let Some(index) = add(index, *relative) else {
|
||||
return false;
|
||||
};
|
||||
|
||||
tags.get(index)
|
||||
.copied()
|
||||
.flatten()
|
||||
.is_some_and(|t| t == *is_tagged)
|
||||
}
|
||||
PatchCriteria::AnyWordIsTaggedWith {
|
||||
max_relative: relative,
|
||||
is_tagged,
|
||||
} => {
|
||||
let Some(farthest_index) = add(index, *relative) else {
|
||||
return false;
|
||||
};
|
||||
|
||||
(farthest_index.min(index)..farthest_index.max(index)).any(|i| {
|
||||
tags.get(i)
|
||||
.copied()
|
||||
.flatten()
|
||||
.is_some_and(|t| t == *is_tagged)
|
||||
})
|
||||
}
|
||||
PatchCriteria::SandwichTaggedWith {
|
||||
prev_word_tagged,
|
||||
post_word_tagged,
|
||||
} => {
|
||||
if index == 0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
let prev_i = index - 1;
|
||||
let post_i = index + 1;
|
||||
|
||||
tags.get(prev_i)
|
||||
.copied()
|
||||
.flatten()
|
||||
.is_some_and(|t| t == *prev_word_tagged)
|
||||
&& tags
|
||||
.get(post_i)
|
||||
.copied()
|
||||
.flatten()
|
||||
.is_some_and(|t| t == *post_word_tagged)
|
||||
}
|
||||
Self::WordIs { relative, word } => {
|
||||
let Some(index) = add(index, *relative) else {
|
||||
return false;
|
||||
};
|
||||
|
||||
tokens.get(index).is_some_and(|w| {
|
||||
w.chars()
|
||||
.zip(word.chars())
|
||||
.all(|(a, b)| a.eq_ignore_ascii_case(&b))
|
||||
})
|
||||
}
|
||||
|
||||
Self::NounPhraseAt { is_np, relative } => {
|
||||
let Some(index) = add(index, *relative) else {
|
||||
return false;
|
||||
};
|
||||
|
||||
np_flags.get(index).is_some_and(|f| *is_np == *f)
|
||||
}
|
||||
Self::Combined { a, b } => {
|
||||
a.fulfils(tokens, tags, np_flags, index) && b.fulfils(tokens, tags, np_flags, index)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn add(u: usize, i: isize) -> Option<usize> {
|
||||
if i.is_negative() {
|
||||
u.checked_sub(i.wrapping_abs() as u32 as usize)
|
||||
} else {
|
||||
u.checked_add(i as usize)
|
||||
}
|
||||
}
|
281
harper-pos-utils/src/tagger/brill_tagger/mod.rs
Normal file
281
harper-pos-utils/src/tagger/brill_tagger/mod.rs
Normal file
|
@ -0,0 +1,281 @@
|
|||
mod patch;
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
use std::path::Path;
|
||||
|
||||
use patch::Patch;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
use super::FreqDict;
|
||||
#[cfg(feature = "training")]
|
||||
use super::error_counter::{ErrorCounter, ErrorKind};
|
||||
|
||||
use crate::{Tagger, UPOS};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct BrillTagger<B>
|
||||
where
|
||||
B: Tagger,
|
||||
{
|
||||
base: B,
|
||||
patches: Vec<Patch>,
|
||||
}
|
||||
|
||||
impl<B> BrillTagger<B>
|
||||
where
|
||||
B: Tagger,
|
||||
{
|
||||
pub fn new(base: B) -> Self {
|
||||
Self {
|
||||
base,
|
||||
patches: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn apply_patches(&self, sentence: &[String], tags: &mut [Option<UPOS>]) {
|
||||
for patch in &self.patches {
|
||||
for i in 0..sentence.len() {
|
||||
let Some(i_tag) = tags.get(i).copied().flatten() else {
|
||||
continue;
|
||||
};
|
||||
|
||||
if patch.from == i_tag && patch.criteria.fulfils(sentence, tags, &[], i) {
|
||||
tags[i] = Some(patch.to);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<B> Tagger for BrillTagger<B>
|
||||
where
|
||||
B: Tagger,
|
||||
{
|
||||
/// Tag a sentence using the provided frequency dictionary and current patch set.
|
||||
/// If the tagger is unable to determine a POS, it returns [`None`] in that position.
|
||||
fn tag_sentence(&self, sentence: &[String]) -> Vec<Option<UPOS>> {
|
||||
let mut tags = self.base.tag_sentence(sentence);
|
||||
self.apply_patches(sentence, &mut tags);
|
||||
|
||||
tags
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "training")]
|
||||
impl BrillTagger<FreqDict> {
|
||||
/// Tag a provided sentence with patches, providing the "correct" tags (from a dataset or
|
||||
/// other source), returning the number of errors.
|
||||
pub fn locate_patch_errors(
|
||||
&self,
|
||||
sentence: &[String],
|
||||
correct_tags: &[Option<UPOS>],
|
||||
base_tags: &[Option<UPOS>],
|
||||
errors: &mut ErrorCounter,
|
||||
) {
|
||||
let mut base_tags = base_tags.to_vec();
|
||||
self.apply_patches(sentence, &mut base_tags);
|
||||
|
||||
for ((tag, correct_tag), word) in base_tags.iter().zip(correct_tags.iter()).zip(sentence) {
|
||||
if let Some(tag) = tag {
|
||||
if let Some(correct_tag) = correct_tag {
|
||||
if tag != correct_tag {
|
||||
errors.inc(
|
||||
ErrorKind {
|
||||
was_tagged: *tag,
|
||||
correct_tag: *correct_tag,
|
||||
},
|
||||
word.as_str(),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tag a provided sentence with the tagger, providing the "correct" tags (from a dataset or
|
||||
/// other source), returning the number of errors.
|
||||
pub fn locate_tag_errors(
|
||||
&self,
|
||||
sentence: &[String],
|
||||
correct_tags: &[Option<UPOS>],
|
||||
) -> ErrorCounter {
|
||||
let tags = self.tag_sentence(sentence);
|
||||
|
||||
let mut errors = ErrorCounter::new();
|
||||
|
||||
for ((tag, correct_tag), word) in tags.iter().zip(correct_tags.iter()).zip(sentence) {
|
||||
if let Some(tag) = tag {
|
||||
if let Some(correct_tag) = correct_tag {
|
||||
if tag != correct_tag {
|
||||
errors.inc(
|
||||
ErrorKind {
|
||||
was_tagged: *tag,
|
||||
correct_tag: *correct_tag,
|
||||
},
|
||||
word.as_str(),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
errors
|
||||
}
|
||||
|
||||
/// To speed up training, only try a subset of all possible candidates.
|
||||
/// How many to select is given by the `candidate_selection_chance`. A higher chance means a
|
||||
/// longer training time.
|
||||
fn epoch(&mut self, training_files: &[impl AsRef<Path>], candidate_selection_chance: f32) {
|
||||
use crate::conllu_utils::iter_sentences_in_conllu;
|
||||
use rs_conllu::Sentence;
|
||||
use std::time::Instant;
|
||||
|
||||
assert!((0.0..=1.0).contains(&candidate_selection_chance));
|
||||
|
||||
let mut total_tokens = 0;
|
||||
let mut error_counter = ErrorCounter::new();
|
||||
|
||||
let sentences: Vec<Sentence> = training_files
|
||||
.iter()
|
||||
.flat_map(iter_sentences_in_conllu)
|
||||
.collect();
|
||||
let mut sentences_tagged: Vec<(Vec<String>, Vec<Option<UPOS>>)> = Vec::new();
|
||||
|
||||
for sent in &sentences {
|
||||
let mut toks: Vec<String> = Vec::new();
|
||||
let mut tags = Vec::new();
|
||||
|
||||
for token in &sent.tokens {
|
||||
let form = token.form.clone();
|
||||
if let Some(last) = toks.last_mut() {
|
||||
match form.as_str() {
|
||||
"sn't" | "n't" | "'ll" | "'ve" | "'re" | "'d" | "'m" | "'s" => {
|
||||
last.push_str(&form);
|
||||
continue;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
toks.push(form);
|
||||
tags.push(token.upos.and_then(UPOS::from_conllu));
|
||||
}
|
||||
|
||||
sentences_tagged.push((toks, tags));
|
||||
}
|
||||
|
||||
for (tok_buf, tag_buf) in &sentences_tagged {
|
||||
total_tokens += tok_buf.len();
|
||||
error_counter
|
||||
.merge_from(self.locate_tag_errors(tok_buf.as_slice(), tag_buf.as_slice()));
|
||||
}
|
||||
|
||||
println!("=============");
|
||||
println!("Total tokens in training set: {}", total_tokens);
|
||||
println!(
|
||||
"Tokens incorrectly tagged: {}",
|
||||
error_counter.total_errors()
|
||||
);
|
||||
println!(
|
||||
"Error rate: {}%",
|
||||
error_counter.total_errors() as f32 / total_tokens as f32 * 100.
|
||||
);
|
||||
|
||||
// Before adding any patches, let's get a good base.
|
||||
let mut base_tags = Vec::new();
|
||||
for (toks, _) in &sentences_tagged {
|
||||
base_tags.push(self.tag_sentence(toks));
|
||||
}
|
||||
|
||||
let all_candidates = Patch::generate_candidate_patches(&error_counter);
|
||||
let mut pruned_candidates: Vec<Patch> = rand::seq::IndexedRandom::choose_multiple(
|
||||
all_candidates.as_slice(),
|
||||
&mut rand::rng(),
|
||||
(all_candidates.len() as f32 * candidate_selection_chance) as usize,
|
||||
)
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
#[cfg(feature = "threaded")]
|
||||
rayon::slice::ParallelSliceMut::par_sort_by_cached_key(
|
||||
pruned_candidates.as_mut_slice(),
|
||||
|candidate: &Patch| {
|
||||
self.score_candidate(candidate.clone(), &sentences_tagged, &base_tags)
|
||||
},
|
||||
);
|
||||
|
||||
#[cfg(not(feature = "threaded"))]
|
||||
pruned_candidates.sort_by_cached_key(|candidate| {
|
||||
self.score_candidate(candidate.clone(), &sentences_tagged, &base_tags)
|
||||
});
|
||||
|
||||
let duration = start.elapsed();
|
||||
let seconds = duration.as_secs();
|
||||
let millis = duration.subsec_millis();
|
||||
|
||||
println!(
|
||||
"It took {} seconds and {} milliseconds to search through {} candidates at {} c/sec.",
|
||||
seconds,
|
||||
millis,
|
||||
pruned_candidates.len(),
|
||||
pruned_candidates.len() as f32 / seconds as f32
|
||||
);
|
||||
|
||||
if let Some(best) = pruned_candidates.first() {
|
||||
self.patches.push(best.clone());
|
||||
}
|
||||
}
|
||||
|
||||
/// Lower is better
|
||||
fn score_candidate(
|
||||
&self,
|
||||
candidate: Patch,
|
||||
sentences_tagged: &[(Vec<String>, Vec<Option<UPOS>>)],
|
||||
base_tags: &[Vec<Option<UPOS>>],
|
||||
) -> usize {
|
||||
let mut tagger = BrillTagger::new(FreqDict::default());
|
||||
tagger.patches.push(candidate);
|
||||
|
||||
let mut candidate_errors = ErrorCounter::new();
|
||||
|
||||
for ((toks, tags), base) in sentences_tagged.iter().zip(base_tags.iter()) {
|
||||
tagger.locate_patch_errors(
|
||||
toks.as_slice(),
|
||||
tags.as_slice(),
|
||||
base,
|
||||
&mut candidate_errors,
|
||||
);
|
||||
}
|
||||
|
||||
candidate_errors.total_errors()
|
||||
}
|
||||
|
||||
/// Train a brand-new tagger on a `.conllu` dataset, provided via a path.
|
||||
/// This does not do _any_ error handling, and should not run in production.
|
||||
/// It should be used for training a model that _will_ be used in production.
|
||||
pub fn train(
|
||||
training_files: &[impl AsRef<Path>],
|
||||
epochs: usize,
|
||||
candidate_selection_chance: f32,
|
||||
) -> Self {
|
||||
use crate::FreqDictBuilder;
|
||||
|
||||
let mut freq_dict_builder = FreqDictBuilder::new();
|
||||
|
||||
for file in training_files {
|
||||
freq_dict_builder.inc_from_conllu_file(file);
|
||||
}
|
||||
|
||||
let freq_dict = freq_dict_builder.build();
|
||||
|
||||
let mut tagger = Self::new(freq_dict);
|
||||
|
||||
for _ in 0..epochs {
|
||||
tagger.epoch(training_files, candidate_selection_chance);
|
||||
}
|
||||
|
||||
tagger
|
||||
}
|
||||
}
|
92
harper-pos-utils/src/tagger/brill_tagger/patch.rs
Normal file
92
harper-pos-utils/src/tagger/brill_tagger/patch.rs
Normal file
|
@ -0,0 +1,92 @@
|
|||
#[cfg(feature = "training")]
|
||||
use crate::tagger::error_counter::ErrorCounter;
|
||||
use crate::{UPOS, patch_criteria::PatchCriteria};
|
||||
#[cfg(feature = "training")]
|
||||
use hashbrown::HashSet;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// A single transformation rule learned by the Brill tagger: when `criteria`
/// holds at a position currently tagged `from`, retag that position as `to`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Patch {
    /// The (incorrect) tag the rule fires on.
    pub from: UPOS,
    /// The replacement tag to apply.
    pub to: UPOS,
    /// The contextual condition that must hold for the rule to fire.
    pub criteria: PatchCriteria,
}
|
||||
|
||||
#[cfg(feature = "training")]
impl Patch {
    /// Given a list of tagging errors, generate a collection of candidate patches that _might_ fix
    /// them. Training involves determining which candidates actually work.
    pub fn generate_candidate_patches(error_counter: &ErrorCounter) -> Vec<Patch> {
        let mut candidates = Vec::new();

        // Both of these are invariant across error kinds, so compute them once
        // up front instead of regenerating them inside the loops below.
        // (`iter_top_n_words` sorts the whole word map on every call.)
        let simple = Self::gen_simple_candidates();
        let top_words: Vec<String> = error_counter
            .word_counts
            .iter_top_n_words(10)
            .cloned()
            .collect();

        for key in error_counter.error_counts.keys() {
            // Plain contextual criteria, one patch per simple candidate.
            candidates.extend(simple.iter().cloned().map(|c| Patch {
                from: key.was_tagged,
                to: key.correct_tag,
                criteria: c,
            }));

            // Each simple criterion combined with "a frequently-erroneous word
            // appears at a nearby relative position".
            for c in &simple {
                for word in &top_words {
                    // NOTE(review): this range is asymmetric (-3..3 includes -3
                    // but excludes 3); preserved as-is — confirm intent.
                    for r in -3..3 {
                        candidates.push(Patch {
                            from: key.was_tagged,
                            to: key.correct_tag,
                            criteria: PatchCriteria::Combined {
                                a: Box::new(PatchCriteria::WordIs {
                                    relative: r,
                                    word: word.to_string(),
                                }),
                                b: Box::new(c.clone()),
                            },
                        })
                    }
                }
            }
        }

        candidates
    }

    /// Candidates to be tested against a dataset during training.
    ///
    /// A `HashSet` is used while generating so that criteria produced more
    /// than once are deduplicated before being returned.
    fn gen_simple_candidates() -> Vec<PatchCriteria> {
        use strum::IntoEnumIterator;

        let mut criteria = HashSet::new();
        for upos in UPOS::iter() {
            // A specific tag at a fixed relative offset.
            for i in -4..=4 {
                criteria.insert(PatchCriteria::WordIsTaggedWith {
                    relative: i,
                    is_tagged: upos,
                });
            }

            // A specific tag anywhere up to a relative offset.
            for i in -4..=4 {
                criteria.insert(PatchCriteria::AnyWordIsTaggedWith {
                    max_relative: i,
                    is_tagged: upos,
                });
            }

            for upos_b in UPOS::iter() {
                // The word sits between two specific tags.
                criteria.insert(PatchCriteria::SandwichTaggedWith {
                    prev_word_tagged: upos,
                    post_word_tagged: upos_b,
                });

                // One specific tag just after the word, another two positions before it.
                criteria.insert(PatchCriteria::Combined {
                    a: Box::new(PatchCriteria::WordIsTaggedWith {
                        relative: 1,
                        is_tagged: upos,
                    }),
                    b: Box::new(PatchCriteria::WordIsTaggedWith {
                        relative: -2,
                        is_tagged: upos_b,
                    }),
                });
            }
        }

        criteria.into_iter().collect()
    }
}
|
52
harper-pos-utils/src/tagger/error_counter.rs
Normal file
52
harper-pos-utils/src/tagger/error_counter.rs
Normal file
|
@ -0,0 +1,52 @@
|
|||
use hashbrown::HashMap;
|
||||
|
||||
use crate::{UPOS, word_counter::WordCounter};
|
||||
|
||||
/// A kind of tagging mistake: the tag that was actually assigned, paired with
/// the tag that should have been assigned.
#[derive(Debug, Default, Clone, Hash, PartialEq, Eq)]
pub struct ErrorKind {
    /// The tag the tagger produced.
    pub was_tagged: UPOS,
    /// The tag the gold annotation says is correct.
    pub correct_tag: UPOS,
}
|
||||
|
||||
/// Aggregates tagging errors observed during training: how many times each
/// (wrong tag, correct tag) pair occurred, and which words were involved.
#[derive(Debug, Default)]
pub struct ErrorCounter {
    /// How many times each kind of error was observed.
    pub error_counts: HashMap<ErrorKind, usize>,
    /// The number of times a word is associated with an error.
    pub word_counts: WordCounter,
}
|
||||
|
||||
impl ErrorCounter {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Increment the count for a particular lint kind.
|
||||
pub fn inc(&mut self, kind: ErrorKind, word: &str) {
|
||||
self.error_counts
|
||||
.entry(kind)
|
||||
.and_modify(|counter| *counter += 1)
|
||||
.or_insert(1);
|
||||
self.word_counts.inc(word)
|
||||
}
|
||||
|
||||
pub fn merge_from(&mut self, other: Self) {
|
||||
for (key, value) in other.error_counts {
|
||||
self.error_counts
|
||||
.entry(key)
|
||||
.and_modify(|counter| *counter += value)
|
||||
.or_insert(value);
|
||||
}
|
||||
|
||||
for (key, value) in other.word_counts.word_counts {
|
||||
self.word_counts
|
||||
.word_counts
|
||||
.entry(key)
|
||||
.and_modify(|counter| *counter += value)
|
||||
.or_insert(value);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn total_errors(&self) -> usize {
|
||||
self.error_counts.values().sum()
|
||||
}
|
||||
}
|
32
harper-pos-utils/src/tagger/freq_dict.rs
Normal file
32
harper-pos-utils/src/tagger/freq_dict.rs
Normal file
|
@ -0,0 +1,32 @@
|
|||
use hashbrown::HashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::Tagger;
|
||||
use crate::upos::UPOS;
|
||||
|
||||
/// A mapping between words (normalized to lowercase) and their most common UPOS tag.
/// Can be used as a minimally accurate [`Tagger`].
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
pub struct FreqDict {
    /// Lowercased word → the tag it most frequently carried in training data.
    pub mapping: HashMap<String, UPOS>,
}
|
||||
|
||||
impl FreqDict {
|
||||
pub fn get(&self, word: &str) -> Option<UPOS> {
|
||||
let word_lower = word.to_lowercase();
|
||||
self.mapping.get(word_lower.as_str()).copied()
|
||||
}
|
||||
}
|
||||
|
||||
impl Tagger for FreqDict {
|
||||
fn tag_sentence(&self, sentence: &[String]) -> Vec<Option<UPOS>> {
|
||||
let mut tags = Vec::new();
|
||||
|
||||
for word in sentence {
|
||||
let tag = self.get(word);
|
||||
tags.push(tag);
|
||||
}
|
||||
|
||||
tags
|
||||
}
|
||||
}
|
99
harper-pos-utils/src/tagger/freq_dict_builder.rs
Normal file
99
harper-pos-utils/src/tagger/freq_dict_builder.rs
Normal file
|
@ -0,0 +1,99 @@
|
|||
#[cfg(feature = "training")]
|
||||
use std::path::Path;
|
||||
|
||||
use hashbrown::{Equivalent, HashMap};
|
||||
use strum::IntoEnumIterator;
|
||||
|
||||
use crate::{UPOS, tagger::FreqDict};
|
||||
|
||||
/// A mapping between words and the frequency of each UPOS.
/// If an element is missing from the map, its count is assumed to be zero.
#[derive(Debug, Default)]
pub struct FreqDictBuilder {
    // Keyed by (lowercased word, POS); the value is how often the pair was seen.
    mapping: HashMap<FreqDictBuilderKey, usize>,
}
|
||||
|
||||
impl FreqDictBuilder {
|
||||
pub fn new() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
|
||||
pub fn inc(&mut self, word: &str, tag: &UPOS) {
|
||||
let word_lower = word.to_lowercase();
|
||||
let counter = self.mapping.get_mut(&(word_lower.as_str(), tag));
|
||||
|
||||
if let Some(counter) = counter {
|
||||
*counter += 1;
|
||||
} else {
|
||||
self.mapping.insert(
|
||||
FreqDictBuilderKey {
|
||||
word: word_lower.to_string(),
|
||||
pos: *tag,
|
||||
},
|
||||
1,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Inefficient, but effective method that gets the most used POS for a word in the map.
|
||||
// Returns none if the word does not exist in the map.
|
||||
fn most_freq_pos(&self, word: &str) -> Option<UPOS> {
|
||||
let word_lower = word.to_lowercase();
|
||||
let mut max_found: Option<(UPOS, usize)> = None;
|
||||
|
||||
for pos in UPOS::iter() {
|
||||
if let Some(count) = self.mapping.get(&(word_lower.as_str(), &pos)) {
|
||||
if let Some((_, max_count)) = max_found {
|
||||
if *count > max_count {
|
||||
max_found = Some((pos, *count))
|
||||
}
|
||||
} else {
|
||||
max_found = Some((pos, *count))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
max_found.map(|v| v.0)
|
||||
}
|
||||
|
||||
/// Parse a `.conllu` file and use it to train a frequency dictionary.
|
||||
/// For error-handling purposes, this function should not be made accessible outside of training.
|
||||
#[cfg(feature = "training")]
|
||||
pub fn inc_from_conllu_file(&mut self, path: impl AsRef<Path>) {
|
||||
use crate::conllu_utils::iter_sentences_in_conllu;
|
||||
|
||||
for sent in iter_sentences_in_conllu(path) {
|
||||
for token in sent.tokens {
|
||||
if let Some(upos) = token.upos.and_then(UPOS::from_conllu) {
|
||||
self.inc(&token.form, &upos)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build(self) -> FreqDict {
|
||||
let mut output = HashMap::new();
|
||||
|
||||
for key in self.mapping.keys() {
|
||||
if output.contains_key(&key.word) {
|
||||
continue;
|
||||
}
|
||||
|
||||
output.insert(key.word.to_string(), self.most_freq_pos(&key.word).unwrap());
|
||||
}
|
||||
|
||||
FreqDict { mapping: output }
|
||||
}
|
||||
}
|
||||
|
||||
// Owned key for `FreqDictBuilder::mapping`: a lowercased word plus a POS tag.
#[derive(Debug, Eq, PartialEq, Hash)]
struct FreqDictBuilderKey {
    word: String,
    pos: UPOS,
}
|
||||
|
||||
impl Equivalent<FreqDictBuilderKey> for (&str, &UPOS) {
    /// Lets borrowed `(word, pos)` pairs be used for map lookups without
    /// allocating an owned `FreqDictBuilderKey`.
    fn equivalent(&self, key: &FreqDictBuilderKey) -> bool {
        let (word, pos) = *self;
        word == key.word && key.pos == *pos
    }
}
|
16
harper-pos-utils/src/tagger/mod.rs
Normal file
16
harper-pos-utils/src/tagger/mod.rs
Normal file
|
@ -0,0 +1,16 @@
|
|||
mod brill_tagger;
|
||||
#[cfg(feature = "training")]
|
||||
mod error_counter;
|
||||
mod freq_dict;
|
||||
mod freq_dict_builder;
|
||||
|
||||
use crate::UPOS;
|
||||
|
||||
pub use brill_tagger::BrillTagger;
|
||||
pub use freq_dict::FreqDict;
|
||||
pub use freq_dict_builder::FreqDictBuilder;
|
||||
|
||||
/// An implementer of this trait is capable of assigning Part-of-Speech tags to a provided sentence.
pub trait Tagger {
    /// Tag each word in `sentence`, returning one entry per input word.
    /// `None` marks a word for which no tag could be determined.
    fn tag_sentence(&self, sentence: &[String]) -> Vec<Option<UPOS>>;
}
|
68
harper-pos-utils/src/upos.rs
Normal file
68
harper-pos-utils/src/upos.rs
Normal file
|
@ -0,0 +1,68 @@
|
|||
use is_macro::Is;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use strum_macros::{AsRefStr, EnumIter};
|
||||
|
||||
/// Represents the universal parts of speech as outlined by [universaldependencies.org](https://universaldependencies.org/u/pos/index.html).
#[derive(
    Debug,
    Default,
    Hash,
    Eq,
    PartialEq,
    Clone,
    Copy,
    EnumIter,
    AsRefStr,
    Serialize,
    Deserialize,
    PartialOrd,
    Ord,
    Is,
)]
pub enum UPOS {
    /// Adjective
    ADJ,
    /// Adposition
    ADP,
    /// Adverb
    ADV,
    /// Auxiliary
    AUX,
    /// Coordinating conjunction
    CCONJ,
    /// Determiner
    DET,
    /// Interjection
    INTJ,
    /// Noun (the enum's `Default` variant)
    #[default]
    NOUN,
    /// Numeral
    NUM,
    /// Particle
    PART,
    /// Pronoun
    PRON,
    /// Proper noun
    PROPN,
    /// Punctuation
    PUNCT,
    /// Subordinating conjunction
    SCONJ,
    /// Symbol
    SYM,
    /// Verb
    VERB,
}
|
||||
|
||||
impl UPOS {
    /// Convert from the `rs_conllu` representation of a UPOS tag.
    /// Returns `None` for `rs_conllu::UPOS::X` ("other"), which this enum does not model.
    pub fn from_conllu(other: rs_conllu::UPOS) -> Option<Self> {
        Some(match other {
            rs_conllu::UPOS::ADJ => UPOS::ADJ,
            rs_conllu::UPOS::ADP => UPOS::ADP,
            rs_conllu::UPOS::ADV => UPOS::ADV,
            rs_conllu::UPOS::AUX => UPOS::AUX,
            rs_conllu::UPOS::CCONJ => UPOS::CCONJ,
            rs_conllu::UPOS::DET => UPOS::DET,
            rs_conllu::UPOS::INTJ => UPOS::INTJ,
            rs_conllu::UPOS::NOUN => UPOS::NOUN,
            rs_conllu::UPOS::NUM => UPOS::NUM,
            rs_conllu::UPOS::PART => UPOS::PART,
            rs_conllu::UPOS::PRON => UPOS::PRON,
            rs_conllu::UPOS::PROPN => UPOS::PROPN,
            rs_conllu::UPOS::PUNCT => UPOS::PUNCT,
            rs_conllu::UPOS::SCONJ => UPOS::SCONJ,
            rs_conllu::UPOS::SYM => UPOS::SYM,
            rs_conllu::UPOS::VERB => UPOS::VERB,
            rs_conllu::UPOS::X => return None,
        })
    }

    /// Whether the tag denotes a nominal: a noun or a proper noun.
    pub fn is_nominal(&self) -> bool {
        matches!(self, Self::NOUN | Self::PROPN)
    }
}
|
28
harper-pos-utils/src/word_counter.rs
Normal file
28
harper-pos-utils/src/word_counter.rs
Normal file
|
@ -0,0 +1,28 @@
|
|||
use hashbrown::HashMap;
|
||||
|
||||
/// Counts occurrences of individual words; used here to track which words are
/// most often involved in tagging errors.
#[derive(Debug, Default)]
pub struct WordCounter {
    /// The number of times a word is associated with an error.
    pub word_counts: HashMap<String, usize>,
}
|
||||
|
||||
impl WordCounter {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Increment the count for a particular word.
|
||||
pub fn inc(&mut self, word: &str) {
|
||||
self.word_counts
|
||||
.entry_ref(word)
|
||||
.and_modify(|counter| *counter += 1)
|
||||
.or_insert(1);
|
||||
}
|
||||
|
||||
/// Get an iterator over the most frequent words associated with errors.
|
||||
pub fn iter_top_n_words(&self, n: usize) -> impl Iterator<Item = &String> {
|
||||
let mut counts: Vec<(&String, &usize)> = self.word_counts.iter().collect();
|
||||
counts.sort_unstable_by(|a, b| b.1.cmp(a.1));
|
||||
counts.into_iter().take(n).map(|(a, _b)| a)
|
||||
}
|
||||
}
|
|
@ -1,368 +0,0 @@
|
|||
{
|
||||
"ignoredLints": "{\"context_hashes\":[11327540533206285101]}",
|
||||
"useWebWorker": true,
|
||||
"lintSettings": {
|
||||
"ACoupleMore": null,
|
||||
"ALongTime": null,
|
||||
"ALotWorst": null,
|
||||
"APart": null,
|
||||
"AWholeEntire": null,
|
||||
"AdjectiveOfA": null,
|
||||
"AfterAWhile": null,
|
||||
"AlzheimersDisease": null,
|
||||
"AmazonNames": null,
|
||||
"Americas": null,
|
||||
"AmountsFor": null,
|
||||
"AnA": null,
|
||||
"AnAnother": null,
|
||||
"AndIn": null,
|
||||
"AndTheLike": null,
|
||||
"AnotherAn": null,
|
||||
"AnotherOnes": null,
|
||||
"AnotherThings": null,
|
||||
"Anybody": null,
|
||||
"Anyhow": null,
|
||||
"Anyone": null,
|
||||
"Anywhere": null,
|
||||
"AppleNames": null,
|
||||
"AsFarBackAs": null,
|
||||
"AsOfLate": null,
|
||||
"AsWell": null,
|
||||
"AskNoPreposition": null,
|
||||
"AtFaceValue": null,
|
||||
"Australia": null,
|
||||
"AvoidAndAlso": null,
|
||||
"AvoidCurses": null,
|
||||
"AzureNames": null,
|
||||
"BackInTheDay": null,
|
||||
"Backplane": null,
|
||||
"BadRap": null,
|
||||
"BaitedBreath": null,
|
||||
"BanTogether": null,
|
||||
"BareInMind": null,
|
||||
"BatedBreath": null,
|
||||
"BeckAndCall": null,
|
||||
"BeenThere": null,
|
||||
"BestRegards": null,
|
||||
"BlanketStatement": null,
|
||||
"BoringWords": null,
|
||||
"Brutality": null,
|
||||
"ByAccident": null,
|
||||
"CanBeSeen": null,
|
||||
"Canada": null,
|
||||
"CapitalizePersonalPronouns": null,
|
||||
"CaseInPoint": null,
|
||||
"CaseSensitive": null,
|
||||
"ChangeOfTack": null,
|
||||
"ChangeTack": null,
|
||||
"ChangedTack": null,
|
||||
"ChangesOfTack": null,
|
||||
"ChangesTack": null,
|
||||
"ChangingOfTack": null,
|
||||
"ChangingTack": null,
|
||||
"ChineseCommunistParty": null,
|
||||
"ChockFull": null,
|
||||
"ClientSide": null,
|
||||
"CommaFixes": null,
|
||||
"CompaniesProductsAndTrademarks": null,
|
||||
"CompoundNouns": null,
|
||||
"CondenseAllThe": null,
|
||||
"Confident": null,
|
||||
"CorrectNumberSuffix": null,
|
||||
"Countries": null,
|
||||
"CoursingThroughVeins": null,
|
||||
"CurrencyPlacement": null,
|
||||
"DampSquib": null,
|
||||
"Dashes": null,
|
||||
"DayAndAge": null,
|
||||
"DayOneNames": null,
|
||||
"DefiniteArticle": null,
|
||||
"DefiniteArticles": null,
|
||||
"Desktop": null,
|
||||
"DespiteOf": null,
|
||||
"Devops": null,
|
||||
"Discuss": null,
|
||||
"Discussed": null,
|
||||
"Discusses": null,
|
||||
"Discussing": null,
|
||||
"DoNotWant": null,
|
||||
"DotInitialisms": null,
|
||||
"EachAndEveryOne": null,
|
||||
"EllipsisLength": null,
|
||||
"ElsePossessive": null,
|
||||
"EludedTo": null,
|
||||
"EnMasse": null,
|
||||
"EverPresent": null,
|
||||
"Everybody": null,
|
||||
"Everyday": null,
|
||||
"Everyone": null,
|
||||
"Everywhere": null,
|
||||
"Excellent": null,
|
||||
"ExpandBecause": null,
|
||||
"ExpandDependencies": null,
|
||||
"ExpandDependency": null,
|
||||
"ExpandMinimum": null,
|
||||
"ExpandStandardInput": null,
|
||||
"ExpandStandardOutput": null,
|
||||
"ExpandTimeShorthands": null,
|
||||
"ExpandWith": null,
|
||||
"ExpandWithout": null,
|
||||
"Expatriate": null,
|
||||
"ExplanationMark": null,
|
||||
"ExplanationMarks": null,
|
||||
"ExplanationPoint": null,
|
||||
"FaceFirst": null,
|
||||
"FairBit": null,
|
||||
"FarWorse": null,
|
||||
"FastPaste": null,
|
||||
"FatalOutcome": null,
|
||||
"FetalPosition": null,
|
||||
"FirstAidKit": null,
|
||||
"ForALongTime": null,
|
||||
"ForAWhile": null,
|
||||
"ForAllIntentsAndPurposes": null,
|
||||
"ForNoun": null,
|
||||
"FreeRein": null,
|
||||
"Freezing": null,
|
||||
"FurtherAdo": null,
|
||||
"Furthermore": null,
|
||||
"GetRidOff": null,
|
||||
"GetsRidOff": null,
|
||||
"GettingRidOff": null,
|
||||
"GildedAge": null,
|
||||
"GoingTo": null,
|
||||
"GoogleNames": null,
|
||||
"GotRidOff": null,
|
||||
"GottenRidOff": null,
|
||||
"GuineaBissau": null,
|
||||
"HadGone": null,
|
||||
"HadOf": null,
|
||||
"HadPassed": null,
|
||||
"HalfAnHour": null,
|
||||
"Haphazard": null,
|
||||
"HasGone": null,
|
||||
"HasPassed": null,
|
||||
"HaveGone": null,
|
||||
"HavePassed": null,
|
||||
"HavingGone": null,
|
||||
"HavingPassed": null,
|
||||
"Hedging": null,
|
||||
"Henceforth": null,
|
||||
"Hereby": null,
|
||||
"Holidays": null,
|
||||
"HomeInOn": null,
|
||||
"HomedInOn": null,
|
||||
"HomesInOn": null,
|
||||
"HomingInOn": null,
|
||||
"HopHope": null,
|
||||
"HowTo": null,
|
||||
"However": null,
|
||||
"HumanBeings": null,
|
||||
"HumanLife": null,
|
||||
"HungerPang": null,
|
||||
"HyphenateNumberDay": null,
|
||||
"IAm": null,
|
||||
"InAWhile": null,
|
||||
"InAndOfItself": null,
|
||||
"InAnyWay": null,
|
||||
"InCase": null,
|
||||
"InDetail": null,
|
||||
"InMoreDetail": null,
|
||||
"InNeedOf": null,
|
||||
"InOneFellSwoop": null,
|
||||
"InThe": null,
|
||||
"InflectedVerbAfterTo": null,
|
||||
"Insofar": null,
|
||||
"Instead": null,
|
||||
"InsteadOf": null,
|
||||
"Insurmountable": null,
|
||||
"Intact": null,
|
||||
"Into": null,
|
||||
"InvestIn": null,
|
||||
"InvestedIn": null,
|
||||
"InvestingIn": null,
|
||||
"InvestsIn": null,
|
||||
"IsKnownFor": null,
|
||||
"ItCan": null,
|
||||
"ItsContraction": null,
|
||||
"Itself": null,
|
||||
"IveGotTo": null,
|
||||
"JawDropping": null,
|
||||
"JetpackNames": null,
|
||||
"JustDeserts": null,
|
||||
"KindOf": null,
|
||||
"KindRegards": null,
|
||||
"Koreas": null,
|
||||
"Laptop": null,
|
||||
"LastButNotLeast": null,
|
||||
"LastDitch": null,
|
||||
"LeftRightHand": null,
|
||||
"LetAlone": null,
|
||||
"LetsConfusion": null,
|
||||
"LikeThePlague": null,
|
||||
"Likewise": null,
|
||||
"LinkingVerbs": null,
|
||||
"LongSentences": null,
|
||||
"Malaysia": null,
|
||||
"MergeWords": null,
|
||||
"MetaNames": null,
|
||||
"MicrosoftNames": null,
|
||||
"Middleware": null,
|
||||
"Misunderstand": null,
|
||||
"Misunderstood": null,
|
||||
"Misuse": null,
|
||||
"Misused": null,
|
||||
"ModalOf": null,
|
||||
"Monumentous": null,
|
||||
"MostNumber": null,
|
||||
"MuchAdo": null,
|
||||
"MuchWorse": null,
|
||||
"Multicore": null,
|
||||
"Multimedia": null,
|
||||
"MultipleSequentialPronouns": null,
|
||||
"Multithreading": null,
|
||||
"MutePoint": null,
|
||||
"MyHouse": null,
|
||||
"Myself": null,
|
||||
"NailOnTheHead": null,
|
||||
"NationalCapitals": null,
|
||||
"NeedHelp": null,
|
||||
"NerveRacking": null,
|
||||
"NoOxfordComma": null,
|
||||
"Nobody": null,
|
||||
"NominalWants": null,
|
||||
"Nonetheless": null,
|
||||
"NotIn": null,
|
||||
"NotTo": null,
|
||||
"NotablePlaces": null,
|
||||
"Nothing": null,
|
||||
"Notwithstanding": null,
|
||||
"NounInsteadOfVerb": null,
|
||||
"Nowhere": null,
|
||||
"NumberSuffixCapitalization": null,
|
||||
"OceansAndSeas": null,
|
||||
"OfCourse": null,
|
||||
"OffTheCuff": null,
|
||||
"OldWivesTale": null,
|
||||
"OnSecondThought": null,
|
||||
"OnTheSpurOfTheMoment": null,
|
||||
"OnceInAWhile": null,
|
||||
"OneAndTheSame": null,
|
||||
"OpenCompounds": null,
|
||||
"OpenTheLight": null,
|
||||
"OperativeSystem": null,
|
||||
"OperativeSystems": null,
|
||||
"OutOfDate": null,
|
||||
"Overall": null,
|
||||
"Overclocking": null,
|
||||
"Overload": null,
|
||||
"Overnight": null,
|
||||
"OxfordComma": null,
|
||||
"Oxymorons": null,
|
||||
"PeaceOfMind": null,
|
||||
"PhrasalVerbAsCompoundNoun": null,
|
||||
"PiggyBag": null,
|
||||
"PiggyBagged": null,
|
||||
"PiggyBagging": null,
|
||||
"PiqueInterest": null,
|
||||
"PocketCastsNames": null,
|
||||
"PointIsMoot": null,
|
||||
"PointsOfView": null,
|
||||
"PortAuPrince": null,
|
||||
"PortoNovo": null,
|
||||
"PossessiveYour": null,
|
||||
"Postpone": null,
|
||||
"PrayingMantis": null,
|
||||
"PronounContraction": null,
|
||||
"PronounKnew": null,
|
||||
"Proofread": null,
|
||||
"ProperNouns": null,
|
||||
"RapidFire": null,
|
||||
"RealTrouper": null,
|
||||
"Regardless": null,
|
||||
"RepeatedWords": null,
|
||||
"RifeWith": null,
|
||||
"RoadMap": null,
|
||||
"SameAs": null,
|
||||
"SaveToSafe": null,
|
||||
"ScantilyClad": null,
|
||||
"SentenceCapitalization": null,
|
||||
"ServerSide": null,
|
||||
"SimpleGrammatical": null,
|
||||
"SinceDuration": null,
|
||||
"SneakingSuspicion": null,
|
||||
"Somebody": null,
|
||||
"Somehow": null,
|
||||
"Someone": null,
|
||||
"SomewhatSomething": null,
|
||||
"Somewhere": null,
|
||||
"SoonerOrLater": null,
|
||||
"Spaces": null,
|
||||
"SpecialAttention": null,
|
||||
"SpellCheck": null,
|
||||
"SpelledNumbers": null,
|
||||
"SpokeTooSoon": null,
|
||||
"Starving": null,
|
||||
"StateOfTheArt": null,
|
||||
"SufficeItToSay": null,
|
||||
"SupposedTo": null,
|
||||
"TakeItPersonally": null,
|
||||
"TakeItSeriously": null,
|
||||
"ThatChallenged": null,
|
||||
"ThatThis": null,
|
||||
"ThatWhich": null,
|
||||
"TheAnother": null,
|
||||
"TheHowWhy": null,
|
||||
"TheMy": null,
|
||||
"ThenThan": null,
|
||||
"ThereIsAny": null,
|
||||
"Therefore": null,
|
||||
"Thereupon": null,
|
||||
"ThoughtProcess": null,
|
||||
"ThrowRubbish": null,
|
||||
"TickingTimeClock": null,
|
||||
"ToDoHyphen": null,
|
||||
"ToTheMannerBorn": null,
|
||||
"Towards": null,
|
||||
"TrialAndError": null,
|
||||
"TumblrNames": null,
|
||||
"TurnForTheWorse": null,
|
||||
"TurnItOff": null,
|
||||
"USUniversities": null,
|
||||
"UnclosedQuotes": null,
|
||||
"Underclock": null,
|
||||
"UnitedOrganizations": null,
|
||||
"Unless": null,
|
||||
"Upset": null,
|
||||
"Upward": null,
|
||||
"UseGenitive": null,
|
||||
"WantBe": null,
|
||||
"WasAloud": null,
|
||||
"WaveFunction": null,
|
||||
"WellBeing": null,
|
||||
"WellKept": null,
|
||||
"WhatHeLooksLike": null,
|
||||
"WhatItLooksLike": null,
|
||||
"WhatSheLooksLike": null,
|
||||
"WhatTheyLookLike": null,
|
||||
"Whereas": null,
|
||||
"Whereupon": null,
|
||||
"WhetYourAppetite": null,
|
||||
"WholeEntire": null,
|
||||
"WidelyAccepted": null,
|
||||
"Widespread": null,
|
||||
"WillContain": null,
|
||||
"WinPrize": null,
|
||||
"WordPressDotcom": null,
|
||||
"WorldWarII": null,
|
||||
"Worldwide": null,
|
||||
"WorseAndWorse": null,
|
||||
"WorseCaseScenario": null,
|
||||
"WorseThan": null,
|
||||
"WorstCaseScenario": null,
|
||||
"WorstEver": null
|
||||
},
|
||||
"userDictionary": [],
|
||||
"dialect": 0,
|
||||
"delay": -1
|
||||
}
|
8
packages/web/src/routes/docs/contributors/brill/+page.md
Normal file
8
packages/web/src/routes/docs/contributors/brill/+page.md
Normal file
|
@ -0,0 +1,8 @@
|
|||
---
|
||||
title: Brill Tagging
|
||||
---
|
||||
|
||||
Harper uses Brill tagging as a refinement step to a dictionary-based [POS tagging](https://en.wikipedia.org/wiki/Part-of-speech_tagging) approach.
|
||||
This method retains low latency and high throughput without bundling a large, high-entropy language model.
|
||||
|
||||
While documentation on this site is sparse, initial development was accompanied by [a blog post](https://elijahpotter.dev/articles/transformation-based_learning), which can hopefully explain some of the more abstract details of the process.
|
|
@ -189,6 +189,10 @@ export default defineConfig({
|
|||
title: 'Local Statistics',
|
||||
to: '/docs/contributors/local-stats',
|
||||
},
|
||||
{
|
||||
title: 'Brill Tagging',
|
||||
to: '/docs/contributors/brill',
|
||||
},
|
||||
{
|
||||
title: 'FAQ',
|
||||
to: '/docs/contributors/faq',
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue