Irregular verbs (#2285)
Some checks are pending
Binaries / harper-cli - macOS-aarch64 (push) Waiting to run
Binaries / harper-cli - Linux-aarch64-GNU (push) Waiting to run
Binaries / harper-cli - Linux-aarch64-musl (push) Waiting to run
Binaries / harper-cli - macOS-x86_64 (push) Waiting to run
Binaries / harper-cli - Linux-x86_64-GNU (push) Waiting to run
Binaries / harper-cli - Linux-x86_64-musl (push) Waiting to run
Binaries / harper-cli - Windows-x86_64 (push) Waiting to run
Binaries / harper-ls - macOS-aarch64 (push) Waiting to run
Binaries / harper-ls - Linux-aarch64-GNU (push) Waiting to run
Binaries / harper-ls - Linux-aarch64-musl (push) Waiting to run
Binaries / harper-ls - macOS-x86_64 (push) Waiting to run
Binaries / harper-ls - Linux-x86_64-GNU (push) Waiting to run
Binaries / harper-ls - Linux-x86_64-musl (push) Waiting to run
Binaries / harper-ls - Windows-x86_64 (push) Waiting to run
Build Web / build-web (push) Waiting to run
Chrome Plugin / chrome-plugin (push) Waiting to run
Just Checks / just check-js (push) Waiting to run
Just Checks / just check-rust (push) Waiting to run
Just Checks / just test-chrome-plugin (push) Waiting to run
Just Checks / just test-firefox-plugin (push) Waiting to run
Just Checks / just test-harperjs (push) Waiting to run
Just Checks / just test-obsidian (push) Waiting to run
Just Checks / just test-rust (push) Waiting to run
Just Checks / just test-vscode (push) Waiting to run
VS Code Plugin / alpine-arm64 (push) Waiting to run
VS Code Plugin / alpine-x64 (push) Waiting to run
VS Code Plugin / darwin-arm64 (push) Waiting to run
VS Code Plugin / linux-arm64 (push) Waiting to run
VS Code Plugin / darwin-x64 (push) Waiting to run
VS Code Plugin / linux-armhf (push) Waiting to run
VS Code Plugin / linux-x64 (push) Waiting to run
VS Code Plugin / win32-arm64 (push) Waiting to run
VS Code Plugin / win32-x64 (push) Waiting to run
WordPress Plugin / wp-plugin (push) Waiting to run

* chore: start working on irregular plural module

* feat: irregular verb module

* fix: `will_non_lemma.rs` shouldn't be included here

* fix: `just format`

* refactor: in response to PR feedback

* fix: implement @elijah's requested changes
This commit is contained in:
Andrew Dunbar 2025-12-08 16:30:04 +00:00 committed by GitHub
parent 4b8c619bb7
commit 66f3e84357
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 558 additions and 103 deletions

View file

@ -0,0 +1,162 @@
[
"// comments can appear in the line before an entry",
"// or in place of an entry",
["child", "children"],
["foot", "feet"],
["goose", "geese"],
["man", "men"],
["mouse", "mice"],
["ox", "oxen"],
["person", "people"],
["seraph", "seraphim"],
["woman", "women"],
["addendum", "addenda"],
["aircraft", "aircraft"],
["aircraftman", "aircraftmen"],
["aircraftwoman", "aircraftwomen"],
["airman", "airmen"],
["alderman", "aldermen"],
["alga", "algae"],
["alveolus", "alveoli"],
["anchorman", "anchormen"],
["anchorwoman", "anchorwomen"],
["atrium", "atria"],
["axis", "axes"],
["bacillus", "bacilli"],
["bacterium", "bacteria"],
["bandsman", "bandsmen"],
["bargeman", "bargemen"],
["bellman", "bellmen"],
["biceps", "biceps"],
["boatman", "boatmen"],
["bronchus", "bronchi"],
["businesswoman", "businesswomen"],
["cactus", "cacti"],
["cameraperson", "camerapeople"],
["candelabrum", "candelabra"],
["catharsis", "catharses"],
["chairman", "chairmen"],
["chairwoman", "chairwomen"],
["churchwoman", "churchwomen"],
["clansman", "clansmen"],
["clanswoman", "clanswomen"],
["committeeman", "committeemen"],
["committeewoman", "committeewomen"],
["continuum", "continua"],
["corpus", "corpora"],
["craftsman", "craftsmen"],
["craftswoman", "craftswomen"],
["crisis", "crises"],
["cyclops", "cyclopes"],
["datum", "data"],
["diaeresis", "diaereses"],
["diagnosis", "diagnoses"],
["dominatrix", "dominatrices"],
["draughtsman", "draughtsmen"],
["draughtswoman", "draughtswomen"],
["effluvium", "effluvia"],
["emphasis", "emphases"],
["esophagus", "esophagi"],
["extremum", "extrema"],
["fish", "fish"],
["footman", "footmen"],
["formula", "formulae"],
["forum", "fora"],
["freeman", "freemen"],
["frontiersman", "frontiersmen"],
["frontierswoman", "frontierswomen"],
["garbageman", "garbagemen"],
["genesis", "geneses"],
["genie", "genii"],
["genius", "genii"],
["genus", "genera"],
["glissando", "glissandi"],
["graffito", "graffiti"],
["grandchild", "grandchildren"],
["handyman", "handymen"],
["hitman", "hitmen"],
["houseman", "housemen"],
["iceman", "icemen"],
["ilium", "ilia"],
["index", "indices"],
["intermezzo", "intermezzi"],
["journeyman", "journeymen"],
["labium", "labia"],
["lamina", "laminae"],
["laundrywoman", "laundrywomen"],
["laywoman", "laywomen"],
["linesman", "linesmen"],
["lira", "lire"],
["longshoreman", "longshoremen"],
["louse", "lice"],
["madman", "madmen"],
["mailman", "mailmen"],
["memorandum", "memoranda"],
["metathesis", "metatheses"],
["minimum", "minima"],
["mitosis", "mitoses"],
["motorman", "motormen"],
["muscleman", "musclemen"],
["nemesis", "nemeses"],
["nightwatchman", "nightwatchmen"],
["oarsman", "oarsmen"],
["oarswoman", "oarswomen"],
["oasis", "oases"],
["ombudsman", "ombudsmen"],
["optimum", "optima"],
["palazzo", "palazzi"],
["papyrus", "papyri"],
["parenthesis", "parentheses"],
["patina", "patinae"],
["patrolman", "patrolmen"],
["pericardium", "pericardia"],
["periphrasis", "periphrases"],
["pharynx", "pharynges"],
["phenomenon", "phenomena"],
["plainclothesman", "plainclothesmen"],
["pneumococcus", "pneumococci"],
["pressman", "pressmen"],
["prosthesis", "prostheses"],
["quantum", "quanta"],
["radius", "radii"],
["radix", "radices"],
["repairman", "repairmen"],
["salesman", "salesmen"],
["saleswoman", "saleswomen"],
["sandman", "sandmen"],
["schema", "schemata"],
["sheep", "sheep"],
["shoreman", "shoremen"],
["signore", "signori"],
["simulacrum", "simulacra"],
["solarium", "solaria"],
["spokesman", "spokesmen"],
["spokesperson", "spokespeople"],
["spokeswoman", "spokeswomen"],
["statesman", "statesmen"],
["stateswoman", "stateswomen"],
["steersman", "steersmen"],
["stratum", "strata"],
["streptococcus", "streptococci"],
["succubus", "succubi"],
["symbiosis", "symbioses"],
["tarsus", "tarsi"],
["taxon", "taxa"],
["testatrix", "testatrices"],
["testis", "testes"],
["thesis", "theses"],
["thrombosis", "thromboses"],
["tooth", "teeth"],
["townsman", "townsmen"],
["townswoman", "townswomen"],
["tradesman", "tradesmen"],
["tradeswoman", "tradeswomen"],
["uterus", "uteri"],
["vertebra", "vertebrae"],
["vertex", "vertices"],
["vivarium", "vivaria"],
["washerwoman", "washerwomen"],
["woodlouse", "woodlice"],
["workingwoman", "workingwomen"],
["workman", "workmen"]
]

View file

@ -0,0 +1,127 @@
[
"// comments can appear in the line before an entry",
"// or in place of an entry",
["arise", "arose", "arisen"],
["awake", "awoke", "awoken"],
"// be/am/are/is -- was/were -- been",
["become", "became", "become"],
["begin", "began", "begun"],
["bend", "bent", "bent"],
["bet", "bet", "bet"],
["bid", "bade", "bidden"],
["bind", "bound", "bound"],
["bite", "bit", "bitten"],
["bleed", "bled", "bled"],
["blow", "blew", "blown"],
["break", "broke", "broken"],
["breed", "bred", "bred"],
["bring", "brought", "brought"],
["build", "built", "built"],
["burst", "burst", "burst"],
["buy", "bought", "bought"],
["catch", "caught", "caught"],
["choose", "chose", "chosen"],
["come", "came", "come"],
["cost", "cost", "cost"],
["cut", "cut", "cut"],
["dive", "dove", "dove"],
["do", "did", "done"],
["drink", "drank", "drunk"],
["drive", "drove", "driven"],
["eat", "ate", "eaten"],
["fall", "fell", "fallen"],
["feed", "fed", "fed"],
["feel", "felt", "felt"],
["fight", "fought", "fought"],
["find", "found", "found"],
["fly", "flew", "flown"],
["forget", "forgot", "forgotten"],
["forgo", "forwent", "forgone"],
["freeze", "froze", "frozen"],
"// get -- got -- gotten",
["get", "got", "got"],
["give", "gave", "given"],
["go", "went", "gone"],
["grow", "grew", "grown"],
["have", "had", "had"],
["hear", "heard", "heard"],
["hit", "hit", "hit"],
["hold", "held", "held"],
["hurt", "hurt", "hurt"],
["input", "input", "input"],
["keep", "kept", "kept"],
["know", "knew", "known"],
["lay", "laid", "laid"],
["lead", "led", "led"],
["light", "lit", "lit"],
["lose", "lost", "lost"],
["make", "made", "made"],
["mistake", "mistook", "mistaken"],
["output", "output", "output"],
["overtake", "overtook", "overtaken"],
["overthrow", "overthrew", "overthrown"],
["overwrite", "overwrote", "overwritten"],
["partake", "partook", "partaken"],
["pay", "paid", "paid"],
["put", "put", "put"],
["read", "read", "read"],
["redo", "redid", "redone"],
["remake", "remade", "remade"],
["reread", "reread", "reread"],
["reset", "reset", "reset"],
["ride", "rode", "ridden"],
["ring", "rang", "rung"],
["rise", "rose", "risen"],
["run", "ran", "run"],
["see", "saw", "seen"],
["sell", "sold", "sold"],
["send", "sent", "sent"],
["set", "set", "set"],
["shake", "shook", "shaken"],
["shed", "shed", "shed"],
["shine", "shone", "shone"],
["shoe", "shod", "shod"],
["shoot", "shot", "shot"],
["show", "showed", "shown"],
["shrink", "shrank", "shrunk"],
["shut", "shut", "shut"],
["sing", "sang", "sung"],
"// sink -- sank -- sunken??",
["sink", "sank", "sunk"],
["sit", "sat", "sat"],
["slay", "slew", "slain"],
["sleep", "slept", "slept"],
["slide", "slid", "slid"],
["slit", "slit", "slit"],
"// sneak -- sneaked/snuck -- sneaked/snuck",
["speak", "spoke", "spoken"],
["spin", "spun", "spun"],
["spit", "spat", "spat"],
["split", "split", "split"],
["spread", "spread", "spread"],
["spring", "sprang", "sprung"],
["stand", "stood", "stood"],
["steal", "stole", "stolen"],
["stick", "stuck", "stuck"],
["sting", "stung", "stung"],
["stink", "stank", "stunk"],
["stride", "strode", "stridden"],
["strike", "struck", "stricken"],
["string", "strung", "strung"],
["sew", "sewed", "sewn"],
["swear", "swore", "sworn"],
["swim", "swam", "swum"],
["swing", "swung", "swung"],
["take", "took", "taken"],
["teach", "taught", "taught"],
["tear", "tore", "torn"],
["think", "thought", "thought"],
["throw", "threw", "thrown"],
["tread", "trod", "trodden"],
["undo", "undid", "undone"],
["wake", "woke", "woken"],
["wear", "wore", "worn"],
["weave", "wove", "woven"],
["wind", "wound", "wound"],
["write", "wrote", "written"]
]

View file

@ -0,0 +1,121 @@
use lazy_static::lazy_static;
use serde::Deserialize;
use std::sync::Arc;
/// One table entry, stored as `(singular, plural)`.
type Noun = (String, String);
/// A table of nouns paired with their irregular plural forms.
#[derive(Debug, Deserialize)]
pub struct IrregularNouns {
// Entries are kept in load order; the lookup methods scan them front to back
// and return the first match.
nouns: Vec<Noun>,
}
/// Parses the embedded `irregular_nouns.json` into a fresh, shareable table.
///
/// This does the actual work every time it is called; callers that want the
/// shared copy should go through `IrregularNouns::curated` instead.
///
/// # Panics
///
/// Panics if the embedded JSON cannot be loaded into a table.
fn uncached_inner_new() -> Arc<IrregularNouns> {
    let embedded_json = include_str!("../irregular_nouns.json");

    match IrregularNouns::from_json_file(embedded_json) {
        Ok(table) => Arc::new(table),
        Err(e) => panic!("Failed to load irregular noun table: {}", e),
    }
}
lazy_static! {
// Process-wide shared copy of the irregular noun table, initialized from
// `uncached_inner_new()`. `IrregularNouns::curated` hands out clones of this `Arc`.
static ref NOUNS: Arc<IrregularNouns> = uncached_inner_new();
}
impl IrregularNouns {
    /// Creates an empty table containing no nouns.
    pub fn new() -> Self {
        Self { nouns: Vec::new() }
    }

    /// Builds a table from the JSON text of an irregular-noun list.
    ///
    /// The top-level JSON value must be an array. Every element that is itself
    /// an array of exactly two strings is stored as a `(singular, plural)`
    /// pair. All other elements are skipped; in particular, bare strings are
    /// used as comments to guide contributors editing the file.
    ///
    /// # Errors
    ///
    /// Returns the [`serde_json::Error`] produced by the parser when `json`
    /// is not valid JSON or its top-level value is not an array.
    pub fn from_json_file(json: &str) -> Result<Self, serde_json::Error> {
        // Deserialize into `Vec<serde_json::Value>` to handle mixed entry types.
        // Parse failures are propagated to the caller rather than panicking,
        // as the `Result` return type promises.
        let values: Vec<serde_json::Value> = serde_json::from_str(json)?;

        let nouns = values
            .iter()
            .filter_map(|value| match value.as_array()?.as_slice() {
                [singular, plural] => Some((
                    singular.as_str()?.to_string(),
                    plural.as_str()?.to_string(),
                )),
                // Wrong arity: not a noun entry.
                _ => None,
            })
            .collect();

        Ok(Self { nouns })
    }

    /// Returns a handle to the shared table built from the bundled noun list.
    pub fn curated() -> Arc<Self> {
        Arc::clone(&*NOUNS)
    }

    /// Returns the plural stored for `singular`, if the table has one.
    ///
    /// Matching ignores ASCII case. When several entries share the same
    /// singular, the first one in the table wins.
    pub fn get_plural_for_singular(&self, singular: &str) -> Option<&str> {
        self.nouns
            .iter()
            .find(|(sg, _)| sg.eq_ignore_ascii_case(singular))
            .map(|(_, pl)| pl.as_str())
    }

    /// Returns the singular stored for `plural`, if the table has one.
    ///
    /// Matching ignores ASCII case. When several entries share the same
    /// plural, the first one in the table wins.
    pub fn get_singular_for_plural(&self, plural: &str) -> Option<&str> {
        self.nouns
            .iter()
            .find(|(_, pl)| pl.eq_ignore_ascii_case(plural))
            .map(|(sg, _)| sg.as_str())
    }
}
/// The default table is empty, exactly like the one returned by `IrregularNouns::new`.
impl Default for IrregularNouns {
    fn default() -> Self {
        Self { nouns: vec![] }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Each test looks a word up in the curated table shipped with the crate.

    #[test]
    fn can_find_irregular_plural_for_singular_lowercase() {
        let table = IrregularNouns::curated();
        let plural = table.get_plural_for_singular("man");
        assert_eq!(plural, Some("men"));
    }

    #[test]
    fn can_find_irregular_plural_for_singular_uppercase() {
        // Lookup is expected to ignore ASCII case.
        let table = IrregularNouns::curated();
        let plural = table.get_plural_for_singular("WOMAN");
        assert_eq!(plural, Some("women"));
    }

    #[test]
    fn can_find_singular_for_irregular_plural() {
        let table = IrregularNouns::curated();
        let singular = table.get_singular_for_plural("children");
        assert_eq!(singular, Some("child"));
    }

    #[test]
    fn cant_find_regular_plural() {
        // Regular nouns are not part of the table.
        let table = IrregularNouns::curated();
        let plural = table.get_plural_for_singular("car");
        assert_eq!(plural, None);
    }

    #[test]
    fn cant_find_non_noun() {
        let table = IrregularNouns::curated();
        let plural = table.get_plural_for_singular("the");
        assert_eq!(plural, None);
    }
}

View file

@ -0,0 +1,120 @@
use lazy_static::lazy_static;
use serde::Deserialize;
use std::sync::Arc;
/// One table entry, stored as `(lemma, preterite, past participle)`.
type Verb = (String, String, String);
/// A table of irregular verbs with their preterite and past participle forms.
#[derive(Debug, Deserialize)]
pub struct IrregularVerbs {
// Entries are kept in load order; lookups scan them front to back and return
// the first match.
verbs: Vec<Verb>,
}
/// Parses the embedded `irregular_verbs.json` into a fresh, shareable table.
///
/// This does the actual work every time it is called; callers that want the
/// shared copy should go through `IrregularVerbs::curated` instead.
///
/// # Panics
///
/// Panics if the embedded JSON cannot be loaded into a table.
fn uncached_inner_new() -> Arc<IrregularVerbs> {
    let embedded_json = include_str!("../irregular_verbs.json");

    match IrregularVerbs::from_json_file(embedded_json) {
        Ok(table) => Arc::new(table),
        Err(e) => panic!("Failed to load irregular verb table: {}", e),
    }
}
lazy_static! {
// Process-wide shared copy of the irregular verb table, initialized from
// `uncached_inner_new()`. `IrregularVerbs::curated` hands out clones of this `Arc`.
static ref VERBS: Arc<IrregularVerbs> = uncached_inner_new();
}
impl IrregularVerbs {
    /// Creates an empty table containing no verbs.
    pub fn new() -> Self {
        Self { verbs: Vec::new() }
    }

    /// Builds a table from the JSON text of an irregular-verb list.
    ///
    /// The top-level JSON value must be an array. Every element that is itself
    /// an array of exactly three strings is stored as a
    /// `(lemma, preterite, past participle)` triple. All other elements are
    /// skipped; in particular, bare strings are used as comments to guide
    /// contributors editing the file.
    ///
    /// # Errors
    ///
    /// Returns the [`serde_json::Error`] produced by the parser when `json`
    /// is not valid JSON or its top-level value is not an array.
    pub fn from_json_file(json: &str) -> Result<Self, serde_json::Error> {
        // Deserialize into `Vec<serde_json::Value>` to handle mixed entry types.
        // Parse failures are propagated to the caller rather than panicking,
        // as the `Result` return type promises.
        let values: Vec<serde_json::Value> = serde_json::from_str(json)?;

        let verbs = values
            .iter()
            .filter_map(|value| match value.as_array()?.as_slice() {
                [lemma, preterite, past_participle] => Some((
                    lemma.as_str()?.to_string(),
                    preterite.as_str()?.to_string(),
                    past_participle.as_str()?.to_string(),
                )),
                // Wrong arity: not a verb entry.
                _ => None,
            })
            .collect();

        Ok(Self { verbs })
    }

    /// Returns a handle to the shared table built from the bundled verb list.
    pub fn curated() -> Arc<Self> {
        Arc::clone(&*VERBS)
    }

    /// Returns the past participle stored alongside `preterite`, if any.
    ///
    /// Matching ignores ASCII case. When several entries share the same
    /// preterite, the first one in the table wins. The returned participle may
    /// be identical to the preterite (for example `taught` / `taught`).
    pub fn get_past_participle_for_preterite(&self, preterite: &str) -> Option<&str> {
        self.verbs
            .iter()
            .find(|(_, pt, _)| pt.eq_ignore_ascii_case(preterite))
            .map(|(_, _, pp)| pp.as_str())
    }
}
/// The default table is empty, exactly like the one returned by `IrregularVerbs::new`.
impl Default for IrregularVerbs {
    fn default() -> Self {
        Self { verbs: vec![] }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Each test looks a preterite up in the curated table shipped with the crate.

    #[test]
    fn can_find_irregular_past_participle_for_preterite_lowercase() {
        let table = IrregularVerbs::curated();
        let participle = table.get_past_participle_for_preterite("arose");
        assert_eq!(participle, Some("arisen"));
    }

    #[test]
    fn can_find_irregular_past_participle_for_preterite_uppercase() {
        // Lookup is expected to ignore ASCII case.
        let table = IrregularVerbs::curated();
        let participle = table.get_past_participle_for_preterite("WENT");
        assert_eq!(participle, Some("gone"));
    }

    #[test]
    fn can_find_irregular_past_participle_same_as_past_tense() {
        let table = IrregularVerbs::curated();
        let participle = table.get_past_participle_for_preterite("taught");
        assert_eq!(participle, Some("taught"));
    }

    #[test]
    fn cant_find_regular_past_participle() {
        // Regular verbs are not part of the table.
        let table = IrregularVerbs::curated();
        let participle = table.get_past_participle_for_preterite("walked");
        assert_eq!(participle, None);
    }

    #[test]
    fn cant_find_non_verb() {
        let table = IrregularVerbs::curated();
        let participle = table.get_past_participle_for_preterite("the");
        assert_eq!(participle, None);
    }
}

View file

@ -11,6 +11,8 @@ mod edit_distance;
pub mod expr;
mod fat_token;
mod ignored_lints;
mod irregular_nouns;
mod irregular_verbs;
pub mod language_detection;
mod lexing;
pub mod linting;
@ -42,6 +44,8 @@ pub use dict_word_metadata_orthography::{OrthFlags, Orthography};
pub use document::Document;
pub use fat_token::{FatStringToken, FatToken};
pub use ignored_lints::{IgnoredLints, LintContext};
pub use irregular_nouns::IrregularNouns;
pub use irregular_verbs::IrregularVerbs;
use linting::Lint;
pub use mask::{Mask, Masker};
pub use number::{Number, OrdinalSuffix};

View file

@ -1,82 +1,12 @@
use crate::linting::expr_linter::Chunk;
use crate::{
Token,
char_string::CharStringExt,
expr::{All, Expr, FirstMatchOf, SequenceExpr},
irregular_verbs::IrregularVerbs,
linting::{ExprLinter, Lint, LintKind, Suggestion},
patterns::{InflectionOfBe, WordSet},
};
/// Maps common irregular verbs between their simple past and past participle forms.
///
/// Each entry is a `(simple past, past participle)` pair of lowercase strings.
// NOTE(review): this view is a diff; the table appears to be superseded by
// `IrregularVerbs` (imported above) — confirm before relying on it.
// NOTE(review): `("brang", "brung")` looks like a pair of nonstandard forms of
// "brought" rather than a regular table entry — confirm it is intentional.
const IRREGULAR_VERBS: &[(&str, &str)] = &[
("arose", "arisen"),
("ate", "eaten"),
("awoke", "awoken"),
("bade", "bidden"),
("became", "become"),
("began", "begun"),
("bit", "bitten"),
("blew", "blown"),
("bought", "bought"),
("brang", "brung"),
("broke", "broken"),
("brought", "brought"),
("came", "come"),
("chose", "chosen"),
("did", "done"),
("drank", "drunk"),
("drove", "driven"),
("fell", "fallen"),
("felt", "felt"),
("flew", "flown"),
("forgot", "forgotten"),
("forwent", "forgone"),
("gave", "given"),
("grew", "grown"),
("had", "had"),
("heard", "heard"),
("hit", "hit"),
("input", "input"),
("knew", "known"),
("led", "led"),
("mistook", "mistaken"),
("output", "output"),
("overtook", "overtaken"),
("paid", "paid"),
("partook", "partaken"),
// proved, proved/proven
("put", "put"),
("ran", "run"),
("rang", "rung"),
("read", "read"),
("reset", "reset"),
("rode", "ridden"),
("rose", "risen"),
("sang", "sung"),
("sank", "sunken"),
("saw", "seen"),
("set", "set"),
("sewed", "sewn"),
("slew", "slain"),
("slid", "slid"),
("spoke", "spoken"),
("sprang", "sprung"),
("stank", "stunk"),
("stole", "stolen"),
("stood", "stood"),
("swam", "swum"),
("swore", "sworn"),
("thought", "thought"),
("trod", "trodden"),
("took", "taken"),
// was, been
// were, been
("went", "gone"),
("woke", "woken"),
("wove", "woven"),
("wrote", "written"),
];
/// Corrects simple past tense verbs to past participle after auxiliary verbs like "have" or "be".
pub struct SimplePastToPastParticiple {
expr: Box<dyn Expr>,
@ -141,41 +71,32 @@ impl ExprLinter for SimplePastToPastParticiple {
let verb_tok = &toks[2];
let verb_ch = verb_tok.span.get_content(src);
if !IRREGULAR_VERBS
.iter()
.any(|(t, p)| verb_ch.eq_ignore_ascii_case_str(t) && p != t)
let simple_past = verb_tok.span.get_content_string(src);
if let Some(past_participle) = IrregularVerbs::curated()
.get_past_participle_for_preterite(&simple_past)
.filter(|pp| pp != &simple_past)
{
return None;
}
let suggestions = vec![Suggestion::replace_with_match_case(
past_participle.chars().collect(),
verb_tok.span.get_content(src),
)];
let (simple_past, past_participle) = IRREGULAR_VERBS
.iter()
.find(|(simple_past, _)| {
verb_tok
.span
.get_content(src)
.eq_ignore_ascii_case_str(simple_past)
let message = format!(
"Use the past participle `{}` instead of `{}` when using compound tenses or passive voice.",
past_participle, simple_past
);
Some(Lint {
span: verb_tok.span,
lint_kind: LintKind::Grammar,
suggestions,
message,
..Default::default()
})
.unwrap();
let suggestions = vec![Suggestion::replace_with_match_case(
past_participle.chars().collect(),
verb_tok.span.get_content(src),
)];
let message = format!(
"Use the past participle `{}` instead of `{}` when using compound tenses or passive voice.",
past_participle, simple_past
);
Some(Lint {
span: verb_tok.span,
lint_kind: LintKind::Grammar,
suggestions,
message,
..Default::default()
})
} else {
None
}
}
fn description(&self) -> &str {