feat: adding prefixes to dictionary (#2212)

* feat: adding prefixes to dictionary

* feat: `AffixData` for `DictWordMetadata`
This commit is contained in:
Andrew Dunbar 2025-12-11 19:00:29 +00:00 committed by GitHub
parent 6ac8406e29
commit f15778ed28
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 112 additions and 51 deletions

View file

@ -997,6 +997,14 @@
"metadata": {
"//": "not yet implemented"
}
},
"(": {
"#": "prefix property",
"metadata": {
"affix": {
"is_prefix": true
}
}
}
}
}

View file

@ -11157,7 +11157,7 @@ Zworykin/g
Zyrtec/g
Zyuganov/g
Zzz
a/~DP
a/~DP(
a.m./
aah/NV
aardvark/~NSg
@ -12585,7 +12585,7 @@ antagonist/~NSg
antagonistic/~JQ
antagonize/VdSG
antarctic/~J
ante/~NSgV
ante/~NSgV(
anteater/NgS
antebellum/~J
antecedence/Nmg
@ -12620,7 +12620,7 @@ anthropomorphise/V!_
anthropomorphism/Nmg
anthropomorphize/V
anthropomorphous/J
anti/~JNSgP
anti/~JNSgP(
antiabortion/J
antiabortionist/NgS
antiaircraft/JN
@ -12949,7 +12949,7 @@ arbutus/NgS
arc/~NSgVdG
arcade/~NgSV
arcane/~J
arch/~NgSVGd>J^YpZv
arch/~NgSVGd>J^YpZv(
archaeological/~JY
archaeologist/~NSg
archaeology/~Nmg
@ -13508,7 +13508,7 @@ authorized/~JVtT
authorship/~Nmg
autism/~Nmg
autistic/~JN
auto/~JNgSV
auto/~JNgSV(
autobahn/~NSg
autobiographer/NSg
autobiographic/J
@ -14436,6 +14436,7 @@ bend/~VbG>SNgBZ
bendability/Nmg
bender/~Ng
bendy/J^>N
# bene # prefixes that are not also words in their own right don't belong in the dictionary
beneath/~P
benedictine/~
benediction/NwSg
@ -14572,7 +14573,7 @@ bezel/NgS
bezier/NgS
bf/~N
bhaji/N
bi/~J>NSgZ
bi/~J>NSgZ(
biannual/JYN
bias/~NgSVGdJ
biased/~JVtTU
@ -16892,6 +16893,7 @@ center/~NgJVdG
centerboard/NSg
centerfold/NgS
centerpiece/~NgS
# centi # prefixes that are not also words in their own right don't belong in the dictionary
centigrade/JN
centigram/NSg
centiliter/NgS<
@ -17554,6 +17556,7 @@ circularize/VdSG
circulate/~VdSGr
circulation/~NSgr
circulatory/JN
# circum # prefixes that are not also words in their own right don't belong in the dictionary
circumcise/VdSGXn
circumcised/JVtTNU
circumcision/~Ng
@ -17913,7 +17916,7 @@ clxvi
clxvii
cm/~
cnidarian/NgS
co/~NSIdE
co/~NSIdE(
coach/~NgSVdG
coachload/NS
coachman/N0g
@ -20368,6 +20371,7 @@ dc/~N
dd/~NSdG
dded/K
dding/K
# de # prefixes that are not also words in their own right don't belong in the dictionary
deacon/~NgSV
deaconess/NgS
dead/~J^Y>NgVXn
@ -20439,6 +20443,7 @@ debtor/~NgS
debugger/NSg
debut/~NgVGd
debutante/NSg
# deca # prefixes that are not also words in their own right don't belong in the dictionary
decade/~NgS
decadence/~Nmg
decadency/Nmg
@ -21117,6 +21122,7 @@ dextrose/Nmg
dharma/~NwgS
dhoti/NSg
dhow/NgS
# di # prefixes that are not also words in their own right don't belong in the dictionary
diabetes/~Nmg
diabetic/~JNSg
diabolic/J
@ -21406,7 +21412,7 @@ dirtball/NS
dirtily/Ry
dirtiness/Nmg
dirty/~J^>VdSGp
dis/~VNgI
dis/~VNgI(
disable/~VdSGJL
disablement/Ng
disambiguate/~VSdGn
@ -22399,6 +22405,7 @@ dynamo/~NSg
dynastic/~J
dynasty/~NSg
dyno/NSg
# dys # prefixes that are not also words in their own right don't belong in the dictionary
dysentery/~Nmg
dysfunction/~Nmg
dysfunctional/~J
@ -22989,7 +22996,7 @@ emulsification/Nmg
emulsifier/~NgS
emulsify/Vd>SGnZ
emulsion/~NwgSV
en/~NSgPI
en/~NSgPI(
enable/~Vd>SGZ
enabler/NgS
enact/~VSdGrL
@ -23575,6 +23582,7 @@ etude/NSg
etymological/~JY
etymologist/NSg
etymology/~NwSg
# eu # prefixes that are not also words in their own right don't belong in the dictionary
eucalypti/N9
eucalyptus/~N0gS
euchre/NSgVdG
@ -23682,7 +23690,7 @@ evolutionist/NSg
evolve/~VdSG
ewe/~NSg>Z
ewer/Ng
ex/~NgSVJ
ex/~NgSVJ(
exabyte/NgS
exacerbate/VGdSn
exacerbation/Nwg
@ -24032,7 +24040,7 @@ extortion/~Ng>Z
extortionate/JY
extortioner/Ng
extortionist/NgS
extra/~JNSg
extra/~JNSg(
extracellular/~J
extract/~NgSVdGv
extraction/~NwSg
@ -25314,7 +25322,7 @@ forceps/N09g
forcible/~J
forcibly/~Ry
ford/~NgSVdGB
fore/~JNgS
fore/~JNgS(
forearm/~NSgVGd
forebear/NgSV
forebode/VGdSNz
@ -28126,7 +28134,7 @@ hesitate/~VdSGnX
hesitating/VNYU
hesitation/~Ng
hessian/~N
hetero/~JNSg
hetero/~JNSg(
heterodox/J
heterodoxy/Nmg
heterogeneity/Ng
@ -28427,7 +28435,7 @@ homily/~NSg
hominid/NSgJ
hominoid/NS
hominy/Ng
homo/~NgSJ
homo/~NgSJ(
homoerotic/J
homogeneity/Ng
homogeneous/~JY
@ -28917,6 +28925,7 @@ hymn/~NgSVdG
hymnal/~NgSJ
hymnbook/NSg
hype/~NmgSVGd>J
# hyper # prefixes that are not also words in their own right don't belong in the dictionary
hyperactive/J
hyperactivity/~Ng
hyperaggressive/J
@ -29113,6 +29122,7 @@ ignore/~VGdS
iguana/~NgS
ii/~
iii/~
# il # prefixes that are not also words in their own right don't belong in the dictionary
ilea/N
ileitis/Ng
ileum/Ng
@ -29153,6 +29163,7 @@ illustrative/~JY
illustrator/~NSg
illustrious/~JYp
illustriousness/Nmg
# im # prefixes that are not also words in their own right don't belong in the dictionary
image/~NwSgVdG
imager/NgS
imagery/~Nmg
@ -29464,7 +29475,7 @@ impure/~JY^>V
impurity/~NSg
imputation/NSg
impute/VdSGB
in/~PJRrg # removed `4`, verb senses are obsolete, `NS`, noun sense is marginal
in/~PJRrg( # removed `4`, verb senses are obsolete, `NS`, noun sense is marginal
inaccuracy/NwgS
inaction/~Nmg
inadequacy/NS
@ -30114,7 +30125,7 @@ intent/~NSgJYp
intention/~NgSV
intentional/~JYNU
intentness/Ng
inter/~VSEL
inter/~VSEL(
interact/~VGdSNv
interaction/~NwSg
interactive/~JYN
@ -30298,6 +30309,7 @@ intonation/~NSg
intoxicant/NSgJ
intoxicate/VdSGJn
intoxication/~Ng
# intra # prefixes that are not also words in their own right don't belong in the dictionary
intracranial/~J
intramural/~JN
intramuscular/J
@ -30316,7 +30328,7 @@ intriguer/Ng
intriguing/~JYV6N
intrinsic/~JNgS
intrinsically/~Ry
intro/~NSgV
intro/~NSgV(
introduce/~VGdSr
introduction/~N0gr
introductions/~N9
@ -31100,7 +31112,7 @@ killer/~NgJ
killing/~JNgV
killjoy/NSg
kiln/~NgSVdG
kilo/~NgS
kilo/~NgS(
kilobit/NSg
kilobyte/NSg
kilocoulomb/S
@ -32654,7 +32666,7 @@ mackerel/~NwSg
mackinaw/NSg
mackintosh/~NgS
macrame/NgV
macro/~JNSg
macro/~JNSg(
macroaggregate/Ng
macrobiotic/JS
macrobiotics/Nwg
@ -33383,7 +33395,7 @@ meeting/~NwgSV
meetinghouse/NSg
meetup/NgS
meg/~NSV
mega/~JN
mega/~JN(
megabit/NSg
megabucks/Ng
megabyte/NgS
@ -33700,7 +33712,7 @@ mica/~Ng
mice/~N9V
mick/~NSJ
mickey/~NgSV
micro/~JNSgV
micro/~JNSgV(
microaggression/NSg
microarchitecture/NgS
microbe/NgS
@ -33762,7 +33774,7 @@ microtransaction/NSg
microvascular/J
microwave/~NSgVdGB
microwaveable/J
mid/~JPN
mid/~JPN(
midair/J
midcentury/J
midday/~Ng
@ -33864,6 +33876,7 @@ millennial/JNgS
millennium/~NgS
miller/~Ng
millet/~Ng
# milli # prefixes that are not also words in their own right don't belong in the dictionary
milliamp/NgS
milliard/Sg
millibar/NgS
@ -33929,7 +33942,7 @@ minestrone/Nmg
minesweeper/NSg
mingle/VdGSN
mingy/J
mini/~JNgS
mini/~JNgS(
miniature/~NgSJV
miniaturisation/Ng!_
miniaturise/VGdS!_
@ -34001,6 +34014,7 @@ mirthful/JYp
mirthfulness/Nmg
mirthless/JY
miry/J>^
# mis # prefixes that are not also words in their own right don't belong in the dictionary
misaddress/VdSG
misadventure/NwgS
misaligned/JV
@ -34400,7 +34414,7 @@ monkey/~NgSVdG
monkeyshine/NSg
monkish/J
monkshood/NSg
mono/~NgJ
mono/~NgJ(
monochromatic/~J
monochrome/~NgSJ
monocle/NSgd
@ -34774,7 +34788,7 @@ mullet/~NgS
mulligan/~NSg
mulligatawny/Ng
mullion/NSgVd
multi/~N
multi/~N(
multibillion/J
multibyte/J
multicellular/J
@ -35273,6 +35287,7 @@ nelson/~NSg
nematode/NSg
nemeses/N9
nemesis/~N0g
# neo # prefixes that are not also words in their own right don't belong in the dictionary
neoclassic/J
neoclassical/~JN
neoclassicism/Nmg
@ -35594,7 +35609,7 @@ nomination's/r
nominative/~JNSg
nominator/~NSge
nominee/~NgS
non/~N
non/~N(
nonabrasive/JN
nonabsorbent/JSg
nonacademic/JN
@ -36462,6 +36477,7 @@ omission/~NwgS
omit/~VS
omitted/~V
omitting/~VN
# omni # prefixes that are not also words in their own right don't belong in the dictionary
omnibus/~NgSJV
omnidirectional/J
omnipotence/Nmg
@ -36779,7 +36795,7 @@ ourself/Ia1F # I:pronoun a:personal 1:person .~singular F:reflexive (of t
ourselves/~Ia1F: # I:pronoun a:personal 1:person :~plural F:reflexive
oust/~VGd>SZ
ouster/~NgSV
out/~PNSgVGd>JRz
out/~PNSgVGd>JRz(
outage/NSg
outargue/VGdS
outback/~NgSJV
@ -36942,7 +36958,7 @@ oven/~NgSV
ovenbird/NSg
ovenproof/J
ovenware/Nmg
over/~JYNgSP
over/~JYNgSP(
overabundance/Ng
overabundant/J
overachieve/VGd>SZ
@ -37439,7 +37455,7 @@ pampas/Ng
pamper/VdGSN
pamphlet/~NgSV
pamphleteer/NgSV
pan/~NSgVJ
pan/~NSgVJ(
panacea/NSg
panache/Ng
panama/~NgS
@ -37528,7 +37544,7 @@ paprika/~NmgJ
papyri/~N9
papyrus/~N0g
par/~NSgJ>PVGdZBz
para/~NgSJ
para/~NgSJ(
parable/~NgSVJ
parabola/N0Sg
parabolæ/N9
@ -38137,6 +38153,7 @@ peppy/J^>Np
pepsin/Ng
peptic/JNgS
peptide/~NS
# per # prefixes that are not also words in their own right don't belong in the dictionary
peradventure/Ng
perambulate/VGdSXn
perambulation/Nwg
@ -39241,7 +39258,7 @@ polonaise/NSgV
polonium/Nmg
poltergeist/~NgS
poltroon/NSgJ
poly/~NJV
poly/~NJV(
polyacrylamide/N
polyamory/NS
polyandrous/J
@ -39448,7 +39465,7 @@ possibility/~NSg
possible/~JNSg
possibly/~R # adverb of probability/certainty/affirmation; modal adverb
possum/~NSgV
post/~NwgSVGd>PZz
post/~NwgSVGd>PZz(
postage/~Nmg
postal/~J
postbag/NgS
@ -40034,7 +40051,7 @@ prizefighter/Ng
prizefighting/Ng
prizewinner/NgS
prizewinning/J
pro/~NSgPJ
pro/~NSgPJ(
probabilistic/~J
probability/~NSg
probable/~JNSg
@ -40316,6 +40333,7 @@ protein/~NwSg
protest/NwgS
protestant/~JNgS
protestation/NwgS
# proto # prefixes that are not also words in their own right don't belong in the dictionary
protocol/~NwgSV
proton/~NSg
protoplasm/Nmg
@ -40389,7 +40407,7 @@ psaltery/NSg
psephologist/NS
psephology/N
pseud/NS
pseudo/~NSJ
pseudo/~NSJ(
pseudocode/NmgG
pseudonym/~NSg
pseudonymous/~J
@ -41244,7 +41262,7 @@ razz/NgSVGd
razzmatazz/Ng
rcpt/N
rd/~N
re/PNSgvz
re/PNSgvz(
reach/~VdGSNgB
reachable/~JNU
reacquire/VdSG
@ -42130,7 +42148,7 @@ retributive/J
retrieval/~NSg
retrieve/~Vd>GSNgZB
retriever/Ng
retro/~JNmgS
retro/~JNmgS(
retroactive/~JY
retrofire/NSVGdJ
retrofit/~VSNg
@ -43736,7 +43754,7 @@ semaphore/NSgVdG
semblance/NSgr
semen/~Nmg
semester/~NSg
semi/~NgS
semi/~NgS(
semiannual/JYN
semiarid/J
semiautomatic/JNgSQ
@ -46775,7 +46793,7 @@ suasion/NgE
suave/J>Y^Np
suaveness/Ng
suavity/Ng
sub/~NSgVP
sub/~NSgVP(
subaltern/JNgS
subaqua/J
subarctic/~ONJ
@ -47144,7 +47162,7 @@ suntanning/V6
suntrap/NS
sunup/Ng
sup/~V>SNgJZ
super/~JNgV
super/~JNgV(
superabundance/NwgS
superabundant/J
superannuate/VGdSn
@ -47283,6 +47301,7 @@ supremacy/~Ng
supreme/~JYVN
supremo/NS
supt/V
# sur # prefixes that are not also words in their own right don't belong in the dictionary
surcease/NSgVdG
surcharge/NSgVdG
surcingle/NSgV
@ -48004,6 +48023,7 @@ teetotalism/Ng
teetotaller/NgS!@_
tektite/NSg
tel/~N
# tele # prefixes that are not also words in their own right don't belong in the dictionary
telecast/~VG>SNgZ
telecaster/Ng
telecom/NgS
@ -49152,7 +49172,7 @@ tranquilizer/Ng
tranquillise/Vd>SGZ!_
tranquilliser/Ng!_
tranquillity/Ng!_
trans/~JNVi
trans/~JNVi(
transact/VdGS
transaction/~NSg
transactional/J
@ -49372,6 +49392,7 @@ tress/NgSVE
trestle/~NgS
trews/N
trey/~NgS
# tri # prefixes that are not also words in their own right don't belong in the dictionary
triad/~NSg
triage/NmgVd
triager/NSg
@ -49900,7 +49921,7 @@ ulterior/J
ultimate/~JYNgV
ultimatum/~NgS
ultimo/~JN
ultra/~JNSg
ultra/~JNSg(
ultraconservative/JNSg
ultrahigh/J
ultraist/NSg
@ -49929,6 +49950,7 @@ umlaut/NgSV
ump/NSgVGd
umpire/~NgSVGd
umpteen/H
# un # prefixes that are not also words in their own right don't belong in the dictionary
unabridged/~JNgS
unacceptability/Nmg
unacceptable/~JN
@ -50026,7 +50048,7 @@ undecided/~JNSgV
undefine/VGdS
undemonstrative/JY
undeniably/Ry
under/~PJN
under/~PJN(
underachieve/VGd>SLZ
underachiever/Ng
underact/VSdG
@ -50259,6 +50281,7 @@ unhealthy/~J^
unhistorical/J
unholy/~J^
unhurt/J
# uni # prefixes that are not also words in their own right don't belong in the dictionary
unibody/NSg
unicameral/~J
unicellular/JN
@ -51037,7 +51060,7 @@ vicar/~NSg
vicarage/~NSg
vicarious/JYp
vicariousness/Ng
vice/~NgSVJPe
vice/~NgSVJPe(
viced/JVtT
vicegerent/NSgJ
vicennial/JN
@ -53414,7 +53437,7 @@ pentest/VSdG
pentester/NSg # penetration tester
pentesting/NmgV6
postfix/NgSVdG
pre/~PNV # !! please check and comment !! dictionaries only list prefix pre-
pre/~PNV( # !! please check and comment !! dictionaries only list prefix pre-
preshared/J
quadtree/NgS # data structure
quicksort/NgSVdG # algo

View file

@ -18,12 +18,20 @@ use crate::{Document, TokenKind, TokenStringExt};
/// having their own lexeme, but "Ivy" and "ivy" sharing the same lexeme.
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)]
pub struct DictWordMetadata {
/// The main parts of speech which have extra data.
pub noun: Option<NounData>,
pub pronoun: Option<PronounData>,
pub verb: Option<VerbData>,
pub adjective: Option<AdjectiveData>,
pub adverb: Option<AdverbData>,
pub conjunction: Option<ConjunctionData>,
pub determiner: Option<DeterminerData>,
pub affix: Option<AffixData>,
/// Parts of speech which don't have extra data.
/// Whether the word is a [preposition](https://www.merriam-webster.com/dictionary/preposition).
#[serde(default = "default_false")]
pub preposition: bool,
/// Whether the word is an offensive word.
pub swear: Option<bool>,
/// The dialects this word belongs to.
/// If no dialects are defined, it can be assumed that the word is
@ -33,11 +41,6 @@ pub struct DictWordMetadata {
/// Orthographic information: letter case, spaces, hyphens, etc.
#[serde(default = "OrthFlags::empty")]
pub orth_info: OrthFlags,
/// Whether the word is a [determiner](https://en.wikipedia.org/wiki/English_determiners).
pub determiner: Option<DeterminerData>,
/// Whether the word is a [preposition](https://www.merriam-webster.com/dictionary/preposition).
#[serde(default = "default_false")]
pub preposition: bool,
/// Whether the word is considered especially common.
#[serde(default = "default_false")]
pub common: bool,
@ -189,11 +192,12 @@ impl DictWordMetadata {
adjective: merge!(self.adjective, other.adjective),
adverb: merge!(self.adverb, other.adverb),
conjunction: merge!(self.conjunction, other.conjunction),
determiner: merge!(self.determiner, other.determiner),
affix: merge!(self.affix, other.affix),
preposition: self.preposition || other.preposition,
dialects: self.dialects | other.dialects,
orth_info: self.orth_info | other.orth_info,
swear: self.swear.or(other.swear),
determiner: merge!(self.determiner, other.determiner),
preposition: self.preposition || other.preposition,
common: self.common || other.common,
derived_from: self.derived_from.or(other.derived_from),
pos_tag: self.pos_tag.or(other.pos_tag),
@ -234,6 +238,7 @@ impl DictWordMetadata {
self.adverb = None;
self.conjunction = None;
self.determiner = None;
self.affix = None;
self.preposition = false;
}
PROPN => {
@ -259,6 +264,7 @@ impl DictWordMetadata {
self.adverb = None;
self.conjunction = None;
self.determiner = None;
self.affix = None;
self.preposition = false;
}
PRON => {
@ -272,6 +278,7 @@ impl DictWordMetadata {
self.adverb = None;
self.conjunction = None;
self.determiner = None;
self.affix = None;
self.preposition = false;
}
VERB => {
@ -293,6 +300,7 @@ impl DictWordMetadata {
self.adverb = None;
self.conjunction = None;
self.determiner = None;
self.affix = None;
self.preposition = false;
}
AUX => {
@ -314,6 +322,7 @@ impl DictWordMetadata {
self.adverb = None;
self.conjunction = None;
self.determiner = None;
self.affix = None;
self.preposition = false;
}
ADJ => {
@ -327,6 +336,7 @@ impl DictWordMetadata {
self.adverb = None;
self.conjunction = None;
self.determiner = None;
self.affix = None;
self.preposition = false;
}
ADV => {
@ -340,6 +350,7 @@ impl DictWordMetadata {
self.adjective = None;
self.conjunction = None;
self.determiner = None;
self.affix = None;
self.preposition = false;
}
ADP => {
@ -350,6 +361,7 @@ impl DictWordMetadata {
self.adverb = None;
self.conjunction = None;
self.determiner = None;
self.affix = None;
self.preposition = true;
}
DET => {
@ -359,6 +371,7 @@ impl DictWordMetadata {
self.adjective = None;
self.adverb = None;
self.conjunction = None;
self.affix = None;
self.preposition = false;
self.determiner = Some(DeterminerData::default());
}
@ -373,6 +386,7 @@ impl DictWordMetadata {
self.adjective = None;
self.adverb = None;
self.determiner = None;
self.affix = None;
self.preposition = false;
}
_ => {}
@ -958,6 +972,22 @@ impl ConjunctionData {
}
}
/// Affix-related properties of a dictionary word.
///
/// Both properties are optional: `None` means the property has not been
/// specified for the word, which is distinct from an explicit `Some(false)`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)]
pub struct AffixData {
/// Whether the word can act as a prefix, if known.
pub is_prefix: Option<bool>,
/// Whether the word can act as a suffix, if known.
// NOTE(review): nothing visible here sets this yet — presumably reserved for a future suffix annotation; confirm.
pub is_suffix: Option<bool>,
}
impl AffixData {
    /// Merge two sets of affix properties, field by field.
    ///
    /// For each property, a value already known on `self` (`Some`) is kept;
    /// a property that is `None` on `self` takes whatever `other` holds.
    /// Neither operand is modified; the merged result is returned as a new value.
    pub fn or(&self, other: &Self) -> Self {
        Self {
            is_prefix: self.is_prefix.or(other.is_prefix),
            is_suffix: self.is_suffix.or(other.is_suffix),
        }
    }
}
/// A regional dialect.
///
/// Note: these have bit-shifted values so that they can ergonomically integrate with