diff --git a/harper-core/annotations.json b/harper-core/annotations.json index b72f0eaf..85871024 100644 --- a/harper-core/annotations.json +++ b/harper-core/annotations.json @@ -997,6 +997,14 @@ "metadata": { "//": "not yet implemented" } + }, + "(": { + "#": "prefix property", + "metadata": { + "affix": { + "is_prefix": true + } + } } } } diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index 5ba8e549..84855c6b 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -11157,7 +11157,7 @@ Zworykin/g Zyrtec/g Zyuganov/g Zzz -a/~DP +a/~DP( a.m./ aah/NV aardvark/~NSg @@ -12585,7 +12585,7 @@ antagonist/~NSg antagonistic/~JQ antagonize/VdSG antarctic/~J -ante/~NSgV +ante/~NSgV( anteater/NgS antebellum/~J antecedence/Nmg @@ -12620,7 +12620,7 @@ anthropomorphise/V!_ anthropomorphism/Nmg anthropomorphize/V anthropomorphous/J -anti/~JNSgP +anti/~JNSgP( antiabortion/J antiabortionist/NgS antiaircraft/JN @@ -12949,7 +12949,7 @@ arbutus/NgS arc/~NSgVdG arcade/~NgSV arcane/~J -arch/~NgSVGd>J^YpZv +arch/~NgSVGd>J^YpZv( archaeological/~JY archaeologist/~NSg archaeology/~Nmg @@ -13508,7 +13508,7 @@ authorized/~JVtT authorship/~Nmg autism/~Nmg autistic/~JN -auto/~JNgSV +auto/~JNgSV( autobahn/~NSg autobiographer/NSg autobiographic/J @@ -14436,6 +14436,7 @@ bend/~VbG>SNgBZ bendability/Nmg bender/~Ng bendy/J^>N +# bene # prefixes that are not also words in their own right don't belong in the dictionary beneath/~P benedictine/~ benediction/NwSg @@ -14572,7 +14573,7 @@ bezel/NgS bezier/NgS bf/~N bhaji/N -bi/~J>NSgZ +bi/~J>NSgZ( biannual/JYN bias/~NgSVGdJ biased/~JVtTU @@ -16892,6 +16893,7 @@ center/~NgJVdG centerboard/NSg centerfold/NgS centerpiece/~NgS +# centi # prefixes that are not also words in their own right don't belong in the dictionary centigrade/JN centigram/NSg centiliter/NgS< @@ -17554,6 +17556,7 @@ circularize/VdSG circulate/~VdSGr circulation/~NSgr circulatory/JN +# circum # prefixes that are not also words in their own right don't belong in the dictionary circumcise/VdSGXn circumcised/JVtTNU circumcision/~Ng @@ -17913,7 +17916,7 @@ clxvi clxvii cm/~ cnidarian/NgS -co/~NSIdE +co/~NSIdE( coach/~NgSVdG coachload/NS coachman/N0g @@ -20368,6 +20371,7 @@ dc/~N dd/~NSdG dded/K dding/K +# de # prefixes that are not also words in their own right don't belong in the dictionary deacon/~NgSV deaconess/NgS dead/~J^Y>NgVXn @@ -20439,6 +20443,7 @@ debtor/~NgS debugger/NSg debut/~NgVGd debutante/NSg +# deca # prefixes that are not also words in their own right don't belong in the dictionary decade/~NgS decadence/~Nmg decadency/Nmg @@ -21117,6 +21122,7 @@ dextrose/Nmg dharma/~NwgS dhoti/NSg dhow/NgS +# di # prefixes that are not also words in their own right don't belong in the dictionary diabetes/~Nmg diabetic/~JNSg diabolic/J @@ -21406,7 +21412,7 @@ dirtball/NS dirtily/Ry dirtiness/Nmg dirty/~J^>VdSGp -dis/~VNgI +dis/~VNgI( disable/~VdSGJL disablement/Ng disambiguate/~VSdGn @@ -22399,6 +22405,7 @@ dynamo/~NSg dynastic/~J dynasty/~NSg dyno/NSg +# dys # prefixes that are not also words in their own right don't belong in the dictionary dysentery/~Nmg dysfunction/~Nmg dysfunctional/~J @@ -22989,7 +22996,7 @@ emulsification/Nmg emulsifier/~NgS emulsify/Vd>SGnZ emulsion/~NwgSV -en/~NSgPI +en/~NSgPI( enable/~Vd>SGZ enabler/NgS enact/~VSdGrL @@ -23575,6 +23582,7 @@ etude/NSg etymological/~JY etymologist/NSg etymology/~NwSg +# eu # prefixes that are not also words in their own right don't belong in the dictionary eucalypti/N9 eucalyptus/~N0gS euchre/NSgVdG @@ -23682,7 +23690,7 @@ evolutionist/NSg evolve/~VdSG ewe/~NSg>Z ewer/Ng -ex/~NgSVJ +ex/~NgSVJ( exabyte/NgS exacerbate/VGdSn exacerbation/Nwg @@ -24032,7 +24040,7 @@ extortion/~Ng>Z extortionate/JY extortioner/Ng extortionist/NgS -extra/~JNSg +extra/~JNSg( extracellular/~J extract/~NgSVdGv extraction/~NwSg @@ -25314,7 +25322,7 @@ forceps/N09g forcible/~J forcibly/~Ry ford/~NgSVdGB -fore/~JNgS +fore/~JNgS( forearm/~NSgVGd forebear/NgSV forebode/VGdSNz @@ -28126,7 +28134,7 @@ hesitate/~VdSGnX hesitating/VNYU hesitation/~Ng hessian/~N -hetero/~JNSg +hetero/~JNSg( heterodox/J heterodoxy/Nmg heterogeneity/Ng @@ -28427,7 +28435,7 @@ homily/~NSg hominid/NSgJ hominoid/NS hominy/Ng -homo/~NgSJ +homo/~NgSJ( homoerotic/J homogeneity/Ng homogeneous/~JY @@ -28917,6 +28925,7 @@ hymn/~NgSVdG hymnal/~NgSJ hymnbook/NSg hype/~NmgSVGd>J +# hyper # prefixes that are not also words in their own right don't belong in the dictionary hyperactive/J hyperactivity/~Ng hyperaggressive/J @@ -29113,6 +29122,7 @@ ignore/~VGdS iguana/~NgS ii/~ iii/~ +# il # prefixes that are not also words in their own right don't belong in the dictionary ilea/N ileitis/Ng ileum/Ng @@ -29153,6 +29163,7 @@ illustrative/~JY illustrator/~NSg illustrious/~JYp illustriousness/Nmg +# im # prefixes that are not also words in their own right don't belong in the dictionary image/~NwSgVdG imager/NgS imagery/~Nmg @@ -29464,7 +29475,7 @@ impure/~JY^>V impurity/~NSg imputation/NSg impute/VdSGB -in/~PJRrg # removed `4`, verb senses are obsolete, `NS`, noun sense is marginal +in/~PJRrg( # removed `4`, verb senses are obsolete, `NS`, noun sense is marginal inaccuracy/NwgS inaction/~Nmg inadequacy/NS @@ -30114,7 +30125,7 @@ intent/~NSgJYp intention/~NgSV intentional/~JYNU intentness/Ng -inter/~VSEL +inter/~VSEL( interact/~VGdSNv interaction/~NwSg interactive/~JYN @@ -30298,6 +30309,7 @@ intonation/~NSg intoxicant/NSgJ intoxicate/VdSGJn intoxication/~Ng +# intra # prefixes that are not also words in their own right don't belong in the dictionary intracranial/~J intramural/~JN intramuscular/J @@ -30316,7 +30328,7 @@ intriguer/Ng intriguing/~JYV6N intrinsic/~JNgS intrinsically/~Ry -intro/~NSgV +intro/~NSgV( introduce/~VGdSr introduction/~N0gr introductions/~N9 @@ -31100,7 +31112,7 @@ killer/~NgJ killing/~JNgV killjoy/NSg kiln/~NgSVdG -kilo/~NgS +kilo/~NgS( kilobit/NSg kilobyte/NSg kilocoulomb/S @@ -32654,7 +32666,7 @@ mackerel/~NwSg mackinaw/NSg mackintosh/~NgS macrame/NgV -macro/~JNSg +macro/~JNSg( macroaggregate/Ng macrobiotic/JS macrobiotics/Nwg @@ -33383,7 +33395,7 @@ meeting/~NwgSV meetinghouse/NSg meetup/NgS meg/~NSV -mega/~JN +mega/~JN( megabit/NSg megabucks/Ng megabyte/NgS @@ -33700,7 +33712,7 @@ mica/~Ng mice/~N9V mick/~NSJ mickey/~NgSV -micro/~JNSgV +micro/~JNSgV( microaggression/NSg microarchitecture/NgS microbe/NgS @@ -33762,7 +33774,7 @@ microtransaction/NSg microvascular/J microwave/~NSgVdGB microwaveable/J -mid/~JPN +mid/~JPN( midair/J midcentury/J midday/~Ng @@ -33864,6 +33876,7 @@ millennial/JNgS millennium/~NgS miller/~Ng millet/~Ng +# milli # prefixes that are not also words in their own right don't belong in the dictionary milliamp/NgS milliard/Sg millibar/NgS @@ -33929,7 +33942,7 @@ minestrone/Nmg minesweeper/NSg mingle/VdGSN mingy/J -mini/~JNgS +mini/~JNgS( miniature/~NgSJV miniaturisation/Ng!_ miniaturise/VGdS!_ @@ -34001,6 +34014,7 @@ mirthful/JYp mirthfulness/Nmg mirthless/JY miry/J>^ +# mis # prefixes that are not also words in their own right don't belong in the dictionary misaddress/VdSG misadventure/NwgS misaligned/JV @@ -34400,7 +34414,7 @@ monkey/~NgSVdG monkeyshine/NSg monkish/J monkshood/NSg -mono/~NgJ +mono/~NgJ( monochromatic/~J monochrome/~NgSJ monocle/NSgd @@ -34774,7 +34788,7 @@ mullet/~NgS mulligan/~NSg mulligatawny/Ng mullion/NSgVd -multi/~N +multi/~N( multibillion/J multibyte/J multicellular/J @@ -35273,6 +35287,7 @@ nelson/~NSg nematode/NSg nemeses/N9 nemesis/~N0g +# neo # prefixes that are not also words in their own right don't belong in the dictionary neoclassic/J neoclassical/~JN neoclassicism/Nmg @@ -35594,7 +35609,7 @@ nomination's/r nominative/~JNSg nominator/~NSge nominee/~NgS -non/~N +non/~N( nonabrasive/JN nonabsorbent/JSg nonacademic/JN @@ -36462,6 +36477,7 @@ omission/~NwgS omit/~VS omitted/~V omitting/~VN +# omni # prefixes that are not also words in their own right don't belong in the dictionary omnibus/~NgSJV omnidirectional/J omnipotence/Nmg @@ -36779,7 +36795,7 @@ ourself/Ia1F # I:pronoun a:personal 1:person .~singular F:reflexive (of t ourselves/~Ia1F: # I:pronoun a:personal 1:person :~plural F:reflexive oust/~VGd>SZ ouster/~NgSV -out/~PNSgVGd>JRz +out/~PNSgVGd>JRz( outage/NSg outargue/VGdS outback/~NgSJV @@ -36942,7 +36958,7 @@ oven/~NgSV ovenbird/NSg ovenproof/J ovenware/Nmg -over/~JYNgSP +over/~JYNgSP( overabundance/Ng overabundant/J overachieve/VGd>SZ @@ -37439,7 +37455,7 @@ pampas/Ng pamper/VdGSN pamphlet/~NgSV pamphleteer/NgSV -pan/~NSgVJ +pan/~NSgVJ( panacea/NSg panache/Ng panama/~NgS @@ -37528,7 +37544,7 @@ paprika/~NmgJ papyri/~N9 papyrus/~N0g par/~NSgJ>PVGdZBz -para/~NgSJ +para/~NgSJ( parable/~NgSVJ parabola/N0Sg parabolæ/N9 @@ -38137,6 +38153,7 @@ peppy/J^>Np pepsin/Ng peptic/JNgS peptide/~NS +# per # prefixes that are not also words in their own right don't belong in the dictionary peradventure/Ng perambulate/VGdSXn perambulation/Nwg @@ -39241,7 +39258,7 @@ polonaise/NSgV polonium/Nmg poltergeist/~NgS poltroon/NSgJ -poly/~NJV +poly/~NJV( polyacrylamide/N polyamory/NS polyandrous/J @@ -39448,7 +39465,7 @@ possibility/~NSg possible/~JNSg possibly/~R # adverb of probability/certainty/affirmation; modal adverb possum/~NSgV -post/~NwgSVGd>PZz +post/~NwgSVGd>PZz( postage/~Nmg postal/~J postbag/NgS @@ -40034,7 +40051,7 @@ prizefighter/Ng prizefighting/Ng prizewinner/NgS prizewinning/J -pro/~NSgPJ +pro/~NSgPJ( probabilistic/~J probability/~NSg probable/~JNSg @@ -40316,6 +40333,7 @@ protein/~NwSg protest/NwgS protestant/~JNgS protestation/NwgS +# proto # prefixes that are not also words in their own right don't belong in the dictionary protocol/~NwgSV proton/~NSg protoplasm/Nmg @@ -40389,7 +40407,7 @@ psaltery/NSg psephologist/NS psephology/N pseud/NS -pseudo/~NSJ +pseudo/~NSJ( pseudocode/NmgG pseudonym/~NSg pseudonymous/~J @@ -41244,7 +41262,7 @@ razz/NgSVGd razzmatazz/Ng rcpt/N rd/~N -re/PNSgvz +re/PNSgvz( reach/~VdGSNgB reachable/~JNU reacquire/VdSG @@ -42130,7 +42148,7 @@ retributive/J retrieval/~NSg retrieve/~Vd>GSNgZB retriever/Ng -retro/~JNmgS +retro/~JNmgS( retroactive/~JY retrofire/NSVGdJ retrofit/~VSNg @@ -43736,7 +43754,7 @@ semaphore/NSgVdG semblance/NSgr semen/~Nmg semester/~NSg -semi/~NgS +semi/~NgS( semiannual/JYN semiarid/J semiautomatic/JNgSQ @@ -46775,7 +46793,7 @@ suasion/NgE suave/J>Y^Np suaveness/Ng suavity/Ng -sub/~NSgVP +sub/~NSgVP( subaltern/JNgS subaqua/J subarctic/~ONJ @@ -47144,7 +47162,7 @@ suntanning/V6 suntrap/NS sunup/Ng sup/~V>SNgJZ -super/~JNgV +super/~JNgV( superabundance/NwgS superabundant/J superannuate/VGdSn @@ -47283,6 +47301,7 @@ supremacy/~Ng supreme/~JYVN supremo/NS supt/V +# sur # prefixes that are not also words in their own right don't belong in the dictionary surcease/NSgVdG surcharge/NSgVdG surcingle/NSgV @@ -48004,6 +48023,7 @@ teetotalism/Ng teetotaller/NgS!@_ tektite/NSg tel/~N +# tele # prefixes that are not also words in their own right don't belong in the dictionary telecast/~VG>SNgZ telecaster/Ng telecom/NgS @@ -49152,7 +49172,7 @@ tranquilizer/Ng tranquillise/Vd>SGZ!_ tranquilliser/Ng!_ tranquillity/Ng!_ -trans/~JNVi +trans/~JNVi( transact/VdGS transaction/~NSg transactional/J @@ -49372,6 +49392,7 @@ tress/NgSVE trestle/~NgS trews/N trey/~NgS +# tri # prefixes that are not also words in their own right don't belong in the dictionary triad/~NSg triage/NmgVd triager/NSg @@ -49900,7 +49921,7 @@ ulterior/J ultimate/~JYNgV ultimatum/~NgS ultimo/~JN -ultra/~JNSg +ultra/~JNSg( ultraconservative/JNSg ultrahigh/J ultraist/NSg @@ -49929,6 +49950,7 @@ umlaut/NgSV ump/NSgVGd umpire/~NgSVGd umpteen/H +# un # prefixes that are not also words in their own right don't belong in the dictionary unabridged/~JNgS unacceptability/Nmg unacceptable/~JN @@ -50026,7 +50048,7 @@ undecided/~JNSgV undefine/VGdS undemonstrative/JY undeniably/Ry -under/~PJN +under/~PJN( underachieve/VGd>SLZ underachiever/Ng underact/VSdG @@ -50259,6 +50281,7 @@ unhealthy/~J^ unhistorical/J unholy/~J^ unhurt/J +# uni # prefixes that are not also words in their own right don't belong in the dictionary unibody/NSg unicameral/~J unicellular/JN @@ -51037,7 +51060,7 @@ vicar/~NSg vicarage/~NSg vicarious/JYp vicariousness/Ng -vice/~NgSVJPe +vice/~NgSVJPe( viced/JVtT vicegerent/NSgJ vicennial/JN @@ -53414,7 +53437,7 @@ pentest/VSdG pentester/NSg # penetration tester pentesting/NmgV6 postfix/NgSVdG -pre/~PNV # !! please check and comment !! dictionaries only list prefix pre- +pre/~PNV( # !! please check and comment !! dictionaries only list prefix pre- preshared/J quadtree/NgS # data structure quicksort/NgSVdG # algo diff --git a/harper-core/src/dict_word_metadata.rs b/harper-core/src/dict_word_metadata.rs index 6f941fcf..a09cd54c 100644 --- a/harper-core/src/dict_word_metadata.rs +++ b/harper-core/src/dict_word_metadata.rs @@ -18,12 +18,20 @@ use crate::{Document, TokenKind, TokenStringExt}; /// having their own lexeme, but "Ivy" and "ivy" sharing the same lexeme. #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)] pub struct DictWordMetadata { + /// The main parts of speech which have extra data. pub noun: Option, pub pronoun: Option, pub verb: Option, pub adjective: Option, pub adverb: Option, pub conjunction: Option, + pub determiner: Option, + pub affix: Option, + /// Parts of speech which don't have extra data. + /// Whether the word is a [preposition](https://www.merriam-webster.com/dictionary/preposition). + #[serde(default = "default_false")] + pub preposition: bool, + /// Whether the word is an offensive word. pub swear: Option, /// The dialects this word belongs to. /// If no dialects are defined, it can be assumed that the word is @@ -33,11 +41,6 @@ pub struct DictWordMetadata { /// Orthographic information: letter case, spaces, hyphens, etc. #[serde(default = "OrthFlags::empty")] pub orth_info: OrthFlags, - /// Whether the word is a [determiner](https://en.wikipedia.org/wiki/English_determiners). - pub determiner: Option, - /// Whether the word is a [preposition](https://www.merriam-webster.com/dictionary/preposition). - #[serde(default = "default_false")] - pub preposition: bool, /// Whether the word is considered especially common. #[serde(default = "default_false")] pub common: bool, @@ -189,11 +192,12 @@ impl DictWordMetadata { adjective: merge!(self.adjective, other.adjective), adverb: merge!(self.adverb, other.adverb), conjunction: merge!(self.conjunction, other.conjunction), + determiner: merge!(self.determiner, other.determiner), + affix: merge!(self.affix, other.affix), + preposition: self.preposition || other.preposition, dialects: self.dialects | other.dialects, orth_info: self.orth_info | other.orth_info, swear: self.swear.or(other.swear), - determiner: merge!(self.determiner, other.determiner), - preposition: self.preposition || other.preposition, common: self.common || other.common, derived_from: self.derived_from.or(other.derived_from), pos_tag: self.pos_tag.or(other.pos_tag), @@ -234,6 +238,7 @@ impl DictWordMetadata { self.adverb = None; self.conjunction = None; self.determiner = None; + self.affix = None; self.preposition = false; } PROPN => { @@ -259,6 +264,7 @@ impl DictWordMetadata { self.adverb = None; self.conjunction = None; self.determiner = None; + self.affix = None; self.preposition = false; } PRON => { @@ -272,6 +278,7 @@ impl DictWordMetadata { self.adverb = None; self.conjunction = None; self.determiner = None; + self.affix = None; self.preposition = false; } VERB => { @@ -293,6 +300,7 @@ impl DictWordMetadata { self.adverb = None; self.conjunction = None; self.determiner = None; + self.affix = None; self.preposition = false; } AUX => { @@ -314,6 +322,7 @@ impl DictWordMetadata { self.adverb = None; self.conjunction = None; self.determiner = None; + self.affix = None; self.preposition = false; } ADJ => { @@ -327,6 +336,7 @@ impl DictWordMetadata { self.adverb = None; self.conjunction = None; self.determiner = None; + self.affix = None; self.preposition = false; } ADV => { @@ -340,6 +350,7 @@ impl DictWordMetadata { self.adjective = None; self.conjunction = None; self.determiner = None; + self.affix = None; self.preposition = false; } ADP => { @@ -350,6 +361,7 @@ impl DictWordMetadata { self.adverb = None; self.conjunction = None; self.determiner = None; + self.affix = None; self.preposition = true; } DET => { @@ -359,6 +371,7 @@ impl DictWordMetadata { self.adjective = None; self.adverb = None; self.conjunction = None; + self.affix = None; self.preposition = false; self.determiner = Some(DeterminerData::default()); } @@ -373,6 +386,7 @@ impl DictWordMetadata { self.adjective = None; self.adverb = None; self.determiner = None; + self.affix = None; self.preposition = false; } _ => {} @@ -958,6 +972,22 @@ impl ConjunctionData { } } +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)] +pub struct AffixData { + pub is_prefix: Option, + pub is_suffix: Option, +} + +impl AffixData { + /// Produce a copy of `self` with the known properties of `other` set. + pub fn or(&self, _other: &Self) -> Self { + Self { + is_prefix: self.is_prefix.or(_other.is_prefix), + is_suffix: self.is_suffix.or(_other.is_suffix), + } + } +} + /// A regional dialect. /// /// Note: these have bit-shifted values so that they can ergonomically integrate with