// Mirror of https://github.com/Automattic/harper.git,
// synced 2025-12-23 08:48:15 +00:00 (405 lines, 12 KiB, Rust).
use harper_brill::UPOS;
|
|
use is_macro::Is;
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
use crate::{DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word};
|
|
|
|
/// Generate wrapper code to pass a function call to the inner [`DictWordMetadata`],
/// if the token is indeed a word, while also emitting method-level documentation.
///
/// Accepts a comma-separated list of method names (a trailing comma is allowed).
/// For each name it emits a `pub fn name(&self) -> bool` that forwards to the
/// method of the same name on the word's metadata. The invocation must sit inside
/// an `impl` block whose `self` can be matched against `Word(Some(_))`.
macro_rules! delegate_to_metadata {
    ($($method:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($method),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word or has no dictionary metadata."
            )]
            pub fn $method(&self) -> bool {
                // Anything other than a word that carries metadata has nothing
                // to delegate to, so the answer is always `false`.
                let Word(Some(metadata)) = self else {
                    return false;
                };
                metadata.$method()
            }
        )*
    };
}
|
|
|
|
/// The parsed value of a [`Token`](crate::Token).
|
|
/// Has a variety of queries available.
|
|
/// If there is a query missing, it may be easy to implement by just calling the
|
|
/// `delegate_to_metadata` macro.
|
|
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
|
|
#[serde(tag = "kind", content = "value")]
|
|
pub enum TokenKind {
|
|
/// `None` if the word does not exist in the dictionary.
|
|
Word(Option<DictWordMetadata>),
|
|
Punctuation(Punctuation),
|
|
Decade,
|
|
Number(Number),
|
|
/// A sequence of " " spaces.
|
|
Space(usize),
|
|
/// A sequence of "\n" newlines
|
|
Newline(usize),
|
|
EmailAddress,
|
|
Url,
|
|
Hostname,
|
|
/// A special token used for things like inline code blocks that should be
|
|
/// ignored by all linters.
|
|
#[default]
|
|
Unlintable,
|
|
ParagraphBreak,
|
|
Regexish,
|
|
}
|
|
|
|
impl TokenKind {
|
|
// DictWord metadata delegation methods grouped by part of speech
|
|
delegate_to_metadata! {
|
|
// Nominal methods (nouns and pronouns)
|
|
is_nominal,
|
|
is_noun,
|
|
is_pronoun,
|
|
is_proper_noun,
|
|
is_singular_nominal,
|
|
is_plural_nominal,
|
|
is_possessive_nominal,
|
|
is_non_plural_nominal,
|
|
is_singular_noun,
|
|
is_plural_noun,
|
|
is_non_plural_noun,
|
|
is_countable_noun,
|
|
is_non_countable_noun,
|
|
is_mass_noun,
|
|
is_mass_noun_only,
|
|
is_non_mass_noun,
|
|
is_singular_pronoun,
|
|
is_plural_pronoun,
|
|
is_non_plural_pronoun,
|
|
is_reflexive_pronoun,
|
|
is_personal_pronoun,
|
|
is_first_person_singular_pronoun,
|
|
is_first_person_plural_pronoun,
|
|
is_second_person_pronoun,
|
|
is_third_person_pronoun,
|
|
is_third_person_singular_pronoun,
|
|
is_third_person_plural_pronoun,
|
|
is_subject_pronoun,
|
|
is_object_pronoun,
|
|
is_possessive_noun,
|
|
is_possessive_pronoun,
|
|
|
|
// Verb methods
|
|
is_verb,
|
|
is_auxiliary_verb,
|
|
is_linking_verb,
|
|
is_verb_lemma,
|
|
is_verb_past_form,
|
|
is_verb_simple_past_form,
|
|
is_verb_past_participle_form,
|
|
is_verb_progressive_form,
|
|
is_verb_third_person_singular_present_form,
|
|
|
|
// Adjective methods
|
|
is_adjective,
|
|
is_comparative_adjective,
|
|
is_superlative_adjective,
|
|
is_positive_adjective,
|
|
|
|
// Adverb methods
|
|
is_adverb,
|
|
is_manner_adverb,
|
|
is_frequency_adverb,
|
|
is_degree_adverb,
|
|
|
|
// Determiner methods
|
|
is_determiner,
|
|
is_demonstrative_determiner,
|
|
is_possessive_determiner,
|
|
is_quantifier,
|
|
is_non_quantifier_determiner,
|
|
is_non_demonstrative_determiner,
|
|
|
|
// Conjunction methods
|
|
is_conjunction,
|
|
|
|
// Generic word methods
|
|
is_swear,
|
|
is_likely_homograph,
|
|
|
|
// Orthography methods
|
|
is_lowercase,
|
|
is_titlecase,
|
|
is_allcaps,
|
|
is_lower_camel,
|
|
is_upper_camel,
|
|
is_apostrophized,
|
|
|
|
is_roman_numerals
|
|
}
|
|
|
|
// DictWord metadata delegation methods not generated by macro
|
|
pub fn is_preposition(&self) -> bool {
|
|
let Word(Some(metadata)) = self else {
|
|
return false;
|
|
};
|
|
metadata.preposition
|
|
}
|
|
|
|
// Generic word is-methods
|
|
|
|
pub fn is_common_word(&self) -> bool {
|
|
let Word(Some(metadata)) = self else {
|
|
return true;
|
|
};
|
|
metadata.common
|
|
}
|
|
|
|
/// Checks whether the token is a member of a nominal phrase.
|
|
pub fn is_np_member(&self) -> bool {
|
|
let Word(Some(metadata)) = self else {
|
|
return false;
|
|
};
|
|
metadata.np_member.unwrap_or(false)
|
|
}
|
|
|
|
/// Checks whether a word token is out-of-vocabulary (not found in the dictionary).
|
|
///
|
|
/// Returns `true` if the token is a word that was not found in the dictionary,
|
|
/// `false` if the token is a word found in the dictionary or is not a word token.
|
|
pub fn is_oov(&self) -> bool {
|
|
matches!(self, TokenKind::Word(None))
|
|
}
|
|
|
|
// Punctuation and symbol is-methods
|
|
|
|
pub fn is_open_square(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
|
|
}
|
|
|
|
pub fn is_close_square(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
|
|
}
|
|
|
|
pub fn is_open_round(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
|
|
}
|
|
|
|
pub fn is_close_round(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
|
|
}
|
|
|
|
pub fn is_pipe(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
|
|
}
|
|
|
|
pub fn is_currency(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
|
|
}
|
|
|
|
pub fn is_ellipsis(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
|
|
}
|
|
|
|
pub fn is_hyphen(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
|
|
}
|
|
|
|
pub fn is_quote(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
|
|
}
|
|
|
|
pub fn is_apostrophe(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
|
|
}
|
|
|
|
pub fn is_period(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Period))
|
|
}
|
|
|
|
pub fn is_at(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::At))
|
|
}
|
|
|
|
pub fn is_comma(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Comma))
|
|
}
|
|
|
|
pub fn is_semicolon(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
|
|
}
|
|
|
|
pub fn is_ampersand(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
|
|
}
|
|
|
|
pub fn is_slash(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
|
|
}
|
|
|
|
// Miscellaneous is-methods
|
|
|
|
/// Checks whether a token is word-like--meaning it is more complex than punctuation and can
|
|
/// hold semantic meaning in the way a word does.
|
|
pub fn is_word_like(&self) -> bool {
|
|
matches!(
|
|
self,
|
|
TokenKind::Word(..)
|
|
| TokenKind::EmailAddress
|
|
| TokenKind::Hostname
|
|
| TokenKind::Decade
|
|
| TokenKind::Number(..)
|
|
)
|
|
}
|
|
|
|
pub(crate) fn is_chunk_terminator(&self) -> bool {
|
|
if self.is_sentence_terminator() {
|
|
return true;
|
|
}
|
|
|
|
match self {
|
|
TokenKind::Punctuation(punct) => {
|
|
matches!(
|
|
punct,
|
|
Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
|
|
)
|
|
}
|
|
_ => false,
|
|
}
|
|
}
|
|
|
|
pub(crate) fn is_sentence_terminator(&self) -> bool {
|
|
match self {
|
|
TokenKind::Punctuation(punct) => [
|
|
Punctuation::Period,
|
|
Punctuation::Bang,
|
|
Punctuation::Question,
|
|
]
|
|
.contains(punct),
|
|
TokenKind::ParagraphBreak => true,
|
|
_ => false,
|
|
}
|
|
}
|
|
|
|
/// Used by `crate::parsers::CollapseIdentifiers`
|
|
/// TODO: Separate this into two functions and add OR functionality to
|
|
/// pattern matching
|
|
pub fn is_case_separator(&self) -> bool {
|
|
matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
|
|
|| matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
|
|
}
|
|
|
|
/// Checks whether the token is whitespace.
|
|
pub fn is_whitespace(&self) -> bool {
|
|
matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
|
|
}
|
|
|
|
pub fn is_upos(&self, upos: UPOS) -> bool {
|
|
let Some(Some(meta)) = self.as_word() else {
|
|
return false;
|
|
};
|
|
|
|
meta.pos_tag == Some(upos)
|
|
}
|
|
|
|
// Miscellaneous non-is methods
|
|
|
|
/// Checks that `self` is the same enum variant as `other`, regardless of
|
|
/// whether the inner metadata is also equal.
|
|
pub fn matches_variant_of(&self, other: &Self) -> bool {
|
|
self.with_default_data() == other.with_default_data()
|
|
}
|
|
|
|
/// Produces a copy of `self` with any inner data replaced with its default
|
|
/// value. Useful for making comparisons on just the variant of the
|
|
/// enum.
|
|
pub fn with_default_data(&self) -> Self {
|
|
match self {
|
|
TokenKind::Word(_) => TokenKind::Word(Default::default()),
|
|
TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
|
|
TokenKind::Number(..) => TokenKind::Number(Default::default()),
|
|
TokenKind::Space(_) => TokenKind::Space(Default::default()),
|
|
TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
|
|
_ => self.clone(),
|
|
}
|
|
}
|
|
|
|
/// Construct a [`TokenKind::Word`] with no metadata.
|
|
pub fn blank_word() -> Self {
|
|
Self::Word(None)
|
|
}
|
|
|
|
// Punctuation and symbol non-is methods
|
|
|
|
pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
|
|
self.as_mut_punctuation()?.as_mut_quote()
|
|
}
|
|
|
|
pub fn as_quote(&self) -> Option<&Quote> {
|
|
self.as_punctuation()?.as_quote()
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use crate::Document;

    #[test]
    fn car_is_singular_noun() {
        let doc = Document::new_plain_english_curated("car");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        let doc = Document::new_plain_english_curated("traffic");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        let doc = Document::new_plain_english_curated("equipment");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(!tk.is_countable_noun());
    }

    #[test]
    fn oov_word_is_oov() {
        let doc = Document::new_plain_english_curated("nonexistentword");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(tk.is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        let doc = Document::new_plain_english_curated("car");
        let tk = &doc.tokens().next().unwrap().kind;
        assert!(!tk.is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Select tokens by kind rather than by hard-coded index, so that any
        // whitespace tokens in the stream cannot shift the punctuation out
        // from under the assertions.
        let non_words: Vec<_> = doc.tokens().filter(|t| !t.kind.is_word()).collect();

        // Guard against a vacuous pass: the comma and the exclamation mark
        // must both be among the tokens being checked.
        assert!(non_words.iter().any(|t| t.kind.is_comma()));
        assert!(non_words.iter().any(|t| t.kind.is_sentence_terminator()));

        for token in non_words {
            assert!(
                !token.kind.is_oov(),
                "{:?} should not be OOV",
                token.kind
            );
        }
    }
}
|