// harper/harper-core/src/token_kind.rs

use harper_brill::UPOS;
use is_macro::Is;
use serde::{Deserialize, Serialize};

use crate::{DictWordMetadata, Number, Punctuation, Quote, TokenKind::Word};

/// Generates `pub fn name(&self) -> bool` query methods that forward to the
/// same-named method on the inner [`DictWordMetadata`], attaching rustdoc to
/// every generated method.
///
/// Takes a comma-separated list of method names; a trailing comma is allowed.
/// A generated method yields `false` for anything other than a `Word` that
/// actually carries metadata (so a `Word(None)` also yields `false`).
macro_rules! delegate_to_metadata {
    ($($query:ident),* $(,)?) => {
        $(
            #[doc = concat!(
                "Delegates to [`DictWordMetadata::",
                stringify!($query),
                "`] when this token is a word.\n\n",
                "Returns `false` if the token is not a word."
            )]
            pub fn $query(&self) -> bool {
                match self {
                    Word(Some(word_meta)) => word_meta.$query(),
                    _ => false,
                }
            }
        )*
    };
}

/// The parsed value of a [`Token`](crate::Token).
/// Has a variety of queries available.
/// If there is a query missing, it may be easy to implement by just calling the
/// `delegate_to_metadata` macro.
///
/// Serialized by serde in its adjacently tagged form: the variant name is
/// stored under `"kind"` and the payload, if any, under `"value"`.
///
/// Variant order is significant: the derived `PartialOrd` compares variants in
/// declaration order, so do not reorder them casually.
#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)]
#[serde(tag = "kind", content = "value")]
pub enum TokenKind {
    /// A word, together with its dictionary metadata.
    /// `None` if the word does not exist in the dictionary.
    Word(Option<DictWordMetadata>),
    /// A punctuation mark; the payload identifies which one.
    Punctuation(Punctuation),
    // NOTE(review): presumably a decade expression such as "1990s" — the lexer
    // that produces this variant is not visible here; confirm.
    Decade,
    /// A numeric value.
    Number(Number),
    /// A sequence of " " spaces.
    // NOTE(review): the `usize` is presumably the number of spaces — confirm.
    Space(usize),
    /// A sequence of "\n" newlines.
    // NOTE(review): the `usize` is presumably the number of newlines — confirm.
    Newline(usize),
    /// An email address, kept as a single token.
    EmailAddress,
    /// A URL, kept as a single token.
    Url,
    /// A hostname, kept as a single token.
    Hostname,
    /// A special token used for things like inline code blocks that should be
    /// ignored by all linters.
    ///
    /// This is the variant produced by `TokenKind::default()`.
    #[default]
    Unlintable,
    /// A break between paragraphs. Counts as a sentence terminator.
    ParagraphBreak,
    // NOTE(review): meaning not evident from this file (name suggests
    // regex-like text) — confirm against the parser that emits it.
    Regexish,
}

impl TokenKind {
    // DictWord metadata delegation methods grouped by part of speech
    delegate_to_metadata! {
        // Nominal methods (nouns and pronouns)
        is_nominal,
        is_noun,
        is_pronoun,
        is_proper_noun,
        is_singular_nominal,
        is_plural_nominal,
        is_possessive_nominal,
        is_non_plural_nominal,
        is_singular_noun,
        is_plural_noun,
        is_non_plural_noun,
        is_countable_noun,
        is_non_countable_noun,
        is_mass_noun,
        is_mass_noun_only,
        is_non_mass_noun,
        is_singular_pronoun,
        is_plural_pronoun,
        is_non_plural_pronoun,
        is_reflexive_pronoun,
        is_personal_pronoun,
        is_first_person_singular_pronoun,
        is_first_person_plural_pronoun,
        is_second_person_pronoun,
        is_third_person_pronoun,
        is_third_person_singular_pronoun,
        is_third_person_plural_pronoun,
        is_subject_pronoun,
        is_object_pronoun,
        is_possessive_noun,
        is_possessive_pronoun,

        // Verb methods
        is_verb,
        is_auxiliary_verb,
        is_linking_verb,
        is_verb_lemma,
        is_verb_past_form,
        is_verb_simple_past_form,
        is_verb_past_participle_form,
        is_verb_progressive_form,
        is_verb_third_person_singular_present_form,

        // Adjective methods
        is_adjective,
        is_comparative_adjective,
        is_superlative_adjective,
        is_positive_adjective,

        // Adverb methods
        is_adverb,
        is_manner_adverb,
        is_frequency_adverb,
        is_degree_adverb,

        // Determiner methods
        is_determiner,
        is_demonstrative_determiner,
        is_possessive_determiner,
        is_quantifier,
        is_non_quantifier_determiner,
        is_non_demonstrative_determiner,

        // Conjunction methods
        is_conjunction,

        // Generic word methods
        is_swear,
        is_likely_homograph,

        // Orthography methods
        is_lowercase,
        is_titlecase,
        is_allcaps,
        is_lower_camel,
        is_upper_camel,
        is_apostrophized,

        is_roman_numerals
    }

    // DictWord metadata delegation methods not generated by macro
    pub fn is_preposition(&self) -> bool {
        let Word(Some(metadata)) = self else {
            return false;
        };
        metadata.preposition
    }

    // Generic word is-methods

    pub fn is_common_word(&self) -> bool {
        let Word(Some(metadata)) = self else {
            return true;
        };
        metadata.common
    }

    /// Checks whether the token is a member of a nominal phrase.
    pub fn is_np_member(&self) -> bool {
        let Word(Some(metadata)) = self else {
            return false;
        };
        metadata.np_member.unwrap_or(false)
    }

    /// Checks whether a word token is out-of-vocabulary (not found in the dictionary).
    ///
    /// Returns `true` if the token is a word that was not found in the dictionary,
    /// `false` if the token is a word found in the dictionary or is not a word token.
    pub fn is_oov(&self) -> bool {
        matches!(self, TokenKind::Word(None))
    }

    // Punctuation and symbol is-methods

    pub fn is_open_square(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare))
    }

    pub fn is_close_square(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::CloseSquare))
    }

    pub fn is_open_round(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::OpenRound))
    }

    pub fn is_close_round(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::CloseRound))
    }

    pub fn is_pipe(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Pipe))
    }

    pub fn is_currency(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Currency(..)))
    }

    pub fn is_ellipsis(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis))
    }

    pub fn is_hyphen(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
    }

    pub fn is_quote(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Quote(_)))
    }

    pub fn is_apostrophe(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
    }

    pub fn is_period(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Period))
    }

    pub fn is_at(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::At))
    }

    pub fn is_comma(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Comma))
    }

    pub fn is_semicolon(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Semicolon))
    }

    pub fn is_ampersand(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Ampersand))
    }

    pub fn is_slash(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::ForwardSlash))
    }

    // Miscellaneous is-methods

    /// Checks whether a token is word-like--meaning it is more complex than punctuation and can
    /// hold semantic meaning in the way a word does.
    pub fn is_word_like(&self) -> bool {
        matches!(
            self,
            TokenKind::Word(..)
                | TokenKind::EmailAddress
                | TokenKind::Hostname
                | TokenKind::Decade
                | TokenKind::Number(..)
        )
    }

    pub(crate) fn is_chunk_terminator(&self) -> bool {
        if self.is_sentence_terminator() {
            return true;
        }

        match self {
            TokenKind::Punctuation(punct) => {
                matches!(
                    punct,
                    Punctuation::Comma | Punctuation::Quote { .. } | Punctuation::Colon
                )
            }
            _ => false,
        }
    }

    pub(crate) fn is_sentence_terminator(&self) -> bool {
        match self {
            TokenKind::Punctuation(punct) => [
                Punctuation::Period,
                Punctuation::Bang,
                Punctuation::Question,
            ]
            .contains(punct),
            TokenKind::ParagraphBreak => true,
            _ => false,
        }
    }

    /// Used by `crate::parsers::CollapseIdentifiers`
    /// TODO: Separate this into two functions and add OR functionality to
    /// pattern matching
    pub fn is_case_separator(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
            || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
    }

    /// Checks whether the token is whitespace.
    pub fn is_whitespace(&self) -> bool {
        matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
    }

    pub fn is_upos(&self, upos: UPOS) -> bool {
        let Some(Some(meta)) = self.as_word() else {
            return false;
        };

        meta.pos_tag == Some(upos)
    }

    // Miscellaneous non-is methods

    /// Checks that `self` is the same enum variant as `other`, regardless of
    /// whether the inner metadata is also equal.
    pub fn matches_variant_of(&self, other: &Self) -> bool {
        self.with_default_data() == other.with_default_data()
    }

    /// Produces a copy of `self` with any inner data replaced with its default
    /// value. Useful for making comparisons on just the variant of the
    /// enum.
    pub fn with_default_data(&self) -> Self {
        match self {
            TokenKind::Word(_) => TokenKind::Word(Default::default()),
            TokenKind::Punctuation(_) => TokenKind::Punctuation(Default::default()),
            TokenKind::Number(..) => TokenKind::Number(Default::default()),
            TokenKind::Space(_) => TokenKind::Space(Default::default()),
            TokenKind::Newline(_) => TokenKind::Newline(Default::default()),
            _ => self.clone(),
        }
    }

    /// Construct a [`TokenKind::Word`] with no metadata.
    pub fn blank_word() -> Self {
        Self::Word(None)
    }

    // Punctuation and symbol non-is methods

    pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
        self.as_mut_punctuation()?.as_mut_quote()
    }

    pub fn as_quote(&self) -> Option<&Quote> {
        self.as_punctuation()?.as_quote()
    }
}

#[cfg(test)]
mod tests {
    use crate::Document;

    use super::TokenKind;

    /// Parses `text` as a curated plain-English document and returns the kind
    /// of its first token.
    fn first_kind(text: &str) -> TokenKind {
        Document::new_plain_english_curated(text)
            .tokens()
            .next()
            .expect("document should contain at least one token")
            .kind
            .clone()
    }

    #[test]
    fn car_is_singular_noun() {
        assert!(first_kind("car").is_singular_noun());
    }

    #[test]
    fn traffic_is_mass_noun_only() {
        assert!(first_kind("traffic").is_mass_noun_only());
    }

    #[test]
    fn equipment_is_mass_noun() {
        assert!(first_kind("equipment").is_mass_noun());
    }

    #[test]
    fn equipment_is_non_countable_noun() {
        assert!(first_kind("equipment").is_non_countable_noun());
    }

    #[test]
    fn equipment_isnt_countable_noun() {
        assert!(!first_kind("equipment").is_countable_noun());
    }

    #[test]
    fn oov_word_is_oov() {
        assert!(first_kind("nonexistentword").is_oov());
    }

    #[test]
    fn known_word_is_not_oov() {
        assert!(!first_kind("car").is_oov());
    }

    #[test]
    fn non_word_tokens_are_not_oov() {
        let doc = Document::new_plain_english_curated("Hello, world!");

        // Pick the non-word tokens by kind rather than by position: whitespace
        // is tokenized too, so fixed indices are easy to get wrong (index 3 of
        // this sentence is the word "world", not the exclamation mark).
        let non_words: Vec<_> = doc
            .tokens()
            .filter(|tok| !matches!(tok.kind, TokenKind::Word(_)))
            .collect();

        // Make sure the punctuation we care about is really being checked.
        assert!(non_words.iter().any(|tok| tok.kind.is_comma()));
        assert!(non_words.iter().any(|tok| tok.kind.is_sentence_terminator()));

        // Neither the comma, the whitespace, nor the exclamation mark is OOV.
        for tok in &non_words {
            assert!(!tok.kind.is_oov(), "{:?} should not be OOV", tok.kind);
        }
    }
}