mirror of
https://github.com/Automattic/harper.git
synced 2025-12-23 08:48:15 +00:00
feat: try to detect space in wrong spot between words
This commit is contained in:
parent
0dc6738905
commit
a7f8eb42fa
6 changed files with 242 additions and 6 deletions
|
|
@ -1,4 +1,4 @@
|
|||
54100
|
||||
54200
|
||||
|
||||
# Start of original dictionary import
|
||||
# combined with dialect spelling dictionary import.
|
||||
|
|
@ -40931,7 +40931,6 @@ quote's
|
|||
quotidian/JN
|
||||
quotient/~NSg
|
||||
qwerty/J # dictionaries prefer QWERTY
|
||||
r/~VS # !! 'r' is a verb??
|
||||
rabbet/NgSVGd
|
||||
rabbi/~NSg
|
||||
rabbinate/Ng
|
||||
|
|
|
|||
|
|
@ -185,6 +185,7 @@ use super::throw_rubbish::ThrowRubbish;
|
|||
use super::to_adverb::ToAdverb;
|
||||
use super::to_two_too::ToTwoToo;
|
||||
use super::touristic::Touristic;
|
||||
use super::transposed_space::TransposedSpace;
|
||||
use super::unclosed_quotes::UnclosedQuotes;
|
||||
use super::update_place_names::UpdatePlaceNames;
|
||||
use super::use_genitive::UseGenitive;
|
||||
|
|
@ -706,6 +707,9 @@ impl LintGroup {
|
|||
);
|
||||
out.config.set_rule_enabled("DisjointPrefixes", true);
|
||||
|
||||
out.add_chunk_expr_linter("TransposedSpace", TransposedSpace::new(dictionary.clone()));
|
||||
out.config.set_rule_enabled("TransposedSpace", true);
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -198,6 +198,7 @@ mod throw_rubbish;
|
|||
mod to_adverb;
|
||||
mod to_two_too;
|
||||
mod touristic;
|
||||
mod transposed_space;
|
||||
mod unclosed_quotes;
|
||||
mod update_place_names;
|
||||
mod use_genitive;
|
||||
|
|
|
|||
181
harper-core/src/linting/transposed_space.rs
Normal file
181
harper-core/src/linting/transposed_space.rs
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
use crate::{
|
||||
Lint, Token, TokenStringExt,
|
||||
expr::{Expr, FirstMatchOf, SequenceExpr},
|
||||
linting::{ExprLinter, LintKind, Suggestion, expr_linter::Chunk},
|
||||
spell::Dictionary,
|
||||
};
|
||||
|
||||
pub struct TransposedSpace<D: Dictionary + 'static> {
|
||||
expr: Box<dyn Expr>,
|
||||
dict: D,
|
||||
}
|
||||
|
||||
impl<D: Dictionary + 'static> TransposedSpace<D> {
|
||||
pub fn new(dict: D) -> Self {
|
||||
Self {
|
||||
expr: Box::new(FirstMatchOf::new(vec![Box::new(
|
||||
SequenceExpr::default().then_oov().t_ws().then_oov(),
|
||||
)])),
|
||||
dict,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn sensitive(dict: D) -> Self {
|
||||
Self {
|
||||
expr: Box::new(FirstMatchOf::new(vec![
|
||||
Box::new(SequenceExpr::default().then_oov().t_ws().then_any_word()),
|
||||
Box::new(SequenceExpr::default().then_any_word().t_ws().then_oov()),
|
||||
Box::new(SequenceExpr::default().then_oov().t_ws().then_oov()),
|
||||
])),
|
||||
dict,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Appends `"<word1> <word2>"` (the two words joined by a single space) to
/// `values`, unless an identical string is already present.
///
/// Insertion order is preserved, so earlier candidates keep their priority.
fn keep_unique(values: &mut Vec<String>, word1: &[char], word2: &[char]) {
    // Assemble the candidate directly in one buffer instead of collecting each
    // word into its own temporary `String` and formatting a third one.
    // The capacity is a lower bound: non-ASCII chars occupy more than one byte.
    let mut value = String::with_capacity(word1.len() + word2.len() + 1);
    value.extend(word1);
    value.push(' ');
    value.extend(word2);

    if !values.contains(&value) {
        values.push(value);
    }
}
|
||||
|
||||
impl<D: Dictionary + 'static> ExprLinter for TransposedSpace<D> {
|
||||
type Unit = Chunk;
|
||||
|
||||
fn expr(&self) -> &dyn Expr {
|
||||
self.expr.as_ref()
|
||||
}
|
||||
|
||||
fn match_to_lint(&self, toks: &[Token], src: &[char]) -> Option<Lint> {
|
||||
let toks_span = toks.span()?;
|
||||
|
||||
// "thec" "at" / "th ecat"
|
||||
let word1 = toks.first()?.span.get_content(src);
|
||||
let word2 = toks.last()?.span.get_content(src);
|
||||
|
||||
// "thec" -> "the c"
|
||||
let w1_start = &word1[..word1.len() - 1];
|
||||
let w1_last = word1.iter().last()?;
|
||||
|
||||
// "ecat" -> "e cat"
|
||||
let w2_first = word2.first()?;
|
||||
let w2_end = &word2[1..];
|
||||
|
||||
// "c" + "at" -> "cat"
|
||||
let mut w1_last_plus_w2 = word2.to_vec();
|
||||
w1_last_plus_w2.insert(0, *w1_last);
|
||||
|
||||
// "th" + "e" -> "the"
|
||||
let mut w1_plus_w2_first = word1.to_vec();
|
||||
w1_plus_w2_first.push(*w2_first);
|
||||
|
||||
let mut values = vec![];
|
||||
|
||||
// "thec" "at" -> "the cat"
|
||||
if self.dict.contains_word(w1_start) && self.dict.contains_word(&w1_last_plus_w2) {
|
||||
let maybe_canon_w2 = self.dict.get_correct_capitalization_of(&w1_last_plus_w2);
|
||||
if let Some(canon_w1) = self.dict.get_correct_capitalization_of(w1_start) {
|
||||
if let Some(canon_w2) = maybe_canon_w2 {
|
||||
keep_unique(&mut values, canon_w1, canon_w2);
|
||||
} else {
|
||||
keep_unique(&mut values, canon_w1, &w1_last_plus_w2);
|
||||
}
|
||||
} else if let Some(canon_w2) = maybe_canon_w2 {
|
||||
keep_unique(&mut values, w1_start, canon_w2);
|
||||
}
|
||||
|
||||
keep_unique(&mut values, w1_start, &w1_last_plus_w2);
|
||||
}
|
||||
|
||||
// "th" "ecat" -> "the cat"
|
||||
if self.dict.contains_word(&w1_plus_w2_first) && self.dict.contains_word(w2_end) {
|
||||
let maybe_canon_w2 = self.dict.get_correct_capitalization_of(w2_end);
|
||||
if let Some(canon_w1) = self.dict.get_correct_capitalization_of(&w1_plus_w2_first) {
|
||||
if let Some(canon_w2) = maybe_canon_w2 {
|
||||
keep_unique(&mut values, canon_w1, canon_w2);
|
||||
} else {
|
||||
keep_unique(&mut values, canon_w1, w2_end);
|
||||
}
|
||||
} else if let Some(canon_w2) = maybe_canon_w2 {
|
||||
keep_unique(&mut values, &w1_plus_w2_first, canon_w2);
|
||||
}
|
||||
|
||||
keep_unique(&mut values, &w1_plus_w2_first, w2_end);
|
||||
}
|
||||
|
||||
if values.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let suggestions = values
|
||||
.iter()
|
||||
.map(|value| {
|
||||
Suggestion::replace_with_match_case(
|
||||
value.chars().collect(),
|
||||
toks_span.get_content(src),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
Some(Lint {
|
||||
span: toks_span,
|
||||
lint_kind: LintKind::Typo,
|
||||
suggestions,
|
||||
message: format!(
|
||||
"Is the space between `{}` and `{}` one character out of place?",
|
||||
word1.iter().collect::<String>(),
|
||||
word2.iter().collect::<String>()
|
||||
),
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
|
||||
fn description(&self) -> &str {
|
||||
"Looks for a space one character too early or too late between words."
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::TransposedSpace;
    use crate::{linting::tests::assert_suggestion_result, spell::FstDictionary};

    // `sensitive` linter: the space sits one character before the word boundary.
    #[test]
    fn space_too_early() {
        let linter = TransposedSpace::sensitive(FstDictionary::curated());
        let input = "Th ecat sat on the mat.";
        let expected = "The cat sat on the mat.";

        assert_suggestion_result(input, linter, expected);
    }

    // `sensitive` linter: the space sits one character after the word boundary.
    #[test]
    fn space_too_late() {
        let linter = TransposedSpace::sensitive(FstDictionary::curated());
        let input = "Thec at sat on the mat.";
        let expected = "The cat sat on the mat.";

        assert_suggestion_result(input, linter, expected);
    }

    // Default linter: space one character early in the middle of a sentence.
    #[test]
    fn test_early() {
        let linter = TransposedSpace::new(FstDictionary::curated());
        let input = "Sometimes the spac eis one character early.";
        let expected = "Sometimes the space is one character early.";

        assert_suggestion_result(input, linter, expected);
    }

    // Default linter: space one character late at the start of a sentence.
    #[test]
    fn test_late() {
        let linter = TransposedSpace::new(FstDictionary::curated());
        let input = "Ands ometimes the space is a character late.";
        let expected = "And sometimes the space is a character late.";

        assert_suggestion_result(input, linter, expected);
    }
}
|
||||
|
|
@ -6197,6 +6197,37 @@ Suggest:
|
|||
|
||||
|
||||
|
||||
Lint: Capitalization (127 priority)
|
||||
Message: |
|
||||
4486 | “No, r—” corrected the man, “M-a-v-r-o———”
|
||||
| ^ This word's canonical spelling is all-caps.
|
||||
Suggest:
|
||||
- Replace with: “R”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
4486 | “No, r—” corrected the man, “M-a-v-r-o———”
|
||||
| ^ Did you mean to spell `r` this way?
|
||||
Suggest:
|
||||
- Replace with: “re”
|
||||
- Replace with: “a”
|
||||
- Replace with: “e”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
4486 | “No, r—” corrected the man, “M-a-v-r-o———”
|
||||
| ^ Did you mean to spell `r` this way?
|
||||
Suggest:
|
||||
- Replace with: “re”
|
||||
- Replace with: “a”
|
||||
- Replace with: “e”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
4486 | “No, r—” corrected the man, “M-a-v-r-o———”
|
||||
|
|
@ -6208,6 +6239,26 @@ Suggest:
|
|||
|
||||
|
||||
|
||||
Lint: Capitalization (127 priority)
|
||||
Message: |
|
||||
4490 | “r—” said the policeman, “o———”
|
||||
| ^ This word's canonical spelling is all-caps.
|
||||
Suggest:
|
||||
- Replace with: “R”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
4490 | “r—” said the policeman, “o———”
|
||||
| ^ Did you mean to spell `r` this way?
|
||||
Suggest:
|
||||
- Replace with: “re”
|
||||
- Replace with: “a”
|
||||
- Replace with: “e”
|
||||
|
||||
|
||||
|
||||
Lint: Capitalization (127 priority)
|
||||
Message: |
|
||||
4490 | “r—” said the policeman, “o———”
|
||||
|
|
|
|||
|
|
@ -8968,16 +8968,16 @@
|
|||
# . NPr/VB/J/#r . D/P . NSg/P/#r . . D+ NSg+ VPt N🅪Sg/Vg/J . . . NPr/J/P . . . .
|
||||
>
|
||||
#
|
||||
> “ No , r — ” corrected the man , “ M - a - v - r - o — — — ”
|
||||
# . NPr/Dq/P . NPr/VB/J . . VP/J D+ NPr/VB/J+ . . NPr/VB/J/#r . D/P . NSg/P/#r . NPr/VB/J . NPr/J/P . . . .
|
||||
> “ No , r — ” corrected the man , “ M - a - v - r - o — — — ”
|
||||
# . NPr/Dq/P . NPr/J . . VP/J D+ NPr/VB/J+ . . NPr/VB/J/#r . D/P . NSg/P/#r . NPr/J . NPr/J/P . . . .
|
||||
>
|
||||
#
|
||||
> “ Listen to me ! ” muttered Tom fiercely .
|
||||
# . NSg/VB P NPr/ISg+ . . VP/J NPr/VB+ R .
|
||||
>
|
||||
#
|
||||
> “ r — ” said the policeman , “ o — — — ”
|
||||
# . NPr/VB/J . . VP/J D+ NSg+ . . NPr/J/P . . . .
|
||||
> “ r — ” said the policeman , “ o — — — ”
|
||||
# . NPr/J . . VP/J D+ NSg+ . . NPr/J/P . . . .
|
||||
>
|
||||
#
|
||||
> “ g — — — ”
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue