test: add snapshots for linters (#1228)

* test: add snapshots for linters

* fix(core): update snapshots

---------

Co-authored-by: Elijah Potter <me@elijahpotter.dev>
This commit is contained in:
Michael Schmidt 2025-05-06 20:29:52 +02:00 committed by GitHub
parent 08927bdaae
commit 663c729e46
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 14954 additions and 91 deletions

View file

@ -0,0 +1,237 @@
//! This test creates snapshots of the reports of all linters.
//!
//! # Usage
//!
//! To add a new snapshot, simply add the document to `tests/text` and run this
//! test. It will automatically create a new snapshot in `tests/text/linters`.
//! To update an existing snapshot, also just run this test.
//!
//! Note: This test will fail if the snapshot files are not up to date. This
//! ensures that CI will fail if linters change their behavior.
use harper_core::{
Dialect, Document, FstDictionary,
linting::{LintGroup, Linter},
};
mod snapshot;
/// A 0-based line/column position inside a source string.
struct LinePos {
    /// 0-based index of the line
    pub line: usize,
    /// 0-based index of the column
    pub col: usize,
}

/// The lines of a source string together with the char offset at which each
/// line starts, enabling offset → line/column conversion.
struct Lines<'a> {
    lines: Vec<&'a str>,
    offsets: Vec<usize>,
}

impl<'a> Lines<'a> {
    /// Splits `source` on `'\n'` and records the starting char offset of
    /// every line.
    fn new(source: &'a str) -> Lines<'a> {
        let lines: Vec<&str> = source.split('\n').collect();
        let mut offsets = Vec::with_capacity(lines.len());
        let mut offset = 0;
        for line in &lines {
            offsets.push(offset);
            // +1 accounts for the '\n' separator removed by `split`.
            offset += line.chars().count() + 1;
        }
        Lines { lines, offsets }
    }

    /// Number of lines in the source.
    fn len(&self) -> usize {
        self.lines.len()
    }

    /// Converts a char offset into the source into a line/column pair.
    fn get_pos(&self, offset: usize) -> LinePos {
        // `Err(i)` means the offset falls strictly inside line `i - 1`.
        let line = match self.offsets.binary_search(&offset) {
            Ok(i) => i,
            Err(i) => i - 1,
        };
        LinePos {
            line,
            col: offset - self.offsets[line],
        }
    }
}

impl<'a> std::ops::Index<usize> for Lines<'a> {
    type Output = &'a str;

    fn index(&self, index: usize) -> &Self::Output {
        &self.lines[index]
    }
}
/// Renders a compiler-style report for the char span `start..end` of the
/// source behind `lines`: optional surrounding context lines, the offending
/// line(s) prefixed with 1-based line numbers, and a `^~~~`-style underline
/// that carries `message`.
fn print_error(lines: &Lines, start: usize, end: usize, message: &str) -> String {
    let mut out = String::new();
    // Appends one numbered source line, right-aligning the number to 6 columns.
    fn print_line(out: &mut String, line: &str, number: usize) {
        out.push_str(&format!("{number:>6} | {line}\n"));
    }
    // Characters treated as ending a sentence when deciding whether a
    // neighboring line still belongs to the same sentence as the span.
    fn is_sentence_boundary(c: char) -> bool {
        matches!(c, '.' | '?' | '!' | ':' | ';')
    }
    // Prints the line before the span, but only when it plausibly continues
    // into the span's sentence (no sentence boundary in between) and the span
    // does not already start deep into its own line.
    fn print_pre_line_context(
        out: &mut String,
        context_line: &str,
        number: usize,
        line: &str,
        start_col: usize,
    ) {
        if context_line.is_empty() {
            return;
        }
        if start_col > 40 {
            // that's enough context
            return;
        }
        // `context_line` is non-empty here, so `last()` cannot fail.
        let last_char = context_line.chars().last().unwrap();
        let mut chars_before = line.chars().take(start_col);
        if !is_sentence_boundary(last_char) && !chars_before.any(is_sentence_boundary) {
            print_line(out, context_line, number);
        }
    }
    // Mirror image of `print_pre_line_context` for the line after the span.
    fn print_post_line_context(
        out: &mut String,
        context_line: &str,
        number: usize,
        line: &str,
        end_col: usize,
    ) {
        if context_line.is_empty() {
            return;
        }
        if end_col < 40 {
            // that's enough context
            return;
        }
        let mut chars_after = line.chars().skip(end_col);
        if !chars_after.any(is_sentence_boundary) {
            print_line(out, context_line, number);
        }
    }
    // Draws the marker line under a printed source line: `^` at the first
    // highlighted column (`~` when continuing a multi-line span), `~` through
    // `end_col`, then the optional message.
    fn print_underline(
        out: &mut String,
        start_col: usize,
        end_col: usize,
        continuation: bool,
        message: &str,
    ) {
        out.push_str(" | ");
        for _ in 0..start_col {
            out.push(' ');
        }
        out.push(if continuation { '~' } else { '^' });
        for _ in 0..end_col.saturating_sub(start_col) {
            out.push('~');
        }
        if !message.is_empty() {
            out.push(' ');
            out.push_str(message);
        }
        out.push('\n');
    }
    // `end` is exclusive, so the last highlighted char sits at `end - 1`.
    let start = lines.get_pos(start);
    let end = lines.get_pos(end - 1);
    if start.line > 0 {
        print_pre_line_context(
            &mut out,
            lines[start.line - 1],
            start.line,
            lines[start.line],
            start.col,
        );
    }
    if start.line == end.line {
        // Single-line span: one source line plus its underline.
        print_line(&mut out, lines[start.line], start.line + 1);
        print_underline(&mut out, start.col, end.col, false, message);
    } else {
        // Multi-line span: every intermediate line is underlined to its full
        // char width; the message is attached only to the final underline.
        for i in start.line..end.line {
            let line = lines[i];
            print_line(&mut out, line, i + 1);
            print_underline(
                &mut out,
                if i == start.line { start.col } else { 0 },
                line.chars().count(),
                i != start.line,
                "",
            );
        }
        print_line(&mut out, lines[end.line], end.line + 1);
        print_underline(&mut out, 0, end.col, true, message);
    }
    if end.line + 1 < lines.len() {
        print_post_line_context(
            &mut out,
            lines[end.line + 1],
            end.line + 2,
            lines[end.line],
            end.col,
        );
    }
    out
}
/// Snapshot test: lints every document under `tests/text` with the curated
/// American-English lint group and renders each lint (kind, priority,
/// annotated source excerpt, suggestions) into a `.snap.yml` snapshot.
#[test]
fn test_most_lints() {
    snapshot::snapshot_all_text_files("linters", ".snap.yml", |source| {
        let dict = FstDictionary::curated();
        let document = Document::new_markdown_default(source, &dict);
        let mut linter = LintGroup::new_curated(dict, Dialect::American);

        // Sort by span so the snapshot order is stable across runs.
        let mut lints = linter.lint(&document);
        lints.sort_by_key(|lint| (lint.span.start, lint.span.end));

        // split the input document into lines
        let lines = Lines::new(source);

        let mut report = String::new();
        for lint in lints {
            report.push_str(&format!(
                "Lint: {:?} ({} priority)\n",
                lint.lint_kind, lint.priority
            ));

            // Indent the rendered excerpt so it forms a YAML block scalar.
            let rendered = print_error(&lines, lint.span.start, lint.span.end, &lint.message);
            report.push_str("Message: |\n");
            for line in rendered.lines() {
                report.push_str(" ");
                report.push_str(line);
                report.push('\n');
            }

            if !lint.suggestions.is_empty() {
                report.push_str("Suggest:\n");
                for suggestion in &lint.suggestions {
                    report.push_str(&format!(" - {}\n", suggestion));
                }
            }
            report.push_str("\n\n\n");
        }
        report
    });
}

View file

@ -58,37 +58,11 @@
//! - [`TokenKind::Space`], [`TokenKind::Newline`], and
//! [`TokenKind::ParagraphBreak`] are ignored.
//! - All other token kinds are denoted by their variant name.
use std::{borrow::Cow, path::PathBuf};
use std::borrow::Cow;
use harper_core::{Degree, Document, FstDictionary, TokenKind, WordMetadata};
/// Absolute path to this crate's `tests` directory, resolved at compile time
/// from `CARGO_MANIFEST_DIR`.
fn get_tests_dir() -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests")
}
/// Directory holding the plain-text input documents (`tests/text`).
fn get_text_dir() -> PathBuf {
    get_tests_dir().join("text")
}
/// Directory where the POS-tagger snapshots are stored (`tests/text/tagged`).
fn get_snapshot_dir() -> PathBuf {
    get_tests_dir().join("text/tagged")
}
/// Collects every `.txt` and `.md` file directly inside `tests/text`.
///
/// Panics if the directory cannot be read or an entry's metadata is
/// unavailable; unreadable directory entries are silently skipped.
fn get_text_files() -> Vec<PathBuf> {
    let mut files = vec![];
    for entry in std::fs::read_dir(get_text_dir())
        .unwrap()
        .filter_map(|f| f.ok())
        .filter(|f| f.metadata().unwrap().is_file())
    {
        let path = entry.path();
        // Extension compared as an owned lowercase-sensitive string; files
        // with no extension yield "" and are skipped by the match below.
        let ext = path
            .extension()
            .map(|e| e.to_string_lossy().to_string())
            .unwrap_or_default();
        if matches!(ext.as_str(), "txt" | "md") {
            files.push(entry.path());
        }
    }
    files
}
mod snapshot;
fn format_word_tag(word: &WordMetadata) -> String {
// These tags are inspired by the Penn Treebank POS tagset
@ -258,70 +232,31 @@ impl Formatter {
}
}
/// Tokenizes `source` as Markdown with the curated dictionary and renders
/// each token through the `Formatter`, pairing token text with its POS tag.
fn tag_text(source: &str) -> String {
    let dict = FstDictionary::curated();
    // Normalize Windows line endings before parsing so output is
    // platform-independent.
    let document = Document::new_markdown_default(&source.replace("\r\n", "\n"), &dict);
    let mut formatter = Formatter::new();
    for token in document.fat_string_tokens() {
        match token.kind {
            TokenKind::Space(_) => { /* ignore */ }
            TokenKind::ParagraphBreak => {
                // A paragraph break renders as a blank line (two newlines).
                formatter.new_line();
                formatter.new_line();
            }
            TokenKind::Newline(_) => {
                formatter.new_line();
            }
            kind => {
                // All other token kinds are emitted with their tag.
                let text = &token.content;
                let tag = format_tag(&kind);
                formatter.add(text, &tag);
            }
        }
    }
    formatter.finish()
}
/// Tags the document at `path` and compares the result against its stored
/// snapshot in `tests/text/tagged`.
///
/// On mismatch — or when no snapshot exists yet — the freshly generated
/// snapshot is written to disk and an error is returned, so the test fails
/// once and passes on the next run.
fn tag_file(path: &PathBuf) -> Result<(), Box<dyn std::error::Error>> {
    // Normalize Windows line endings so snapshots are platform-independent.
    let source = std::fs::read_to_string(path)?.replace("\r\n", "\n");
    let tagged = tag_text(source.trim_end());
    // compare with snapshot
    let snapshot_name = path.file_stem().unwrap().to_string_lossy().to_string() + ".md";
    let snapshot_file = get_snapshot_dir().join(snapshot_name);
    let has_snapshot = snapshot_file.exists();
    if has_snapshot {
        let snapshot = std::fs::read_to_string(&snapshot_file)?;
        if tagged == snapshot {
            return Ok(());
        }
    }
    // write snapshot
    std::fs::create_dir_all(get_snapshot_dir())?;
    std::fs::write(snapshot_file, tagged)?;
    Err(if has_snapshot {
        "Snapshot mismatches!"
    } else {
        "No snapshot!"
    }
    .into())
}
#[test]
fn test_pos_tagger() {
let mut errors = 0;
for file in get_text_files() {
println!("Processing {}", file.display());
if let Err(e) = tag_file(&file) {
eprintln!("Error processing {}: {}", file.display(), e);
errors += 1;
snapshot::snapshot_all_text_files("tagged", ".md", |source| {
let dict = FstDictionary::curated();
let document = Document::new_markdown_default(source, &dict);
let mut formatter = Formatter::new();
for token in document.fat_string_tokens() {
match token.kind {
TokenKind::Space(_) => { /* ignore */ }
TokenKind::ParagraphBreak => {
formatter.new_line();
formatter.new_line();
}
TokenKind::Newline(_) => {
formatter.new_line();
}
kind => {
let text = &token.content;
let tag = format_tag(&kind);
formatter.add(text, &tag);
}
}
}
}
if errors > 0 {
panic!("{} errors occurred while processing files", errors);
}
formatter.finish()
});
}

View file

@ -0,0 +1,81 @@
use std::path::{Path, PathBuf};
fn get_tests_dir() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests")
}
/// Directory holding the plain-text input documents (`tests/text`).
fn get_text_dir() -> PathBuf {
    get_tests_dir().join("text")
}
/// Collects every `.txt` and `.md` file directly inside `tests/text`.
///
/// Unreadable directory entries are skipped, and entries whose metadata
/// cannot be read are treated as non-files instead of panicking. Panics only
/// if `tests/text` itself cannot be read.
pub fn get_text_files() -> Vec<PathBuf> {
    let mut files = vec![];
    for entry in std::fs::read_dir(get_text_dir())
        .unwrap()
        .filter_map(|f| f.ok())
        // `metadata().unwrap()` could panic if a file disappears between
        // listing and stat'ing; treat unreadable metadata as "not a file".
        .filter(|f| f.metadata().map(|m| m.is_file()).unwrap_or(false))
    {
        let path = entry.path();
        // Compare the extension without allocating an intermediate String;
        // non-UTF-8 extensions can never equal "txt"/"md" anyway.
        if matches!(
            path.extension().and_then(|e| e.to_str()),
            Some("txt" | "md")
        ) {
            // Reuse `path` instead of calling `entry.path()` a second time.
            files.push(path);
        }
    }
    files
}
/// Runs `create_snapshot` over the contents of `text_file` (with line endings
/// normalized and trailing whitespace trimmed) and compares the result
/// against `snapshot_file`.
///
/// Returns `Ok(())` when the stored snapshot matches. Otherwise the freshly
/// generated snapshot is written to disk and an error is returned, so the
/// test fails once and passes on the next run.
fn tag_file(
    text_file: &Path,
    snapshot_file: &Path,
    create_snapshot: impl Fn(&str) -> String,
) -> Result<(), Box<dyn std::error::Error>> {
    // Normalize Windows line endings so snapshots are platform-independent.
    let source = std::fs::read_to_string(text_file)?.replace("\r\n", "\n");
    let generated = create_snapshot(source.trim_end());

    let had_snapshot = snapshot_file.exists();
    if had_snapshot && std::fs::read_to_string(snapshot_file)? == generated {
        return Ok(());
    }

    // Stale or missing: (over)write the snapshot and report the failure.
    std::fs::write(snapshot_file, generated)?;
    let reason = if had_snapshot {
        "Snapshot mismatches!"
    } else {
        "No snapshot!"
    };
    Err(reason.into())
}
/// Maps a text file to its snapshot path: `<snapshot_dir>/<stem><ext>`.
///
/// Panics if `text_file` has no file stem (e.g. an empty path).
fn get_snapshot_file(text_file: &Path, snapshot_dir: &Path, ext: &str) -> PathBuf {
    let stem = text_file.file_stem().unwrap().to_string_lossy();
    snapshot_dir.join(format!("{stem}{ext}"))
}
/// Runs `create_snapshot` over every text file under `tests/text`, comparing
/// each result against its snapshot in `tests/text/<out_dir>` (file name:
/// input stem + `snapshot_ext`).
///
/// Missing or stale snapshots are (re)written; the function panics at the end
/// if any file mismatched, so a failing run updates everything at once.
#[allow(dead_code)]
pub fn snapshot_all_text_files(
    out_dir: &str,
    snapshot_ext: &str,
    create_snapshot: impl Copy + Fn(&str) -> String,
) {
    let snapshot_dir = get_text_dir().join(out_dir);
    std::fs::create_dir_all(&snapshot_dir).expect("Failed to create snapshot directory");

    let mut failures = 0;
    for text_file in get_text_files() {
        println!("Processing {}", text_file.display());
        let snapshot_file = get_snapshot_file(&text_file, &snapshot_dir, snapshot_ext);
        match tag_file(&text_file, &snapshot_file, create_snapshot) {
            Ok(()) => {}
            Err(e) => {
                eprintln!("Error processing {}: {}", text_file.display(), e);
                failures += 1;
            }
        }
    }
    if failures > 0 {
        panic!("{} errors occurred while processing files", failures);
    }
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,635 @@
Lint: Readability (127 priority)
Message: |
8 | In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 | POST), also called grammatical tagging is the process of marking up a word in a
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 | text (corpus) as corresponding to a particular part of speech, based on both its
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 | definition and its context. A simplified form of this is commonly taught to
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 46 words long.
Lint: Spelling (63 priority)
Message: |
8 | In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or
| ^~~ Did you mean to spell “POS” this way?
9 | POST), also called grammatical tagging is the process of marking up a word in a
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
8 | In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or
| ^~~ Did you mean to spell “PoS” this way?
9 | POST), also called grammatical tagging is the process of marking up a word in a
Suggest:
- Replace with: “Poe”
- Replace with: “PBS”
- Replace with: “PMS”
Lint: Spelling (63 priority)
Message: |
15 | Once performed by hand, POS tagging is now done in the context of computational
| ^~~ Did you mean to spell “POS” this way?
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
17 | parts of speech, by a set of descriptive tags. POS-tagging algorithms fall into
| ^~~ Did you mean to spell “POS” this way?
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
| ^~ Did you mean to spell “E.” this way?
Suggest:
- Replace with: “E”
- Replace with: “Ea”
- Replace with: “Ed”
Lint: Spelling (63 priority)
Message: |
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
| ^~~~~~~ Did you mean to spell “Brill's” this way?
19 | first and most widely used English POS-taggers, employs rule-based algorithms.
Suggest:
- Replace with: “Brillo's”
- Replace with: “Bill's”
- Replace with: “Trill's”
Lint: Spelling (63 priority)
Message: |
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
19 | first and most widely used English POS-taggers, employs rule-based algorithms.
| ^~~ Did you mean to spell “POS” this way?
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Readability (127 priority)
Message: |
33 | as the more common plural noun. Grammatical context is one way to determine
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | this; semantic analysis can also be used to infer that "sailor" and "hatch"
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
35 | implicate "dogs" as 1) in the nautical context and 2) an action applied to the
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
36 | object "hatch" (in this context, "dogs" is a nautical term meaning "fastens (a
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
37 | watertight door) securely").
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 49 words long.
Lint: Spelling (63 priority)
Message: |
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
| ^~ Did you mean to spell “NN” this way?
Suggest:
- Replace with: “Nun”
- Replace with: “Non”
- Replace with: “N1”
Lint: Spelling (63 priority)
Message: |
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
| ^~~ Did you mean to spell “NNS” this way?
50 | for singular proper nouns (see the POS tags used in the Brown Corpus). Other
Suggest:
- Replace with: “NBS”
- Replace with: “NES”
- Replace with: “NS”
Lint: Spelling (63 priority)
Message: |
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
| ^~ Did you mean to spell “NP” this way?
50 | for singular proper nouns (see the POS tags used in the Brown Corpus). Other
Suggest:
- Replace with: “N”
- Replace with: “Nap”
- Replace with: “Nip”
Lint: Spelling (63 priority)
Message: |
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
50 | for singular proper nouns (see the POS tags used in the Brown Corpus). Other
| ^~~ Did you mean to spell “POS” this way?
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
55 | 150 separate parts of speech for English. Work on stochastic methods for tagging
56 | Koine Greek (DeRose 1990) has used over 1,000 parts of speech and found that
| ^~~~~ Did you mean to spell “Koine” this way?
Suggest:
- Replace with: “Kine”
- Replace with: “Kline”
- Replace with: “Kane”
Lint: Spelling (63 priority)
Message: |
55 | 150 separate parts of speech for English. Work on stochastic methods for tagging
56 | Koine Greek (DeRose 1990) has used over 1,000 parts of speech and found that
| ^~~~~~ Did you mean to spell “DeRose” this way?
Suggest:
- Replace with: “Depose”
- Replace with: “Defoe”
- Replace with: “Denise”
Lint: Spelling (63 priority)
Message: |
57 | about as many words were ambiguous in that language as in English. A
58 | morphosyntactic descriptor in the case of morphologically rich languages is
| ^~~~~~~~~~~~~~~ Did you mean to spell “morphosyntactic” this way?
Suggest:
- Replace with: “morphosyntax's”
- Replace with: “morphosyntax”
Lint: Spelling (63 priority)
Message: |
58 | morphosyntactic descriptor in the case of morphologically rich languages is
| ^~~~~~~~~~~~~~~ Did you mean “morphological”?
59 | commonly expressed using very short mnemonics, such as Ncmsan for Category=Noun,
Suggest:
- Replace with: “morphological”
Lint: Spelling (63 priority)
Message: |
59 | commonly expressed using very short mnemonics, such as Ncmsan for Category=Noun,
| ^~~~~~ Did you mean to spell “Ncmsan” this way?
60 | Type = common, Gender = masculine, Number = singular, Case = accusative, Animate
Suggest:
- Replace with: “Nissan”
- Replace with: “Nisan”
- Replace with: “Nolan”
Lint: Spelling (63 priority)
Message: |
63 | The most popular "tag set" for POS tagging for American English is probably the
| ^~~ Did you mean to spell “POS” this way?
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
63 | The most popular "tag set" for POS tagging for American English is probably the
64 | Penn tag set, developed in the Penn Treebank project. It is largely similar to
| ^~~~~~~~ Did you mean to spell “Treebank” this way?
Suggest:
- Replace with: “Freeman”
- Replace with: “Reembark”
- Replace with: “Debank”
Lint: Spelling (63 priority)
Message: |
69 | POS tagging work has been done in a variety of languages, and the set of POS
| ^~~ Did you mean to spell “POS” this way?
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
69 | POS tagging work has been done in a variety of languages, and the set of POS
| ^~~ Did you mean to spell “POS” this way?
70 | tags used varies greatly with language. Tags usually are designed to include
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
74 | Greek and Latin can be very large; tagging words in agglutinative languages such
| ^~~~~~~~~~~~~ Did you mean to spell “agglutinative” this way?
75 | as Inuit languages may be virtually impossible. At the other extreme, Petrov et
Suggest:
- Replace with: “agglutinate”
- Replace with: “agglutinating”
- Replace with: “agglutination”
Lint: Spelling (63 priority)
Message: |
75 | as Inuit languages may be virtually impossible. At the other extreme, Petrov et
| ^~~~~~ Did you mean to spell “Petrov” this way?
76 | al. have proposed a "universal" tag set, with 12 categories (for example, no
Suggest:
- Replace with: “Petrol”
- Replace with: “Pedro”
- Replace with: “Peron”
Lint: Spelling (63 priority)
Message: |
75 | as Inuit languages may be virtually impossible. At the other extreme, Petrov et
| ^~~
76 | al. have proposed a "universal" tag set, with 12 categories (for example, no
| ~~~ Did you mean “et al.”?
Suggest:
- Replace with: “et al.”
Lint: Spelling (63 priority)
Message: |
86 | The first major corpus of English for computer analysis was the Brown Corpus
87 | developed at Brown University by Henry Kučera and W. Nelson Francis, in the
| ^~~~~~ Did you mean to spell “Kučera” this way?
Suggest:
- Replace with: “Kara”
- Replace with: “Kendra”
- Replace with: “Keri”
Lint: Spelling (63 priority)
Message: |
87 | developed at Brown University by Henry Kučera and W. Nelson Francis, in the
| ^~ Did you mean to spell “W.” this way?
Suggest:
- Replace with: “We”
- Replace with: “WA”
- Replace with: “WC”
Lint: Spelling (63 priority)
Message: |
98 | and corrected by hand, and later users sent in errata so that by the late 70s
| ^ Did you mean to spell “s” this way?
99 | the tagging was nearly perfect (allowing for some cases on which even human
Suggest:
- Replace with: “sf”
- Replace with: “sh”
- Replace with: “so”
Lint: Spelling (63 priority)
Message: |
105 | later part-of-speech tagging systems, such as CLAWS and VOLSUNGA. However, by
| ^~~~~~~~ Did you mean to spell “VOLSUNGA” this way?
Lint: Readability (127 priority)
Message: |
110 | For some time, part-of-speech tagging was considered an inseparable part of
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
111 | natural language processing, because there are certain cases where the correct
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
112 | part of speech cannot be decided without understanding the semantics or even the
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
113 | pragmatics of the context. This is extremely expensive, especially because
| ~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 41 words long.
Lint: Spelling (63 priority)
Message: |
119 | In the mid-1980s, researchers in Europe began to use hidden Markov models (HMMs)
| ^~~~ Did you mean to spell “HMMs” this way?
120 | to disambiguate parts of speech, when working to tag the Lancaster-Oslo-Bergen
Suggest:
- Replace with: “Hams”
- Replace with: “Ha's”
- Replace with: “HMO's”
Lint: Spelling (63 priority)
Message: |
121 | Corpus of British English. HMMs involve counting cases (such as from the Brown
| ^~~~ Did you mean to spell “HMMs” this way?
Suggest:
- Replace with: “Hams”
- Replace with: “Ha's”
- Replace with: “HMO's”
Lint: Spelling (63 priority)
Message: |
129 | More advanced ("higher-order") HMMs learn the probabilities not only of pairs
| ^~~~ Did you mean to spell “HMMs” this way?
Suggest:
- Replace with: “Hams”
- Replace with: “Ha's”
- Replace with: “HMO's”
Lint: Readability (127 priority)
Message: |
141 | Eugene Charniak points out in Statistical techniques for natural language
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
142 | parsing (1997) that merely assigning the most common tag to each known word and
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
143 | the tag "proper noun" to all unknowns will approach 90% accuracy because many
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
144 | words are unambiguous, and many others only rarely represent their less-common
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
145 | parts of speech.
| ~~~~~~~~~~~~~~~~ This sentence is 50 words long.
Lint: Spelling (63 priority)
Message: |
141 | Eugene Charniak points out in Statistical techniques for natural language
| ^~~~~~~~ Did you mean to spell “Charniak” this way?
Suggest:
- Replace with: “Carnap”
- Replace with: “Chadian”
- Replace with: “Chadwick”
Lint: Readability (127 priority)
Message: |
148 | expensive since it enumerated all possibilities. It sometimes had to resort to
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
149 | backup methods when there were simply too many options (the Brown Corpus
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
150 | contains a case with 17 ambiguous words in a row, and there are words such as
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
151 | "still" that can represent as many as 7 distinct parts of speech.
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 44 words long.
Lint: WordChoice (63 priority)
Message: |
148 | expensive since it enumerated all possibilities. It sometimes had to resort to
149 | backup methods when there were simply too many options (the Brown Corpus
| ^~~~~~ This word should be a phrasal verb, not a compound noun.
Suggest:
- Replace with: “back up”
Lint: Spelling (63 priority)
Message: |
153 | HMMs underlie the functioning of stochastic taggers and are used in various
| ^~~~ Did you mean to spell “HMMs” this way?
Suggest:
- Replace with: “Hams”
- Replace with: “Ha's”
- Replace with: “HMO's”
Lint: Spelling (63 priority)
Message: |
159 | In 1987, Steven DeRose and Kenneth W. Church independently developed dynamic
| ^~~~~~ Did you mean to spell “DeRose” this way?
Suggest:
- Replace with: “Depose”
- Replace with: “Defoe”
- Replace with: “Denise”
Lint: Spelling (63 priority)
Message: |
159 | In 1987, Steven DeRose and Kenneth W. Church independently developed dynamic
| ^~ Did you mean to spell “W.” this way?
Suggest:
- Replace with: “We”
- Replace with: “WA”
- Replace with: “WC”
Lint: Spelling (63 priority)
Message: |
160 | programming algorithms to solve the same problem in vastly less time. Their
161 | methods were similar to the Viterbi algorithm known for some time in other
| ^~~~~~~ Did you mean to spell “Viterbi” this way?
Suggest:
- Replace with: “Vite's”
- Replace with: “Verdi”
- Replace with: “Vite”
Lint: Readability (127 priority)
Message: |
162 | fields. DeRose used a table of pairs, while Church used a table of triples and a
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
163 | method of estimating the values for triples that were rare or nonexistent in the
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
164 | Brown Corpus (an actual measurement of triple probabilities would require a much
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
165 | larger corpus). Both methods achieved an accuracy of over 95%. DeRose's 1990
| ~~~~~~~~~~~~~~~ This sentence is 43 words long.
Lint: Spelling (63 priority)
Message: |
162 | fields. DeRose used a table of pairs, while Church used a table of triples and a
| ^~~~~~ Did you mean to spell “DeRose” this way?
Suggest:
- Replace with: “Depose”
- Replace with: “Defoe”
- Replace with: “Denise”
Lint: Spelling (63 priority)
Message: |
165 | larger corpus). Both methods achieved an accuracy of over 95%. DeRose's 1990
| ^~~~~~~~ Did you mean to spell “DeRose's” this way?
166 | dissertation at Brown University included analyses of the specific error types,
Suggest:
- Replace with: “Defoe's”
- Replace with: “Denise's”
- Replace with: “Repose's”
Lint: Spelling (63 priority)
Message: |
173 | levels of linguistic analysis: syntax, morphology, semantics, and so on. CLAWS,
174 | DeRose's and Church's methods did fail for some of the known cases where
| ^~~~~~~~ Did you mean to spell “DeRose's” this way?
Suggest:
- Replace with: “Defoe's”
- Replace with: “Denise's”
- Replace with: “Repose's”
Lint: Readability (127 priority)
Message: |
175 | semantics is required, but those proved negligibly rare. This convinced many in
| ^~~~~~~~~~~~~~~~~~~~~~~~
176 | the field that part-of-speech tagging could usefully be separated from the other
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
177 | levels of processing; this, in turn, simplified the theory and practice of
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
178 | computerized language analysis and encouraged researchers to find ways to
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
179 | separate other pieces as well. Markov Models became the standard method for the
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 45 words long.
Lint: Spelling (63 priority)
Message: |
186 | "unsupervised" tagging. Unsupervised tagging techniques use an untagged corpus
| ^~~~~~~~ Did you mean to spell “untagged” this way?
187 | for their training data and produce the tagset by induction. That is, they
Suggest:
- Replace with: “untapped”
- Replace with: “untasted”
- Replace with: “unwaged”
Lint: Spelling (63 priority)
Message: |
186 | "unsupervised" tagging. Unsupervised tagging techniques use an untagged corpus
187 | for their training data and produce the tagset by induction. That is, they
| ^~~~~~ Did you mean to spell “tagset” this way?
Suggest:
- Replace with: “tablet”
- Replace with: “tagged”
- Replace with: “target”
Lint: Spelling (63 priority)
Message: |
200 | Some current major algorithms for part-of-speech tagging include the Viterbi
| ^~~~~~~ Did you mean to spell “Viterbi” this way?
201 | algorithm, Brill tagger, Constraint Grammar, and the Baum-Welch algorithm (also
Suggest:
- Replace with: “Vite's”
- Replace with: “Verdi”
- Replace with: “Vite”
Lint: Spelling (63 priority)
Message: |
201 | algorithm, Brill tagger, Constraint Grammar, and the Baum-Welch algorithm (also
| ^~~~~ Did you mean to spell “Welch” this way?
202 | known as the forward-backward algorithm). Hidden Markov model and visible Markov
Suggest:
- Replace with: “Welsh”
- Replace with: “Belch”
- Replace with: “Walsh”
Lint: Spelling (63 priority)
Message: |
203 | model taggers can both be implemented using the Viterbi algorithm. The
| ^~~~~~~ Did you mean to spell “Viterbi” this way?
Suggest:
- Replace with: “Vite's”
- Replace with: “Verdi”
- Replace with: “Vite”
Lint: Spelling (63 priority)
Message: |
207 | Many machine learning methods have also been applied to the problem of POS
| ^~~ Did you mean to spell “POS” this way?
208 | tagging. Methods such as SVM, maximum entropy classifier, perceptron, and
Suggest:
- Replace with: “PBS”
- Replace with: “PMS”
- Replace with: “POV”
Lint: Spelling (63 priority)
Message: |
208 | tagging. Methods such as SVM, maximum entropy classifier, perceptron, and
| ^~~ Did you mean to spell “SVM” this way?
Suggest:
- Replace with: “Sim”
- Replace with: “Sam”
- Replace with: “SCM”
Lint: Spelling (63 priority)
Message: |
213 | Wiki. This comparison uses the Penn tag set on some of the Penn Treebank data,
| ^~~~~~~~ Did you mean to spell “Treebank” this way?
214 | so the results are directly comparable. However, many significant taggers are
Suggest:
- Replace with: “Freeman”
- Replace with: “Reembark”
- Replace with: “Debank”

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff