mirror of
https://github.com/Automattic/harper.git
synced 2025-12-23 08:48:15 +00:00
test: add snapshots for linters (#1228)
* test: add snapshots for linters * fix(core): update snapshots --------- Co-authored-by: Elijah Potter <me@elijahpotter.dev>
This commit is contained in:
parent
08927bdaae
commit
663c729e46
8 changed files with 14954 additions and 91 deletions
237
harper-core/tests/linters.rs
Normal file
237
harper-core/tests/linters.rs
Normal file
|
|
@ -0,0 +1,237 @@
|
|||
//! This test creates snapshots of the reports of all linters.
|
||||
//!
|
||||
//! # Usage
|
||||
//!
|
||||
//! To add a new snapshot, simply add the document to `tests/text` and run this
|
||||
//! test. It will automatically create a new snapshot in `tests/text/linters`.
|
||||
//! To update an existing snapshot, also just run this test.
|
||||
//!
|
||||
//! Note: This test will fail if the snapshot files are not up to date. This
|
||||
//! ensures that CI will fail if linters change their behavior.
|
||||
|
||||
use harper_core::{
|
||||
Dialect, Document, FstDictionary,
|
||||
linting::{LintGroup, Linter},
|
||||
};
|
||||
|
||||
mod snapshot;
|
||||
|
||||
/// A (line, column) pair, both 0-based.
struct LinePos {
    /// 0-based index of the line
    pub line: usize,
    /// 0-based index of the column
    pub col: usize,
}

/// A source text split into lines, plus the char offset at which each line
/// starts, so flat char offsets can be mapped back to line/column positions.
struct Lines<'a> {
    lines: Vec<&'a str>,
    offsets: Vec<usize>,
}

impl Lines<'_> {
    fn new(source: &str) -> Lines {
        let lines: Vec<&str> = source.split('\n').collect();

        // Running char offset of each line's first character; the `+ 1`
        // accounts for the '\n' separator that `split` removed.
        let mut offsets: Vec<usize> = Vec::with_capacity(lines.len());
        let mut next_start = 0;
        for line in &lines {
            offsets.push(next_start);
            next_start += line.chars().count() + 1;
        }

        Lines { lines, offsets }
    }

    /// Number of lines in the source.
    fn len(&self) -> usize {
        self.lines.len()
    }

    /// Maps a flat char offset into the source to its line/column position.
    fn get_pos(&self, offset: usize) -> LinePos {
        // Either the offset lands exactly on a line start (Ok), or it falls
        // inside the line that starts just before the insertion point (Err).
        let line_index = match self.offsets.binary_search(&offset) {
            Ok(exact) => exact,
            Err(insertion) => insertion - 1,
        };

        LinePos {
            line: line_index,
            col: offset - self.offsets[line_index],
        }
    }
}

impl<'a> std::ops::Index<usize> for Lines<'a> {
    type Output = &'a str;

    fn index(&self, index: usize) -> &Self::Output {
        &self.lines[index]
    }
}
|
||||
|
||||
/// Renders a rustc-style annotated excerpt of `lines` for the char-offset
/// span `[start, end)`, attaching `message` to the final underline row.
///
/// Surrounding lines are included as context when they plausibly belong to
/// the same sentence as the highlighted span.
///
/// NOTE(review): `end` is treated as exclusive (`get_pos(end - 1)` below), so
/// an empty span (`end == 0`) would underflow — presumably lints always carry
/// non-empty spans; confirm at the call site.
fn print_error(lines: &Lines, start: usize, end: usize, message: &str) -> String {
    let mut out = String::new();

    // Emits one numbered source line; the number is right-aligned to 6 cols.
    fn print_line(out: &mut String, line: &str, number: usize) {
        out.push_str(&format!("{number:>6} | {line}\n"));
    }

    // Characters that end a sentence (used to decide whether a neighboring
    // line still belongs to the same sentence as the span).
    fn is_sentence_boundary(c: char) -> bool {
        matches!(c, '.' | '?' | '!' | ':' | ';')
    }

    // Prints the line *before* the span, but only when the span starts near
    // the left margin and no sentence boundary separates the context line
    // from the span start.
    fn print_pre_line_context(
        out: &mut String,
        context_line: &str,
        number: usize,
        line: &str,
        start_col: usize,
    ) {
        if context_line.is_empty() {
            return;
        }
        if start_col > 40 {
            // that's enough context
            return;
        }

        // `unwrap` is safe: the empty case returned above.
        let last_char = context_line.chars().last().unwrap();
        let mut chars_before = line.chars().take(start_col);
        if !is_sentence_boundary(last_char) && !chars_before.any(is_sentence_boundary) {
            print_line(out, context_line, number);
        }
    }

    // Prints the line *after* the span, but only when the span ends far to
    // the right and the rest of the end line contains no sentence boundary.
    fn print_post_line_context(
        out: &mut String,
        context_line: &str,
        number: usize,
        line: &str,
        end_col: usize,
    ) {
        if context_line.is_empty() {
            return;
        }
        if end_col < 40 {
            // that's enough context
            return;
        }

        let mut chars_after = line.chars().skip(end_col);
        if !chars_after.any(is_sentence_boundary) {
            print_line(out, context_line, number);
        }
    }

    // Emits the `^~~~` / `~~~~` marker row under a printed line. `^` marks a
    // span start; `~` marks a continuation from a previous line.
    // NOTE(review): the gutter literal below appears whitespace-collapsed in
    // this view; in the real file it presumably pads to align with the
    // `{number:>6} | ` prefix — confirm against the repository.
    fn print_underline(
        out: &mut String,
        start_col: usize,
        end_col: usize,
        continuation: bool,
        message: &str,
    ) {
        out.push_str(" | ");
        for _ in 0..start_col {
            out.push(' ');
        }
        out.push(if continuation { '~' } else { '^' });
        for _ in 0..end_col.saturating_sub(start_col) {
            out.push('~');
        }

        if !message.is_empty() {
            out.push(' ');
            out.push_str(message);
        }
        out.push('\n');
    }

    let start = lines.get_pos(start);
    let end = lines.get_pos(end - 1);

    if start.line > 0 {
        print_pre_line_context(
            &mut out,
            lines[start.line - 1],
            start.line,
            lines[start.line],
            start.col,
        );
    }

    if start.line == end.line {
        // Single-line span: one line, one underline carrying the message.
        print_line(&mut out, lines[start.line], start.line + 1);
        print_underline(&mut out, start.col, end.col, false, message);
    } else {
        // Multi-line span: underline every line to its end; only the last
        // underline carries the message.
        for i in start.line..end.line {
            let line = lines[i];
            print_line(&mut out, line, i + 1);
            print_underline(
                &mut out,
                if i == start.line { start.col } else { 0 },
                line.chars().count(),
                i != start.line,
                "",
            );
        }

        print_line(&mut out, lines[end.line], end.line + 1);
        print_underline(&mut out, 0, end.col, true, message);
    }

    if end.line + 1 < lines.len() {
        print_post_line_context(
            &mut out,
            lines[end.line + 1],
            end.line + 2,
            lines[end.line],
            end.col,
        );
    }

    out
}
|
||||
|
||||
#[test]
fn test_most_lints() {
    // One snapshot per file in `tests/text`, written as
    // `tests/text/linters/<stem>.snap.yml`.
    snapshot::snapshot_all_text_files("linters", ".snap.yml", |source| {
        let dict = FstDictionary::curated();
        let document = Document::new_markdown_default(source, &dict);

        let mut linter = LintGroup::new_curated(dict, Dialect::American);

        let mut lints = linter.lint(&document);
        // Sort by span start, then end, so snapshot ordering is stable
        // across runs regardless of linter iteration order.
        lints.sort_by(|a, b| {
            a.span
                .start
                .cmp(&b.span.start)
                .then(a.span.end.cmp(&b.span.end))
        });

        // split the input document into lines
        let lines = Lines::new(source);

        let mut out = String::new();

        for lint in lints {
            out.push_str(&format!(
                "Lint: {:?} ({} priority)\n",
                lint.lint_kind, lint.priority
            ));

            // Render the annotated excerpt and indent it so it forms a YAML
            // literal block under "Message: |".
            // NOTE(review): the indent literals below look whitespace-collapsed
            // in this view; the stored snapshots suggest a multi-space indent —
            // confirm against the repository.
            let message = print_error(&lines, lint.span.start, lint.span.end, &lint.message);
            out.push_str("Message: |\n");
            for l in message.lines() {
                out.push_str(" ");
                out.push_str(l);
                out.push('\n');
            }

            if !lint.suggestions.is_empty() {
                out.push_str("Suggest:\n");
                for suggestion in &lint.suggestions {
                    out.push_str(&format!(" - {}\n", suggestion));
                }
            }

            // Blank separation between lint entries.
            out.push_str("\n\n\n");
        }

        out
    });
}
|
||||
|
|
@ -58,37 +58,11 @@
|
|||
//! - [`TokenKind::Space`], [`TokenKind::Newline`], and
|
||||
//! [`TokenKind::ParagraphBreak`] are ignored.
|
||||
//! - All other token kinds are denoted by their variant name.
|
||||
use std::{borrow::Cow, path::PathBuf};
|
||||
use std::borrow::Cow;
|
||||
|
||||
use harper_core::{Degree, Document, FstDictionary, TokenKind, WordMetadata};
|
||||
|
||||
fn get_tests_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests")
|
||||
}
|
||||
fn get_text_dir() -> PathBuf {
|
||||
get_tests_dir().join("text")
|
||||
}
|
||||
fn get_snapshot_dir() -> PathBuf {
|
||||
get_tests_dir().join("text/tagged")
|
||||
}
|
||||
fn get_text_files() -> Vec<PathBuf> {
|
||||
let mut files = vec![];
|
||||
for entry in std::fs::read_dir(get_text_dir())
|
||||
.unwrap()
|
||||
.filter_map(|f| f.ok())
|
||||
.filter(|f| f.metadata().unwrap().is_file())
|
||||
{
|
||||
let path = entry.path();
|
||||
let ext = path
|
||||
.extension()
|
||||
.map(|e| e.to_string_lossy().to_string())
|
||||
.unwrap_or_default();
|
||||
if matches!(ext.as_str(), "txt" | "md") {
|
||||
files.push(entry.path());
|
||||
}
|
||||
}
|
||||
files
|
||||
}
|
||||
mod snapshot;
|
||||
|
||||
fn format_word_tag(word: &WordMetadata) -> String {
|
||||
// These tags are inspired by the Penn Treebank POS tagset
|
||||
|
|
@ -258,70 +232,31 @@ impl Formatter {
|
|||
}
|
||||
}
|
||||
|
||||
fn tag_text(source: &str) -> String {
|
||||
let dict = FstDictionary::curated();
|
||||
let document = Document::new_markdown_default(&source.replace("\r\n", "\n"), &dict);
|
||||
|
||||
let mut formatter = Formatter::new();
|
||||
for token in document.fat_string_tokens() {
|
||||
match token.kind {
|
||||
TokenKind::Space(_) => { /* ignore */ }
|
||||
TokenKind::ParagraphBreak => {
|
||||
formatter.new_line();
|
||||
formatter.new_line();
|
||||
}
|
||||
TokenKind::Newline(_) => {
|
||||
formatter.new_line();
|
||||
}
|
||||
kind => {
|
||||
let text = &token.content;
|
||||
let tag = format_tag(&kind);
|
||||
formatter.add(text, &tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
formatter.finish()
|
||||
}
|
||||
|
||||
fn tag_file(path: &PathBuf) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let source = std::fs::read_to_string(path)?.replace("\r\n", "\n");
|
||||
let tagged = tag_text(source.trim_end());
|
||||
|
||||
// compare with snapshot
|
||||
let snapshot_name = path.file_stem().unwrap().to_string_lossy().to_string() + ".md";
|
||||
let snapshot_file = get_snapshot_dir().join(snapshot_name);
|
||||
let has_snapshot = snapshot_file.exists();
|
||||
if has_snapshot {
|
||||
let snapshot = std::fs::read_to_string(&snapshot_file)?;
|
||||
if tagged == snapshot {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// write snapshot
|
||||
std::fs::create_dir_all(get_snapshot_dir())?;
|
||||
std::fs::write(snapshot_file, tagged)?;
|
||||
|
||||
Err(if has_snapshot {
|
||||
"Snapshot mismatches!"
|
||||
} else {
|
||||
"No snapshot!"
|
||||
}
|
||||
.into())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pos_tagger() {
|
||||
let mut errors = 0;
|
||||
for file in get_text_files() {
|
||||
println!("Processing {}", file.display());
|
||||
if let Err(e) = tag_file(&file) {
|
||||
eprintln!("Error processing {}: {}", file.display(), e);
|
||||
errors += 1;
|
||||
snapshot::snapshot_all_text_files("tagged", ".md", |source| {
|
||||
let dict = FstDictionary::curated();
|
||||
let document = Document::new_markdown_default(source, &dict);
|
||||
|
||||
let mut formatter = Formatter::new();
|
||||
for token in document.fat_string_tokens() {
|
||||
match token.kind {
|
||||
TokenKind::Space(_) => { /* ignore */ }
|
||||
TokenKind::ParagraphBreak => {
|
||||
formatter.new_line();
|
||||
formatter.new_line();
|
||||
}
|
||||
TokenKind::Newline(_) => {
|
||||
formatter.new_line();
|
||||
}
|
||||
kind => {
|
||||
let text = &token.content;
|
||||
let tag = format_tag(&kind);
|
||||
formatter.add(text, &tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if errors > 0 {
|
||||
panic!("{} errors occurred while processing files", errors);
|
||||
}
|
||||
|
||||
formatter.finish()
|
||||
});
|
||||
}
|
||||
|
|
|
|||
81
harper-core/tests/snapshot.rs
Normal file
81
harper-core/tests/snapshot.rs
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
use std::path::{Path, PathBuf};
|
||||
|
||||
fn get_tests_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests")
|
||||
}
|
||||
fn get_text_dir() -> PathBuf {
|
||||
get_tests_dir().join("text")
|
||||
}
|
||||
|
||||
pub fn get_text_files() -> Vec<PathBuf> {
|
||||
let mut files = vec![];
|
||||
for entry in std::fs::read_dir(get_text_dir())
|
||||
.unwrap()
|
||||
.filter_map(|f| f.ok())
|
||||
.filter(|f| f.metadata().unwrap().is_file())
|
||||
{
|
||||
let path = entry.path();
|
||||
let ext = path
|
||||
.extension()
|
||||
.map(|e| e.to_string_lossy().to_string())
|
||||
.unwrap_or_default();
|
||||
if matches!(ext.as_str(), "txt" | "md") {
|
||||
files.push(entry.path());
|
||||
}
|
||||
}
|
||||
files
|
||||
}
|
||||
|
||||
/// Runs `create_snapshot` over the contents of `text_file` (newlines
/// normalized, trailing whitespace trimmed) and compares the result against
/// the snapshot stored at `snapshot_file`.
///
/// Returns `Ok(())` when an up-to-date snapshot already exists. Otherwise the
/// snapshot on disk is (re)written — so the next run passes — and an error is
/// returned describing whether the snapshot was missing or stale.
fn tag_file(
    text_file: &Path,
    snapshot_file: &Path,
    create_snapshot: impl Fn(&str) -> String,
) -> Result<(), Box<dyn std::error::Error>> {
    let raw = std::fs::read_to_string(text_file)?;
    let source = raw.replace("\r\n", "\n");
    let tagged = create_snapshot(source.trim_end());

    // An existing, identical snapshot means nothing to do.
    let has_snapshot = snapshot_file.exists();
    if has_snapshot && std::fs::read_to_string(snapshot_file)? == tagged {
        return Ok(());
    }

    // Refresh the snapshot on disk, then report why the comparison failed.
    std::fs::write(snapshot_file, tagged)?;

    let reason = if has_snapshot {
        "Snapshot mismatches!"
    } else {
        "No snapshot!"
    };
    Err(reason.into())
}
|
||||
/// Builds the snapshot path for `text_file`: its file stem with `ext`
/// appended, placed inside `snapshot_dir`.
fn get_snapshot_file(text_file: &Path, snapshot_dir: &Path, ext: &str) -> PathBuf {
    let mut snapshot_name = text_file.file_stem().unwrap().to_string_lossy().into_owned();
    snapshot_name.push_str(ext);
    snapshot_dir.join(snapshot_name)
}
|
||||
#[allow(dead_code)]
|
||||
pub fn snapshot_all_text_files(
|
||||
out_dir: &str,
|
||||
snapshot_ext: &str,
|
||||
create_snapshot: impl Copy + Fn(&str) -> String,
|
||||
) {
|
||||
let snapshot_dir = get_text_dir().join(out_dir);
|
||||
std::fs::create_dir_all(&snapshot_dir).expect("Failed to create snapshot directory");
|
||||
|
||||
let mut errors = 0;
|
||||
for text_file in get_text_files() {
|
||||
println!("Processing {}", text_file.display());
|
||||
let snapshot_file = get_snapshot_file(&text_file, &snapshot_dir, snapshot_ext);
|
||||
if let Err(e) = tag_file(&text_file, &snapshot_file, create_snapshot) {
|
||||
eprintln!("Error processing {}: {}", text_file.display(), e);
|
||||
errors += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if errors > 0 {
|
||||
panic!("{} errors occurred while processing files", errors);
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load diff
1213
harper-core/tests/text/linters/Computer science.snap.yml
Normal file
1213
harper-core/tests/text/linters/Computer science.snap.yml
Normal file
File diff suppressed because it is too large
Load diff
635
harper-core/tests/text/linters/Part-of-speech tagging.snap.yml
Normal file
635
harper-core/tests/text/linters/Part-of-speech tagging.snap.yml
Normal file
|
|
@ -0,0 +1,635 @@
|
|||
Lint: Readability (127 priority)
|
||||
Message: |
|
||||
8 | In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or
|
||||
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
9 | POST), also called grammatical tagging is the process of marking up a word in a
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
10 | text (corpus) as corresponding to a particular part of speech, based on both its
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
11 | definition and its context. A simplified form of this is commonly taught to
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 46 words long.
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
8 | In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or
|
||||
| ^~~ Did you mean to spell “POS” this way?
|
||||
9 | POST), also called grammatical tagging is the process of marking up a word in a
|
||||
Suggest:
|
||||
- Replace with: “PBS”
|
||||
- Replace with: “PMS”
|
||||
- Replace with: “POV”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
8 | In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or
|
||||
| ^~~ Did you mean to spell “PoS” this way?
|
||||
9 | POST), also called grammatical tagging is the process of marking up a word in a
|
||||
Suggest:
|
||||
- Replace with: “Poe”
|
||||
- Replace with: “PBS”
|
||||
- Replace with: “PMS”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
15 | Once performed by hand, POS tagging is now done in the context of computational
|
||||
| ^~~ Did you mean to spell “POS” this way?
|
||||
Suggest:
|
||||
- Replace with: “PBS”
|
||||
- Replace with: “PMS”
|
||||
- Replace with: “POV”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
17 | parts of speech, by a set of descriptive tags. POS-tagging algorithms fall into
|
||||
| ^~~ Did you mean to spell “POS” this way?
|
||||
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
|
||||
Suggest:
|
||||
- Replace with: “PBS”
|
||||
- Replace with: “PMS”
|
||||
- Replace with: “POV”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
|
||||
| ^~ Did you mean to spell “E.” this way?
|
||||
Suggest:
|
||||
- Replace with: “E”
|
||||
- Replace with: “Ea”
|
||||
- Replace with: “Ed”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
|
||||
| ^~~~~~~ Did you mean to spell “Brill's” this way?
|
||||
19 | first and most widely used English POS-taggers, employs rule-based algorithms.
|
||||
Suggest:
|
||||
- Replace with: “Brillo's”
|
||||
- Replace with: “Bill's”
|
||||
- Replace with: “Trill's”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
18 | two distinctive groups: rule-based and stochastic. E. Brill's tagger, one of the
|
||||
19 | first and most widely used English POS-taggers, employs rule-based algorithms.
|
||||
| ^~~ Did you mean to spell “POS” this way?
|
||||
Suggest:
|
||||
- Replace with: “PBS”
|
||||
- Replace with: “PMS”
|
||||
- Replace with: “POV”
|
||||
|
||||
|
||||
|
||||
Lint: Readability (127 priority)
|
||||
Message: |
|
||||
33 | as the more common plural noun. Grammatical context is one way to determine
|
||||
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
34 | this; semantic analysis can also be used to infer that "sailor" and "hatch"
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
35 | implicate "dogs" as 1) in the nautical context and 2) an action applied to the
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
36 | object "hatch" (in this context, "dogs" is a nautical term meaning "fastens (a
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
37 | watertight door) securely").
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 49 words long.
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
|
||||
| ^~ Did you mean to spell “NN” this way?
|
||||
Suggest:
|
||||
- Replace with: “Nun”
|
||||
- Replace with: “Non”
|
||||
- Replace with: “N1”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
|
||||
| ^~~ Did you mean to spell “NNS” this way?
|
||||
50 | for singular proper nouns (see the POS tags used in the Brown Corpus). Other
|
||||
Suggest:
|
||||
- Replace with: “NBS”
|
||||
- Replace with: “NES”
|
||||
- Replace with: “NS”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
|
||||
| ^~ Did you mean to spell “NP” this way?
|
||||
50 | for singular proper nouns (see the POS tags used in the Brown Corpus). Other
|
||||
Suggest:
|
||||
- Replace with: “N”
|
||||
- Replace with: “Nap”
|
||||
- Replace with: “Nip”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
49 | tags. For example, NN for singular common nouns, NNS for plural common nouns, NP
|
||||
50 | for singular proper nouns (see the POS tags used in the Brown Corpus). Other
|
||||
| ^~~ Did you mean to spell “POS” this way?
|
||||
Suggest:
|
||||
- Replace with: “PBS”
|
||||
- Replace with: “PMS”
|
||||
- Replace with: “POV”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
55 | 150 separate parts of speech for English. Work on stochastic methods for tagging
|
||||
56 | Koine Greek (DeRose 1990) has used over 1,000 parts of speech and found that
|
||||
| ^~~~~ Did you mean to spell “Koine” this way?
|
||||
Suggest:
|
||||
- Replace with: “Kine”
|
||||
- Replace with: “Kline”
|
||||
- Replace with: “Kane”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
55 | 150 separate parts of speech for English. Work on stochastic methods for tagging
|
||||
56 | Koine Greek (DeRose 1990) has used over 1,000 parts of speech and found that
|
||||
| ^~~~~~ Did you mean to spell “DeRose” this way?
|
||||
Suggest:
|
||||
- Replace with: “Depose”
|
||||
- Replace with: “Defoe”
|
||||
- Replace with: “Denise”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
57 | about as many words were ambiguous in that language as in English. A
|
||||
58 | morphosyntactic descriptor in the case of morphologically rich languages is
|
||||
| ^~~~~~~~~~~~~~~ Did you mean to spell “morphosyntactic” this way?
|
||||
Suggest:
|
||||
- Replace with: “morphosyntax's”
|
||||
- Replace with: “morphosyntax”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
58 | morphosyntactic descriptor in the case of morphologically rich languages is
|
||||
| ^~~~~~~~~~~~~~~ Did you mean “morphological”?
|
||||
59 | commonly expressed using very short mnemonics, such as Ncmsan for Category=Noun,
|
||||
Suggest:
|
||||
- Replace with: “morphological”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
59 | commonly expressed using very short mnemonics, such as Ncmsan for Category=Noun,
|
||||
| ^~~~~~ Did you mean to spell “Ncmsan” this way?
|
||||
60 | Type = common, Gender = masculine, Number = singular, Case = accusative, Animate
|
||||
Suggest:
|
||||
- Replace with: “Nissan”
|
||||
- Replace with: “Nisan”
|
||||
- Replace with: “Nolan”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
63 | The most popular "tag set" for POS tagging for American English is probably the
|
||||
| ^~~ Did you mean to spell “POS” this way?
|
||||
Suggest:
|
||||
- Replace with: “PBS”
|
||||
- Replace with: “PMS”
|
||||
- Replace with: “POV”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
63 | The most popular "tag set" for POS tagging for American English is probably the
|
||||
64 | Penn tag set, developed in the Penn Treebank project. It is largely similar to
|
||||
| ^~~~~~~~ Did you mean to spell “Treebank” this way?
|
||||
Suggest:
|
||||
- Replace with: “Freeman”
|
||||
- Replace with: “Reembark”
|
||||
- Replace with: “Debank”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
69 | POS tagging work has been done in a variety of languages, and the set of POS
|
||||
| ^~~ Did you mean to spell “POS” this way?
|
||||
Suggest:
|
||||
- Replace with: “PBS”
|
||||
- Replace with: “PMS”
|
||||
- Replace with: “POV”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
69 | POS tagging work has been done in a variety of languages, and the set of POS
|
||||
| ^~~ Did you mean to spell “POS” this way?
|
||||
70 | tags used varies greatly with language. Tags usually are designed to include
|
||||
Suggest:
|
||||
- Replace with: “PBS”
|
||||
- Replace with: “PMS”
|
||||
- Replace with: “POV”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
74 | Greek and Latin can be very large; tagging words in agglutinative languages such
|
||||
| ^~~~~~~~~~~~~ Did you mean to spell “agglutinative” this way?
|
||||
75 | as Inuit languages may be virtually impossible. At the other extreme, Petrov et
|
||||
Suggest:
|
||||
- Replace with: “agglutinate”
|
||||
- Replace with: “agglutinating”
|
||||
- Replace with: “agglutination”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
75 | as Inuit languages may be virtually impossible. At the other extreme, Petrov et
|
||||
| ^~~~~~ Did you mean to spell “Petrov” this way?
|
||||
76 | al. have proposed a "universal" tag set, with 12 categories (for example, no
|
||||
Suggest:
|
||||
- Replace with: “Petrol”
|
||||
- Replace with: “Pedro”
|
||||
- Replace with: “Peron”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
75 | as Inuit languages may be virtually impossible. At the other extreme, Petrov et
|
||||
| ^~~
|
||||
76 | al. have proposed a "universal" tag set, with 12 categories (for example, no
|
||||
| ~~~ Did you mean “et al.”?
|
||||
Suggest:
|
||||
- Replace with: “et al.”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
86 | The first major corpus of English for computer analysis was the Brown Corpus
|
||||
87 | developed at Brown University by Henry Kučera and W. Nelson Francis, in the
|
||||
| ^~~~~~ Did you mean to spell “Kučera” this way?
|
||||
Suggest:
|
||||
- Replace with: “Kara”
|
||||
- Replace with: “Kendra”
|
||||
- Replace with: “Keri”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
87 | developed at Brown University by Henry Kučera and W. Nelson Francis, in the
|
||||
| ^~ Did you mean to spell “W.” this way?
|
||||
Suggest:
|
||||
- Replace with: “We”
|
||||
- Replace with: “WA”
|
||||
- Replace with: “WC”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
98 | and corrected by hand, and later users sent in errata so that by the late 70s
|
||||
| ^ Did you mean to spell “s” this way?
|
||||
99 | the tagging was nearly perfect (allowing for some cases on which even human
|
||||
Suggest:
|
||||
- Replace with: “sf”
|
||||
- Replace with: “sh”
|
||||
- Replace with: “so”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
105 | later part-of-speech tagging systems, such as CLAWS and VOLSUNGA. However, by
|
||||
| ^~~~~~~~ Did you mean to spell “VOLSUNGA” this way?
|
||||
|
||||
|
||||
|
||||
Lint: Readability (127 priority)
|
||||
Message: |
|
||||
110 | For some time, part-of-speech tagging was considered an inseparable part of
|
||||
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
111 | natural language processing, because there are certain cases where the correct
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
112 | part of speech cannot be decided without understanding the semantics or even the
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
113 | pragmatics of the context. This is extremely expensive, especially because
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 41 words long.
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
119 | In the mid-1980s, researchers in Europe began to use hidden Markov models (HMMs)
|
||||
| ^~~~ Did you mean to spell “HMMs” this way?
|
||||
120 | to disambiguate parts of speech, when working to tag the Lancaster-Oslo-Bergen
|
||||
Suggest:
|
||||
- Replace with: “Hams”
|
||||
- Replace with: “Ha's”
|
||||
- Replace with: “HMO's”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
121 | Corpus of British English. HMMs involve counting cases (such as from the Brown
|
||||
| ^~~~ Did you mean to spell “HMMs” this way?
|
||||
Suggest:
|
||||
- Replace with: “Hams”
|
||||
- Replace with: “Ha's”
|
||||
- Replace with: “HMO's”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
129 | More advanced ("higher-order") HMMs learn the probabilities not only of pairs
|
||||
| ^~~~ Did you mean to spell “HMMs” this way?
|
||||
Suggest:
|
||||
- Replace with: “Hams”
|
||||
- Replace with: “Ha's”
|
||||
- Replace with: “HMO's”
|
||||
|
||||
|
||||
|
||||
Lint: Readability (127 priority)
|
||||
Message: |
|
||||
141 | Eugene Charniak points out in Statistical techniques for natural language
|
||||
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
142 | parsing (1997) that merely assigning the most common tag to each known word and
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
143 | the tag "proper noun" to all unknowns will approach 90% accuracy because many
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
144 | words are unambiguous, and many others only rarely represent their less-common
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
145 | parts of speech.
|
||||
| ~~~~~~~~~~~~~~~~ This sentence is 50 words long.
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
141 | Eugene Charniak points out in Statistical techniques for natural language
|
||||
| ^~~~~~~~ Did you mean to spell “Charniak” this way?
|
||||
Suggest:
|
||||
- Replace with: “Carnap”
|
||||
- Replace with: “Chadian”
|
||||
- Replace with: “Chadwick”
|
||||
|
||||
|
||||
|
||||
Lint: Readability (127 priority)
|
||||
Message: |
|
||||
148 | expensive since it enumerated all possibilities. It sometimes had to resort to
|
||||
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
149 | backup methods when there were simply too many options (the Brown Corpus
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
150 | contains a case with 17 ambiguous words in a row, and there are words such as
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
151 | "still" that can represent as many as 7 distinct parts of speech.
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 44 words long.
|
||||
|
||||
|
||||
|
||||
Lint: WordChoice (63 priority)
|
||||
Message: |
|
||||
148 | expensive since it enumerated all possibilities. It sometimes had to resort to
|
||||
149 | backup methods when there were simply too many options (the Brown Corpus
|
||||
| ^~~~~~ This word should be a phrasal verb, not a compound noun.
|
||||
Suggest:
|
||||
- Replace with: “back up”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
153 | HMMs underlie the functioning of stochastic taggers and are used in various
|
||||
| ^~~~ Did you mean to spell “HMMs” this way?
|
||||
Suggest:
|
||||
- Replace with: “Hams”
|
||||
- Replace with: “Ha's”
|
||||
- Replace with: “HMO's”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
159 | In 1987, Steven DeRose and Kenneth W. Church independently developed dynamic
|
||||
| ^~~~~~ Did you mean to spell “DeRose” this way?
|
||||
Suggest:
|
||||
- Replace with: “Depose”
|
||||
- Replace with: “Defoe”
|
||||
- Replace with: “Denise”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
159 | In 1987, Steven DeRose and Kenneth W. Church independently developed dynamic
|
||||
| ^~ Did you mean to spell “W.” this way?
|
||||
Suggest:
|
||||
- Replace with: “We”
|
||||
- Replace with: “WA”
|
||||
- Replace with: “WC”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
160 | programming algorithms to solve the same problem in vastly less time. Their
|
||||
161 | methods were similar to the Viterbi algorithm known for some time in other
|
||||
| ^~~~~~~ Did you mean to spell “Viterbi” this way?
|
||||
Suggest:
|
||||
- Replace with: “Vite's”
|
||||
- Replace with: “Verdi”
|
||||
- Replace with: “Vite”
|
||||
|
||||
|
||||
|
||||
Lint: Readability (127 priority)
|
||||
Message: |
|
||||
162 | fields. DeRose used a table of pairs, while Church used a table of triples and a
|
||||
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
163 | method of estimating the values for triples that were rare or nonexistent in the
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
164 | Brown Corpus (an actual measurement of triple probabilities would require a much
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
165 | larger corpus). Both methods achieved an accuracy of over 95%. DeRose's 1990
|
||||
| ~~~~~~~~~~~~~~~ This sentence is 43 words long.
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
162 | fields. DeRose used a table of pairs, while Church used a table of triples and a
|
||||
| ^~~~~~ Did you mean to spell “DeRose” this way?
|
||||
Suggest:
|
||||
- Replace with: “Depose”
|
||||
- Replace with: “Defoe”
|
||||
- Replace with: “Denise”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
165 | larger corpus). Both methods achieved an accuracy of over 95%. DeRose's 1990
|
||||
| ^~~~~~~~ Did you mean to spell “DeRose's” this way?
|
||||
166 | dissertation at Brown University included analyses of the specific error types,
|
||||
Suggest:
|
||||
- Replace with: “Defoe's”
|
||||
- Replace with: “Denise's”
|
||||
- Replace with: “Repose's”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
173 | levels of linguistic analysis: syntax, morphology, semantics, and so on. CLAWS,
|
||||
174 | DeRose's and Church's methods did fail for some of the known cases where
|
||||
| ^~~~~~~~ Did you mean to spell “DeRose's” this way?
|
||||
Suggest:
|
||||
- Replace with: “Defoe's”
|
||||
- Replace with: “Denise's”
|
||||
- Replace with: “Repose's”
|
||||
|
||||
|
||||
|
||||
Lint: Readability (127 priority)
|
||||
Message: |
|
||||
175 | semantics is required, but those proved negligibly rare. This convinced many in
|
||||
| ^~~~~~~~~~~~~~~~~~~~~~~~
|
||||
176 | the field that part-of-speech tagging could usefully be separated from the other
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
177 | levels of processing; this, in turn, simplified the theory and practice of
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
178 | computerized language analysis and encouraged researchers to find ways to
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
179 | separate other pieces as well. Markov Models became the standard method for the
|
||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This sentence is 45 words long.
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
186 | "unsupervised" tagging. Unsupervised tagging techniques use an untagged corpus
|
||||
| ^~~~~~~~ Did you mean to spell “untagged” this way?
|
||||
187 | for their training data and produce the tagset by induction. That is, they
|
||||
Suggest:
|
||||
- Replace with: “untapped”
|
||||
- Replace with: “untasted”
|
||||
- Replace with: “unwaged”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
186 | "unsupervised" tagging. Unsupervised tagging techniques use an untagged corpus
|
||||
187 | for their training data and produce the tagset by induction. That is, they
|
||||
| ^~~~~~ Did you mean to spell “tagset” this way?
|
||||
Suggest:
|
||||
- Replace with: “tablet”
|
||||
- Replace with: “tagged”
|
||||
- Replace with: “target”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
200 | Some current major algorithms for part-of-speech tagging include the Viterbi
|
||||
| ^~~~~~~ Did you mean to spell “Viterbi” this way?
|
||||
201 | algorithm, Brill tagger, Constraint Grammar, and the Baum-Welch algorithm (also
|
||||
Suggest:
|
||||
- Replace with: “Vite's”
|
||||
- Replace with: “Verdi”
|
||||
- Replace with: “Vite”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
201 | algorithm, Brill tagger, Constraint Grammar, and the Baum-Welch algorithm (also
|
||||
| ^~~~~ Did you mean to spell “Welch” this way?
|
||||
202 | known as the forward-backward algorithm). Hidden Markov model and visible Markov
|
||||
Suggest:
|
||||
- Replace with: “Welsh”
|
||||
- Replace with: “Belch”
|
||||
- Replace with: “Walsh”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
203 | model taggers can both be implemented using the Viterbi algorithm. The
|
||||
| ^~~~~~~ Did you mean to spell “Viterbi” this way?
|
||||
Suggest:
|
||||
- Replace with: “Vite's”
|
||||
- Replace with: “Verdi”
|
||||
- Replace with: “Vite”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
207 | Many machine learning methods have also been applied to the problem of POS
|
||||
| ^~~ Did you mean to spell “POS” this way?
|
||||
208 | tagging. Methods such as SVM, maximum entropy classifier, perceptron, and
|
||||
Suggest:
|
||||
- Replace with: “PBS”
|
||||
- Replace with: “PMS”
|
||||
- Replace with: “POV”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
208 | tagging. Methods such as SVM, maximum entropy classifier, perceptron, and
|
||||
| ^~~ Did you mean to spell “SVM” this way?
|
||||
Suggest:
|
||||
- Replace with: “Sim”
|
||||
- Replace with: “Sam”
|
||||
- Replace with: “SCM”
|
||||
|
||||
|
||||
|
||||
Lint: Spelling (63 priority)
|
||||
Message: |
|
||||
213 | Wiki. This comparison uses the Penn tag set on some of the Penn Treebank data,
|
||||
| ^~~~~~~~ Did you mean to spell “Treebank” this way?
|
||||
214 | so the results are directly comparable. However, many significant taggers are
|
||||
Suggest:
|
||||
- Replace with: “Freeman”
|
||||
- Replace with: “Reembark”
|
||||
- Replace with: “Debank”
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load diff
7413
harper-core/tests/text/linters/The Great Gatsby.snap.yml
Normal file
7413
harper-core/tests/text/linters/The Great Gatsby.snap.yml
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue