mirror of
https://github.com/Automattic/harper.git
synced 2025-12-23 08:48:15 +00:00
test: add failing example for markdown comment parser (#1948)
Some checks are pending
Binaries / harper-cli - macOS-aarch64 (push) Waiting to run
Binaries / harper-cli - Linux-aarch64-GNU (push) Waiting to run
Binaries / harper-cli - Linux-aarch64-musl (push) Waiting to run
Binaries / harper-cli - macOS-x86_64 (push) Waiting to run
Binaries / harper-cli - Linux-x86_64-GNU (push) Waiting to run
Binaries / harper-cli - Linux-x86_64-musl (push) Waiting to run
Binaries / harper-cli - Windows-x86_64 (push) Waiting to run
Binaries / harper-ls - macOS-aarch64 (push) Waiting to run
Binaries / harper-ls - Linux-aarch64-GNU (push) Waiting to run
Binaries / harper-ls - Linux-aarch64-musl (push) Waiting to run
Binaries / harper-ls - macOS-x86_64 (push) Waiting to run
Binaries / harper-ls - Linux-x86_64-GNU (push) Waiting to run
Binaries / harper-ls - Linux-x86_64-musl (push) Waiting to run
Binaries / harper-ls - Windows-x86_64 (push) Waiting to run
Build Web / build-web (push) Waiting to run
Chrome Plugin / chrome-plugin (push) Waiting to run
Just Checks / just check-js (push) Waiting to run
Just Checks / just check-rust (push) Waiting to run
Just Checks / just test-chrome-plugin (push) Waiting to run
Just Checks / just test-firefox-plugin (push) Waiting to run
Just Checks / just test-harperjs (push) Waiting to run
Just Checks / just test-obsidian (push) Waiting to run
Just Checks / just test-rust (push) Waiting to run
Just Checks / just test-vscode (push) Waiting to run
VS Code Plugin / alpine-arm64 (push) Waiting to run
VS Code Plugin / alpine-x64 (push) Waiting to run
VS Code Plugin / darwin-arm64 (push) Waiting to run
VS Code Plugin / darwin-x64 (push) Waiting to run
VS Code Plugin / linux-arm64 (push) Waiting to run
VS Code Plugin / linux-armhf (push) Waiting to run
VS Code Plugin / linux-x64 (push) Waiting to run
VS Code Plugin / win32-arm64 (push) Waiting to run
VS Code Plugin / win32-x64 (push) Waiting to run
WordPress Plugin / wp-plugin (push) Waiting to run
Some checks are pending
Binaries / harper-cli - macOS-aarch64 (push) Waiting to run
Binaries / harper-cli - Linux-aarch64-GNU (push) Waiting to run
Binaries / harper-cli - Linux-aarch64-musl (push) Waiting to run
Binaries / harper-cli - macOS-x86_64 (push) Waiting to run
Binaries / harper-cli - Linux-x86_64-GNU (push) Waiting to run
Binaries / harper-cli - Linux-x86_64-musl (push) Waiting to run
Binaries / harper-cli - Windows-x86_64 (push) Waiting to run
Binaries / harper-ls - macOS-aarch64 (push) Waiting to run
Binaries / harper-ls - Linux-aarch64-GNU (push) Waiting to run
Binaries / harper-ls - Linux-aarch64-musl (push) Waiting to run
Binaries / harper-ls - macOS-x86_64 (push) Waiting to run
Binaries / harper-ls - Linux-x86_64-GNU (push) Waiting to run
Binaries / harper-ls - Linux-x86_64-musl (push) Waiting to run
Binaries / harper-ls - Windows-x86_64 (push) Waiting to run
Build Web / build-web (push) Waiting to run
Chrome Plugin / chrome-plugin (push) Waiting to run
Just Checks / just check-js (push) Waiting to run
Just Checks / just check-rust (push) Waiting to run
Just Checks / just test-chrome-plugin (push) Waiting to run
Just Checks / just test-firefox-plugin (push) Waiting to run
Just Checks / just test-harperjs (push) Waiting to run
Just Checks / just test-obsidian (push) Waiting to run
Just Checks / just test-rust (push) Waiting to run
Just Checks / just test-vscode (push) Waiting to run
VS Code Plugin / alpine-arm64 (push) Waiting to run
VS Code Plugin / alpine-x64 (push) Waiting to run
VS Code Plugin / darwin-arm64 (push) Waiting to run
VS Code Plugin / darwin-x64 (push) Waiting to run
VS Code Plugin / linux-arm64 (push) Waiting to run
VS Code Plugin / linux-armhf (push) Waiting to run
VS Code Plugin / linux-x64 (push) Waiting to run
VS Code Plugin / win32-arm64 (push) Waiting to run
VS Code Plugin / win32-x64 (push) Waiting to run
WordPress Plugin / wp-plugin (push) Waiting to run
* test: add failing example for markdown comment parser
* fix(core): crash
* fuzz: add other fuzzing hang
* fix(comments): infinite loop

---------

Co-authored-by: Elijah Potter <me@elijahpotter.dev>
This commit is contained in:
parent
589ca33466
commit
3b1b126dd7
4 changed files with 90 additions and 35 deletions
|
|
@ -119,3 +119,28 @@ impl Parser for CommentParser {
|
|||
self.inner.parse(source)
|
||||
}
|
||||
}
|
||||
|
||||
// Test module for the comment parser; compiled only for test builds.
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::CommentParser;
|
||||
use harper_core::parsers::{MarkdownOptions, StrParser};
|
||||
|
||||
// Checks that parsing the input `//{@j` with the parser built for the "java"
// language id returns within ten seconds. The parse result itself is discarded.
#[test]
|
||||
fn hang() {
|
||||
use std::sync::mpsc::channel;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
// The channel carries a single "finished" signal from the worker to this thread.
let (tx, rx) = channel::<()>();
|
||||
|
||||
// The parse runs on a worker thread so that a parse which never returns shows up
// as a bounded wait below instead of blocking the test thread indefinitely.
let handle = thread::spawn(move || {
|
||||
let opts = MarkdownOptions::default();
|
||||
let parser = CommentParser::new_from_language_id("java", opts).unwrap();
|
||||
let _res = parser.parse_str("//{@j");
|
||||
// Reached only once `parse_str` has returned.
tx.send(()).expect("send failed");
|
||||
});
|
||||
|
||||
// Wait at most ten seconds for the signal. On failure this panics before the
// `join` below, so the worker is not joined in that case.
// NOTE(review): if the worker panics before sending, the sender is dropped and
// `recv_timeout` also returns an error, so that failure is reported as "timed out"
// as well.
rx.recv_timeout(Duration::from_secs(10)).expect("timed out");
|
||||
// The signal arrived, so the worker is past the parse; joining surfaces any panic
// that happened in it.
handle.join().expect("failed to join");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -145,15 +145,21 @@ fn parse_inline_tag(tokens: &[Token]) -> Option<usize> {
|
|||
return None;
|
||||
}
|
||||
|
||||
if tokens.len() <= 3 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut cursor = 3;
|
||||
|
||||
while !matches!(
|
||||
tokens.get(cursor),
|
||||
Some(Token {
|
||||
kind: TokenKind::Punctuation(Punctuation::CloseCurly),
|
||||
..
|
||||
})
|
||||
) {
|
||||
while cursor < tokens.len()
|
||||
&& !matches!(
|
||||
tokens.get(cursor),
|
||||
Some(Token {
|
||||
kind: TokenKind::Punctuation(Punctuation::CloseCurly),
|
||||
..
|
||||
})
|
||||
)
|
||||
{
|
||||
cursor += 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -161,35 +161,45 @@ impl Parser for Markdown {
|
|||
|
||||
let mut tokens = Vec::new();
|
||||
|
||||
let mut traversed_bytes = 0;
|
||||
let mut traversed_chars = 0;
|
||||
// Build a mapping from the inner parser's byte-based indexing to Harper's char-based
|
||||
// indexing
|
||||
let mut byte_to_char = vec![0; source_str.len() + 1];
|
||||
let mut char_index = 0;
|
||||
let mut byte_idx = 0;
|
||||
for ch in source_str.chars() {
|
||||
let char_len = ch.len_utf8();
|
||||
for _ in 0..char_len {
|
||||
byte_to_char[byte_idx] = char_index;
|
||||
byte_idx += 1;
|
||||
}
|
||||
char_index += 1;
|
||||
}
|
||||
byte_to_char[source_str.len()] = char_index;
|
||||
|
||||
let mut stack = Vec::new();
|
||||
|
||||
// NOTE: the range spits out __byte__ indices, not char indices.
|
||||
// This is why we keep track above.
|
||||
for (event, range) in md_parser.into_offset_iter() {
|
||||
if range.start > traversed_bytes {
|
||||
traversed_chars += source_str[traversed_bytes..range.start].chars().count();
|
||||
traversed_bytes = range.start;
|
||||
}
|
||||
let span_start = byte_to_char[range.start];
|
||||
let span_end = byte_to_char[range.end];
|
||||
|
||||
match event {
|
||||
pulldown_cmark::Event::SoftBreak => {
|
||||
tokens.push(Token {
|
||||
span: Span::new_with_len(traversed_chars, 1),
|
||||
span: Span::new_with_len(span_start, 1),
|
||||
kind: TokenKind::Newline(1),
|
||||
});
|
||||
}
|
||||
pulldown_cmark::Event::HardBreak => {
|
||||
tokens.push(Token {
|
||||
span: Span::new_with_len(traversed_chars, 1),
|
||||
span: Span::new_with_len(span_start, 1),
|
||||
kind: TokenKind::Newline(2),
|
||||
});
|
||||
}
|
||||
pulldown_cmark::Event::Start(pulldown_cmark::Tag::List(v)) => {
|
||||
tokens.push(Token {
|
||||
span: Span::new_with_len(traversed_chars, 0),
|
||||
span: Span::new_with_len(span_start, 0),
|
||||
kind: TokenKind::Newline(2),
|
||||
});
|
||||
stack.push(pulldown_cmark::Tag::List(v));
|
||||
|
|
@ -201,7 +211,7 @@ impl Parser for Markdown {
|
|||
| pulldown_cmark::Event::End(pulldown_cmark::TagEnd::CodeBlock)
|
||||
| pulldown_cmark::Event::End(pulldown_cmark::TagEnd::TableCell) => {
|
||||
tokens.push(Token {
|
||||
// We cannot use `traversed_chars` here, as it will still point to the
|
||||
// We cannot use `span_start` here, as it will still point to the
|
||||
// first character of the `Event` at this point. Instead, we use the
|
||||
// position of the previous token's last character. This ensures the
|
||||
// paragraph break is placed at the end of the content, not its beginning.
|
||||
|
|
@ -214,38 +224,39 @@ impl Parser for Markdown {
|
|||
pulldown_cmark::Event::End(_) => {
|
||||
stack.pop();
|
||||
}
|
||||
pulldown_cmark::Event::InlineMath(code)
|
||||
| pulldown_cmark::Event::DisplayMath(code)
|
||||
| pulldown_cmark::Event::Code(code) => {
|
||||
let chunk_len = code.chars().count();
|
||||
pulldown_cmark::Event::InlineMath(_)
|
||||
| pulldown_cmark::Event::DisplayMath(_)
|
||||
| pulldown_cmark::Event::Code(_) => {
|
||||
let chunk_len = span_end - span_start;
|
||||
|
||||
tokens.push(Token {
|
||||
span: Span::new_with_len(traversed_chars, chunk_len),
|
||||
span: Span::new_with_len(span_start, chunk_len),
|
||||
kind: TokenKind::Unlintable,
|
||||
});
|
||||
}
|
||||
pulldown_cmark::Event::Text(text) => {
|
||||
let chunk_len = text.chars().count();
|
||||
pulldown_cmark::Event::Text(_text) => {
|
||||
let chunk_len = span_end - span_start;
|
||||
|
||||
if let Some(tag) = stack.last() {
|
||||
use pulldown_cmark::Tag;
|
||||
|
||||
if matches!(tag, Tag::CodeBlock(..)) {
|
||||
tokens.push(Token {
|
||||
span: Span::new_with_len(traversed_chars, text.chars().count()),
|
||||
span: Span::new_with_len(span_start, chunk_len),
|
||||
|
||||
kind: TokenKind::Unlintable,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
if matches!(tag, Tag::Link { .. }) && self.options.ignore_link_title {
|
||||
tokens.push(Token {
|
||||
span: Span::new_with_len(traversed_chars, text.chars().count()),
|
||||
span: Span::new_with_len(span_start, chunk_len),
|
||||
kind: TokenKind::Unlintable,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
if !(matches!(tag, Tag::Paragraph)
|
||||
|| matches!(tag, Tag::Link { .. }) && !self.options.ignore_link_title
|
||||
|| (matches!(tag, Tag::Link { .. }) && !self.options.ignore_link_title)
|
||||
|| matches!(tag, Tag::Heading { .. })
|
||||
|| matches!(tag, Tag::Item)
|
||||
|| matches!(tag, Tag::TableCell)
|
||||
|
|
@ -257,21 +268,19 @@ impl Parser for Markdown {
|
|||
}
|
||||
}
|
||||
|
||||
let mut new_tokens =
|
||||
english_parser.parse(&source[traversed_chars..traversed_chars + chunk_len]);
|
||||
let mut new_tokens = english_parser.parse(&source[span_start..span_end]);
|
||||
|
||||
new_tokens
|
||||
.iter_mut()
|
||||
.for_each(|token| token.span.push_by(traversed_chars));
|
||||
.for_each(|token| token.span.push_by(span_start));
|
||||
|
||||
tokens.append(&mut new_tokens);
|
||||
}
|
||||
// TODO: Support via `harper-html`
|
||||
pulldown_cmark::Event::Html(_content)
|
||||
| pulldown_cmark::Event::InlineHtml(_content) => {
|
||||
let size = _content.chars().count();
|
||||
pulldown_cmark::Event::Html(_) | pulldown_cmark::Event::InlineHtml(_) => {
|
||||
let size = span_end - span_start;
|
||||
tokens.push(Token {
|
||||
span: Span::new_with_len(traversed_chars, size),
|
||||
span: Span::new_with_len(span_start, size),
|
||||
kind: TokenKind::Unlintable,
|
||||
});
|
||||
}
|
||||
|
|
@ -551,4 +560,19 @@ Paragraph.
|
|||
let tokens = parser.parse_str(source);
|
||||
assert_ne!(tokens.last().unwrap().span.end, 0);
|
||||
}
|
||||
|
||||
// Parses the input `[[#|]]:A]` with a Markdown parser built from default options.
// There are no assertions and the result is discarded, so the test passes as long
// as `parse_str` returns without panicking.
// NOTE(review): the name suggests this input once made the parser hang — confirm
// the original failure mode.
#[test]
|
||||
fn hang() {
|
||||
let opts = MarkdownOptions::default();
|
||||
let parser = Markdown::new(opts);
|
||||
let _res = parser.parse_str("[[#|]]:A]");
|
||||
}
|
||||
|
||||
// Parses the input `//{@j` with a Markdown parser built from default options.
// There are no assertions and the result is discarded, so the test passes as long
// as `parse_str` returns without panicking.
#[test]
|
||||
fn hang2() {
|
||||
// This seems to only be a java specific problem...
|
||||
// NOTE(review): presumably this means the input only misbehaves when it goes
// through the Java comment parser rather than the plain Markdown parser — confirm.
let opts = MarkdownOptions::default();
|
||||
let parser = Markdown::new(opts);
|
||||
let _res = parser.parse_str("//{@j");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@
|
|||
# I/Ddem+ NSg/VB+ V3 D/P NSg/VB P NPl/V3+ VP/J R NPr/J/P I/J/R/Dq NPl P NPr🅪Sg/VB/J+ . NSg/C/P NSg/R/C NPr/J NPr🅪Sg/VB/J+ . I/Ddem+ VL3 VP/J P NSg/VB D+ Nᴹ/Vg/J+ NPl+ IPl+ NSg/VB C/P NSg/I+ NPl/V3+ .
|
||||
>
|
||||
#
|
||||
> To achieve this , the filename of this file contains `.US , which will tell the snapshot generator to use the American dialect , rather than trying to use an automatically detected dialect .
|
||||
> To achieve this , the filename of this file contains `.US.` , which will tell the snapshot generator to use the American dialect , rather than trying to use an automatically detected dialect .
|
||||
# P VB I/Ddem+ . D NSg P I/Ddem NSg/VB+ V3 Unlintable . I/C+ NPr/VXB NPr/VB D NSg/VB+ NSg P N🅪Sg/VB D NPr/J NSg+ . NPr/VB/J/R C/P Nᴹ/Vg/J P N🅪Sg/VB D/P R VP/J NSg+ .
|
||||
>
|
||||
#
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue