test: add failing example for markdown comment parser (#1948)

* test: add failing example for markdown comment parser
* fix(core): crash
* fuzz: add other fuzzing hang
* fix(comments): infinite loop

---------

Co-authored-by: Elijah Potter <me@elijahpotter.dev>
2025-12-23 08:48:15 +00:00 · 2025-11-19 00:20:17 +01:00 · 2025-11-19 00:20:17 +01:00 · 3b1b126dd7
commit 3b1b126dd7
parent 589ca33466
4 changed files with 90 additions and 35 deletions
--- a/harper-comments/src/comment_parser.rs
+++ b/harper-comments/src/comment_parser.rs
@ -119,3 +119,28 @@ impl Parser for CommentParser {
        self.inner.parse(source)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::CommentParser;
+    use harper_core::parsers::{MarkdownOptions, StrParser};
+
+    #[test]
+    fn hang() {
+        use std::sync::mpsc::channel;
+        use std::thread;
+        use std::time::Duration;
+
+        let (tx, rx) = channel::<()>();
+
+        let handle = thread::spawn(move || {
+            let opts = MarkdownOptions::default();
+            let parser = CommentParser::new_from_language_id("java", opts).unwrap();
+            let _res = parser.parse_str("//{@j");
+            tx.send(()).expect("send failed");
+        });
+
+        rx.recv_timeout(Duration::from_secs(10)).expect("timed out");
+        handle.join().expect("failed to join");
+    }
+}
--- a/harper-comments/src/comment_parsers/jsdoc.rs
+++ b/harper-comments/src/comment_parsers/jsdoc.rs
@ -145,15 +145,21 @@ fn parse_inline_tag(tokens: &[Token]) -> Option<usize> {
        return None;
    }

+    if tokens.len() <= 3 {
+        return None;
+    }
+
    let mut cursor = 3;

-    while !matches!(
-        tokens.get(cursor),
-        Some(Token {
-            kind: TokenKind::Punctuation(Punctuation::CloseCurly),
-            ..
-        })
-    ) {
+    while cursor < tokens.len()
+        && !matches!(
+            tokens.get(cursor),
+            Some(Token {
+                kind: TokenKind::Punctuation(Punctuation::CloseCurly),
+                ..
+            })
+        )
+    {
        cursor += 1;
    }

--- a/harper-core/src/parsers/markdown.rs
+++ b/harper-core/src/parsers/markdown.rs
@ -161,35 +161,45 @@ impl Parser for Markdown {

        let mut tokens = Vec::new();

-        let mut traversed_bytes = 0;
-        let mut traversed_chars = 0;
+        // Build a mapping from the inner parser's byte-based indexing to Harper's char-based
+        // indexing
+        let mut byte_to_char = vec![0; source_str.len() + 1];
+        let mut char_index = 0;
+        let mut byte_idx = 0;
+        for ch in source_str.chars() {
+            let char_len = ch.len_utf8();
+            for _ in 0..char_len {
+                byte_to_char[byte_idx] = char_index;
+                byte_idx += 1;
+            }
+            char_index += 1;
+        }
+        byte_to_char[source_str.len()] = char_index;

        let mut stack = Vec::new();

        // NOTE: the range spits out __byte__ indices, not char indices.
        // This is why we keep track above.
        for (event, range) in md_parser.into_offset_iter() {
-            if range.start > traversed_bytes {
-                traversed_chars += source_str[traversed_bytes..range.start].chars().count();
-                traversed_bytes = range.start;
-            }
+            let span_start = byte_to_char[range.start];
+            let span_end = byte_to_char[range.end];

            match event {
                pulldown_cmark::Event::SoftBreak => {
                    tokens.push(Token {
-                        span: Span::new_with_len(traversed_chars, 1),
+                        span: Span::new_with_len(span_start, 1),
                        kind: TokenKind::Newline(1),
                    });
                }
                pulldown_cmark::Event::HardBreak => {
                    tokens.push(Token {
-                        span: Span::new_with_len(traversed_chars, 1),
+                        span: Span::new_with_len(span_start, 1),
                        kind: TokenKind::Newline(2),
                    });
                }
                pulldown_cmark::Event::Start(pulldown_cmark::Tag::List(v)) => {
                    tokens.push(Token {
-                        span: Span::new_with_len(traversed_chars, 0),
+                        span: Span::new_with_len(span_start, 0),
                        kind: TokenKind::Newline(2),
                    });
                    stack.push(pulldown_cmark::Tag::List(v));
@ -201,7 +211,7 @@ impl Parser for Markdown {
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::CodeBlock)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::TableCell) => {
                    tokens.push(Token {
-                        // We cannot use `traversed_chars` here, as it will still point to the
+                        // We cannot use `span_start` here, as it will still point to the
                        // first character of the `Event` at this point. Instead, we use the
                        // position of the previous token's last character. This ensures the
                        // paragraph break is placed at the end of the content, not its beginning.
@ -214,38 +224,39 @@ impl Parser for Markdown {
                pulldown_cmark::Event::End(_) => {
                    stack.pop();
                }
-                pulldown_cmark::Event::InlineMath(code)
-                | pulldown_cmark::Event::DisplayMath(code)
-                | pulldown_cmark::Event::Code(code) => {
-                    let chunk_len = code.chars().count();
+                pulldown_cmark::Event::InlineMath(_)
+                | pulldown_cmark::Event::DisplayMath(_)
+                | pulldown_cmark::Event::Code(_) => {
+                    let chunk_len = span_end - span_start;

                    tokens.push(Token {
-                        span: Span::new_with_len(traversed_chars, chunk_len),
+                        span: Span::new_with_len(span_start, chunk_len),
                        kind: TokenKind::Unlintable,
                    });
                }
-                pulldown_cmark::Event::Text(text) => {
-                    let chunk_len = text.chars().count();
+                pulldown_cmark::Event::Text(_text) => {
+                    let chunk_len = span_end - span_start;

                    if let Some(tag) = stack.last() {
                        use pulldown_cmark::Tag;

                        if matches!(tag, Tag::CodeBlock(..)) {
                            tokens.push(Token {
-                                span: Span::new_with_len(traversed_chars, text.chars().count()),
+                                span: Span::new_with_len(span_start, chunk_len),
+
                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
                        if matches!(tag, Tag::Link { .. }) && self.options.ignore_link_title {
                            tokens.push(Token {
-                                span: Span::new_with_len(traversed_chars, text.chars().count()),
+                                span: Span::new_with_len(span_start, chunk_len),
                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
                        if !(matches!(tag, Tag::Paragraph)
-                            || matches!(tag, Tag::Link { .. }) && !self.options.ignore_link_title
+                            || (matches!(tag, Tag::Link { .. }) && !self.options.ignore_link_title)
                            || matches!(tag, Tag::Heading { .. })
                            || matches!(tag, Tag::Item)
                            || matches!(tag, Tag::TableCell)
@ -257,21 +268,19 @@ impl Parser for Markdown {
                        }
                    }

-                    let mut new_tokens =
-                        english_parser.parse(&source[traversed_chars..traversed_chars + chunk_len]);
+                    let mut new_tokens = english_parser.parse(&source[span_start..span_end]);

                    new_tokens
                        .iter_mut()
-                        .for_each(|token| token.span.push_by(traversed_chars));
+                        .for_each(|token| token.span.push_by(span_start));

                    tokens.append(&mut new_tokens);
                }
                // TODO: Support via `harper-html`
-                pulldown_cmark::Event::Html(_content)
-                | pulldown_cmark::Event::InlineHtml(_content) => {
-                    let size = _content.chars().count();
+                pulldown_cmark::Event::Html(_) | pulldown_cmark::Event::InlineHtml(_) => {
+                    let size = span_end - span_start;
                    tokens.push(Token {
-                        span: Span::new_with_len(traversed_chars, size),
+                        span: Span::new_with_len(span_start, size),
                        kind: TokenKind::Unlintable,
                    });
                }
@ -551,4 +560,19 @@ Paragraph.
        let tokens = parser.parse_str(source);
        assert_ne!(tokens.last().unwrap().span.end, 0);
    }
+
+    #[test]
+    fn hang() {
+        let opts = MarkdownOptions::default();
+        let parser = Markdown::new(opts);
+        let _res = parser.parse_str("[[#|]]:A]");
+    }
+
+    #[test]
+    fn hang2() {
+        // This seems to be a Java-specific problem, but check plain Markdown too.
+        let opts = MarkdownOptions::default();
+        let parser = Markdown::new(opts);
+        let _res = parser.parse_str("//{@j");
+    }
 }
--- a/harper-core/tests/text/tagged/Spell.US.md
+++ b/harper-core/tests/text/tagged/Spell.US.md
@ -6,7 +6,7 @@
 # I/Ddem+ NSg/VB+  V3       D/P NSg/VB P  NPl/V3+ VP/J    R         NPr/J/P I/J/R/Dq NPl      P  NPr🅪Sg/VB/J+ . NSg/C/P NSg/R/C NPr/J    NPr🅪Sg/VB/J+ . I/Ddem+ VL3 VP/J     P  NSg/VB D+  Nᴹ/Vg/J+ NPl+        IPl+ NSg/VB C/P NSg/I+ NPl/V3+  .
 >
 #
-> To achieve this    , the filename of this   file    contains `.US       , which will    tell   the snapshot generator to use     the American dialect , rather     than trying  to use     an  automatically detected dialect .
+> To achieve this    , the filename of this   file    contains `.US.`     , which will    tell   the snapshot generator to use     the American dialect , rather     than trying  to use     an  automatically detected dialect .
 # P  VB      I/Ddem+ . D   NSg      P  I/Ddem NSg/VB+ V3       Unlintable . I/C+  NPr/VXB NPr/VB D   NSg/VB+  NSg       P  N🅪Sg/VB D   NPr/J    NSg+    . NPr/VB/J/R C/P  Nᴹ/Vg/J P  N🅪Sg/VB D/P R             VP/J     NSg+    .
 >
 #