Better parse errors on CSV parsing fail.

2025-09-28 22:34:45 +00:00 · 2022-07-16 21:49:18 +02:00 · 2022-07-16 21:49:18 +02:00 · 35ff9642aa
commit 35ff9642aa
parent b45ce4908d
5 changed files with 137 additions and 30 deletions
--- a/examples/csv/Parser/CSV.roc
+++ b/examples/csv/Parser/CSV.roc
@ -8,10 +8,12 @@ interface Parser.CSV
  parseStr,
  parseCSV,
  field,
+  string,
+  nat,
  ]
  imports [
-  Parser.Core.{Parser, parse, buildPrimitiveParser, fail, const, alt, map, map2, apply, many, oneorMore, sepBy1, between, ignore},
-  Parser.Str.{RawStr, parseStrPartial, oneOf, codepoint, codepointSatisfies, string, scalar, digits, strFromRaw}
+  Parser.Core.{Parser, parse, buildPrimitiveParser, fail, const, alt, map, map2, apply, many, oneorMore, sepBy1, between, ignore, flatten},
+  Parser.Str.{RawStr, parseStrPartial, oneOf, codepoint, codepointSatisfies, scalar, digits, strFromRaw}
  ]

 ## This is a CSV parser which follows RFC4180
@ -27,11 +29,12 @@ CSVField : RawStr
 CSVRecord : List CSVField
 CSV : List CSVRecord

-parseStr : Parser CSVRecord a, Str -> Result (List a) [ParsingFailure Str, SyntaxError (List U8), ParsingIncomplete CSVRecord]
+parseStr : Parser CSVRecord a, Str -> Result (List a) [ParsingFailure Str, SyntaxError Str, ParsingIncomplete CSVRecord]
 parseStr = \csvParser, input ->
  when parseStrToCSV input is
    Err (ParsingIncomplete rest) ->
-      Err (SyntaxError rest)
+      restStr = Parser.Str.strFromRaw rest
+      Err (SyntaxError restStr)
    Err (ParsingFailure str) ->
      Err (ParsingFailure str)
    Ok csvData ->
@ -47,32 +50,65 @@ parseCSV : Parser CSVRecord a, CSV -> Result (List a) [ParsingFailure Str, Parsi
 parseCSV = \csvParser, csvData ->
  List.walkUntil csvData (Ok []) \state, recordList ->
    when parse csvParser recordList (\leftover -> leftover == []) is
-      Err problem ->
-        Break (Err problem)
+      Err (ParsingFailure problem) ->
+        recordStr = recordList |> List.map strFromRaw |> Str.joinWith ", "
+        problemStr = "\(problem)\nWhile parsing record `\(recordStr)`."
+        Break (Err (ParsingFailure problemStr))
+      Err (ParsingIncomplete problem) ->
+        Break (Err (ParsingIncomplete problem))
      Ok val ->
        state
        |> Result.map (\vals -> List.append vals val)
        |> Continue

+# Wrapper function to combine a set of fields into your desired `a`
+#
+# ## Usage example
+#
+# >>> record (\firstName -> \lastName -> \age -> User {firstName, lastName, age})
+# >>> |> field string
+# >>> |> field string
+# >>> |> field nat
+#
 record : a -> Parser CSVRecord a
 record = Parser.Core.const

 field : Parser RawStr a -> Parser CSVRecord a
 field = \fieldParser ->
-  buildPrimitiveParser \recordVal ->
-    when List.get recordVal 0 is
+  buildPrimitiveParser \fieldsList ->
+    when List.get fieldsList 0 is
      Err OutOfBounds ->
        Err (ParsingFailure "expected another CSV field but there are no more fields in this record")
      Ok rawStr ->
        when Parser.Str.parseRawStr fieldParser rawStr is
          Ok val ->
-            Ok {val: val, input: (List.dropFirst recordVal)}
+            Ok {val: val, input: (List.dropFirst fieldsList)}
          Err (ParsingFailure reason) ->
-            Err (ParsingFailure reason)
+            fieldStr = rawStr |> strFromRaw
+            Err (ParsingFailure "Field `\(fieldStr)` from could not be parsed. \(reason)")
          Err (ParsingIncomplete reason) ->
            reasonStr = strFromRaw reason
-            Err (ParsingFailure "The field parser was unable to read the whole field: \(reasonStr)")
+            fieldsStr = fieldsList |> List.map strFromRaw |> Str.joinWith ", "
+            Err (ParsingFailure "The field parser was unable to read the whole field: `\(reasonStr)` while parsing the first field of leftover \(fieldsStr))")

+# Parser for a field containing a UTF8-encoded string
+string : Parser CSVField Str
+string = Parser.Str.anyString
+
+nat : Parser CSVField Nat
+nat =
+  string
+  |> map (\val ->
+    when Str.toNat val is
+      Ok num ->
+        Ok num
+      Err problem ->
+        Err "The field is not a valid Nat: \(val)"
+        )
+  |> flatten
+
+# f64 : Parser CSVField F64
+# f64 = string |> map Str.toF64 |> flatten

 parseStrToCSV : Str -> Result CSV [ParsingFailure Str, ParsingIncomplete RawStr]
 parseStrToCSV = \input ->
@ -111,7 +147,7 @@ escapedContents = many (oneOf [
  textdata
 ])

-twodquotes = string "\"\""
+twodquotes = Parser.Str.string "\"\""

 nonescapedCsvField : Parser RawStr CSVField
 nonescapedCsvField = many textdata
@ -120,5 +156,5 @@ dquote = codepoint 34 # '"'
 endOfLine = alt (ignore crlf) (ignore lf)
 cr = codepoint 13 # '\r'
 lf = codepoint 10 # '\n'
-crlf = string "\r\n"
+crlf = Parser.Str.string "\r\n"
 textdata = codepointSatisfies (\x -> (x >= 32 && x <= 33) || (x >= 35 && x <= 43) || (x >= 45 && x <= 126)) # Any printable char except " (34) and , (44)
--- a/examples/csv/Parser/Core.roc
+++ b/examples/csv/Parser/Core.roc
@ -1,6 +1,7 @@
 interface Parser.Core
  exposes [
    Parser,
+    ParseResult,
    parse,
    parsePartial,
    fail,
@ -21,6 +22,7 @@ interface Parser.Core
    sepBy1,
    ignore,
    buildPrimitiveParser,
+    flatten,
  ]
  imports []

@ -37,9 +39,11 @@ interface Parser.Core
 ## How a parser is _actually_ implemented internally is not important
 ## and this might change between versions;
 ## for instance to improve efficiency or error messages on parsing failures.
-Parser input a := (input -> Result {val: a, input: input} [ParsingFailure Str])
+Parser input a := (input -> ParseResult input a)

-buildPrimitiveParser : (input -> Result {val: a, input: input} [ParsingFailure Str]) -> Parser input a
+ParseResult input a : Result {val: a, input: input} [ParsingFailure Str]
+
+buildPrimitiveParser : (input -> ParseResult input a) -> Parser input a
 buildPrimitiveParser = \fun ->
  @Parser fun

@ -57,7 +61,7 @@ buildPrimitiveParser = \fun ->
 ##
 ## Of course, this is mostly useful when creating your own internal parsing building blocks.
 ## `run` or `Parser.Str.runStr` etc. are more useful in daily usage.
-parsePartial : Parser input a, input -> Result {val: a, input: input} [ParsingFailure Str]
+parsePartial : Parser input a, input -> ParseResult input a
 parsePartial = \@Parser parser, input ->
  (parser input)

@ -211,6 +215,22 @@ map3 = \parserA, parserB, parserC, transform ->

 # ^ And this could be repeated for as high as we want, of course.

+# Removes a layer of 'result' from running the parser.
+#
+# This allows for instance to map functions that return a result over the parser,
+# where errors are turned into `ParsingFailure` s.
+flatten : Parser input (Result a Str) -> Parser input a
+flatten = \parser ->
+  buildPrimitiveParser \input ->
+    result = parsePartial parser input
+    when result is
+        Err problem ->
+          Err problem
+        Ok {val: (Ok val), input: inputRest} ->
+          Ok {val: val, input: inputRest}
+        Ok {val: (Err problem), input: inputRest} ->
+          Err (ParsingFailure problem)
+
 ## Runs a parser lazily
 ##
 ## This is (only) useful when dealing with a recursive structure.
@ -226,7 +246,7 @@ maybe : Parser input a -> Parser input (Result a [Nothing])
 maybe = \parser ->
  alt (parser |> map (\val -> Ok val)) (const (Err Nothing))

-manyImpl : Parser input a, List a, input -> Result { input : input, val : List a } [ParsingFailure Str]
+manyImpl : Parser input a, List a, input -> ParseResult input (List a)
 manyImpl = \parser, vals, input ->
  result = parsePartial parser input
  when result is
--- a/examples/csv/Parser/Str.roc
+++ b/examples/csv/Parser/Str.roc
@ -9,13 +9,16 @@ interface Parser.Str
    stringRaw,
    codepoint,
    codepointSatisfies,
+    anyString,
+    anyRawString,
+    anyCodepoint,
    scalar,
    oneOf,
    digit,
    digits,
    strFromRaw,
  ]
-  imports [Parser.Core.{Parser, const, fail, map, map2, apply, many, oneOrMore, parse, parsePartial, buildPrimitiveParser, between}]
+  imports [Parser.Core.{Parser, ParseResult, const, fail, map, map2, apply, many, oneOrMore, parse, parsePartial, buildPrimitiveParser, between}]

 # Specific string-based parsers:

@ -41,7 +44,7 @@ strFromCodepoint = \cp ->
  strFromRaw [cp]

 ## Runs a parser against the start of a list of scalars, allowing the parser to consume it only partially.
-parseRawStrPartial : Parser RawStr a, RawStr -> Result {val: a, input: RawStr} [ParsingFailure Str]
+parseRawStrPartial : Parser RawStr a, RawStr -> ParseResult RawStr a
 parseRawStrPartial = \parser, input ->
  parsePartial parser input

@ -49,7 +52,7 @@ parseRawStrPartial = \parser, input ->
 ##
 ## - If the parser succeeds, returns the resulting value as well as the leftover input.
 ## - If the parser fails, returns `Err (ParsingFailure msg)`
-parseStrPartial : Parser RawStr a, Str -> Result {val: a, input: Str} [ParsingFailure Str]
+parseStrPartial : Parser RawStr a, Str -> ParseResult Str a
 parseStrPartial = \parser, input ->
  parser
  |> parseRawStrPartial (strToRaw input)
@ -137,9 +140,30 @@ scalar = \expectedScalar ->
  |> string
  |> map (\_ -> expectedScalar)

-betweenBraces : Parser RawStr a -> Parser RawStr a
-betweenBraces = \parser ->
-  between parser (scalar '[') (scalar ']')
+# Matches any codepoint
+anyCodepoint : Parser RawStr U8
+anyCodepoint = codepointSatisfies (\_ -> True)
+
+# Matches any bytestring
+# and consumes all of it.
+# Does not fail.
+anyRawString : Parser RawStr RawStr
+anyRawString = buildPrimitiveParser \rawStringValue ->
+  Ok {val: rawStringValue, input: []}
+
+# Matches any string
+# as long as it is valid UTF8.
+anyString : Parser RawStr Str
+anyString = buildPrimitiveParser \fieldRawString ->
+  when Str.fromUtf8 fieldRawString is
+    Ok stringVal ->
+      Ok {val: stringVal, input: []}
+    Err (BadUtf8 _ _) ->
+      Err (ParsingFailure "Expected a string field, but its contents cannot be parsed as UTF8.")
+
+# betweenBraces : Parser RawStr a -> Parser RawStr a
+# betweenBraces = \parser ->
+#   between parser (scalar '[') (scalar ']')


 digit : Parser RawStr U8
--- a/examples/csv/main.roc
+++ b/examples/csv/main.roc
@ -8,12 +8,20 @@ app "main"
 # with hard-coded input.

 main =
-  when Parser.CSV.parseStr userCSVParser "John,Doe,10" is
+  when Parser.CSV.parseStr userCSVParser "John,Doe,10\r\nRichard,Feldman,100\r\nMarten,Wijnja,28\r\n" is
    Ok result ->
        val = result |> Str.joinWith("\n")
-        "Parse success: \(val)\n"
+        nResults = List.len result |> Num.toStr
+        "Parse success!\n\n\(nResults) users were found:\n\(val)\n"
    Err problem ->
-        "Parsing Problem"
+      when problem is
+        ParsingFailure failure ->
+          "Parsing failure: \(failure)\n"
+        ParsingIncomplete leftover ->
+          leftoverStr = leftover |> List.map Parser.Str.strFromRaw |> Str.joinWith ", "
+          "Parsing incomplete. Following still left: \(leftoverStr)\n"
+        SyntaxError error ->
+          "Parsing failure. Syntax error in the CSV: \(error)"
 # main = fullTest csvParser "10,20\n\"An escaped field!\"\"\n,,,\",30\n"
 # main = partialTest fieldParser "\"An escaped field with some \"\"<- double quotes\""
 # main = fullTest fieldContentsParser "My very cool,\"\"\r\n string"
@ -21,10 +29,12 @@ main =
 # main = partialTest manyParser "this is a very long string\"\""

 userCSVParser =
-  Parser.CSV.record (\first -> \last -> \age -> "User: \(first) \(last) \(age)")
-  |> Parser.Core.apply (Parser.CSV.field (Parser.Str.string "John"))
-  |> Parser.Core.apply (Parser.CSV.field (Parser.Str.string "Doe"))
-  |> Parser.Core.apply (Parser.CSV.field (Parser.Str.string "10"))
+  Parser.CSV.record (\first -> \last -> \age ->
+    ageStr = Num.toStr age
+    "User: \(first) \(last) \(ageStr)")
+  |> Parser.Core.apply (Parser.CSV.field Parser.CSV.string)
+  |> Parser.Core.apply (Parser.CSV.field Parser.CSV.string)
+  |> Parser.Core.apply (Parser.CSV.field Parser.CSV.nat)

 partialTest = \parser, input ->
  when Parser.Str.parseStrPartial parser input is
--- a/examples/csv/platform/host.c
+++ b/examples/csv/platform/host.c
@ -6,13 +6,20 @@
 #include <string.h>
 #include <unistd.h>

+//#define ROC_PLATFORM_DEBUG
+
 void alloc_panic(size_t size);

 void *roc_alloc(size_t size, unsigned int alignment) {
+#ifdef ROC_PLATFORM_DEBUG
  printf("Allocating %llu (alignment %ud) ", (unsigned long long)size,
         alignment);
+#endif
  void *result = malloc(size);
+
+#ifdef ROC_PLATFORM_DEBUG
  printf("at: %p\n", result);
+#endif

  if (result == NULL) {
    if (size ==
@ -28,11 +35,16 @@ void *roc_alloc(size_t size, unsigned int alignment) {

 void *roc_realloc(void *ptr, size_t new_size, size_t old_size,
                  unsigned int alignment) {
+#ifdef ROC_PLATFORM_DEBUG
  printf("Rellocating %p (%llu -> %llu) (alignment %ud) ", ptr,
         (unsigned long long)old_size, (unsigned long long)new_size, alignment);
+#endif

  void *result = realloc(ptr, new_size);
+
+#ifdef ROC_PLATFORM_DEBUG
  printf("at: %p\n", result);
+#endif

  if (result == NULL) {
    if (new_size ==
@ -47,7 +59,10 @@ void *roc_realloc(void *ptr, size_t new_size, size_t old_size,
 }

 void roc_dealloc(void *ptr, unsigned int alignment) {
+
+#ifdef ROC_PLATFORM_DEBUG
  printf("Deallocating %p (alignment %ud)\n", ptr, alignment);
+#endif
  free(ptr);
 }

@ -67,7 +82,9 @@ void alloc_panic(size_t size) {
 }

 void *roc_memcpy(void *dest, const void *src, size_t n) {
+#ifdef ROC_PLATFORM_DEBUG
  printf("memcpy %p -> %p (size: %llu)\n", src, dest, (unsigned long long)n);
+#endif
  return memcpy(dest, src, n);
 }