BREAKING(std/encoding/csv): improve the definition of ParseOptions (#7714)

This commit is contained in:
uki00a 2020-09-28 03:20:46 +09:00 committed by GitHub
parent 5db72dcaf3
commit 94dcef714d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 101 additions and 72 deletions

View file

@ -37,25 +37,29 @@ writeVarbig(w: Deno.Writer, x: bigint, o: VarbigOptions = {}): Promise<number>
Parse the CSV from the `reader` with the options provided and return Parse the CSV from the `reader` with the options provided and return
`string[][]`. `string[][]`.
#### `parse(input: string | BufReader, opt: ParseOptions = { header: false }): Promise<unknown[]>`: #### `parse(input: string | BufReader, opt: ParseOptions = { skipFirstRow: false }): Promise<unknown[]>`:
Parse the CSV string/buffer with the options provided. The result of this Parse the CSV string/buffer with the options provided. The result of this
function is as follows: function is as follows:
- If you don't provide both `opt.header` and `opt.parse`, it returns - If you don't provide `opt.skipFirstRow`, `opt.parse`, and `opt.columns`, it
`string[][]`. returns `string[][]`.
- If you provide `opt.header` but not `opt.parse`, it returns `object[]`. - If you provide `opt.skipFirstRow` or `opt.columns` but not `opt.parse`, it
returns `object[]`.
- If you provide `opt.parse`, it returns an array where each element is the - If you provide `opt.parse`, it returns an array where each element is the
value returned from `opt.parse`. value returned from `opt.parse`.
##### `ParseOptions` ##### `ParseOptions`
- **`header: boolean | string[] | HeaderOptions[];`**: If a boolean is provided, - **`skipFirstRow: boolean;`**: If you provide `skipFirstRow: true` and
the first line will be used as Header definitions. If `string[]` or `columns`, the first line will be skipped. If you provide `skipFirstRow: true`
`HeaderOptions[]` those names will be used for header definition. but not `columns`, the first line will be skipped and used as header
definitions.
- **`columns: string[] | ColumnOptions[];`**: If you provide `string[]` or
`ColumnOptions[]`, those names will be used for header definition.
- **`parse?: (input: unknown) => unknown;`**: Parse function for the row, which - **`parse?: (input: unknown) => unknown;`**: Parse function for the row, which
will be executed after parsing of all columns. Therefore if you don't provide will be executed after parsing of all columns. Therefore if you don't provide
header and parse function with headers, input will be `string[]`. `skipFirstRow`, `columns`, and `parse` function, input will be `string[]`.
##### `HeaderOptions` ##### `HeaderOptions`

View file

@ -52,7 +52,7 @@ export class ParseError extends Error {
} }
/** /**
* @property comma - Character which separates values. Default: ',' * @property separator - Character which separates values. Default: ','
* @property comment - Character to start a comment. Default: '#' * @property comment - Character to start a comment. Default: '#'
* @property trimLeadingSpace - Flag to trim the leading space of the value. * @property trimLeadingSpace - Flag to trim the leading space of the value.
* Default: 'false' * Default: 'false'
@ -62,7 +62,7 @@ export class ParseError extends Error {
* If == 0, first row is used as referral for the number of fields. * If == 0, first row is used as referral for the number of fields.
*/ */
export interface ReadOptions { export interface ReadOptions {
comma?: string; separator?: string;
comment?: string; comment?: string;
trimLeadingSpace?: boolean; trimLeadingSpace?: boolean;
lazyQuotes?: boolean; lazyQuotes?: boolean;
@ -70,16 +70,16 @@ export interface ReadOptions {
} }
function chkOptions(opt: ReadOptions): void { function chkOptions(opt: ReadOptions): void {
if (!opt.comma) { if (!opt.separator) {
opt.comma = ","; opt.separator = ",";
} }
if (!opt.trimLeadingSpace) { if (!opt.trimLeadingSpace) {
opt.trimLeadingSpace = false; opt.trimLeadingSpace = false;
} }
if ( if (
INVALID_RUNE.includes(opt.comma) || INVALID_RUNE.includes(opt.separator) ||
(typeof opt.comment === "string" && INVALID_RUNE.includes(opt.comment)) || (typeof opt.comment === "string" && INVALID_RUNE.includes(opt.comment)) ||
opt.comma === opt.comment opt.separator === opt.comment
) { ) {
throw new Error(ERR_INVALID_DELIM); throw new Error(ERR_INVALID_DELIM);
} }
@ -88,7 +88,7 @@ function chkOptions(opt: ReadOptions): void {
async function readRecord( async function readRecord(
startLine: number, startLine: number,
reader: BufReader, reader: BufReader,
opt: ReadOptions = { comma: ",", trimLeadingSpace: false }, opt: ReadOptions = { separator: ",", trimLeadingSpace: false },
): Promise<string[] | null> { ): Promise<string[] | null> {
const tp = new TextProtoReader(reader); const tp = new TextProtoReader(reader);
let line = await readLine(tp); let line = await readLine(tp);
@ -103,13 +103,13 @@ async function readRecord(
return []; return [];
} }
assert(opt.comma != null); assert(opt.separator != null);
let fullLine = line; let fullLine = line;
let quoteError: ParseError | null = null; let quoteError: ParseError | null = null;
const quote = '"'; const quote = '"';
const quoteLen = quote.length; const quoteLen = quote.length;
const commaLen = opt.comma.length; const separatorLen = opt.separator.length;
let recordBuffer = ""; let recordBuffer = "";
const fieldIndexes = [] as number[]; const fieldIndexes = [] as number[];
parseField: parseField:
@ -120,7 +120,7 @@ async function readRecord(
if (line.length === 0 || !line.startsWith(quote)) { if (line.length === 0 || !line.startsWith(quote)) {
// Non-quoted string field // Non-quoted string field
const i = line.indexOf(opt.comma); const i = line.indexOf(opt.separator);
let field = line; let field = line;
if (i >= 0) { if (i >= 0) {
field = field.substring(0, i); field = field.substring(0, i);
@ -144,7 +144,7 @@ async function readRecord(
recordBuffer += field; recordBuffer += field;
fieldIndexes.push(recordBuffer.length); fieldIndexes.push(recordBuffer.length);
if (i >= 0) { if (i >= 0) {
line = line.substring(i + commaLen); line = line.substring(i + separatorLen);
continue parseField; continue parseField;
} }
break parseField; break parseField;
@ -161,9 +161,9 @@ async function readRecord(
// `""` sequence (append quote). // `""` sequence (append quote).
recordBuffer += quote; recordBuffer += quote;
line = line.substring(quoteLen); line = line.substring(quoteLen);
} else if (line.startsWith(opt.comma)) { } else if (line.startsWith(opt.separator)) {
// `","` sequence (end of field). // `","` sequence (end of field).
line = line.substring(commaLen); line = line.substring(separatorLen);
fieldIndexes.push(recordBuffer.length); fieldIndexes.push(recordBuffer.length);
continue parseField; continue parseField;
} else if (0 === line.length) { } else if (0 === line.length) {
@ -281,7 +281,7 @@ async function readLine(tp: TextProtoReader): Promise<string | null> {
export async function readMatrix( export async function readMatrix(
reader: BufReader, reader: BufReader,
opt: ReadOptions = { opt: ReadOptions = {
comma: ",", separator: ",",
trimLeadingSpace: false, trimLeadingSpace: false,
lazyQuotes: false, lazyQuotes: false,
}, },
@ -324,13 +324,13 @@ export async function readMatrix(
/** /**
* Parse the CSV string/buffer with the options provided. * Parse the CSV string/buffer with the options provided.
* *
* HeaderOptions provides the column definition * ColumnOptions provides the column definition
* and the parse function for each entry of the * and the parse function for each entry of the
* column. * column.
*/ */
export interface HeaderOptions { export interface ColumnOptions {
/** /**
* Name of the header to be used as property * Name of the column to be used as property
*/ */
name: string; name: string;
/** /**
@ -343,14 +343,20 @@ export interface HeaderOptions {
export interface ParseOptions extends ReadOptions { export interface ParseOptions extends ReadOptions {
/** /**
* If a boolean is provided, the first line will be used as Header definitions. * If you provide `skipFirstRow: true` and `columns`, the first line will be skipped.
* If `string[]` or `HeaderOptions[]` those names will be used for header definition. * If you provide `skipFirstRow: true` but not `columns`, the first line will be skipped and used as header definitions.
*/ */
header: boolean | string[] | HeaderOptions[]; skipFirstRow?: boolean;
/**
* If you provide `string[]` or `ColumnOptions[]`, those names will be used for header definition.
*/
columns?: string[] | ColumnOptions[];
/** Parse function for rows. /** Parse function for rows.
* Example: * Example:
* const r = await parseFile('a,b,c\ne,f,g\n', { * const r = await parseFile('a,b,c\ne,f,g\n', {
* header: ["this", "is", "sparta"], * columns: ["this", "is", "sparta"],
* parse: (e: Record<string, unknown>) => { * parse: (e: Record<string, unknown>) => {
* return { super: e.this, street: e.is, fighter: e.sparta }; * return { super: e.this, street: e.is, fighter: e.sparta };
* } * }
@ -370,14 +376,14 @@ export interface ParseOptions extends ReadOptions {
* for columns and rows. * for columns and rows.
* @param input Input to parse. Can be a string or BufReader. * @param input Input to parse. Can be a string or BufReader.
* @param opt options of the parser. * @param opt options of the parser.
* @returns If you don't provide both `opt.header` and `opt.parse`, it returns `string[][]`. * @returns If you don't provide `opt.skipFirstRow`, `opt.parse`, and `opt.columns`, it returns `string[][]`.
* If you provide `opt.header` but not `opt.parse`, it returns `object[]`. * If you provide `opt.skipFirstRow` or `opt.columns` but not `opt.parse`, it returns `object[]`.
* If you provide `opt.parse`, it returns an array where each element is the value returned from `opt.parse`. * If you provide `opt.parse`, it returns an array where each element is the value returned from `opt.parse`.
*/ */
export async function parse( export async function parse(
input: string | BufReader, input: string | BufReader,
opt: ParseOptions = { opt: ParseOptions = {
header: false, skipFirstRow: false,
}, },
): Promise<unknown[]> { ): Promise<unknown[]> {
let r: string[][]; let r: string[][];
@ -386,27 +392,15 @@ export async function parse(
} else { } else {
r = await readMatrix(new BufReader(new StringReader(input)), opt); r = await readMatrix(new BufReader(new StringReader(input)), opt);
} }
if (opt.header) { if (opt.skipFirstRow || opt.columns) {
let headers: HeaderOptions[] = []; let headers: ColumnOptions[] = [];
let i = 0; let i = 0;
if (Array.isArray(opt.header)) {
if (typeof opt.header[0] !== "string") { if (opt.skipFirstRow) {
headers = opt.header as HeaderOptions[];
} else {
const h = opt.header as string[];
headers = h.map(
(e): HeaderOptions => {
return {
name: e,
};
},
);
}
} else {
const head = r.shift(); const head = r.shift();
assert(head != null); assert(head != null);
headers = head.map( headers = head.map(
(e): HeaderOptions => { (e): ColumnOptions => {
return { return {
name: e, name: e,
}; };
@ -414,6 +408,21 @@ export async function parse(
); );
i++; i++;
} }
if (opt.columns) {
if (typeof opt.columns[0] !== "string") {
headers = opt.columns as ColumnOptions[];
} else {
const h = opt.columns as string[];
headers = h.map(
(e): ColumnOptions => {
return {
name: e,
};
},
);
}
}
return r.map((e): unknown => { return r.map((e): unknown => {
if (e.length !== headers.length) { if (e.length !== headers.length) {
throw `Error number of fields line:${i}`; throw `Error number of fields line:${i}`;

View file

@ -17,6 +17,7 @@ import {
import { StringReader } from "../io/readers.ts"; import { StringReader } from "../io/readers.ts";
import { BufReader } from "../io/bufio.ts"; import { BufReader } from "../io/bufio.ts";
// Test cases for `readMatrix()`
const testCases = [ const testCases = [
{ {
Name: "Simple", Name: "Simple",
@ -60,7 +61,7 @@ zzz,yyy,xxx`,
Name: "Semicolon", Name: "Semicolon",
Input: "a;b;c\n", Input: "a;b;c\n",
Output: [["a", "b", "c"]], Output: [["a", "b", "c"]],
Comma: ";", Separator: ";",
}, },
{ {
Name: "MultiLine", Name: "MultiLine",
@ -334,14 +335,14 @@ x,,,
Input: "a£b,c£ \td,e\n€ comment\n", Input: "a£b,c£ \td,e\n€ comment\n",
Output: [["a", "b,c", "d,e"]], Output: [["a", "b,c", "d,e"]],
TrimLeadingSpace: true, TrimLeadingSpace: true,
Comma: "£", Separator: "£",
Comment: "€", Comment: "€",
}, },
{ {
Name: "NonASCIICommaAndCommentWithQuotes", Name: "NonASCIICommaAndCommentWithQuotes",
Input: 'a€" b,"€ c\nλ comment\n', Input: 'a€" b,"€ c\nλ comment\n',
Output: [["a", " b,", " c"]], Output: [["a", " b,", " c"]],
Comma: "€", Separator: "€",
Comment: "λ", Comment: "λ",
}, },
{ {
@ -350,7 +351,7 @@ x,,,
Name: "NonASCIICommaConfusion", Name: "NonASCIICommaConfusion",
Input: '"abθcd"λefθgh', Input: '"abθcd"λefθgh',
Output: [["abθcd", "efθgh"]], Output: [["abθcd", "efθgh"]],
Comma: "λ", Separator: "λ",
Comment: "€", Comment: "€",
}, },
{ {
@ -415,17 +416,17 @@ x,,,
}, },
{ {
Name: "BadComma1", Name: "BadComma1",
Comma: "\n", Separator: "\n",
Error: new Error(ERR_INVALID_DELIM), Error: new Error(ERR_INVALID_DELIM),
}, },
{ {
Name: "BadComma2", Name: "BadComma2",
Comma: "\r", Separator: "\r",
Error: new Error(ERR_INVALID_DELIM), Error: new Error(ERR_INVALID_DELIM),
}, },
{ {
Name: "BadComma3", Name: "BadComma3",
Comma: '"', Separator: '"',
Error: new Error(ERR_INVALID_DELIM), Error: new Error(ERR_INVALID_DELIM),
}, },
{ {
@ -440,7 +441,7 @@ x,,,
}, },
{ {
Name: "BadCommaComment", Name: "BadCommaComment",
Comma: "X", Separator: "X",
Comment: "X", Comment: "X",
Error: new Error(ERR_INVALID_DELIM), Error: new Error(ERR_INVALID_DELIM),
}, },
@ -449,13 +450,13 @@ for (const t of testCases) {
Deno.test({ Deno.test({
name: `[CSV] ${t.Name}`, name: `[CSV] ${t.Name}`,
async fn(): Promise<void> { async fn(): Promise<void> {
let comma = ","; let separator = ",";
let comment: string | undefined; let comment: string | undefined;
let fieldsPerRec: number | undefined; let fieldsPerRec: number | undefined;
let trim = false; let trim = false;
let lazyquote = false; let lazyquote = false;
if (t.Comma) { if (t.Separator) {
comma = t.Comma; separator = t.Separator;
} }
if (t.Comment) { if (t.Comment) {
comment = t.Comment; comment = t.Comment;
@ -475,7 +476,7 @@ for (const t of testCases) {
await readMatrix( await readMatrix(
new BufReader(new StringReader(t.Input ?? "")), new BufReader(new StringReader(t.Input ?? "")),
{ {
comma: comma, separator,
comment: comment, comment: comment,
trimLeadingSpace: trim, trimLeadingSpace: trim,
fieldsPerRecord: fieldsPerRec, fieldsPerRecord: fieldsPerRec,
@ -489,7 +490,7 @@ for (const t of testCases) {
actual = await readMatrix( actual = await readMatrix(
new BufReader(new StringReader(t.Input ?? "")), new BufReader(new StringReader(t.Input ?? "")),
{ {
comma: comma, separator,
comment: comment, comment: comment,
trimLeadingSpace: trim, trimLeadingSpace: trim,
fieldsPerRecord: fieldsPerRec, fieldsPerRecord: fieldsPerRec,
@ -507,19 +508,19 @@ const parseTestCases = [
{ {
name: "simple", name: "simple",
in: "a,b,c", in: "a,b,c",
header: false, skipFirstRow: false,
result: [["a", "b", "c"]], result: [["a", "b", "c"]],
}, },
{ {
name: "simple Bufreader", name: "simple Bufreader",
in: new BufReader(new StringReader("a,b,c")), in: new BufReader(new StringReader("a,b,c")),
header: false, skipFirstRow: false,
result: [["a", "b", "c"]], result: [["a", "b", "c"]],
}, },
{ {
name: "multiline", name: "multiline",
in: "a,b,c\ne,f,g\n", in: "a,b,c\ne,f,g\n",
header: false, skipFirstRow: false,
result: [ result: [
["a", "b", "c"], ["a", "b", "c"],
["e", "f", "g"], ["e", "f", "g"],
@ -528,13 +529,13 @@ const parseTestCases = [
{ {
name: "header mapping boolean", name: "header mapping boolean",
in: "a,b,c\ne,f,g\n", in: "a,b,c\ne,f,g\n",
header: true, skipFirstRow: true,
result: [{ a: "e", b: "f", c: "g" }], result: [{ a: "e", b: "f", c: "g" }],
}, },
{ {
name: "header mapping array", name: "header mapping array",
in: "a,b,c\ne,f,g\n", in: "a,b,c\ne,f,g\n",
header: ["this", "is", "sparta"], columns: ["this", "is", "sparta"],
result: [ result: [
{ this: "a", is: "b", sparta: "c" }, { this: "a", is: "b", sparta: "c" },
{ this: "e", is: "f", sparta: "g" }, { this: "e", is: "f", sparta: "g" },
@ -543,7 +544,7 @@ const parseTestCases = [
{ {
name: "header mapping object", name: "header mapping object",
in: "a,b,c\ne,f,g\n", in: "a,b,c\ne,f,g\n",
header: [{ name: "this" }, { name: "is" }, { name: "sparta" }], columns: [{ name: "this" }, { name: "is" }, { name: "sparta" }],
result: [ result: [
{ this: "a", is: "b", sparta: "c" }, { this: "a", is: "b", sparta: "c" },
{ this: "e", is: "f", sparta: "g" }, { this: "e", is: "f", sparta: "g" },
@ -552,7 +553,7 @@ const parseTestCases = [
{ {
name: "header mapping parse entry", name: "header mapping parse entry",
in: "a,b,c\ne,f,g\n", in: "a,b,c\ne,f,g\n",
header: [ columns: [
{ {
name: "this", name: "this",
parse: (e: string): string => { parse: (e: string): string => {
@ -583,7 +584,7 @@ const parseTestCases = [
parse: (e: string[]): unknown => { parse: (e: string[]): unknown => {
return { super: e[0], street: e[1], fighter: e[2] }; return { super: e[0], street: e[1], fighter: e[2] };
}, },
header: false, skipFirstRow: false,
result: [ result: [
{ super: "a", street: "b", fighter: "c" }, { super: "a", street: "b", fighter: "c" },
{ super: "e", street: "f", fighter: "g" }, { super: "e", street: "f", fighter: "g" },
@ -592,7 +593,7 @@ const parseTestCases = [
{ {
name: "header mapping object parseline", name: "header mapping object parseline",
in: "a,b,c\ne,f,g\n", in: "a,b,c\ne,f,g\n",
header: [{ name: "this" }, { name: "is" }, { name: "sparta" }], columns: [{ name: "this" }, { name: "is" }, { name: "sparta" }],
parse: (e: Record<string, unknown>): unknown => { parse: (e: Record<string, unknown>): unknown => {
return { super: e.this, street: e.is, fighter: e.sparta }; return { super: e.this, street: e.is, fighter: e.sparta };
}, },
@ -601,6 +602,20 @@ const parseTestCases = [
{ super: "e", street: "f", fighter: "g" }, { super: "e", street: "f", fighter: "g" },
], ],
}, },
{
name: "provides both opts.skipFirstRow and opts.columns",
in: "a,b,1\nc,d,2\ne,f,3",
skipFirstRow: true,
columns: [
{ name: "foo" },
{ name: "bar" },
{ name: "baz", parse: (e: string) => Number(e) },
],
result: [
{ foo: "c", bar: "d", baz: 2 },
{ foo: "e", bar: "f", baz: 3 },
],
},
]; ];
for (const testCase of parseTestCases) { for (const testCase of parseTestCases) {
@ -608,7 +623,8 @@ for (const testCase of parseTestCases) {
name: `[CSV] Parse ${testCase.name}`, name: `[CSV] Parse ${testCase.name}`,
async fn(): Promise<void> { async fn(): Promise<void> {
const r = await parse(testCase.in, { const r = await parse(testCase.in, {
header: testCase.header, skipFirstRow: testCase.skipFirstRow,
columns: testCase.columns,
parse: testCase.parse as (input: unknown) => unknown, parse: testCase.parse as (input: unknown) => unknown,
}); });
assertEquals(r, testCase.result); assertEquals(r, testCase.result);