npm package csvtojson CSV Parse Error: Error: unclosed_quote

npm package csvtojson CSV Parse Error: Error: unclosed_quote

在尝试处理大型 CSV 文件(大约 130 万条记录)时,我在部分记录(例如 400 多条)被成功处理之后遇到了此错误。从 CSV 文件来看,我没有发现数据格式有任何问题,但解析器引发此错误,可能是因为在列/字段值中发现了“\n”字符。

任何帮助将不胜感激。

我试了一下,可以利用 csvtojson 的 CSV 文件行钩子(preFileLine,见文档的 csv-file-line-hook 一节)来处理:在钩子中您可以检查无效行,并修复它们或直接将其置空。

下面的示例将简单地跳过无效行(缺少结束引号)

example.js

// Streams test.csv through csvtojson and blanks out every raw line that the
// heuristic pattern below flags as missing its closing quote, so the parser
// does not see it.
const fs = require("fs");

const csvStream = fs.createReadStream("test.csv");
let skippedLines = 0;

const csvtojson = require("csvtojson");

// A line is flagged when it starts with a quote character and later contains
// a non-quote character immediately followed by the ";" delimiter.
const INVALID_LINE_PATTERN = /^['"].*[^"'];/;

// preFileLine hook: returns the raw line unchanged, or "" for a flagged line.
const blankInvalidLine = (rawLine, index) => {
    if (!INVALID_LINE_PATTERN.test(rawLine)) {
        return rawLine;
    }
    console.log(`Line #${index + 1} is invalid, skipping:`, rawLine);
    skippedLines += 1;
    return "";
};

// Called once per parsed row.
const printRow = (row) => {
    console.log(row);
};

// Called if parsing fails.
const reportError = (error) => {
    console.error("Error:", error);
};

// Called once the whole stream has been consumed.
const reportDone = () => {
    console.log("Skipped lines:", skippedLines);
    console.log("Success");
};

csvtojson({ "delimiter": ";", "fork": true })
    .preFileLine(blankInvalidLine)
    .fromStream(csvStream)
    .subscribe(printRow, reportError, reportDone);

test.csv

Name;Age;Profession
Bob;34;"Sales,Marketing"
Sarah;31;"Software Engineer"
James;45;Driver
"Billy, ;35;Manager
"Timothy;23;"QA

这个正则表达式效果更好

/^(?:[^"\\]|\\.|"(?:\\.|[^"\\])*")*$/g

下面是一个更复杂、适用于大文件的可用脚本,它逐行读取文件并逐行转换

import csv from 'csvtojson'
import fs from 'fs-extra'
import lineReader from 'line-reader'

import { __dirname } from '../../../utils.js'

/**
 * Matches text in which every double quote is either backslash-escaped or
 * belongs to a closed "..." pair. Deliberately not global: a /g regex keeps
 * `lastIndex` state between `.test()` calls.
 */
const BALANCED_QUOTES_PATTERN = /^(?:[^"\\]|\\.|"(?:\\.|[^"\\])*")*$/

/**
 * Tells whether `text` contains no unclosed / unescaped double quote.
 *
 * @param {string} text - Text to check.
 * @returns {boolean} true when all quotes are balanced or escaped.
 */
const hasBalancedQuotes = (text) => BALANCED_QUOTES_PATTERN.test(text)

/**
 * Reads the file `dumb` line by line, converts each line to an object with
 * csvtojson (using `headers` as the header row), strips the double quotes
 * from its JSON serialisation and appends the result to `editDumb`.
 * Lines with unbalanced quotes are logged and skipped.
 *
 * Lines are processed strictly one after another, so appends keep file order
 * and the returned promise settles only after the last append has finished.
 *
 * NOTE(review): `log` and `BaseError` are not imported in the visible part of
 * this file — presumably globals or imports defined elsewhere; confirm.
 *
 * @param {string} dumb - Path of the CSV file to read.
 * @param {string} editDumb - Path of the file the converted lines are appended to.
 * @param {string} headers - Header row prepended to every line before parsing
 *     (presumably already joined with the configured delimiter — verify against caller).
 * @param {object} [config]
 * @param {object} [config.options] - csvtojson parser options.
 * @returns {Promise<void>}
 * @throws {BaseError} when reading, converting or writing fails.
 */
const CSV2JSON = async(dumb, editDumb, headers, {
    options = {
        trim: true,
        delimiter: '|',
        quote: '"',
        escape: '"',
        fork: true,
        headers: headers
    }
} = {}) => {
    try {
        log(`\n\nStarting CSV2JSON - Current directory: ${__dirname()} - Please wait..`)

        await new Promise((resolve, reject) => {
            let counter = 0

            // Converts one raw file line and appends the result to editDumb.
            const convertLine = async(line, lineNumber) => {
                let rows
                try {
                    // Awaiting the converter rejects on parse errors, so no
                    // 'error' listener is needed (throwing inside a listener
                    // would surface as an uncaught exception instead).
                    rows = await csv(options)
                        .preFileLine((fileLineString, lineIdx) => {
                            // The first line of the file is never validated.
                            if (lineNumber === 1 || hasBalancedQuotes(fileLineString)) {
                                return fileLineString
                            }
                            // eslint-disable-next-line max-len
                            console.log(`Line #${lineIdx + 1} is invalid. It has unescaped quotes. We will skip this line.. Invalid Line: ${fileLineString}`)
                            return ''
                        })
                        // '\n' (not '\n\r') so the data row does not start with a stray '\r'.
                        .fromString(headers + '\n' + line)
                } catch (e) {
                    throw new BaseError(`Error while converting CSV to JSON.
                            Line before convert: ${line}
                            Error: ${e}`)
                }

                const row = rows[0]
                if (!row) {
                    return
                }

                const json = JSON.stringify(row).replace(/"/g, '')
                if (hasBalancedQuotes(json)) {
                    await fs.appendFile(editDumb, json)
                }
            }

            lineReader.eachLine(
                dumb,
                // Three-argument iteratee: line-reader waits for next() before
                // reading the following line.
                (line, last, next) => {
                    counter++
                    convertLine(line, counter).then(
                        () => next(),
                        (e) => {
                            reject(e)
                            next(false) // stop reading after the first failure
                        }
                    )
                },
                // Completion callback: also fires for an empty file, where the
                // iteratee is never invoked.
                (err) => {
                    if (err) {
                        reject(err)
                        return
                    }
                    resolve()
                }
            )
        })
    } catch (e) {
        throw new BaseError(`Error while converting CSV to JSON - Error: ${e}`)
    }
}

export { CSV2JSON }