使用nodejs中的csv-parse模块忽略引号内的内容
ignore what's inside quotes with csv-parse module in nodejs
我正在尝试编写一个 csv 解析器来处理一些非常复杂的数据,但我无法让 csv-parse 模块忽略引号内的定界符。除了双引号本身,我无法控制数据的来源,所以我无法真正使用转义字符来解决它。解析器首先解析带有“|”的行,然后是“@”,然后是“;”。
这是我正在尝试处理的数据示例:
recognitionDate | training
"2021-08-17" | "01|0009";"random string";"2"@"01|0009";"random string 2";"2"
目前,由于其中的“|”,解析会在 training 列中“@”之后的“01|0009”处被错误地拆分。
这是解析器的代码:
/**
 * Parse delimited text through `parseCsv` and collect every record it emits.
 *
 * @param {string} data - Raw text handed to `parseCsv`.
 * @param {string} delimiter - Field delimiter passed to the parser.
 * @returns {Promise<Array>} Resolves with the list of records when the
 *   delimiter is '|'; for any other delimiter the records are flattened one
 *   level into a single list of fields. Rejects with the parser's error if
 *   the parser emits 'error'.
 */
const getData = (data, delimiter) => {
  return new Promise((resolve, reject) => {
    const records = []
    // Right-trimming is enabled only for delimiters other than '|' and '@'.
    const rtrim = delimiter !== '|' && delimiter !== '@'
    const parser = parseCsv(data, { delimiter, relax: true, ltrim: true, rtrim })
    parser.on('readable', () => {
      // Drain what is buffered; the loop stops at the first falsy read().
      let record
      while((record = parser.read())){
        records.push(record)
      }
    })
    parser.on('end', () => {
      resolve(delimiter === '|' ? records : records.flat(1))
    })
    // Settle the promise on failure. Only logging the error here would leave
    // the promise pending forever and hang every caller that awaits it.
    parser.on('error', reject)
  })
}
我是这样使用它的:
/**
 * Build one risk object from a parsed row.
 *
 * Every non-empty cell is split on '@' into segments and each segment on ';'
 * into values (both through `getData`). The result key for a cell is its
 * header with whitespace runs collapsed, trimmed, and 's' appended.
 *
 * @param {string[]} row - Cells of one record.
 * @param {string[]} header - Column names, index-aligned with `row`.
 * @returns {Promise<Object>} Maps each key either to an array of
 *   `{ index0, index1, ... }` objects (one per multi-value segment) or, when
 *   the cell contains a single-value segment, to that normalized string.
 */
const buildRisk = async (row, header) => {
  // Collapse whitespace runs to one space and strip both ends.
  const normalize = text => text.replace(/\s+/g, ' ').trim()
  const riskObj = {}
  // iterate through each element of each row
  for(let i = 0; i < row.length; i++){
    if(!row[i]) continue
    const segments = await getData(row[i], '@')
    const mainIndex = normalize(header[i]) + 's'
    const entries = []
    let scalar
    for(const segment of segments){
      const values = await getData(segment, ';')
      // A segment that parses to nothing carries no value; skipping it avoids
      // calling .replace on undefined.
      if(values.length === 0) continue
      if(values.length > 1){
        // Fill the entry directly instead of addressing it by segment index,
        // which goes out of step as soon as one segment adds no entry.
        const entry = {}
        values.forEach((value, k) => {
          entry['index' + k] = normalize(value)
        })
        entries.push(entry)
      }
      else{
        // A single-value segment makes the whole cell a plain string; the
        // last such segment wins.
        scalar = normalize(values[0])
      }
    }
    // Assign once at the end so a scalar never replaces the array while
    // entries are still being pushed onto it.
    riskObj[mainIndex] = scalar === undefined ? entries : scalar
  }
  return riskObj
}
/**
 * Entry point: parse the outer `data` text on '|', treat the first record as
 * the header, log the remaining records, then build and log one risk object
 * per record. Returns early when parsing yields no records.
 */
const main = async () => {
  // Split the raw input into records on the column delimiter.
  const records = await getData(data, '|')
  if(!records.length) return
  // The first record holds the column names; it is removed from the list.
  const columnNames = records.shift()
  console.log(records)
  // Process the remaining records one at a time, in order.
  for(let idx = 0; idx < records.length; idx++){
    const riskObj = await buildRisk(records[idx], columnNames)
    console.log(riskObj)
  }
}
这是来自模块的错误:
CsvError: Invalid Record Length: expect 2, got 3 on line 2
at Parser.__onRecord (/home/luders/teste/node_modules/csv-parse/lib/index.js:773:9)
at Parser.__parse (/home/luders/teste/node_modules/csv-parse/lib/index.js:662:38)
at Parser._flush (/home/luders/teste/node_modules/csv-parse/lib/index.js:483:22)
at Parser.prefinish (internal/streams/transform.js:147:10)
at Parser.emit (events.js:375:28)
at prefinish (internal/streams/writable.js:630:14)
at finishMaybe (internal/streams/writable.js:638:5)
at Parser.Writable.end (internal/streams/writable.js:582:5)
at Immediate._onImmediate (/home/luders/teste/node_modules/csv-parse/lib/index.js:1186:16)
at processImmediate (internal/timers.js:464:21) {
code: 'CSV_INCONSISTENT_RECORD_LENGTH',
comment_lines: 0,
empty_lines: 0,
invalid_field_length: 0,
lines: 2,
records: 1,
columns: false,
error: undefined,
header: false,
index: 3,
column: 3,
quoting: false,
record: [
'"2021-08-17" ',
'"01|0009";"random string";"2"@"01',
'0009";"random string 2";"2"'
]
}
如您所见,它在应该解析 2 列的时候解析了 3 列。
如何让它忽略双引号内的内容?
我认为这不是转义 csv 的有效方法。整个字段需要用引号括起来,其中的引号也应该作为双引号转义。你需要在这里做一些预处理。 csv 的有效格式为:
recognitionDate | training
"2021-08-17" | " ""01|0009"";""random string"";""2""@""01|0009"";""random string 2"";""2"" "
我正在尝试编写一个 csv 解析器来处理一些非常复杂的数据,但我无法让 csv-parse 模块忽略引号内的定界符。除了双引号本身,我无法控制数据的来源,所以我无法真正使用转义字符来解决它。解析器首先解析带有“|”的行,然后是“@”,然后是“;”。
这是我正在尝试处理的数据示例:
recognitionDate | training
"2021-08-17" | "01|0009";"random string";"2"@"01|0009";"random string 2";"2"
目前,由于其中的“|”,解析会在 training 列中“@”之后的“01|0009”处被错误地拆分。
这是解析器的代码:
/**
 * Parse delimited text through `parseCsv` and collect every record it emits.
 *
 * @param {string} data - Raw text handed to `parseCsv`.
 * @param {string} delimiter - Field delimiter passed to the parser.
 * @returns {Promise<Array>} Resolves with the list of records when the
 *   delimiter is '|'; for any other delimiter the records are flattened one
 *   level into a single list of fields. Rejects with the parser's error if
 *   the parser emits 'error'.
 */
const getData = (data, delimiter) => {
  return new Promise((resolve, reject) => {
    const records = []
    // Right-trimming is enabled only for delimiters other than '|' and '@'.
    const rtrim = delimiter !== '|' && delimiter !== '@'
    const parser = parseCsv(data, { delimiter, relax: true, ltrim: true, rtrim })
    parser.on('readable', () => {
      // Drain what is buffered; the loop stops at the first falsy read().
      let record
      while((record = parser.read())){
        records.push(record)
      }
    })
    parser.on('end', () => {
      resolve(delimiter === '|' ? records : records.flat(1))
    })
    // Settle the promise on failure. Only logging the error here would leave
    // the promise pending forever and hang every caller that awaits it.
    parser.on('error', reject)
  })
}
我是这样使用它的:
/**
 * Build one risk object from a parsed row.
 *
 * Every non-empty cell is split on '@' into segments and each segment on ';'
 * into values (both through `getData`). The result key for a cell is its
 * header with whitespace runs collapsed, trimmed, and 's' appended.
 *
 * @param {string[]} row - Cells of one record.
 * @param {string[]} header - Column names, index-aligned with `row`.
 * @returns {Promise<Object>} Maps each key either to an array of
 *   `{ index0, index1, ... }` objects (one per multi-value segment) or, when
 *   the cell contains a single-value segment, to that normalized string.
 */
const buildRisk = async (row, header) => {
  // Collapse whitespace runs to one space and strip both ends.
  const normalize = text => text.replace(/\s+/g, ' ').trim()
  const riskObj = {}
  // iterate through each element of each row
  for(let i = 0; i < row.length; i++){
    if(!row[i]) continue
    const segments = await getData(row[i], '@')
    const mainIndex = normalize(header[i]) + 's'
    const entries = []
    let scalar
    for(const segment of segments){
      const values = await getData(segment, ';')
      // A segment that parses to nothing carries no value; skipping it avoids
      // calling .replace on undefined.
      if(values.length === 0) continue
      if(values.length > 1){
        // Fill the entry directly instead of addressing it by segment index,
        // which goes out of step as soon as one segment adds no entry.
        const entry = {}
        values.forEach((value, k) => {
          entry['index' + k] = normalize(value)
        })
        entries.push(entry)
      }
      else{
        // A single-value segment makes the whole cell a plain string; the
        // last such segment wins.
        scalar = normalize(values[0])
      }
    }
    // Assign once at the end so a scalar never replaces the array while
    // entries are still being pushed onto it.
    riskObj[mainIndex] = scalar === undefined ? entries : scalar
  }
  return riskObj
}
/**
 * Entry point: parse the outer `data` text on '|', treat the first record as
 * the header, log the remaining records, then build and log one risk object
 * per record. Returns early when parsing yields no records.
 */
const main = async () => {
  // Split the raw input into records on the column delimiter.
  const records = await getData(data, '|')
  if(!records.length) return
  // The first record holds the column names; it is removed from the list.
  const columnNames = records.shift()
  console.log(records)
  // Process the remaining records one at a time, in order.
  for(let idx = 0; idx < records.length; idx++){
    const riskObj = await buildRisk(records[idx], columnNames)
    console.log(riskObj)
  }
}
这是来自模块的错误:
CsvError: Invalid Record Length: expect 2, got 3 on line 2
at Parser.__onRecord (/home/luders/teste/node_modules/csv-parse/lib/index.js:773:9)
at Parser.__parse (/home/luders/teste/node_modules/csv-parse/lib/index.js:662:38)
at Parser._flush (/home/luders/teste/node_modules/csv-parse/lib/index.js:483:22)
at Parser.prefinish (internal/streams/transform.js:147:10)
at Parser.emit (events.js:375:28)
at prefinish (internal/streams/writable.js:630:14)
at finishMaybe (internal/streams/writable.js:638:5)
at Parser.Writable.end (internal/streams/writable.js:582:5)
at Immediate._onImmediate (/home/luders/teste/node_modules/csv-parse/lib/index.js:1186:16)
at processImmediate (internal/timers.js:464:21) {
code: 'CSV_INCONSISTENT_RECORD_LENGTH',
comment_lines: 0,
empty_lines: 0,
invalid_field_length: 0,
lines: 2,
records: 1,
columns: false,
error: undefined,
header: false,
index: 3,
column: 3,
quoting: false,
record: [
'"2021-08-17" ',
'"01|0009";"random string";"2"@"01',
'0009";"random string 2";"2"'
]
}
如您所见,它在应该解析 2 列的时候解析了 3 列。
如何让它忽略双引号内的内容?
我认为这不是转义 csv 的有效方法。整个字段需要用引号括起来,其中的引号也应该作为双引号转义。你需要在这里做一些预处理。 csv 的有效格式为:
recognitionDate | training
"2021-08-17" | " ""01|0009"";""random string"";""2""@""01|0009"";""random string 2"";""2"" "