使用nodejs中的csv-parse模块忽略引号内的内容

ignore what's inside quotes with csv-parse module in nodejs

我正在尝试编写一个 csv 解析器来处理一些非常复杂的数据,但我无法让 csv-parse 模块忽略引号内的定界符。除了双引号本身,我无法控制数据的来源,所以我无法真正使用转义字符来解决它。解析器首先按“|”拆分各行,然后按“@”拆分,最后按“;”拆分。

这是我正在尝试处理的数据示例:

recognitionDate | training
"2021-08-17"    | "01|0009";"random string";"2"@"01|0009";"random string 2";"2"

目前,由于引号内的“|”,解析器无法正确处理 training 列中“@”之后的“01|0009”。

这是解析器的代码:

/**
 * Parse delimited text with csv-parse and collect every record it emits.
 *
 * @param {string} data - Raw text to parse.
 * @param {string} delimiter - Field delimiter passed to the parser.
 * @returns {Promise<Array>} For the '|' delimiter, an array of records (one
 *   array of fields per record). For any other delimiter, the records
 *   flattened one level into a single array of fields.
 *   The promise rejects with the parser's error if parsing fails.
 */
const getData = (data, delimiter) => {
  return new Promise((resolve, reject) => {
    const rows = []

    // Trailing whitespace is trimmed for every delimiter except '|' and '@'.
    const rtrim = delimiter !== '|' && delimiter !== '@'

    const parser = parseCsv(
      data,
      {
        delimiter,
        relax: true,
        ltrim: true,
        rtrim,
      }
    )

    parser.on('readable', () => {
      // Drain everything currently buffered; read() returns a falsy value when empty.
      let record
      while ((record = parser.read())) {
        rows.push(record)
      }
    })

    parser.on('end', () => {
      resolve(
        delimiter === '|' ?
        rows :
        rows.flat(1)
      )
    })

    // Settle the promise on failure; merely logging here would leave every
    // caller awaiting a promise that never resolves.
    parser.on('error', reject)
  })
}

我是这样使用它的:

/**
 * Build one result object from a single parsed row.
 *
 * Each non-empty cell is split on '@' into fields, and each field is split on
 * ';' into values. The result key is the matching header with whitespace
 * collapsed and trimmed, plus an 's' suffix.
 *
 * - A field with more than one value becomes an object
 *   `{ index0, index1, ... }` appended to the array stored under the key.
 * - A field with exactly one value replaces the key's value with that string.
 * - A field that parses to no values is skipped.
 *
 * @param {string[]} row - Cells of one record.
 * @param {string[]} header - Column names, index-aligned with `row`.
 * @returns {Promise<Object>} The assembled object.
 */
const buildRisk = async (row, header) => {
  // Collapse runs of whitespace to a single space and trim both ends.
  const normalize = text => text.replace(/\s+/g, ' ').trim()

  const riskObj = {}

  // iterate through each element of the row
  for (let i = 0; i < row.length; i++) {
    if (!row[i]) continue

    const fields = await getData(row[i], '@')
    const mainIndex = normalize(header[i]) + 's'
    riskObj[mainIndex] = []

    for (const field of fields) {
      const objData = await getData(field, ';')

      // An empty field (e.g. two adjacent '@') yields no values; skip it
      // instead of dereferencing objData[0], mirroring the empty-cell check above.
      if (objData.length === 0) continue

      if (objData.length > 1) {
        // Build the entry first so its position never depends on the loop index.
        const entry = {}
        objData.forEach((value, k) => {
          entry['index' + k] = normalize(value)
        })
        riskObj[mainIndex].push(entry)
      } else {
        // NOTE(review): a single-value field replaces the array with a string,
        // so a later multi-value field in the same cell would fail on push —
        // confirm cells never mix both shapes.
        riskObj[mainIndex] = normalize(objData[0])
      }
    }
  }

  return riskObj
}

/**
 * Entry point: parse the module-level `data` by '|', treat the first record
 * as the header, log the remaining records, then build and log one object
 * per record, one after another.
 */
const main = async () => {
  // split the whole input into records
  const records = await getData(data, '|')
  if (!records.length) return

  // the first record is the header; the rest are data records
  const [header, ...body] = records
  console.log(body)

  // process the records sequentially so the output order matches the input order
  for (let idx = 0; idx < body.length; idx++) {
    const risk = await buildRisk(body[idx], header)
    console.log(risk)
  }
}

这是来自模块的错误:

CsvError: Invalid Record Length: expect 2, got 3 on line 2
    at Parser.__onRecord (/home/luders/teste/node_modules/csv-parse/lib/index.js:773:9)
    at Parser.__parse (/home/luders/teste/node_modules/csv-parse/lib/index.js:662:38)
    at Parser._flush (/home/luders/teste/node_modules/csv-parse/lib/index.js:483:22)
    at Parser.prefinish (internal/streams/transform.js:147:10)
    at Parser.emit (events.js:375:28)
    at prefinish (internal/streams/writable.js:630:14)
    at finishMaybe (internal/streams/writable.js:638:5)
    at Parser.Writable.end (internal/streams/writable.js:582:5)
    at Immediate._onImmediate (/home/luders/teste/node_modules/csv-parse/lib/index.js:1186:16)
    at processImmediate (internal/timers.js:464:21) {
  code: 'CSV_INCONSISTENT_RECORD_LENGTH',
  comment_lines: 0,
  empty_lines: 0,
  invalid_field_length: 0,
  lines: 2,
  records: 1,
  columns: false,
  error: undefined,
  header: false,
  index: 3,
  column: 3,
  quoting: false,
  record: [
    '"2021-08-17"    ',
    '"01|0009";"random string";"2"@"01',
    '0009";"random string 2";"2"'
  ]
}

如您所见,它在应该解析 2 列的时候解析了 3 列。

如何让它忽略双引号内的内容?

我认为这不是转义 csv 的有效方法。整个字段需要用引号括起来,字段内部的引号则应写成两个连续的双引号来转义。你需要在这里做一些预处理。csv 的有效格式为:

recognitionDate | training
"2021-08-17"    | " ""01|0009"";""random string"";""2""@""01|0009"";""random string 2"";""2"" "