检测 csv 上的重复数据

Detect duplicate data on csv

我在 Node.js 中使用 fast-csv 包,目前有下面这个解析函数:它读取 CSV 文件、重新映射表头(headers)、遍历每一行,并根据行数据触发相应的事件。

validateRows: (filePath, payload, validators) => new Promise((resolve, reject) => {        
        const invalidRecords = [];
        const validRecords = [];

        fs.createReadStream(filePath)
            .pipe(csv.parse({
                headers: (headers) => mapHeaderToRelated(headers, payload), delimiter: ";", discardUnmappedColumns: true
            }))
            .validate((data, cb) => {
                const errors = validators.reduce((err, func) => [...err, ...func(data)], []);

                if (errors.length > 0) {
                    return cb(null, false, errors);
                }

                return cb(null, true);
            })
            .on("error", (error) => {
                console.log("There is some error");
                reject(error);
            })
            .on("data", (row) => {
                validRecords.push(row);
            })
            .on("data-invalid", (row, rowNumber, reason) => {
                invalidRecords.push({
                    data: row,
                    rowNumber: rowNumber,
                    reason: reason
                });
            })
            .on("end", (rowCount) => {
                console.log(`Parsed ${rowCount} rows. Valid Count: ${validRecords.length} Invalid Count: ${invalidRecords.length}`);

                resolve({
                    invalidRecords,
                    validRecords
                });
            });
    }),

我需要检测电话号码(phone)重复出现的记录。如果存在重复,比如多行具有相同的 phone 号码,这些行都应该被视为无效,并被推送到无效记录数组中。

Example CSV:

| name   | surname | gender  | phone  | 
| ------ | ------- | -------- | -----  |
| John   | Doe     | Male     | 123456 |
| Joh    | Deo     | Unknown  | 123456 |
| Jane   | Doe     | Female   | 999999 |

我想要的已解析 CSV 的输出:

{
 validRecords: [ 
   {
     name: Jane
     surname: Doe
     gender: Female
     phone: 999999
   }
 ]

 invalidRecords: [ 
   {
     data: {
       name: John
       surname: Doe
       gender: Male
       phone: 123456 
     }
     rowNumber: 1,
     reason: ["Duplicate data"]
   },
   {
     data: {
       name: Joh
       surname: Deo
       gender: Unknown
       phone: 123456 
     }
     rowNumber: 2,
     reason: ["Duplicate data"]
   }
 ]
}

我该如何解决这个问题?

我已经用下面的代码和一些辅助函数扩展了我的 on("end") 事件处理函数,问题暂时解决了。

.on("end", (rowCount) => {
                console.log(`Parsed ${rowCount} rows. Valid Count: ${validCustomers.length} Invalid Count: ${invalidCustomers.length}`);
                
                const allCustomers = [...invalidCustomers, ...validCustomers];

                const duplicateNumbers = findDuplicatePhoneNumbers(allCustomers);

                flagDuplicateCustomers(allCustomers, duplicateNumbers);
                
                // Valid but duplicate customers are pushed to the invalid customers and reason set to "Duplicate"
                const validButDuplicateCustomers = getDuplicateCustomers(validCustomers);
                validButDuplicateCustomers.forEach((c) => {
                    invalidCustomers.push({
                        data: c,
                        reason: ["Duplicate"]
                    });
                });
                
                // Add reason "Duplicate" for Invalid and Duplicate customers
                const invalidAndDuplicateCustomers = getDuplicateCustomers(invalidCustomers);
                invalidAndDuplicateCustomers.forEach((c) => {
                    if (c.reason) {
                        c.reason = [...c.reason, "Duplicate"];
                    }
                });
                
                const validAndNotDuplicate = getNonDuplicateCustomers(validCustomers);

                resolve({
                    invalidCustomers: invalidCustomers,
                    validCustomers: validAndNotDuplicate
                });
            });

辅助方法是

/**
 * Picks out the customers that carry a truthy `isDuplicate` flag.
 *
 * @param {Array<Object>} customers - Records to inspect.
 * @returns {Array<Object>} New array holding only the flagged records.
 */
const getDuplicateCustomers = (customers) => {
    const isFlagged = (entry) => entry.isDuplicate;

    return customers.filter(isFlagged);
};

/**
 * Keeps the records that are not flagged as duplicates, i.e. whose
 * `isDuplicate` property is missing or falsy.
 *
 * @param {Array<Object>} records - Records to inspect.
 * @returns {Array<Object>} New array holding only the unflagged records.
 */
const getNonDuplicateCustomers = (records) => {
    return records.filter(({ isDuplicate }) => !isDuplicate);
};

/**
 * Collects the phone numbers that are shared by more than one customer.
 *
 * Occurrences are counted per exact `customer_phone` value instead of sorting
 * and comparing neighbours: a numeric comparator cannot order non-numeric
 * strings (or distinct strings with the same numeric value, e.g. "0123" and
 * "123"), so identical numbers were not guaranteed to end up adjacent.
 * The input array is left untouched.
 *
 * @param {Array<Object>} customers - Records exposing a `customer_phone` property.
 * @returns {Array} Each duplicated phone number once, in order of first appearance.
 */
const findDuplicatePhoneNumbers = (customers) => {
    const occurrences = new Map();

    customers.forEach((customer) => {
        const phone = customer.customer_phone;

        // A missing phone number is not a value two records can share.
        if (phone === undefined || phone === null) {
            return;
        }

        occurrences.set(phone, (occurrences.get(phone) || 0) + 1);
    });

    return [...occurrences]
        .filter(([, count]) => count > 1)
        .map(([phone]) => phone);
};

/**
 * Marks, in place, every customer whose phone number is listed as duplicated.
 *
 * @param {Array<Object>} customers - Records exposing a `customer_phone` property;
 *   matching records get `isDuplicate = true`.
 * @param {Array} duplicateNumbers - Phone numbers considered duplicated. When it
 *   is missing or empty nothing is modified.
 * @returns {undefined}
 */
const flagDuplicateCustomers = (customers, duplicateNumbers) => {
    const nothingToFlag = !duplicateNumbers || duplicateNumbers.length === 0;
    if (nothingToFlag) {
        return;
    }

    const hasDuplicatedPhone = (entry) => duplicateNumbers.includes(entry.customer_phone);
    const markAsDuplicate = (entry) => {
        entry.isDuplicate = true;
    };

    customers.filter(hasDuplicatedPhone).forEach(markAsDuplicate);
};