Detect duplicate data in a CSV file
Using node.js with the fast-csv package, I currently have this parsing function: it reads a CSV file, remaps the headers, iterates over every row, and emits events based on the row data.
// fs and fast-csv are assumed to be required at module scope:
const fs = require("fs");
const csv = require("fast-csv");

validateRows: (filePath, payload, validators) => new Promise((resolve, reject) => {
  const invalidRecords = [];
  const validRecords = [];
  fs.createReadStream(filePath)
    .pipe(csv.parse({
      headers: (headers) => mapHeaderToRelated(headers, payload),
      delimiter: ";",
      discardUnmappedColumns: true
    }))
    .validate((data, cb) => {
      // Collect the errors from every validator; an empty array means the row is valid.
      const errors = validators.reduce((err, func) => [...err, ...func(data)], []);
      if (errors.length > 0) {
        return cb(null, false, errors);
      }
      return cb(null, true);
    })
    .on("error", (error) => {
      console.log("There is some error");
      reject(error);
    })
    .on("data", (row) => {
      validRecords.push(row);
    })
    .on("data-invalid", (row, rowNumber, reason) => {
      invalidRecords.push({
        data: row,
        rowNumber: rowNumber,
        reason: reason
      });
    })
    .on("end", (rowCount) => {
      console.log(`Parsed ${rowCount} rows. Valid Count: ${validRecords.length} Invalid Count: ${invalidRecords.length}`);
      resolve({
        invalidRecords,
        validRecords
      });
    });
}),
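For context, a call would look something like the following. The requiredFieldsValidator here is hypothetical and only illustrates the contract the reduce above relies on: each validator takes a row and returns an array of error strings (empty when the row passes). It also assumes a payload object for the header mapping is in scope.

// Hypothetical validator, written for illustration: reports any missing required fields.
const requiredFieldsValidator = (row) =>
  ["name", "surname", "phone"]
    .filter((field) => !row[field])
    .map((field) => `Missing ${field}`);

validateRows("./customers.csv", payload, [requiredFieldsValidator])
  .then(({ validRecords, invalidRecords }) => {
    console.log(`valid: ${validRecords.length}, invalid: ${invalidRecords.length}`);
  })
  .catch(console.error);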
I need to detect records whose phone number occurs more than once. If duplicates exist, for example several rows sharing the same phone number, they should all be treated as invalid and pushed into the invalid records array.
Example CSV:
| name | surname | gender | phone |
| ------ | ------- | -------- | ----- |
| John | Doe | Male | 123456 |
| Joh | Deo | Unknown | 123456 |
| Jane | Doe | Female | 999999 |
The output I want for the parsed CSV:
{
  validRecords: [
    {
      name: "Jane",
      surname: "Doe",
      gender: "Female",
      phone: "999999"
    }
  ],
  invalidRecords: [
    {
      data: {
        name: "John",
        surname: "Doe",
        gender: "Male",
        phone: "123456"
      },
      rowNumber: 1,
      reason: ["Duplicate data"]
    },
    {
      data: {
        name: "Joh",
        surname: "Deo",
        gender: "Unknown",
        phone: "123456"
      },
      rowNumber: 2,
      reason: ["Duplicate data"]
    }
  ]
}
How can I solve this?
I extended my on("end") handler with the code below, plus a few helper functions. It solves the problem for now.
.on("end", (rowCount) => {
console.log(`Parsed ${rowCount} rows. Valid Count: ${validCustomers.length} Invalid Count: ${invalidCustomers.length}`);
const allCustomers = [...invalidCustomers, ...validCustomers];
const duplicateNumbers = findDuplicatePhoneNumbers(allCustomers);
flagDuplicateCustomers(allCustomers, duplicateNumbers);
// Valid but duplicate customers are pushed to the invalid customers and reason set to "Duplicate"
const validButDuplicateCustomers = getDuplicateCustomers(validCustomers);
validButDuplicateCustomers.forEach((c) => {
invalidCustomers.push({
data: c,
reason: ["Duplicate"]
});
});
// Add reason "Duplicate" for Invalid and Duplicate customers
const invalidAndDuplicateCustomers = getDuplicateCustomers(invalidCustomers);
invalidAndDuplicateCustomers.forEach((c) => {
if (c.reason) {
c.reason = [...c.reason, "Duplicate"];
}
});
const validAndNotDuplicate = getNonDuplicateCustomers(validCustomers);
resolve({
invalidCustomers: invalidCustomers,
validCustomers: validAndNotDuplicate
});
});
The helper methods are:
const getDuplicateCustomers = (customers) => customers.filter((customer) => customer.isDuplicate);
const getNonDuplicateCustomers = (records) => records.filter((record) => !record.isDuplicate);

const findDuplicatePhoneNumbers = (customers) => {
  const duplicates = [];
  // Note: sort() mutates the input array in place; sorting brings equal numbers next to each other.
  const sortedCustomers = customers.sort((a, b) => a.customer_phone - b.customer_phone);
  sortedCustomers.forEach((customer, index, array) => {
    const nextCustomer = array[index + 1];
    if (!nextCustomer) {
      return;
    }
    // Adjacent records with the same number mean that number is duplicated.
    if (customer.customer_phone === nextCustomer.customer_phone) {
      duplicates.push(customer);
    }
  });
  const duplicatePhoneNumbers = duplicates.map((customer) => customer.customer_phone);
  const uniqueDuplicatePhoneNumbers = [...new Set(duplicatePhoneNumbers)];
  return uniqueDuplicatePhoneNumbers;
};
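A single pass with a Map would avoid the sort (which, as noted, also mutates the input array) and the intermediate arrays. A minimal sketch, under the same assumption that each record exposes a customer_phone field, and named differently so it does not clash with the helper above:

// Single-pass sketch: count occurrences per phone number,
// then keep the numbers seen more than once.
const findDuplicatePhoneNumbersByCount = (customers) => {
  const counts = new Map();
  customers.forEach(({ customer_phone }) => {
    counts.set(customer_phone, (counts.get(customer_phone) || 0) + 1);
  });
  return [...counts.entries()]
    .filter(([, count]) => count > 1)
    .map(([phone]) => phone);
};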
const flagDuplicateCustomers = (customers, duplicateNumbers) => {
  if (!duplicateNumbers) {
    return;
  }
  if (duplicateNumbers.length === 0) {
    return;
  }
  const duplicateCustomers = customers.filter((customer) => duplicateNumbers.includes(customer.customer_phone));
  duplicateCustomers.forEach((customer) => {
    customer.isDuplicate = true;
  });
};
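Since flagDuplicateCustomers calls includes once per customer, the whole pass is quadratic in the worst case. Converting the duplicate list to a Set makes each membership check constant time; a sketch of that variant, again named separately for illustration:

// Variant using a Set so each membership check is O(1) instead of an array scan.
const flagDuplicateCustomersWithSet = (customers, duplicateNumbers) => {
  if (!duplicateNumbers || duplicateNumbers.length === 0) {
    return;
  }
  const duplicateSet = new Set(duplicateNumbers);
  customers.forEach((customer) => {
    if (duplicateSet.has(customer.customer_phone)) {
      customer.isDuplicate = true;
    }
  });
};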