如何使用 ChoETL 比较两个 CSV 文件的添加、更改或删除记录(主要与详细信息)?
How to use ChoETL to compare two CSV files for ADD, CHANGED or DELETED records (Master vs Detail)?
我一直在玩@Cinchoo 出色的 C# ETL 系统。我需要比较两个 CSV 文件,其中一个 CSV 文件被定义为 dynamically growing master table,另一个是 feeder “细节”table.
详细信息 table 在新记录、已更改记录或主 CSV 文件中不再存在(已删除)的记录方面可能存在差异。
输出应该是第三个 table 替换或更新主文件 table - 所以它是一个不断增长的 CSV 文件。
两个 table 都有唯一的 ID 列和一个 header 行。
主 CSV
ID,name
1,Danny
2,Fred
3,Sam
详情
ID,name
1,Danny
<-- record no longer exists
3,Pamela <-- name change
4,Fernando <-- new record
到目前为止,我一直在参考这个 fiddle 和下面的代码:
using System;
using ChoETL;
using System.Linq;
public class Program
{
public static void Main()
{
var input1 = ChoCSVReader.LoadText(csv1).WithFirstLineHeader().ToArray();
var input2 = ChoCSVReader.LoadText(csv2).WithFirstLineHeader().ToArray();
Console.WriteLine("NEW records\n");
using (var output = new ChoCSVWriter(Console.Out).WithFirstLineHeader())
{
output.Write(input2.OfType<ChoDynamicObject>().Except(input1.OfType<ChoDynamicObject>(),
new ChoDynamicObjectEqualityComparer(new string[] { "id" })));
}
Console.WriteLine("\n\nDELETED records\n");
using (var output = new ChoCSVWriter(Console.Out).WithFirstLineHeader())
{
output.Write(input1.OfType<ChoDynamicObject>().Except(input2.OfType<ChoDynamicObject>(),
new ChoDynamicObjectEqualityComparer(new string[] { "id" })));
}
Console.WriteLine("\n\nCHANGED records\n");
using (var output = new ChoCSVWriter(Console.Out).WithFirstLineHeader())
{
output.Write(input1.OfType<ChoDynamicObject>().Except(input2.OfType<ChoDynamicObject>(),
new ChoDynamicObjectEqualityComparer(new string[] { "id", "name" })));
}
}
static string csv1 = @"
ID,name
1,Danny
2,Fred
3,Sam";
static string csv2 = @"
ID,name
1,Danny
3,Pamela
4,Fernando";
}
输出
NEW records
ID,name
4,Fernando
DELETED records
ID,name
2,Fred
CHANGED records
ID,name
2,Fred
3,Sam
CHANGED 记录无效。另外,我需要一个状态,所以我希望它看起来像这样:
CHANGED records
ID,name,status
1,Danny,NOCHANGE
2,Fred,DELETED
3,Pamela,CHANGED
4,Fernando,NEW
谢谢
这是使用 Cinchoo ETL 的方法
string csv1 = @"ID,name
1,Danny
2,Fred
3,Sam";
string csv2 = @"ID,name
1,Danny
3,Pamela
4,Fernando";
var r1 = ChoCSVReader.LoadText(csv1).WithFirstLineHeader().ToArray();
var r2 = ChoCSVReader.LoadText(csv2).WithFirstLineHeader().ToArray();
using (var w = new ChoCSVWriter(Console.Out).WithFirstLineHeader())
{
var newItems = r2.OfType<ChoDynamicObject>().Except(r1.OfType<ChoDynamicObject>(), new ChoDynamicObjectEqualityComparer(new string[] { "ID" }))
.Select(r =>
{
var dict = r.AsDictionary();
dict["Status"] = "NEW";
return new ChoDynamicObject(dict);
}).ToArray();
var deletedItems = r1.OfType<ChoDynamicObject>().Except(r2.OfType<ChoDynamicObject>(), new ChoDynamicObjectEqualityComparer(new string[] { "ID" }))
.Select(r =>
{
var dict = r.AsDictionary();
dict["Status"] = "DELETED";
return new ChoDynamicObject(dict);
}).ToArray();
var changedItems = r2.OfType<ChoDynamicObject>().Except(r1.OfType<ChoDynamicObject>(), ChoDynamicObjectEqualityComparer.Default)
.Except(newItems.OfType<ChoDynamicObject>(), new ChoDynamicObjectEqualityComparer(new string[] { "ID" }))
.Select(r =>
{
var dict = r.AsDictionary();
dict["Status"] = "CHANGED";
return new ChoDynamicObject(dict);
}).ToArray();
var noChangeItems = r1.OfType<ChoDynamicObject>().Intersect(r2.OfType<ChoDynamicObject>(), ChoDynamicObjectEqualityComparer.Default)
.Select(r =>
{
var dict = r.AsDictionary();
dict["Status"] = "NOCHANGE";
return new ChoDynamicObject(dict);
}).ToArray();
var finalResult = Enumerable.Concat(newItems, deletedItems).Concat(changedItems).Concat(noChangeItems).OfType<dynamic>().OrderBy(r => r.ID);
w.Write(finalResult);
}
Console.WriteLine();
输出:
ID,name,Status
1,Danny,NOCHANGE
2,Fred,DELETED
3,Pamela,CHANGED
4,Fernando,NEW
样本fiddle:https://dotnetfiddle.net/mrHpFx
更新#1:
以上方法适用于小型 CSV 文件。对于大型 CSV 文件,您必须避免使用它。而是以流的方式处理它。示例 fiddle 显示了如何(未经过全面测试,但它给出了执行此操作的方向。)
样本fiddle:https://dotnetfiddle.net/mh6w44
更新#2:
现在 Cinchoo ETL (v1.2.1.33) 内置 API 以简化方式比较 CSV 文件
var r1 = ChoCSVReader.LoadText(csv1).WithFirstLineHeader().WithMaxScanRows(1).OfType<ChoDynamicObject>();
var r2 = ChoCSVReader.LoadText(csv2).WithFirstLineHeader().WithMaxScanRows(1).OfType<ChoDynamicObject>();
using (var w = new ChoCSVWriter(Console.Out).WithFirstLineHeader())
{
foreach (var t in r1.Compare(r2, "ID", "name" ))
{
dynamic v1 = t.MasterRecord as dynamic;
dynamic v2 = t.DetailRecord as dynamic;
if (t.Status == CompareStatus.Unchanged || t.Status == CompareStatus.Deleted)
{
v1.Status = t.Status.ToString();
w.Write(v1);
}
else
{
v2.Status = t.Status.ToString();
w.Write(v2);
}
}
}
样本fiddle:https://dotnetfiddle.net/uPR5Sq
我一直在玩@Cinchoo 出色的 C# ETL 系统。我需要比较两个 CSV 文件,其中一个 CSV 文件被定义为 dynamically growing master table,另一个是 feeder “细节”table.
详细信息 table 在新记录、已更改记录或主 CSV 文件中不再存在(已删除)的记录方面可能存在差异。
输出应该是第三个 table 替换或更新主文件 table - 所以它是一个不断增长的 CSV 文件。
两个 table 都有唯一的 ID 列和一个 header 行。
主 CSV
ID,name
1,Danny
2,Fred
3,Sam
详情
ID,name
1,Danny
<-- record no longer exists
3,Pamela <-- name change
4,Fernando <-- new record
到目前为止,我一直在参考这个 fiddle 和下面的代码:
using System;
using ChoETL;
using System.Linq;
public class Program
{
public static void Main()
{
var input1 = ChoCSVReader.LoadText(csv1).WithFirstLineHeader().ToArray();
var input2 = ChoCSVReader.LoadText(csv2).WithFirstLineHeader().ToArray();
Console.WriteLine("NEW records\n");
using (var output = new ChoCSVWriter(Console.Out).WithFirstLineHeader())
{
output.Write(input2.OfType<ChoDynamicObject>().Except(input1.OfType<ChoDynamicObject>(),
new ChoDynamicObjectEqualityComparer(new string[] { "id" })));
}
Console.WriteLine("\n\nDELETED records\n");
using (var output = new ChoCSVWriter(Console.Out).WithFirstLineHeader())
{
output.Write(input1.OfType<ChoDynamicObject>().Except(input2.OfType<ChoDynamicObject>(),
new ChoDynamicObjectEqualityComparer(new string[] { "id" })));
}
Console.WriteLine("\n\nCHANGED records\n");
using (var output = new ChoCSVWriter(Console.Out).WithFirstLineHeader())
{
output.Write(input1.OfType<ChoDynamicObject>().Except(input2.OfType<ChoDynamicObject>(),
new ChoDynamicObjectEqualityComparer(new string[] { "id", "name" })));
}
}
static string csv1 = @"
ID,name
1,Danny
2,Fred
3,Sam";
static string csv2 = @"
ID,name
1,Danny
3,Pamela
4,Fernando";
}
输出
NEW records
ID,name
4,Fernando
DELETED records
ID,name
2,Fred
CHANGED records
ID,name
2,Fred
3,Sam
CHANGED 记录无效。另外,我需要一个状态,所以我希望它看起来像这样:
CHANGED records
ID,name,status
1,Danny,NOCHANGE
2,Fred,DELETED
3,Pamela,CHANGED
4,Fernando,NEW
谢谢
这是使用 Cinchoo ETL 的方法
string csv1 = @"ID,name
1,Danny
2,Fred
3,Sam";
string csv2 = @"ID,name
1,Danny
3,Pamela
4,Fernando";
var r1 = ChoCSVReader.LoadText(csv1).WithFirstLineHeader().ToArray();
var r2 = ChoCSVReader.LoadText(csv2).WithFirstLineHeader().ToArray();
using (var w = new ChoCSVWriter(Console.Out).WithFirstLineHeader())
{
var newItems = r2.OfType<ChoDynamicObject>().Except(r1.OfType<ChoDynamicObject>(), new ChoDynamicObjectEqualityComparer(new string[] { "ID" }))
.Select(r =>
{
var dict = r.AsDictionary();
dict["Status"] = "NEW";
return new ChoDynamicObject(dict);
}).ToArray();
var deletedItems = r1.OfType<ChoDynamicObject>().Except(r2.OfType<ChoDynamicObject>(), new ChoDynamicObjectEqualityComparer(new string[] { "ID" }))
.Select(r =>
{
var dict = r.AsDictionary();
dict["Status"] = "DELETED";
return new ChoDynamicObject(dict);
}).ToArray();
var changedItems = r2.OfType<ChoDynamicObject>().Except(r1.OfType<ChoDynamicObject>(), ChoDynamicObjectEqualityComparer.Default)
.Except(newItems.OfType<ChoDynamicObject>(), new ChoDynamicObjectEqualityComparer(new string[] { "ID" }))
.Select(r =>
{
var dict = r.AsDictionary();
dict["Status"] = "CHANGED";
return new ChoDynamicObject(dict);
}).ToArray();
var noChangeItems = r1.OfType<ChoDynamicObject>().Intersect(r2.OfType<ChoDynamicObject>(), ChoDynamicObjectEqualityComparer.Default)
.Select(r =>
{
var dict = r.AsDictionary();
dict["Status"] = "NOCHANGE";
return new ChoDynamicObject(dict);
}).ToArray();
var finalResult = Enumerable.Concat(newItems, deletedItems).Concat(changedItems).Concat(noChangeItems).OfType<dynamic>().OrderBy(r => r.ID);
w.Write(finalResult);
}
Console.WriteLine();
输出:
ID,name,Status
1,Danny,NOCHANGE
2,Fred,DELETED
3,Pamela,CHANGED
4,Fernando,NEW
样本fiddle:https://dotnetfiddle.net/mrHpFx
更新#1:
以上方法适用于小型 CSV 文件。对于大型 CSV 文件,您必须避免使用它。而是以流的方式处理它。示例 fiddle 显示了如何(未经过全面测试,但它给出了执行此操作的方向。)
样本fiddle:https://dotnetfiddle.net/mh6w44
更新#2:
现在 Cinchoo ETL (v1.2.1.33) 内置 API 以简化方式比较 CSV 文件
var r1 = ChoCSVReader.LoadText(csv1).WithFirstLineHeader().WithMaxScanRows(1).OfType<ChoDynamicObject>();
var r2 = ChoCSVReader.LoadText(csv2).WithFirstLineHeader().WithMaxScanRows(1).OfType<ChoDynamicObject>();
using (var w = new ChoCSVWriter(Console.Out).WithFirstLineHeader())
{
foreach (var t in r1.Compare(r2, "ID", "name" ))
{
dynamic v1 = t.MasterRecord as dynamic;
dynamic v2 = t.DetailRecord as dynamic;
if (t.Status == CompareStatus.Unchanged || t.Status == CompareStatus.Deleted)
{
v1.Status = t.Status.ToString();
w.Write(v1);
}
else
{
v2.Status = t.Status.ToString();
w.Write(v2);
}
}
}
样本fiddle:https://dotnetfiddle.net/uPR5Sq