如果 record/row 已存在于 csv 中,则替换或不追加

Replace or dont append if record/row already exists in csv

我有以下代码循环遍历目录中的 json 个文件并创建一个包含以下记录的 csv 文件:

results.csv

File Name    Page  Practice Name
fileXYZ.json 1     XYZ & Co
fileAB2.json 1     ABC & Co
file1.json   1     Associates & Co

但是,如果我停止执行并重新运行程序,会发生相同的记录再次插入到 csv 文件中,导致:

File Name    Page  Practice Name
fileXYZ.json 1     XYZ & Co
fileAB2.json 1     ABC & Co
file1.json   1     Associates & Co
fileXYZ.json 1     XYZ & Co
fileAB2.json 1     ABC & Co
file1.json   1     Associates & Co 

我如何检查记录是否已经存在(即每个字段与插入的字段相同)并替换它(或者基本上不再追加它?)例如,如果我要 运行 程序再次出现,因为 file1.json 中发生了变化,而且目录中添加了一个新文件,新的 csv 应该如下所示:

results.csv:

File Name    Page  Practice Name
fileXYZ.json 1     XYZ & Co
fileAB2.json 1     ABC & Co
file1.json   1     Corpum & Co
file32.json  1     FirmA

代码:

using ChoETL;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

static void Main(string[] args)
{
    //Output to CSV
    foreach (var jsonFile in Directory.GetFiles(jsonFilesPath))
    {
        JsonToCsv(jsonFile, csvFilePath);
    }
}
public static string fieldValue(IEnumerable<dynamic> lines, string nameOfField, bool throwException = false)
{
    var skipped = lines.SkipWhile(l => l.text != nameOfField);

    switch (throwException)
    {
        case true:
            var enumerator = lines.GetEnumerator();
            
            while (enumerator.MoveNext())
            {
                if (skipped.Count() == 0)
                    return skipped.Skip(1).First().text;
                else
                    throw new InvalidDataException("Odd number of items found in IEnumerable<>");
            }

            break;
        case false:
            // Skip(#) to skip over the unnecessary Lines, 
            // such as "Account Information", preceding "Practice Name".
            return skipped.Skip(1).First().text;

            break;
        default:
            Console.WriteLine("Default case");
            break;
    }
    // Returning null isn't recommended, but it does fix the error "not all code paths return a value"
    return null;
}
public static void JsonToCsv(string jsonInputFile, string csvFile)
{
    using (var p = new ChoJSONReader(jsonInputFile).WithJSONPath("$..readResults"))
    {
        using (var fs = new FileStream(csvFile, FileMode.Append, FileAccess.Write))
        {
             using (var writer = new ChoCSVWriter(fs))
             {
                writer.WithField("FileName", fieldName: "File Name")
                    .WithField("Page")
                    .WithField("PracticeName", fieldName: "Practice Name");

                if (fs.Position == 0) // we don't need header if file already existed before
                {
                    writer.WithFirstLineHeader();
                }
                
                // Limit the result to page 1 since the fields below only exist on the 1st page
                writer.Write(p
                    .Where(r1 => r1.page == 1)
                    .Select(r1 =>
                    {
                        var lines = (dynamic[])r1.lines;
                        return new
                        {
                            FileName = jsonInputFile,
                            Page = r1.page,
                            PracticeName = //lines[6].text,
                                fieldValue(lines, "Practice Name"),
                        };
                    }
                ));

                }

                fs.Write(Environment.NewLine); // append new line carrier so we don't write to the same line when file reopened for writing
        }
    }
}

样本JSON文件

{
  "status": "succeeded",
  "createdDateTime": "2020-10-30T15:56:11Z",
  "lastUpdatedDateTime": "2020-10-30T15:56:12Z",
  "analyzeResult": {
    "version": "3.0.0",
    "readResults": [
      {
        "page": 1,
        "angle": 0.086,
        "width": 684,
        "height": 272,
        "unit": "pixel",
        "lines": [
          {
            "boundingBox": [
              7,
              6,
              196,
              5,
              196,
              24,
              7,
              25
            ],
            "text": "Account Information",
            "words": [
              {
                "boundingBox": [
                  10,
                  7,
                  83,
                  7,
                  81,
                  24,
                  7,
                  26
                ],
                "text": "Account",
                "confidence": 0.981
              },
              {
                "boundingBox": [
                  87,
                  7,
                  196,
                  6,
                  196,
                  24,
                  85,
                  24
                ],
                "text": "Information",
                "confidence": 0.939
              }
            ]
          },
          {
            "boundingBox": [
              120,
              56,
              223,
              57,
              223,
              70,
              120,
              70
            ],
            "text": "Practice Name",
            "words": [
              {
                "boundingBox": [
                  120,
                  57,
                  176,
                  57,
                  176,
                  70,
                  120,
                  71
                ],
                "text": "Practice",
                "confidence": 0.982
              },
              {
                "boundingBox": [
                  179,
                  57,
                  222,
                  57,
                  222,
                  71,
                  179,
                  70
                ],
                "text": "Name",
                "confidence": 0.985
              }
            ]
          },
          {
            "boundingBox": [
              236,
              62,
              390,
              62,
              390,
              77,
              236,
              77
            ],
            "text": "Some Practice Name",
            "words": [
              {
                "boundingBox": [
                  236,
                  62,
                  277,
                  62,
                  277,
                  78,
                  236,
                  78
                ],
                "text": "Some",
                "confidence": 0.987
              },
              {
                "boundingBox": [
                  280,
                  62,
                  340,
                  62,
                  341,
                  78,
                  280,
                  77
                ],
                "text": "Practice",
                "confidence": 0.984
              },
              {
                "boundingBox": [
                  343,
                  62,
                  390,
                  62,
                  390,
                  78,
                  344,
                  78
                ],
                "text": "Name",
                "confidence": 0.987
              }
            ]
          },
          {
            "boundingBox": [
              107,
              102,
              223,
              102,
              223,
              115,
              107,
              115
            ],
            "text": "Owner Full Name",
            "words": [
              {
                "boundingBox": [
                  108,
                  103,
                  151,
                  102,
                  151,
                  116,
                  107,
                  116
                ],
                "text": "Owner",
                "confidence": 0.985
              },
              {
                "boundingBox": [
                  154,
                  102,
                  177,
                  102,
                  176,
                  116,
                  153,
                  116
                ],
                "text": "Full",
                "confidence": 0.954
              },
              {
                "boundingBox": [
                  180,
                  102,
                  224,
                  103,
                  223,
                  116,
                  179,
                  116
                ],
                "text": "Name",
                "confidence": 0.987
              }
            ]
          },
          {
            "boundingBox": [
              237,
              104,
              298,
              104,
              298,
              119,
              237,
              119
            ],
            "text": "Bob Lee",
            "words": [
              {
                "boundingBox": [
                  238,
                  104,
                  266,
                  104,
                  266,
                  119,
                  238,
                  120
                ],
                "text": "Bob",
                "confidence": 0.987
              },
              {
                "boundingBox": [
                  269,
                  104,
                  298,
                  105,
                  298,
                  120,
                  269,
                  119
                ],
                "text": "Lee",
                "confidence": 0.987
              }
            ]
          },
          {
            "boundingBox": [
              136,
              147,
              223,
              147,
              223,
              160,
              137,
              161
            ],
            "text": "Owner Email",
            "words": [
              {
                "boundingBox": [
                  137,
                  148,
                  181,
                  147,
                  181,
                  161,
                  137,
                  162
                ],
                "text": "Owner",
                "confidence": 0.985
              },
              {
                "boundingBox": [
                  184,
                  147,
                  224,
                  147,
                  224,
                  161,
                  184,
                  161
                ],
                "text": "Email",
                "confidence": 0.985
              }
            ]
          },
          {
            "boundingBox": [
              239,
              144,
              361,
              144,
              361,
              162,
              239,
              162
            ],
            "text": "bob@gmail.com",
            "words": [
              {
                "boundingBox": [
                  240,
                  145,
                  362,
                  146,
                  361,
                  163,
                  240,
                  163
                ],
                "text": "bob@gmail.com",
                "confidence": 0.974
              }
            ]
          },
          {
            "boundingBox": [
              137,
              193,
              224,
              193,
              224,
              208,
              137,
              208
            ],
            "text": "Server Setup",
            "words": [
              {
                "boundingBox": [
                  137,
                  194,
                  179,
                  194,
                  179,
                  208,
                  137,
                  208
                ],
                "text": "Server",
                "confidence": 0.985
              },
              {
                "boundingBox": [
                  182,
                  194,
                  224,
                  194,
                  224,
                  209,
                  182,
                  208
                ],
                "text": "Setup",
                "confidence": 0.985
              }
            ]
          },
          {
            "boundingBox": [
              276,
              188,
              340,
              192,
              339,
              211,
              275,
              209
            ],
            "text": "cloud",
            "words": [
              {
                "boundingBox": [
                  297,
                  192,
                  339,
                  194,
                  339,
                  211,
                  297,
                  211
                ],
                "text": "cloud",
                "confidence": 0.933
              }
            ]
          },
          {
            "boundingBox": [
              376,
              187,
              461,
              191,
              460,
              212,
              376,
              211
            ],
            "text": "Location",
            "words": [
              {
                "boundingBox": [
                  394,
                  191,
                  460,
                  196,
                  459,
                  211,
                  394,
                  211
                ],
                "text": "Location",
                "confidence": 0.844
              }
            ]
          },
          {
            "boundingBox": [
              500,
              189,
              666,
              192,
              665,
              212,
              499,
              211
            ],
            "text": "LIcentral (multi-location)",
            "words": [
              {
                "boundingBox": [
                  501,
                  190,
                  567,
                  195,
                  567,
                  212,
                  500,
                  212
                ],
                "text": "LIcentral",
                "confidence": 0.665
              },
              {
                "boundingBox": [
                  572,
                  195,
                  665,
                  195,
                  665,
                  212,
                  571,
                  212
                ],
                "text": "(multi-location)",
                "confidence": 0.899
              }
            ]
          },
          {
            "boundingBox": [
              21,
              238,
              224,
              238,
              223,
              255,
              21,
              253
            ],
            "text": "Number of Locations Enrolling",
            "words": [
              {
                "boundingBox": [
                  21,
                  239,
                  76,
                  239,
                  76,
                  253,
                  21,
                  253
                ],
                "text": "Number",
                "confidence": 0.985
              },
              {
                "boundingBox": [
                  79,
                  239,
                  92,
                  239,
                  92,
                  253,
                  79,
                  253
                ],
                "text": "of",
                "confidence": 0.983
              },
              {
                "boundingBox": [
                  95,
                  239,
                  161,
                  239,
                  161,
                  254,
                  95,
                  253
                ],
                "text": "Locations",
                "confidence": 0.981
              },
              {
                "boundingBox": [
                  164,
                  239,
                  224,
                  239,
                  223,
                  256,
                  163,
                  254
                ],
                "text": "Enrolling",
                "confidence": 0.983
              }
            ]
          },
          {
            "boundingBox": [
              273,
              237,
              289,
              239,
              288,
              257,
              272,
              255
            ],
            "text": "1",
            "words": [
              {
                "boundingBox": [
                  278,
                  237,
                  290,
                  239,
                  287,
                  257,
                  276,
                  255
                ],
                "text": "1",
                "confidence": 0.981
              }
            ]
          },
          {
            "boundingBox": [
              337,
              239,
              670,
              239,
              670,
              253,
              337,
              252
            ],
            "text": "*If more than 1 location, add info on the locations form",
            "words": [
              {
                "boundingBox": [
                  338,
                  239,
                  347,
                  239,
                  347,
                  252,
                  338,
                  252
                ],
                "text": "*If",
                "confidence": 0.874
              },
              {
                "boundingBox": [
                  350,
                  239,
                  384,
                  239,
                  384,
                  253,
                  350,
                  252
                ],
                "text": "more",
                "confidence": 0.983
              },
              {
                "boundingBox": [
                  386,
                  239,
                  416,
                  239,
                  416,
                  253,
                  386,
                  253
                ],
                "text": "than",
                "confidence": 0.986
              },
              {
                "boundingBox": [
                  419,
                  239,
                  422,
                  239,
                  422,
                  253,
                  419,
                  253
                ],
                "text": "1",
                "confidence": 0.635
              },
              {
                "boundingBox": [
                  425,
                  239,
                  478,
                  239,
                  478,
                  253,
                  425,
                  253
                ],
                "text": "location,",
                "confidence": 0.955
              },
              {
                "boundingBox": [
                  481,
                  239,
                  506,
                  239,
                  506,
                  253,
                  481,
                  253
                ],
                "text": "add",
                "confidence": 0.986
              },
              {
                "boundingBox": [
                  509,
                  239,
                  533,
                  239,
                  533,
                  253,
                  509,
                  253
                ],
                "text": "info",
                "confidence": 0.981
              },
              {
                "boundingBox": [
                  535,
                  239,
                  551,
                  239,
                  552,
                  253,
                  535,
                  253
                ],
                "text": "on",
                "confidence": 0.988
              },
              {
                "boundingBox": [
                  554,
                  239,
                  574,
                  239,
                  575,
                  253,
                  554,
                  253
                ],
                "text": "the",
                "confidence": 0.987
              },
              {
                "boundingBox": [
                  577,
                  239,
                  634,
                  239,
                  634,
                  253,
                  577,
                  253
                ],
                "text": "locations",
                "confidence": 0.973
              },
              {
                "boundingBox": [
                  636,
                  239,
                  666,
                  240,
                  666,
                  253,
                  637,
                  253
                ],
                "text": "form",
                "confidence": 0.986
              }
            ]
          }
        ]
      }
    ]
  }
}

添加 Supun De Silva 答案的 spproach 2 后的 csv 屏幕截图:

仅供参考。您提供的示例文件不起作用,因为它在 var lines = (dynamic[])r1.lines;

失败

方法 1 - 重命名旧文件并创建一个新文件用于数据附加

1.引入新函数

 private static void RenameIfExist(string csvFilePath)
 {
    if (File.Exists(csvFilePath))
    {
        System.IO.File.Move(csvFilePath, $"{csvFilePath}_{DateTime.Now.ToString("backup_yyyyMMdd_HHmmss")}");
    }
}

2。调用移动函数并在新文件

中使用Create模式
public static void JsonToCsv(string jsonInputFile, string csvFile)
{
    using (var p = new ChoJSONReader(jsonInputFile).WithJSONPath("$..readResults"))
    {
        Program.RenameIfExist(csvFile);

        using (var fs = new FileStream(csvFile, FileMode.Create, FileAccess.Write))
        {
            try
            {
                using (ChoCSVWriter<dynamic>  writer = new ChoCSVWriter(fs)
                    .WithField("FileName", fieldName: "File Name")
                    .WithField("Page")
                    .WithField("PracticeName", fieldName: "Practice Name")
                    .WithFirstLineHeader())
                {
                    // Limit the result to page 1 since the fields below only exist on the 1st page
                    writer.Write(p
                        .Where(r1 => r1.page == 1)
                        .Select(r1 =>
                        {
                            var lines = (dynamic[])r1.lines;
                            return new
                            {
                                FileName = jsonInputFile,
                                Page = r1.page,
                                PracticeName = fieldValue(lines, "Practice Name"),
                            };
                        }
                    ));
                }

            }
            catch(Exception e)
            {
                throw e;
            }
        }
    }
}

方法 2 - 打开现有文件并使用数据创建查找结构

  • 您可能需要稍微调整一下

1.声明新结构来存储密钥

private static Dictionary<string, bool> processedfileStates = new Dictionary<string, bool>();

2。预加载器功能

private static void LoadOldStatsIfExist(string csvFilePath)
{
    if (File.Exists(csvFilePath))
    {
        using (var fs = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read))
        {
            using (ChoCSVReader<dynamic> reader = new ChoCSVReader(fs).WithFirstLineHeader())
            {
                using (var dataReader = reader.AsDataReader())
                {
                    while (dataReader.Read())
                    {
                        Program.processedfileStates.Add($"{dataReader[0].ToString()}_{dataReader[1].ToString()}_{dataReader[2].ToString()}", true);
                    }
                }
            }
        }
            
    }
}

3。 Json 到 CSV fcn

public static void JsonToCsv(string jsonInputFile, string csvFile)
    {
        using (var p = new ChoJSONReader(jsonInputFile).WithJSONPath("$..readResults"))
        {
            Program.LoadOldStatsIfExist(csvFile);

            using (var fs = new FileStream(csvFile, Program.processedfileStates.Count == 0 ? FileMode.Create : FileMode.Append, FileAccess.Write))
            {
                if (Program.processedfileStates.Count != 0)
                {
                    fs.Write(Environment.NewLine);
                }
                
                try
                {
                    ChoCSVWriter<dynamic> writer = new ChoCSVWriter(fs);
                    if (Program.processedfileStates.Count == 0)
                    {
                        writer.WithFirstLineHeader();
                    }

                    using (writer
                        .WithField("FileName", fieldName: "File Name")
                        .WithField("Page")
                        .WithField("PracticeName", fieldName: "Practice Name")
                        )
                    {

                        if (Program.processedfileStates.Count == 0)
                        {
                            writer = writer.WithFirstLineHeader();
                        }

                        // Limit the result to page 1 since the fields below only exist on the 1st page
                        var data = p
                            .Where(r1 => r1.page == 1)

                            .Select(r1 =>
                            {
                                var lines = (dynamic[])r1.lines;
                                return new
                                {
                                    FileName = jsonInputFile,
                                    Page = r1.page,
                                    PracticeName = fieldValue(lines, "Practice Name"),
                                };
                            }
                        ).Where(de => !processedfileStates.ContainsKey($"{de.FileName.ToString()}_{de.Page.ToString()}_{de.PracticeName.ToString()}"));

                        writer.Write(data);
                    }

                }
                catch (Exception e)
                {
                    throw e;
                }
            }
        }
    }

方法 2 - 重构

 public class OPModel
{
    public string FileName { get; set; }
    public long Page { get; set; }
    public string PracticeName { get; set; }
}

public class Program
{
    const string jsonFilesPath = "D:\DevWork\C#\TempProject1\ConsoleApp1\data";
    const string csvFilePath = "D:\DevWork\C#\TempProject1\ConsoleApp1\output\op.csv";

    private static Dictionary<string, bool> processedfileStates = new Dictionary<string, bool>();
    private static bool fileExisted = false;
    private static void RenameIfExist(string csvFilePath)
    {
        if (File.Exists(csvFilePath))
        {
            System.IO.File.Move(csvFilePath, $"{csvFilePath}_{DateTime.Now.ToString("backup_yyyyMMdd_HHmmss")}");
        }
    }
    private static void LoadOldStatsIfExist(string csvFilePath)
    {
        if (File.Exists(csvFilePath))
        {
            using (var fs = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read))
            {
                using (ChoCSVReader<dynamic> reader = new ChoCSVReader(fs).WithFirstLineHeader())
                {
                    using (var dTable = reader.AsDataTable())
                    {
                        foreach (DataRow row in dTable.Rows)
                        {
                            Program.processedfileStates.Add($"{row["File Name"].ToString()}_{row["Page"].ToString()}_{row["Practice Name"].ToString()}", true);
                        }
                    }
                }
            }
        }
    }
    public static void Main(string[] args)
    {
        try
        {
            Program.fileExisted = File.Exists(csvFilePath);
            Program.LoadOldStatsIfExist(csvFilePath);

            List<OPModel> dataToWrite = new List<OPModel>();
            // Persist each file to
            foreach (var jsonFile in Directory.GetFiles(jsonFilesPath))
            {
                dataToWrite.AddRange(JsonToCsv(jsonFile));
            }

            if (dataToWrite.Count != 0)
            {
                using (var fs = new FileStream(csvFilePath, !Program.fileExisted ? FileMode.Create : FileMode.Append, FileAccess.Write))
                {
                    try
                    {
                        ChoCSVWriter<OPModel> writer = new ChoCSVWriter<OPModel>(fs);
                        using (writer.WithField("FileName", fieldName: "File Name").WithField("Page").WithField("PracticeName", fieldName: "Practice Name"))
                        {
                            if (!Program.fileExisted)
                            {
                                writer = writer.WithFirstLineHeader();
                            }

                            writer.Write(dataToWrite);
                        }

                        fs.Write(Environment.NewLine);

                    }
                    catch (Exception e)
                    {
                        throw e;
                    }
                }
            }
            

            //Output to CSV
            
            Console.ReadKey();
        }
        catch (Exception ex)
        {

        }
    }
    public static string fieldValue(IEnumerable<dynamic> lines, string nameOfField, bool throwException = false)
    {
        var skipped = lines.SkipWhile(l => l.text != nameOfField);

        switch (throwException)
        {
            case true:
                var enumerator = lines.GetEnumerator();

                while (enumerator.MoveNext())
                {
                    if (enumerator.MoveNext())
                        return skipped.Skip(1).First().text;
                    else
                        throw new InvalidDataException("Odd number of items found in IEnumerable<>");
                }

                break;
            case false:
                // Skip(#) to skip over the unnecessary Lines, 
                // such as "Account Information", preceding "Practice Name".
                return skipped.Skip(1).First().text;
            default:
                Console.WriteLine("Default case");
                break;
        }
        // Returning null isn't recommended, but it does fix the error "not all code paths return a value"
        return null;
    }


    public static List<OPModel> JsonToCsv(string jsonInputFile)
    {
        using (var reader = new ChoJSONReader(jsonInputFile).WithJSONPath("$..readResults"))
        {
            var data = reader.Where(r1 => r1.page == 1)
                            .Select(r1 =>
                            {
                                var lines = (dynamic[])r1.lines;
                                return new OPModel
                                {
                                    FileName = jsonInputFile,
                                    Page = r1.page,
                                    PracticeName = Program.fieldValue(lines, "Practice Name")
                                };
                            }
                        ).Where(de => !processedfileStates.ContainsKey($"{de.FileName.ToString()}_{de.Page.ToString()}_{de.PracticeName.ToString()}")).ToList();

            return data;
        }
    }
}