如何仅从 JSON 文件中提取某些元素并附加到 csv?

How to extract only certain elements from JSON file and append to csv?

我的程序当前循环遍历 pdf/image 个文件的目录并使用 Azure computer vision REST API.

生成 json 个文件

除了生成 JSON 文件外,我还想从生成的 JSON 中提取某些元素,例如 textconfidence 并将它们写入csv 文件,这样它看起来像这样:

例如所需的 csv 文件输出:

filename     page    text                  words        confidence
file1.pdf    1       the quick brown fox                0.98
file1.pdf    2       the color is great!                0.71
file2.jpeg   1       something something                0.87 

但是,这是我当前获得的示例 csv 输出(一直到 FJO 列,因此无法在此处粘贴所有列):

如何使 JsonToCsv() 函数 return 文件名、页面、文本、单词和置信度元素仅用于每个 csv 文件文件?

这里是 JSON 从以下文件之一生成的一些示例:

{
  "status": "succeeded",
  "createdDateTime": "2020-05-28T05:13:21Z",
  "lastUpdatedDateTime": "2020-05-28T05:13:22Z",
  "analyzeResult": {
    "version": "3.1.0",
    "readResults": [
      {
        "page": 1,
        "language": "en",
        "angle": 0.8551,
        "width": 2661,
        "height": 1901,
        "unit": "pixel",
        "lines": [
          {
            "boundingBox": [
              67,
              646,
              2582,
              713,
              2580,
              876,
              67,
              821
            ],
            "text": "The quick brown fox jumps",
            "words": [
              {
                "boundingBox": [
                  143,
                  650,
                  435,
                  661,
                  436,
                  823,
                  144,
                  824
                ],
                "text": "The",
                "confidence": 0.958
              },
              {
                "boundingBox": [  

完整代码:

static void Main(string[] args)
{
    foreach (var file in Directory.GetFiles(inputFilesPath))
    {
        string inputFile = Path.GetFileName(file); // Return File Name + Extension

        // Call the REST API method.
        Console.WriteLine("\nExtracting text from: " + inputFile + "\n");
        ReadText(file, jsonFilesPath).Wait();
        
    }
    
    //Output to CSV
    foreach (var jsonFile in Directory.GetFiles(jsonFilesPath))
    {
        string csvFile = Path.GetFileName(csvFilePath); // Return File Name + Extension

        // Call the REST API method.
        Console.WriteLine("\nWriting/Parsing json to: " + csvFile + " file from: " + jsonFile + "\n");
        JsonToCsv(jsonFile, csvFilePath);
    }
}

public static void JsonToCsv(string jsonInputFile, string csvFile)
{
    // Tools > Nuget Package Manager > Package Manager Console
    // Install-Package ChoETL.JSON
    using (var r = new ChoJSONReader(jsonInputFile))
    {
        using (var w = new ChoCSVWriter(csvFile).WithFirstLineHeader())
        {
            w.Write(r);
        }
    }
}

static async Task ReadText(string inputFilePath, string outputFilePath)
{
    try
    {
        HttpClient client = new HttpClient();

        // Request headers.
        client.DefaultRequestHeaders.Add(
            "Ocp-Apim-Subscription-Key", subscriptionKey);

        string url = uriBase;

        HttpResponseMessage response;
        string operationLocation;

        // Reads the contents of the specified local input
        // into a byte array.
        byte[] byteData = GetInputAsByteArray(inputFilePath);

        // Adds the byte array as an octet stream to the request body.
        using (ByteArrayContent content = new ByteArrayContent(byteData))
        {
            content.Headers.ContentType =
                new MediaTypeHeaderValue("application/octet-stream");

            response = await client.PostAsync(url, content);
        }

        if (response.IsSuccessStatusCode)
            operationLocation =
                response.Headers.GetValues("Operation-Location").FirstOrDefault();
        else
        {
            // Display the JSON error data.
            string errorString = await response.Content.ReadAsStringAsync();
            Console.WriteLine("\n\nResponse:\n{0}\n",
                JToken.Parse(errorString).ToString());
            return;
        }

        string contentString;
        int i = 0;
        do
        {
            System.Threading.Thread.Sleep(1000);
            response = await client.GetAsync(operationLocation);
            contentString = await response.Content.ReadAsStringAsync();
            ++i;
        }
        while (i < 60 && contentString.IndexOf("\"status\":\"succeeded\"") == -1);

        if (i == 60 && contentString.IndexOf("\"status\":\"succeeded\"") == -1)
        {
            Console.WriteLine("\nTimeout error.\n");
            return;
        }

        // Display the JSON response.
        Console.WriteLine("\nResponse:\n\n{0}\n",
            JToken.Parse(contentString).ToString());

        // Write JSON response to file
        string inputFileName = Path.GetFileNameWithoutExtension(inputFilePath); // Extract File Name only
        var responseData = JToken.Parse(contentString).ToString(); //Fetch Data and assign it for file output
        string jsonOutputFile = Path.Combine(outputFilePath, inputFileName + ".json");
        File.WriteAllText(jsonOutputFile, responseData);
    }
    catch (Exception e)
    {
        Console.WriteLine("\n" + e.Message);
    }
}

我猜这可能会为您指明从 JSON

生成预期 CSV 的正确方向
StringBuilder csv = new StringBuilder();
using (var p = new ChoJSONReader("*** YOUR JSON FILE ***")
    .WithJSONPath("$..readResults")
    )
{
    using (var w = new ChoCSVWriter(csv)
        .WithFirstLineHeader()
        )
    {
        w.Write(p
            .SelectMany(r1 => ((dynamic[])r1.lines).SelectMany(r2 => ((dynamic[])r2.words).Select(r3 => new
            {
                r1.page,
                r2.text,
                words = r3.text,
                r3.confidence
            }))));
    }
}

Console.WriteLine(csv.ToString());

输出:

page,text,words,confidence
1,The quick brown fox jumps,The,0.958
1,The quick brown fox jumps,quick,0.57
1,The quick brown fox jumps,brown,0.799
1,The quick brown fox jumps,fox,0.442
1,The quick brown fox jumps,jumps,0.878
1,over,over,0.37
1,the lazy dog!,the,0.909
1,the lazy dog!,lazy,0.853
1,the lazy dog!,dog!,0.41

更新:

使用:ChoETL.JSONv1.2.1.2

要生成带有空格字段名称的 CSV,请使用 WithField 为每个字段指定字段 header。

StringBuilder csv = new StringBuilder();
using (var p = new ChoJSONReader("sample43.json")
    .WithJSONPath("$..readResults")
    )
{
    using (var w = new ChoCSVWriter(csv)
        .WithField("FileName", fieldName: "File Name")
        .WithField("page")
        .WithField("text")
        .WithField("words")
        .WithField("confidence")
        .WithFirstLineHeader()
        )
    {
        w.Write(p
            .SelectMany(r1 => ((dynamic[])r1.lines).SelectMany(r2 => ((dynamic[])r2.words).Select(r3 => new
            {
                FileName = "file.json",
                r1.page,
                r2.text,
                words = r3.text,
                r3.confidence
            }))));
    }
}

Console.WriteLine(csv.ToString());

输出:

File Name,page,text,words,confidence
file.json,1,The quick brown fox jumps,The,0.958
file.json,1,The quick brown fox jumps,quick,0.57
file.json,1,The quick brown fox jumps,brown,0.799
file.json,1,The quick brown fox jumps,fox,0.442
file.json,1,The quick brown fox jumps,jumps,0.878
file.json,1,over,over,0.37
file.json,1,the lazy dog!,the,0.909
file.json,1,the lazy dog!,lazy,0.853
file.json,1,the lazy dog!,dog!,0.41