如何汇总 csv 中的置信度?

How to aggregate confidence levels in csv?

我的程序目前会遍历一个包含 PDF/图片文件的目录,并使用 Azure Computer Vision REST API 为每个文件生成一个 JSON 文件。借助下面的 JsonToCsv(),我将这些 JSON 文件中的特定元素导出到 CSV 文件中,输出如下所示:

file.csv 输出:

page,text,words,confidence
1,The quick brown fox jumps,The,0.958
1,The quick brown fox jumps,quick,0.57
1,The quick brown fox jumps,brown,0.799
1,The quick brown fox jumps,fox,0.442
1,The quick brown fox jumps,jumps,0.878
1,over,over,0.37
1,the lazy dog!,the,0.909
1,the lazy dog!,lazy,0.853
1,the lazy dog!,dog!,0.41

我想做的是把属于同一行文本的单词合并到一起,用逗号分隔,而不是每个单词单独占一行;同时对该行文本中所有单词的置信度取平均值。例如,新文件将是:

page,text,words,confidence
1,The quick brown fox jumps,The,quick,brown,fox,jumps,0.729
1,over,over,0.37
1,the lazy dog!,the,lazy,dog!,0.724

其中第一行文本的 0.729 是把该行各单词的置信度相加后再除以单词个数得到的平均值,即 (0.958+0.57+0.799+0.442+0.878)/5 ≈ 0.729。最后一行文本也做了同样的计算。

我如何更新下面的函数来完成这个?

JsonToCsv()代码:

/// <summary>
/// Flattens the <c>readResults</c> nodes of a JSON file into a CSV file,
/// emitting one row per recognised word.
/// </summary>
/// <param name="jsonFile">Path of the JSON file to read.</param>
/// <param name="csvfFile">Path of the CSV file to write; a header line is written first.</param>
private static void JsonToCsv(string jsonFile, string csvfFile)
{
    using (var reader = new ChoJSONReader(jsonFile).WithJSONPath("$..readResults"))
    using (var writer = new ChoCSVWriter(csvfFile).WithFirstLineHeader())
    {
        // One output row per (page result, line, word) combination.
        // The query is lazy; it is enumerated by Write() while both the
        // reader and the writer are still open.
        var rows =
            from result in reader
            from line in (dynamic[])result.lines
            from word in (dynamic[])line.words
            select new
            {
                result.page,
                line.text,
                words = word.text,
                word.confidence
            };

        writer.Write(rows);
    }
}

示例JSON 文件:

{
  "status": "succeeded",
  "createdDateTime": "2020-05-28T05:13:21Z",
  "lastUpdatedDateTime": "2020-05-28T05:13:22Z",
  "analyzeResult": {
    "version": "3.1.0",
    "readResults": [
      {
        "page": 1,
        "language": "en",
        "angle": 0.8551,
        "width": 2661,
        "height": 1901,
        "unit": "pixel",
        "lines": [
          {
            "boundingBox": [
              67,
              646,
              2582,
              713,
              2580,
              876,
              67,
              821
            ],
            "text": "The quick brown fox jumps",
            "words": [
              {
                "boundingBox": [
                  143,
                  650,
                  435,
                  661,
                  436,
                  823,
                  144,
                  824
                ],
                "text": "The",
                "confidence": 0.958
              },
              {
                "boundingBox": [
                  540,
                  665,
                  926,
                  679,
                  926,
                  825,
                  541,
                  823
                ],
                "text": "quick",
                "confidence": 0.57
              },
              {
                "boundingBox": [
                  1125,
                  686,
                  1569,
                  700,
                  1569,
                  838,
                  1125,
                  828
                ],
                "text": "brown",
                "confidence": 0.799
              },
              {
                "boundingBox": [
                  1674,
                  703,
                  1966,
                  711,
                  1966,
                  851,
                  1674,
                  841
                ],
                "text": "fox",
                "confidence": 0.442
              },
              {
                "boundingBox": [
                  2083,
                  714,
                  2580,
                  725,
                  2579,
                  876,
                  2083,
                  855
                ],
                "text": "jumps",
                "confidence": 0.878
              }
            ]
          },
          {
            "boundingBox": [
              187,
              1062,
              485,
              1056,
              486,
              1120,
              189,
              1126
            ],
            "text": "over",
            "words": [
              {
                "boundingBox": [
                  190,
                  1064,
                  439,
                  1059,
                  441,
                  1122,
                  192,
                  1126
                ],
                "text": "over",
                "confidence": 0.37
              }
            ]
          },
          {
            "boundingBox": [
              664,
              1008,
              1973,
              1023,
              1969,
              1178,
              664,
              1154
            ],
            "text": "the lazy dog!",
            "words": [
              {
                "boundingBox": [
                  668,
                  1008,
                  923,
                  1015,
                  923,
                  1146,
                  669,
                  1117
                ],
                "text": "the",
                "confidence": 0.909
              },
              {
                "boundingBox": [
                  1107,
                  1018,
                  1447,
                  1023,
                  1445,
                  1178,
                  1107,
                  1162
                ],
                "text": "lazy",
                "confidence": 0.853
              },
              {
                "boundingBox": [
                  1639,
                  1024,
                  1974,
                  1023,
                  1971,
                  1170,
                  1636,
                  1178
                ],
                "text": "dog!",
                "confidence": 0.41
              }
            ]
          }
        ]
      }
    ]
  }
}

使用 LINQ 可以生成预期格式的 CSV。下面的示例展示了具体做法:

// Builds the CSV in memory: one row per recognised line, with the line's
// words joined by commas and the words' confidences averaged.
StringBuilder csv = new StringBuilder();
using (var p = new ChoJSONReader("*** YOUR JSON PATH ***")
    .WithJSONPath("$..readResults")
    )
{
    using (var w = new ChoCSVWriter(csv)
        .WithFirstLineHeader()
        )
    {
        w.Write(p
            .SelectMany(r1 => ((dynamic[])r1.lines)
            .Select(r2 =>
            {
                // Cast the word list once and reuse it for both aggregates.
                var words = (dynamic[])r2.words;

                return new
                {
                    r1.page,
                    r2.text,
                    // NOTE(review): the joined value contains the CSV delimiter;
                    // confirm the writer quotes this field as expected.
                    words = String.Join(",", words.Select(s1 => s1.text)),
                    // Rounded to three decimals so the value matches the requested
                    // output (e.g. 0.729) instead of a raw floating-point average.
                    // NOTE(review): Average() throws InvalidOperationException for a
                    // line with no words; confirm such lines cannot occur.
                    confidence = Math.Round(
                        words.Select(s1 => (double)s1.confidence).Average(),
                        3)
                };
            })));
    }
}

Console.WriteLine(csv.ToString());