How to aggregate confidence levels in csv?

102 Views Asked by At

My program currently loops through a directory of PDF/image files and generates JSON files using the Azure Computer Vision REST API. Using the JsonToCsv() function below, I export specific JSON elements from those files into a CSV file, so that the output looks like this:

file.csv output:

page,text,words,confidence
1,The quick brown fox jumps,The,0.958
1,The quick brown fox jumps,quick,0.57
1,The quick brown fox jumps,brown,0.799
1,The quick brown fox jumps,fox,0.442
1,The quick brown fox jumps,jumps,0.878
1,over,over,0.37
1,the lazy dog!,the,0.909
1,the lazy dog!,lazy,0.853
1,the lazy dog!,dog!,0.41

What I'd like to do is consolidate the words so that they are delimited by commas on a single row instead of occupying separate rows, and average the confidence levels across all the words of each text line. For example, the new file would look like:

page,text,words,confidence
1,The quick brown fox jumps,The,quick,brown,fox,jumps,0.729
1,over,over,0.37
1,the lazy dog!,the,lazy,dog!,0.724

Here, 0.729 in the first row is the average of the relevant confidence levels, i.e. their sum divided by the number of words: (0.958+0.57+0.799+0.442+0.878)/5 = 0.7294 ≈ 0.729. The same operation is performed on the last row as well.

How do I update the function below to accomplish this?

JsonToCsv() code:

/// <summary>
/// Flattens the <c>readResults</c> nodes of a JSON file into a CSV file with a
/// header row, emitting one row per recognized word.
/// </summary>
/// <param name="jsonFile">JSON file to read.</param>
/// <param name="csvfFile">CSV file to write.</param>
private static void JsonToCsv(string jsonFile, string csvfFile)
{
    using (var reader = new ChoJSONReader(jsonFile).WithJSONPath("$..readResults"))
    using (var writer = new ChoCSVWriter(csvfFile).WithFirstLineHeader())
    {
        // page -> lines -> words, projected to one flat record per word.
        // Column names come from the projected member names: page, text, words, confidence.
        var rows =
            from pageResult in reader
            from line in (dynamic[])pageResult.lines
            from word in (dynamic[])line.words
            select new
            {
                pageResult.page,
                line.text,
                words = word.text,
                word.confidence
            };

        // The query is lazy; it is enumerated here while the reader is still open.
        writer.Write(rows);
    }
}

sample JSON file:

{
  "status": "succeeded",
  "createdDateTime": "2020-05-28T05:13:21Z",
  "lastUpdatedDateTime": "2020-05-28T05:13:22Z",
  "analyzeResult": {
    "version": "3.1.0",
    "readResults": [
      {
        "page": 1,
        "language": "en",
        "angle": 0.8551,
        "width": 2661,
        "height": 1901,
        "unit": "pixel",
        "lines": [
          {
            "boundingBox": [
              67,
              646,
              2582,
              713,
              2580,
              876,
              67,
              821
            ],
            "text": "The quick brown fox jumps",
            "words": [
              {
                "boundingBox": [
                  143,
                  650,
                  435,
                  661,
                  436,
                  823,
                  144,
                  824
                ],
                "text": "The",
                "confidence": 0.958
              },
              {
                "boundingBox": [
                  540,
                  665,
                  926,
                  679,
                  926,
                  825,
                  541,
                  823
                ],
                "text": "quick",
                "confidence": 0.57
              },
              {
                "boundingBox": [
                  1125,
                  686,
                  1569,
                  700,
                  1569,
                  838,
                  1125,
                  828
                ],
                "text": "brown",
                "confidence": 0.799
              },
              {
                "boundingBox": [
                  1674,
                  703,
                  1966,
                  711,
                  1966,
                  851,
                  1674,
                  841
                ],
                "text": "fox",
                "confidence": 0.442
              },
              {
                "boundingBox": [
                  2083,
                  714,
                  2580,
                  725,
                  2579,
                  876,
                  2083,
                  855
                ],
                "text": "jumps",
                "confidence": 0.878
              }
            ]
          },
          {
            "boundingBox": [
              187,
              1062,
              485,
              1056,
              486,
              1120,
              189,
              1126
            ],
            "text": "over",
            "words": [
              {
                "boundingBox": [
                  190,
                  1064,
                  439,
                  1059,
                  441,
                  1122,
                  192,
                  1126
                ],
                "text": "over",
                "confidence": 0.37
              }
            ]
          },
          {
            "boundingBox": [
              664,
              1008,
              1973,
              1023,
              1969,
              1178,
              664,
              1154
            ],
            "text": "the lazy dog!",
            "words": [
              {
                "boundingBox": [
                  668,
                  1008,
                  923,
                  1015,
                  923,
                  1146,
                  669,
                  1117
                ],
                "text": "the",
                "confidence": 0.909
              },
              {
                "boundingBox": [
                  1107,
                  1018,
                  1447,
                  1023,
                  1445,
                  1178,
                  1107,
                  1162
                ],
                "text": "lazy",
                "confidence": 0.853
              },
              {
                "boundingBox": [
                  1639,
                  1024,
                  1974,
                  1023,
                  1971,
                  1170,
                  1636,
                  1178
                ],
                "text": "dog!",
                "confidence": 0.41
              }
            ]
          }
        ]
      }
    ]
  }
}
1

There is 1 best solution below

0
On BEST ANSWER

Using LINQ, you can produce the CSV in the expected format. The sample below shows how:

// Builds one CSV row per text line found under "readResults": the page number,
// the line text, the line's words joined by commas, and the average word
// confidence rounded to three decimals (e.g. 0.7294 -> 0.729).
StringBuilder csv = new StringBuilder();
using (var p = new ChoJSONReader("*** YOUR JSON PATH ***")
    .WithJSONPath("$..readResults")
    )
{
    using (var w = new ChoCSVWriter(csv)
        .WithFirstLineHeader()
        )
    {
        w.Write(p
            .SelectMany(r1 => ((dynamic[])r1.lines)
            .Select(r2 =>
            {
                // Cast the word list once; it feeds both the joined text and the average.
                var words = (dynamic[])r2.words;

                return new
                {
                    r1.page,
                    r2.text,
                    words = String.Join(",", words.Select(s1 => s1.text)),
                    // Average() throws on an empty sequence, so a line without words
                    // is written with confidence 0 instead of aborting the export.
                    // Rounding keeps the column at three decimals and hides
                    // floating-point noise from the division.
                    confidence = words.Length == 0
                        ? 0d
                        : Math.Round(words.Select(s1 => (double)s1.confidence).Average(), 3)
                };
            })));
    }
}

// The writer must be disposed (flushed) before the buffer is read.
Console.WriteLine(csv.ToString());