如何使用 google 云视觉对 pdf 进行 OCR？

Question

我在笔记本电脑上使用 C#.net Windows 10

我有用于对图像（png、jpg）进行 OCR 处理的代码，效果很好

我需要让 pdf 文件正常工作

但一位朋友告诉我，可以把 pdf 直接发送到 google API 进行 OCR 识别，而无需先将 pdf 转换为图像再发送图像。

这可能吗？如果可以，怎么做？

    /// <summary>
    /// Runs text detection on the given image and returns the description of
    /// the first annotation in the response.
    /// </summary>
    /// <param name="filePath">The Vision image to analyse (despite the name, this is an image object, not a path).</param>
    /// <returns>
    /// The <c>Description</c> of the first annotation, or an empty string when
    /// the response contains no annotations.
    /// </returns>
    private string GetTextFromImage(Google.Cloud.Vision.V1.Image filePath)
    {
        var response = Client.DetectText(filePath);

        // Only the first annotation's description is ever returned, so there is
        // no need to materialise every annotation (and its bounding polygon).
        // FirstOrDefault avoids the InvalidOperationException that First() would
        // throw when nothing was detected.
        var firstAnnotation = response.FirstOrDefault();
        if (firstAnnotation == null)
        {
            return string.Empty;
        }

        return firstAnnotation.Description;
    }

    /// <summary>
    /// Creates a Vision image by passing <paramref name="filePath"/> to
    /// <c>Image.FromFile</c>.
    /// </summary>
    /// <param name="filePath">Path of the image file to load.</param>
    /// <returns>The image object produced by <c>Image.FromFile</c>.</returns>
    private Google.Cloud.Vision.V1.Image GetImageFromPath(string filePath)
        => Google.Cloud.Vision.V1.Image.FromFile(filePath);

编辑

谢谢里科

但代码返回的是下面这段 JSON 响应，而不是 pdf 的文本

{ "responses": [ { "responses": [ { "fullTextAnnotation": { "pages": [ { "property": { "detectedLanguages": [ { "languageCode": "en", "confidence": 0.92 }, { "languageCode": "fil", "confidence": 0.02 }, { "languageCode": "af", "confidence": 0.01 } ] }, "width": 841, "height": 595, "blocks": [ { "property": { "detectedLanguages": [ { "languageCode": "en", "confidence": 0.33 }, { "languageCode": "fil", "confidence": 0.29 } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.587395966, "y": 0.9210084 }, { "x": 0.369797856, "y": 0.640336156 }, { "x": 0.4530321, "y": 0.5126051 }, { "x": 0.6706302, "y": 0.7932773 } ] }, "paragraphs": [ { "property": { "detectedLanguages": [ { "languageCode": "fil", "confidence": 0.47 }, { "languageCode": "en", "confidence": 0.39 } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.587395966, "y": 0.9210084 }, { "x": 0.372176, "y": 0.6386555 }, { "x": 0.416171223, "y": 0.5714286 }, { "x": 0.6313912, "y": 0.8554622 } ] }, "words": [ { "boundingBox": { "normalizedVertices": [ { "x": 0.529132, "y": 0.8436975 }, { "x": 0.4649227, "y": 0.761344552 }, { "x": 0.4803805, "y": 0.73613447 }, { "x": 0.544589758, "y": 0.8201681 } ] }, "symbols": [ { "text": "M", "confidence": 0.99 }, { "text": "e", "confidence": 0.99 }, { "text": "n", "confidence": 1 }, { "text": "g", "confidence": 0.99 }, { "text": "m", "confidence": 0.99 }, { "text": "e", "confidence": 0.99 }, { "text": "n", "confidence": 1 }, { "property": { "detectedBreak": { "type": "SPACE" } }, "text": "g", "confidence": 0.99 } ], "confidence": 0.99 }, { "boundingBox": { "normalizedVertices": [ { "x": 0.460166454, "y": 0.754621863 }, { "x": 0.445897728, "y": 0.73613447 }, { "x": 0.461355537, "y": 0.712605059 }, { "x": 0.475624263, "y": 0.731092453 } ] }, "symbols": [ { "text": "L", "confidence": 0.99 }, { "property": { "detectedBreak": { "type": "EOL_SURE_SPACE" } }, "text": "u", "confidence": 0.99 } ], "confidence": 0.99 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } 
] }, "boundingBox": { "normalizedVertices": [ { "x": 0.58501786, "y": 0.877310932 }, { "x": 0.5731272, "y": 0.8605042 }, { "x": 0.5862069, "y": 0.840336144 }, { "x": 0.5980975, "y": 0.857142866 } ] }, "symbols": [ { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "a", "confidence": 0.62 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "t", "confidence": 0.98 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ], "detectedBreak": { "type": "SPACE" } }, "text": "e", "confidence": 0.94 } ], "confidence": 0.84 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.568371, "y": 0.8537815 }, { "x": 0.549346, "y": 0.8302521 }, { "x": 0.5636147, "y": 0.8084034 }, { "x": 0.581450641, "y": 0.833613455 } ] }, "symbols": [ { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "K", "confidence": 0.92 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "e", "confidence": 0.98 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ], "detectedBreak": { "type": "SPACE" } }, "text": "y", "confidence": 0.98 } ], "confidence": 0.96 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.5457788, "y": 0.8235294 }, { "x": 0.5279429, "y": 0.8016807 }, { "x": 0.542211652, "y": 0.779831946 }, { "x": 0.560047567, "y": 0.803361356 } ] }, "symbols": [ { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "L", "confidence": 0.96 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "a", "confidence": 1 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ], "detectedBreak": { "type": "SPACE" } }, "text": "b", "co

如何获取pdf文本？

Answer 1

这是对存储在 google 云存储中的 PDF 文件进行 OCR 的代码片段。它使用批量注释（batch annotate）接口，每个请求最多处理 5 页。

编辑：代码现在使用本地文件。它会返回包含 PDF 文件内容的响应。

using System;
using System.IO;
using Google.Cloud.Vision.V1;
using Google.Protobuf;
using System.Collections.Generic;

/// <summary>
/// Sends a local PDF to the Vision API with a synchronous batch file
/// annotation request and prints the recognised text of each returned page.
/// </summary>
public class QuickStart
{
    // Placeholder path used when no command-line argument is supplied.
    private const string DefaultPdfPath = "your/complete/local/file/path/here.pdf";

    /// <summary>
    /// Entry point. Reads the PDF, requests document text detection and writes
    /// the extracted text to standard output.
    /// </summary>
    /// <param name="args">
    /// Optional: <c>args[0]</c> is the path of the PDF to process. When absent,
    /// the placeholder path is used.
    /// </param>
    public static void Main(string[] args)
    {
        string pdfPath = (args != null && args.Length > 0) ? args[0] : DefaultPdfPath;

        var client = ImageAnnotatorClient.Create();
        ByteString pdfContent = ByteString.CopyFrom(File.ReadAllBytes(pdfPath));

        var syncRequest = new AnnotateFileRequest
        {
            InputConfig = new InputConfig
            {
                Content = pdfContent,
                // Supported mime_types are: 'application/pdf' and 'image/tiff'
                MimeType = "application/pdf"
            }
        };

        syncRequest.Features.Add(new Feature
        {
            Type = Feature.Types.Type.DocumentTextDetection
        });

        var requests = new List<AnnotateFileRequest> { syncRequest };
        var response = client.BatchAnnotateFiles(requests);

        // The batch response holds one entry per submitted file, and each of
        // those holds one entry per processed page. Printing the response
        // object itself dumps the entire JSON structure; the plain text lives
        // in FullTextAnnotation.Text of each page response.
        foreach (var fileResponse in response.Responses)
        {
            foreach (var pageResponse in fileResponse.Responses)
            {
                if (pageResponse.Error != null)
                {
                    Console.Error.WriteLine(pageResponse.Error.Message);
                    continue;
                }

                var annotation = pageResponse.FullTextAnnotation;
                if (annotation != null)
                {
                    Console.WriteLine(annotation.Text);
                }
            }
        }
    }
}

如何使用 google 云视觉对 pdf 进行 OCR？

How to OCR a pdf with google cloud vision?

c#

pdf

google-cloud-vision

编辑