如何使用 google 云视觉对 pdf 进行 OCR?
How to OCR a pdf with google cloud vision?
我在一台 Windows 10 笔记本电脑上使用 C#(.NET)。
我有用于对图像(png、jpg)进行 OCR 处理的代码,效果很好
我需要让 pdf 文件正常工作
但一位朋友告诉我,PDF 可以直接发送到 Google API 进行 OCR,而无需先将 PDF 转换为图像再发送图像。
这可能吗?如果可以,怎么做?
/// <summary>
/// Runs text detection on the given image through <c>Client.DetectText</c> and
/// returns the description of the first annotation in the response.
/// </summary>
/// <param name="filePath">
/// The image to analyse. Despite its name, this is an already-loaded
/// <see cref="Google.Cloud.Vision.V1.Image"/>, not a file-system path.
/// </param>
/// <returns>
/// The description of the first annotation, or an empty string when the
/// response contains no annotations (e.g. no text was found in the image).
/// </returns>
private string GetTextFromImage(Google.Cloud.Vision.V1.Image filePath)
{
    var response = Client.DetectText(filePath);

    // The previous implementation copied every annotation into a list and then
    // called First(), which throws InvalidOperationException on an empty
    // response. Only the first annotation is ever used, so read it directly and
    // fall back to an empty string.
    // NOTE(review): the Vision docs describe the first text annotation as the
    // full detected text; confirm against the API reference.
    var firstAnnotation = response.FirstOrDefault();
    return firstAnnotation == null ? string.Empty : firstAnnotation.Description;
}
/// <summary>
/// Loads an image from disk into a Vision API image object.
/// </summary>
/// <param name="filePath">Path of the image file passed to <c>Image.FromFile</c>.</param>
/// <returns>The image produced by <c>Image.FromFile</c>.</returns>
private Google.Cloud.Vision.V1.Image GetImageFromPath(string filePath)
{
    var loadedImage = Google.Cloud.Vision.V1.Image.FromFile(filePath);
    return loadedImage;
}
编辑
谢谢里科
但代码返回的是下面这段 JSON,而不是 PDF 的文本:
{ "responses": [ { "responses": [ { "fullTextAnnotation": { "pages": [ { "property": { "detectedLanguages": [ { "languageCode": "en", "confidence": 0.92 }, { "languageCode": "fil", "confidence": 0.02 }, { "languageCode": "af", "confidence": 0.01 } ] }, "width": 841, "height": 595, "blocks": [ { "property": { "detectedLanguages": [ { "languageCode": "en", "confidence": 0.33 }, { "languageCode": "fil", "confidence": 0.29 } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.587395966, "y": 0.9210084 }, { "x": 0.369797856, "y": 0.640336156 }, { "x": 0.4530321, "y": 0.5126051 }, { "x": 0.6706302, "y": 0.7932773 } ] }, "paragraphs": [ { "property": { "detectedLanguages": [ { "languageCode": "fil", "confidence": 0.47 }, { "languageCode": "en", "confidence": 0.39 } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.587395966, "y": 0.9210084 }, { "x": 0.372176, "y": 0.6386555 }, { "x": 0.416171223, "y": 0.5714286 }, { "x": 0.6313912, "y": 0.8554622 } ] }, "words": [ { "boundingBox": { "normalizedVertices": [ { "x": 0.529132, "y": 0.8436975 }, { "x": 0.4649227, "y": 0.761344552 }, { "x": 0.4803805, "y": 0.73613447 }, { "x": 0.544589758, "y": 0.8201681 } ] }, "symbols": [ { "text": "M", "confidence": 0.99 }, { "text": "e", "confidence": 0.99 }, { "text": "n", "confidence": 1 }, { "text": "g", "confidence": 0.99 }, { "text": "m", "confidence": 0.99 }, { "text": "e", "confidence": 0.99 }, { "text": "n", "confidence": 1 }, { "property": { "detectedBreak": { "type": "SPACE" } }, "text": "g", "confidence": 0.99 } ], "confidence": 0.99 }, { "boundingBox": { "normalizedVertices": [ { "x": 0.460166454, "y": 0.754621863 }, { "x": 0.445897728, "y": 0.73613447 }, { "x": 0.461355537, "y": 0.712605059 }, { "x": 0.475624263, "y": 0.731092453 } ] }, "symbols": [ { "text": "L", "confidence": 0.99 }, { "property": { "detectedBreak": { "type": "EOL_SURE_SPACE" } }, "text": "u", "confidence": 0.99 } ], "confidence": 0.99 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } 
] }, "boundingBox": { "normalizedVertices": [ { "x": 0.58501786, "y": 0.877310932 }, { "x": 0.5731272, "y": 0.8605042 }, { "x": 0.5862069, "y": 0.840336144 }, { "x": 0.5980975, "y": 0.857142866 } ] }, "symbols": [ { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "a", "confidence": 0.62 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "t", "confidence": 0.98 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ], "detectedBreak": { "type": "SPACE" } }, "text": "e", "confidence": 0.94 } ], "confidence": 0.84 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.568371, "y": 0.8537815 }, { "x": 0.549346, "y": 0.8302521 }, { "x": 0.5636147, "y": 0.8084034 }, { "x": 0.581450641, "y": 0.833613455 } ] }, "symbols": [ { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "K", "confidence": 0.92 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "e", "confidence": 0.98 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ], "detectedBreak": { "type": "SPACE" } }, "text": "y", "confidence": 0.98 } ], "confidence": 0.96 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.5457788, "y": 0.8235294 }, { "x": 0.5279429, "y": 0.8016807 }, { "x": 0.542211652, "y": 0.779831946 }, { "x": 0.560047567, "y": 0.803361356 } ] }, "symbols": [ { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "L", "confidence": 0.96 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "a", "confidence": 1 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ], "detectedBreak": { "type": "SPACE" } }, "text": "b", "co
如何获取pdf文本?
这是对存储在 Google 云存储中的 PDF 文件进行 OCR 的代码片段。它使用 batch annotate(批量注释),每个请求最多处理 5 页。
编辑:代码现在使用本地文件。它会返回包含 PDF 文件内容的响应。
using System;
using System.IO;
using Google.Cloud.Vision.V1;
using Google.Protobuf;
using System.Collections.Generic;
/// <summary>
/// Console sample that sends a local PDF to the Cloud Vision file-annotation
/// API with document text detection and prints the recognised text.
/// </summary>
public class QuickStart
{
    /// <summary>
    /// Reads the PDF from disk, submits it in a single batch request and writes
    /// the text of every annotated page to standard output.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    public static void Main(string[] args)
    {
        var client = ImageAnnotatorClient.Create();

        // The file content is sent inline, so no Cloud Storage upload is needed.
        byte[] pdfBytes = File.ReadAllBytes("your/complete/local/file/path/here.pdf");

        var syncRequest = new AnnotateFileRequest
        {
            InputConfig = new InputConfig
            {
                Content = ByteString.CopyFrom(pdfBytes),
                // Supported mime_types are: 'application/pdf' and 'image/tiff'
                MimeType = "application/pdf"
            }
        };
        syncRequest.Features.Add(new Feature
        {
            Type = Feature.Types.Type.DocumentTextDetection
        });

        var requests = new List<AnnotateFileRequest> { syncRequest };
        var response = client.BatchAnnotateFiles(requests);

        // Printing the response object itself dumps the whole annotation tree as
        // JSON (pages, blocks, symbols, bounding boxes). The plain text lives in
        // FullTextAnnotation.Text of each per-page response, nested one level
        // below each per-file response.
        foreach (var fileResponse in response.Responses)
        {
            foreach (var pageResponse in fileResponse.Responses)
            {
                var annotation = pageResponse.FullTextAnnotation;

                // A page without any detected text carries no annotation.
                if (annotation != null)
                {
                    Console.WriteLine(annotation.Text);
                }
            }
        }
    }
}
我在一台 Windows 10 笔记本电脑上使用 C#(.NET)。
我有用于对图像(png、jpg)进行 OCR 处理的代码,效果很好
我需要让 pdf 文件正常工作
但一位朋友告诉我,PDF 可以直接发送到 Google API 进行 OCR,而无需先将 PDF 转换为图像再发送图像。
这可能吗?如果可以,怎么做?
/// <summary>
/// Runs text detection on the given image through <c>Client.DetectText</c> and
/// returns the description of the first annotation in the response.
/// </summary>
/// <param name="filePath">
/// The image to analyse. Despite its name, this is an already-loaded
/// <see cref="Google.Cloud.Vision.V1.Image"/>, not a file-system path.
/// </param>
/// <returns>
/// The description of the first annotation, or an empty string when the
/// response contains no annotations (e.g. no text was found in the image).
/// </returns>
private string GetTextFromImage(Google.Cloud.Vision.V1.Image filePath)
{
    var response = Client.DetectText(filePath);

    // The previous implementation copied every annotation into a list and then
    // called First(), which throws InvalidOperationException on an empty
    // response. Only the first annotation is ever used, so read it directly and
    // fall back to an empty string.
    // NOTE(review): the Vision docs describe the first text annotation as the
    // full detected text; confirm against the API reference.
    var firstAnnotation = response.FirstOrDefault();
    return firstAnnotation == null ? string.Empty : firstAnnotation.Description;
}
/// <summary>
/// Loads an image from disk into a Vision API image object.
/// </summary>
/// <param name="filePath">Path of the image file passed to <c>Image.FromFile</c>.</param>
/// <returns>The image produced by <c>Image.FromFile</c>.</returns>
private Google.Cloud.Vision.V1.Image GetImageFromPath(string filePath)
{
    var loadedImage = Google.Cloud.Vision.V1.Image.FromFile(filePath);
    return loadedImage;
}
编辑
谢谢里科
但代码返回的是下面这段 JSON,而不是 PDF 的文本:
{ "responses": [ { "responses": [ { "fullTextAnnotation": { "pages": [ { "property": { "detectedLanguages": [ { "languageCode": "en", "confidence": 0.92 }, { "languageCode": "fil", "confidence": 0.02 }, { "languageCode": "af", "confidence": 0.01 } ] }, "width": 841, "height": 595, "blocks": [ { "property": { "detectedLanguages": [ { "languageCode": "en", "confidence": 0.33 }, { "languageCode": "fil", "confidence": 0.29 } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.587395966, "y": 0.9210084 }, { "x": 0.369797856, "y": 0.640336156 }, { "x": 0.4530321, "y": 0.5126051 }, { "x": 0.6706302, "y": 0.7932773 } ] }, "paragraphs": [ { "property": { "detectedLanguages": [ { "languageCode": "fil", "confidence": 0.47 }, { "languageCode": "en", "confidence": 0.39 } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.587395966, "y": 0.9210084 }, { "x": 0.372176, "y": 0.6386555 }, { "x": 0.416171223, "y": 0.5714286 }, { "x": 0.6313912, "y": 0.8554622 } ] }, "words": [ { "boundingBox": { "normalizedVertices": [ { "x": 0.529132, "y": 0.8436975 }, { "x": 0.4649227, "y": 0.761344552 }, { "x": 0.4803805, "y": 0.73613447 }, { "x": 0.544589758, "y": 0.8201681 } ] }, "symbols": [ { "text": "M", "confidence": 0.99 }, { "text": "e", "confidence": 0.99 }, { "text": "n", "confidence": 1 }, { "text": "g", "confidence": 0.99 }, { "text": "m", "confidence": 0.99 }, { "text": "e", "confidence": 0.99 }, { "text": "n", "confidence": 1 }, { "property": { "detectedBreak": { "type": "SPACE" } }, "text": "g", "confidence": 0.99 } ], "confidence": 0.99 }, { "boundingBox": { "normalizedVertices": [ { "x": 0.460166454, "y": 0.754621863 }, { "x": 0.445897728, "y": 0.73613447 }, { "x": 0.461355537, "y": 0.712605059 }, { "x": 0.475624263, "y": 0.731092453 } ] }, "symbols": [ { "text": "L", "confidence": 0.99 }, { "property": { "detectedBreak": { "type": "EOL_SURE_SPACE" } }, "text": "u", "confidence": 0.99 } ], "confidence": 0.99 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } 
] }, "boundingBox": { "normalizedVertices": [ { "x": 0.58501786, "y": 0.877310932 }, { "x": 0.5731272, "y": 0.8605042 }, { "x": 0.5862069, "y": 0.840336144 }, { "x": 0.5980975, "y": 0.857142866 } ] }, "symbols": [ { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "a", "confidence": 0.62 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "t", "confidence": 0.98 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ], "detectedBreak": { "type": "SPACE" } }, "text": "e", "confidence": 0.94 } ], "confidence": 0.84 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.568371, "y": 0.8537815 }, { "x": 0.549346, "y": 0.8302521 }, { "x": 0.5636147, "y": 0.8084034 }, { "x": 0.581450641, "y": 0.833613455 } ] }, "symbols": [ { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "K", "confidence": 0.92 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "e", "confidence": 0.98 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ], "detectedBreak": { "type": "SPACE" } }, "text": "y", "confidence": 0.98 } ], "confidence": 0.96 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "boundingBox": { "normalizedVertices": [ { "x": 0.5457788, "y": 0.8235294 }, { "x": 0.5279429, "y": 0.8016807 }, { "x": 0.542211652, "y": 0.779831946 }, { "x": 0.560047567, "y": 0.803361356 } ] }, "symbols": [ { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "L", "confidence": 0.96 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ] }, "text": "a", "confidence": 1 }, { "property": { "detectedLanguages": [ { "languageCode": "en" } ], "detectedBreak": { "type": "SPACE" } }, "text": "b", "co
如何获取pdf文本?
这是对存储在 Google 云存储中的 PDF 文件进行 OCR 的代码片段。它使用 batch annotate(批量注释),每个请求最多处理 5 页。
编辑:代码现在使用本地文件。它会返回包含 PDF 文件内容的响应。
using System;
using System.IO;
using Google.Cloud.Vision.V1;
using Google.Protobuf;
using System.Collections.Generic;
/// <summary>
/// Console sample that sends a local PDF to the Cloud Vision file-annotation
/// API with document text detection and prints the recognised text.
/// </summary>
public class QuickStart
{
    /// <summary>
    /// Reads the PDF from disk, submits it in a single batch request and writes
    /// the text of every annotated page to standard output.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    public static void Main(string[] args)
    {
        var client = ImageAnnotatorClient.Create();

        // The file content is sent inline, so no Cloud Storage upload is needed.
        byte[] pdfBytes = File.ReadAllBytes("your/complete/local/file/path/here.pdf");

        var syncRequest = new AnnotateFileRequest
        {
            InputConfig = new InputConfig
            {
                Content = ByteString.CopyFrom(pdfBytes),
                // Supported mime_types are: 'application/pdf' and 'image/tiff'
                MimeType = "application/pdf"
            }
        };
        syncRequest.Features.Add(new Feature
        {
            Type = Feature.Types.Type.DocumentTextDetection
        });

        var requests = new List<AnnotateFileRequest> { syncRequest };
        var response = client.BatchAnnotateFiles(requests);

        // Printing the response object itself dumps the whole annotation tree as
        // JSON (pages, blocks, symbols, bounding boxes). The plain text lives in
        // FullTextAnnotation.Text of each per-page response, nested one level
        // below each per-file response.
        foreach (var fileResponse in response.Responses)
        {
            foreach (var pageResponse in fileResponse.Responses)
            {
                var annotation = pageResponse.FullTextAnnotation;

                // A page without any detected text carries no annotation.
                if (annotation != null)
                {
                    Console.WriteLine(annotation.Text);
                }
            }
        }
    }
}