使用 Google Vision API 对 PDF 进行 OCR 时,如何获取行和段落,而不是单个符号
Get Lines and Paragraphs, not symbols from Google Vision API OCR on PDF
我正在尝试使用 Google Cloud Vision API 新近支持的 PDF/TIFF 文档文本检测功能。使用他们的示例代码,我能够提交 PDF 并收到包含提取文本的 JSON 对象。我的问题是,保存到 GCS 的 JSON 文件只包含 "symbols"(即每个单词中的每个字符)的边界框和文本,这使得 JSON 对象非常庞大且难以使用。我希望能够获取 "LINES"、"PARAGRAPHS" 和 "BLOCKS" 的文本和边界框,但似乎无法通过 AsyncAnnotateFileRequest() 方法做到这一点。
示例代码如下:
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR a PDF/TIFF stored on GCS and print the first page's full text.

    Submits an asynchronous DOCUMENT_TEXT_DETECTION request, waits for it to
    finish, lists the JSON output files written under the destination
    prefix, and prints the full text of the first page found in the first
    output file.

    Args:
        gcs_source_uri: ``gs://`` URI of the input PDF/TIFF file.
        gcs_destination_uri: ``gs://bucket/prefix`` URI under which the JSON
            output files are written.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://bucket/prefix``.
    """
    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=180)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            'Expected a destination of the form gs://bucket/prefix, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    # Pass the name positionally: the keyword for this parameter differs
    # between google-cloud-storage releases (bucket_name / bucket_or_name).
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
不幸的是,使用 DOCUMENT_TEXT_DETECTION 类型时,您只能得到每页的全文或单个符号。不过,把符号拼接成行和段落并不太难,类似下面的代码应该可以工作(在你的示例基础上扩展):
# Rebuild line- and paragraph-level text from the per-symbol OCR output.
# Expects `annotation` to be a full_text_annotation and `vision` to be the
# google.cloud.vision module, both already in scope.
breaks = vision.enums.TextAnnotation.DetectedBreak.BreakType

# Breaks that put a space after the symbol (EOL_SURE_SPACE also ends the line).
space_breaks = (breaks.SPACE, breaks.SURE_SPACE, breaks.EOL_SURE_SPACE)
# Breaks that terminate the current line. HYPHEN marks an end-of-line hyphen
# that is not part of the text, so the line ends without adding a space.
line_end_breaks = (breaks.EOL_SURE_SPACE, breaks.HYPHEN, breaks.LINE_BREAK)

paragraphs = []
lines = []

for page in annotation.pages:
    for block in page.blocks:
        for paragraph in block.paragraphs:
            para = ''
            line = ''
            for word in paragraph.words:
                for symbol in word.symbols:
                    line += symbol.text
                    break_type = symbol.property.detected_break.type
                    if break_type in space_breaks:
                        line += ' '
                    if break_type in line_end_breaks:
                        lines.append(line)
                        para += line
                        line = ''
            # Flush text that was not followed by a line-ending break so the
            # tail of the paragraph is not silently dropped.
            if line:
                lines.append(line)
                para += line
            paragraphs.append(para)

print(paragraphs)
print(lines)
我正在尝试使用 Google Cloud Vision API 新近支持的 PDF/TIFF 文档文本检测功能。使用他们的示例代码,我能够提交 PDF 并收到包含提取文本的 JSON 对象。我的问题是,保存到 GCS 的 JSON 文件只包含 "symbols"(即每个单词中的每个字符)的边界框和文本,这使得 JSON 对象非常庞大且难以使用。我希望能够获取 "LINES"、"PARAGRAPHS" 和 "BLOCKS" 的文本和边界框,但似乎无法通过 AsyncAnnotateFileRequest() 方法做到这一点。
示例代码如下:
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR a PDF/TIFF stored on GCS and print the first page's full text.

    Submits an asynchronous DOCUMENT_TEXT_DETECTION request, waits for it to
    finish, lists the JSON output files written under the destination
    prefix, and prints the full text of the first page found in the first
    output file.

    Args:
        gcs_source_uri: ``gs://`` URI of the input PDF/TIFF file.
        gcs_destination_uri: ``gs://bucket/prefix`` URI under which the JSON
            output files are written.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://bucket/prefix``.
    """
    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=180)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            'Expected a destination of the form gs://bucket/prefix, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    # Pass the name positionally: the keyword for this parameter differs
    # between google-cloud-storage releases (bucket_name / bucket_or_name).
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
不幸的是,使用 DOCUMENT_TEXT_DETECTION 类型时,您只能得到每页的全文或单个符号。不过,把符号拼接成行和段落并不太难,类似下面的代码应该可以工作(在你的示例基础上扩展):
# Rebuild line- and paragraph-level text from the per-symbol OCR output.
# Expects `annotation` to be a full_text_annotation and `vision` to be the
# google.cloud.vision module, both already in scope.
breaks = vision.enums.TextAnnotation.DetectedBreak.BreakType

# Breaks that put a space after the symbol (EOL_SURE_SPACE also ends the line).
space_breaks = (breaks.SPACE, breaks.SURE_SPACE, breaks.EOL_SURE_SPACE)
# Breaks that terminate the current line. HYPHEN marks an end-of-line hyphen
# that is not part of the text, so the line ends without adding a space.
line_end_breaks = (breaks.EOL_SURE_SPACE, breaks.HYPHEN, breaks.LINE_BREAK)

paragraphs = []
lines = []

for page in annotation.pages:
    for block in page.blocks:
        for paragraph in block.paragraphs:
            para = ''
            line = ''
            for word in paragraph.words:
                for symbol in word.symbols:
                    line += symbol.text
                    break_type = symbol.property.detected_break.type
                    if break_type in space_breaks:
                        line += ' '
                    if break_type in line_end_breaks:
                        lines.append(line)
                        para += line
                        line = ''
            # Flush text that was not followed by a line-ending break so the
            # tail of the paragraph is not silently dropped.
            if line:
                lines.append(line)
                para += line
            paragraphs.append(para)

print(paragraphs)
print(lines)