检查可搜索的 PDF 是否已被 OCR 或是否为可搜索的 PDF TRUE
Check if a PDF searchable has been OCR’d Or is a PDF searchable TRUE
是否有任何 Python 方法来识别 PDF 是否经过 OCR(文本质量差)与可搜索 PDF(文本质量完美)?
使用 pdf 的元数据
import pprint
import PyPDF2
def get_doc_info(path):
    """Pretty-print and return the document-information (/Info) dictionary of a PDF.

    Args:
        path: Filesystem path to the PDF file.

    Returns:
        The metadata dictionary exposed by ``PdfFileReader.getDocumentInfo()``
        (keys such as ``/Author``, ``/Creator``, ``/Producer``).
    """
    pp = pprint.PrettyPrinter(indent=4)
    # NOTE: the original passed 'rb' as the second positional argument, but
    # PdfFileReader's second parameter is `strict`, not a file mode -- the
    # path string alone is accepted directly by PyPDF2.
    pdf_file = PyPDF2.PdfFileReader(path)
    doc_info = pdf_file.getDocumentInfo()
    pp.pprint(doc_info)
    # Return the metadata: the usage shown elsewhere in this file
    # (`result = get_doc_info(...)`) expects a value, not None.
    return doc_info
我发现:
result = get_doc_info("PDF_SEARCHABLE_HAS_BEEN_OCRD.pdf")
{ '/Author': 'NAPS2',
'/CreationDate': "D:20200701104101+02'00'",
'/Creator': 'NAPS2',
'/Keywords': '',
'/ModDate': "D:20200701104101+02'00'",
'/Producer': 'PDFsharp 1.50.4589 (www.pdfsharp.com)'}
result = get_doc_info("PDF_SEARCHABLE_TRUE.pdf")
{ '/CreationDate': 'D:20210802122000Z',
'/Creator': 'Quadient CXM AG~Inspire~14.3.49.7',
'/Producer': ''}
我可以使用来自 PDF 元数据的 Creator 检查 PDF 的类型(True PDF 或 OCR PDF)吗?
还有另一种方法使用 python ?
如果该问题没有现成的解决方案,我如何使用深度学习/机器学习来检测可搜索 PDF 的类型(True 或 OCR)?
这是一个了解 TRUE PDF 和 OCR PDF 之间区别的视频:https://www.youtube.com/watch?v=xs8KQbxsMcw
不久前我也遇到了同样的问题!
我开发了(基于一些 SO post 我不记得了)这个功能:
def get_scanned_pages_percentage(filepath: str) -> float:
    """Estimate what percentage of a PDF's text-bearing pages are scanned images.

    For each page that contains text, the page is rendered before and after
    stripping every text object from its content stream (via the sibling
    helper ``remove_all_text``).  If both renders are pixel-identical, the
    text was invisible -- an OCR layer over a scanned image.  If they differ,
    the text was actually drawn, i.e. a "true" digital PDF.

    Args:
        filepath: Path to the PDF file.

    Returns:
        Percentage (0-100) of non-empty pages whose text is an OCR layer;
        0 when the document has no text-bearing pages.

    Note:
        Requires PyMuPDF (``fitz``).  The in-memory document is mutated
        (text is stripped), so do not save ``doc`` afterwards.
    """
    total_pages = 0
    total_scanned_pages = 0
    with fitz.open(filepath) as doc:
        for page in doc:
            text = page.getText().strip()
            if len(text) == 0:
                # A page without text cannot be classified either way -- skip.
                continue
            total_pages += 1
            pix1 = page.getPixmap(alpha=False)  # render page with its text
            remove_all_text(doc, page)
            pix2 = page.getPixmap(alpha=False)  # render page without text
            img1 = pix1.getImageData("png")
            img2 = pix2.getImageData("png")
            if img1 == img2:
                # Removing the text changed nothing visually, so the text was
                # an invisible OCR layer.  (The original re-checked
                # `len(text) > 0` here, which is always true after the guard
                # above -- dead code, removed.)
                total_scanned_pages += 1
    if total_pages == 0:
        return 0
    return (total_scanned_pages / total_pages) * 100
如果 pdf 是包含 OCR 文本的图像,此函数将给出 100(或接近它),如果是原始数字 pdf,则给出 0。
删除所有文本:
def remove_all_text(doc, page):
    """Remove every text object from one PDF page's content stream, in place.

    Operates at the raw PDF level: locates BT ... ET (begin-text / end-text)
    operator pairs inside the page's cleaned content stream and deletes those
    spans, leaving images and vector graphics untouched.

    Args:
        doc:  the open fitz.Document that owns ``page``.
        page: the fitz.Page whose text should be stripped.
    """
    page.cleanContents()  # syntax cleaning of page appearance commands
    # xref of the cleaned command source (bytes object); after cleanContents()
    # the page has a single consolidated content stream.
    xref = page.getContents()[0]
    cont = doc.xrefStream(xref)  # read it
    # The stream is extracted as bytes; we then search for the operators that
    # delimit text objects and delete each matching span.
    ba_cont = bytearray(cont)  # a modifiable copy, edited in place
    pos = 0
    changed = False  # switch indicates changes
    while pos < len(cont) - 1:
        pos = ba_cont.find(b"BT\n", pos)  # begin text object
        if pos < 0:
            break  # not (more) found
        pos2 = ba_cont.find(b"ET\n", pos)  # end text object
        if pos2 <= pos:
            break  # major error in PDF page definition!
        # NOTE(review): this deletes pos..pos2+1, i.e. up through "ET" but
        # leaves the trailing "\n" of "ET\n" in the stream -- presumably
        # harmless whitespace; confirm against the PDF content-stream spec.
        ba_cont[pos: pos2 + 2] = b""  # remove text object
        changed = True
    if changed:  # we have indeed removed some text
        doc.updateStream(xref, ba_cont)  # write back command stream w/o text
是否有任何 Python 方法来识别 PDF 是否经过 OCR(文本质量差)与可搜索 PDF(文本质量完美)?
使用 pdf 的元数据
import pprint
import PyPDF2
def get_doc_info(path):
    """Pretty-print the document-information dictionary of the PDF at ``path``."""
    reader = PyPDF2.PdfFileReader(path, 'rb')
    metadata = reader.getDocumentInfo()
    pprint.PrettyPrinter(indent=4).pprint(metadata)
我发现:
result = get_doc_info("PDF_SEARCHABLE_HAS_BEEN_OCRD.pdf")
{ '/Author': 'NAPS2',
'/CreationDate': "D:20200701104101+02'00'",
'/Creator': 'NAPS2',
'/Keywords': '',
'/ModDate': "D:20200701104101+02'00'",
'/Producer': 'PDFsharp 1.50.4589 (www.pdfsharp.com)'}
result = get_doc_info("PDF_SEARCHABLE_TRUE.pdf")
{ '/CreationDate': 'D:20210802122000Z',
'/Creator': 'Quadient CXM AG~Inspire~14.3.49.7',
'/Producer': ''}
我可以使用来自 PDF 元数据的 Creator 检查 PDF 的类型(True PDF 或 OCR PDF)吗?
还有另一种方法使用 python ?
如果该问题没有现成的解决方案,我如何使用深度学习/机器学习来检测可搜索 PDF 的类型(True 或 OCR)?
这是一个了解 TRUE PDF 和 OCR PDF 之间区别的视频:https://www.youtube.com/watch?v=xs8KQbxsMcw
不久前我也遇到了同样的问题!
我开发了(基于一些 SO post 我不记得了)这个功能:
def get_scanned_pages_percentage(filepath: str) -> float:
    """Return the percentage of text-bearing pages whose text is an
    invisible OCR layer (i.e. the page is a scanned image).

    A page counts as scanned when rendering it before and after deleting
    all of its text objects (via ``remove_all_text``) yields the exact
    same image -- the text contributed nothing visible.
    """
    pages_with_text = 0
    ocr_pages = 0
    with fitz.open(filepath) as doc:
        for page in doc:
            text = page.getText().strip()
            if not text:
                # Nothing to classify on a page without any text.
                continue
            pages_with_text += 1
            before = page.getPixmap(alpha=False).getImageData("png")
            remove_all_text(doc, page)
            after = page.getPixmap(alpha=False).getImageData("png")
            if before == after:
                # Text was invisible -> scanned page carrying an OCR layer.
                ocr_pages += 1
    return (ocr_pages / pages_with_text) * 100 if pages_with_text else 0
如果 pdf 是包含 OCR 文本的图像,此函数将给出 100(或接近它),如果是原始数字 pdf,则给出 0。
删除所有文本:
def remove_all_text(doc, page):
    """Remove every text object from one PDF page's content stream, in place.

    Works at the raw PDF level: finds BT ... ET (begin-text / end-text)
    operator pairs in the page's cleaned content stream and deletes those
    spans, leaving images and vector graphics intact.

    Args:
        doc:  the open fitz.Document that owns ``page``.
        page: the fitz.Page whose text should be stripped.
    """
    page.cleanContents()  # syntax cleaning of page appearance commands
    # xref of the cleaned command source (bytes object); cleanContents()
    # leaves the page with a single consolidated content stream.
    xref = page.getContents()[0]
    cont = doc.xrefStream(xref)  # read it
    # The stream is extracted as bytes, then scanned for the operators that
    # delimit text objects so each matching span can be deleted.
    ba_cont = bytearray(cont)  # a modifiable copy, edited in place
    pos = 0
    changed = False  # switch indicates changes
    while pos < len(cont) - 1:
        pos = ba_cont.find(b"BT\n", pos)  # begin text object
        if pos < 0:
            break  # not (more) found
        pos2 = ba_cont.find(b"ET\n", pos)  # end text object
        if pos2 <= pos:
            break  # major error in PDF page definition!
        # NOTE(review): deletes pos..pos2+1, i.e. through "ET" but not the
        # trailing "\n" of "ET\n" -- presumably harmless whitespace; verify.
        ba_cont[pos: pos2 + 2] = b""  # remove text object
        changed = True
    if changed:  # we have indeed removed some text
        doc.updateStream(xref, ba_cont)  # write back command stream w/o text