将扫描的 pdf 转换为文本 python
Convert scanned pdf to text python
我有一个扫描的 pdf 文件,我尝试从中提取文本。
我尝试使用 pypdfocr 对它进行 OCR,但出现错误:
"could not found ghostscript in the usual place"
经过搜索,我找到了这个解决方案Linking Ghostscript to pypdfocr in Windows Platform,我尝试下载 GhostScript 并将其放入环境变量,但它仍然有同样的错误。
如何使用 python 在扫描的 pdf 文件中搜索文本?
谢谢。
编辑:这是我的代码示例:
import os
import sys
import re
import json
import shutil
import glob
from pypdfocr import pypdfocr_gs
from pypdfocr import pypdfocr_tesseract
from PIL import Image
# Directory that holds the scanned PDFs. PATH_TO_MY_SCANNED_PDF is a
# placeholder; it is not defined anywhere in the visible code.
path = PATH_TO_MY_SCANNED_PDF
# NOTE(review): mainL is not referenced again in the visible code.
mainL = []
# Configuration dict passed to the PyGs / PyTesseract constructors; left empty.
kk = {}
def new_init(self, kk):
    """Initialise an OCR wrapper instance with fixed defaults.

    Sets the language code to ``'heb'``, the executable name to
    ``"tesseract"`` and fills ``self.msgs`` with the error-message table.

    Args:
        self: The instance being initialised.
        kk: Configuration dict. Accepted so the signature matches the
            constructor call, but it is not read.
    """
    self.lang = 'heb'
    self.binary = "tesseract"
    # The triple-quoted message is kept flush-left so its text is unchanged.
    self.msgs = {
        'TS_MISSING': """
Could not execute %s
Please make sure you have Tesseract installed correctly
""" % self.binary,
        'TS_VERSION': 'Tesseract version is too old',
        'TS_img_MISSING': 'Cannot find specified tiff file',
        'TS_FAILED': 'Tesseract-OCR execution failed!',
    }
# Monkey-patch PyTesseract so that new instances are initialised by new_init.
pypdfocr_tesseract.PyTesseract.__init__ = new_init
# Shared helper objects used by secFile(): wow.make_img_from_pdf() and
# tt.make_hocr_from_pnm().
wow = pypdfocr_gs.PyGs(kk)
tt = pypdfocr_tesseract.PyTesseract(kk)
def secFile(filename, oldfilename):
    """OCR one PDF via the image -> TIFF -> hOCR route and record its numbers.

    Steps: render the PDF with ``wow.make_img_from_pdf``, re-save every JPEG
    as TIFF, run ``tt.make_hocr_from_pnm`` on each TIFF, concatenate the
    resulting ``.html`` files, hand the text to ``findNum`` and finally
    delete every regular file in the working folder.

    Args:
        filename: Path of the (ASCII-named) PDF copy to process.
        oldfilename: Original PDF path; used by ``findNum`` to name the
            output file.
    """
    wow.make_img_from_pdf(filename)

    # NOTE(review): the JPEGs are globbed from a hard-coded directory while
    # the later steps use "PATH" -- confirm both mean the same folder.
    for jpg_path in glob.glob("X:/e206333106/ocr-114/balagan/" + '*.jpg'):
        im = Image.open(jpg_path)
        im.save(jpg_path + ".tiff")

    for tiff_path in glob.glob("PATH" + '*.tiff'):
        tt.make_hocr_from_pnm(tiff_path)

    # Each file's text is prefixed with "#" as a separator.
    pdftxt = ""
    for html_path in glob.glob("PATH" + '*.html'):
        with open(html_path) as myfile:
            pdftxt = pdftxt + "#" + "".join(line.rstrip() for line in myfile)
    findNum(pdftxt, oldfilename)

    # Best-effort cleanup: a file that cannot be removed is reported and
    # skipped so the remaining files are still deleted.
    folder = "PATH"
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:  # "as" form is valid on Python 2.6+ and 3
            print(e)
def pdf2ocr(filename):
    """Run the ``pypdfocr`` command-line tool with language ``heb``.

    Args:
        filename: Path of the PDF file to OCR.
    """
    # Quote the path so a file name containing spaces is passed to the
    # command as a single argument.
    os.system('pypdfocr -l heb "%s"' % filename)
def ocr2txt(filename):
    """Convert the ``*_ocr.pdf`` sibling of *filename* to text and scan it.

    Runs the ``pdf2txt`` command on ``<name>_ocr.pdf``, reads the resulting
    text file back with line endings stripped, and passes the text to
    ``findNum``.

    Args:
        filename: Path of the original PDF (ending in ``.pdf``).
    """
    output1 = "PATH" + os.path.basename(filename.replace(".pdf", "_ocr.txt"))
    input1 = filename.replace(".pdf", "_ocr.pdf")
    # The "-o" flag must be part of the command string; paths are quoted so
    # names containing spaces stay single arguments.
    os.system('pdf2txt -o "%s" "%s"' % (output1, input1))
    with open(output1) as myfile:
        pdftxt = "".join(line.rstrip() for line in myfile)
    findNum(pdftxt, filename)
def findNum(pdftxt, pdffile):
    """Write every standalone run of digits found in *pdftxt* to a file.

    The output file is ``'PATH' + basename(pdffile) + '.txt'``. Each number
    is written preceded by a comma, e.g. ``,12,345``.

    Args:
        pdftxt: Text to search.
        pdffile: Path of the source PDF; only its base name is used.
    """
    numbers = re.findall(r'\b\d+\b', pdftxt)
    # "with" guarantees the file is closed even if a write fails.
    with open('PATH' + os.path.basename(pdffile) + '.txt', 'w') as output:
        for number in numbers:
            output.write(",")
            output.write(number)
def is_ascii(s):
    """Return True when every character of *s* has a code point below 128.

    An empty string counts as ASCII.
    """
    return all(ord(c) < 128 for c in s)
# Counter used to build ASCII-only temporary names for PDFs whose path
# contains non-ASCII characters.
i = 0
# os.path.join avoids hand-written backslashes, which are escape characters
# inside string literals.
files = glob.glob(os.path.join(path, '*.pdf'))
print(path)
print(files)
for pdf_file in files:
    if not pdf_file.endswith(".pdf"):
        continue
    if is_ascii(pdf_file):
        # ASCII path: run the command-line tools on the file directly.
        print(pdf_file)
        pdf2ocr(pdf_file)
        ocr2txt(pdf_file)
    else:
        # Non-ASCII path: work on a copy with a numeric name instead.
        newname = "PATH" + str(i) + ".pdf"
        shutil.copyfile(pdf_file, newname)
        print(newname)
        secFile(newname, pdf_file)
        i = i + 1

# Move the generated *_ocr.pdf files out of the input directory.
files = glob.glob(os.path.join(path, '*_ocr.pdf'))
for ocr_file in files:
    print(ocr_file)
    shutil.copyfile(ocr_file, "PATH" + os.path.basename(ocr_file))
    os.remove(ocr_file)
看看这个库:https://pypi.python.org/pypi/pypdfocr
但是 PDF 文件也可以包含图像。您也许能够分析页面内容流。有些扫描仪会把单个扫描页面拆分成多张图像,因此您无法通过 ghostscript 获得文本。
看看我的代码,它对我有用。
import os
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import gc
def Get_text_from_image(pdf_path, lang='eng', resolution=300):
    """OCR every page of a PDF and return the recognised text per page.

    The PDF is rasterised with Wand, each page is encoded as a JPEG blob,
    reopened with PIL and passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to read.
        lang: Language code passed to pytesseract.
        resolution: Resolution passed to Wand when reading the PDF.

    Returns:
        A list of strings, one per page, in page order.
    """
    extracted_text = []
    # Wand images hold native resources; "with" releases them promptly.
    with wi(filename=pdf_path, resolution=resolution) as pdf:
        with pdf.convert('jpeg') as pdf_img:
            for img in pdf_img.sequence:
                with wi(image=img) as page:
                    blob = page.make_blob('jpeg')
                im = Image.open(io.BytesIO(blob))
                extracted_text.append(
                    pytesseract.image_to_string(im, lang=lang))
    return extracted_text
我通过编辑 /etc/ImageMagick-6/policy.xml 解决了这个问题,把 PDF 那一行的权限改为 "read|write":
打开终端并更改路径
cd /etc/ImageMagick-6
nano policy.xml
<policy domain="coder" rights="read" pattern="PDF" />
change to
<policy domain="coder" rights="read|write" pattern="PDF" />
exit
当我从 pdf 图像中提取文本时,我遇到了一些问题,请阅读以下链接:
error-constitute-c-readimage-412
authorized-to-convert-pdf-to-an-image
To increase the memory limit, please go through the links below:
enter code here
https://github.com/phw/peek/issues/112
https://github.com/ImageMagick/ImageMagick/issues/396
转换 pdf,使用 pytesseract 进行 OCR,并将 pdf 中的每个页面导出到文本文件。
安装这些....
conda install -c conda-forge pytesseract
conda install -c conda-forge tesseract
pip install pdf2image
import pytesseract
from pdf2image import convert_from_path
import glob
# OCR every PDF matched by the glob and write one text file per page.
pdfs = glob.glob(r"yourPath\*.pdf")

for pdf_path in pdfs:
    # Second positional argument of convert_from_path is the DPI.
    pages = convert_from_path(pdf_path, 500)

    for pageNum, page_image in enumerate(pages):
        text = pytesseract.image_to_string(page_image, lang='eng')

        # pdf_path[:-4] drops the ".pdf" suffix. UTF-8 is set explicitly so
        # OCR output outside the locale code page cannot fail on write.
        out_name = f'{pdf_path[:-4]}_page{pageNum}.txt'
        with open(out_name, 'w', encoding='utf-8') as the_file:
            the_file.write(text)
PyPDF2 是作为 PDF 工具包构建的 python 库。它能够:
Extracting document information (title, author, …)
Splitting documents page by page
Merging documents page by page
Cropping pages
Merging multiple pages into a single page
Encrypting and decrypting PDF files
and more!
要安装 PyPDF2,请在命令行中运行以下命令:
pip install PyPDF2
代码:
import PyPDF2
# The context manager closes the file even if parsing raises.
with open('myPdf.pdf', 'rb') as pdfFileObj:
    # PdfReader / .pages / extract_text() are the current PyPDF2 names for
    # PdfFileReader / numPages / getPage / extractText.
    pdfReader = PyPDF2.PdfReader(pdfFileObj)
    print(len(pdfReader.pages))
    pageObj = pdfReader.pages[0]
    # NOTE: this returns only text already embedded in the PDF; a page that
    # is purely a scanned image needs OCR instead.
    print(pageObj.extract_text())
此解决方案适用于 Linux 操作系统 (NoelOCR)
安装 NoelOCR
pip3 install NoelOCR
使用它
# NoelOCR is a third-party package; its internals are not visible here.
import NoelOCR as nm
# Presumably runs OCR on input.pdf and returns the extracted text --
# confirm against the package documentation.
text = nm.processPDF('input.pdf')
print(text)
之后您应该从扫描的 PDF 中获得纯文本。
我有一个扫描的 pdf 文件,我尝试从中提取文本。 我尝试使用 pypdfocr 对它进行 OCR,但出现错误:
"could not found ghostscript in the usual place"
经过搜索,我找到了这个解决方案Linking Ghostscript to pypdfocr in Windows Platform,我尝试下载 GhostScript 并将其放入环境变量,但它仍然有同样的错误。
如何使用 python 在扫描的 pdf 文件中搜索文本?
谢谢。
编辑:这是我的代码示例:
import os
import sys
import re
import json
import shutil
import glob
from pypdfocr import pypdfocr_gs
from pypdfocr import pypdfocr_tesseract
from PIL import Image
# Directory that holds the scanned PDFs. PATH_TO_MY_SCANNED_PDF is a
# placeholder; it is not defined anywhere in the visible code.
path = PATH_TO_MY_SCANNED_PDF
# NOTE(review): mainL is not referenced again in the visible code.
mainL = []
# Configuration dict passed to the PyGs / PyTesseract constructors; left empty.
kk = {}
def new_init(self, kk):
    """Initialise an OCR wrapper instance with fixed defaults.

    Sets the language code to ``'heb'``, the executable name to
    ``"tesseract"`` and fills ``self.msgs`` with the error-message table.

    Args:
        self: The instance being initialised.
        kk: Configuration dict. Accepted so the signature matches the
            constructor call, but it is not read.
    """
    self.lang = 'heb'
    self.binary = "tesseract"
    # The triple-quoted message is kept flush-left so its text is unchanged.
    self.msgs = {
        'TS_MISSING': """
Could not execute %s
Please make sure you have Tesseract installed correctly
""" % self.binary,
        'TS_VERSION': 'Tesseract version is too old',
        'TS_img_MISSING': 'Cannot find specified tiff file',
        'TS_FAILED': 'Tesseract-OCR execution failed!',
    }
# Monkey-patch PyTesseract so that new instances are initialised by new_init.
pypdfocr_tesseract.PyTesseract.__init__ = new_init
# Shared helper objects used by secFile(): wow.make_img_from_pdf() and
# tt.make_hocr_from_pnm().
wow = pypdfocr_gs.PyGs(kk)
tt = pypdfocr_tesseract.PyTesseract(kk)
def secFile(filename, oldfilename):
    """OCR one PDF via the image -> TIFF -> hOCR route and record its numbers.

    Steps: render the PDF with ``wow.make_img_from_pdf``, re-save every JPEG
    as TIFF, run ``tt.make_hocr_from_pnm`` on each TIFF, concatenate the
    resulting ``.html`` files, hand the text to ``findNum`` and finally
    delete every regular file in the working folder.

    Args:
        filename: Path of the (ASCII-named) PDF copy to process.
        oldfilename: Original PDF path; used by ``findNum`` to name the
            output file.
    """
    wow.make_img_from_pdf(filename)

    # NOTE(review): the JPEGs are globbed from a hard-coded directory while
    # the later steps use "PATH" -- confirm both mean the same folder.
    for jpg_path in glob.glob("X:/e206333106/ocr-114/balagan/" + '*.jpg'):
        im = Image.open(jpg_path)
        im.save(jpg_path + ".tiff")

    for tiff_path in glob.glob("PATH" + '*.tiff'):
        tt.make_hocr_from_pnm(tiff_path)

    # Each file's text is prefixed with "#" as a separator.
    pdftxt = ""
    for html_path in glob.glob("PATH" + '*.html'):
        with open(html_path) as myfile:
            pdftxt = pdftxt + "#" + "".join(line.rstrip() for line in myfile)
    findNum(pdftxt, oldfilename)

    # Best-effort cleanup: a file that cannot be removed is reported and
    # skipped so the remaining files are still deleted.
    folder = "PATH"
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:  # "as" form is valid on Python 2.6+ and 3
            print(e)
def pdf2ocr(filename):
    """Run the ``pypdfocr`` command-line tool with language ``heb``.

    Args:
        filename: Path of the PDF file to OCR.
    """
    # Quote the path so a file name containing spaces is passed to the
    # command as a single argument.
    os.system('pypdfocr -l heb "%s"' % filename)
def ocr2txt(filename):
    """Convert the ``*_ocr.pdf`` sibling of *filename* to text and scan it.

    Runs the ``pdf2txt`` command on ``<name>_ocr.pdf``, reads the resulting
    text file back with line endings stripped, and passes the text to
    ``findNum``.

    Args:
        filename: Path of the original PDF (ending in ``.pdf``).
    """
    output1 = "PATH" + os.path.basename(filename.replace(".pdf", "_ocr.txt"))
    input1 = filename.replace(".pdf", "_ocr.pdf")
    # The "-o" flag must be part of the command string; paths are quoted so
    # names containing spaces stay single arguments.
    os.system('pdf2txt -o "%s" "%s"' % (output1, input1))
    with open(output1) as myfile:
        pdftxt = "".join(line.rstrip() for line in myfile)
    findNum(pdftxt, filename)
def findNum(pdftxt, pdffile):
    """Write every standalone run of digits found in *pdftxt* to a file.

    The output file is ``'PATH' + basename(pdffile) + '.txt'``. Each number
    is written preceded by a comma, e.g. ``,12,345``.

    Args:
        pdftxt: Text to search.
        pdffile: Path of the source PDF; only its base name is used.
    """
    numbers = re.findall(r'\b\d+\b', pdftxt)
    # "with" guarantees the file is closed even if a write fails.
    with open('PATH' + os.path.basename(pdffile) + '.txt', 'w') as output:
        for number in numbers:
            output.write(",")
            output.write(number)
def is_ascii(s):
    """Return True when every character of *s* has a code point below 128.

    An empty string counts as ASCII.
    """
    return all(ord(c) < 128 for c in s)
# Counter used to build ASCII-only temporary names for PDFs whose path
# contains non-ASCII characters.
i = 0
# os.path.join avoids hand-written backslashes, which are escape characters
# inside string literals.
files = glob.glob(os.path.join(path, '*.pdf'))
print(path)
print(files)
for pdf_file in files:
    if not pdf_file.endswith(".pdf"):
        continue
    if is_ascii(pdf_file):
        # ASCII path: run the command-line tools on the file directly.
        print(pdf_file)
        pdf2ocr(pdf_file)
        ocr2txt(pdf_file)
    else:
        # Non-ASCII path: work on a copy with a numeric name instead.
        newname = "PATH" + str(i) + ".pdf"
        shutil.copyfile(pdf_file, newname)
        print(newname)
        secFile(newname, pdf_file)
        i = i + 1

# Move the generated *_ocr.pdf files out of the input directory.
files = glob.glob(os.path.join(path, '*_ocr.pdf'))
for ocr_file in files:
    print(ocr_file)
    shutil.copyfile(ocr_file, "PATH" + os.path.basename(ocr_file))
    os.remove(ocr_file)
看看这个库:https://pypi.python.org/pypi/pypdfocr 但是 PDF 文件也可以包含图像。您也许能够分析页面内容流。有些扫描仪会把单个扫描页面拆分成多张图像,因此您无法通过 ghostscript 获得文本。
看看我的代码,它对我有用。
import os
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import gc
def Get_text_from_image(pdf_path, lang='eng', resolution=300):
    """OCR every page of a PDF and return the recognised text per page.

    The PDF is rasterised with Wand, each page is encoded as a JPEG blob,
    reopened with PIL and passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to read.
        lang: Language code passed to pytesseract.
        resolution: Resolution passed to Wand when reading the PDF.

    Returns:
        A list of strings, one per page, in page order.
    """
    extracted_text = []
    # Wand images hold native resources; "with" releases them promptly.
    with wi(filename=pdf_path, resolution=resolution) as pdf:
        with pdf.convert('jpeg') as pdf_img:
            for img in pdf_img.sequence:
                with wi(image=img) as page:
                    blob = page.make_blob('jpeg')
                im = Image.open(io.BytesIO(blob))
                extracted_text.append(
                    pytesseract.image_to_string(im, lang=lang))
    return extracted_text
我通过编辑 /etc/ImageMagick-6/policy.xml 解决了这个问题,把 PDF 那一行的权限改为 "read|write":
打开终端并更改路径
cd /etc/ImageMagick-6
nano policy.xml
<policy domain="coder" rights="read" pattern="PDF" />
change to
<policy domain="coder" rights="read|write" pattern="PDF" />
exit
当我从 pdf 图像中提取文本时,我遇到了一些问题,请阅读以下链接:
error-constitute-c-readimage-412
authorized-to-convert-pdf-to-an-image
To increase the memory limit, please go through the links below:
enter code here
https://github.com/phw/peek/issues/112
https://github.com/ImageMagick/ImageMagick/issues/396
转换 pdf,使用 pytesseract 进行 OCR,并将 pdf 中的每个页面导出到文本文件。
安装这些....
conda install -c conda-forge pytesseract
conda install -c conda-forge tesseract
pip install pdf2image
import pytesseract
from pdf2image import convert_from_path
import glob
# OCR every PDF matched by the glob and write one text file per page.
pdfs = glob.glob(r"yourPath\*.pdf")

for pdf_path in pdfs:
    # Second positional argument of convert_from_path is the DPI.
    pages = convert_from_path(pdf_path, 500)

    for pageNum, page_image in enumerate(pages):
        text = pytesseract.image_to_string(page_image, lang='eng')

        # pdf_path[:-4] drops the ".pdf" suffix. UTF-8 is set explicitly so
        # OCR output outside the locale code page cannot fail on write.
        out_name = f'{pdf_path[:-4]}_page{pageNum}.txt'
        with open(out_name, 'w', encoding='utf-8') as the_file:
            the_file.write(text)
PyPDF2 是作为 PDF 工具包构建的 python 库。它能够:
Extracting document information (title, author, …)
Splitting documents page by page
Merging documents page by page
Cropping pages
Merging multiple pages into a single page
Encrypting and decrypting PDF files
and more!
要安装 PyPDF2,请在命令行中运行以下命令:
pip install PyPDF2
代码:
import PyPDF2
# The context manager closes the file even if parsing raises.
with open('myPdf.pdf', 'rb') as pdfFileObj:
    # PdfReader / .pages / extract_text() are the current PyPDF2 names for
    # PdfFileReader / numPages / getPage / extractText.
    pdfReader = PyPDF2.PdfReader(pdfFileObj)
    print(len(pdfReader.pages))
    pageObj = pdfReader.pages[0]
    # NOTE: this returns only text already embedded in the PDF; a page that
    # is purely a scanned image needs OCR instead.
    print(pageObj.extract_text())
此解决方案适用于 Linux 操作系统 (NoelOCR)
安装 NoelOCR
pip3 install NoelOCR
使用它
# NoelOCR is a third-party package; its internals are not visible here.
import NoelOCR as nm

# Presumably runs OCR on input.pdf and returns the extracted text --
# confirm against the package documentation.
text = nm.processPDF('input.pdf')
print(text)
之后您应该从扫描的 PDF 中获得纯文本。