运行 OCRmyPDF 的问题:WinError2 和无效的版本号
Issues with running OCRmyPDF: A WinError2 and an Invalid Version Number
所以我整理了一个简单的 Python 脚本来对 PDF 进行 OCR:
from PIL import Image
from tika import parser
import argparse
import img2pdf
import ocrmypdf
def main():
    """Convert an image to a PDF, OCR that PDF, and print the extracted text.

    The image path is read from the command line. Each step is given the
    file produced by the previous step instead of relying on the helpers'
    default paths, which do not line up with each other.
    """
    # Named ``arg_parser`` so the module-level tika ``parser`` import is not
    # shadowed inside this function.
    arg_parser = argparse.ArgumentParser(description="Get text from image.")
    arg_parser.add_argument("image_path", metavar="i", help="The path to the image being used.")
    args = arg_parser.parse_args()
    image_path = args.image_path

    # convert_to_pdf returns the base name without extension and writes
    # './<name>.pdf', so rebuild that path for the OCR step.
    pdf_from_image_file_name = convert_to_pdf(image_path)
    pdf_w_ocr_file_name = ocr_pdf('./' + pdf_from_image_file_name + '.pdf')

    # ocr_pdf returns the output file name and writes it under './'.
    raw_text_from_ocr_pdf = get_text_from_pdf('./' + pdf_w_ocr_file_name)
    print(raw_text_from_ocr_pdf)
def convert_to_pdf(image_path, new_pdf_file_name="pdf_from_image"):
    """Write the image at *image_path* to './<new_pdf_file_name>.pdf'.

    Args:
        image_path: Path of the source image file.
        new_pdf_file_name: Base name (without the '.pdf' extension) of the
            PDF created in the current working directory.

    Returns:
        ``new_pdf_file_name`` as passed in, i.e. still without the extension.
    """
    # The image is opened through PIL and only its ``filename`` is handed to
    # img2pdf; ``with`` guarantees the handle is released even if the
    # conversion raises.
    with Image.open(image_path) as temp_image:
        pdf_bytes = img2pdf.convert(temp_image.filename)

    # ``with`` closes the output file even when the write fails.
    with open('./' + new_pdf_file_name + '.pdf', 'wb') as new_file:
        new_file.write(pdf_bytes)
    return new_pdf_file_name
def ocr_pdf(pdf_file_path="./temp_pdf_file_name.pdf", new_pdf_file_name="pdf_w_ocr.pdf"):
    """Run ``ocrmypdf.ocr`` on *pdf_file_path* with deskewing enabled.

    Args:
        pdf_file_path: Path of the input PDF.
        new_pdf_file_name: File name of the output PDF; it is written
            under './'.

    Returns:
        ``new_pdf_file_name`` as passed in (without the './' prefix).
    """
    output_path = './' + new_pdf_file_name
    ocrmypdf.ocr(pdf_file_path, output_path, deskew=True)
    return new_pdf_file_name
def get_text_from_pdf(pdf_file_path="./pdf_w_ocr.pdf"):
    """Return the 'content' entry of tika's parse result for *pdf_file_path*.

    Args:
        pdf_file_path: Path of the PDF to parse.
    """
    # ``parser`` here is the module-level ``from tika import parser``.
    return parser.from_file(pdf_file_path)['content']
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
当脚本执行到 import ocrmypdf
时,它会触发 [WinError 2] The system cannot find the file specified
错误,但脚本会继续往下执行。从 JPG 或 PNG 到 PDF 的转换工作正常,输出也很好。但是,当执行到 ocrmypdf.ocr(pdf_file_path, './'+new_pdf_file_name, deskew=True)
时,我得到 ValueError: invalid version number '4.0.0.20181030'
.
完整堆栈是:
[WinError 2] The system cannot find the file specified
Traceback (most recent call last):
File "workshop_v1.py", line 71, in <module>
main()
File "workshop_v1.py", line 49, in main
pdf_w_ocr_file_name = ocr_pdf()
File "workshop_v1.py", line 63, in ocr_pdf
ocrmypdf.ocr(pdf_file_path, './'+new_pdf_file_name, deskew=True)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\api.py", line 339, in ocr
check_options(options, plugin_manager)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\_validation.py", line 271, in check_options
_check_options(options, plugin_manager, ocr_engine_languages)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\_validation.py", line 266, in _check_options
plugin_manager.hook.check_options(options=options)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\hooks.py", line 286, in __call__
return self._hookexec(self, self.get_hookimpls(), kwargs)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\manager.py", line 93, in _hookexec
return self._inner_hookexec(hook, methods, kwargs)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\manager.py", line 87, in <lambda>
firstresult=hook.spec.opts.get("firstresult") if hook.spec else False,
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\callers.py", line 208, in _multicall
return outcome.get_result()
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\callers.py", line 80, in get_result
raise ex[1].with_traceback(ex[2])
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\callers.py", line 187, in _multicall
res = hook_impl.function(*args)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\builtin_plugins\tesseract_ocr.py", line 84, in check_options
version_parser=tesseract.TesseractVersion,
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\subprocess\__init__.py", line 313, in check_external_program
if found_version and version_parser(found_version) < version_parser(need_version):
File "C:\Users\xxx\anaconda3\envs\python37\lib\distutils\version.py", line 40, in __init__
self.parse(vstring)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\_exec\tesseract.py", line 72, in parse
super().parse(vstring)
File "C:\Users\xxx\anaconda3\envs\python37\lib\distutils\version.py", line 137, in parse
raise ValueError("invalid version number '%s'" % vstring)
ValueError: invalid version number '4.0.0.20181030'
我在 x64 PC 上运行 Windows 10。具体来说,我通过 Anaconda 使用 Python 3.7.10 环境。Python 中的包版本信息包括(通过 pip freeze
):
- pytesseract v0.3.7
- ocrmypdf 12.1.0
- ghostscript v0.7
python 之外的其他潜在重要版本信息包括:
- tesseract-ocr v4.0.0.20181030(我为此添加并尝试了许多环境变量,详情如下)
- leptonica v1.76.0
- ghostscript v9.54.0
- qpdf 10.3.2(这个是下载的然后文件放在
C:/Windows/System32
目录下)
Tesseract 安装在这里:C:\Program Files (x86)\Tesseract-OCR\
,所以我尝试了以下环境变量(作为用户变量):
OCRMYPDF_TESSERACT = C:\Program Files (x86)\Tesseract-OCR\tesseract.exe
- 在
Path
的末尾添加了 C:\Program Files (x86)\Tesseract-OCR
TESSDATA_PREFIX = C:\Program Files (x86)\Tesseract-OCR\tessdata
不胜感激!
根据我在此处打开的问题在此处更新了存储库:https://github.com/jbarlow83/OCRmyPDF/issues/795
。
安装命令:pip3 install git+https://github.com/jbarlow83/OCRmyPDF.git#egg=ocrmypdf
.
我仍然得到 [WinError 2] The system cannot find the file specified
,但它有效,所以我现在不打算质疑它。
所以我整理了一个简单的 Python 脚本来对 PDF 进行 OCR:
from PIL import Image
from tika import parser
import argparse
import img2pdf
import ocrmypdf
def main():
    """Convert an image to a PDF, OCR that PDF, and print the extracted text.

    The image path is read from the command line. Each step is given the
    file produced by the previous step instead of relying on the helpers'
    default paths, which do not line up with each other.
    """
    # Named ``arg_parser`` so the module-level tika ``parser`` import is not
    # shadowed inside this function.
    arg_parser = argparse.ArgumentParser(description="Get text from image.")
    arg_parser.add_argument("image_path", metavar="i", help="The path to the image being used.")
    args = arg_parser.parse_args()
    image_path = args.image_path

    # convert_to_pdf returns the base name without extension and writes
    # './<name>.pdf', so rebuild that path for the OCR step.
    pdf_from_image_file_name = convert_to_pdf(image_path)
    pdf_w_ocr_file_name = ocr_pdf('./' + pdf_from_image_file_name + '.pdf')

    # ocr_pdf returns the output file name and writes it under './'.
    raw_text_from_ocr_pdf = get_text_from_pdf('./' + pdf_w_ocr_file_name)
    print(raw_text_from_ocr_pdf)
def convert_to_pdf(image_path, new_pdf_file_name="pdf_from_image"):
    """Write the image at *image_path* to './<new_pdf_file_name>.pdf'.

    Args:
        image_path: Path of the source image file.
        new_pdf_file_name: Base name (without the '.pdf' extension) of the
            PDF created in the current working directory.

    Returns:
        ``new_pdf_file_name`` as passed in, i.e. still without the extension.
    """
    # The image is opened through PIL and only its ``filename`` is handed to
    # img2pdf; ``with`` guarantees the handle is released even if the
    # conversion raises.
    with Image.open(image_path) as temp_image:
        pdf_bytes = img2pdf.convert(temp_image.filename)

    # ``with`` closes the output file even when the write fails.
    with open('./' + new_pdf_file_name + '.pdf', 'wb') as new_file:
        new_file.write(pdf_bytes)
    return new_pdf_file_name
def ocr_pdf(pdf_file_path="./temp_pdf_file_name.pdf", new_pdf_file_name="pdf_w_ocr.pdf"):
    """Run ``ocrmypdf.ocr`` on *pdf_file_path* with deskewing enabled.

    Args:
        pdf_file_path: Path of the input PDF.
        new_pdf_file_name: File name of the output PDF; it is written
            under './'.

    Returns:
        ``new_pdf_file_name`` as passed in (without the './' prefix).
    """
    output_path = './' + new_pdf_file_name
    ocrmypdf.ocr(pdf_file_path, output_path, deskew=True)
    return new_pdf_file_name
def get_text_from_pdf(pdf_file_path="./pdf_w_ocr.pdf"):
    """Return the 'content' entry of tika's parse result for *pdf_file_path*.

    Args:
        pdf_file_path: Path of the PDF to parse.
    """
    # ``parser`` here is the module-level ``from tika import parser``.
    return parser.from_file(pdf_file_path)['content']
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
当脚本执行到 import ocrmypdf
时,它会触发 [WinError 2] The system cannot find the file specified
错误,但脚本会继续往下执行。从 JPG 或 PNG 到 PDF 的转换工作正常,输出也很好。但是,当执行到 ocrmypdf.ocr(pdf_file_path, './'+new_pdf_file_name, deskew=True)
时,我得到 ValueError: invalid version number '4.0.0.20181030'
.
完整堆栈是:
[WinError 2] The system cannot find the file specified
Traceback (most recent call last):
File "workshop_v1.py", line 71, in <module>
main()
File "workshop_v1.py", line 49, in main
pdf_w_ocr_file_name = ocr_pdf()
File "workshop_v1.py", line 63, in ocr_pdf
ocrmypdf.ocr(pdf_file_path, './'+new_pdf_file_name, deskew=True)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\api.py", line 339, in ocr
check_options(options, plugin_manager)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\_validation.py", line 271, in check_options
_check_options(options, plugin_manager, ocr_engine_languages)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\_validation.py", line 266, in _check_options
plugin_manager.hook.check_options(options=options)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\hooks.py", line 286, in __call__
return self._hookexec(self, self.get_hookimpls(), kwargs)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\manager.py", line 93, in _hookexec
return self._inner_hookexec(hook, methods, kwargs)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\manager.py", line 87, in <lambda>
firstresult=hook.spec.opts.get("firstresult") if hook.spec else False,
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\callers.py", line 208, in _multicall
return outcome.get_result()
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\callers.py", line 80, in get_result
raise ex[1].with_traceback(ex[2])
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\pluggy\callers.py", line 187, in _multicall
res = hook_impl.function(*args)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\builtin_plugins\tesseract_ocr.py", line 84, in check_options
version_parser=tesseract.TesseractVersion,
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\subprocess\__init__.py", line 313, in check_external_program
if found_version and version_parser(found_version) < version_parser(need_version):
File "C:\Users\xxx\anaconda3\envs\python37\lib\distutils\version.py", line 40, in __init__
self.parse(vstring)
File "C:\Users\xxx\anaconda3\envs\python37\lib\site-packages\ocrmypdf\_exec\tesseract.py", line 72, in parse
super().parse(vstring)
File "C:\Users\xxx\anaconda3\envs\python37\lib\distutils\version.py", line 137, in parse
raise ValueError("invalid version number '%s'" % vstring)
ValueError: invalid version number '4.0.0.20181030'
我在 x64 PC 上运行 Windows 10。具体来说,我通过 Anaconda 使用 Python 3.7.10 环境。Python 中的包版本信息包括(通过 pip freeze
):
- pytesseract v0.3.7
- ocrmypdf 12.1.0
- ghostscript v0.7
python 之外的其他潜在重要版本信息包括:
- tesseract-ocr v4.0.0.20181030(我为此添加并尝试了许多环境变量,详情如下)
- leptonica v1.76.0
- ghostscript v9.54.0
- qpdf 10.3.2(这个是下载的然后文件放在
C:/Windows/System32
目录下)
Tesseract 安装在这里:C:\Program Files (x86)\Tesseract-OCR\
,所以我尝试了以下环境变量(作为用户变量):
OCRMYPDF_TESSERACT = C:\Program Files (x86)\Tesseract-OCR\tesseract.exe
- 在
Path
的末尾添加了 C:\Program Files (x86)\Tesseract-OCR
TESSDATA_PREFIX = C:\Program Files (x86)\Tesseract-OCR\tessdata
不胜感激!
根据我在此处打开的问题在此处更新了存储库:https://github.com/jbarlow83/OCRmyPDF/issues/795
。
安装命令:pip3 install git+https://github.com/jbarlow83/OCRmyPDF.git#egg=ocrmypdf
.
我仍然得到 [WinError 2] The system cannot find the file specified
,但它有效,所以我现在不打算质疑它。