为什么我不能使用 pdfminer 解析此 pdf?
Why can't I parse this PDF using pdfminer?
我编写的代码成功解析了数千种不同类型的 pdf。
然而,在解析这个 pdf 时,我遇到了一个错误。下面是一个非常简单的测试代码示例,可以重现该错误。我的原始代码太长,无法在这里分享。
file = open('C:/Users/username/file.pdf', 'rb')
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
pages = PDFPage.get_pages(file)
for page in pages:
interpreter.process_page(page)
layout = device.get_result()
https://filetransfer.io/data-package/dWnZbcWl#link
这是完整的错误信息
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_15652/28568702.py in <module>
7 for page in pages:
----> 8 interpreter.process_page(page)
9 layout = device.get_result()
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in process_page(self, page)
839 ctm = (1, 0, 0, 1, -x0, -y0)
840 self.device.begin_page(page, ctm)
--> 841 self.render_contents(page.resources, page.contents, ctm=ctm)
842 self.device.end_page(page)
843 return
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in render_contents(self, resources, streams, ctm)
852 self.init_resources(resources)
853 self.init_state(ctm)
--> 854 self.execute(list_value(streams))
855 return
856
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in execute(self, streams)
857 def execute(self, streams):
858 try:
--> 859 parser = PDFContentParser(streams)
860 except PSEOF:
861 # empty page
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in __init__(self, streams)
219 self.streams = streams
220 self.istream = 0
--> 221 PSStackParser.__init__(self, None)
222 return
223
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\psparser.py in __init__(self, fp)
513
514 def __init__(self, fp):
--> 515 PSBaseParser.__init__(self, fp)
516 self.reset()
517 return
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\psparser.py in __init__(self, fp)
167 def __init__(self, fp):
168 self.fp = fp
--> 169 self.seek(0)
170 return
171
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in seek(self, pos)
233
234 def seek(self, pos):
--> 235 self.fillfp()
236 PSStackParser.seek(self, pos)
237 return
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in fillfp(self)
229 else:
230 raise PSEOF('Unexpected EOF, file truncated?')
--> 231 self.fp = BytesIO(strm.get_data())
232 return
233
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdftypes.py in get_data(self)
290 def get_data(self):
291 if self.data is None:
--> 292 self.decode()
293 return self.data
294
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdftypes.py in decode(self)
271 raise PDFNotImplementedError('Unsupported filter: %r' % f)
272 # apply predictors
--> 273 if 'Predictor' in params:
274 pred = int_value(params['Predictor'])
275 if pred == 1:
TypeError: argument of type 'PDFObjRef' is not iterable
有人可以尝试将这个 pdf 加载到内存中吗?如果成功了,请告诉我您是如何做到的。
使用的包版本
conda 4.11.0 py39hcbf5309_0 conda-forge
ipython 7.28.0 py39h832f523_0 conda-forge
notebook 6.4.4 pyha770c72_0 conda-forge
pdfminer 20191125 pyhd8ed1ab_1 conda-forge
pillow 8.3.2 py39h916092e_0 conda-forge
pyparsing 2.4.7 pyh9f0ad1d_0 conda-forge
pytesseract 0.3.8 pyhd8ed1ab_0 conda-forge
python 3.9.7 h7840368_3_cpython conda-forge
wcwidth 0.2.5 pyh9f0ad1d_2 conda-forge
wheel 0.37.0 pyhd8ed1ab_1 conda-forge
我检查了元数据,没有发现问题。我也检查了加密,但这同样不是问题所在。pdf 包含多页这一点也不是原因。
当我把文件 pdftypes.py(第 273 行)中的
if 'Predictor' in params:
改为:
if isinstance(params, dict) and 'Predictor' in params:
之后,我不再收到错误。
参见:https://github.com/pdfminer/pdfminer.six/pull/471
PR 471 中的修复未包含在 20191125 版本中。
我编写的代码成功解析了数千种不同类型的 pdf。
然而,在解析这个 pdf 时,我遇到了一个错误。下面是一个非常简单的测试代码示例,可以重现该错误。我的原始代码太长,无法在这里分享。
file = open('C:/Users/username/file.pdf', 'rb')
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
pages = PDFPage.get_pages(file)
for page in pages:
interpreter.process_page(page)
layout = device.get_result()
https://filetransfer.io/data-package/dWnZbcWl#link
这是完整的错误信息
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_15652/28568702.py in <module>
7 for page in pages:
----> 8 interpreter.process_page(page)
9 layout = device.get_result()
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in process_page(self, page)
839 ctm = (1, 0, 0, 1, -x0, -y0)
840 self.device.begin_page(page, ctm)
--> 841 self.render_contents(page.resources, page.contents, ctm=ctm)
842 self.device.end_page(page)
843 return
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in render_contents(self, resources, streams, ctm)
852 self.init_resources(resources)
853 self.init_state(ctm)
--> 854 self.execute(list_value(streams))
855 return
856
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in execute(self, streams)
857 def execute(self, streams):
858 try:
--> 859 parser = PDFContentParser(streams)
860 except PSEOF:
861 # empty page
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in __init__(self, streams)
219 self.streams = streams
220 self.istream = 0
--> 221 PSStackParser.__init__(self, None)
222 return
223
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\psparser.py in __init__(self, fp)
513
514 def __init__(self, fp):
--> 515 PSBaseParser.__init__(self, fp)
516 self.reset()
517 return
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\psparser.py in __init__(self, fp)
167 def __init__(self, fp):
168 self.fp = fp
--> 169 self.seek(0)
170 return
171
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in seek(self, pos)
233
234 def seek(self, pos):
--> 235 self.fillfp()
236 PSStackParser.seek(self, pos)
237 return
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdfinterp.py in fillfp(self)
229 else:
230 raise PSEOF('Unexpected EOF, file truncated?')
--> 231 self.fp = BytesIO(strm.get_data())
232 return
233
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdftypes.py in get_data(self)
290 def get_data(self):
291 if self.data is None:
--> 292 self.decode()
293 return self.data
294
C:\ProgramData\miniforge3\lib\site-packages\pdfminer\pdftypes.py in decode(self)
271 raise PDFNotImplementedError('Unsupported filter: %r' % f)
272 # apply predictors
--> 273 if 'Predictor' in params:
274 pred = int_value(params['Predictor'])
275 if pred == 1:
TypeError: argument of type 'PDFObjRef' is not iterable
有人可以尝试将这个 pdf 加载到内存中吗?如果成功了,请告诉我您是如何做到的。
使用的包版本
conda 4.11.0 py39hcbf5309_0 conda-forge
ipython 7.28.0 py39h832f523_0 conda-forge
notebook 6.4.4 pyha770c72_0 conda-forge
pdfminer 20191125 pyhd8ed1ab_1 conda-forge
pillow 8.3.2 py39h916092e_0 conda-forge
pyparsing 2.4.7 pyh9f0ad1d_0 conda-forge
pytesseract 0.3.8 pyhd8ed1ab_0 conda-forge
python 3.9.7 h7840368_3_cpython conda-forge
wcwidth 0.2.5 pyh9f0ad1d_2 conda-forge
wheel 0.37.0 pyhd8ed1ab_1 conda-forge
我检查了元数据,没有发现问题。我也检查了加密,但这同样不是问题所在。pdf 包含多页这一点也不是原因。
当我把文件 pdftypes.py(第 273 行)中的
if 'Predictor' in params:
改为:
if isinstance(params, dict) and 'Predictor' in params:
之后,我不再收到错误。
参见:https://github.com/pdfminer/pdfminer.six/pull/471
PR 471 中的修复未包含在 20191125 版本中。