PDFMiner version diffs? Getting AttributeError: 'PDFDocument' object has no attribute 'seek'
PDFMiner version diffs? Getting AttributeError: 'PDFDocument' object has no attribute 'seek'
我从之前的 SO 问题中提取了一些 Python 代码,但该代码是为以前版本的 PDFMiner 编写的(从那以后似乎对 PDFMiner 进行了一些重大更改)。我已经做了一些更改来解决错误,但现在我收到以下错误:
C:\Users\xxxx\Documents\Programming\Python>pdfextractor.py
Traceback (most recent call last):
File "C:\Users\xxxx\Documents\Programming\Python\pdfextractor.py", line 71, in <module>
pdf_to_csv(sourcefile)
File "C:\Users\xxxx\Documents\Programming\Python\pdfextractor.py", line 55, in pdf_to_csv
for i, page in PDFPage.get_pages(doc):
File "C:\Program Files\Python27\lib\site-packages\pdfminer\pdfpage.py", line 119, in get_pages
parser = PDFParser(fp)
File "C:\Program Files\Python27\lib\site-packages\pdfminer\pdfparser.py", line 43, in __init__
PSStackParser.__init__(self, fp)
File "C:\Program Files\Python27\lib\site-packages\pdfminer\psparser.py", line 495, in __init__
PSBaseParser.__init__(self, fp)
File "C:\Program Files\Python27\lib\site-packages\pdfminer\psparser.py", line 166, in __init__
self.seek(0)
File "C:\Program Files\Python27\lib\site-packages\pdfminer\psparser.py", line 507, in seek
PSBaseParser.seek(self, pos)
File "C:\Program Files\Python27\lib\site-packages\pdfminer\psparser.py", line 196, in seek
self.fp.seek(pos)
AttributeError: 'PDFDocument' object has no attribute 'seek'
这是我的代码 运行:
# ORIGINAL CODE DOES NOT SEEM COMPATIBLE WITH THE CURRENT VERSION OF PDFMINER!
# Code taken from:
#
def pdf_to_csv(filename):
from cStringIO import StringIO
from pdfminer.converter import LTChar, TextConverter
from pdfminer.layout import LAParams
# from pdfminer.pdfparser import PDFDocument, PDFParser # Not compatible with current version of PDFMiner
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
class CsvConverter(TextConverter):
def __init__(self, *args, **kwargs):
TextConverter.__init__(self, *args, **kwargs)
def end_page(self, i):
from collections import defaultdict
lines = defaultdict(lambda : {})
for child in self.cur_item._objs: #<-- changed
if isinstance(child, LTChar):
(_,_,x,y) = child.bbox
line = lines[int(-y)]
line[x] = child._text.encode(self.codec) #<-- changed
for y in sorted(lines.keys()):
line = lines[y]
self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
self.outfp.write("\n")
# ... the following part of the code is a remix of the
# convert() function in the pdfminer/tools/pdf2text module
rsrc = PDFResourceManager()
outfp = StringIO()
device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
# because my test documents are utf-8 (note: utf-8 is the default codec)
# doc = PDFDocument() # Raises error with current version of PDFMiner
# --> TypeError: __init__() takes at least 2 arguments (1 given)
fp = open(filename, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser,'') # Inserted ahead of 'parser.set_document(doc)' to avoid error
# --> UnboundLocalError: local variable 'doc' referenced before assignment
parser.set_document(doc)
# doc.set_parser(parser) # Not compatible with current version of PDFMiner
# doc.initialize('') # Not compatible with current version of PDFMiner
interpreter = PDFPageInterpreter(rsrc, device)
# for i, page in enumerate(doc.get_pages()): # Not compatible with current version of PDFMiner
for i, page in PDFPage.get_pages(doc):
outfp.write("START PAGE %d\n" % i)
if page is not None:
interpreter.process_page(page)
outfp.write("END PAGE %d\n" % i)
# data = retstr.getvalue()
device.close()
fp.close()
return outfp.getvalue()
sourcefile = 'testfile1.pdf'
# sourcefile = 'testfile2.pdf'
# sourcefile = 'testfile3.pdf'
pdf_to_csv(sourcefile)
print 'Done.'
任何人都可以看到发生了什么事吗?我是否需要更改调用解析器的方式(参数、序列等)?
我是 运行 Python 2.7.12 & PDFMiner '20140328' Windows 10.
尝试替换行
for i, page in PDFPage.get_pages(doc):
和
for i, page in enumerate(PDFPage.create_pages(doc)):
this page of the PDFMiner documentation 的 'Basic Usage' 部分中的代码示例建议使用 create_pages
遍历文档中的页面。当您跟踪索引时变量 i
中的页面,我已将对 create_pages
的调用包装在 enumerate
.
中
我从之前的 SO 问题中提取了一些 Python 代码,但该代码是为以前版本的 PDFMiner 编写的(从那以后似乎对 PDFMiner 进行了一些重大更改)。我已经做了一些更改来解决错误,但现在我收到以下错误:
C:\Users\xxxx\Documents\Programming\Python>pdfextractor.py
Traceback (most recent call last):
File "C:\Users\xxxx\Documents\Programming\Python\pdfextractor.py", line 71, in <module>
pdf_to_csv(sourcefile)
File "C:\Users\xxxx\Documents\Programming\Python\pdfextractor.py", line 55, in pdf_to_csv
for i, page in PDFPage.get_pages(doc):
File "C:\Program Files\Python27\lib\site-packages\pdfminer\pdfpage.py", line 119, in get_pages
parser = PDFParser(fp)
File "C:\Program Files\Python27\lib\site-packages\pdfminer\pdfparser.py", line 43, in __init__
PSStackParser.__init__(self, fp)
File "C:\Program Files\Python27\lib\site-packages\pdfminer\psparser.py", line 495, in __init__
PSBaseParser.__init__(self, fp)
File "C:\Program Files\Python27\lib\site-packages\pdfminer\psparser.py", line 166, in __init__
self.seek(0)
File "C:\Program Files\Python27\lib\site-packages\pdfminer\psparser.py", line 507, in seek
PSBaseParser.seek(self, pos)
File "C:\Program Files\Python27\lib\site-packages\pdfminer\psparser.py", line 196, in seek
self.fp.seek(pos)
AttributeError: 'PDFDocument' object has no attribute 'seek'
这是我的代码 运行:
# ORIGINAL CODE DOES NOT SEEM COMPATIBLE WITH THE CURRENT VERSION OF PDFMINER!
# Code taken from:
#
def pdf_to_csv(filename):
from cStringIO import StringIO
from pdfminer.converter import LTChar, TextConverter
from pdfminer.layout import LAParams
# from pdfminer.pdfparser import PDFDocument, PDFParser # Not compatible with current version of PDFMiner
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
class CsvConverter(TextConverter):
def __init__(self, *args, **kwargs):
TextConverter.__init__(self, *args, **kwargs)
def end_page(self, i):
from collections import defaultdict
lines = defaultdict(lambda : {})
for child in self.cur_item._objs: #<-- changed
if isinstance(child, LTChar):
(_,_,x,y) = child.bbox
line = lines[int(-y)]
line[x] = child._text.encode(self.codec) #<-- changed
for y in sorted(lines.keys()):
line = lines[y]
self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
self.outfp.write("\n")
# ... the following part of the code is a remix of the
# convert() function in the pdfminer/tools/pdf2text module
rsrc = PDFResourceManager()
outfp = StringIO()
device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
# because my test documents are utf-8 (note: utf-8 is the default codec)
# doc = PDFDocument() # Raises error with current version of PDFMiner
# --> TypeError: __init__() takes at least 2 arguments (1 given)
fp = open(filename, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser,'') # Inserted ahead of 'parser.set_document(doc)' to avoid error
# --> UnboundLocalError: local variable 'doc' referenced before assignment
parser.set_document(doc)
# doc.set_parser(parser) # Not compatible with current version of PDFMiner
# doc.initialize('') # Not compatible with current version of PDFMiner
interpreter = PDFPageInterpreter(rsrc, device)
# for i, page in enumerate(doc.get_pages()): # Not compatible with current version of PDFMiner
for i, page in PDFPage.get_pages(doc):
outfp.write("START PAGE %d\n" % i)
if page is not None:
interpreter.process_page(page)
outfp.write("END PAGE %d\n" % i)
# data = retstr.getvalue()
device.close()
fp.close()
return outfp.getvalue()
sourcefile = 'testfile1.pdf'
# sourcefile = 'testfile2.pdf'
# sourcefile = 'testfile3.pdf'
pdf_to_csv(sourcefile)
print 'Done.'
任何人都可以看到发生了什么事吗?我是否需要更改调用解析器的方式(参数、序列等)?
我是 运行 Python 2.7.12 & PDFMiner '20140328' Windows 10.
尝试替换行
for i, page in PDFPage.get_pages(doc):
和
for i, page in enumerate(PDFPage.create_pages(doc)):
this page of the PDFMiner documentation 的 'Basic Usage' 部分中的代码示例建议使用 create_pages
遍历文档中的页面。当您跟踪索引时变量 i
中的页面,我已将对 create_pages
的调用包装在 enumerate
.