从 PDF 转换为文本:行和单词被拆断
convert from pdf to text: lines and words are broken
我想通过 PyPDF2 将 pdf 文件转换为文本,但转换后的文本看起来与 PDF 文件不同。具体来说,PDF 中的一行在文本中被分成多行,单词也可能被打断。附件是我用下面的代码得到的 PDF 和文本文件。谁能帮我解决这个问题?
from PyPDF2 import PdfFileReader
def extract_pdf_text(file_path=""):
    """Return the text of every page of the PDF at *file_path* as one string.

    The file is opened with ``PdfFileReader`` and the result of each page's
    ``extractText()`` is appended in page order, with nothing inserted
    between pages.
    """
    pdf = PdfFileReader(file_path)
    # Gather the per-page text and join it once at the end.
    return "".join(page.extractText() for page in pdf.pages)
# "PDF file path" is a placeholder: replace it with the path of the PDF to read.
pdf_text = extract_pdf_text("PDF file path")
pdf file
converted text
我是用 pdfminer(而不是 PyPDF2)这样做的:
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Extract the text of a PDF file with pdfminer and return it as a string.

    Args:
        fname: Path of the PDF file; it is opened in binary mode.
        pages: Optional iterable of page numbers to restrict extraction to.
            When it is ``None`` or empty, an empty set is handed to
            ``PDFPage.get_pages`` (presumably meaning "all pages" -- confirm
            against the pdfminer documentation).

    Returns:
        Everything the ``TextConverter`` wrote for the processed pages.
    """
    pagenums = set(pages) if pages else set()

    # Only ``StringIO`` is imported by name in this file, so the previous
    # ``io.StringIO()`` spelling raised NameError.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # ``with`` closes the PDF even if a page fails to process.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the ``with`` block closes it.
        return output.getvalue()
#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
    """Convert every ``*.pdf`` file in *pdfDir* to a text file in *txtDir*.

    Args:
        pdfDir: Directory that is listed for PDF files. An empty string
            selects the current working directory.
        txtDir: Directory that receives the output. Each output file is named
            after the PDF with ``.txt`` appended (``a.pdf`` -> ``a.pdf.txt``).

    Returns:
        None. The text files are written as a side effect.
    """
    if pdfDir == "":
        pdfDir = os.getcwd()  # if no pdfDir passed in

    for pdf in os.listdir(pdfDir):  # iterate through pdfs in pdf directory
        fileExtension = pdf.split(".")[-1]
        if fileExtension != "pdf":
            continue

        # os.path.join works whether or not the directory arguments end with
        # a path separator; plain concatenation required one.
        pdfFilename = os.path.join(pdfDir, pdf)
        text = convert(pdfFilename)  # get string of text content of pdf

        textFilename = os.path.join(txtDir, pdf + ".txt")
        # Write as UTF-8 so characters outside the locale code page do not
        # raise UnicodeEncodeError; ``with`` flushes and closes the file.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text)
# Set paths accordingly. Both values are placeholders to replace before
# running: pdfDir is the directory searched for PDF files and txtDir is the
# directory that receives the generated .txt files. Keep the trailing
# separator on each path.
pdfDir = "C://your_path_here/"
txtDir = "C://your_path_here/"
convertMultiple(pdfDir, txtDir)
我想通过 PyPDF2 将 pdf 文件转换为文本,但转换后的文本看起来与 PDF 文件不同。具体来说,PDF 中的一行在文本中被分成多行,单词也可能被打断。附件是我用下面的代码得到的 PDF 和文本文件。谁能帮我解决这个问题?
from PyPDF2 import PdfFileReader
def extract_pdf_text(file_path=""):
    """Return the text of every page of the PDF at *file_path* as one string.

    The file is opened with ``PdfFileReader`` and the result of each page's
    ``extractText()`` is appended in page order, with nothing inserted
    between pages.
    """
    pdf = PdfFileReader(file_path)
    # Gather the per-page text and join it once at the end.
    return "".join(page.extractText() for page in pdf.pages)
# "PDF file path" is a placeholder: replace it with the path of the PDF to read.
pdf_text = extract_pdf_text("PDF file path")
pdf file
converted text
我是用 pdfminer(而不是 PyPDF2)这样做的:
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Extract the text of a PDF file with pdfminer and return it as a string.

    Args:
        fname: Path of the PDF file; it is opened in binary mode.
        pages: Optional iterable of page numbers to restrict extraction to.
            When it is ``None`` or empty, an empty set is handed to
            ``PDFPage.get_pages`` (presumably meaning "all pages" -- confirm
            against the pdfminer documentation).

    Returns:
        Everything the ``TextConverter`` wrote for the processed pages.
    """
    pagenums = set(pages) if pages else set()

    # Only ``StringIO`` is imported by name in this file, so the previous
    # ``io.StringIO()`` spelling raised NameError.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # ``with`` closes the PDF even if a page fails to process.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the ``with`` block closes it.
        return output.getvalue()
#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
    """Convert every ``*.pdf`` file in *pdfDir* to a text file in *txtDir*.

    Args:
        pdfDir: Directory that is listed for PDF files. An empty string
            selects the current working directory.
        txtDir: Directory that receives the output. Each output file is named
            after the PDF with ``.txt`` appended (``a.pdf`` -> ``a.pdf.txt``).

    Returns:
        None. The text files are written as a side effect.
    """
    if pdfDir == "":
        pdfDir = os.getcwd()  # if no pdfDir passed in

    for pdf in os.listdir(pdfDir):  # iterate through pdfs in pdf directory
        fileExtension = pdf.split(".")[-1]
        if fileExtension != "pdf":
            continue

        # os.path.join works whether or not the directory arguments end with
        # a path separator; plain concatenation required one.
        pdfFilename = os.path.join(pdfDir, pdf)
        text = convert(pdfFilename)  # get string of text content of pdf

        textFilename = os.path.join(txtDir, pdf + ".txt")
        # Write as UTF-8 so characters outside the locale code page do not
        # raise UnicodeEncodeError; ``with`` flushes and closes the file.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text)
# Set paths accordingly. Both values are placeholders to replace before
# running: pdfDir is the directory searched for PDF files and txtDir is the
# directory that receives the generated .txt files. Keep the trailing
# separator on each path.
pdfDir = "C://your_path_here/"
txtDir = "C://your_path_here/"
convertMultiple(pdfDir, txtDir)