比较关键字与 PDF 文件

Comparing keywords with PDF files

这是一个按文件夹名称读取其中的 PDF 文件并提取文本数据的程序。现在我想把提取出的数据与下面程序中使用的关键字进行比较,但运行时出现了如下错误:

pdfReader = pdfFileObj.loadPage(0)
AttributeError: '_io.BufferedReader' object has no attribute 'loadPage'

我想排除错误,将关键词与提取的数据进行比较。我为此程序使用了 PyMuPDF 库。

import fitz
import os

# Scan the 'resume/' folder for PDF files, print the text of each page, and
# try to count keyword hits per comma-separated chunk of text.
# NOTE(review): as posted, this script stops at the loadPage() call below
# with an AttributeError; the remaining defects are flagged inline.
pdfFiles = []  # never filled: the only append below is commented out
for filename in os.listdir('resume/'):
    if filename.endswith('.pdf'):
        print(filename)
        # pdfFiles.append(filename)
        # Runs for every matching file; the listing above was taken relative
        # to the directory the script was started from.
        os.chdir('C:/Users/M. Abrar Hussain/Desktop/cv/resume')
        print('Current working dir : %s' % os.getcwd())
        pdfFileObj = open(filename, 'rb')
        # pdfFileObj is the plain binary file handle returned by open(); it
        # has no loadPage() method, so this line raises AttributeError.
        pdfReader = pdfFileObj.loadPage(0)
        with fitz.open(pdfFileObj) as doc:
            text = ""
            for page in doc:
                # text accumulates across pages, so each print shows all the
                # text gathered so far, not just the current page.
                text += page.getText()
                print(text)
                # split the docs
                # Always asks for index 0, regardless of which page the
                # enclosing loop is currently on.
                pageObj = pdfReader.getpage(0)
                t1 = (pageObj.getText())
                t1 = t1.split(",")
                search_keywords = ['python', 'Laravel', 'Java']
                for sentence in t1:
                    lst = []
                    for word in search_keywords:
                        # Always true: word is drawn from search_keywords, and
                        # sentence is never consulted.
                        if word in search_keywords:
                            # `list` is the builtin type, not the local `lst`.
                            list.append(word)
                        # Inside the keyword loop, so it runs once per keyword.
                        print('{0} key word(s) in sentence: {1}'.format(len(lst), ', '.join(lst)))
        # Only reached when nothing above raised; there is no try/finally.
        pdfFileObj.close()

您漏掉了两行:`import PyPDF2` 和 `pdfReader = PyPDF2.PdfFileReader(pdfFileObj)`。

注意:getPage(0) 返回的是页码为 0 的页面对象,因此在您的 for 循环中读取的始终是同一页。如果希望每次迭代都读取新的一页,应先检查文档共有多少页,并创建一个从 0 到 pdfReader.numPages(不含)的索引参数 i,然后调用 getPage(i)。

import fitz
import os
import PyPDF2

# Scan every PDF in the resume folder and report, for each comma-separated
# chunk of each page, which of the search keywords it mentions.
#
# Uses the legacy PyPDF2 reader API (PdfFileReader / numPages / getPage /
# extractText) that this script already relies on.
search_keywords = ['python', 'Laravel', 'Java']

# Files are opened via a path joined onto the listed folder, so the process
# working directory never has to change.
# NOTE(review): assumes the script is started from the folder that contains
# 'resume/' -- confirm this matches the intended location.
resume_dir = 'resume/'

pdfFiles = []
for filename in os.listdir(resume_dir):
    if not filename.endswith('.pdf'):
        continue
    print(filename)
    pdfFiles.append(filename)

    # The context manager closes the file even if parsing a page fails.
    with open(os.path.join(resume_dir, filename), 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # Visit every page by index instead of re-reading page 0 each time.
        for page_number in range(pdfReader.numPages):
            page_text = pdfReader.getPage(page_number).extractText()
            print(page_text)

            # split the docs
            for sentence in page_text.split(","):
                # Compare case-insensitively so 'python' also finds 'Python'.
                lowered = sentence.lower()
                lst = [word for word in search_keywords
                       if word.lower() in lowered]
                # One summary line per sentence, after all keywords are checked.
                print('{0} key word(s) in sentence: {1}'.format(len(lst), ', '.join(lst)))

working-with-pdf-files-in-python