如何使用 Python 从 PDF 读取并另存为 CSV?

How to read from PDF and save as CSV, using Python?

有这个 URL:https://www.jpx.co.jp/english/listing/stocks/new/index.html#3422

我写了(从互联网上复制和粘贴!)以下代码来保存 table 中的所有 pdf 到一个文件夹中

from PyPDF2 import PdfFileReader
import requests
from bs4 import BeautifulSoup
import io
import urllib.request as req
import urllib
import os
import time
from urllib.parse import urljoin

# Fetch the JPX new-listings page and collect every PDF link found on it.
url = 'https://www.jpx.co.jp/english/listing/stocks/new/index.html'
headers = {'User-Agent':'Mozilla/5.0'}

# BUG FIX: the original built `headers` but never sent it — urlopen(url)
# ignores the dict. Wrap the URL in a Request so the User-Agent is used.
request = req.Request(url, headers=headers)
res = req.urlopen(request)
soup = BeautifulSoup(res, "html.parser")
# Only anchors that actually carry an href attribute.
result = soup.select("a[href]")

# All hrefs on the page, then just the ones pointing at a PDF.
link_list = [link.get("href") for link in result]
pdf_list = [temp for temp in link_list if temp.endswith('pdf')]
print(pdf_list)

# Resolve every relative PDF link against the page URL.
abs_pdf_list = [urljoin(url, relative) for relative in pdf_list]

# The local file name is simply the last path component of each URL.
filename_list = [abs_url.split("/")[-1] for abs_url in abs_pdf_list]

# Make sure the download directory exists before saving anything into it.
newpath = r'/Users/myfolder/python/IPO'
if not os.path.exists(newpath):
    os.makedirs(newpath)

# Full destination path for every file name.
target_dir = "/Users/myfolder/python/IPO/"
savepath_list = [os.path.join(target_dir, filename) for filename in filename_list]
savepath_list

# Download each PDF to its destination, pausing between requests
# so we do not hammer the server.
for pdflink, savepath in zip(abs_pdf_list, savepath_list):
    print(pdflink)
    urllib.request.urlretrieve(pdflink, savepath)
    time.sleep(2)

import pdfplumber
import re

def download_file(url):
    """Download *url* into the current directory and return the local file name.

    The file name is taken from the last path component of the URL.
    Raises requests.HTTPError if the server responds with an error status.
    """
    local_filename = url.split('/')[-1]

    with requests.get(url) as r:
        # BUG FIX: the original silently wrote HTTP error pages (404/500
        # HTML) into the .pdf file; fail loudly on a bad status instead.
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            f.write(r.content)

    return local_filename

# Fetch the first PDF and pull the text of its first page with pdfplumber.
ap_url = abs_pdf_list[0]

ap = download_file(ap_url)

with pdfplumber.open(ap) as pdf:
    first_page = pdf.pages[0]
    second_page = pdf.pages[1]
    text = first_page.extract_text()

print(text)

现在我需要阅读这些 pdf 并提取以下行,

来自第 1 页 以“Information & Communication”开头的行

来自第 2 页:以“建书期”“发行价”开头的行

并将它们保存在一个 Excel 或 CSV 文件中。可悲的是,我达到了我的编码技能极限,不能再进一步了。我将 pdf 转换为文本,但是……

请教我怎么做

我建议安装我们的新包 pdftextract,它尽可能保留 pdf 布局以提取文本,然后使用一些正则表达式来提取关键字。

这是在您 link 的 2 个 pdf 文件上测试的有效代码片段:

import re
import csv
from pdftextract import XPdf

# PDF files to process (the two files downloaded from the JPX page).
pdf_files = ['a.pdf', "b.pdf"]
# Line prefixes to search for; also reused as the CSV header row.
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_infos(file: str, keywords: list):
    """Extract the text from the PDF *file*, then return the text that
    follows each keyword on its line.

    Returns a list parallel to *keywords*; an entry is None when its
    keyword was not found in the document.
    """
    # Extract the text while keeping the original layout, so each
    # "<keyword> <value>" pair stays on a single line.
    pdf = XPdf(file)
    txt = pdf.to_text(keep_layout=True)

    row = []
    for keyword in keywords:
        # BUG FIXES vs. the original:
        #  * re.escape() — a keyword containing regex metacharacters
        #    (e.g. parentheses) would otherwise corrupt the pattern;
        #  * anchor on end-of-line ($ with re.M) instead of a literal
        #    "\r", which never matches when the text uses "\n" endings.
        pattern = r"{} (.+)$".format(re.escape(keyword))
        match = re.search(pattern, txt, flags=re.I | re.M)
        value = None
        if match is not None:
            # Drop surrounding spaces, slashes and any stray CR.
            value = match.group(1).strip(' /\r')
        row.append(value)
    return row

def main(files: list, fname: str, headers: list):
    """Extract the wanted info from a bunch of PDF files and save it as a
    CSV file named *fname*, with *headers* as the first row.
    """
    rows_written = 0
    # newline="" is required by the csv module — without it every data
    # row is followed by a blank line on Windows.
    with open(fname, "w", newline="", encoding="utf-8") as wf:
        writer = csv.writer(wf)
        writer.writerow(headers)

        for file in files:
            writer.writerow(extract_infos(file, headers))
            rows_written += 1

    # BUG FIX: the original printed the loop variable `i` after the loop,
    # which raises NameError when *files* is empty; count explicitly.
    print("[DONE]", "wrote {} rows to {}.".format(rows_written, fname))

main(pdf_files, "stocks.csv", keywords)