Troubleshooting python code to scrape and store PDF text
The following code searches the main URL and follows each 'Council' hyperlink to extract the text of the meeting-minutes documents on every page (the minutes are stored as PDFs and extracted with PyPDF2).
The problem I am running into is that the code should loop through n pages to extract the PDFs, but it only returns the first PDF. I am not sure what is happening, since minutes_links does store the correct number of links to the PDF files, but in the for loop that extracts pdf_name and pages_text, only the first link is processed and stored.
import os
import time
from io import BytesIO
from urllib.parse import urljoin

import pandas as pd
import PyPDF2
import requests
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Create a headless chromedriver to query and perform action on webpages like a browser
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# Main url
my_url = (
    "https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx?SearchType=3"
)


def get_n_first_pages(n: int):
    """Get the html text for the first n pages

    Args:
        n (int): The number of pages we want

    Returns:
        List[str]: A list of html text
    """
    # Initialize the variables containing the pages
    pages = []
    # We query the web page with our chrome driver.
    # This way we can iteratively click on the next link to get all the pages we want
    driver.get(my_url)
    # We append the page source code
    pages.append(driver.page_source)
    # Then for all subsequent pages, we click on next and wait to get the page
    for _ in range(1, n):
        driver.find_element_by_css_selector(
            "#LiverpoolTheme_wt93_block_wtMainContent_RichWidgets_wt132_block_wt28"
        ).click()
        # Wait for the page to load
        time.sleep(1)
        # Append the page
        pages.append(driver.page_source)
    return pages


def get_pdf(link: str):
    """Get the pdf text, per PDF pages, for a given link.

    Args:
        link (str): The link where we can retrieve the PDF

    Returns:
        List[str]: A list containing a string per PDF pages
    """
    # We extract the file name
    pdf_name = link.split("/")[-1].split(".")[0]
    # We get the page containing the PDF link
    # Here we don't need the chrome driver since we don't have to click on the link
    # We can just get the PDF using requests after finding the href
    pdf_link_page = requests.get(link)
    page_soup = soup(pdf_link_page.text, "lxml")
    # We get all <a> tag that have href attribute, then we select only the href
    # containing min.pdf, since we only want the PDF for the minutes
    pdf_link = [
        urljoin(link, l.attrs["href"])
        for l in page_soup.find_all("a", {"href": True})
        if "min.pdf" in l.attrs["href"]
    ]
    # There is only one PDF for the minutes so we get the only element in the list
    pdf_link = pdf_link[0]
    # We get the PDF with requests and then get the PDF bytes
    pdf_bytes = requests.get(pdf_link).content
    # We load the bytes into an in memory file (to avoid saving the PDF on disk)
    p = BytesIO(pdf_bytes)
    p.seek(0, os.SEEK_END)
    # Now we can load our PDF in PyPDF2 from memory
    read_pdf = PyPDF2.PdfFileReader(p)
    count = read_pdf.numPages
    pages_txt = []
    # For each page we extract the text
    for i in range(count):
        page = read_pdf.getPage(i)
        pages_txt.append(page.extractText())
    # We return the PDF name as well as the text inside each pages
    return pdf_name, pages_txt


# Get the first 16 pages, you can change this number
pages = get_n_first_pages(16)
# Initialize a list to store each dataframe rows
df_rows = []
# We iterate over each page
for page in pages:
    page_soup = soup(page, "lxml")
    # Here we get only the <a> tag inside the tbody and each tr
    # We avoid getting the links from the head of the table
    all_links = page_soup.select("tbody tr a")
    # We extract the href for only the links containing council (we don't care about the
    # video link)
    minutes_links = [x.attrs["href"] for x in all_links if "council" in x.attrs["href"]]
    #
    for link in minutes_links:
        pdf_name, pages_text = get_pdf(link)
        df_rows.append(
            {
                "PDF_file_name": pdf_name,
                # We join each page in the list into one string, separating them with a line return
                "PDF_text": "\n".join(pages_text),
            }
        )
        break
    break

# We create the data frame from the list of rows
df = pd.DataFrame(df_rows)
The desired output is a dataframe that looks like this:
PDF_file_name | PDF_text
---|---
spec20210729min | [[' \n \n \n \n \n \n \nSPECIAL COUNCIL MEET\nING MINUTES\n \n \nJULY 29, 2021\n \n \nA Special Meeting of the Council\n \nof the City of Vancouver\n \nw
spec20210802min | [[' \n \n \n \n \n \n \nSPECIAL COUNCIL MEET\nING MINUTES\n \n \nAUGUST 2, 2021\n \n \nA Special Meeting of the Council\n \nof the City of Vancouver\n \nw
Right now, I can get the first of these files, but none of the subsequent ones. TIA!
At the end of both of your for loops you have a break statement. The break statement tells the for loop to stop executing and move on to the next block of code, so each of your for loops only ends up running once.
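A quick toy illustration of that effect (not part of the original script):

for i in range(5):
    print(i)
    break  # the loop stops here, after the very first iteration
# Output: 0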
Remove those two break statements and it should work fine.
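For reference, the end of the script would then look roughly like this. This is a minimal sketch of the same loop with the two break statements removed, reusing pages, get_pdf, df_rows, soup and pd from the question; I have not run it against the live site:

for page in pages:
    page_soup = soup(page, "lxml")
    all_links = page_soup.select("tbody tr a")
    minutes_links = [x.attrs["href"] for x in all_links if "council" in x.attrs["href"]]
    for link in minutes_links:
        pdf_name, pages_text = get_pdf(link)
        df_rows.append(
            {
                "PDF_file_name": pdf_name,
                "PDF_text": "\n".join(pages_text),
            }
        )
        # no break here, so every minutes link on this page is processed
    # no break here either, so every page returned by get_n_first_pages is processed

df = pd.DataFrame(df_rows)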
P.S. - I haven't tested this; if it doesn't work I'll delete this answer.