Scraping PDFs from multiple pages using bs4

I'm a Python beginner, and I hope what I'm trying to do isn't too complicated. Essentially, I want to extract the text of the meeting minutes (contained in PDF documents) from the past 10 years of city council meetings on this website: https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx?SearchType=3

Ultimately, I want to analyze/categorise the action items in the minutes. So far, all I've managed to do is get the links to the PDFs from the first page. Here's my code:

# Import requests for navigating to websites, beautiful soup to scrape website, PyPDF2 for PDF data mining
 
import sys 
import requests
import bs4 
import PyPDF2 
#import PDfMiner 
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup 

# Soupify URL
my_url = "https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx?SearchType=3"
result = requests.get(my_url)
src = result.content
page_soup = soup(src, "lxml")

#list with links
urls = []
for tr_tag in page_soup.find_all("tr"):
    a_tag = tr_tag.find("a")
    # Skip rows without a link (e.g. header rows), which would otherwise raise an AttributeError
    if a_tag is None:
        continue
    urls.append(a_tag.attrs["href"])

print(urls)

A few things I could use some help with:

Any help is greatly appreciated! Thanks in advance!

EDIT: I would like to get the data into a dataframe where the first column is the file name and the second column is the text from the PDF. It would look something like:

PDF_file_name PDF_text
spec20210729min [[' \n \n \n \n \n \n \nSPECIAL COUNCIL MEET\nING MINUTES\n \n \nJULY 29, 2021\n \n \nA Special Meeting of the Council\n \nof the City of Vancouver\n \nw
spec20210802min [[' \n \n \n \n \n \n \nSPECIAL COUNCIL MEET\nING MINUTES\n \n \nAUGUST 2, 2021\n \n \nA Special Meeting of the Council\n \nof the City of Vancouver\n \nw
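
If it helps, here is a rough sketch of the shape I'm after (assuming pandas; the PDF_text values below are just placeholders, not real extracted text):

import pandas as pd

# Placeholder rows: in reality PDF_text would hold the full text extracted from each PDF
rows = [
    {"PDF_file_name": "spec20210729min", "PDF_text": "...extracted text..."},
    {"PDF_file_name": "spec20210802min", "PDF_text": "...extracted text..."},
]

df = pd.DataFrame(rows)
print(df)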

Welcome to the exciting world of web scraping!

First of all, nice work, you're off to a good start. There are a few points to discuss, though.

You basically have two problems to solve.

1 - How do I retrieve the HTML text for all the pages (1, ..., 50)?

In web scraping, you mainly come across two kinds of pages:

  1. If you're lucky, the page is not rendered with JavaScript, and you can get the page content with requests alone
  2. You're not so lucky, and the page is partially or fully rendered with JavaScript (a quick way to tell which case you're in is sketched right after this list)
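
A quick way to tell which case you're in (a minimal sketch, not part of the original answer; the needle value is a placeholder to replace with text you actually see in the browser):

import requests

my_url = "https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx?SearchType=3"
raw_html = requests.get(my_url).text

# Pick some text that you can see in the browser, but only on page 2+ of the results table
# (placeholder value; replace it with a real meeting title from a later page)
needle = "text visible only on a later results page"

# If it is missing from the raw HTML, that content is rendered by JavaScript
# and requests alone will not be enough
print(needle in raw_html)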

To get all the pages from 1 to 50, we need to somehow click the Next button at the bottom of the page. Why? If you watch the Network tab in your browser's developer console, you'll see that a new request is fired every time you click the Next button. Unfortunately, we can't use requests to render that JavaScript.

But we have a solution: headless browsers (wiki).

In the solution, I use selenium, a library that drives a real browser (Chrome in our case) to query pages and render the JavaScript.

So we first fetch the web page with selenium, extract the HTML, click Next and wait for the page to load, extract the HTML again, ... and so on.

2 - How do I extract the text from the PDFs once I've retrieved them?

Once a PDF is downloaded, we can load it into a variable, open it with PyPDF2, and extract the text from all of its pages. I'll let you look at the solution code for that part.

Here is a working solution. It iterates over the first n pages you want and returns the text from all the PDFs you're interested in:

import os
import time
from io import BytesIO
from urllib.parse import urljoin

import pandas as pd
import PyPDF2
import requests
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Create a headless chromedriver to query and perform action on webpages like a browser
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# Main url
my_url = (
    "https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx?SearchType=3"
)


def get_n_first_pages(n: int):
    """Get the html text for the first n pages

    Args:
        n (int): The number of pages we want

    Returns:
        List[str]: A list of html text
    """

    # Initialize the variables containing the pages
    pages = []

    # We query the web page with our chrome driver.
    # This way we can iteratively click on the next link to get all the pages we want
    driver.get(my_url)
    # We append the page source code
    pages.append(driver.page_source)

    # Then for all subsequent pages, we click on next and wait to get the page
    for _ in range(1, n):
        driver.find_element_by_css_selector(
            "#LiverpoolTheme_wt93_block_wtMainContent_RichWidgets_wt132_block_wt28"
        ).click()
        # Wait for the page to load
        time.sleep(1)
        # Append the page
        pages.append(driver.page_source)
    return pages


def get_pdf(link: str):
    """Get the pdf text, per PDF pages, for a given link.

    Args:
        link (str): The link where we can retrieve the PDF

    Returns:
        List[str]: A list containing a string per PDF pages
    """

    # We extract the file name
    pdf_name = link.split("/")[-1].split(".")[0]

    # We get the page containing the PDF link
    # Here we don't need the chrome driver since we don't have to click on the link
    # We can just get the PDF using requests after finding the href
    pdf_link_page = requests.get(link)
    page_soup = soup(pdf_link_page.text, "lxml")
    # We get all <a> tag that have href attribute, then we select only the href
    # containing min.pdf, since we only want the PDF for the minutes
    pdf_link = [
        urljoin(link, l.attrs["href"])
        for l in page_soup.find_all("a", {"href": True})
        if "min.pdf" in l.attrs["href"]
    ]
    # There is only one PDF for the minutes so we get the only element in the list
    pdf_link = pdf_link[0]

    # We get the PDF with requests and then get the PDF bytes
    pdf_bytes = requests.get(pdf_link).content
    # We load the bytes into an in memory file (to avoid saving the PDF on disk)
    p = BytesIO(pdf_bytes)
    p.seek(0, os.SEEK_END)

    # Now we can load our PDF in PyPDF2 from memory
    read_pdf = PyPDF2.PdfFileReader(p)
    count = read_pdf.numPages
    pages_txt = []
    # For each page we extract the text
    for i in range(count):
        page = read_pdf.getPage(i)
        pages_txt.append(page.extractText())

    # We return the PDF name as well as the text inside each pages
    return pdf_name, pages_txt


# Get the first 2 pages, you can change this number
pages = get_n_first_pages(2)


# Initialize a list to store each dataframe rows
df_rows = []

# We iterate over each page
for page in pages:
    page_soup = soup(page, "lxml")

    # Here we get only the <a> tag inside the tbody and each tr
    # We avoid getting the links from the head of the table
    all_links = page_soup.select("tbody tr a")
    # We extract the href for only the links containing council (we don't care about the
    # video link)
    minutes_links = [x.attrs["href"] for x in all_links if "council" in x.attrs["href"]]

    # For each minutes link, get the PDF file name and its text
    for link in minutes_links:
        pdf_name, pages_text = get_pdf(link)

        df_rows.append(
            {
                "PDF_file_name": pdf_name,
                # We join each page in the list into one string, separating them with a line return
                "PDF_text": "\n".join(pages_text),
            }
        )

        # Demonstration only: stop after the first PDF of the first page.
        # Remove both breaks to process every PDF on every page you fetched.
        break
    break

# We create the data frame from the list of rows
df = pd.DataFrame(df_rows)

The output dataframe looks like this:

        PDF_file_name                                           PDF_text
    0  spec20210729ag   \n \n \n \n \n \n \nSPECIAL COUNCIL MEET\nING...
...

Keep on scraping the web, it's fun :)

The problem is that BeautifulSoup can't see any of the results beyond the first page. BeautifulSoup is just an XML/HTML parser; it is not a headless browser or a JavaScript-capable runtime that can run JavaScript asynchronously. When you make a simple HTTP GET request to the page, the response is an HTML document with the first page of results baked directly into the HTML. That content was baked into the document at the moment the server served it to you, so BeautifulSoup can see those elements without any trouble. The results for all the other pages, however, are trickier.

View the page in your browser. While logging your network traffic, click the "Next" button to view the next page of results. If you filter your traffic to show only XHR/Fetch requests, you'll notice an HTTP POST request being made to an ASP.NET server, whose response is HTML containing JavaScript containing JSON containing HTML. It is that nested HTML that holds the new content used to update the table. Clicking the button doesn't actually take you to a different URL; the contents of the table simply change. The DOM is updated/populated asynchronously using JavaScript, which is not uncommon.

So the challenge is to imitate those requests and parse the responses to extract only the HREFs of the links you're interested in. I would split this into three separate scripts:

  1. One script that generates a .txt file of all the sub-page URLs (these are the URLs you navigate to when you click on a link, e.g. "Agenda and Minutes", example)
  2. One script that reads from that .txt file, makes a request to each URL, and extracts the HREF to the PDF on that page (if one is available). These direct URLs to PDFs will be saved in another .txt file.
  3. One script that reads from the PDF-URL .txt file and performs the PDF analysis.

You could combine scripts one and two if you really wanted to. I felt like keeping them separate.


The first script makes an initial request to the main page to pick up some necessary cookies and to extract the hidden input __OSVSTATE, which is baked into the HTML and which the ASP.NET server cares about in our future requests. It then simulates "clicks" on the "Next" button by sending HTTP POST requests to a specific ASP.NET server endpoint. We keep going until we can no longer find a "Next" button on the page; it turns out there are around 260 pages of results in total. For each of those 260 responses, we parse the response, extract the HTML from it, and extract the HREFs. We keep only those tags whose HREF ends with the substring ".htm" and whose text contains the substring "minute" (case-insensitive). All the HREFs are then written to a text file, page_urls.txt. For some reason a few of them are duplicated and others end up being invalid links, but we'll worry about that later. Here's the entire generated text file.

def get_urls():
    import requests
    from bs4 import BeautifulSoup as Soup
    import datetime
    import re
    import json

    # Start by making the initial request to store the necessary cookies in a session
    # Also, retrieve the __OSVSTATE

    url = "https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx?SearchType=3"

    headers = {
        "user-agent": "Mozilla/5.0"
    }

    session = requests.Session()

    response = session.get(url, headers=headers)
    response.raise_for_status()

    soup = Soup(response.content, "html.parser")

    osv_state = soup.select_one("input[id=\"__OSVSTATE\"]")["value"]

    # Get all results from all pages

    url = "https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx"

    headers = {
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }

    payload = {
        "__EVENTTARGET": "LiverpoolTheme_wt93$block$wtMainContent$RichWidgets_wt132$block$wt28",
        "__AJAX": "980,867,LiverpoolTheme_wt93_block_wtMainContent_RichWidgets_wt132_block_wt28,745,882,0,277,914,760,"
    }

    while True:
        params = {
            "_ts": round(datetime.datetime.now().timestamp())
        }
        payload["__OSVSTATE"] = osv_state

        response = session.post(url, params=params, headers=headers, data=payload)
        response.raise_for_status()

        pattern = "OsJSONUpdate\(({\"outers\":{[^\n]+})\)//\]\]"

        jsn = re.search(pattern, response.text).group(1)
        data = json.loads(jsn)

        osv_state = data["hidden"]["__OSVSTATE"]

        html = data["outers"]["LiverpoolTheme_wt93_block_wtMainContent_wtTblCommEventTable_Wrapper"]["inner"]

        soup = Soup(html, "html.parser")

        # Select only those a-tags whose href attribute ends with ".htm" and whose text contains the substring "minute"
        tags = soup.select("a[href$=\".htm\"]")

        hrefs = [tag["href"] for tag in tags if "minute" in tag.get_text().casefold()]

        yield from hrefs

        page_num = soup.select_one("a.ListNavigation_PageNumber").get_text()
        records_message = soup.select_one("div.Counter_Message").get_text()

        print("Page #{}:\n\tProcessed {}, collected {} URL(s)\n".format(page_num, records_message, len(hrefs)))

        if soup.select_one("a.ListNavigation_Next") is None:
            break


def main():
    with open("page_urls.txt", "w") as file:
        for url in get_urls():
            file.write(url + "\n")
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())

The second script reads the previous script's output file and makes a request to each URL in it. Some of these will be invalid, some need cleaning up before they can be used, many will be duplicates, some will be valid but won't contain a link to a PDF, and so on. We visit each of these pages, extract the PDF URL, and save each one to a file. In the end I managed to collect 287 usable PDF URLs. Here's the generated text file.

def get_pdf_url(url):
    import requests
    from bs4 import BeautifulSoup as Soup

    url = url.replace("/ctyclerk", "")

    base_url = url[:url.rfind("/")+1]

    headers = {
        "user-agent": "Mozilla/5.0"
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        return ""

    soup = Soup(response.content, "html.parser")

    pdf_tags = soup.select("a[href$=\".pdf\"]")

    tag = next((tag for tag in pdf_tags if "minute" in tag.get_text()), None)

    if tag is None:
        return ""

    return tag["href"] if tag["href"].startswith("http") else base_url + tag["href"]



def main():

    with open("page_urls.txt", "r") as file:
        page_urls = set(file.read().splitlines())

    with open("pdf_urls.txt", "w") as file:
        for count, pdf_url in enumerate(map(get_pdf_url, page_urls), start=1):
            if pdf_url:
                status = "Success"
                file.write(pdf_url + "\n")
                file.flush()
            else:
                status = "Skipped"
                
            print("{}/{} - {}".format(count, len(page_urls), status))

    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())

The third script reads from the pdf_urls.txt file, makes a request to each URL, and interprets the response bytes as a PDF:

def main():

    import requests
    from io import BytesIO
    from PyPDF2 import PdfFileReader

    with open("pdf_urls.txt", "r") as file:
        pdf_urls = file.read().splitlines()

    for pdf_url in pdf_urls:
        response = requests.get(pdf_url)
        response.raise_for_status()

        content = BytesIO(response.content)

        reader = PdfFileReader(content)
        # do stuff with reader

    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
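
To connect this back to the dataframe asked for in the question, the "# do stuff with reader" placeholder above could be filled in along these lines (a sketch only: it assumes pandas and reuses the same PyPDF2 PdfFileReader/getPage/extractText API as the scripts above):

import requests
import pandas as pd
from io import BytesIO
from PyPDF2 import PdfFileReader


def build_dataframe(pdf_urls):
    """Build a (PDF_file_name, PDF_text) dataframe from a list of direct PDF URLs."""
    rows = []
    for pdf_url in pdf_urls:
        response = requests.get(pdf_url)
        response.raise_for_status()

        reader = PdfFileReader(BytesIO(response.content))
        # Derive a file name like "spec20210729min" from the URL
        pdf_name = pdf_url.rsplit("/", 1)[-1].rsplit(".", 1)[0]
        # Concatenate the text of every page into a single string
        text = "\n".join(reader.getPage(i).extractText() for i in range(reader.numPages))
        rows.append({"PDF_file_name": pdf_name, "PDF_text": text})

    return pd.DataFrame(rows)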