BeautifulSoup: 6k records - but stops after parsing 20 lines
The goal is to get a quick overview of a range of free volunteering opportunities in Europe.
The aim is to fetch all ~6,000 target pages, such as https://europa.eu/youth/volunteering/organisation/48592 - see below for an explanation and description of the target pages and of the data that is needed.
We fetch, for example:
https://europa.eu/youth/volunteering/organisation/50162
https://europa.eu/youth/volunteering/organisation/50163
and so forth.
Since there are more than 6,000 records, I do get results - but the script only returns 20 records, i.e. 20 rows.
Here is my current approach, the small script I run:
import requests
from bs4 import BeautifulSoup
import re
import csv
from tqdm import tqdm

first = "https://europa.eu/youth/volunteering/organisations_en?page={}"
second = "https://europa.eu/youth/volunteering/organisation/{}_en"


def catch(url):
    with requests.Session() as req:
        pages = []
        print("Loading All IDS\n")
        for item in tqdm(range(0, 347)):
            r = req.get(url.format(item))
            soup = BeautifulSoup(r.content, 'html.parser')
            numbers = [item.get("href").split("/")[-1].split("_")[0] for item in soup.findAll(
                "a", href=re.compile("^/youth/volunteering/organisation/"), class_="btn btn-default")]
            pages.append(numbers)
        return numbers


def parse(url):
    links = catch(first)
    with requests.Session() as req:
        with open("Data.csv", 'w', newline="", encoding="UTF-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Name", "Address", "Site", "Phone",
                             "Description", "Scope", "Rec", "Send", "PIC", "OID", "Topic"])
            print("\nParsing Now... \n")
            for link in tqdm(links):
                r = req.get(url.format(link))
                soup = BeautifulSoup(r.content, 'html.parser')
                task = soup.find("section", class_="col-sm-12").contents
                name = task[1].text
                add = task[3].find(
                    "i", class_="fa fa-location-arrow fa-lg").parent.text.strip()
                try:
                    site = task[3].find("a", class_="link-default").get("href")
                except:
                    site = "N/A"
                try:
                    phone = task[3].find(
                        "i", class_="fa fa-phone").next_element.strip()
                except:
                    phone = "N/A"
                desc = task[3].find(
                    "h3", class_="eyp-project-heading underline").find_next("p").text
                scope = task[3].findAll("span", class_="pull-right")[1].text
                rec = task[3].select("tbody td")[1].text
                send = task[3].select("tbody td")[-1].text
                pic = task[3].select(
                    "span.vertical-space")[0].text.split(" ")[1]
                oid = task[3].select(
                    "span.vertical-space")[-1].text.split(" ")[1]
                topic = [item.next_element.strip() for item in task[3].select(
                    "i.fa.fa-check.fa-lg")]
                writer.writerow([name, add, site, phone, desc,
                                 scope, rec, send, pic, oid, "".join(topic)])


parse(second)
But it stops after parsing 20 results.
Note: I think catch should return the pages rather than the numbers, because I want to iterate over the pages, not the numbers. I suspect there is a subtle bug in the catch function: it returns numbers, but I am fairly sure that is a mistake - we meant to return pages, which means that when we iterate over the result of catch(first) in the other function, we are not iterating over everything we want. In other words, I think the fix is to return pages at the bottom of that function instead of return numbers. However, if I simply change return numbers to return pages, I do not get better results.
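What I have in mind is something like the following - an untested sketch of the catch function (using the same imports as above) that collects the IDs from every listing page into one flat list instead of returning only the last page's numbers; pages.append(numbers) builds a list of lists, which is probably why returning pages did not help either:

def catch(url):
    with requests.Session() as req:
        ids = []
        print("Loading All IDS\n")
        for page in tqdm(range(0, 347)):
            r = req.get(url.format(page))
            soup = BeautifulSoup(r.content, 'html.parser')
            numbers = [a.get("href").split("/")[-1].split("_")[0]
                       for a in soup.findAll(
                           "a", href=re.compile("^/youth/volunteering/organisation/"),
                           class_="btn btn-default")]
            ids.extend(numbers)  # keep the IDs from every page, not just the last one
        return ids               # one flat list with all ~6k IDs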
Any ideas how to get the parser to return all 6k results?
You can use this example to parse the pages:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def safe_get(to_find, what_next, if_not_found="N/A"):
    if to_find:
        return what_next(to_find)
    return if_not_found


first_url = "https://europa.eu/youth/volunteering/organisations_en?page={}"

links = []
for page in range(0, 3):  # <--- increase number of pages here
    u = first_url.format(page)
    soup = BeautifulSoup(requests.get(u).content, "html.parser")
    for a in soup.select("h5 > a"):
        links.append("https://europa.eu" + a["href"])

data = []
for l in links:
    print(l)
    soup = BeautifulSoup(requests.get(l).content, "html.parser")

    name = safe_get(soup.select_one("h5"), lambda t: t.text)
    address = safe_get(
        soup.select_one(".fa-location-arrow"),
        lambda t: t.parent.get_text(strip=True),
    )
    link = safe_get(
        soup.select_one(".fa-external-link"), lambda t: t.find_next("a")["href"]
    )
    phone = safe_get(
        soup.select_one(".fa-phone"), lambda t: t.find_next(text=True).strip()
    )
    desc = safe_get(
        soup.select_one("h3 ~ p"),
        lambda t: t.get_text(strip=True, separator="\n"),
    )
    scope = safe_get(
        soup.select_one(".fa-asterisk"),
        lambda t: t.find_next("span").get_text(strip=True),
    )
    receiving = safe_get(
        soup.select_one('td:-soup-contains("Receiving") ~ td'),
        lambda t: t.get_text(strip=True),
    )
    sending = safe_get(
        soup.select_one('td:-soup-contains("Sending") ~ td'),
        lambda t: t.get_text(strip=True),
    )
    pic = safe_get(
        soup.find("span", text=lambda t: t and t.startswith("PIC")),
        lambda t: t.text.split()[-1],
    )
    oid = safe_get(
        soup.find("span", text=lambda t: t and t.startswith("OID")),
        lambda t: t.text.split()[-1],
    )
    topics = ", ".join(
        [t.find_next(text=True).strip() for t in soup.select("p > .fa-check")]
    )

    data.append(
        (
            name,
            address,
            link,
            phone,
            desc,
            scope,
            receiving,
            sending,
            pic,
            oid,
            topics,
        )
    )

df = pd.DataFrame(
    data,
    columns=[
        "Name",
        "Address",
        "Site",
        "Phone",
        "Description",
        "Scope",
        "Rec",
        "Send",
        "PIC",
        "OID",
        "Topic",
    ],
)
print(df)
df.to_csv("data.csv", index=False)
This creates data.csv (screenshot from LibreOffice not included here).
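To cover all ~6,000 organisations, raise the range in the listing loop to about 347 pages. If you prefer not to hardcode that number, here is an untested sketch of one possible variation; it assumes a listing page past the end simply yields no h5 > a links:

import requests
from bs4 import BeautifulSoup

first_url = "https://europa.eu/youth/volunteering/organisations_en?page={}"

links = []
page = 0
while True:
    soup = BeautifulSoup(requests.get(first_url.format(page)).content, "html.parser")
    found = ["https://europa.eu" + a["href"] for a in soup.select("h5 > a")]
    if not found:  # an empty listing page means we are past the last page
        break
    links.extend(found)
    page += 1

print(len(links), "detail links collected")

With roughly 6,000 detail pages it is also worth reusing a requests.Session and pausing briefly between requests so you do not hammer the server.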