Python asyncio web scraping output not exporting in excel

I am new to web scraping with Python asyncio. I want to export the scraped data to Excel using pandas. My code below seems to scrape the target fields, but when I use pandas to export the output to Excel, I get an empty output file.

import asyncio
from concurrent.futures.thread import ThreadPoolExecutor
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup

input_file = os.path.join(os.getcwd(), 'Sample.xlsx')
df = pd.read_excel(input_file, usecols=0)

req = requests.Session()
req.trust_env = False

Url, title, price = [], [], [],
executor = ThreadPoolExecutor(10)


def scrape(url, *, loop):
    loop.run_in_executor(executor, load_html, url)


def load_html(url):
    print(url)
    res = req.get(url)
    soup = BeautifulSoup(res.content, 'html.parser')
    Url.append(url)
    title.append(soup.select('.pinfo-title')[0].text)
    price.append(soup.select('.sale-price')[0].text)


loop = asyncio.get_event_loop()
for url in df['Urls']:
    scrape(url, loop=loop)

loop.run_until_complete(asyncio.gather(*asyncio.all_tasks(loop)))

output = pd.DataFrame({
        'Url': Url,
        'Title': title,
        'Price': price
})
output.to_excel('Output.xlsx', index=False)

But if I use print instead of appending to the lists, as below, then it prints the desired output.

def load_html(url):
    print(url)
    res = req.get(url)
    soup = BeautifulSoup(res.content, 'html.parser')
    print(url)
    print(soup.select('.pinfo-title')[0].text)
    print(soup.select('.sale-price')[0].text)

It looks like run_in_executor does not add a task to the loop: it submits the work to the thread pool straight away (which is why your prints show up), but it returns a future, not a task, so asyncio.all_tasks(loop) finds nothing to gather and run_until_complete returns before the threads have finished appending. The future has to be awaited, so you need to wrap it in a coroutine and create a task on the loop. Here is a simpler example.

import asyncio
from urllib.request import urlopen
import json

URLS = [
    "http://localhost:8000/a",
    "http://localhost:8000/b",
    "http://localhost:8000/c",
    "http://localhost:8000/d",
]

data = []


def load_html(url):
    print(url)
    res = urlopen(url)
    data.append(res.read().decode())


async def scrape(url, loop):
    await loop.run_in_executor(None, load_html, url)


def main():
    loop = asyncio.get_event_loop()
    for url in URLS:
        loop.create_task(scrape(url, loop))

    loop.run_until_complete(asyncio.gather(*asyncio.all_tasks(loop)))
    with open('/tmp/j_dump', 'w') as fp:
        json.dump(data, fp)


if __name__ == '__main__':
    main()
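
Applied back to your scraper, the same fix might look like the sketch below. It reuses the Sample.xlsx input, the Urls column, and the .pinfo-title / .sale-price selectors from your post (I'm assuming those are correct), and passes usecols=[0] since recent pandas versions no longer accept a bare integer there.

import asyncio
from concurrent.futures.thread import ThreadPoolExecutor
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Assumes the first column of Sample.xlsx is named 'Urls', as in the question.
df = pd.read_excel('Sample.xlsx', usecols=[0])

req = requests.Session()
req.trust_env = False

Url, title, price = [], [], []
executor = ThreadPoolExecutor(10)


def load_html(url):
    res = req.get(url)
    soup = BeautifulSoup(res.content, 'html.parser')
    Url.append(url)  # list.append is thread-safe under the GIL
    title.append(soup.select('.pinfo-title')[0].text)
    price.append(soup.select('.sale-price')[0].text)


async def scrape(url, loop):
    # Awaiting the executor future inside a coroutine is what turns the
    # thread's work into a task the loop can actually wait on.
    await loop.run_in_executor(executor, load_html, url)


loop = asyncio.get_event_loop()
for url in df['Urls']:
    loop.create_task(scrape(url, loop))

loop.run_until_complete(asyncio.gather(*asyncio.all_tasks(loop)))

output = pd.DataFrame({'Url': Url, 'Title': title, 'Price': price})
output.to_excel('Output.xlsx', index=False)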

Update:

The code can be simplified as below. No extra coroutine or global data is needed.

def load_html(url):
    print(url)
    res = urlopen(url)
    return res.read().decode()


def main():
    loop = asyncio.get_event_loop()
    tasks = [loop.run_in_executor(None, load_html, url) for url in URLS]
    data = loop.run_until_complete(asyncio.gather(*tasks))
    with open('/tmp/j_dump', 'w') as fp:
        json.dump(data, fp)
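
The same idea carries the pandas version the rest of the way: have load_html return one row and build the DataFrame straight from what gather returns, so no shared lists are needed. Again just a sketch, reusing the file names and CSS selectors from the question:

import asyncio
import pandas as pd
import requests
from bs4 import BeautifulSoup

req = requests.Session()
req.trust_env = False


def load_html(url):
    res = req.get(url)
    soup = BeautifulSoup(res.content, 'html.parser')
    # Return one row per URL instead of appending to globals.
    return url, soup.select('.pinfo-title')[0].text, soup.select('.sale-price')[0].text


def main():
    urls = pd.read_excel('Sample.xlsx', usecols=[0])['Urls']
    loop = asyncio.get_event_loop()
    tasks = [loop.run_in_executor(None, load_html, url) for url in urls]
    # gather preserves input order, so rows line up with urls.
    rows = loop.run_until_complete(asyncio.gather(*tasks))
    pd.DataFrame(rows, columns=['Url', 'Title', 'Price']).to_excel('Output.xlsx', index=False)


if __name__ == '__main__':
    main()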