How can I return the data I'm scraping when using beautifulsoup and concurrent.futures?
I'm trying to asynchronously scrape some recipes from NYT Cooking, following this blog post: https://beckernick.github.io/faster-web-scraping-python/
It prints the results without any problem, but for some reason my return does nothing here. I need it to return the list. Any ideas?
import concurrent.futures
import time

MAX_THREADS = 30

urls = ['https://cooking.nytimes.com/search?q=&page={page_number}'.format(page_number=p) for p in range(1,5)]

# grab all of the recipe cards on each search page
def extract_recipe_urls(url):
    """returns a list of recipe urls"""
    recipe_cards = []

    response = session.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    for rs in soup.find_all("article", {"class": "card recipe-card"}):
        recipe_cards.append(rs.find('a')['href'])

    print(recipe_cards)

    return recipe_cards

def async_scraping(scrape_function, urls):
    threads = min(MAX_THREADS, len(urls))

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(scrape_function, urls)
You have to capture the result of the map call

results = executor.map(...)

and later you can use it in a loop

for item in results:
    print(item)

or convert it to a list

all_items = list(results)

BTW: because results is a generator, you can't iterate it twice (in two for-loops, or in a for-loop and then list()). If you need it more than once, you first have to get all items as a list with all_items = list(results) and then use this list all_items in both for-loops.
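As a minimal sketch of this behaviour (not part of the original answer; the square function is just a placeholder), consuming the map generator twice gives an empty second pass:

import concurrent.futures

def square(x):
    return x * x

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(square, [1, 2, 3])

first_pass = list(results)   # consumes the generator -> [1, 4, 9]
second_pass = list(results)  # generator is already exhausted -> []

print(first_pass)
print(second_pass)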
Minimal working code:
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import time

# --- constants ---

MAX_THREADS = 30

# --- functions ---

# grab all of the recipe cards on each search page
def extract_recipe_urls(url):
    """returns a list of recipe urls"""
    session = requests.Session()

    recipe_cards = []

    response = session.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    for rs in soup.find_all("article", {"class": "card recipe-card"}):
        recipe_cards.append(rs.find('a')['href'])

    return recipe_cards

def async_scraping(scrape_function, urls):
    threads = min(MAX_THREADS, len(urls))

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        results = executor.map(scrape_function, urls)

    return results

# --- main ---

urls = ['https://cooking.nytimes.com/search?q=&page={page_number}'.format(page_number=p) for p in range(1,5)]

results = async_scraping(extract_recipe_urls, urls)

#all_items = list(results)
for item in results:
    print(item)
BTW: each extract_recipe_urls gives you a list, so in the end results is a list of lists.

all_items = list(results)

print('len(all_items):', len(all_items))

for item in all_items:
    print('len(item):', len(item))
Result:
len(all_items): 4
len(item): 48
len(item): 48
len(item): 48
len(item): 48
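If you also want to know which search page produced which list, one option (my suggestion, not part of the answer above; pages is just an illustrative name) is to zip urls with the results, because executor.map returns results in the same order as the input urls:

# map() keeps the order of the input urls, so zipping pairs each page with its list
pages = dict(zip(urls, all_items))

for page_url, recipe_links in pages.items():
    print(page_url, '->', len(recipe_links), 'recipe links')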
If you want all items as one flat list, you can use list1.extend(list2) or list1 + list2 (which is what sum(..., []) relies on):

all_items = sum(all_items, [])

print('len(all_items):', len(all_items))
Result:
len(all_items): 192
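A common alternative to sum(all_items, []) for flattening a list of lists (my addition, not from the answer above) is itertools.chain.from_iterable, which avoids repeatedly concatenating lists:

import itertools

# flatten the list of lists into one flat list of recipe urls
all_items = list(itertools.chain.from_iterable(all_items))
print('len(all_items):', len(all_items))   # 192 again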