Concurrent is not working properly with beautifulsoup, not fetching all the links
In this code, I want to use BeautifulSoup to extract content from newspaper links. It is not working properly: every link in the list 'filtered_Final_LIST' points to a page containing article content, but when I use the concurrent library, the function 'ext_url' does not return results for all of the pages.
A plain for loop works fine. I only added the concurrency to speed up extraction. Am I doing something wrong?
import concurrent.futures
import re
import time
import urllib.request
from datetime import datetime

from bs4 import BeautifulSoup

MAX_THREADS = 30

filtered_Final_LIST = [
    'https://www.financialexpress.com/economy/finmin-asks-ministries-to-restrict-expenses-within-prescribed-limit/2410766/',
    'https://www.financialexpress.com/economy/uk-inflation-hits-near-30-year-high-pressuring-boe-and-households/2410761/',
    'https://www.financialexpress.com/economy/economic-recovery-yet-to-attain-durability-says-report/2410690/',
    'https://www.financialexpress.com/economy/vagaries-of-weather-drive-near-13-lakh-maha-farmers-to-crop-insurance-scheme/2410030/',
]

def ext_url(url):
    global List_articles, List_header, List_date, List_month, List_year, List_source
    # Lists to collect dates and news articles
    List_articles = []
    List_header = []
    List_date = []
    List_month = []
    List_year = []
    List_source = []
    # opening the url for reading
    html = urllib.request.urlopen(url, timeout=10)
    print(url)
    # parsing the html file
    htmlParse = BeautifulSoup(html, 'html.parser')
    # getting all the paragraphs of articles
    for para in htmlParse.find_all('div', class_='entry-content wp-block-post-content'):
        List_articles.append(para.get_text())
    # getting the month, day and year the article was published
    date = htmlParse.find(itemprop="article:published_time").get("content")
    match = re.search(r'\d{4}-\d{2}-\d{2}', date)
    dt = datetime.strptime(match.group(), '%Y-%m-%d').date()
    List_month.append(dt.month)
    List_date.append(dt.day)
    List_year.append(dt.year)
    # getting all the headings of articles
    for para in htmlParse.find_all('h1', class_='wp-block-post-title'):
        List_header.append(para.get_text())
    # getting all the sources of articles
    for para in htmlParse.find_all('div', class_='author-link ie_custom_theme_multiple_authors'):
        List_source.append(para.get_text())
    return List_articles, List_header, List_date, List_month, List_year, List_source

with concurrent.futures.ThreadPoolExecutor() as executor:
    for url in filtered_Final_LIST:
        executor.submit(ext_url, url)
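Two problems explain the missing results. First, ext_url writes to global lists that every call re-initialises, so threads running concurrently wipe out each other's data. Second, executor.submit() is fired and forgotten: its Future objects are never collected, so the return values are discarded and any exception (for example a urlopen timeout) disappears silently, which looks like pages being skipped. Below is a minimal thread-safe sketch using as_completed; it keeps results local to each call and reuses MAX_THREADS and filtered_Final_LIST from the question. The function name scrape_articles and the results dict are mine for illustration, and it only collects paragraphs and headings to keep the sketch short:

import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed

from bs4 import BeautifulSoup

def scrape_articles(url):
    # Keep results local to the call instead of mutating globals,
    # so concurrent calls cannot overwrite each other.
    html = urllib.request.urlopen(url, timeout=10)
    soup = BeautifulSoup(html, 'html.parser')
    articles = [p.get_text() for p in soup.find_all(
        'div', class_='entry-content wp-block-post-content')]
    headers = [h.get_text() for h in soup.find_all(
        'h1', class_='wp-block-post-title')]
    return url, articles, headers

results = {}
with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    futures = {executor.submit(scrape_articles, u): u
               for u in filtered_Final_LIST}
    for fut in as_completed(futures):
        try:
            url, articles, headers = fut.result()  # re-raises worker exceptions
            results[url] = (articles, headers)
        except Exception as exc:
            print(f'{futures[fut]} failed: {exc}')

The solution below takes a different route, sidestepping threads altogether and rewriting the scraper on top of trio and httpx: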
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd

# pip install trio httpx

mainurl = 'https://www.financialexpress.com/economy/'
news = [
    'finmin-asks-ministries-to-restrict-expenses-within-prescribed-limit/2410766/',
    'uk-inflation-hits-near-30-year-high-pressuring-boe-and-households/2410761/',
    'economic-recovery-yet-to-attain-durability-says-report/2410690/',
    'vagaries-of-weather-drive-near-13-lakh-maha-farmers-to-crop-insurance-scheme/2410030/'
]

allin = []

async def get_soup(content):
    return BeautifulSoup(content, 'lxml')

async def worker(receiver):
    async with receiver:
        async for client, new in receiver:
            r = await client.get(mainurl + new)
            soup = await get_soup(r.text)
            prs = [x.text for x in soup.select(
                '.entry-content > p:not(:last-child)')]
            title = soup.select_one('.wp-block-post-title').text
            author = soup.select_one('div.author-link a').text
            publish = soup.select_one(
                '[itemprop="article:published_time"]')['content'].split('T')[0].split('-')
            target = [title, author, *publish, prs]
            allin.append(target)

async def main():
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        async with receiver:
            for _ in range(5):
                nurse.start_soon(worker, receiver.clone())
        async with sender:
            for new in news:
                await sender.send([client, new])

if __name__ == "__main__":
    trio.run(main)
    df = pd.DataFrame(
        allin, columns=['Title', 'Author', 'Year', 'Month', 'Day', 'Paragraphs'])
    print(df)
    df.to_csv('data.csv', index=False)
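A few notes on this design, as I read it: the zero-capacity memory channel fans the URLs out to five worker tasks, and closing the sender ends each worker's async for loop so the nursery can exit cleanly once every page is processed. Because trio runs all tasks on a single thread, appending to the shared allin list needs no lock. One caveat: get_soup is declared async, but BeautifulSoup parsing is CPU-bound and still blocks the event loop while it runs; for larger batches, something like

    soup = await trio.to_thread.run_sync(BeautifulSoup, r.text, 'lxml')

would keep the workers responsive while a page is being parsed.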