ArticleException error when web scraping news articles with Python
I'm trying to scrape news articles from the web that match certain keywords, using Python 3. However, I can't retrieve all of the articles from the newspaper: after scraping some articles into a csv file as output, I get an ArticleException error. Can anyone help me with this? Ideally, I'd like to solve the problem and download all of the relevant articles from the newspaper's website. Failing that, I'd like to skip the URLs that raise the error and continue from the next one. Thanks in advance for your help.
Here is the code I'm using:
import urllib.request
import newspaper
from newspaper import Article
import csv, os
from bs4 import BeautifulSoup

req_keywords = ['coronavirus', 'covid-19']
newspaper_base_url = 'http://www.thedailystar.net'
category = 'country'

def checkif_kw_exist(list_one, list_two):
    # Return whether the two lists share any keywords, plus the shared set
    common_kw = set(list_one) & set(list_two)
    if len(common_kw) == 0:
        return False, common_kw
    else:
        return True, common_kw

def get_article_info(url):
    a = Article(url)
    a.download()
    a.parse()
    a.nlp()
    success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
    if success:
        return [url, a.publish_date, a.title, a.text]
    else:
        return False

output_file = "J:/B/output.csv"
if not os.path.exists(output_file):
    open(output_file, 'w').close()

for index in range(1, 50000, 1):
    # page_url is defined in the full script (presumably built from
    # newspaper_base_url, category and index); its definition was cut
    # from this excerpt
    page_soup = BeautifulSoup(urllib.request.urlopen(page_url).read(), 'html.parser')
    primary_tag = page_soup.find_all("h4", attrs={"class": "pad-bottom-small"})
    for tag in primary_tag:
        url = tag.find("a")
        #print (url)
        url = newspaper_base_url + url.get('href')
        result = get_article_info(url)
        if result is not False:
            with open(output_file, 'a', encoding='utf-8') as f:
                writeFile = csv.writer(f)
                writeFile.writerow(result)
        else:
            pass
Here is the error I get:
---------------------------------------------------------------------------
ArticleException Traceback (most recent call last)
<ipython-input-1-991b432d3bd0> in <module>
65 #print (url)
66 url = newspaper_base_url + url.get('href')
---> 67 result = get_article_info(url)
68 if result is not False:
69 with open(output_file, 'a', encoding='utf-8') as f:
<ipython-input-1-991b432d3bd0> in get_article_info(url)
28 a = Article(url)
29 a.download()
---> 30 a.parse()
31 a.nlp()
32 success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
~\Anaconda3\lib\site-packages\newspaper\article.py in parse(self)
189
190 def parse(self):
--> 191 self.throw_if_not_downloaded_verbose()
192
193 self.doc = self.config.get_parser().fromstring(self.html)
~\Anaconda3\lib\site-packages\newspaper\article.py in throw_if_not_downloaded_verbose(self)
530 elif self.download_state == ArticleDownloadState.FAILED_RESPONSE:
531 raise ArticleException('Article `download()` failed with %s on URL %s' %
--> 532 (self.download_exception_msg, self.url))
533
534 def throw_if_not_parsed_verbose(self):
ArticleException: Article `download()` failed with HTTPSConnectionPool(host='www.thedailystar.net', port=443): Read timed out. (read timeout=7) on URL http://www.thedailystar.net/ugc-asks-private-universities-stop-admissions-grades-without-test-for-coronavirus-pandemic-1890151
The quickest way to 'skip' failures related to downloading the content would be to use try/except, as follows:
def get_article_info(url):
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
        if success:
            return [url, a.publish_date, a.title, a.text]
        else:
            return False
    except:
        return False
Catching every possible exception with a bare except and ignoring it is not recommended, and this answer would get downvoted if I didn't suggest you handle exceptions better. You did also ask about fixing the problem. Without reading the documentation of the libraries you're importing, you won't know what exceptions can occur, so printing out the details of exceptions as you skip them will give you the specifics, like the ArticleException you're getting now. You can start adding individual except sections to handle the ones you've already come across:
from newspaper.article import ArticleException  # needed to catch it by name

def get_article_info(url):
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
        if success:
            return [url, a.publish_date, a.title, a.text]
        else:
            return False
    except ArticleException as ae:
        print(ae)
        return False
    except Exception as e:
        print(e)
        return False
The ArticleException you're getting is telling you that you hit a timeout error, which means the response from the Daily Star didn't complete within the time limit. Maybe it's very busy :) You could attempt the download several times before giving up.
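As a rough sketch of that retry idea (the three attempts and the 15-second timeout below are arbitrary values of mine, not anything the library or the site recommends), you could wrap the download in a loop and also raise newspaper's request timeout, which is configurable through its Config object and defaults to the 7 seconds shown in your traceback:

from newspaper import Article, Config
from newspaper.article import ArticleException

# Assumption: 3 attempts and a 15-second timeout are arbitrary example values
config = Config()
config.request_timeout = 15  # the default of 7 seconds is what your traceback shows

def get_article_info(url, attempts=3):
    for attempt in range(attempts):
        a = Article(url, config=config)
        try:
            a.download()
            a.parse()
            a.nlp()
            success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
            if success:
                return [url, a.publish_date, a.title, a.text]
            return False  # downloaded fine, just no matching keywords
        except ArticleException as ae:
            print('attempt %d of %d failed: %s' % (attempt + 1, attempts, ae))
    return False  # every attempt failed; skip this URL

The rest of your loop can stay as it is, because the function still returns False for anything it couldn't fetch.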