How to continuously crawl a webpage for articles using Selenium in Python
I am trying to crawl bloomberg.com and find links to all English news articles. The problem with the code below is that it does find many articles on the first page, but then it just goes into a loop where it doesn't return anything and only occasionally turns one up.
from collections import deque
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

visited = set()
to_crawl = deque()
to_crawl.append("https://www.bloomberg.com")

def crawl_link(input_url):
    options = Options()
    options.add_argument('--headless')
    browser = webdriver.Firefox(options=options)
    browser.get(input_url)
    elems = browser.find_elements(by=By.XPATH, value="//a[@href]")
    for elem in elems:
        #retrieve all href links and save it to url_element variable
        url_element = elem.get_attribute("href")
        if url_element not in visited:
            to_crawl.append(url_element)
            visited.add(url_element)
        #save news articles
        if 'www.bloomberg.com/news/articles' in url_element:
            print(str(url_element))
            with open("result.txt", "a") as outf:
                outf.write(str(url_element) + "\n")
    browser.close()

while len(to_crawl):
    url_to_crawl = to_crawl.pop()
    crawl_link(url_to_crawl)
I have tried using a queue and then a stack, but the behavior is the same. I can't seem to accomplish what I'm looking for.

How can I crawl a site like this to scrape the news URLs?
The approach you are using should work fine, but after running it I noticed a few things that were causing it to hang or throw errors.

I made a few adjustments and added some inline comments to explain my reasoning.
from collections import deque
from selenium.common.exceptions import StaleElementReferenceException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

base = "https://www.bloomberg.com"
article = base + "/news/articles"

visited = set()
# A set discards duplicates automatically and is more efficient for lookups
articles = set()
to_crawl = deque()
to_crawl.append(base)

def crawl_link(input_url):
    options = Options()
    options.add_argument('--headless')
    browser = webdriver.Firefox(options=options)
    print(input_url)
    browser.get(input_url)
    elems = browser.find_elements(by=By.XPATH, value="//a[@href]")
    # this part was the issue: previously links were added to the
    # visited set as soon as they were discovered, so they were
    # skipped over without ever being crawled
    visited.add(input_url)
    for elem in elems:
        # elements can go stale after the page changes; skip them
        try:
            url_element = elem.get_attribute("href")
        except StaleElementReferenceException as err:
            print(err)
            continue
        # checks to make sure links aren't being crawled more than once
        # and that all the links are in the proper domain
        if base in url_element and all(url_element not in i for i in [visited, to_crawl]):
            to_crawl.append(url_element)
        # this checks if the link matches the correct url pattern
        if article in url_element and url_element not in articles:
            articles.add(url_element)
            print(str(url_element))
            with open("result.txt", "a") as outf:
                outf.write(str(url_element) + "\n")
    browser.quit()  # guarantees the browser closes completely

while len(to_crawl):
    # popleft makes the deque a FIFO instead of a LIFO;
    # a queue would achieve the same thing
    url_to_crawl = to_crawl.popleft()
    crawl_link(url_to_crawl)
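
To make the popleft/pop distinction in that last comment concrete, here is a minimal, Selenium-free sketch (the URLs and the discovered mapping are made up purely for illustration) showing why popleft() crawls in breadth-first, queue-like (FIFO) order while pop() crawls depth-first, stack-like (LIFO):

from collections import deque

# Hypothetical link structure: the seed page links to /a and /b,
# and /a links to /a/1.
discovered = {
    "https://www.bloomberg.com": ["/a", "/b"],
    "/a": ["/a/1"],
    "/b": [],
    "/a/1": [],
}

to_crawl = deque(["https://www.bloomberg.com"])
order = []
while to_crawl:
    url = to_crawl.popleft()   # swap in pop() here for LIFO/stack behavior
    order.append(url)
    to_crawl.extend(discovered[url])

print(order)
# popleft(): seed, /a, /b, /a/1  -- pages are crawled in discovery order
# pop():     seed, /b, /a, /a/1  -- the crawler dives into the newest links first

Either way the same set of pages eventually gets crawled; it is the deduplication against visited that stops the loop from revisiting pages.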
After running it for 60+ seconds, here is the output in result.txt: https://gist.github.com/alexpdev/b7545970c4e3002b1372e26651301a23
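
One optional tweak, not something the answer above does: because result.txt is opened in append mode, a second run of the script will happily write the same article links again. A minimal sketch, assuming result.txt sits in the working directory, that pre-loads previously saved URLs into the articles set before crawling:

import os

articles = set()

# Seed the set with URLs already saved by a previous run, so they are
# not printed or written to result.txt a second time.
if os.path.exists("result.txt"):
    with open("result.txt") as f:
        articles.update(line.strip() for line in f if line.strip())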