Python - Show Results from all Pages not just the first page (Beautiful Soup)
I have been using Beautiful Soup to build a simple scraper that pulls the food hygiene rating of restaurants based on a postcode entered by the user. The code works and correctly pulls results from the URL.
What I need help with is how to show all of the results, not just the results on the first page.
My code is below:
import requests
from bs4 import BeautifulSoup
pc = input("Please enter postcode")
url = "https://www.scoresonthedoors.org.uk/search.php?name=&address=&postcode="+pc+"&distance=1&search.x=8&search.y=6&gbt_id=0&award_score=&award_range=gt"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
g_data = soup.findAll("div", {"class": "search-result"})
for item in g_data:
    print(item.find_all("a", {"class": "name"})[0].text)
    try:
        print(item.find_all("span", {"class": "address"})[0].text)
    except:
        pass
    try:
        print(item.find_all("div", {"class": "rating-image"})[0].text)
    except:
        pass
By looking at the URL I found that which page is displayed depends on a variable called page in the URL string, e.g.:
https://www.scoresonthedoors.org.uk/search.php?award_sort=ALPHA&name=&address=BT147AL&x=0&y=0&page=2#results
The pagination code for the Next Page button is:
<a style="float: right" href="?award_sort=ALPHA&name=&address=BT147AL&x=0&y=0&page=3#results" rel="next " title="Go forward one page">Next <i class="fa fa-arrow-right fa-3"></i></a>
Is there a way for my code to find out how many pages of results there are and then grab the results from each page?
Would the best solution be to have the code change the URL string to alter "page=" each time (e.g. a for loop), or is there a way to do this using the information in the pagination link code?
Many thanks to anyone who helps with or looks at this question.
You are actually on the right track. Generating the paginated URLs to scrape ahead of time is a good approach.
I have actually written almost the whole thing. The first thing to look at is the find_max_page() function, which extracts the maximum page number from the pagination string. With that number you can generate all the URLs that need to be scraped and scrape them one by one.
Check the code below; almost everything is there.
import requests
from bs4 import BeautifulSoup


class RestaurantScraper(object):

    def __init__(self, pc):
        self.pc = pc  # the input postcode
        self.max_page = self.find_max_page()  # the number of pages available
        self.restaurants = list()  # the final list of restaurants where the scraped data ends up

    def run(self):
        for url in self.generate_pages_to_scrape():
            restaurants_from_url = self.scrape_page(url)
            self.restaurants += restaurants_from_url  # add this page's restaurants to the global list

    def create_url(self):
        """
        Create the core url to scrape.
        :return: A url without pagination (= page 1)
        """
        return "https://www.scoresonthedoors.org.uk/search.php?name=&address=&postcode=" + self.pc + \
               "&distance=1&search.x=8&search.y=6&gbt_id=0&award_score=&award_range=gt"

    def create_paginated_url(self, page_number):
        """
        Create a paginated url.
        :param page_number: pagination (integer)
        :return: A paginated url
        """
        return self.create_url() + "&page={}".format(str(page_number))

    def find_max_page(self):
        """
        Find the number of pages for a specific search.
        :return: The number of pages (integer)
        """
        r = requests.get(self.create_url())
        soup = BeautifulSoup(r.content, "lxml")
        pagination_soup = soup.findAll("div", {"id": "paginator"})
        pagination = pagination_soup[0]
        page_text = pagination("p")[0].text
        return int(page_text.replace('Page 1 of ', ''))

    def generate_pages_to_scrape(self):
        """
        Generate all the paginated urls using the max_page attribute previously scraped.
        :return: List of urls
        """
        return [self.create_paginated_url(page_number) for page_number in range(1, self.max_page + 1)]

    def scrape_page(self, url):
        """
        This comes from your original code snippet. It probably needs a bit of work, but you get the idea.
        :param url: Url to scrape and get data from.
        :return: List of restaurant names found on this page
        """
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "lxml")
        g_data = soup.findAll("div", {"class": "search-result"})
        restaurants = list()
        for item in g_data:
            name = item.find_all("a", {"class": "name"})[0].text
            restaurants.append(name)
            try:
                print(item.find_all("span", {"class": "address"})[0].text)
            except:
                pass
            try:
                print(item.find_all("div", {"class": "rating-image"})[0].text)
            except:
                pass
        return restaurants


if __name__ == '__main__':
    pc = input('Give your post code')
    scraper = RestaurantScraper(pc)
    scraper.run()
    print("{} restaurants scraped".format(str(len(scraper.restaurants))))
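As for the other half of your question: yes, you could also use the pagination link itself instead of pre-computing the page count. A minimal sketch of that alternative is below; it assumes the "Next" link keeps the title="Go forward one page" attribute shown in your question (the scrape_all_pages function and that selector are illustrative, not tested against the live site), and it simply follows the link until it no longer appears.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def scrape_all_pages(start_url):
    """Follow the 'Next' pagination link until it stops appearing."""
    url = start_url
    restaurants = []
    while url:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "lxml")
        # Collect the restaurant names on the current page.
        for item in soup.find_all("div", {"class": "search-result"}):
            restaurants.append(item.find("a", {"class": "name"}).text.strip())
        # The question's markup shows the next-page link carries
        # title="Go forward one page"; adjust if the site differs.
        next_link = soup.find("a", {"title": "Go forward one page"})
        # The href is relative ("?award_sort=...&page=3#results"), so resolve it.
        url = urljoin(url, next_link["href"]) if next_link else None
    return restaurants

This avoids parsing the "Page 1 of N" text, at the cost of fetching the pages strictly one after another.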