How do I scrape more Amazon products with my Python script using beautifulsoup4?
I have code that fetches the top 20 products rated 3 stars and up. I need help extending it to 2000 products. Please review my code. I was thinking of grabbing the links to the page numbers on the Amazon site and looping until I have 2000 products, but I can't figure out how to build the loop.

Here is my code:
import requests
from bs4 import BeautifulSoup
import re

headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'}

def star_link():
    search = input("Please enter the category or search term: ")
    url1 = "https://www.amazon.in/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=" + search
    r1 = requests.get(url1, headers=headers)
    soup1 = BeautifulSoup(r1.text, 'lxml')
    links = soup1.find_all('i', 'a-star-3')
    for link in links:
        a_tag = link.parent
        a_link = a_tag['href']
        return a_link

def products():
    url2 = "https://www.amazon.in/" + star_link()
    r2 = requests.get(url2, headers=headers)
    soup2 = BeautifulSoup(r2.text, 'lxml')
    link_title = {}
    print('The top 2000 products of your selected category with 3 stars and up are: ')
    print('\n')
    for n in range(0, 20):
        contents = soup2.findAll('li', id='result_' + str(n))
        for content in contents:
            links = [a['href'] for a in content.find_all(lambda i: i.get('href') and i.text)]
            titles = content.find_all('h2')
            for link in links:
                for title in titles:
                    link_title.update({title.text: link})
    for title in link_title:
        print(title + ':' + link_title[title])

products()
Use while True to run the loop indefinitely, and break out of it as soon as a page returns no data:

data = soup.select('.result-info')
if not data:
    print('END: no data:')
    break
By the way:

- Even if you use s=0 in the url, the server still serves the first page, so you don't have to check if i == 0.
- s={} can appear anywhere after the ?, so it can go at the end to make the code more readable.
- You imported the csv module but didn't use it. I use it here, and I don't need replace(",", " "), because if an item contains a comma, csv.writer wraps the field in double quotes (see the small demo after this list).
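A minimal demo of that quoting behavior (the file name and row values are made up for illustration):

import csv

with open('demo.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # a field containing a comma is wrapped in double quotes automatically
    writer.writerow(['Xbox One, 500GB', '$150'])

# demo.csv now contains the line: "Xbox One, 500GB",$150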
Full code:
import requests
from bs4 import BeautifulSoup
import csv

filename = "output.csv"
f = open(filename, 'w', newline="", encoding='utf-8')
csvwriter = csv.writer(f)
csvwriter.writerow(["Date", "Location", "Title", "Price"])

offset = 0

while True:
    print('offset:', offset)
    url = "https://portland.craigslist.org/search/sss?query=xbox&sort=date&s={}".format(offset)

    response = requests.get(url)
    if response.status_code != 200:
        print('END: request status:', response.status_code)
        break

    soup = BeautifulSoup(response.text, 'html.parser')

    data = soup.select('.result-info')
    if not data:
        print('END: no data:')
        break

    for container in data:
        date = container.select('.result-date')[0].text

        try:
            location = container.select('.result-hood')[0].text
        except IndexError:
            try:
                location = container.select('.nearby')[0].text
            except IndexError:
                location = ''
        #location = location.replace(","," ") # don't need it with `csvwriter`

        title = container.select('.result-title')[0].text

        try:
            price = container.select('.result-price')[0].text
        except IndexError:
            price = ''
        #title.replace(",", " ") # don't need it with `csvwriter`

        print(date, location, title, price)
        csvwriter.writerow([date, location, title, price])

    offset += 120

f.close()
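The same while/offset pattern should carry over to the Amazon search in the question. Below is a minimal sketch, assuming Amazon's search URL accepts a page={} query parameter and that result items still look like the li elements with result_* ids from the original code; both are assumptions, since Amazon changes its markup often and may block scrapers:

import requests
from bs4 import BeautifulSoup

headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'}

def scrape_products(search, limit=2000):
    link_title = {}
    page = 1
    while len(link_title) < limit:
        # assumption: page={} selects the result page, like s={} on Craigslist
        url = ("https://www.amazon.in/s/ref=nb_sb_noss?url=search-alias%3Daps"
               "&field-keywords={}&page={}".format(search, page))
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print('END: request status:', response.status_code)
            break
        soup = BeautifulSoup(response.text, 'lxml')
        # assumption: result items are still li elements with ids like result_0
        results = soup.find_all('li', id=lambda i: i and i.startswith('result_'))
        if not results:
            print('END: no data')
            break
        for content in results:
            for h2 in content.find_all('h2'):
                a_tag = h2.find_parent('a')  # the product link wrapping the title
                if a_tag and a_tag.get('href'):
                    link_title[h2.text] = a_tag['href']
        page += 1
    return link_title

As with the Craigslist version, the loop ends either when enough products have been collected or when a page comes back empty.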