检索特定产品的亚马逊评论
Retrieve Amazon Reviews for a particular product
我目前正在进行一个研究项目,该项目需要分析对特定产品的评论并全面了解该产品。
听说亚马逊是获取产品 reviews/comments(评论)的好地方。有什么方法可以通过 API 从亚马逊检索这些用户 reviews/comments 吗?我尝试了几段 Python 代码,但都不起作用。如果没有可用于检索数据的 API,我需要自己写一个爬虫(spider)吗?
是否有其他方法或途径(approaches/places)可以检索给定产品的用户评论?
Beautiful Soup 是一个 Python 库,它允许您从页面(如亚马逊产品页面)中提取 HTML 数据并对其进行解析。该库应该可以让您直接从页面获取评论部分。这是文档的链接:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
www.Scrapehero.com 有一个关于如何抓取亚马逊产品详细信息的很棒的教程:How To Scrape Amazon Product Details and Pricing using Python
他们使用的完整纯文本代码是...
产品由其 ASIN 标识,因此请将数组值更改为您感兴趣的产品。
from lxml import html
import csv,os,json
import requests
from exceptions import ValueError
from time import sleep
def AmzonParser(url, max_retries=5, retry_delay=3, timeout=30):
    """Fetch an Amazon product page and extract basic product details.

    The page is requested with a desktop-browser User-Agent, parsed with
    lxml, and the name, prices, category and availability are read via XPath.
    The request is re-issued on every attempt, so a failed or blocked
    response is not re-parsed over and over.

    Args:
        url: Product page URL to fetch.
        max_retries: Maximum number of fetch/parse attempts before giving up.
        retry_delay: Seconds to sleep before each attempt.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with the keys NAME, SALE_PRICE, CATEGORY, ORIGINAL_PRICE,
        AVAILABILITY and URL (fields not found on the page are None), or
        None when every attempt failed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}

    XPATH_NAME = '//h1[@id="title"]//text()'
    XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
    XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
    XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
    XPATH_AVAILABILITY = '//div[@id="availability"]//text()'

    for _ in range(max_retries):
        # Pause before every attempt to throttle requests to the site.
        sleep(retry_delay)
        try:
            page = requests.get(url, headers=headers, timeout=timeout)
            if page.status_code != 200:
                raise ValueError('captha')

            doc = html.fromstring(page.content)
            raw_name = doc.xpath(XPATH_NAME)
            raw_sale_price = doc.xpath(XPATH_SALE_PRICE)
            raw_category = doc.xpath(XPATH_CATEGORY)
            raw_original_price = doc.xpath(XPATH_ORIGINAL_PRICE)
            raw_availability = doc.xpath(XPATH_AVAILABILITY)

            # Collapse runs of whitespace inside the extracted text nodes.
            name = ' '.join(''.join(raw_name).split()) if raw_name else None
            sale_price = ' '.join(''.join(raw_sale_price).split()).strip() if raw_sale_price else None
            category = ' > '.join([part.strip() for part in raw_category]) if raw_category else None
            original_price = ''.join(raw_original_price).strip() if raw_original_price else None
            availability = ''.join(raw_availability).strip() if raw_availability else None

            # Fall back to the sale price when no list price is shown.
            if not original_price:
                original_price = sale_price

            return {
                'NAME': name,
                'SALE_PRICE': sale_price,
                'CATEGORY': category,
                'ORIGINAL_PRICE': original_price,
                'AVAILABILITY': availability,
                'URL': url,
            }
        except Exception as e:
            # Best-effort scraper: report the failure and retry with a
            # fresh request instead of aborting the whole run.
            print(e)
    return None
def ReadAsin(asin_list=None, output_path='data.json'):
    """Scrape a list of Amazon products and dump the results as JSON.

    Each ASIN is turned into a product URL, passed to AmzonParser, and the
    collected results are written to ``output_path`` once all products have
    been processed.

    Args:
        asin_list: Iterable of ASIN strings to scrape. Defaults to the
            built-in sample list below.
        output_path: Path of the JSON file to write.
    """
    if asin_list is None:
        # Default sample products; an ASIN feed (e.g. a CSV file) could be
        # loaded and passed in via ``asin_list`` instead.
        asin_list = [
            'B0046UR4F4',
            'B00JGTVU5A',
            'B00GJYCIVK',
            'B00EPGK7CQ',
            'B00EPGKA4G',
            'B00YW5DLB4',
            'B00KGD0628',
            'B00O9A48N2',
            'B00O9A4MEW',
            'B00UZKG8QU',
        ]

    extracted_data = []
    for asin in asin_list:
        url = "http://www.amazon.com/dp/" + asin
        print("Processing: " + url)
        extracted_data.append(AmzonParser(url))
        # Pause between products to avoid hammering the site.
        sleep(5)

    # The context manager guarantees the file is flushed and closed.
    with open(output_path, 'w') as f:
        json.dump(extracted_data, f, indent=4)


if __name__ == "__main__":
    ReadAsin()
如果您需要定期抓取并浏览多个产品页面,我建议您在 ScrapeHero 脚本的基础上进行扩展,并在请求的 "headers" 中使用 https://pypi.org/project/fake-useragent/ 生成的 User-Agent。
否则,如果您只是偶尔需要下载评论,您可以使用 https://reviewi.me。这是一个免费的基于网络的工具,适用于多个亚马逊网站,并允许导出为 CSV、XLSX 和 JSON
我目前正在进行一个研究项目,该项目需要分析对特定产品的评论并全面了解该产品。
听说亚马逊是获取产品 reviews/comments(评论)的好地方。有什么方法可以通过 API 从亚马逊检索这些用户 reviews/comments 吗?我尝试了几段 Python 代码,但都不起作用。如果没有可用于检索数据的 API,我需要自己写一个爬虫(spider)吗?
是否有其他方法或途径(approaches/places)可以检索给定产品的用户评论?
Beautiful Soup 是一个 Python 库,它允许您从页面(如亚马逊产品页面)中提取 HTML 数据并对其进行解析。该库应该可以让您直接从页面获取评论部分。这是文档的链接: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
www.Scrapehero.com 有一个关于如何抓取亚马逊产品详细信息的很棒的教程:How To Scrape Amazon Product Details and Pricing using Python
他们使用的完整纯文本代码是... 产品由其 ASIN 标识,因此请将数组值更改为您感兴趣的产品。
from lxml import html
import csv,os,json
import requests
from exceptions import ValueError
from time import sleep
def AmzonParser(url, max_retries=5, retry_delay=3, timeout=30):
    """Fetch an Amazon product page and extract basic product details.

    The page is requested with a desktop-browser User-Agent, parsed with
    lxml, and the name, prices, category and availability are read via XPath.
    The request is re-issued on every attempt, so a failed or blocked
    response is not re-parsed over and over.

    Args:
        url: Product page URL to fetch.
        max_retries: Maximum number of fetch/parse attempts before giving up.
        retry_delay: Seconds to sleep before each attempt.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with the keys NAME, SALE_PRICE, CATEGORY, ORIGINAL_PRICE,
        AVAILABILITY and URL (fields not found on the page are None), or
        None when every attempt failed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}

    XPATH_NAME = '//h1[@id="title"]//text()'
    XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
    XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
    XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
    XPATH_AVAILABILITY = '//div[@id="availability"]//text()'

    for _ in range(max_retries):
        # Pause before every attempt to throttle requests to the site.
        sleep(retry_delay)
        try:
            page = requests.get(url, headers=headers, timeout=timeout)
            if page.status_code != 200:
                raise ValueError('captha')

            doc = html.fromstring(page.content)
            raw_name = doc.xpath(XPATH_NAME)
            raw_sale_price = doc.xpath(XPATH_SALE_PRICE)
            raw_category = doc.xpath(XPATH_CATEGORY)
            raw_original_price = doc.xpath(XPATH_ORIGINAL_PRICE)
            raw_availability = doc.xpath(XPATH_AVAILABILITY)

            # Collapse runs of whitespace inside the extracted text nodes.
            name = ' '.join(''.join(raw_name).split()) if raw_name else None
            sale_price = ' '.join(''.join(raw_sale_price).split()).strip() if raw_sale_price else None
            category = ' > '.join([part.strip() for part in raw_category]) if raw_category else None
            original_price = ''.join(raw_original_price).strip() if raw_original_price else None
            availability = ''.join(raw_availability).strip() if raw_availability else None

            # Fall back to the sale price when no list price is shown.
            if not original_price:
                original_price = sale_price

            return {
                'NAME': name,
                'SALE_PRICE': sale_price,
                'CATEGORY': category,
                'ORIGINAL_PRICE': original_price,
                'AVAILABILITY': availability,
                'URL': url,
            }
        except Exception as e:
            # Best-effort scraper: report the failure and retry with a
            # fresh request instead of aborting the whole run.
            print(e)
    return None
def ReadAsin(asin_list=None, output_path='data.json'):
    """Scrape a list of Amazon products and dump the results as JSON.

    Each ASIN is turned into a product URL, passed to AmzonParser, and the
    collected results are written to ``output_path`` once all products have
    been processed.

    Args:
        asin_list: Iterable of ASIN strings to scrape. Defaults to the
            built-in sample list below.
        output_path: Path of the JSON file to write.
    """
    if asin_list is None:
        # Default sample products; an ASIN feed (e.g. a CSV file) could be
        # loaded and passed in via ``asin_list`` instead.
        asin_list = [
            'B0046UR4F4',
            'B00JGTVU5A',
            'B00GJYCIVK',
            'B00EPGK7CQ',
            'B00EPGKA4G',
            'B00YW5DLB4',
            'B00KGD0628',
            'B00O9A48N2',
            'B00O9A4MEW',
            'B00UZKG8QU',
        ]

    extracted_data = []
    for asin in asin_list:
        url = "http://www.amazon.com/dp/" + asin
        print("Processing: " + url)
        extracted_data.append(AmzonParser(url))
        # Pause between products to avoid hammering the site.
        sleep(5)

    # The context manager guarantees the file is flushed and closed.
    with open(output_path, 'w') as f:
        json.dump(extracted_data, f, indent=4)


if __name__ == "__main__":
    ReadAsin()
如果您需要定期抓取并浏览多个产品页面,我建议您在 ScrapeHero 脚本的基础上进行扩展,并在请求的 "headers" 中使用 https://pypi.org/project/fake-useragent/ 生成的 User-Agent。
否则,如果您只是偶尔需要下载评论,您可以使用 https://reviewi.me。这是一个免费的基于网络的工具,适用于多个亚马逊网站,并允许导出为 CSV、XLSX 和 JSON