我如何使用scrapy从bestbuy获取产品的UPC
How can I get the UPC of a product from Best Buy using Scrapy?
Hi there,
I need to scrape Best Buy, and I am currently using Scrapy. I was able to get most of the data I need; however, I ran into problems trying to get the
specification data section, where the UPC is. I was able to get the features, but I am not
able to grab the data from that section.
I would really appreciate your help. This is my code:
from scrapy import Spider
from bestbuy_spider.items import BestbuyProductItem
from scrapy import Request
import re
import json
class Bestbuy2Spider(Spider):
    """Crawl a Best Buy search listing and yield one item per product page.

    Flow: ``parse`` reads the result counter on the start page and schedules
    listing pages, ``parse_product_list`` follows every product link found
    on a listing page, and ``parse_product`` builds a ``BestbuyProductItem``
    from the product page.
    """

    name = 'bestbuy2'
    # allowed_domains = ['https://www.bestbuy.com']
    allowed_domains = ['bestbuy.com']
    # https://www.bestbuy.com/site/searchpage.jsp?cp=1&searchType=search&browsedCategory=pcmcat209400050001&ks=960&sp=-bestsellingsort%20skuidsaas&sc=Global&list=y&usc=All%20Categories&type=page&id=pcat17071&iht=n&nrp=15&seeAll=&st=categoryid%24pcmcat209400050001&qp=carrier_facet%3DCarrier~Verizon
    # start_urls = ['https://www.bestbuy.com/site/laptop-computers/all-laptops/pcmcat138500050001.c?id=pcmcat138500050001']
    start_urls = ['https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&qp=storepickupstores_facet%3DStore%20Availability%20-%20In%20Store%20Pickup~237&st=%2A']

    # Listing-page URL; ``{}`` receives the 1-based page number (``cp``).
    page_url_template = 'https://www.bestbuy.com/site/searchpage.jsp?cp={}&id=pcat17071&qp=storepickupstores_facet%3DStore%20Availability%20-%20In%20Store%20Pickup~237&st=%2A'

    # How many listing pages to request. The original code hard-coded the
    # first page only; set to None to request every page.
    max_pages = 1

    def parse(self, response):
        """Schedule listing-page requests from the start page's result counter.

        The counter text is expected to contain exactly three integers; the
        second is used as the items-per-page count and the third as the
        total number of results. If the counter is missing or has another
        shape, a warning is logged and nothing is scheduled.

        Yields:
            Request: one per listing page, handled by ``parse_product_list``.
        """
        text = response.xpath('//div[@class="left-side"]/span/text()').extract_first()
        # ``text`` is None when the counter element is absent; commas are
        # dropped so a total written as "1,234" still parses as one number.
        numbers = [int(n) for n in re.findall(r'\d+', (text or '').replace(',', ''))]
        if len(numbers) != 3 or numbers[1] == 0:
            self.logger.warning(
                'Unexpected result counter %r on %s', text, response.url)
            return
        _, items_page, total = numbers

        # Ceiling division: a final, partially filled page still counts.
        num_pages = -(-total // items_page)

        urls = [self.page_url_template.format(page)
                for page in range(1, num_pages + 1)]
        for url in urls[:self.max_pages]:
            # product list page
            yield Request(url=url, callback=self.parse_product_list)

    def parse_product_list(self, response):
        """Follow the product link of every row on a listing page.

        Rows without a title link are skipped.

        Yields:
            Request: one per product link, handled by ``parse_product``.
        """
        # product list
        rows = response.xpath('//ol[@class="sku-item-list"]/li')
        for row in rows:
            url = row.xpath('.//div[@class="sku-title"]/h4/a/@href').extract_first()
            if not url:
                # Concatenating a missing href would request '...comNone'.
                continue
            self.logger.debug('Product URL: %s', url)
            # urljoin resolves the href against the page URL, so both
            # relative and absolute links produce a valid request URL.
            yield Request(url=response.urljoin(url), callback=self.parse_product)

    # '//ul[@Class="thumbnail-list"]//@src'
    def parse_product(self, response):
        """Build a ``BestbuyProductItem`` from a product page.

        Only ``product`` (the text of the title heading) and ``Location``
        (the span texts under the fulfillment summary) are populated; the
        remaining extractions are kept below, disabled, for reference.

        Yields:
            BestbuyProductItem: the scraped item.
        """
        item = BestbuyProductItem(
            product=response.xpath('//div[@class="sku-title"]/h1/text()').extract_first(),
            # color = response.xpath('li[@class="image selected"]/div/a/@title').extract_first(),
            # skuId = response.xpath('//div[@class="sku product-data"]/span[2]/text()').extract_first(),
            # price = response.xpath('//div[@class="priceView-hero-price priceView-customer-price"]/span[1]/text()').extract_first(),
            # model = response.xpath('//div[@class="model product-data"]/span[2]/text()').extract_first(),
            # main_image = response.xpath('//img[@class="primary-image"]/@src').extract_first(),
            # images = response.xpath('//*[@class="thumbnail-list"]//img/@src').extract(),
            # description = response.xpath('//div[@class="long-description-container body-copy "]//div/text()').extract(),
            # features = response.xpath('//div[@class="list-row"]/p/text()').extract(),
            # regular_price = response.xpath('//div[@class="pricing-price__regular-price"]/text()').extract_first(),
            #     (optionally followed by .replace('Was ', ''))
            Location=response.xpath('//div[@class="fulfillment-fulfillment-summary"]//div/p[1]/span/text()').extract(),
        )
        yield item
查看一个产品页面代码 (https://www.bestbuy.com/site/sony-65-class-bravia-xr-x95j-4k-uhd-smart-google-tv/6459306.p?skuId=6459306),我注意到 json 带有 gtin13 字段(您要查找的 upc 代码)。应该很容易用 json 模块解析它并得到你需要的东西。
{
"@context":"http://schema.org/",
"@type":"Product",
"name":"Sony - 65\" class BRAVIA XR X95J 4K UHD Smart Google TV",
"image":"https://pisces.bbystatic.com/image2/BestBuy_US/images/products/6459/6459306_sd.jpg",
"url":"https://www.bestbuy.com/site/sony-65-class-bravia-xr-x95j-4k-uhd-smart-google-tv/6459306.p?skuId=6459306",
"description":"Shop Sony 65\" class BRAVIA XR X95J 4K UHD Smart Google TV at Best Buy. Find low everyday prices and buy online for delivery or in-store pick-up. Price Match Guarantee.",
"sku":"6459306",
"gtin13":"0027242921818",
"model":"XR65X95J",
"width":{
"@type":"http://schema.org/QuantitativeValue",
"unitCode":"INH",
"value":"56.87"
},
"color":"Black",
"brand":{
"@type":"Brand",
"name":"Sony"
},
"aggregateRating":{
"@type":"AggregateRating",
"ratingValue":"4.7",
"reviewCount":"221"
},
"offers":{
"@type":"AggregateOffer",
"priceCurrency":"USD",
"seller":{
"@type":"Organization",
"name":"Best Buy"
},
"lowPrice":"1184.99",
"highPrice":"1499.99",
"offercount":5,
"offers":[
{
"@type":"Offer",
"priceCurrency":"USD",
"price":"1499.99",
"availability":"http://schema.org/InStock",
"itemCondition":"http://schema.org/NewCondition",
"description":"New"
},
{
"@type":"Offer",
"priceCurrency":"USD",
"price":"1319.99",
"itemCondition":"http://schema.org/UsedCondition",
"description":"Open-Box Excellent - Certified"
},
{
"@type":"Offer",
"priceCurrency":"USD",
"price":"1274.99",
"itemCondition":"http://schema.org/UsedCondition",
"description":"Open-Box Excellent"
},
{
"@type":"Offer",
"priceCurrency":"USD",
"price":"1229.99",
"itemCondition":"http://schema.org/UsedCondition",
"description":"Open-Box Satisfactory"
},
{
"@type":"Offer",
"priceCurrency":"USD",
"price":"1184.99",
"itemCondition":"http://schema.org/UsedCondition",
"description":"Open-Box Fair"
}
]
}
}
Hi there,
I need to scrape Best Buy, and I am currently using Scrapy. I was able to get most of the data I need; however, I ran into problems trying to get the specification data section, where the UPC is. I was able to get the features, but I am not able to grab the data from that section.
I would really appreciate your help. This is my code:
from scrapy import Spider
from bestbuy_spider.items import BestbuyProductItem
from scrapy import Request
import re
import json
class Bestbuy2Spider(Spider):
    """Crawl a Best Buy search listing and yield one item per product page.

    Flow: ``parse`` reads the result counter on the start page and schedules
    listing pages, ``parse_product_list`` follows every product link found
    on a listing page, and ``parse_product`` builds a ``BestbuyProductItem``
    from the product page.
    """

    name = 'bestbuy2'
    # allowed_domains = ['https://www.bestbuy.com']
    allowed_domains = ['bestbuy.com']
    # https://www.bestbuy.com/site/searchpage.jsp?cp=1&searchType=search&browsedCategory=pcmcat209400050001&ks=960&sp=-bestsellingsort%20skuidsaas&sc=Global&list=y&usc=All%20Categories&type=page&id=pcat17071&iht=n&nrp=15&seeAll=&st=categoryid%24pcmcat209400050001&qp=carrier_facet%3DCarrier~Verizon
    # start_urls = ['https://www.bestbuy.com/site/laptop-computers/all-laptops/pcmcat138500050001.c?id=pcmcat138500050001']
    start_urls = ['https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&qp=storepickupstores_facet%3DStore%20Availability%20-%20In%20Store%20Pickup~237&st=%2A']

    # Listing-page URL; ``{}`` receives the 1-based page number (``cp``).
    page_url_template = 'https://www.bestbuy.com/site/searchpage.jsp?cp={}&id=pcat17071&qp=storepickupstores_facet%3DStore%20Availability%20-%20In%20Store%20Pickup~237&st=%2A'

    # How many listing pages to request. The original code hard-coded the
    # first page only; set to None to request every page.
    max_pages = 1

    def parse(self, response):
        """Schedule listing-page requests from the start page's result counter.

        The counter text is expected to contain exactly three integers; the
        second is used as the items-per-page count and the third as the
        total number of results. If the counter is missing or has another
        shape, a warning is logged and nothing is scheduled.

        Yields:
            Request: one per listing page, handled by ``parse_product_list``.
        """
        text = response.xpath('//div[@class="left-side"]/span/text()').extract_first()
        # ``text`` is None when the counter element is absent; commas are
        # dropped so a total written as "1,234" still parses as one number.
        numbers = [int(n) for n in re.findall(r'\d+', (text or '').replace(',', ''))]
        if len(numbers) != 3 or numbers[1] == 0:
            self.logger.warning(
                'Unexpected result counter %r on %s', text, response.url)
            return
        _, items_page, total = numbers

        # Ceiling division: a final, partially filled page still counts.
        num_pages = -(-total // items_page)

        urls = [self.page_url_template.format(page)
                for page in range(1, num_pages + 1)]
        for url in urls[:self.max_pages]:
            # product list page
            yield Request(url=url, callback=self.parse_product_list)

    def parse_product_list(self, response):
        """Follow the product link of every row on a listing page.

        Rows without a title link are skipped.

        Yields:
            Request: one per product link, handled by ``parse_product``.
        """
        # product list
        rows = response.xpath('//ol[@class="sku-item-list"]/li')
        for row in rows:
            url = row.xpath('.//div[@class="sku-title"]/h4/a/@href').extract_first()
            if not url:
                # Concatenating a missing href would request '...comNone'.
                continue
            self.logger.debug('Product URL: %s', url)
            # urljoin resolves the href against the page URL, so both
            # relative and absolute links produce a valid request URL.
            yield Request(url=response.urljoin(url), callback=self.parse_product)

    # '//ul[@Class="thumbnail-list"]//@src'
    def parse_product(self, response):
        """Build a ``BestbuyProductItem`` from a product page.

        Only ``product`` (the text of the title heading) and ``Location``
        (the span texts under the fulfillment summary) are populated; the
        remaining extractions are kept below, disabled, for reference.

        Yields:
            BestbuyProductItem: the scraped item.
        """
        item = BestbuyProductItem(
            product=response.xpath('//div[@class="sku-title"]/h1/text()').extract_first(),
            # color = response.xpath('li[@class="image selected"]/div/a/@title').extract_first(),
            # skuId = response.xpath('//div[@class="sku product-data"]/span[2]/text()').extract_first(),
            # price = response.xpath('//div[@class="priceView-hero-price priceView-customer-price"]/span[1]/text()').extract_first(),
            # model = response.xpath('//div[@class="model product-data"]/span[2]/text()').extract_first(),
            # main_image = response.xpath('//img[@class="primary-image"]/@src').extract_first(),
            # images = response.xpath('//*[@class="thumbnail-list"]//img/@src').extract(),
            # description = response.xpath('//div[@class="long-description-container body-copy "]//div/text()').extract(),
            # features = response.xpath('//div[@class="list-row"]/p/text()').extract(),
            # regular_price = response.xpath('//div[@class="pricing-price__regular-price"]/text()').extract_first(),
            #     (optionally followed by .replace('Was ', ''))
            Location=response.xpath('//div[@class="fulfillment-fulfillment-summary"]//div/p[1]/span/text()').extract(),
        )
        yield item
查看一个产品页面代码 (https://www.bestbuy.com/site/sony-65-class-bravia-xr-x95j-4k-uhd-smart-google-tv/6459306.p?skuId=6459306),我注意到 json 带有 gtin13 字段(您要查找的 upc 代码)。应该很容易用 json 模块解析它并得到你需要的东西。
{
"@context":"http://schema.org/",
"@type":"Product",
"name":"Sony - 65\" class BRAVIA XR X95J 4K UHD Smart Google TV",
"image":"https://pisces.bbystatic.com/image2/BestBuy_US/images/products/6459/6459306_sd.jpg",
"url":"https://www.bestbuy.com/site/sony-65-class-bravia-xr-x95j-4k-uhd-smart-google-tv/6459306.p?skuId=6459306",
"description":"Shop Sony 65\" class BRAVIA XR X95J 4K UHD Smart Google TV at Best Buy. Find low everyday prices and buy online for delivery or in-store pick-up. Price Match Guarantee.",
"sku":"6459306",
"gtin13":"0027242921818",
"model":"XR65X95J",
"width":{
"@type":"http://schema.org/QuantitativeValue",
"unitCode":"INH",
"value":"56.87"
},
"color":"Black",
"brand":{
"@type":"Brand",
"name":"Sony"
},
"aggregateRating":{
"@type":"AggregateRating",
"ratingValue":"4.7",
"reviewCount":"221"
},
"offers":{
"@type":"AggregateOffer",
"priceCurrency":"USD",
"seller":{
"@type":"Organization",
"name":"Best Buy"
},
"lowPrice":"1184.99",
"highPrice":"1499.99",
"offercount":5,
"offers":[
{
"@type":"Offer",
"priceCurrency":"USD",
"price":"1499.99",
"availability":"http://schema.org/InStock",
"itemCondition":"http://schema.org/NewCondition",
"description":"New"
},
{
"@type":"Offer",
"priceCurrency":"USD",
"price":"1319.99",
"itemCondition":"http://schema.org/UsedCondition",
"description":"Open-Box Excellent - Certified"
},
{
"@type":"Offer",
"priceCurrency":"USD",
"price":"1274.99",
"itemCondition":"http://schema.org/UsedCondition",
"description":"Open-Box Excellent"
},
{
"@type":"Offer",
"priceCurrency":"USD",
"price":"1229.99",
"itemCondition":"http://schema.org/UsedCondition",
"description":"Open-Box Satisfactory"
},
{
"@type":"Offer",
"priceCurrency":"USD",
"price":"1184.99",
"itemCondition":"http://schema.org/UsedCondition",
"description":"Open-Box Fair"
}
]
}
}