在 Scrapy 中通过请求 meta 传递参数时,参数的顺序被打乱
Using request meta in Scrapy passes arguments in different order
我正在尝试从一个包含产品列表的页面中抓取数据。目前我会先获取所有产品链接,再逐个抓取详情页中的信息,但问题是产品的制造商/品牌(manufacturer/brand)只出现在列表页面上,产品详情页面上没有。
我试过通过请求的 meta 把制造商传给回调函数,但制造商数据并没有按对应的顺序传递,导致输出的各行显示了错误的制造商。
这是示例页面:https://www.toolmania.cl/sierras-sable-561
目前的代码如下:
def parse(self, response):
    """Crawl a toolmania.cl listing page.

    Yields one request per product link found on the page (handled by
    ``parse_product``, with the brand attached as ``meta['brand']``),
    followed by a request for the next listing page when a "next" link
    is present.
    """
    link_query = "//a[@class='thumbnail product-thumbnail']/@href"
    brand_query = ".//h4[@class='product-manufacturer']/text()"

    for product_url in response.xpath(link_query).extract():
        # NOTE(review): the brand query is evaluated against the whole
        # response, not against the product being iterated, so the value
        # obtained here does not depend on which product_url is being
        # processed. This is the behaviour the question is about.
        brand = response.xpath(brand_query).get()
        yield scrapy.Request(
            product_url,
            callback=self.parse_product,
            meta={'brand': brand},
        )

    # Follow the pagination link, if the page has one.
    next_page = response.xpath(
        "//li[@class='page-item directional js-search-link']//a[@rel='next']/@href"
    ).get()
    if next_page:
        yield scrapy.Request(url=next_page, callback=self.parse)
def parse_product(self, response):
    """Get details from single product.

    Yields one dict per ``single-product`` container found in the
    response, with the keys ``product_brand``, ``product_model``,
    ``product_price``, ``product_name`` and ``product_link``.

    ``product_brand`` is read from ``response.meta['brand']``, so the
    request that led here must have carried a ``'brand'`` entry in its
    meta. ``product_price`` and ``product_name`` are the lists returned by
    ``.extract()``; ``product_model`` is a single string, or ``None`` when
    the reference element is not present.
    """
    XPATH_SINGLE_PRODUCT = "//div[@class='single-product']"
    # define xpaths for product details (relative to each product node)
    XPATH_PRODUCT_MODEL = ".//h5[@class='product-reference-single']/text()"
    XPATH_PRODUCT_NAME = ".//h1[@class='product-name-single mb-md-4']/text()"
    XPATH_PRODUCT_PRICE = ".//div[@class='product-prices margin__bottom__20']//span[@itemprop='price']/@content"

    for product in response.xpath(XPATH_SINGLE_PRODUCT):
        product_model = product.xpath(XPATH_PRODUCT_MODEL).get()
        # clean product model: .get() gives None when nothing matched, so
        # only strip the label when there is text to clean
        if product_model is not None:
            product_model = product_model.replace('Código de referencia: ', '')
        yield {
            'product_brand': response.meta['brand'],
            'product_model': product_model,
            'product_price': product.xpath(XPATH_PRODUCT_PRICE).extract(),
            'product_name': product.xpath(XPATH_PRODUCT_NAME).extract(),
            'product_link': response.url,
        }
在下面的循环中使用 `product` 而不是 `response`;另外请注意,我这里用的是 css 选择器而不是 xpath:
def parse(self, response):
    """Process toolmania.cl products.

    For each product container on the listing page, read the brand and the
    detail-page link from that same container, so the two always belong to
    the same product, then request the detail page with the brand attached
    as ``meta['brand']`` for ``parse_product`` to pick up.
    """
    products = response.css('div.product-list')
    for product in products:
        # use "product" instead of "response": the query is then scoped to
        # this product's own markup
        brand = product.css('.product-manufacturer::text').get()
        url = product.css(".thumbnail::attr(href)").get()
        if not url:
            # no link found in this container, so there is nothing to request
            continue
        # follow the extracted link (not the selector object itself);
        # response.follow also resolves relative links against the page URL
        yield response.follow(url, callback=self.parse_product, meta={'brand': brand})
我正在尝试从一个包含产品列表的页面中抓取数据。目前我会先获取所有产品链接,再逐个抓取详情页中的信息,但问题是产品的制造商/品牌(manufacturer/brand)只出现在列表页面上,产品详情页面上没有。我试过通过请求的 meta 把制造商传给回调函数,但制造商数据并没有按对应的顺序传递,导致输出的各行显示了错误的制造商。
这是示例页面:https://www.toolmania.cl/sierras-sable-561
目前的代码如下:
def parse(self, response):
    """Crawl a toolmania.cl listing page.

    Yields one request per product link found on the page (handled by
    ``parse_product``, with the brand attached as ``meta['brand']``),
    followed by a request for the next listing page when a "next" link
    is present.
    """
    link_query = "//a[@class='thumbnail product-thumbnail']/@href"
    brand_query = ".//h4[@class='product-manufacturer']/text()"

    for product_url in response.xpath(link_query).extract():
        # NOTE(review): the brand query is evaluated against the whole
        # response, not against the product being iterated, so the value
        # obtained here does not depend on which product_url is being
        # processed. This is the behaviour the question is about.
        brand = response.xpath(brand_query).get()
        yield scrapy.Request(
            product_url,
            callback=self.parse_product,
            meta={'brand': brand},
        )

    # Follow the pagination link, if the page has one.
    next_page = response.xpath(
        "//li[@class='page-item directional js-search-link']//a[@rel='next']/@href"
    ).get()
    if next_page:
        yield scrapy.Request(url=next_page, callback=self.parse)
def parse_product(self, response):
    """Get details from single product.

    Yields one dict per ``single-product`` container found in the
    response, with the keys ``product_brand``, ``product_model``,
    ``product_price``, ``product_name`` and ``product_link``.

    ``product_brand`` is read from ``response.meta['brand']``, so the
    request that led here must have carried a ``'brand'`` entry in its
    meta. ``product_price`` and ``product_name`` are the lists returned by
    ``.extract()``; ``product_model`` is a single string, or ``None`` when
    the reference element is not present.
    """
    XPATH_SINGLE_PRODUCT = "//div[@class='single-product']"
    # define xpaths for product details (relative to each product node)
    XPATH_PRODUCT_MODEL = ".//h5[@class='product-reference-single']/text()"
    XPATH_PRODUCT_NAME = ".//h1[@class='product-name-single mb-md-4']/text()"
    XPATH_PRODUCT_PRICE = ".//div[@class='product-prices margin__bottom__20']//span[@itemprop='price']/@content"

    for product in response.xpath(XPATH_SINGLE_PRODUCT):
        product_model = product.xpath(XPATH_PRODUCT_MODEL).get()
        # clean product model: .get() gives None when nothing matched, so
        # only strip the label when there is text to clean
        if product_model is not None:
            product_model = product_model.replace('Código de referencia: ', '')
        yield {
            'product_brand': response.meta['brand'],
            'product_model': product_model,
            'product_price': product.xpath(XPATH_PRODUCT_PRICE).extract(),
            'product_name': product.xpath(XPATH_PRODUCT_NAME).extract(),
            'product_link': response.url,
        }
在下面的循环中使用 `product` 而不是 `response`;另外请注意,我这里用的是 css 选择器而不是 xpath:
def parse(self, response):
    """Process toolmania.cl products.

    For each product container on the listing page, read the brand and the
    detail-page link from that same container, so the two always belong to
    the same product, then request the detail page with the brand attached
    as ``meta['brand']`` for ``parse_product`` to pick up.
    """
    products = response.css('div.product-list')
    for product in products:
        # use "product" instead of "response": the query is then scoped to
        # this product's own markup
        brand = product.css('.product-manufacturer::text').get()
        url = product.css(".thumbnail::attr(href)").get()
        if not url:
            # no link found in this container, so there is nothing to request
            continue
        # follow the extracted link (not the selector object itself);
        # response.follow also resolves relative links against the page URL
        yield response.follow(url, callback=self.parse_product, meta={'brand': brand})