How to fetch all data and parse it using meta in Scrapy?
I want to save all the data in one JSON file.
How do I parse my data using meta?
I am not sure whether my meta format is correct.
In the end I want all of the data in the JSON file (what I pass through meta together with what I parse in parse_v).
Please help me solve this problem.
I am adding my full code below; I hope you can spot the problem.
import json
import scrapy
import time
import chompjs
from scrapy import Request
from scrapy.crawler import CrawlerProcess


class TrendyolSpider(scrapy.Spider):
    name = 'data'
    start_urls = ['https://www.trendyol.com/join-us/straplez-firfirli-simli-astarli-triko-elbise-krem-p-41896200']

    def final_parse(self, response):
        abc = response.xpath("//p/script[contains(@type,'application/ld+json')]/text()").extract_first()
        json_text = json.loads(abc)
        img = json_text.get('image')
        products = response.css('div.pd-app-container')
        for product in products:
            category = product.css('div.breadcrumb>a:nth-child(3)+ a.breadcrumb-item span::text').get(),
            product_name = product.css("h1.pr-new-br ::text").getall(),
            price = product.css('div.pr-bx-nm span.prc-org::text').get().replace("TL", ""),
            discount_price = product.css('div.pr-bx-nm span.prc-slg::text').get().replace("TL", ""),
            brand = response.css("div.sl-nm a::text").get(),
            image = img,
            size = product.css("div.pr-in-at-sp ::text").getall(),
            product_information = product.css("div.pr-in-dt-cn ::text").getall(),
            product_features = product.css("div.pr-prop-content ::text").getall(),
            all_info = response.xpath("//script[contains(@type,'application/javascript')]/text()").extract_first()
            product_json = chompjs.parse_js_object(all_info)
            ides = product_json['product']['productGroupId']
            varient_url = "https://public.trendyol.com/discovery-web-productgw-service/api/productGroup/" + str(ides)
            yield Request(url=varient_url, callback=self.parse_v, meta={
                'category': category,
                'product_name': product_name,
                'price': price,
                'discount_price': discount_price,
                'brand': brand,
                'image': image,
                'size': size,
                'product_information': product_information,
                'product_features': product_features,
            })

    def parse_v(self, response):
        json_tex5 = json.loads(response.body)
        dataa = json_tex5.get('result').get("slicingAttributes")[0].get("attributes")
        yield {
            'category': response.meta['category'],
            'product_name': response.meta['product_name'],
            'price': response.meta['price'],
            'discount_price': response.meta['discount_price'],
            'brand': response.meta['brand'],
            'image': response.meta['image'],
            'size': response.meta['size'],
            'product_information': response.meta['product_information'],
            'product_features': response.meta['product_features'],
            'renk': dataa
        }
Based on your question, here is the answer:
If you want to pass data from one parse method to another with meta, create a key for each value and inject the key/value pairs into the Request through its meta argument. Then, in parse_v, read the earlier values back with response.meta and yield them under new keys, e.g. 'Category': response.meta['cat']. Also note that the first callback has to be named parse (Scrapy's default callback for requests built from start_urls), so your final_parse method was never being called.
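Here is the pattern in isolation first: a minimal sketch with placeholder URLs, selectors, and names (nothing in it comes from the Trendyol pages), followed by the full spider.

    import scrapy
    from scrapy import Request


    class MetaDemoSpider(scrapy.Spider):
        # Illustration only: name and URLs are placeholders
        name = 'meta_demo'
        start_urls = ['https://example.com/product']

        def parse(self, response):
            # Extract whatever you need in the first callback...
            title = response.css('h1::text').get()
            # ...then attach it to the next request, one meta key per value.
            yield Request(
                url='https://example.com/variants',
                callback=self.parse_variants,
                meta={'title': title},
            )

        def parse_variants(self, response):
            # Read the earlier value back and yield it together with the new data.
            yield {
                'title': response.meta['title'],
                'variants': response.css('li.variant::text').getall(),
            }

The spider below applies exactly this pattern to your selectors and the product-group API.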
class TrendyolSpider(scrapy.Spider):
    name = 'data'
    start_urls = [
        'https://www.trendyol.com/olalook/kadin-siyah-cepli-minik-beyaz-cicekli-klos-elbise-elb-19000480-p-6635101']

    def parse(self, response):
        text = response.xpath(
            "//p/script[contains(@type,'application/ld+json')]/text()").extract_first()
        json_text = json.loads(text)
        # TrendyolItem is assumed to be defined in the project's items.py
        items = TrendyolItem()
        products = response.css('div.pd-app-container')
        for product in products:
            category = product.css(
                'div.breadcrumb>a:nth-child(3)+ a.breadcrumb-item span::text').get()
            product_name = product.css('div.pr-in-cn h1.pr-new-br::text').get() + " " + product.css(
                'div.pr-in-cn h1.pr-new-br span::text').get()
            price = product.css(
                'div.pr-bx-nm span.prc-org::text').get().replace("TL", "")
            discount_price = product.css(
                'div.pr-bx-nm span.prc-slg::text').get().replace("TL", "")
            brand = response.css("div.sl-nm a::text").get()
            image = json_text.get('image')
            size = product.css("div.pr-in-at-sp ::text").getall()
            product_information = product.css(
                "div.pr-in-dt-cn ::text").getall()
            product_features = product.css(
                "div.pr-prop-content ::text").getall()
            items['category'] = category
            items['product_name'] = product_name
            items['price'] = price
            items['discount_price'] = discount_price
            items['brand'] = brand
            items['image'] = image
            items['size'] = size
            items['product_information'] = product_information
            items['product_features'] = product_features
            all_info = response.xpath(
                "//script[contains(@type,'application/javascript')]/text()").extract_first()
            product_json = chompjs.parse_js_object(all_info)
            ides = product_json['product']['productGroupId']
            varient_url = "https://public.trendyol.com/discovery-web-productgw-service/api/productGroup/" + \
                str(ides)
            # Pass every extracted value on to the next callback, one meta key per value
            yield Request(
                url=varient_url,
                callback=self.parse_v,
                meta={'cat': category, 'pro_name': product_name, 'p': price, 'dis_price': discount_price,
                      'bra': brand, 'ima': image, 'si': size, 'porduct_info': product_information,
                      'features': product_features}
            )

    def parse_v(self, response):
        json_tex5 = json.loads(response.body)
        dataa = json_tex5.get('result').get(
            "slicingAttributes")[0].get("attributes")
        for i in dataa:
            # start_urls is a list, so take its first element before concatenating strings
            all_info = self.start_urls[0] + i['contents'][0]['url'] + "https://cdn.dsmcdn.com" + i['contents'][0]['imageUrl'] \
                + i['contents'][0]['price']['discountedPrice']['text'] + \
                i['contents'][0]['price']['originalPrice']['text']
            yield {
                'Category': response.meta['cat'],
                'Product_name': response.meta['pro_name'],
                'Price': response.meta['p'],
                'Discount_price': response.meta['dis_price'],
                'Brand': response.meta['bra'],
                'Image': response.meta['ima'],
                'Size': response.meta['si'],
                'Product_information': response.meta['porduct_info'],
                'Product_features': response.meta['features'],
                'rank': all_info
            }
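Finally, to get everything that parse_v yields into a single JSON file, nothing extra is needed in the callbacks: Scrapy's feed exports handle it. A minimal sketch, assuming Scrapy 2.1+ (for the FEEDS setting) and that you run the spider from a script; the file name products.json is just an example:

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # Every item the spider yields is written to this file as a JSON array
        'FEEDS': {
            'products.json': {'format': 'json'},
        },
    })
    process.crawl(TrendyolSpider)
    process.start()

Alternatively, run it from the command line with scrapy crawl data -o products.json.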