在 Scrapy 中使用请求负载(request payload)发送 POST 请求
Post request by using request payload in scrapy
如何抓取这个网站?我如何发送带请求负载的 POST 请求并从响应中获取数据?
如果我使用此代码,我可以抓取第一页,但如何抓取第二页?我需要使用 Selenium,还是用 Scrapy 就足够了?
import scrapy
from scrapy import log
from scrapy.http import *
import urllib2
class myntra_spider(scrapy.Spider):
name="myntra"
allowed_domain=[]
start_urls=["http://www.myntra.com/men-footwear"]
logfile=open('testlog.log','w')
log_observer=log.ScrapyFileLogObserver(logfile,level=log.ERROR)
log_observer.start()
# sub_category=[]
def parse(self,response):
print "response url ",response.url
link=response.xpath("//ul[@class='results small']/li/a/@href").extract()
print links
yield Request('http://www.myntra.com/search-service/searchservice/search/filteredSearch', callback=self.nextpages,body="")
def nextpages(self,response):
link=response.xpath("//ul[@class='results small']/li/a/@href").extract()
for i in range(10):
print "link ",link[i]
您不需要为此使用 Selenium。在浏览器中检查需要随请求一起发送的有效负载,并将其附加到请求中。
我在你的网站上试过了,下面的代码片段有效 -
def start_requests(self):
    """Kick off the crawl by POSTing the browser's search payload.

    Sends the same JSON body the site's frontend sends to the
    filteredSearch endpoint; the JSON response is handled by ``parse``.
    """
    search_url = "http://www.myntra.com/search-service/searchservice/search/filteredSearch"
    # Sort criteria exactly as captured from the browser's request.
    sort_order = [
        {"sort_field": "count_options_availbale", "order_by": "desc"},
        {"sort_field": "score", "order_by": "desc"},
        {"sort_field": "style_store1_female_sort_field", "order_by": "desc"},
        {"sort_field": "potential_revenue_female_sort_field", "order_by": "desc"},
        {"sort_field": "global_attr_catalog_add_date", "order_by": "desc"},
    ]
    query = {
        "query": "(global_attr_age_group:(\"Adults-Unisex\" OR \"Adults-Women\") AND global_attr_master_category:(\"Footwear\"))",
        "start": 0,
        "rows": 96,
        "facetField": [],
        "pivotFacets": [],
        "fq": ["count_options_availbale:[1 TO *]"],
        "sort": sort_order,
        "return_docs": True,
        "colour_grouping": True,
        "useCache": True,
        "flatshot": False,
        "outOfStock": False,
        "showInactiveStyles": False,
        "facet": True,
    }
    # The endpoint expects a JSON array wrapping the query object.
    yield Request(search_url, self.parse, method="POST", body=json.dumps([query]))
def parse(self, response):
    """Decode the search-service JSON response and dump it to stdout."""
    data = json.loads(response.body)
    print(data)
如何抓取这个网站?我如何发送带请求负载的 POST 请求并从响应中获取数据?
如果我使用此代码,我可以抓取第一页,但如何抓取第二页?我需要使用 Selenium,还是用 Scrapy 就足够了?
import scrapy
from scrapy import log
from scrapy.http import *
import urllib2
class myntra_spider(scrapy.Spider):
name="myntra"
allowed_domain=[]
start_urls=["http://www.myntra.com/men-footwear"]
logfile=open('testlog.log','w')
log_observer=log.ScrapyFileLogObserver(logfile,level=log.ERROR)
log_observer.start()
# sub_category=[]
def parse(self,response):
print "response url ",response.url
link=response.xpath("//ul[@class='results small']/li/a/@href").extract()
print links
yield Request('http://www.myntra.com/search-service/searchservice/search/filteredSearch', callback=self.nextpages,body="")
def nextpages(self,response):
link=response.xpath("//ul[@class='results small']/li/a/@href").extract()
for i in range(10):
print "link ",link[i]
您不需要为此使用 Selenium。在浏览器中检查需要随请求一起发送的有效负载,并将其附加到请求中。
我在你的网站上试过了,下面的代码片段有效 -
def start_requests(self):
    """Kick off the crawl by POSTing the browser's search payload.

    Sends the same JSON body the site's frontend sends to the
    filteredSearch endpoint; the JSON response is handled by ``parse``.
    """
    search_url = "http://www.myntra.com/search-service/searchservice/search/filteredSearch"
    # Sort criteria exactly as captured from the browser's request.
    sort_order = [
        {"sort_field": "count_options_availbale", "order_by": "desc"},
        {"sort_field": "score", "order_by": "desc"},
        {"sort_field": "style_store1_female_sort_field", "order_by": "desc"},
        {"sort_field": "potential_revenue_female_sort_field", "order_by": "desc"},
        {"sort_field": "global_attr_catalog_add_date", "order_by": "desc"},
    ]
    query = {
        "query": "(global_attr_age_group:(\"Adults-Unisex\" OR \"Adults-Women\") AND global_attr_master_category:(\"Footwear\"))",
        "start": 0,
        "rows": 96,
        "facetField": [],
        "pivotFacets": [],
        "fq": ["count_options_availbale:[1 TO *]"],
        "sort": sort_order,
        "return_docs": True,
        "colour_grouping": True,
        "useCache": True,
        "flatshot": False,
        "outOfStock": False,
        "showInactiveStyles": False,
        "facet": True,
    }
    # The endpoint expects a JSON array wrapping the query object.
    yield Request(search_url, self.parse, method="POST", body=json.dumps([query]))
def parse(self, response):
    """Decode the search-service JSON response and dump it to stdout."""
    data = json.loads(response.body)
    print(data)