Scrapy 返回 400 Bad Request,但在 Postman 中状态为 200 OK
scrapy 400 bad request but status 200ok on postman
大家好!
我正在尝试使用 Scrapy 通过网站的 API 抓取 https://guardian.com.my/health.html。在 Postman 中,请求该 URL 返回状态 200 OK,但用 Scrapy 抓取时却收到 Crawled (400) Bad Request(错误请求)。
import scrapy
from scrapy.exceptions import CloseSpider
import json
class GapiSpider(scrapy.Spider):
    """Spider from the question: requests one page of the guardian.com.my GraphQL category listing.

    This is the version that receives ``Crawled (400)``. It is kept as posted
    so that the problem stays reproducible.
    """
    name = 'gapi'
    # Header set sent with the request. The ':'-prefixed names (':authority',
    # ':method', ':path', ':scheme') are the entries the answer identifies as
    # invalid when sent as request headers.
    headers={
        ':authority': 'guardian.com.my',
        ':method': 'GET',
        ':path': f"/graphql?query=query+GetCategories%28%24id%3AInt%21%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bcategory%28id%3A%24id%29%7Bid+description+name+product_count+meta_title+meta_keywords+meta_description+__typename%7Dproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dprice_range%7Bminimum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7Dmaximum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7D__typename%7Dpromotion_label+promotion_label_name+sales_icon+small_image%7Burl+__typename%7Dstock_status+url_key+url_suffix+__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=GetCategories&variables=%7B%22currentPage%22%3A2%2C%22id%22%3A3047%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22eq%22%3A%223047%22%7D%7D%2C%22pageSize%22%3A60%2C%22sort%22%3A%7B%22position%22%3A%22ASC%22%7D%7D",
        ':scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'content-type': 'application/json',
        'cookie': 'lzd_cid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; t_uid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; lzd_sid=1728fd4a8af615419dc17044911406d4; t_fv=1623735402455; hng=MY|en-MY|MYR|458; userLanguageML=en; _bl_uid=7qkOhpLtxynm254s1v6s7X3gjp7n; cna=aipPGYuo6GwCAXOk2DtI3+FS; _gcl_au=1.1.335118801.1623735403; _tb_token_=f311f13b31eb5; age_restriction=over%3B18%3B1; _fbp=fb.2.1623760903320.1705265880; _ga=GA1.3.1886895770.1623760957; _gid=GA1.3.71310232.1623760957; t_sid=VVEk6ELqT4zeevu7Snsvx63eqX0u9De7; utm_origin=https://www.google.com/; utm_channel=SEO; xlly_s=1; EGG_SESS=S_Gs1wHo9OvRHCMp98md7LqI1pVlU7ApMhhrX1Oe_NHHkRPwi6zdBuxVpdbHrc8tccMpabJfEEwLAe7yCNtlESqPvCMPcAfjqwmNQ19bjOPRdxRnSKGABpGFDMYvsTiDFT7FfaLnWnbHJr555QFqdYHSAtp73LRUYDZgaeGlJT4=; isg=BICAfUhD9ImywYiPlm818du3UQhSCWTTedKTF_oRTxsudSGfoh0tYqpEiNW1QByr; l=eBgFfq9IjfrHbUYTBOfaourza779IIRbSuPzaNbMiOCP_-1p5HU1W6OwTtT9CnhNnsgHR3lqRWpDBu8SQyz6Qxv9-egPe9oEndBG.; tfstk=ctjGBQ2yNBGSUZThPlt1IUVci2adZKke6s51Yite4tkyX3IFiWuEzDR4-BfGDq1..; _m_h5_tk=9a0a86b9d6a2af069082c39a003a0087_1623862308738; _m_h5_tk_enc=e8b5adef6a74673cb92ca952c850f019; _uetsid=ab562c30cd9b11eb835b67411ecd1bd6; _uetvid=ab587f00cd9b11eb8eea6906a86bf0ba',
        'pragma': 'no-cache',
        'referer': 'https://guardian.com.my/health.html?page=1',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'store': 'default',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    }
    def start_requests(self):
        """Yield one GET request for the GetCategories GraphQL query, sent with ``self.headers``."""
        # The URL is the same query string that the ':path' header above carries
        # (category id 3047, currentPage 2, pageSize 60).
        yield scrapy.Request(
            url= f"https://guardian.com.my/graphql?query=query+GetCategories%28%24id%3AInt%21%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bcategory%28id%3A%24id%29%7Bid+description+name+product_count+meta_title+meta_keywords+meta_description+__typename%7Dproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dprice_range%7Bminimum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7Dmaximum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7D__typename%7Dpromotion_label+promotion_label_name+sales_icon+small_image%7Burl+__typename%7Dstock_status+url_key+url_suffix+__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=GetCategories&variables=%7B%22currentPage%22%3A2%2C%22id%22%3A3047%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22eq%22%3A%223047%22%7D%7D%2C%22pageSize%22%3A60%2C%22sort%22%3A%7B%22position%22%3A%22ASC%22%7D%7D",
            headers=self.headers
        )
    def parse(self, response):
        """Print the raw bytes of the response body."""
        print(response.body)
我已经在使用 IP 代理轮换器,还缺少什么?ROBOTSTXT_OBEY 已在 settings.py 中设置为 False。
非常感谢!
当请求未按服务器预期的格式发出时,服务器会返回 400 Bad Request(Scrapy 日志中显示为 Crawled (400));通常是 headers、请求体(payload)或参数有误。
就您的情况而言,您在多个 header 字段名前加了冒号 `:`,这种格式在发出请求时是无效的。例如,':authority' 应该是 'authority';同样,':method' 应该是 'method',依此类推。
代码
import scrapy
from scrapy.exceptions import CloseSpider
import json
class GapiSpider(scrapy.Spider):
    """Request one page of the guardian.com.my GraphQL category listing.

    The spider issues a single GET request for the ``GetCategories`` query
    (category id 3047, page 1, 60 items per page) and prints the raw response
    body.
    """

    name = 'gapi'

    # Ordinary request headers copied from the browser's network panel.
    #
    # The HTTP/2 pseudo-header fields shown there (':authority', ':method',
    # ':path', ':scheme') are intentionally NOT listed: they are derived by the
    # HTTP client from the request URL and method, and are not valid header
    # field names. Sending them with the leading colon is what produced the
    # 400 response; sending them renamed without the colon only adds
    # meaningless headers (and a 'path' value that can drift out of sync with
    # the real request URL).
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'content-type': 'application/json',
        'cookie': 'lzd_cid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; t_uid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; lzd_sid=1728fd4a8af615419dc17044911406d4; t_fv=1623735402455; hng=MY|en-MY|MYR|458; userLanguageML=en; _bl_uid=7qkOhpLtxynm254s1v6s7X3gjp7n; cna=aipPGYuo6GwCAXOk2DtI3+FS; _gcl_au=1.1.335118801.1623735403; _tb_token_=f311f13b31eb5; age_restriction=over%3B18%3B1; _fbp=fb.2.1623760903320.1705265880; _ga=GA1.3.1886895770.1623760957; _gid=GA1.3.71310232.1623760957; t_sid=VVEk6ELqT4zeevu7Snsvx63eqX0u9De7; utm_origin=https://www.google.com/; utm_channel=SEO; xlly_s=1; EGG_SESS=S_Gs1wHo9OvRHCMp98md7LqI1pVlU7ApMhhrX1Oe_NHHkRPwi6zdBuxVpdbHrc8tccMpabJfEEwLAe7yCNtlESqPvCMPcAfjqwmNQ19bjOPRdxRnSKGABpGFDMYvsTiDFT7FfaLnWnbHJr555QFqdYHSAtp73LRUYDZgaeGlJT4=; isg=BICAfUhD9ImywYiPlm818du3UQhSCWTTedKTF_oRTxsudSGfoh0tYqpEiNW1QByr; l=eBgFfq9IjfrHbUYTBOfaourza779IIRbSuPzaNbMiOCP_-1p5HU1W6OwTtT9CnhNnsgHR3lqRWpDBu8SQyz6Qxv9-egPe9oEndBG.; tfstk=ctjGBQ2yNBGSUZThPlt1IUVci2adZKke6s51Yite4tkyX3IFiWuEzDR4-BfGDq1..; _m_h5_tk=9a0a86b9d6a2af069082c39a003a0087_1623862308738; _m_h5_tk_enc=e8b5adef6a74673cb92ca952c850f019; _uetsid=ab562c30cd9b11eb835b67411ecd1bd6; _uetvid=ab587f00cd9b11eb8eea6906a86bf0ba',
        'pragma': 'no-cache',
        'referer': 'https://guardian.com.my/health.html?page=1',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'store': 'default',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    }

    def start_requests(self):
        """Yield the single GET request for the GetCategories GraphQL query.

        No callback is passed to the request, so the response is handled by
        ``parse``.
        """
        # Plain string literal: the URL is already percent-encoded and contains
        # no placeholders, so no f-string is needed.
        yield scrapy.Request(
            url="https://guardian.com.my/graphql?query=query+GetCategories%28%24id%3AInt%21%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bcategory%28id%3A%24id%29%7Bid+description+name+product_count+meta_title+meta_keywords+meta_description+__typename%7Dproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dprice_range%7Bminimum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7Dmaximum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7D__typename%7Dpromotion_label+promotion_label_name+sales_icon+small_image%7Burl+__typename%7Dstock_status+url_key+url_suffix+__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=GetCategories&variables=%7B%22currentPage%22%3A1%2C%22id%22%3A3047%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22eq%22%3A%223047%22%7D%7D%2C%22pageSize%22%3A60%2C%22sort%22%3A%7B%22position%22%3A%22ASC%22%7D%7D",
            headers=self.headers,
        )

    def parse(self, response):
        """Print the raw bytes of the response body."""
        print(response.body)
大家好!
我正在尝试使用 Scrapy 通过网站的 API 抓取 https://guardian.com.my/health.html。在 Postman 中,请求该 URL 返回状态 200 OK,但用 Scrapy 抓取时却收到 Crawled (400) Bad Request(错误请求)。
import scrapy
from scrapy.exceptions import CloseSpider
import json
class GapiSpider(scrapy.Spider):
    """Spider from the question: requests one page of the guardian.com.my GraphQL category listing.

    This is the version that receives ``Crawled (400)``. It is kept as posted
    so that the problem stays reproducible.
    """
    name = 'gapi'
    # Header set sent with the request. The ':'-prefixed names (':authority',
    # ':method', ':path', ':scheme') are the entries the answer identifies as
    # invalid when sent as request headers.
    headers={
        ':authority': 'guardian.com.my',
        ':method': 'GET',
        ':path': f"/graphql?query=query+GetCategories%28%24id%3AInt%21%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bcategory%28id%3A%24id%29%7Bid+description+name+product_count+meta_title+meta_keywords+meta_description+__typename%7Dproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dprice_range%7Bminimum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7Dmaximum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7D__typename%7Dpromotion_label+promotion_label_name+sales_icon+small_image%7Burl+__typename%7Dstock_status+url_key+url_suffix+__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=GetCategories&variables=%7B%22currentPage%22%3A2%2C%22id%22%3A3047%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22eq%22%3A%223047%22%7D%7D%2C%22pageSize%22%3A60%2C%22sort%22%3A%7B%22position%22%3A%22ASC%22%7D%7D",
        ':scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'content-type': 'application/json',
        'cookie': 'lzd_cid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; t_uid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; lzd_sid=1728fd4a8af615419dc17044911406d4; t_fv=1623735402455; hng=MY|en-MY|MYR|458; userLanguageML=en; _bl_uid=7qkOhpLtxynm254s1v6s7X3gjp7n; cna=aipPGYuo6GwCAXOk2DtI3+FS; _gcl_au=1.1.335118801.1623735403; _tb_token_=f311f13b31eb5; age_restriction=over%3B18%3B1; _fbp=fb.2.1623760903320.1705265880; _ga=GA1.3.1886895770.1623760957; _gid=GA1.3.71310232.1623760957; t_sid=VVEk6ELqT4zeevu7Snsvx63eqX0u9De7; utm_origin=https://www.google.com/; utm_channel=SEO; xlly_s=1; EGG_SESS=S_Gs1wHo9OvRHCMp98md7LqI1pVlU7ApMhhrX1Oe_NHHkRPwi6zdBuxVpdbHrc8tccMpabJfEEwLAe7yCNtlESqPvCMPcAfjqwmNQ19bjOPRdxRnSKGABpGFDMYvsTiDFT7FfaLnWnbHJr555QFqdYHSAtp73LRUYDZgaeGlJT4=; isg=BICAfUhD9ImywYiPlm818du3UQhSCWTTedKTF_oRTxsudSGfoh0tYqpEiNW1QByr; l=eBgFfq9IjfrHbUYTBOfaourza779IIRbSuPzaNbMiOCP_-1p5HU1W6OwTtT9CnhNnsgHR3lqRWpDBu8SQyz6Qxv9-egPe9oEndBG.; tfstk=ctjGBQ2yNBGSUZThPlt1IUVci2adZKke6s51Yite4tkyX3IFiWuEzDR4-BfGDq1..; _m_h5_tk=9a0a86b9d6a2af069082c39a003a0087_1623862308738; _m_h5_tk_enc=e8b5adef6a74673cb92ca952c850f019; _uetsid=ab562c30cd9b11eb835b67411ecd1bd6; _uetvid=ab587f00cd9b11eb8eea6906a86bf0ba',
        'pragma': 'no-cache',
        'referer': 'https://guardian.com.my/health.html?page=1',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'store': 'default',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    }
    def start_requests(self):
        """Yield one GET request for the GetCategories GraphQL query, sent with ``self.headers``."""
        # The URL is the same query string that the ':path' header above carries
        # (category id 3047, currentPage 2, pageSize 60).
        yield scrapy.Request(
            url= f"https://guardian.com.my/graphql?query=query+GetCategories%28%24id%3AInt%21%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bcategory%28id%3A%24id%29%7Bid+description+name+product_count+meta_title+meta_keywords+meta_description+__typename%7Dproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dprice_range%7Bminimum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7Dmaximum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7D__typename%7Dpromotion_label+promotion_label_name+sales_icon+small_image%7Burl+__typename%7Dstock_status+url_key+url_suffix+__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=GetCategories&variables=%7B%22currentPage%22%3A2%2C%22id%22%3A3047%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22eq%22%3A%223047%22%7D%7D%2C%22pageSize%22%3A60%2C%22sort%22%3A%7B%22position%22%3A%22ASC%22%7D%7D",
            headers=self.headers
        )
    def parse(self, response):
        """Print the raw bytes of the response body."""
        print(response.body)
我已经在使用 IP 代理轮换器,还缺少什么?ROBOTSTXT_OBEY 已在 settings.py 中设置为 False。
非常感谢!
当请求未按服务器预期的格式发出时,服务器会返回 400 Bad Request(Scrapy 日志中显示为 Crawled (400));通常是 headers、请求体(payload)或参数有误。
就您的情况而言,您在多个 header 字段名前加了冒号 `:`,这种格式在发出请求时是无效的。例如,':authority' 应该是 'authority';同样,':method' 应该是 'method',依此类推。
代码
import scrapy
from scrapy.exceptions import CloseSpider
import json
class GapiSpider(scrapy.Spider):
    """Request one page of the guardian.com.my GraphQL category listing.

    The spider issues a single GET request for the ``GetCategories`` query
    (category id 3047, page 1, 60 items per page) and prints the raw response
    body.
    """

    name = 'gapi'

    # Ordinary request headers copied from the browser's network panel.
    #
    # The HTTP/2 pseudo-header fields shown there (':authority', ':method',
    # ':path', ':scheme') are intentionally NOT listed: they are derived by the
    # HTTP client from the request URL and method, and are not valid header
    # field names. Sending them with the leading colon is what produced the
    # 400 response; sending them renamed without the colon only adds
    # meaningless headers (and a 'path' value that can drift out of sync with
    # the real request URL).
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'content-type': 'application/json',
        'cookie': 'lzd_cid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; t_uid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; lzd_sid=1728fd4a8af615419dc17044911406d4; t_fv=1623735402455; hng=MY|en-MY|MYR|458; userLanguageML=en; _bl_uid=7qkOhpLtxynm254s1v6s7X3gjp7n; cna=aipPGYuo6GwCAXOk2DtI3+FS; _gcl_au=1.1.335118801.1623735403; _tb_token_=f311f13b31eb5; age_restriction=over%3B18%3B1; _fbp=fb.2.1623760903320.1705265880; _ga=GA1.3.1886895770.1623760957; _gid=GA1.3.71310232.1623760957; t_sid=VVEk6ELqT4zeevu7Snsvx63eqX0u9De7; utm_origin=https://www.google.com/; utm_channel=SEO; xlly_s=1; EGG_SESS=S_Gs1wHo9OvRHCMp98md7LqI1pVlU7ApMhhrX1Oe_NHHkRPwi6zdBuxVpdbHrc8tccMpabJfEEwLAe7yCNtlESqPvCMPcAfjqwmNQ19bjOPRdxRnSKGABpGFDMYvsTiDFT7FfaLnWnbHJr555QFqdYHSAtp73LRUYDZgaeGlJT4=; isg=BICAfUhD9ImywYiPlm818du3UQhSCWTTedKTF_oRTxsudSGfoh0tYqpEiNW1QByr; l=eBgFfq9IjfrHbUYTBOfaourza779IIRbSuPzaNbMiOCP_-1p5HU1W6OwTtT9CnhNnsgHR3lqRWpDBu8SQyz6Qxv9-egPe9oEndBG.; tfstk=ctjGBQ2yNBGSUZThPlt1IUVci2adZKke6s51Yite4tkyX3IFiWuEzDR4-BfGDq1..; _m_h5_tk=9a0a86b9d6a2af069082c39a003a0087_1623862308738; _m_h5_tk_enc=e8b5adef6a74673cb92ca952c850f019; _uetsid=ab562c30cd9b11eb835b67411ecd1bd6; _uetvid=ab587f00cd9b11eb8eea6906a86bf0ba',
        'pragma': 'no-cache',
        'referer': 'https://guardian.com.my/health.html?page=1',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'store': 'default',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    }

    def start_requests(self):
        """Yield the single GET request for the GetCategories GraphQL query.

        No callback is passed to the request, so the response is handled by
        ``parse``.
        """
        # Plain string literal: the URL is already percent-encoded and contains
        # no placeholders, so no f-string is needed.
        yield scrapy.Request(
            url="https://guardian.com.my/graphql?query=query+GetCategories%28%24id%3AInt%21%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bcategory%28id%3A%24id%29%7Bid+description+name+product_count+meta_title+meta_keywords+meta_description+__typename%7Dproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dprice_range%7Bminimum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7Dmaximum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7D__typename%7Dpromotion_label+promotion_label_name+sales_icon+small_image%7Burl+__typename%7Dstock_status+url_key+url_suffix+__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=GetCategories&variables=%7B%22currentPage%22%3A1%2C%22id%22%3A3047%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22eq%22%3A%223047%22%7D%7D%2C%22pageSize%22%3A60%2C%22sort%22%3A%7B%22position%22%3A%22ASC%22%7D%7D",
            headers=self.headers,
        )

    def parse(self, response):
        """Print the raw bytes of the response body."""
        print(response.body)