Scrapy API 请求出现 403 错误:使用 requests 库可以正常工作,但使用 scrapy 不行
Scrapy API request 403 error issue. Works with requests, but not scrapy
我遇到了一个奇怪的问题:我的 API 请求在使用标准的 requests 库时完全正常,但改用 scrapy 时却返回 403。除了“403 错误”之外,错误消息没有提供任何有用的信息。我已经从这篇帖子中删除了我的 API 密钥,但您可以很容易地获得自己的密钥。如果您在获取 API 密钥方面需要帮助,请告诉我。
可以正常工作的 Python 代码(不使用 scrapy):
import requests
# Plain `requests` version of the call: same endpoint, query parameters and
# headers as the spider, sent with a single GET.
url = "https://www.airbnb.ca/api/v3/ExploreSections"

# "variables" and "extensions" are JSON documents passed as plain strings;
# single-quoted literals keep the embedded double quotes unescaped.
querystring = {
    "operationName": "ExploreSections",
    "locale": "en-CA",
    "currency": "CAD",
    "_cb": "1db02z70xkcr690n1h3gp0py4nmy",
    "variables": (
        '{"isInitialLoad":true,"hasLoggedIn":false,"cdnCacheSafe":false,'
        '"source":"EXPLORE","exploreRequest":{"metadataOnly":false,'
        '"version":"1.8.3","itemsPerGrid":20,"tabId":"home_tab",'
        '"refinementPaths":["/homes"],'
        '"flexibleTripDates":["february","march"],'
        '"flexibleTripLengths":["weekend_trip"],'
        '"datePickerType":"calendar",'
        '"placeId":"ChIJpTvG15DL1IkRd8S0KlBVNTI",'
        '"checkin":"2022-03-15","checkout":"2022-03-16","adults":2,'
        '"source":"structured_search_input_header",'
        '"searchType":"autocomplete_click","query":"Toronto, ON",'
        '"cdnCacheSafe":false,"treatmentFlags":['
        '"flex_destinations_june_2021_launch_web_treatment",'
        '"new_filter_bar_v2_fm_header",'
        '"merch_header_breakpoint_expansion_web",'
        '"flexible_dates_12_month_lead_time",'
        '"storefronts_nov23_2021_homepage_web_treatment",'
        '"flexible_dates_options_extend_one_three_seven_days",'
        '"super_date_flexibility","micro_flex_improvements",'
        '"micro_flex_show_by_default",'
        '"search_input_placeholder_phrases","pets_fee_treatment"],'
        '"screenSize":"large","isInitialLoad":true,"hasLoggedIn":false},'
        '"removeDuplicatedParams":false}'
    ),
    "extensions": (
        '{"persistedQuery":{"version":1,"sha256Hash":'
        '"0d0a5c3b44e87ccaecf084cfc3027a175af11955cffa04bb986406e9b4bdfe6e"}}'
    ),
}

# Replace YOUR_KEY with a real API key before running.
headers = {
    "x-airbnb-api-key": "YOUR_KEY",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    ),
    "content-type": "application/json",
    "accept-language": "en-US,en;q=0.9",
}

# requests encodes `params` into the query string itself.
response = requests.request(
    "GET",
    url,
    params=querystring,
    headers=headers,
)
print(response.text)
Scrapy 蜘蛛代码:
import scrapy
import json
from urllib.parse import urlencode
class ListingsSpider(scrapy.Spider):
    """Spider that issues one GET to the ExploreSections endpoint.

    This is the variant from the problem report: it sends only four request
    headers and, per the report, is answered with HTTP 403.
    """

    name = 'listings'
    allowed_domains = ['airbnb.ca']

    def start_requests(self):
        """Yield the single API request, with the query string pre-encoded."""
        # "variables" and "extensions" are JSON documents passed as plain
        # strings; single-quoted literals keep the embedded quotes unescaped.
        query = {
            "operationName": "ExploreSections",
            "locale": "en-CA",
            "currency": "CAD",
            "_cb": "1db02z70xkcr690n1h3gp0py4nmy",
            "variables": (
                '{"isInitialLoad":true,"hasLoggedIn":false,"cdnCacheSafe":false,'
                '"source":"EXPLORE","exploreRequest":{"metadataOnly":false,'
                '"version":"1.8.3","itemsPerGrid":20,"tabId":"home_tab",'
                '"refinementPaths":["/homes"],'
                '"flexibleTripDates":["february","march"],'
                '"flexibleTripLengths":["weekend_trip"],'
                '"datePickerType":"calendar",'
                '"placeId":"ChIJpTvG15DL1IkRd8S0KlBVNTI",'
                '"checkin":"2022-03-15","checkout":"2022-03-16","adults":2,'
                '"source":"structured_search_input_header",'
                '"searchType":"autocomplete_click","query":"Toronto, ON",'
                '"cdnCacheSafe":false,"treatmentFlags":['
                '"flex_destinations_june_2021_launch_web_treatment",'
                '"new_filter_bar_v2_fm_header",'
                '"merch_header_breakpoint_expansion_web",'
                '"flexible_dates_12_month_lead_time",'
                '"storefronts_nov23_2021_homepage_web_treatment",'
                '"flexible_dates_options_extend_one_three_seven_days",'
                '"super_date_flexibility","micro_flex_improvements",'
                '"micro_flex_show_by_default",'
                '"search_input_placeholder_phrases","pets_fee_treatment"],'
                '"screenSize":"large","isInitialLoad":true,"hasLoggedIn":false},'
                '"removeDuplicatedParams":false}'
            ),
            "extensions": (
                '{"persistedQuery":{"version":1,"sha256Hash":'
                '"0d0a5c3b44e87ccaecf084cfc3027a175af11955cffa04bb986406e9b4bdfe6e"}}'
            ),
        }
        target = (
            "https://www.airbnb.ca/api/v3/ExploreSections?" + urlencode(query)
        )

        # Replace YOUR_KEY with a real API key before running.
        request_headers = {
            "x-airbnb-api-key": "YOUR_KEY",
            "user-agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
            ),
            "content-type": "application/json",
            "accept-language": "en-US,en;q=0.9",
        }

        yield scrapy.Request(
            target,
            callback=self.parse_listings,
            method="GET",
            headers=request_headers,
        )

    def parse_listings(self, response):
        """Decode the JSON response body and yield it as one item."""
        yield json.loads(response.body)
问题出在 headers 上:请从浏览器中复制请求 headers,并删除其中的 cookie 和 content-length(如果存在)。
import scrapy
import json
from urllib.parse import urlencode
class ListingsSpider(scrapy.Spider):
    """Spider that issues one GET to the ExploreSections endpoint.

    This variant sends the full header set copied from a browser request
    (with cookie and content-length left out), which is the change the
    answer proposes for the 403 response.
    """

    name = 'listings'
    allowed_domains = ['airbnb.ca']

    # Spider-level settings: a DOWNLOAD_DELAY of 0.5.
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
    }

    def start_requests(self):
        """Yield the single API request, with the query string pre-encoded."""
        # "variables" and "extensions" are JSON documents passed as plain
        # strings; single-quoted literals keep the embedded quotes unescaped.
        query = {
            "operationName": "ExploreSections",
            "locale": "en-CA",
            "currency": "CAD",
            "_cb": "1db02z70xkcr690n1h3gp0py4nmy",
            "variables": (
                '{"isInitialLoad":true,"hasLoggedIn":false,"cdnCacheSafe":false,'
                '"source":"EXPLORE","exploreRequest":{"metadataOnly":false,'
                '"version":"1.8.3","itemsPerGrid":20,"tabId":"home_tab",'
                '"refinementPaths":["/homes"],'
                '"flexibleTripDates":["february","march"],'
                '"flexibleTripLengths":["weekend_trip"],'
                '"datePickerType":"calendar",'
                '"placeId":"ChIJpTvG15DL1IkRd8S0KlBVNTI",'
                '"checkin":"2022-03-15","checkout":"2022-03-16","adults":2,'
                '"source":"structured_search_input_header",'
                '"searchType":"autocomplete_click","query":"Toronto, ON",'
                '"cdnCacheSafe":false,"treatmentFlags":['
                '"flex_destinations_june_2021_launch_web_treatment",'
                '"new_filter_bar_v2_fm_header",'
                '"merch_header_breakpoint_expansion_web",'
                '"flexible_dates_12_month_lead_time",'
                '"storefronts_nov23_2021_homepage_web_treatment",'
                '"flexible_dates_options_extend_one_three_seven_days",'
                '"super_date_flexibility","micro_flex_improvements",'
                '"micro_flex_show_by_default",'
                '"search_input_placeholder_phrases","pets_fee_treatment"],'
                '"screenSize":"large","isInitialLoad":true,"hasLoggedIn":false},'
                '"removeDuplicatedParams":false}'
            ),
            "extensions": (
                '{"persistedQuery":{"version":1,"sha256Hash":'
                '"0d0a5c3b44e87ccaecf084cfc3027a175af11955cffa04bb986406e9b4bdfe6e"}}'
            ),
        }
        target = (
            "https://www.airbnb.ca/api/v3/ExploreSections?" + urlencode(query)
        )

        # Header set as copied from the browser; replace API_KEY with a real
        # key before running.
        request_headers = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.5",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "DNT": "1",
            "Host": "www.airbnb.ca",
            "Pragma": "no-cache",
            "Referer": "https://www.airbnb.ca/",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "Sec-GPC": "1",
            "TE": "trailers",
            "user-agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
            ),
            "X-Airbnb-API-Key": "API_KEY",
            "X-Airbnb-GraphQL-Platform": "web",
            "X-Airbnb-GraphQL-Platform-Client": "minimalist-niobe",
            "X-Airbnb-Supports-Airlock-V2": "true",
            "X-CSRF-Token": "null",
            "X-CSRF-Without-Token": "1",
            "X-KL-Ajax-Request": "Ajax_Request",
            "X-Niobe-Short-Circuited": "true",
        }

        yield scrapy.Request(
            target,
            callback=self.parse_listings,
            method="GET",
            headers=request_headers,
        )

    def parse_listings(self, response):
        """Yield the response's parsed JSON payload as one item."""
        yield response.json()
输出:
{'data': {'presentation': {'__typename': 'RootPresentationContainer', 'explore': {'__typename': 'ExplorePresentation', 'sections': {'__typename': 'ExploreSections', 'sections': [{'__typename': 'SectionContainer', 'id':
...
...
...
获取请求 headers 的方法(以 Chrome 为例):
我遇到了一个奇怪的问题:我的 API 请求在使用标准的 requests 库时完全正常,但改用 scrapy 时却返回 403。除了“403 错误”之外,错误消息没有提供任何有用的信息。我已经从这篇帖子中删除了我的 API 密钥,但您可以很容易地获得自己的密钥。如果您在获取 API 密钥方面需要帮助,请告诉我。
可以正常工作的 Python 代码(不使用 scrapy):
import requests
# Plain `requests` version of the call: same endpoint, query parameters and
# headers as the spider, sent with a single GET.
url = "https://www.airbnb.ca/api/v3/ExploreSections"

# "variables" and "extensions" are JSON documents passed as plain strings;
# single-quoted literals keep the embedded double quotes unescaped.
querystring = {
    "operationName": "ExploreSections",
    "locale": "en-CA",
    "currency": "CAD",
    "_cb": "1db02z70xkcr690n1h3gp0py4nmy",
    "variables": (
        '{"isInitialLoad":true,"hasLoggedIn":false,"cdnCacheSafe":false,'
        '"source":"EXPLORE","exploreRequest":{"metadataOnly":false,'
        '"version":"1.8.3","itemsPerGrid":20,"tabId":"home_tab",'
        '"refinementPaths":["/homes"],'
        '"flexibleTripDates":["february","march"],'
        '"flexibleTripLengths":["weekend_trip"],'
        '"datePickerType":"calendar",'
        '"placeId":"ChIJpTvG15DL1IkRd8S0KlBVNTI",'
        '"checkin":"2022-03-15","checkout":"2022-03-16","adults":2,'
        '"source":"structured_search_input_header",'
        '"searchType":"autocomplete_click","query":"Toronto, ON",'
        '"cdnCacheSafe":false,"treatmentFlags":['
        '"flex_destinations_june_2021_launch_web_treatment",'
        '"new_filter_bar_v2_fm_header",'
        '"merch_header_breakpoint_expansion_web",'
        '"flexible_dates_12_month_lead_time",'
        '"storefronts_nov23_2021_homepage_web_treatment",'
        '"flexible_dates_options_extend_one_three_seven_days",'
        '"super_date_flexibility","micro_flex_improvements",'
        '"micro_flex_show_by_default",'
        '"search_input_placeholder_phrases","pets_fee_treatment"],'
        '"screenSize":"large","isInitialLoad":true,"hasLoggedIn":false},'
        '"removeDuplicatedParams":false}'
    ),
    "extensions": (
        '{"persistedQuery":{"version":1,"sha256Hash":'
        '"0d0a5c3b44e87ccaecf084cfc3027a175af11955cffa04bb986406e9b4bdfe6e"}}'
    ),
}

# Replace YOUR_KEY with a real API key before running.
headers = {
    "x-airbnb-api-key": "YOUR_KEY",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    ),
    "content-type": "application/json",
    "accept-language": "en-US,en;q=0.9",
}

# requests encodes `params` into the query string itself.
response = requests.request(
    "GET",
    url,
    params=querystring,
    headers=headers,
)
print(response.text)
Scrapy 蜘蛛代码:
import scrapy
import json
from urllib.parse import urlencode
class ListingsSpider(scrapy.Spider):
    """Spider that issues one GET to the ExploreSections endpoint.

    This is the variant from the problem report: it sends only four request
    headers and, per the report, is answered with HTTP 403.
    """

    name = 'listings'
    allowed_domains = ['airbnb.ca']

    def start_requests(self):
        """Yield the single API request, with the query string pre-encoded."""
        # "variables" and "extensions" are JSON documents passed as plain
        # strings; single-quoted literals keep the embedded quotes unescaped.
        query = {
            "operationName": "ExploreSections",
            "locale": "en-CA",
            "currency": "CAD",
            "_cb": "1db02z70xkcr690n1h3gp0py4nmy",
            "variables": (
                '{"isInitialLoad":true,"hasLoggedIn":false,"cdnCacheSafe":false,'
                '"source":"EXPLORE","exploreRequest":{"metadataOnly":false,'
                '"version":"1.8.3","itemsPerGrid":20,"tabId":"home_tab",'
                '"refinementPaths":["/homes"],'
                '"flexibleTripDates":["february","march"],'
                '"flexibleTripLengths":["weekend_trip"],'
                '"datePickerType":"calendar",'
                '"placeId":"ChIJpTvG15DL1IkRd8S0KlBVNTI",'
                '"checkin":"2022-03-15","checkout":"2022-03-16","adults":2,'
                '"source":"structured_search_input_header",'
                '"searchType":"autocomplete_click","query":"Toronto, ON",'
                '"cdnCacheSafe":false,"treatmentFlags":['
                '"flex_destinations_june_2021_launch_web_treatment",'
                '"new_filter_bar_v2_fm_header",'
                '"merch_header_breakpoint_expansion_web",'
                '"flexible_dates_12_month_lead_time",'
                '"storefronts_nov23_2021_homepage_web_treatment",'
                '"flexible_dates_options_extend_one_three_seven_days",'
                '"super_date_flexibility","micro_flex_improvements",'
                '"micro_flex_show_by_default",'
                '"search_input_placeholder_phrases","pets_fee_treatment"],'
                '"screenSize":"large","isInitialLoad":true,"hasLoggedIn":false},'
                '"removeDuplicatedParams":false}'
            ),
            "extensions": (
                '{"persistedQuery":{"version":1,"sha256Hash":'
                '"0d0a5c3b44e87ccaecf084cfc3027a175af11955cffa04bb986406e9b4bdfe6e"}}'
            ),
        }
        target = (
            "https://www.airbnb.ca/api/v3/ExploreSections?" + urlencode(query)
        )

        # Replace YOUR_KEY with a real API key before running.
        request_headers = {
            "x-airbnb-api-key": "YOUR_KEY",
            "user-agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
            ),
            "content-type": "application/json",
            "accept-language": "en-US,en;q=0.9",
        }

        yield scrapy.Request(
            target,
            callback=self.parse_listings,
            method="GET",
            headers=request_headers,
        )

    def parse_listings(self, response):
        """Decode the JSON response body and yield it as one item."""
        yield json.loads(response.body)
问题出在 headers 上:请从浏览器中复制请求 headers,并删除其中的 cookie 和 content-length(如果存在)。
import scrapy
import json
from urllib.parse import urlencode
class ListingsSpider(scrapy.Spider):
    """Spider that issues one GET to the ExploreSections endpoint.

    This variant sends the full header set copied from a browser request
    (with cookie and content-length left out), which is the change the
    answer proposes for the 403 response.
    """

    name = 'listings'
    allowed_domains = ['airbnb.ca']

    # Spider-level settings: a DOWNLOAD_DELAY of 0.5.
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
    }

    def start_requests(self):
        """Yield the single API request, with the query string pre-encoded."""
        # "variables" and "extensions" are JSON documents passed as plain
        # strings; single-quoted literals keep the embedded quotes unescaped.
        query = {
            "operationName": "ExploreSections",
            "locale": "en-CA",
            "currency": "CAD",
            "_cb": "1db02z70xkcr690n1h3gp0py4nmy",
            "variables": (
                '{"isInitialLoad":true,"hasLoggedIn":false,"cdnCacheSafe":false,'
                '"source":"EXPLORE","exploreRequest":{"metadataOnly":false,'
                '"version":"1.8.3","itemsPerGrid":20,"tabId":"home_tab",'
                '"refinementPaths":["/homes"],'
                '"flexibleTripDates":["february","march"],'
                '"flexibleTripLengths":["weekend_trip"],'
                '"datePickerType":"calendar",'
                '"placeId":"ChIJpTvG15DL1IkRd8S0KlBVNTI",'
                '"checkin":"2022-03-15","checkout":"2022-03-16","adults":2,'
                '"source":"structured_search_input_header",'
                '"searchType":"autocomplete_click","query":"Toronto, ON",'
                '"cdnCacheSafe":false,"treatmentFlags":['
                '"flex_destinations_june_2021_launch_web_treatment",'
                '"new_filter_bar_v2_fm_header",'
                '"merch_header_breakpoint_expansion_web",'
                '"flexible_dates_12_month_lead_time",'
                '"storefronts_nov23_2021_homepage_web_treatment",'
                '"flexible_dates_options_extend_one_three_seven_days",'
                '"super_date_flexibility","micro_flex_improvements",'
                '"micro_flex_show_by_default",'
                '"search_input_placeholder_phrases","pets_fee_treatment"],'
                '"screenSize":"large","isInitialLoad":true,"hasLoggedIn":false},'
                '"removeDuplicatedParams":false}'
            ),
            "extensions": (
                '{"persistedQuery":{"version":1,"sha256Hash":'
                '"0d0a5c3b44e87ccaecf084cfc3027a175af11955cffa04bb986406e9b4bdfe6e"}}'
            ),
        }
        target = (
            "https://www.airbnb.ca/api/v3/ExploreSections?" + urlencode(query)
        )

        # Header set as copied from the browser; replace API_KEY with a real
        # key before running.
        request_headers = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.5",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "DNT": "1",
            "Host": "www.airbnb.ca",
            "Pragma": "no-cache",
            "Referer": "https://www.airbnb.ca/",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "Sec-GPC": "1",
            "TE": "trailers",
            "user-agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
            ),
            "X-Airbnb-API-Key": "API_KEY",
            "X-Airbnb-GraphQL-Platform": "web",
            "X-Airbnb-GraphQL-Platform-Client": "minimalist-niobe",
            "X-Airbnb-Supports-Airlock-V2": "true",
            "X-CSRF-Token": "null",
            "X-CSRF-Without-Token": "1",
            "X-KL-Ajax-Request": "Ajax_Request",
            "X-Niobe-Short-Circuited": "true",
        }

        yield scrapy.Request(
            target,
            callback=self.parse_listings,
            method="GET",
            headers=request_headers,
        )

    def parse_listings(self, response):
        """Yield the response's parsed JSON payload as one item."""
        yield response.json()
输出:
{'data': {'presentation': {'__typename': 'RootPresentationContainer', 'explore': {'__typename': 'ExplorePresentation', 'sections': {'__typename': 'ExploreSections', 'sections': [{'__typename': 'SectionContainer', 'id':
...
...
...
获取请求 headers 的方法(以 Chrome 为例):