Scrapy - scraped website authentication token expires while scraping
To scrape a particular website 180 days into the future, an authentication token must be obtained before the JSON data can be scraped. While scraping, the token expires and the HTTP response returns status code 401 "Unauthorized". How can I get a new token into the scraper and continue scraping? Any help is appreciated.
import json
from datetime import timedelta
from urllib.parse import urljoin

from scrapy import Request

def start_requests(self):
    return [Request(url=AUTHORIZATION_URL, callback=self.request_ride_times)]

def request_ride_times(self, response):
    # parse json data
    data = json.loads(response.body)
    # get auth token
    auth = '{}'.format(data['access_token'])
    # set auth token in headers
    headers = {'Authorization': 'BEARER {}'.format(auth)}
    # note: this probably isn't really necessary but it doesn't hurt (all the site times we are scraping are in EST)
    now = get_current_time_for_timezone("US/Eastern")
    # get ending timeframe for scraping dates - 190 days out
    until = now + SCRAPE_TIMEFRAME
    for filter_type in FILTER_TYPES:
        filter_url_query_attr = '&filters={}'.format(filter_type)
        scrape_date = now
        while scrape_date <= until:
            url = urljoin(SCRAPE_BASE_URL, '{}{}&date={}'.format(SCRAPE_BASE_URL_QUERY_STRING, filter_url_query_attr, scrape_date.strftime("%Y-%m-%d")))
            yield Request(url, headers=headers, callback=self.parse_ride_times, errback=self.error_handler)
            scrape_date += timedelta(days=1)

def parse_ride_times(self, response):
    # parse json data
    data = json.loads(response.body)
    for index, ride_details in enumerate(data['results']):
        if 'schedule' not in ride_details:
            continue
        ride_schedule = ride_details['schedule']
        # create item...
        yield item
I figured it out. I had to override the Request object so that a new authorization token is set into the header when the token expires. I made the token a global variable.
from scrapy import Request
from scrapy.http.headers import Headers

# override Request object in order to set a new authorization token into the header when the token expires
authorization_token = None

class AuthTokenRequest(Request):
    @property
    def headers(self):
        # headers are rebuilt on every access, so the current global token is always used
        global authorization_token
        return Headers({'Authorization': 'BEARER {}'.format(authorization_token)}, encoding=self.encoding)

    @headers.setter
    def headers(self, value):
        # ignore header assignment (e.g. from Request.__init__); headers always derive from the global token
        pass
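The property is what makes retries work: the Authorization header is computed each time it is read, so a request object created before a token refresh automatically carries the new value when it is downloaded again. A minimal sketch of that behavior, assuming it runs in the same module as the class above and using a hypothetical URL:

authorization_token = 'old-token'
req = AuthTokenRequest('https://example.com/api')  # hypothetical URL
print(req.headers.get('Authorization'))  # b'BEARER old-token'

authorization_token = 'new-token'
print(req.headers.get('Authorization'))  # b'BEARER new-token' - same request object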
The overridden request is then used for the requests in the while loop, together with an errback function, error_handler, which is called when a request fails. The error_handler function fetches a new token, resets the global token variable, and resubmits the failed request, which now goes out with the new token. On that same request the dont_filter parameter is set to True so the duplicate filter does not drop the resubmitted request.
Two other functions were also created. One, handle_auth, initially sets the token in the global variable. The other, start_first_run, calls handle_auth and returns the request_ride_times generator; it is the callback used in start_requests.
import json
from datetime import timedelta
from urllib.parse import urljoin

import requests
from scrapy import FormRequest

def error_handler(self, failure):
    global authorization_token
    status = failure.value.response.status
    if status == 401:
        # fetch a fresh token and store it in the global variable
        form_data = {'grant_type': 'assertion', 'assertion_type': 'public', 'client_id': 'WDPRO-MOBILE.CLIENT-PROD'}
        auth_site_request = requests.post(url=AUTHORIZATION_URL, data=form_data)
        auth_site_response = json.loads(auth_site_request.text)
        authorization_token = '{}'.format(auth_site_response['access_token'])
        # resubmit the failed request; AuthTokenRequest rebuilds its headers with the new token
        yield failure.request
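A caveat: failure.value only has a .response attribute for HttpError failures, so connection errors or timeouts would raise an AttributeError in the handler above. A defensive sketch (not part of the original answer) that checks the failure type first:

from scrapy.spidermiddlewares.httperror import HttpError

def error_handler(self, failure):
    global authorization_token
    # only HttpError failures carry a response; DNS errors and timeouts do not
    if failure.check(HttpError) and failure.value.response.status == 401:
        # ...refresh the token and yield failure.request as above...
        pass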
def start_requests(self):
    form_data = {'grant_type': 'assertion', 'assertion_type': 'public', 'client_id': 'WDPRO-MOBILE.CLIENT-PROD'}
    return [FormRequest(url=AUTHORIZATION_URL, formdata=form_data,
                        callback=self.start_first_run)]

def start_first_run(self, response):
    self.handle_auth(response)
    return self.request_ride_times()

def handle_auth(self, response):
    global authorization_token
    data = json.loads(response.body)
    # get auth token
    authorization_token = '{}'.format(data['access_token'])

def request_ride_times(self):
    # note: this probably isn't really necessary but it doesn't hurt (all the sites we are scraping are in EST)
    now = get_current_time_for_timezone("US/Eastern")
    # get ending timeframe for scraping dates - 190 days out
    until = now + SCRAPE_TIMEFRAME
    for filter_type in FILTER_TYPES:
        filter_url_query_attr = '&filters={}'.format(filter_type)
        scrape_date = now
        while scrape_date <= until:
            url = urljoin(SCRAPE_BASE_URL,
                          '{}{}&date={}'.format(SCRAPE_BASE_URL_QUERY_STRING,
                                                filter_url_query_attr, scrape_date.strftime("%Y-%m-%d")))
            yield AuthTokenRequest(url, callback=self.parse_ride_times, errback=self.error_handler, dont_filter=True,
                                   meta={"scrape_date": scrape_date})
            scrape_date += timedelta(days=1)

def parse_ride_times(self, response):
    # parse json data
    data = json.loads(response.body)
    # process data...
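As an aside, the same refresh logic could also live in a downloader middleware that stamps the current token onto every outgoing request, avoiding both the Request subclass and the module-level global. This is only a sketch of that alternative; the class name and the spider attribute authorization_token are assumptions, not part of the answer above:

class AuthTokenMiddleware:
    # hypothetical middleware; would be enabled via the DOWNLOADER_MIDDLEWARES setting
    def process_request(self, request, spider):
        # read the token the spider keeps up to date (assumed spider attribute)
        token = getattr(spider, 'authorization_token', None)
        if token:
            request.headers['Authorization'] = 'BEARER {}'.format(token)
        # returning None tells Scrapy to continue processing the request normally

With this variant, the errback would update spider.authorization_token instead of the global, and plain Request objects could be used throughout.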