python/requests 的抓取问题
issue scraping with python/requests
我正在尝试(一半用于教育目的,一半用于自己监控票价)在联合航空公司的网站上抓取特定航班的价格数据。
我已经用 selenium 成功实现了这一点,但那个实现非常笨拙。在这个过程中,我注意到初始重定向之后有一个 ajax 调用,它返回的 JSON 响应很规整,包含了我想要的一切。我试图把在开发者工具的网络选项卡中看到的相应 POST 参数传过去,直接访问该端点,但没有成功。随后我注意到有一个 'cart-id' 字段看起来是动态的,而其他字段看起来是静态的,于是我从重定向之前的表单提交页面中提取出它并放进 POST 数据里,但得到的响应仍然是状态:失败,"we're sorry but united.com was unable to complete your request"。
此时我不确定 POST 数据中还缺少什么。我还先用持久的会话对象访问了表单提交页面,以便设置 cookie,以为这会有所帮助,但没有用。我漏掉了什么?您可以在浏览器中打开下面的第一个 URL,查看网络选项卡,就能看到我想要的实际响应:第一个名为 'rev' 的 XHR 请求,其中既有我试图模仿的 POST 表单数据,也有我想要的 JSON。
# Reproduction script from the question.
# Assumes `requests`, `json`, and an `html` module exposing `fromstring`
# (presumably `lxml.html` -- TODO confirm) are already imported.
with requests.session() as s:
    # Load the search-results page first so the session collects its cookies.
    formsubmitpage = s.get('https://www.united.com/ual/en/us/flight-search/book-a-flight/results/rev?f=sfo&t=tpe&d=2016-01-20&r=2016-01-26&sc=1,7&px=1&taxng=1&idx=1')
    doc = html.fromstring(formsubmitpage.text)
    # The cart id appears to be dynamic, so it is scraped from the page
    # instead of being hard-coded.
    cartid = doc.xpath('//a[@class="no-rtad"]/@data-cartid')[0]
    print(cartid)
    # POST payload copied from the browser's network tab; only "CartId" is
    # filled in dynamically.
    params = {
        "Revise":False,"UnaccompaniedMinorDisclamer":False,"ConfirmationID":None,
        "searchTypeMain":"roundTrip","Origin":"sfo","Destination":"tpe",
        "DepartDate":"Jan 20, 2016","ReturnDate":"Jan 26, 2016","awardTravel":False,
        "MaxTrips":None,"numberOfTravelers":1,"numOfAdults":1,"numOfSeniors":0,
        "numOfChildren04":0,"numOfChildren03":0,"numOfChildren02":0,"numOfChildren01":0,
        "numOfInfants":0,"numOfLapInfants":0,"travelerCount":1,
        "revisedTravelerKeys":None,"revisedTravelers":None,"OriginalReservation":None,
        "RiskFreePolicy":None,"IsUnAccompaniedMinor":False,"MilitaryTravelType":None,
        "MilitaryOrGovernmentPersonnelStateCode":None,"tripLength":6,
        "IsParallelFareWheelCallEnabled":False,"flexMonth":None,"flexMonth2":None,
        "SortType":None,"cboMiles":None,"cboMiles2":None,
        "Trips":[
            {
                "DestinationAll":False,"returnARC":None,"connections":None,
                "nonStopOnly":True,"nonStop":True,"oneStop":False,"twoPlusStop":False,
                "ChangeType":0,"DepartDate":"Jan 20, 2016","ReturnDate":None,
                "PetIsTraveling":False,"PreferredTime":"","PreferredTimeReturn":None,
                "Destination":"TPE","Index":1,"Origin":"SFO","Selected":False,
                "FormatedDepartDate":"Wed, Jan 20, 2016","OriginCorrection":None,
                "DestinationCorrection":None,"OriginAll":False,"Flights":None,
            },
            {
                "DestinationAll":False,"returnARC":None,"connections":None,
                "nonStopOnly":True,"nonStop":True,"oneStop":False,"twoPlusStop":False,
                "ChangeType":0,"DepartDate":"Jan 26, 2016","ReturnDate":None,
                "PetIsTraveling":False,"PreferredTime":"","PreferredTimeReturn":None,
                "Destination":"SFO","Index":2,"Origin":"TPE","Selected":False,
                "FormatedDepartDate":"Tue, Jan 26, 2016","OriginCorrection":None,
                "DestinationCorrection":None,"OriginAll":False,"Flights":None,
            },
        ],
        "nonStopOnly":1,"CalendarOnly":False,"InitialShop":True,"IsSearchInjection":False,
        "CartId":cartid,"CellIdSelected":None,"BBXSession":None,"SolutionSetId":None,
        "SimpleSearch":True,"RequeryForUpsell":False,"RequeryForPOSChange":False,
        "YBMAlternateService":False,"ShowClassOfServiceListPreference":False,
        "SelectableUpgradesOriginal":None,"RegionalPremierUpgradeBalance":0,
        "GlobalPremierUpgradeBalance":0,"RegionalPremierUpgrades":None,
        "GlobalPremierUpgrades":None,"FormattedAccountBalance":None,"GovType":None,
        "TripTypes":0,"flexible":False,"flexibleAward":False,"FlexibleDaysAfter":0,
        "FlexibleDaysBefore":0,"hiddenPreferredConn":None,"hiddenUnpreferredConn":None,
        "carrierPref":0,"chkFltOpt":0,"portOx":0,"travelwPet":0,"NumberOfPets":0,
        "cabinType":0,"cabinSelection":"ECONOMY","awardCabinType":0,"FareTypes":0,
        "FareWheelOnly":False,"EditSearch":False,"buyUpgrade":0,"offerCode":None,
        "TVAOfferCodeLastName":None,"ClassofService":None,"UpgradeType":None,
        "BillingAddressCountryCode":None,"BillingAddressCountryDescription":None,
        "IsPassPlusFlex":False,"IsPassPlusSecure":False,"IsOffer":False,
        "IsMeetingWorks":False,"IsValidPromotion":False,"CalendarDateChange":None,
        "CoolAwardSpecials":False,"LastResultId":None,"IncludeLmx":False,"NGRP":False,
        "calendarStops":0,"isReshopPath":False,
    }
    # NOTE(review): passing a pre-serialized string via `data=` sends it as the
    # raw request body without a JSON Content-Type header; `json=params` would
    # serialize the dict and set that header. Kept as written because this
    # call is the behavior being asked about.
    redirect_endpoint = s.post('https://www.united.com/ual/en/us/flight-search/book-a-flight/flightshopping/getflightresults/rev',data=json.dumps(params))
    print(redirect_endpoint.text)  # denied!
在 `s.post` 调用中,您可能想用的是 `data=params`(发送表单数据)或 `json=params`(将 JSON 作为请求体发送)。`params` 关键字参数则用于查询字符串。
我正在尝试(一半用于教育目的,一半用于自己监控票价)在联合航空公司的网站上抓取特定航班的价格数据。
我已经用 selenium 成功实现了这一点,但那个实现非常笨拙。在这个过程中,我注意到初始重定向之后有一个 ajax 调用,它返回的 JSON 响应很规整,包含了我想要的一切。我试图把在开发者工具的网络选项卡中看到的相应 POST 参数传过去,直接访问该端点,但没有成功。随后我注意到有一个 'cart-id' 字段看起来是动态的,而其他字段看起来是静态的,于是我从重定向之前的表单提交页面中提取出它并放进 POST 数据里,但得到的响应仍然是状态:失败,"we're sorry but united.com was unable to complete your request"。
此时我不确定 POST 数据中还缺少什么。我还先用持久的会话对象访问了表单提交页面,以便设置 cookie,以为这会有所帮助,但没有用。我漏掉了什么?您可以在浏览器中打开下面的第一个 URL,查看网络选项卡,就能看到我想要的实际响应:第一个名为 'rev' 的 XHR 请求,其中既有我试图模仿的 POST 表单数据,也有我想要的 JSON。
# Reproduction script from the question.
# Assumes `requests`, `json`, and an `html` module exposing `fromstring`
# (presumably `lxml.html` -- TODO confirm) are already imported.
with requests.session() as s:
    # Load the search-results page first so the session collects its cookies.
    formsubmitpage = s.get('https://www.united.com/ual/en/us/flight-search/book-a-flight/results/rev?f=sfo&t=tpe&d=2016-01-20&r=2016-01-26&sc=1,7&px=1&taxng=1&idx=1')
    doc = html.fromstring(formsubmitpage.text)
    # The cart id appears to be dynamic, so it is scraped from the page
    # instead of being hard-coded.
    cartid = doc.xpath('//a[@class="no-rtad"]/@data-cartid')[0]
    print(cartid)
    # POST payload copied from the browser's network tab; only "CartId" is
    # filled in dynamically.
    params = {
        "Revise":False,"UnaccompaniedMinorDisclamer":False,"ConfirmationID":None,
        "searchTypeMain":"roundTrip","Origin":"sfo","Destination":"tpe",
        "DepartDate":"Jan 20, 2016","ReturnDate":"Jan 26, 2016","awardTravel":False,
        "MaxTrips":None,"numberOfTravelers":1,"numOfAdults":1,"numOfSeniors":0,
        "numOfChildren04":0,"numOfChildren03":0,"numOfChildren02":0,"numOfChildren01":0,
        "numOfInfants":0,"numOfLapInfants":0,"travelerCount":1,
        "revisedTravelerKeys":None,"revisedTravelers":None,"OriginalReservation":None,
        "RiskFreePolicy":None,"IsUnAccompaniedMinor":False,"MilitaryTravelType":None,
        "MilitaryOrGovernmentPersonnelStateCode":None,"tripLength":6,
        "IsParallelFareWheelCallEnabled":False,"flexMonth":None,"flexMonth2":None,
        "SortType":None,"cboMiles":None,"cboMiles2":None,
        "Trips":[
            {
                "DestinationAll":False,"returnARC":None,"connections":None,
                "nonStopOnly":True,"nonStop":True,"oneStop":False,"twoPlusStop":False,
                "ChangeType":0,"DepartDate":"Jan 20, 2016","ReturnDate":None,
                "PetIsTraveling":False,"PreferredTime":"","PreferredTimeReturn":None,
                "Destination":"TPE","Index":1,"Origin":"SFO","Selected":False,
                "FormatedDepartDate":"Wed, Jan 20, 2016","OriginCorrection":None,
                "DestinationCorrection":None,"OriginAll":False,"Flights":None,
            },
            {
                "DestinationAll":False,"returnARC":None,"connections":None,
                "nonStopOnly":True,"nonStop":True,"oneStop":False,"twoPlusStop":False,
                "ChangeType":0,"DepartDate":"Jan 26, 2016","ReturnDate":None,
                "PetIsTraveling":False,"PreferredTime":"","PreferredTimeReturn":None,
                "Destination":"SFO","Index":2,"Origin":"TPE","Selected":False,
                "FormatedDepartDate":"Tue, Jan 26, 2016","OriginCorrection":None,
                "DestinationCorrection":None,"OriginAll":False,"Flights":None,
            },
        ],
        "nonStopOnly":1,"CalendarOnly":False,"InitialShop":True,"IsSearchInjection":False,
        "CartId":cartid,"CellIdSelected":None,"BBXSession":None,"SolutionSetId":None,
        "SimpleSearch":True,"RequeryForUpsell":False,"RequeryForPOSChange":False,
        "YBMAlternateService":False,"ShowClassOfServiceListPreference":False,
        "SelectableUpgradesOriginal":None,"RegionalPremierUpgradeBalance":0,
        "GlobalPremierUpgradeBalance":0,"RegionalPremierUpgrades":None,
        "GlobalPremierUpgrades":None,"FormattedAccountBalance":None,"GovType":None,
        "TripTypes":0,"flexible":False,"flexibleAward":False,"FlexibleDaysAfter":0,
        "FlexibleDaysBefore":0,"hiddenPreferredConn":None,"hiddenUnpreferredConn":None,
        "carrierPref":0,"chkFltOpt":0,"portOx":0,"travelwPet":0,"NumberOfPets":0,
        "cabinType":0,"cabinSelection":"ECONOMY","awardCabinType":0,"FareTypes":0,
        "FareWheelOnly":False,"EditSearch":False,"buyUpgrade":0,"offerCode":None,
        "TVAOfferCodeLastName":None,"ClassofService":None,"UpgradeType":None,
        "BillingAddressCountryCode":None,"BillingAddressCountryDescription":None,
        "IsPassPlusFlex":False,"IsPassPlusSecure":False,"IsOffer":False,
        "IsMeetingWorks":False,"IsValidPromotion":False,"CalendarDateChange":None,
        "CoolAwardSpecials":False,"LastResultId":None,"IncludeLmx":False,"NGRP":False,
        "calendarStops":0,"isReshopPath":False,
    }
    # NOTE(review): passing a pre-serialized string via `data=` sends it as the
    # raw request body without a JSON Content-Type header; `json=params` would
    # serialize the dict and set that header. Kept as written because this
    # call is the behavior being asked about.
    redirect_endpoint = s.post('https://www.united.com/ual/en/us/flight-search/book-a-flight/flightshopping/getflightresults/rev',data=json.dumps(params))
    print(redirect_endpoint.text)  # denied!
在 `s.post` 调用中,您可能想用的是 `data=params`(发送表单数据)或 `json=params`(将 JSON 作为请求体发送)。`params` 关键字参数则用于查询字符串。