匿名请求被拒绝?
Anonymous request being denied?
我正在学习如何发出匿名 HTTP 请求,并且已经取得了一些成功,但我最近一次尝试的请求被服务器拒绝了(requesocks.exceptions.HTTPError: 400 Client Error)。我使用 Tor 来获取匿名 IP。这是我的代码:
from fake_useragent import UserAgent
import requests
import requesocks
def newUserAgent():
    """Store a randomly chosen User-Agent string in the shared HEADERS dict.

    Reads the ``random`` attribute of the module-level ``UA`` object and
    writes it under the 'User-Agent' key, replacing any previous value.
    Returns nothing.
    """
    random_agent = UA.random
    HEADERS.update({'User-Agent': random_agent})
def newUrl():
    """Build the realtor.ca map URL for the page after CURRENT_PAGE.

    All search parameters are placed after the '#' of the URL, and the page
    number appended at the end is ``CURRENT_PAGE + 1``.  The module-level
    CURRENT_PAGE is only read here; it is not modified.

    Returns:
        The assembled URL string.
    """
    next_page = CURRENT_PAGE + 1
    pieces = (
        'http://www.realtor.ca/Residential/Map.aspx#CultureId=1&ApplicationId=1&RecordsPerPage=9&MaximumResults=9&PropertyTypeId=300&TransactionTypeId=2',
        '&StoreyRange=0-0&OwnershipTypeGroupId=1&BuildingTypeId=1&BedRange=0-0&BathRange=0-0&LongitudeMin=-119.66980648040801&LongitudeMax=-119.58174419403106',
        '&LatitudeMin=49.822197219797346&LatitudeMax=49.84943388971021&SortOrder=A&SortBy=1&viewState=l&Longitude=-119.487716674805&Latitude=49.8434562683105',
        '&CurrentPage=',
        str(next_page),
    )
    return ''.join(pieces)
def getDataDict():
    """Request the realtor.ca search URL through the shared SESSION.

    Refreshes the User-Agent header, builds the target URL, prints the IP
    address reported by icanhazip.com, then issues a GET for the target URL
    using HEADERS.  Nothing is returned; an exception is raised when the
    response status indicates an error.
    """
    # Pick a fresh User-Agent and build the URL to fetch.
    newUserAgent()
    target = newUrl()

    # Show which IP address the outside world sees for this session.
    ip_reply = SESSION.get("http://icanhazip.com/")
    # A single parenthesised argument prints the same text whether `print`
    # is a statement or a function.
    print(" ".join(("visible IP is:", ip_reply.text)))

    # Fetch the target URL and raise if the response status signals an error.
    page_reply = SESSION.get(target, headers=HEADERS)
    page_reply.raise_for_status()
def main():
    """Script entry point: perform a single getDataDict() run."""
    getDataDict()
# ------------------------
# Module-level objects shared by the functions above
# ------------------------

# Page counter read by newUrl(), which asks for page CURRENT_PAGE + 1.
CURRENT_PAGE = 0

# fake_useragent object; newUserAgent() reads its `random` attribute.
UA = UserAgent()

# Headers sent with the realtor.ca request.  The 'User-Agent' entry is
# filled in later by newUserAgent().
HEADERS = {
    'Host': 'www.realtor.ca',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'http://www.realtor.ca/Residential/Map.aspx',
    'Content-Length': '411',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# Session object created by requesocks; every request in this script goes
# through it.
SESSION = requesocks.session()

# Send both HTTP and HTTPS traffic through the local Tor SOCKS5 proxy.
SESSION.proxies = dict.fromkeys(('http', 'https'), 'socks5://localhost:9050')

if __name__ == '__main__':
    main()
我在这里做错了什么? IP 打印正常(即对 http://icanhazip.com/ 的请求工作正常,但对后续 URL 的请求则不正常 - 有什么不同??)
服务器正在返回HTTP Error 400. The request URL is invalid.
您无法发出包含锚文本(#CultureId...)的 HTTP 请求。
井号 (#) 及其后的所有内容都不会在 HTTP 请求中发送。大量使用 Ajax 的网站(如您尝试访问的网站)会使用 JavaScript 读取锚文本,然后发出 Ajax 请求来更新内容。
快速查看他们的站点可以发现,锚文本中的参数是通过 Ajax 请求发送到 URL http://www.realtor.ca/api/Listing.svc/PropertySearch_Post 的,并且锚文本的内容放在 POST 请求的正文中。
截图:
从 cookie 的情况来看,您需要先向 /Residential/Map.aspx 发出请求以建立会话 cookie,然后再带上您的搜索参数向 PropertySearch_Post URL 发出请求。它会返回一个 JSON 响应,您必须对其进行解析才能对搜索结果做进一步处理。
编辑:此代码对我有用(打印成功的 JSON 响应和结果)
import requests
import requesocks
def newUserAgent():
    """Set the 'User-Agent' entry of HEADERS to a fixed Firefox-style string.

    HEADERS is modified in place; nothing is returned.
    """
    HEADERS.update({'User-Agent': 'Mozilla/5.0 (Ubuntu; Firefox=41)'})
def newUrl():
    """Return the URL of the realtor.ca map page.

    This page is requested before the search itself.  The path ends in
    ``.aspx``, matching the 'Referer' header used by this script; the
    previous ``.asp`` spelling did not match it.  No page counter is read
    or changed here.

    Returns:
        The map page URL as a string.
    """
    return 'http://www.realtor.ca/Residential/Map.aspx'
def getDataDict():
    """Run one property search against realtor.ca through the shared SESSION.

    Steps:
      1. Refresh the User-Agent header and print the IP address reported by
         icanhazip.com.
      2. GET the map page first, so that the session holds the site's
         cookies before the search is sent.
      3. POST the search parameters to the PropertySearch_Post endpoint and
         print the response body (expected to be JSON text).

    Returns nothing.  Raises the session library's HTTP error when either
    the map-page GET or the search POST comes back with an error status.
    """
    # Reset User-Agent in HEADERS and get the map page URL.
    newUserAgent()
    url = newUrl()

    # Check visible IP.  A single parenthesised argument prints the same
    # text whether `print` is a statement or a function.
    ip = SESSION.get("http://icanhazip.com/")
    print("visible IP is: %s" % ip.text)

    # Request the map page; raise if the response status signals an error.
    response = SESSION.get(url, headers=HEADERS)
    response.raise_for_status()

    # Search parameters go in the POST body, not in the URL fragment.
    payload = {
        'CultureId': '1',
        'ApplicationId': '1',
        'RecordsPerPage': '9',
        'MaximumResults': '9',
        'PropertyTypeId': '300',
        'TransactionTypeId': '2',
        'StoreyRange': '0-0',
        'OwnershipTypeGroupId': '1',
        'BuildingTypeId': '1',
        'BedRange': '0-0',
        'BathRange': '0-0',
        'LongitudeMin': '-119.66980648040801',
        'LongitudeMax': '-119.58174419403106',
        'LatitudeMin': '49.822197219797346',
        'LatitudeMax': '49.84943388971021',
        'SortOrder': 'A',
        'SortBy': '1',
        'viewState': 'l',
        'Longitude': '-119.487716674805',
        'Latitude': '49.8434562683105',
        # Derived from the module-level counter; with CURRENT_PAGE = 0 this
        # is '1', the value that used to be hard-coded.
        'CurrentPage': str(CURRENT_PAGE + 1),
    }
    response = SESSION.post(
        'http://www.realtor.ca/api/Listing.svc/PropertySearch_Post',
        data=payload,
        headers=HEADERS,
    )
    # Check the search response as well, so an error body is not printed as
    # if it were a search result.
    response.raise_for_status()
    print(response.text)
def main():
    """Script entry point: perform a single getDataDict() run."""
    getDataDict()
# ------------------------
# Module-level objects shared by the functions above
# ------------------------

# Page counter for the search (starts at 0).
CURRENT_PAGE = 0

# Headers sent with the realtor.ca requests.  The 'User-Agent' entry is
# added by newUserAgent().
HEADERS = {
    'Host': 'www.realtor.ca',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': 'http://www.realtor.ca/Residential/Map.aspx',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# newUserAgent() has no return statement, so UA is bound to None; the call
# matters only for its side effect of filling HEADERS['User-Agent'].
UA = newUserAgent()

# Session object created by requesocks; every request in this script goes
# through it.
SESSION = requesocks.session()

# Send both HTTP and HTTPS traffic through the local Tor SOCKS5 proxy.
SESSION.proxies = dict.fromkeys(('http', 'https'), 'socks5://localhost:9050')

if __name__ == '__main__':
    main()
我正在学习如何发出匿名 HTTP 请求,并且已经取得了一些成功,但我最近一次尝试的请求被服务器拒绝了(requesocks.exceptions.HTTPError: 400 Client Error)。我使用 Tor 来获取匿名 IP。这是我的代码:
from fake_useragent import UserAgent
import requests
import requesocks
def newUserAgent():
    """Store a randomly chosen User-Agent string in the shared HEADERS dict.

    Reads the ``random`` attribute of the module-level ``UA`` object and
    writes it under the 'User-Agent' key, replacing any previous value.
    Returns nothing.
    """
    random_agent = UA.random
    HEADERS.update({'User-Agent': random_agent})
def newUrl():
    """Build the realtor.ca map URL for the page after CURRENT_PAGE.

    All search parameters are placed after the '#' of the URL, and the page
    number appended at the end is ``CURRENT_PAGE + 1``.  The module-level
    CURRENT_PAGE is only read here; it is not modified.

    Returns:
        The assembled URL string.
    """
    next_page = CURRENT_PAGE + 1
    pieces = (
        'http://www.realtor.ca/Residential/Map.aspx#CultureId=1&ApplicationId=1&RecordsPerPage=9&MaximumResults=9&PropertyTypeId=300&TransactionTypeId=2',
        '&StoreyRange=0-0&OwnershipTypeGroupId=1&BuildingTypeId=1&BedRange=0-0&BathRange=0-0&LongitudeMin=-119.66980648040801&LongitudeMax=-119.58174419403106',
        '&LatitudeMin=49.822197219797346&LatitudeMax=49.84943388971021&SortOrder=A&SortBy=1&viewState=l&Longitude=-119.487716674805&Latitude=49.8434562683105',
        '&CurrentPage=',
        str(next_page),
    )
    return ''.join(pieces)
def getDataDict():
    """Request the realtor.ca search URL through the shared SESSION.

    Refreshes the User-Agent header, builds the target URL, prints the IP
    address reported by icanhazip.com, then issues a GET for the target URL
    using HEADERS.  Nothing is returned; an exception is raised when the
    response status indicates an error.
    """
    # Pick a fresh User-Agent and build the URL to fetch.
    newUserAgent()
    target = newUrl()

    # Show which IP address the outside world sees for this session.
    ip_reply = SESSION.get("http://icanhazip.com/")
    # A single parenthesised argument prints the same text whether `print`
    # is a statement or a function.
    print(" ".join(("visible IP is:", ip_reply.text)))

    # Fetch the target URL and raise if the response status signals an error.
    page_reply = SESSION.get(target, headers=HEADERS)
    page_reply.raise_for_status()
def main():
    """Script entry point: perform a single getDataDict() run."""
    getDataDict()
# ------------------------
# Module-level objects shared by the functions above
# ------------------------

# Page counter read by newUrl(), which asks for page CURRENT_PAGE + 1.
CURRENT_PAGE = 0

# fake_useragent object; newUserAgent() reads its `random` attribute.
UA = UserAgent()

# Headers sent with the realtor.ca request.  The 'User-Agent' entry is
# filled in later by newUserAgent().
HEADERS = {
    'Host': 'www.realtor.ca',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'http://www.realtor.ca/Residential/Map.aspx',
    'Content-Length': '411',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# Session object created by requesocks; every request in this script goes
# through it.
SESSION = requesocks.session()

# Send both HTTP and HTTPS traffic through the local Tor SOCKS5 proxy.
SESSION.proxies = dict.fromkeys(('http', 'https'), 'socks5://localhost:9050')

if __name__ == '__main__':
    main()
我在这里做错了什么? IP 打印正常(即对 http://icanhazip.com/ 的请求工作正常,但对后续 URL 的请求则不正常 - 有什么不同??)
服务器正在返回HTTP Error 400. The request URL is invalid.
您无法发出包含锚文本(#CultureId...)的 HTTP 请求。
井号 (#) 及其后的所有内容都不会在 HTTP 请求中发送。大量使用 Ajax 的网站(如您尝试访问的网站)会使用 JavaScript 读取锚文本,然后发出 Ajax 请求来更新内容。
快速查看他们的站点可以发现,锚文本中的参数是通过 Ajax 请求发送到 URL http://www.realtor.ca/api/Listing.svc/PropertySearch_Post 的,并且锚文本的内容放在 POST 请求的正文中。
截图:
从 cookie 的情况来看,您需要先向 /Residential/Map.aspx 发出请求以建立会话 cookie,然后再带上您的搜索参数向 PropertySearch_Post URL 发出请求。它会返回一个 JSON 响应,您必须对其进行解析才能对搜索结果做进一步处理。
编辑:此代码对我有用(打印成功的 JSON 响应和结果)
import requests
import requesocks
def newUserAgent():
    """Set the 'User-Agent' entry of HEADERS to a fixed Firefox-style string.

    HEADERS is modified in place; nothing is returned.
    """
    HEADERS.update({'User-Agent': 'Mozilla/5.0 (Ubuntu; Firefox=41)'})
def newUrl():
    """Return the URL of the realtor.ca map page.

    This page is requested before the search itself.  The path ends in
    ``.aspx``, matching the 'Referer' header used by this script; the
    previous ``.asp`` spelling did not match it.  No page counter is read
    or changed here.

    Returns:
        The map page URL as a string.
    """
    return 'http://www.realtor.ca/Residential/Map.aspx'
def getDataDict():
    """Run one property search against realtor.ca through the shared SESSION.

    Steps:
      1. Refresh the User-Agent header and print the IP address reported by
         icanhazip.com.
      2. GET the map page first, so that the session holds the site's
         cookies before the search is sent.
      3. POST the search parameters to the PropertySearch_Post endpoint and
         print the response body (expected to be JSON text).

    Returns nothing.  Raises the session library's HTTP error when either
    the map-page GET or the search POST comes back with an error status.
    """
    # Reset User-Agent in HEADERS and get the map page URL.
    newUserAgent()
    url = newUrl()

    # Check visible IP.  A single parenthesised argument prints the same
    # text whether `print` is a statement or a function.
    ip = SESSION.get("http://icanhazip.com/")
    print("visible IP is: %s" % ip.text)

    # Request the map page; raise if the response status signals an error.
    response = SESSION.get(url, headers=HEADERS)
    response.raise_for_status()

    # Search parameters go in the POST body, not in the URL fragment.
    payload = {
        'CultureId': '1',
        'ApplicationId': '1',
        'RecordsPerPage': '9',
        'MaximumResults': '9',
        'PropertyTypeId': '300',
        'TransactionTypeId': '2',
        'StoreyRange': '0-0',
        'OwnershipTypeGroupId': '1',
        'BuildingTypeId': '1',
        'BedRange': '0-0',
        'BathRange': '0-0',
        'LongitudeMin': '-119.66980648040801',
        'LongitudeMax': '-119.58174419403106',
        'LatitudeMin': '49.822197219797346',
        'LatitudeMax': '49.84943388971021',
        'SortOrder': 'A',
        'SortBy': '1',
        'viewState': 'l',
        'Longitude': '-119.487716674805',
        'Latitude': '49.8434562683105',
        # Derived from the module-level counter; with CURRENT_PAGE = 0 this
        # is '1', the value that used to be hard-coded.
        'CurrentPage': str(CURRENT_PAGE + 1),
    }
    response = SESSION.post(
        'http://www.realtor.ca/api/Listing.svc/PropertySearch_Post',
        data=payload,
        headers=HEADERS,
    )
    # Check the search response as well, so an error body is not printed as
    # if it were a search result.
    response.raise_for_status()
    print(response.text)
def main():
    """Script entry point: perform a single getDataDict() run."""
    getDataDict()
# ------------------------
# Module-level objects shared by the functions above
# ------------------------

# Page counter for the search (starts at 0).
CURRENT_PAGE = 0

# Headers sent with the realtor.ca requests.  The 'User-Agent' entry is
# added by newUserAgent().
HEADERS = {
    'Host': 'www.realtor.ca',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': 'http://www.realtor.ca/Residential/Map.aspx',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# newUserAgent() has no return statement, so UA is bound to None; the call
# matters only for its side effect of filling HEADERS['User-Agent'].
UA = newUserAgent()

# Session object created by requesocks; every request in this script goes
# through it.
SESSION = requesocks.session()

# Send both HTTP and HTTPS traffic through the local Tor SOCKS5 proxy.
SESSION.proxies = dict.fromkeys(('http', 'https'), 'socks5://localhost:9050')

if __name__ == '__main__':
    main()