Setting up proxy according to url in Scrapy
I have a list of URLs, some of which are .onion sites and the rest regular clearnet sites. I'd like to know whether there is a way to configure Scrapy so that, depending on the URL, it uses either a dedicated transparent web proxy for the ordinary .com and .net sites or a SOCKS5 proxy for the .onion sites.
import time
from random import choice

import pandas as pd
import scrapy

# clean_url and StatusCehckerItem are defined elsewhere in the project.


def random_dedicate_proxy():
    dedicated_ips = [proxy1, proxy2, proxy3]
    dedicated_proxies = [{'http': 'http://' + ip, 'https': 'https://' + ip} for ip in dedicated_ips]
    return choice(dedicated_proxies)


def proxy_selector(url):
    TOR_CLIENT = 'socks5h://127.0.0.1:9050'
    if '.onion' in url:
        proxy = {'http': TOR_CLIENT, 'https': TOR_CLIENT}
    else:
        proxy = random_dedicate_proxy()
    return proxy


def get_urls_from_spreadsheet():
    fname = 'list_of_stuff.csv'
    url_df = pd.read_csv(fname, usecols=['URL'], keep_default_na=False)
    URL = url_df.URL.dropna()
    urls = [clean_url(url) for url in URL if url != '']
    return urls


class BasicSpider(scrapy.Spider):
    name = "basic"
    rotate_user_agent = True
    start_urls = get_urls_from_spreadsheet()

    def parse(self, response):
        item = StatusCehckerItem()
        item['url'] = response.url
        item['status_code'] = response.status
        item['time'] = time.time()
        response.meta['proxy'] = proxy_selector(response.url)
        return item
When I run this code I get DNSLookupError: DNS lookup failed: no results for hostname lookup: mqqrfjmfu2i73bjq.onion/.
Make sure HTTPPROXY_ENABLED is set to True in the spider's settings. Then choose the proxy for each URL in your start_requests method: setting response.meta['proxy'] inside parse is too late, because by the time parse runs the request has already been made, which is why the .onion hostname fails DNS resolution.
class BasicSpider(scrapy.Spider):
    custom_settings = {
        'HTTPPROXY_ENABLED': True,  # can also be set in the settings.py file
    }
    name = "basic"
    rotate_user_agent = True

    def start_requests(self):
        urls = get_urls_from_spreadsheet()
        for url in urls:
            # Pick the proxy for this URL before the request is issued.
            proxy = proxy_selector(url)
            yield scrapy.Request(url=url, callback=self.parse, meta={'proxy': proxy})

    def parse(self, response):
        item = StatusCehckerItem()
        item['url'] = response.url
        item['status_code'] = response.status
        item['time'] = time.time()
        return item
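One caveat worth noting: Scrapy's HttpProxyMiddleware expects request.meta['proxy'] to be a single proxy URL string, not a requests-style {'http': ..., 'https': ...} dict, so proxy_selector may need to return a plain string. Below is a minimal sketch of such a variant; the dedicated proxy endpoints are placeholders for your own addresses, and whether the socks5h:// scheme is handled directly depends on your Scrapy/downloader setup.

from random import choice

# Hypothetical dedicated proxy endpoints -- replace with your own.
DEDICATED_PROXIES = [
    'http://proxy1:8080',
    'http://proxy2:8080',
    'http://proxy3:8080',
]

TOR_CLIENT = 'socks5h://127.0.0.1:9050'

def proxy_selector(url):
    # Return a single proxy URL string, which is what request.meta['proxy'] expects.
    if '.onion' in url:
        return TOR_CLIENT
    return choice(DEDICATED_PROXIES)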