Building a proxy rotator with specific URL and script
I'm trying to build a proxy rotator from existing code that was built for a different URL.
The URLs I want are in the code sample below. I'm trying to get the provided script to call the desired URLs and fetch ALL 'IP:PORT' pairs where the proxy type is "HTTPS" (the current script is capped at 10).
Either XPath or bs4 is fine, though I'm more familiar with bs4.
I understand the logic involved, but not how to build it. I started by trying to strip the strings and call specific td elements, but that didn't work.
# URLs I want
url_list = ['http://spys.one/free-proxy-list/US/', 'http://spys.one/free-proxy-list/US/1/']

# code I have
from lxml.html import fromstring
import requests
from itertools import cycle

def get_proxies():
    url = 'https://free-proxy-list.net/'
    response = requests.get(url)
    parser = fromstring(response.text)
    proxies = set()
    # only the first 10 table rows are checked
    for i in parser.xpath('//tbody/tr')[:10]:
        # td[7] is the "Https" column
        if i.xpath('.//td[7][contains(text(),"yes")]'):
            proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
            proxies.add(proxy)
    return proxies

proxies = get_proxies()
proxy_pool = cycle(proxies)
proxy = next(proxy_pool)
# 'url' was undefined at this scope in the original; an example target is used here
url = 'https://httpbin.org/ip'
response = requests.get(url, proxies={"http": proxy, "https": proxy})
I'd like to understand how the provided code would be built to hit the 2 desired URLs and return ALL IP:PORT entries where the proxy type is HTTPS.
One way is to issue port-specific POST requests in a loop and collect everything into one final list. The endpoint is already HTTPS-specific, and the form's 'xf4' field appears to select the port filter (here 1 = 3128, 2 = 8080, 3 = 80) while 'xpp' sets the page size.
import re
import requests

def get_proxies(number, port, p):
    # 'xf4' selects the port filter; 'xpp': 5 asks for the largest page size
    r = requests.post('http://spys.one/en/https-ssl-proxy/', data={'xpp': 5, 'xf4': number})
    # the regex captures the IP that appears just before the port-writing <script>
    proxies = [':'.join([i, port]) for i in p.findall(r.text)]
    return proxies

ports = ['3128', '8080', '80']
p = re.compile(r'spy14>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<script')
proxies = []

for number, port in enumerate(ports, 1):
    proxies += get_proxies(number, port, p)
print(proxies)
Example results: a flat list of 'IP:PORT' strings.
For a specific country:
import re
import requests
from bs4 import BeautifulSoup as bs

def get_proxies(number, port, p, country):
    r = requests.post('http://spys.one/en/https-ssl-proxy/', data={'xpp': 5, 'xf4': number})
    soup = bs(r.content, 'lxml')
    # keep rows whose country cell matches, then pull the IP from the script-bearing cell
    # (newer soupsieve releases spell ':contains' as ':-soup-contains')
    proxies = [':'.join([p.findall(i.text)[0], port]) for i in
               soup.select('table table tr:has(.spy14:contains("' + country + '")) td:has(script) .spy14')]
    return proxies

ports = ['3128', '8080', '80']
p = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})document')
proxies = []

for number, port in enumerate(ports, 1):
    proxies += get_proxies(number, port, p, 'United States')
print(proxies)
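Whichever variant you run, free proxies go stale quickly, so you may want to verify each 'IP:PORT' before rotating through it. Here is a minimal liveness check over the 'proxies' list built above, assuming 'https://httpbin.org/ip' as a hypothetical test endpoint (not part of the original answer):

import requests

def is_alive(proxy, test_url='https://httpbin.org/ip', timeout=5):
    # one request through the proxy; any error or bad status counts as dead
    try:
        return requests.get(test_url,
                            proxies={'http': proxy, 'https': proxy},
                            timeout=timeout).ok
    except requests.RequestException:
        return False

live_proxies = [prx for prx in proxies if is_alive(prx)]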
As for the script you already have, I'll refer back to my original answer:
import requests
from bs4 import BeautifulSoup as bs

def get_proxies():
    r = requests.get('https://free-proxy-list.net/')
    soup = bs(r.content, 'lxml')
    # keep rows whose 'Https' cell (class .hx) reads "yes"; the first two cells are IP and port
    proxies = {tr.td.text + ':' + tr.td.next_sibling.text
               for tr in soup.select('tr:has(.hx:contains(yes))')}
    return proxies

get_proxies()
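To close the loop on the rotator itself, here is a minimal sketch of feeding any of the scrapers above into the cycle-based pool from the question; 'https://httpbin.org/ip' is only a stand-in target and the retry-on-failure handling is an assumption, not part of the original code:

import requests
from itertools import cycle

proxies = get_proxies()            # set of 'IP:PORT' strings from any scraper above
proxy_pool = cycle(proxies)

url = 'https://httpbin.org/ip'     # hypothetical target, just to prove rotation works
for _ in range(5):
    proxy = next(proxy_pool)
    try:
        r = requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=5)
        print(proxy, '->', r.json()['origin'])
    except requests.RequestException:
        # dead proxy: move on to the next one in the pool
        print(proxy, 'failed, rotating')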