Fetching and Parsing proxies from URLs in Python3
I'm trying to fetch and parse proxies from different proxy-list websites.
Here's what I've done so far:
#!/usr/bin/python3
from tqdm import tqdm
import requests  # was missing: requests.get is used below
import time
import sys
import re

proxies = []

def fetchAndParseProxies(url, custom_regex):
    n = 0
    try:
        proxylist = requests.get(url, timeout=15).text
        proxylist = proxylist.replace('null', '"N/A"')
        # expand the %ip%/%port% placeholders into capturing groups
        custom_regex = custom_regex.replace('%ip%', r'([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})')
        custom_regex = custom_regex.replace('%port%', '([0-9]{1,5})')
        for proxy in re.findall(re.compile(custom_regex), proxylist):
            proxies.append(proxy[0] + ":" + proxy[1])
            n += 1
    except:  # note: a bare except also swallows network errors silently
        sys.stdout.write("{0: >5} proxies fetched from {1}\n".format('0', url))
proxysources = [
["http://spys.one/en","tr><td>%ip%<\/td><td>%port%<\/td><td>(.*?){2}<\/td><td class='hm'>.*?<\/td><td>.*?<\/td><td class='hm'>.*?<\/td><td class='hx'>(.*?)<\/td><td class='hm'>.*?<\/td><\/tr>"],
#["http://www.httptunnel.ge/ProxyListForFree.aspx"," target=\"_new\">%ip%:%port%</a>"],
#["https://www.us-proxy.org/", "<tr><td>%ip%<\/td><td>%port%<\/td><td>(.*?){2}<\/td><td class='hm'>.*?<\/td><td>.*?<\/td><td class='hm'>.*?<\/td><td class='hx'>(.*?)<\/td><td class='hm'>.*?<\/td><\/tr>"],
#["https://free-proxy-list.net/", "<tr><td>%ip%<\/td><td>%port%<\/td><td>(.*?){2}<\/td><td class='hm'>.*?<\/td><td>.*?<\/td><td class='hm'>.*?<\/td><td class='hx'>(.*?)<\/td><td class='hm'>.*?<\/td><\/tr>"],
#["https://www.sslproxies.org/", "<tr><td>%ip%<\/td><td>%port%<\/td><td>(.*?){2}<\/td><td class='hm'>.*?<\/td><td>.*?<\/td><td class='hm'>.*?<\/td><td class='hx'>(.*?)<\/td><td class='hm'>.*?<\/td><\/tr>"],
#["https://www.proxy-list.download/api/v0/get?l=en&t=https", '"IP": "%ip%", "PORT": "%port%",'],
#["https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=5000&country=all&anonymity=elite&ssl=all", "%ip%:%port%"],
#["http://free-proxy.cz/en/proxylist/country/all/http/ping/level1", "<tr><td>%IP%<\/td><td>%Port%<\/td><td>(.*?){2}<\/td><td class='hm'>.*?<\/td><td>.*?<\/td><td class='hm'>.*?<\/td><td class='hx'>(.*?)<\/td><td class='hm'>.*?<\/td><\/tr>"],
["https://www.proxy-list.download/HTTPS", "<tr><td>%ip%<\/td><td>%port%<\/td><td>(.*?){2}<\/td><td class='hm'>.*?<\/td><td>.*?<\/td><td class='hm'>.*?<\/td><td class='hx'>(.*?)<\/td><td class='hm'>.*?<\/td><\/tr>"],
["https://www.proxy-list.download/HTTP", "<tr><td>%ip%<\/td><td>%port%<\/td><td>(.*?){2}<\/td><td class='hm'>.*?<\/td><td>.*?<\/td><td class='hm'>.*?<\/td><td class='hx'>(.*?)<\/td><td class='hm'>.*?<\/td><\/tr>"],
["http://www.freeproxylists.net/", "<tr><td>%ip%<\/td><td>%port%<\/td><td>(.*?){2}<\/td><td class='hm'>.*?<\/td><td>.*?<\/td><td class='hm'>.*?<\/td><td class='hx'>(.*?)<\/td><td class='hm'>.*?<\/td><\/tr>"],
["https://www.proxynova.com/proxy-server-list/", "<tr><td>%ip%<\/td><td>%port%<\/td><td>(.*?){2}<\/td><td class='hm'>.*?<\/td><td>.*?<\/td><td class='hm'>.*?<\/td><td class='hx'>(.*?)<\/td><td class='hm'>.*?<\/td><\/tr>"],
["http://www.freeproxylists.net/", "<tr><td>%ip%<\/td><td>%port%<\/td><td>(.*?){2}<\/td><td class='hm'>.*?<\/td><td>.*?<\/td><td class='hm'>.*?<\/td><td class='hx'>(.*?)<\/td><td class='hm'>.*?<\/td><\/tr>"],
]
loop = tqdm(total=len(proxysources), position=0, leave=False)

for source in proxysources:
    loop.set_description('fetching...')
    fetchAndParseProxies(source[0], source[1])
    loop.update(1)

loop.close()

print(len(proxies), " Proxies Fetched.")
My output:

0 Proxies Fetched.

As you can see, the problem is that it shows 0 Proxies Fetched for the uncommented lines, even though the site structure looks the same to me. I must have made a mistake somewhere in my regex, but I can't find where. Help is much appreciated. In the meantime I'll keep looking at it myself and will update the post if I find anything.
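One quick way to narrow this down (a debugging sketch, not part of the original script) is to look at what the regex actually receives: print the HTTP status and the first few hundred characters of each response. If a site blocks the default requests User-Agent or renders its table with JavaScript, the expected <tr><td> rows never appear in the fetched HTML at all. The helper name checkSource is made up for illustration:

import requests

def checkSource(url):
    # Fetch the page and show what a regex would actually see.
    r = requests.get(url, timeout=15)
    print(url, '->', r.status_code, len(r.text), 'bytes')
    print(r.text[:300])  # enough to spot a block page or an empty JS shell

checkSource('https://www.proxy-list.download/HTTPS')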
This script will fetch proxies from http://spys.one/en, but a similar method could be used for other proxy lists:
import requests
from bs4 import BeautifulSoup

ports_url = 'http://spys.one/proxy-port/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

# the xpp form field selects the page size; the site expects it via POST
soup = BeautifulSoup(requests.post(ports_url, headers=headers, data={'xpp': 5}).content, 'html.parser')

# each font.spy6 link under a two-column cell leads to a per-port list page
for f in soup.select('td[colspan="2"] > a > font.spy6'):
    u = 'http://spys.one/proxy-port/' + f.text + '/'
    s = BeautifulSoup(requests.post(u, headers=headers, data={'xpp': 5}).content, 'html.parser')
    # the first column of each row holds the ip:port text
    for ff in s.select('tr > td:nth-child(1) > font.spy14'):
        print(ff.text)
Prints:
81.17.131.61:8080
200.108.183.2:8080
105.209.182.128:8080
45.77.63.202:8080
94.158.152.54:8080
50.233.228.147:8080
142.44.148.56:8080
52.138.1.43:8080
68.183.202.221:8080
103.52.135.60:8080
104.238.174.173:8080
181.129.219.133:8080
183.89.147.40:8080
51.38.71.101:8080
103.112.61.162:8080
131.221.228.9:8080
49.0.65.246:8080
45.32.176.57:8080
104.238.185.153:8080
155.138.146.210:8080
203.76.124.35:8080
182.253.6.234:8080
36.90.93.20:8080
207.182.135.52:8080
165.16.109.50:8080
202.142.178.98:8080
103.123.246.66:8080
185.36.157.30:8080
103.104.213.227:8080
68.188.63.149:8080
136.244.113.206:3128
54.39.91.84:3128
198.13.36.75:3128
93.153.173.102:3128
161.35.110.112:3128
... and so on.
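A natural follow-up is to collect these results into a list (like the proxies list from the question) and drop dead entries before using them. The sketch below is an assumption, not part of the answer above: the 2-second timeout and the http://httpbin.org/ip test URL are arbitrary choices, and isAlive is a made-up helper name.

import requests

def isAlive(proxy, timeout=2):
    # Route a cheap request through the proxy; any error counts as dead.
    try:
        r = requests.get('http://httpbin.org/ip',
                         proxies={'http': 'http://' + proxy,
                                  'https': 'http://' + proxy},
                         timeout=timeout)
        return r.ok
    except requests.RequestException:
        return False

proxies = ['81.17.131.61:8080', '200.108.183.2:8080']  # e.g. collected above
working = [p for p in proxies if isAlive(p)]
print(len(working), 'working proxies')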