How to resolve DNS A records using hundreds of servers as fast as possible?
So far I have used this:
import os

from selenium import webdriver
from selenium.webdriver import FirefoxProfile
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options

class DNS_LOOKUP:
    ROBTEX_IPLOOKUP = 'https://www.robtex.com/ip-lookup/'
    ROBTEX_HEAD = '//section[1]/div[3]/p/a'
    ROBTEX_TABLE = '//section[2]/div[3]/table/tbody/tr/td//a'
    NSLOOKUP_IPV4 = '//div[2]/div[1]/table/tbody/tr/td[2]/span[1]'
    NSLOOKUP_IPV6 = '//div[2]/div[2]/table/tbody/tr/td[2]/span[1]'
    NSLOOKUP_SOURCES = ['cloudflare', 'google', 'opendns', 'authoritative']

    def __init__(self):
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--log-level=3")
        options.add_argument("--mute-audio")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument('--disable-extensions')
        options.add_argument('--disable-gpu')
        capabilities = DesiredCapabilities().FIREFOX
        capabilities['pageLoadStrategy'] = 'eager'
        # raw string: without it, the '\b' in the profile path is a backspace escape
        profile = FirefoxProfile(os.environ['appdata'] + r'\Mozilla\Firefox\Profiles\bkpihn0o.bot')
        profile.set_preference("http.response.timeout", 1)
        profile.set_preference("dom.max_script_run_time", 0)
        profile.set_preference('permissions.default.stylesheet', 2)
        profile.set_preference('permissions.default.image', 2)
        profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
        profile.set_preference("permissions.default.script", 2)
        profile.set_preference("javascript.enabled", False)
        self.Firefox = webdriver.Firefox(capabilities=capabilities, options=options, firefox_profile=profile)
        self.AltFirefox = webdriver.Firefox(capabilities=capabilities)

    def _robtex(self, addr):
        self.Firefox.get(f'https://www.robtex.com/dns-lookup/{addr}')
        ips = {href.removeprefix(DNS_LOOKUP.ROBTEX_IPLOOKUP) for e in self.Firefox.find_elements('xpath', DNS_LOOKUP.ROBTEX_HEAD) if (href := e.get_attribute('href')).startswith(DNS_LOOKUP.ROBTEX_IPLOOKUP)}
        ips |= {href.removeprefix(DNS_LOOKUP.ROBTEX_IPLOOKUP) for e in self.Firefox.find_elements('xpath', DNS_LOOKUP.ROBTEX_TABLE) if (href := e.get_attribute('href')).startswith(DNS_LOOKUP.ROBTEX_IPLOOKUP)}
        ipv4, ipv6 = set(), set()
        for i in sorted(ips):
            if IPV4.match(i):  # IPV4 regex and the is_ipv6 helper are defined elsewhere in my code
                ipv4.add(i)
            elif is_ipv6(i):
                ipv6.add(i)
        return ipv4, ipv6

    def _nslookup(self, addr):
        ipv4, ipv6 = set(), set()
        for source in DNS_LOOKUP.NSLOOKUP_SOURCES:
            self.AltFirefox.get(f'https://www.nslookup.io/dns-records/{addr}#{source}')
            ipv4 |= {ip for e in self.AltFirefox.find_elements('xpath', DNS_LOOKUP.NSLOOKUP_IPV4) if IPV4.match((ip := e.text))}
            ipv6 |= {ip for e in self.AltFirefox.find_elements('xpath', DNS_LOOKUP.NSLOOKUP_IPV6) if is_ipv6((ip := e.text))}
        return ipv4, ipv6

    def dns_query(self, addr):
        robtex = self._robtex(addr)
        nslookup = self._nslookup(addr)
        ipv4, ipv6 = robtex
        ipv4 |= nslookup[0]
        ipv6 |= nslookup[1]
        return {'ipv4': sorted(ipv4), 'ipv6': sorted(ipv6)}
This method returns a lot of addresses, but sadly not enough, and as you can see it uses selenium instead of requests, so it is slow. Well, to be honest, I have tested this extensively and rigorously over and over, and selenium is always faster than requests for me. But its speed is still unacceptable.
I have also written this:
import dns.resolver

resolver = dns.resolver.Resolver()
resolver.nameservers = ['8.8.8.8']

def dns_resolve(address):
    # repeat the query four times and merge whatever single addresses come back
    return sorted({resolver.query(address)[0].address for i in range(4)})
This is much faster, but each query returns only one address per server, so I repeated the operation four times; I would like each query to return at least 4 addresses per server...
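One thing worth noting: the answer object holds the whole RRset, so iterating over it instead of taking [0] should yield every A record the server returned in a single query. A minimal sketch, assuming dnspython 2.x (where query is renamed resolve):

import dns.resolver

resolver = dns.resolver.Resolver()
resolver.nameservers = ['8.8.8.8']

def dns_resolve_all(address):
    # a single query; the answer RRset may already contain several A records
    return sorted({r.address for r in resolver.resolve(address, 'A')})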
I even wrote this:
import json
import requests

def manual_resolve(address):
    return [i['data'] for i in json.loads(requests.get(f'https://dns.google/resolve?name={address}&type=A').text)['Answer']]
It is the lowest level I can get, but as I said before, under my network conditions requests is actually slower than selenium, much slower...
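For reference, the same DoH call can be written with params and json(), plus a guard for responses that carry no Answer section; this is just a sketch of the identical request, not a different approach (the timeout value is an assumption):

import requests

def manual_resolve(address, timeout=2.0):
    # query Google's DNS-over-HTTPS JSON API for A records
    resp = requests.get('https://dns.google/resolve',
                        params={'name': address, 'type': 'A'},
                        timeout=timeout)
    return [i['data'] for i in resp.json().get('Answer', [])]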
So I want to know what the fastest way is to query DNS A records using multiple servers, and by multiple I mean a large number;

I got 5556 trustworthy nameservers from here: https://public-dns.info/nameservers.csv (the file the address points to may change over time; the version I downloaded had 5556 entries), and I used this script to process the information:
import csv
import json
import ping3
import re
import pickle
import subprocess
import time
from collections import namedtuple
from datetime import datetime
from pathlib import Path

IPV4 = re.compile(r'^((25[0-5]|2[0-4]\d|1?\d\d?)\.){3}(25[0-5]|2[0-4]\d|1?\d\d?)$')

publicdns = Path('C:/Users/Estranger/Downloads/nameservers.csv').read_text(encoding='utf8').splitlines()
publicdns = list(csv.reader(publicdns))
to_date = lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ')
Entry = namedtuple('Entry', publicdns[0])
deserializer = [str, str, int, str, str, str, str, str, bool, float, to_date, to_date]
publicdns = [Entry(*(f(v) for f, v in zip(deserializer, i))) for i in publicdns[1:]]
Path('D:/nameservers.pickle').write_bytes(pickle.dumps(publicdns, protocol=pickle.HIGHEST_PROTOCOL))
IPV4_DNS = [ipv4 for e in publicdns if e.reliability >= 0.75 and IPV4.match((ipv4 := e.ip_address))]
Path('D:/reliable_ipv4_dns.txt').write_text('\n'.join(IPV4_DNS))

def ping(addr, lim=0.5):
    # average of four pings; failed pings count as 0 ms
    return sum(d if (d := ping3.ping(addr, timeout=lim, unit='ms')) else 0 for _ in range(4)) / 4

ping_latency = []
new_servers = []

def format_delta(d):
    d = int(d)
    h, rem = divmod(d, 3600)
    m, s = divmod(rem, 60)
    return f'{h:02d}:{m:02d}:{s:02d}'

def ping_filter(condition, timeout):
    loop = 1
    if loop == 1:
        servers = IPV4_DNS.copy()
    logs = []
    start = datetime.now()
    success_rate = 0
    while True:
        loop_start = datetime.now()
        total = len(servers)
        ping_latency.clear()
        new_servers.clear()
        succeeded = 0
        failed = 0
        l = len(str(total))
        for iteration, server in enumerate(servers):
            latency = ping(server, timeout)
            timestamp = datetime.now()
            elapsed = timestamp - start
            loop_elapsed = timestamp - loop_start
            eta = (loop_elapsed.total_seconds() / (iteration + 1)) * (total - iteration - 1)
            entry = {
                'timestamp': f'{timestamp:%Y-%m-%d %H:%M:%S}',
                'loop': loop,
                'loop start': f'{loop_start:%Y-%m-%d %H:%M:%S}',
                'iteration': iteration,
                'server': server,
                'success': True,
                'latency': round(latency, 2),
                'unit': 'ms',
                'total': total,
                'succeeded': succeeded,
                'failed': failed,
                'started': f'{start:%Y-%m-%d %H:%M:%S}',
                'elapsed': format_delta(elapsed.total_seconds()),
                'loop runtime': format_delta(loop_elapsed.total_seconds()),
                'ETA': format_delta(eta),
                'success rate': f'{success_rate:06.2%}'
            }
            if 0 < latency <= int(timeout * 1000):
                succeeded += 1
                entry['succeeded'] += 1
                new_servers.append(server)
                ping_latency.append((server, latency))
            else:
                failed += 1
                entry['failed'] += 1
                entry['success'] = False
                entry['latency'] = 'timeout'
            if iteration == total - 1:
                success_rate = succeeded / total
                entry['success rate'] = f'{success_rate:06.2%}'
            print(json.dumps(entry, indent=4))
            logs.append(entry)
        new_total = len(new_servers)
        servers = new_servers.copy()
        if new_total == total or loop == 32:
            timestamp = datetime.now()
            elapsed = datetime.now() - start
            entry = {
                'completed': f'{timestamp:%Y-%m-%d %H:%M:%S}',
                'started': f'{start:%Y-%m-%d %H:%M:%S}',
                'elapsed': format_delta(elapsed.total_seconds()),
                'loop': loop
            }
            print(json.dumps(entry, indent=4))
            logs.append(entry)
            break
        loop += 1
    Path(f'D:/IPv4_DNS_{condition}.txt').write_text('\n'.join(servers))
    Path(f'D:/IPv4_DNS_ping_log_{condition}.json').write_text(json.dumps(logs, indent=4))
    Path(f'D:/IPv4_DNS_ping_latency_{condition}.json').write_text(json.dumps(dict(ping_latency), indent=4))

ping_filter('NOVPN', 0.3)
It took more than 24 hours to complete; in the end I was left with 1518 servers.

And I need to resolve the A records of every input address against all 1518 of these servers on every operation, in order to have a chance of finding IP addresses that are not blocked or throttled. So how do I resolve DNS A records asynchronously using a huge number of nameservers?
Update
OK, I have now looked at asyncio and concurrent.futures.ThreadPoolExecutor as well as dns.asyncresolver, and I think they are exactly what I am looking for, but there are still things I have not quite figured out.

I am thinking of using 4 concurrent thread pools, each run synchronously 4 times (to get 4 addresses per server, since currently I can only get 1 address per server and Google has been no help), each with a maximum size of 4, and each task being an asynchronous DNS query function that uses 32 servers.
This is what I came up with:
def split_sixteen(series):
    length = len(series)
    p4 = -(-length // 4)    # ceiling division: quarter size
    p16 = -(-length // 16)  # ceiling division: sixteenth size
    indices = [(0, 1), (1, 2), (2, 3), (3, 4)]
    return [[series[p4*a:p4*b][p16*c:p16*d] for c, d in indices] for a, b in indices]

class Assigner:
    def __init__(self, tasks, batch=32) -> None:
        self.tasks = tasks
        self.length = len(tasks)
        self.index = 0
        self.batch = batch
        self.all_assigned = False

    def assign(self):
        if not self.all_assigned:
            start = self.index
            if self.index + self.batch <= self.length:
                self.index += self.batch
            else:
                self.index = self.length
            if self.index == self.length:
                self.all_assigned = True
            return self.tasks[start:self.index]
        else:
            raise StopIteration('All tasks have been assigned')
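For illustration, this is how I picture the Assigner being drained; a sketch only, with a stand-in server list (the real one has 1518 entries), and the batches would then go to whatever worker function ends up running them:

servers = [f'192.0.2.{i}' for i in range(1, 255)]  # stand-in for the real server list

assigner = Assigner(servers, batch=32)
batches = []
while not assigner.all_assigned:
    batches.append(assigner.assign())

# every server lands in exactly one batch of at most 32 entries
assert sum(len(b) for b in batches) == len(servers)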
I don't know what function the thread pools should run. I think the function should have a while loop that runs until the assigner is exhausted; each pass takes at most 32 servers from the assigner and submits them to the thread pool to run, and if 4 coroutines are already in flight it waits until one of them finishes before starting another. After the loop ends, the function should wait for the remaining routines to finish and merge their results, and then the results of all 4 thread pools should be merged...

I don't know how to make all of this work together...
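In the meantime, this is roughly the shape I have in mind, using dns.asyncresolver with a semaphore in place of the four thread pools; names like query_one and query_all are placeholders of mine, and the error handling is deliberately crude:

import asyncio
import dns.asyncresolver

async def query_one(domain, server, semaphore, lifetime=1.0):
    # one A-record query against one nameserver, bounded by the semaphore
    async with semaphore:
        resolver = dns.asyncresolver.Resolver(configure=False)
        resolver.nameservers = [server]
        resolver.lifetime = lifetime
        try:
            return {r.address for r in await resolver.resolve(domain, 'A')}
        except Exception:
            return set()

async def query_all(domain, servers, concurrency=128):
    # fan the same question out to every nameserver and merge the answers
    semaphore = asyncio.Semaphore(concurrency)
    results = await asyncio.gather(*(query_one(domain, s, semaphore) for s in servers))
    return sorted(set().union(*results))

# usage (hypothetical): asyncio.run(query_all('example.com', servers))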
About selenium being faster than requests: shocking, I know, but it is true, and a truth does not become untrue just because you subjectively refuse to believe it:
import os
import requests
from selenium import webdriver
from selenium.webdriver import FirefoxProfile
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument("--headless")
options.add_argument("--log-level=3")
options.add_argument("--mute-audio")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument('--disable-extensions')
options.add_argument('--disable-gpu')
capabilities = DesiredCapabilities().FIREFOX
capabilities['pageLoadStrategy'] = 'eager'
profile = FirefoxProfile(os.environ['appdata'] + r'\Mozilla\Firefox\Profiles\bkpihn0o.bot')
profile.set_preference("http.response.timeout", 1)
profile.set_preference("dom.max_script_run_time", 0)
profile.set_preference('permissions.default.stylesheet', 2)
profile.set_preference('permissions.default.image', 2)
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
profile.set_preference("permissions.default.script", 2)
profile.set_preference("javascript.enabled", False)
Firefox = webdriver.Firefox(capabilities=capabilities, options=options, firefox_profile=profile)
Maybe I did not mention that I performed these tests while connected to a VPN, and requests does not seem to make use of the VPN.

See my comments on your post.
I find it hard to believe that selenium can outperform calling a DNS server directly. After all, selenium issues its GET requests over the very same network.

I installed dnspython under Windows and ran some benchmarks using a thread pool whose size was equal to the number of domains I was attempting to resolve, 750 of them consisting of 15 distinct domains repeated 50 times. I then created a small list of DNS servers based on the CSV file the OP referenced. I also tried resolving the domains using asyncio, but could not get it to work. These are the specific benchmarks:
test1: all domains are resolved concurrently using a server list consisting of the single DNS server '8.8.8.8'. A given domain, e.g. bonappetit.com, could resolve to different IP addresses on different resolution requests even though the same DNS server address was used for every request. The time to resolve all requests was 1.56 seconds.

test2: for this benchmark each request was given a server list consisting of a single DNS server selected uniformly from the DNS server list. I quickly found that many of the DNS server addresses were not particularly reliable and produced timeout exceptions; they had to be removed from the server list and the benchmark rerun. The time to resolve all requests was 5.57 seconds.

test3: for this benchmark each request gets the complete DNS server list, excluding '8.8.8.8'. The benchmark time was 3.54 seconds. Not shown is the benchmark in which I supplied the complete server list including '8.8.8.8'; its time was essentially the same as test1's. I repeated that benchmark with '8.8.8.8' as the last entry in the list, and the running times were consistently only a few hundredths of a second slower than test1's times with it as the first entry. I subsequently reran the benchmark excluding '8.8.8.8' from the list, and I then saw running times comparable to test1's. This showed me that the servers I happened to be using, other than '8.8.8.8', had highly variable response times even when they did not time out.

test4: I used an entirely different method of DNS resolution, namely socket.gethostbyname. This produced by far the fastest benchmark time, 0.27 seconds, and for each domain only a single IP address was returned. I believe both results are explained by Windows caching the results.

test5: this was an attempt at using asyncio with the single DNS server '8.8.8.8', and it timed out. I have no idea why.
Conclusions
First, your 5556 DNS servers are not all equally trustworthy, and their trustworthiness varies from moment to moment. I would experiment with servers geographically close to you to determine the most trustworthy ones, put a few of those in the server list, and make '8.8.8.8' the first entry. To be clearer on this point: I am talking about the code used for benchmark test3, but without excluding server '8.8.8.8'.
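Concretely, that variant is just a matter of how the nameserver list is built (a sketch; the two extra addresses below simply reuse entries from my test list and stand in for whatever nearby servers you settle on):

import dns.resolver

# stand-ins for the most trustworthy servers near you, found by experiment
good_servers = ['195.99.66.220', '38.132.106.139']

resolver = dns.resolver.Resolver()
# '8.8.8.8' goes first; dnspython tries nameservers in list order (rotate is False by default)
resolver.nameservers = ['8.8.8.8'] + good_servers
print(resolver.resolve('example.com')[0].address)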
Second, I see no reason to create multiple thread pools, though there is some maximum size you should not exceed; 500 should certainly not be a problem. If, however, you can get asyncio working, you should be able to gather thousands of tasks without a problem.
import dns.resolver
import asyncio
import dns.asyncresolver
import socket
import time
from multiprocessing.pool import ThreadPool

l = [
    'yahoo.com',
    'cnn.com',
    'ibm.com',
    'nytimes.com',
    'whosebug.com',
    'tcm.com',
    'wqxr.org',
    'wahingtonpost.com',
    'theatlantic.com',
    'nymag.com',
    'newyorker.com',
    'bonappetit.com',
    'seriouseats.com',
    'foodtv.com',
    'food52.com',
]

domains = []
for _ in range(50):
    domains += l

servers = [
    '8.8.8.8',
    '8.0.7.0',
    '8.0.6.0',
    '195.99.66.220',
    '38.132.106.139',
]

def test1(pool):
    def resolve(domain):
        resolver = dns.resolver.Resolver()
        resolver.nameservers = ['8.8.8.8']
        return (domain, resolver.resolve(domain)[0].address)
    return pool.map(resolve, domains)

def test2(pool):
    def resolve(idx, domain):
        resolver = dns.resolver.Resolver()
        i = idx % len(servers)
        resolver.nameservers = [servers[i]]
        try:
            return (domain, resolver.resolve(domain)[0].address)
        except Exception as e:
            print(e, servers[i])
            return None
    return pool.starmap(resolve, enumerate(domains))

def test3(pool):
    def resolve(domain):
        resolver = dns.resolver.Resolver()
        resolver.nameservers = servers[1:]  # omit '8.8.8.8'
        return (domain, resolver.resolve(domain)[0].address)
    return pool.map(resolve, domains)

def test4(pool):
    def resolve(domain):
        return (domain, socket.gethostbyname(domain))
    return pool.map(resolve, domains)

async def test5():
    async def resolve(domain):
        resolver = dns.asyncresolver.Resolver()
        resolver.nameservers = ['8.8.8.8']
        addr = await resolver.resolve(domain)
        return (domain, addr[0].address)
    return await asyncio.gather(*(resolve(domain) for domain in domains))

pool = ThreadPool(len(domains))

def benchmark(fun):
    print()
    print(fun.__name__)
    start = time.time()
    results = fun(pool)
    print(time.time() - start)
    print(sorted(set(results)))

benchmark(test1)
benchmark(test2)
benchmark(test3)
benchmark(test4)

print()
print('test5')
start = time.time()
results = asyncio.run(test5())
print(time.time() - start)
print(sorted(set(results)))
Prints:
test1
1.5600032806396484
[('bonappetit.com', '151.101.0.239'), ('bonappetit.com', '151.101.192.239'), ('bonappetit.com', '151.101.64.239'), ('cnn.com', '151.101.1.67'), ('cnn.com', '151.101.129.67'), ('cnn.com', '151.101.65.67'), ('food52.com', '104.18.166.45'), ('food52.com', '104.18.174.13'), ('foodtv.com', '67.199.248.12'), ('foodtv.com', '67.199.248.13'), ('ibm.com', '184.29.179.199'), ('newyorker.com', '151.101.0.239'), ('newyorker.com', '151.101.128.239'), ('newyorker.com', '151.101.192.239'), ('newyorker.com', '151.101.64.239'), ('nymag.com', '151.101.130.133'), ('nymag.com', '151.101.194.133'), ('nymag.com', '151.101.2.133'), ('nymag.com', '151.101.66.133'), ('nytimes.com', '151.101.1.164'), ('nytimes.com', '151.101.129.164'), ('nytimes.com', '151.101.193.164'), ('nytimes.com', '151.101.65.164'), ('seriouseats.com', '151.101.2.137'), ('whosebug.com', '151.101.1.69'), ('whosebug.com', '151.101.129.69'), ('whosebug.com', '151.101.65.69'), ('tcm.com', '104.127.162.10'), ('theatlantic.com', '151.101.130.133'), ('theatlantic.com', '151.101.194.133'), ('theatlantic.com', '151.101.2.133'), ('theatlantic.com', '151.101.66.133'), ('wahingtonpost.com', '198.72.14.16'), ('wqxr.org', '44.194.174.151'), ('wqxr.org', '54.144.182.133'), ('yahoo.com', '74.6.143.25'), ('yahoo.com', '74.6.143.26'), ('yahoo.com', '74.6.231.21'), ('yahoo.com', '98.137.11.163')]
test2
5.566321611404419
[('bonappetit.com', '151.101.0.239'), ('bonappetit.com', '151.101.128.239'), ('bonappetit.com', '151.101.192.239'), ('bonappetit.com', '151.101.64.239'), ('cnn.com', '151.101.1.67'), ('cnn.com', '151.101.129.67'), ('cnn.com', '151.101.193.67'), ('cnn.com', '151.101.65.67'), ('food52.com', '104.18.166.45'), ('foodtv.com', '67.199.248.12'), ('foodtv.com', '67.199.248.13'), ('ibm.com', '23.218.185.219'), ('newyorker.com', '151.101.0.239'), ('newyorker.com', '151.101.128.239'), ('newyorker.com', '151.101.192.239'), ('newyorker.com', '151.101.64.239'), ('nymag.com', '151.101.130.133'), ('nytimes.com', '151.101.1.164'), ('nytimes.com', '151.101.129.164'), ('nytimes.com', '151.101.193.164'), ('nytimes.com', '151.101.65.164'), ('seriouseats.com', '151.101.130.137'), ('seriouseats.com', '151.101.194.137'), ('seriouseats.com', '151.101.2.137'), ('seriouseats.com', '151.101.66.137'), ('whosebug.com', '151.101.1.69'), ('tcm.com', '104.127.162.10'), ('theatlantic.com', '151.101.130.133'), ('theatlantic.com', '151.101.194.133'), ('theatlantic.com', '151.101.2.133'), ('theatlantic.com', '151.101.66.133'), ('wahingtonpost.com', '198.72.14.16'), ('wqxr.org', '44.194.174.151'), ('wqxr.org', '54.144.182.133'), ('yahoo.com', '74.6.143.26'), ('yahoo.com', '74.6.231.21'), ('yahoo.com', '98.137.11.163')]
test3
3.536404609680176
[('bonappetit.com', '151.101.0.239'), ('bonappetit.com', '151.101.128.239'), ('bonappetit.com', '151.101.192.239'), ('bonappetit.com', '151.101.64.239'), ('cnn.com', '151.101.1.67'), ('cnn.com', '151.101.129.67'), ('cnn.com', '151.101.193.67'), ('cnn.com', '151.101.65.67'), ('food52.com', '104.18.166.45'), ('food52.com', '104.18.174.13'), ('foodtv.com', '67.199.248.12'), ('foodtv.com', '67.199.248.13'), ('ibm.com', '23.218.185.219'), ('newyorker.com', '151.101.0.239'), ('newyorker.com', '151.101.128.239'), ('newyorker.com', '151.101.192.239'), ('newyorker.com', '151.101.64.239'), ('nymag.com', '151.101.130.133'), ('nymag.com', '151.101.194.133'), ('nymag.com', '151.101.2.133'), ('nymag.com', '151.101.66.133'), ('nytimes.com', '151.101.1.164'), ('nytimes.com', '151.101.129.164'), ('nytimes.com', '151.101.193.164'), ('nytimes.com', '151.101.65.164'), ('seriouseats.com', '151.101.130.137'), ('seriouseats.com', '151.101.194.137'), ('seriouseats.com', '151.101.2.137'), ('seriouseats.com', '151.101.66.137'), ('whosebug.com', '151.101.1.69'), ('whosebug.com', '151.101.129.69'), ('whosebug.com', '151.101.193.69'), ('whosebug.com', '151.101.65.69'), ('tcm.com', '23.75.199.121'), ('theatlantic.com', '151.101.130.133'), ('theatlantic.com', '151.101.194.133'), ('theatlantic.com', '151.101.2.133'), ('theatlantic.com', '151.101.66.133'), ('wahingtonpost.com', '198.72.14.16'), ('wqxr.org', '44.194.174.151'), ('wqxr.org', '54.144.182.133'), ('yahoo.com', '74.6.143.25'), ('yahoo.com', '74.6.143.26'), ('yahoo.com', '74.6.231.20'), ('yahoo.com', '74.6.231.21'), ('yahoo.com', '98.137.11.163'), ('yahoo.com', '98.137.11.164')]
test4
0.33908557891845703
[('bonappetit.com', '151.101.64.239'), ('cnn.com', '151.101.129.67'), ('food52.com', '104.18.174.13'), ('foodtv.com', '67.199.248.12'), ('ibm.com', '104.104.121.251'), ('newyorker.com', '151.101.192.239'), ('nymag.com', '151.101.130.133'), ('nytimes.com', '151.101.193.164'), ('seriouseats.com', '151.101.66.137'), ('whosebug.com', '151.101.65.69'), ('tcm.com', '104.127.162.10'), ('theatlantic.com', '151.101.2.133'), ('wahingtonpost.com', '198.72.14.16'), ('wqxr.org', '44.194.174.151'), ('yahoo.com', '98.137.11.164')]
test5
Traceback (most recent call last):
...
addr = await resolver.resolve(domain)
File "C:\Booboo\test\test_venv\lib\site-packages\dns\asyncresolver.py", line 74, in resolve
timeout = self._compute_timeout(start, lifetime)
File "C:\Booboo\test\test_venv\lib\site-packages\dns\resolver.py", line 997, in _compute_timeout
raise Timeout(timeout=duration)
dns.exception.Timeout: The DNS operation timed out after 5.38653302192688 seconds
Update

I came across a post describing a bug in the PyPi version of dnspython that causes the async resolver to time out. The solution is to load the latest version from Github:

pip install -U https://github.com/rthalley/dnspython/archive/master.zip
I reran the benchmark, this time upping the number of domains to be fetched to 3,000, which frankly is a lot of threads to be creating, so I capped the thread pool size at 500 (and the time to create the thread pool is now included in the timing), and I supplied the complete DNS server list to both the multithreading benchmark and the asyncio benchmark (two benchmarks in total). I also made the improvement of creating the resolver class only once and reusing it for resolving all requests. In addition, all IP addresses returned by a single query are added to a set of IP addresses kept in a dictionary keyed by domain, in case multiple requests are issued against the same domain (as is the case in my benchmark), so that we end up with a set of unique IP addresses for each domain. It is this final dictionary that is returned.

The results:

multithreading: 1.99 seconds
asyncio: 3.43 seconds

This was surprising, because I had thought that with a large number of domains and a cap on the thread pool size, the asyncio version would perform better. Multithreading is clearly the way to go.
import dns.resolver
import asyncio
import dns.asyncresolver
import socket
import time
from multiprocessing.pool import ThreadPool

l = [
    'yahoo.com',
    'cnn.com',
    'ibm.com',
    'nytimes.com',
    'whosebug.com',
    'tcm.com',
    'wqxr.org',
    'wahingtonpost.com',
    'theatlantic.com',
    'nymag.com',
    'newyorker.com',
    'bonappetit.com',
    'seriouseats.com',
    'foodtv.com',
    'food52.com',
]

domains = []
for _ in range(200):
    domains += l

servers = [
    '8.8.8.8',
    '8.0.7.0',
    '8.0.6.0',
    '195.99.66.220',
    '38.132.106.139',
]

def threading_test(pool):
    resolver = dns.resolver.Resolver()
    resolver.nameservers = servers
    ip_addresses = {}
    def resolve(domain):
        results = resolver.resolve(domain)
        s = ip_addresses.setdefault(domain, set())
        for result in results:
            s.add(result.address)
    pool.map(resolve, domains)
    return ip_addresses

async def async_test():
    resolver = dns.asyncresolver.Resolver()
    resolver.nameservers = servers
    ip_addresses = {}
    async def resolve(domain):
        results = await resolver.resolve(domain)
        s = ip_addresses.setdefault(domain, set())
        for result in results:
            s.add(result.address)
    await asyncio.gather(*(resolve(domain) for domain in domains))
    return ip_addresses

print('threading_test')
start = time.time()
pool = ThreadPool(min(500, len(domains)))
results = threading_test(pool)
print(time.time() - start)
for k in sorted(results.keys()):
    print(k, sorted(results[k]))

print()
print('async_test')
start = time.time()
results = asyncio.run(async_test())
print(time.time() - start)
for k in sorted(results.keys()):
    print(k, sorted(results[k]))
Prints:

threading_test
1.9919934272766113
bonappetit.com ['151.101.0.239', '151.101.128.239', '151.101.192.239', '151.101.64.239']
cnn.com ['151.101.1.67', '151.101.129.67', '151.101.193.67', '151.101.65.67']
food52.com ['104.18.166.45', '104.18.174.13']
foodtv.com ['67.199.248.12', '67.199.248.13']
ibm.com ['184.29.179.199']
newyorker.com ['151.101.0.239', '151.101.128.239', '151.101.192.239', '151.101.64.239']
nymag.com ['151.101.130.133', '151.101.194.133', '151.101.2.133', '151.101.66.133']
nytimes.com ['151.101.1.164', '151.101.129.164', '151.101.193.164', '151.101.65.164']
seriouseats.com ['151.101.130.137', '151.101.194.137', '151.101.2.137', '151.101.66.137']
whosebug.com ['151.101.1.69', '151.101.129.69', '151.101.193.69', '151.101.65.69']
tcm.com ['104.127.162.10']
theatlantic.com ['151.101.130.133', '151.101.194.133', '151.101.2.133', '151.101.66.133']
wahingtonpost.com ['198.72.14.16']
wqxr.org ['44.194.174.151', '54.144.182.133']
yahoo.com ['74.6.143.25', '74.6.143.26', '74.6.231.20', '74.6.231.21', '98.137.11.163', '98.137.11.164']
async_test
3.437023878097534
bonappetit.com ['151.101.0.239', '151.101.128.239', '151.101.192.239', '151.101.64.239']
cnn.com ['151.101.1.67', '151.101.129.67', '151.101.193.67', '151.101.65.67']
food52.com ['104.18.166.45', '104.18.174.13']
foodtv.com ['67.199.248.12', '67.199.248.13']
ibm.com ['184.29.179.199', '23.218.185.219']
newyorker.com ['151.101.0.239', '151.101.128.239', '151.101.192.239', '151.101.64.239']
nymag.com ['151.101.130.133', '151.101.194.133', '151.101.2.133', '151.101.66.133']
nytimes.com ['151.101.1.164', '151.101.129.164', '151.101.193.164', '151.101.65.164']
seriouseats.com ['151.101.130.137', '151.101.194.137', '151.101.2.137', '151.101.66.137']
whosebug.com ['151.101.1.69', '151.101.129.69', '151.101.193.69', '151.101.65.69']
tcm.com ['104.127.162.10', '23.79.32.175']
theatlantic.com ['151.101.130.133', '151.101.194.133', '151.101.2.133', '151.101.66.133']
wahingtonpost.com ['198.72.14.16']
wqxr.org ['44.194.174.151', '54.144.182.133']
yahoo.com ['74.6.143.25', '74.6.143.26', '74.6.231.20', '74.6.231.21', '98.137.11.163', '98.137.11.164']