How to fetch data through multiple accounts with threading in Python 3
I want to implement a function that can fetch data in parallel.
The background: from site A, I can retrieve information about 100 other sites.
The same account cannot be used more than once at a time, so I created 5 different accounts on site A; that way I can fetch the information with 5 accounts.
The account credentials look like this:
worker1 pawd
worker2 pawd
worker3 pawd
worker4 pawd
worker5 pawd
If you want to get site B's information from site A, you need to enter a command on site A like get info for siteB_IP.
Suppose there are 100 IPs stored in a list named IPlist.
How can I fetch the information for the 100 IPs in parallel via threading with the 5 available accounts, so that all the information ends up stored in one variable without conflicts?
What I have tried is below; the code cannot run, because I haven't worked out how to implement the solution:
import threading

user = 'root'
pwd = 'Changeme123'

# the first step is to log on with the default account
rs = link.send_cmd(r':lognew:' + '"' + user + '","' + pwd + '"')
# then get all neighbor IPs from the logon site; the function parse_multi is used for parsing the data
IPlist = parse_multi(link.send_cmd('get-IP-info:0xffff'))

def Fetchinfo(user, ip):
    rs = link.send_cmd(r':lognew:' + '"' + user + '","' + pwd + '"')
    areainfo = link.send_cmd('get info for ' + ip)

for ip in IPlist:
    # how to handle 100 IPs in the situation of 5 accounts available?
    thread = threading.Thread(target=Fetchinfo, args=[worker, ip])
Since you don't want calls with the same account ID and password to run concurrently, you can define a function that loops over a sub-list of IPs sequentially and fetches them synchronously:
def fetch_data_for_ips(account_id, account_password, ips_to_fetch):
    results = list()
    for ip_to_fetch in ips_to_fetch:
        # fetch with the account_id and password synchronously
        result = ...
        results.append(result)
    return results
Then, using a thread pool, run a different batch concurrently for each account:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Split the workload for each account to fetch
num, remainder = divmod(len(ip_list), len(accounts))
num_ips_for_each_account = num + bool(remainder)

# This gives e.g. [[1,2,3], [4,5,6]], where each sublist is for each account to fetch
ip_lists_for_each_account = [ip_list[i: i + num_ips_for_each_account] for i in range(0, len(ip_list), num_ips_for_each_account)]

# You should only need a number of threads equal to the number of accounts you have
with ThreadPoolExecutor(len(accounts)) as executor:
    # Feel free to use a set instead if you don't need to know which result came from which thread
    futures = dict()
    results = list()
    for (account_id, account_password), ips_to_fetch in zip(accounts, ip_lists_for_each_account):
        future = executor.submit(fetch_data_for_ips, account_id, account_password, ips_to_fetch)
        futures[future] = account_id

    for future in as_completed(futures):
        result = future.result()
        account_id = futures[future]
        print(f'{account_id} fetched these:', result)
        results.extend(result)
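Mapped onto the question's setup, the worker function might look roughly like the sketch below. This is only a sketch under assumptions: it assumes link.send_cmd can safely be called from several threads at once (or that each thread uses its own connection object), which nothing in the question guarantees; the :lognew: and get info for commands are copied from the question's snippet, and the accounts pairing is hypothetical.

def fetch_data_for_ips(account_id, account_password, ips_to_fetch):
    # log on once with this account, then reuse the session for the whole batch
    link.send_cmd(r':lognew:' + '"' + account_id + '","' + account_password + '"')
    results = list()
    for ip_to_fetch in ips_to_fetch:
        # same command format as in the question: get info for <ip>
        results.append(link.send_cmd('get info for ' + ip_to_fetch))
    return results

# (account, password) pairs built from the credentials listed in the question
accounts = [('worker1', 'pawd'), ('worker2', 'pawd'), ('worker3', 'pawd'),
            ('worker4', 'pawd'), ('worker5', 'pawd')]
ip_list = IPlist  # the 100 IPs gathered earlier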
You can refer to the example code below, based on rcshon's suggestion.
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_data_for_ips(account_id, ips_to_fetch):
    results = list()
    for ip_to_fetch in ips_to_fetch:
        # fetch with the account_id and password synchronously
        result = ','.join((account_id, ip_to_fetch))
        results.append(result)
    return results

accounts = ['worker1', 'worker2', 'worker3', 'worker4', 'worker5']
ip_list = [str(_) for _ in range(10)]

# Split the workload for each account to fetch
num, remainder = divmod(len(ip_list), len(accounts))
num_ips_for_each_account = num + bool(remainder)

# This gives e.g. [[1,2,3], [4,5,6]], where each sublist is for each account to fetch
ip_lists_for_each_account = [ip_list[i: i + num_ips_for_each_account] for i in range(0, len(ip_list), num_ips_for_each_account)]

# You should only need a number of threads equal to the number of accounts you have
with ThreadPoolExecutor(len(accounts)) as executor:
    # Feel free to use a set instead if you don't need to know which result came from which thread
    futures = dict()
    results = list()
    for account_id, ips_to_fetch in zip(accounts, ip_lists_for_each_account):
        future = executor.submit(fetch_data_for_ips, account_id, ips_to_fetch)
        futures[future] = account_id

    for future in as_completed(futures):
        result = future.result()
        account_id = futures[future]
        print(f'{account_id} fetched these:', result)
        results.extend(result)
Output:
worker3 fetched these: ['worker3,4', 'worker3,5']
worker2 fetched these: ['worker2,2', 'worker2,3']
worker1 fetched these: ['worker1,0', 'worker1,1']
worker4 fetched these: ['worker4,6', 'worker4,7']
worker5 fetched these: ['worker5,8', 'worker5,9']
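Note that collecting everything into one variable is conflict-free here: each worker builds its own local results list, and only the main thread calls results.extend() while iterating as_completed, so no lock is needed. If you don't need to know which account produced which batch, executor.map expresses the same pattern more compactly (again just a sketch, reusing the names defined above):

with ThreadPoolExecutor(len(accounts)) as executor:
    # map() runs one batch per account and yields the batch results in submission order
    batches = executor.map(fetch_data_for_ips, accounts, ip_lists_for_each_account)
    results = [item for batch in batches for item in batch]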