从线程中删除重复的行
Remove duplicate lines from threading
我有一个从文件中随机读取行并使用线程的程序。问题是每当它从文件中读取行时,它有时会从文件中读取重复的行。例如,假设我使用 5 个线程,我的文件如下所示:
line1
line2
line3
line4
line5
程序使用线程随机读取行,但有时它可以读取第4行,第3行,第5行,第2行,第5行(再次)。所以我的问题是如何摆脱重复的第 5 行?
代码:
def get_token():
    """Read ``pokens.txt`` and return its lines as a list of token strings.

    Returns:
        list[str]: One entry per line of the file, in file order, with the
        line terminator removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If ``pokens.txt`` cannot be opened.
    """
    with open('pokens.txt', 'r', encoding='UTF-8') as file:
        # Each line yielded by the file carries at most one trailing '\n',
        # so stripping it on the right is all that is needed.
        return [line.rstrip('\n') for line in file]
def get_proxy():
    """Read ``proxies.txt`` and return its lines as a list of proxy strings.

    Returns:
        list[str]: One entry per line of the file, in file order, with the
        line terminator removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If ``proxies.txt`` cannot be opened.
    """
    with open('proxies.txt', 'r', encoding='UTF-8') as file:
        # Each line yielded by the file carries at most one trailing '\n',
        # so stripping it on the right is all that is needed.
        return [line.rstrip('\n') for line in file]
class Gen:
    """Holds one Chrome webdriver session together with a token and a proxy."""

    def __init__(self, token, proxy=None):
        """Start a Chrome driver and store the token, proxy and password.

        Args:
            token: Token this instance will work with; stored as ``self.token``.
            proxy: Optional proxy value used for both the HTTP and SSL proxy
                settings (presumably an ``ip:port`` string — confirm against
                the contents of ``proxies.txt``).
        """
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        # NOTE(review): `options` is never passed to webdriver.Chrome(...)
        # below — confirm whether that is intended.
        proxy_ip_port = proxy
        proxy2 = Proxy()
        proxy2.proxy_type = ProxyType.MANUAL
        proxy2.http_proxy = proxy_ip_port
        proxy2.ssl_proxy = proxy_ip_port
        # NOTE(review): `capabilities` is the shared
        # webdriver.DesiredCapabilities.CHROME object, not a copy, and it is
        # not passed explicitly to webdriver.Chrome(...) either — verify how
        # the proxy settings are expected to reach the driver.
        capabilities = webdriver.DesiredCapabilities.CHROME
        proxy2.add_to_capabilities(capabilities)
        self.browser = webdriver.Chrome("chromedriver.exe")
        self.token = token
        self.proxy = proxy
        self.password = 'passwordhere'

    def register(self):
        """Run the registration step (only a placeholder print is shown here)."""
        print('hi')
        # Code continues with no duplicates
def worker(proxy=None):
    """Pick one token at random and run ``Gen.register`` with it.

    Every call re-reads the token file and makes its own independent random
    choice, so two threads running this function can select the same token.

    Args:
        proxy: Optional proxy value forwarded to ``Gen``.
    """
    token_list = get_token()
    # Independent draw from the full list: nothing here prevents another
    # thread from drawing the same entry.
    token = random.choice(token_list)
    d = Gen(token, proxy=proxy)
    d.register()
def main():
    """Ask for a thread count and start that many ``worker`` threads.

    Each thread is given one proxy chosen at random from ``proxies.txt``
    (or ``None`` when that file has no lines). The function returns once all
    started threads have finished.

    Raises:
        ValueError: If the entered thread count is not an integer.
    """
    num_thread = int(input('Number of Threads: '))
    proxies = get_proxy()
    threads = []
    for _ in range(num_thread):
        # random.choice() raises IndexError on an empty list, so fall back to
        # the worker's default of "no proxy" when no proxies were loaded.
        proxy = random.choice(proxies) if proxies else None
        t = threading.Thread(target=worker, args=(proxy,))
        threads.append(t)
        t.start()
    # Wait for every worker so that main() returns only when all are done.
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
下面是您的程序的简化“玩具版”,我更新后执行以下操作:
- 从主线程读取令牌文件,放入列表
- 随机打乱列表的顺序
- 给每个工作线程分配一个大小大致相等的令牌列表子集供其选择
- 每个工作线程仅打印出主线程提供的数据(为清楚起见,省略了对数据的任何实际处理)
这种方法避免了重复,因为任何给定的令牌只在列表中出现一次,并且每个线程分到的是列表中互不重叠的子集,只能从自己的子集中选择令牌。
import threading
import random
def read_tokens_list():
    """Read ``pokens.txt`` and return its lines as a list of token strings.

    Returns:
        list[str]: One entry per line of the file, in file order, with the
        line terminator removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If ``pokens.txt`` cannot be opened.
    """
    with open('pokens.txt', 'r', encoding='UTF-8') as file:
        # Each line yielded by the file carries at most one trailing '\n',
        # so stripping it on the right is all that is needed.
        return [line.rstrip('\n') for line in file]
def read_proxies_list():
    """Read ``proxies.txt`` and return its lines as a list of proxy strings.

    Returns:
        list[str]: One entry per line of the file, in file order, with the
        line terminator removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If ``proxies.txt`` cannot be opened.
    """
    with open('proxies.txt', 'r', encoding='UTF-8') as file:
        # Each line yielded by the file carries at most one trailing '\n',
        # so stripping it on the right is all that is needed.
        return [line.rstrip('\n') for line in file]
def worker(proxy, token_list):
    """Choose one token from this worker's own subset and report the choice.

    Args:
        proxy: Proxy value assigned to this worker; only printed here.
        token_list: The tokens this worker may choose from.

    Returns:
        None. The result is written to standard output.
    """
    if not token_list:
        # random.choice() raises IndexError on an empty sequence; a worker
        # that was handed no tokens simply has nothing to do.
        print("Worker: my proxy is [%s], my token list is empty, nothing to choose" % (proxy,))
        return
    token = random.choice(token_list)
    print(f"Worker: my proxy is [{proxy}], my token list is {token_list}, I've chosen [{token}] as my token")
def main():
    """Split the shuffled tokens into disjoint subsets, one per worker thread.

    Reads the requested thread count from standard input, loads the proxies
    and tokens, shuffles the tokens, and starts one ``worker`` thread per
    subset. Because the subsets do not overlap, no list entry can be chosen
    by two workers. Returns after every worker has finished.

    Raises:
        ValueError: If the thread count is not an integer, is smaller than 1,
            or if the token file contains no lines.
    """
    num_thread = int(input('Number of Threads: '))
    if num_thread < 1:
        # Zero would otherwise surface as a ZeroDivisionError-style failure
        # further down; reject it up front with a clear message.
        raise ValueError('Number of Threads must be at least 1')

    proxies = read_proxies_list()
    token_list = read_tokens_list()  # read in the pokens.txt file
    if not token_list:
        raise ValueError('pokens.txt contains no tokens')
    random.shuffle(token_list)  # shuffle the list into random order

    # Never start more workers than there are tokens, otherwise some workers
    # would receive an empty subset and have nothing to choose from.
    num_workers = min(num_thread, len(token_list))

    threads = []
    for i in range(num_workers):
        # Strided slice: worker i gets entries i, i+n, i+2n, ... The slices
        # are disjoint, cover the whole list, and differ in size by at most 1.
        tokens_for_this_worker = token_list[i::num_workers]
        # random.choice() raises IndexError on an empty list, so run without
        # a proxy when none were loaded.
        proxy = random.choice(proxies) if proxies else None
        t = threading.Thread(target=worker, args=(proxy, tokens_for_this_worker))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
我有一个从文件中随机读取行并使用线程的程序。问题是每当它从文件中读取行时,它有时会从文件中读取重复的行。例如,假设我使用 5 个线程,我的文件如下所示:
line1
line2
line3
line4
line5
程序使用线程随机读取行,但有时它可以读取第4行,第3行,第5行,第2行,第5行(再次)。所以我的问题是如何摆脱重复的第 5 行?
代码:
def get_token():
    """Read ``pokens.txt`` and return its lines as a list of token strings.

    Returns:
        list[str]: One entry per line of the file, in file order, with the
        line terminator removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If ``pokens.txt`` cannot be opened.
    """
    with open('pokens.txt', 'r', encoding='UTF-8') as file:
        # Each line yielded by the file carries at most one trailing '\n',
        # so stripping it on the right is all that is needed.
        return [line.rstrip('\n') for line in file]
def get_proxy():
    """Read ``proxies.txt`` and return its lines as a list of proxy strings.

    Returns:
        list[str]: One entry per line of the file, in file order, with the
        line terminator removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If ``proxies.txt`` cannot be opened.
    """
    with open('proxies.txt', 'r', encoding='UTF-8') as file:
        # Each line yielded by the file carries at most one trailing '\n',
        # so stripping it on the right is all that is needed.
        return [line.rstrip('\n') for line in file]
class Gen:
    """Holds one Chrome webdriver session together with a token and a proxy."""

    def __init__(self, token, proxy=None):
        """Start a Chrome driver and store the token, proxy and password.

        Args:
            token: Token this instance will work with; stored as ``self.token``.
            proxy: Optional proxy value used for both the HTTP and SSL proxy
                settings (presumably an ``ip:port`` string — confirm against
                the contents of ``proxies.txt``).
        """
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        # NOTE(review): `options` is never passed to webdriver.Chrome(...)
        # below — confirm whether that is intended.
        proxy_ip_port = proxy
        proxy2 = Proxy()
        proxy2.proxy_type = ProxyType.MANUAL
        proxy2.http_proxy = proxy_ip_port
        proxy2.ssl_proxy = proxy_ip_port
        # NOTE(review): `capabilities` is the shared
        # webdriver.DesiredCapabilities.CHROME object, not a copy, and it is
        # not passed explicitly to webdriver.Chrome(...) either — verify how
        # the proxy settings are expected to reach the driver.
        capabilities = webdriver.DesiredCapabilities.CHROME
        proxy2.add_to_capabilities(capabilities)
        self.browser = webdriver.Chrome("chromedriver.exe")
        self.token = token
        self.proxy = proxy
        self.password = 'passwordhere'

    def register(self):
        """Run the registration step (only a placeholder print is shown here)."""
        print('hi')
        # Code continues with no duplicates
def worker(proxy=None):
    """Pick one token at random and run ``Gen.register`` with it.

    Every call re-reads the token file and makes its own independent random
    choice, so two threads running this function can select the same token.

    Args:
        proxy: Optional proxy value forwarded to ``Gen``.
    """
    token_list = get_token()
    # Independent draw from the full list: nothing here prevents another
    # thread from drawing the same entry.
    token = random.choice(token_list)
    d = Gen(token, proxy=proxy)
    d.register()
def main():
    """Ask for a thread count and start that many ``worker`` threads.

    Each thread is given one proxy chosen at random from ``proxies.txt``
    (or ``None`` when that file has no lines). The function returns once all
    started threads have finished.

    Raises:
        ValueError: If the entered thread count is not an integer.
    """
    num_thread = int(input('Number of Threads: '))
    proxies = get_proxy()
    threads = []
    for _ in range(num_thread):
        # random.choice() raises IndexError on an empty list, so fall back to
        # the worker's default of "no proxy" when no proxies were loaded.
        proxy = random.choice(proxies) if proxies else None
        t = threading.Thread(target=worker, args=(proxy,))
        threads.append(t)
        t.start()
    # Wait for every worker so that main() returns only when all are done.
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
下面是您的程序的简化“玩具版”,我更新后执行以下操作:
- 从主线程读取令牌文件,放入列表
- 随机打乱列表的顺序
- 给每个工作线程分配一个大小大致相等的令牌列表子集供其选择
- 每个工作线程仅打印出主线程提供的数据(为清楚起见,省略了对数据的任何实际处理)
这种方法避免了重复,因为任何给定的令牌只在列表中出现一次,并且每个线程分到的是列表中互不重叠的子集,只能从自己的子集中选择令牌。
import threading
import random
def read_tokens_list():
    """Read ``pokens.txt`` and return its lines as a list of token strings.

    Returns:
        list[str]: One entry per line of the file, in file order, with the
        line terminator removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If ``pokens.txt`` cannot be opened.
    """
    with open('pokens.txt', 'r', encoding='UTF-8') as file:
        # Each line yielded by the file carries at most one trailing '\n',
        # so stripping it on the right is all that is needed.
        return [line.rstrip('\n') for line in file]
def read_proxies_list():
    """Read ``proxies.txt`` and return its lines as a list of proxy strings.

    Returns:
        list[str]: One entry per line of the file, in file order, with the
        line terminator removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If ``proxies.txt`` cannot be opened.
    """
    with open('proxies.txt', 'r', encoding='UTF-8') as file:
        # Each line yielded by the file carries at most one trailing '\n',
        # so stripping it on the right is all that is needed.
        return [line.rstrip('\n') for line in file]
def worker(proxy, token_list):
    """Choose one token from this worker's own subset and report the choice.

    Args:
        proxy: Proxy value assigned to this worker; only printed here.
        token_list: The tokens this worker may choose from.

    Returns:
        None. The result is written to standard output.
    """
    if not token_list:
        # random.choice() raises IndexError on an empty sequence; a worker
        # that was handed no tokens simply has nothing to do.
        print("Worker: my proxy is [%s], my token list is empty, nothing to choose" % (proxy,))
        return
    token = random.choice(token_list)
    print(f"Worker: my proxy is [{proxy}], my token list is {token_list}, I've chosen [{token}] as my token")
def main():
    """Split the shuffled tokens into disjoint subsets, one per worker thread.

    Reads the requested thread count from standard input, loads the proxies
    and tokens, shuffles the tokens, and starts one ``worker`` thread per
    subset. Because the subsets do not overlap, no list entry can be chosen
    by two workers. Returns after every worker has finished.

    Raises:
        ValueError: If the thread count is not an integer, is smaller than 1,
            or if the token file contains no lines.
    """
    num_thread = int(input('Number of Threads: '))
    if num_thread < 1:
        # Zero would otherwise surface as a ZeroDivisionError-style failure
        # further down; reject it up front with a clear message.
        raise ValueError('Number of Threads must be at least 1')

    proxies = read_proxies_list()
    token_list = read_tokens_list()  # read in the pokens.txt file
    if not token_list:
        raise ValueError('pokens.txt contains no tokens')
    random.shuffle(token_list)  # shuffle the list into random order

    # Never start more workers than there are tokens, otherwise some workers
    # would receive an empty subset and have nothing to choose from.
    num_workers = min(num_thread, len(token_list))

    threads = []
    for i in range(num_workers):
        # Strided slice: worker i gets entries i, i+n, i+2n, ... The slices
        # are disjoint, cover the whole list, and differ in size by at most 1.
        tokens_for_this_worker = token_list[i::num_workers]
        # random.choice() raises IndexError on an empty list, so run without
        # a proxy when none were loaded.
        proxy = random.choice(proxies) if proxies else None
        t = threading.Thread(target=worker, args=(proxy, tokens_for_this_worker))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()