根据 Python 中的子字符串删除重复项
Remove duplicates based on substring in Python
我有以下代码,用于检测文件中的重复项,并把结果输出到 3 个单独的文件中:一个存放没有重复的行,一个存放重复两次 (x2) 的行,一个存放重复两次以上 (> x2) 的行。第一个文件只包含原始文件中没有重复的行(也就是说,它不是从每组重复行中保留一行,而是只保留那些本来就只出现一次的行)。
import os
import sys
import time
import collections
# Split the lines of the input file (first command-line argument) into three
# output files according to how many lines share the same key, where the key
# is the first 12 characters of the first ";"-separated field:
#   <input>.proc   - lines whose key occurs exactly once
#   <input>.proc2  - lines whose key occurs exactly twice
#   <input>.proc3  - lines whose key occurs three or more times
file_in = sys.argv[1]
file_ot = str(file_in) + ".proc"
file_ot2 = str(file_in) + ".proc2"
file_ot3 = str(file_in) + ".proc3"

counter = 0
# key -> every input line carrying that key, in input order
dict_in = collections.defaultdict(list)

with open(file_in, "r") as f:
    for line in f:
        counter += 1
        # Only the first field is needed, so cut at the first ";" instead of
        # splitting the whole line into a list of fields.
        key = line.strip().partition(";")[0][:12]
        # NOTE(review): one print per input line is likely a major share of
        # the runtime on large files - consider removing or gating it.
        print(":Key: " + key)
        # A final line without a trailing newline would otherwise be glued to
        # the next record when its group is not the last one written.
        if not line.endswith("\n"):
            line += "\n"
        dict_in[key].append(line)

with open(file_ot, "w") as f1, open(file_ot2, "w") as f2, open(file_ot3, "w") as f3:
    # Group size -> destination file; every size not listed here (three or
    # more, since a group is only created by appending a line) goes to f3.
    selector = {1: f1, 2: f2}
    for values in dict_in.values():
        selector.get(len(values), f3).writelines(values)

print("Read: " + str(counter) + " lines")
上面的代码可以正常工作,但是对于非常大的文件(约 1 GB),在我的系统上处理大约需要十分钟。我想知道是否有办法优化这段代码的速度,或者在这方面有什么建议。提前致谢!
输入数据示例:
0000AAAAAAAA;X;;X;
0000AAAAAAAA;X;X;;
0000BBBBBBBB;X;;;
0000CCCCCCCC;;X;;
0000DDDDDDDD;X;;X;
0000DDDDDDDD;X;X;;
0000DDDDDDDD;X;X;X;X
0000EEEEEEEE;X;X;X;X
0000FFFFFFFF;X;;;
0000GGGGGGGG;X;;X;
0000HHHHHHHH;X;X;;
0000JJJJJJJJ;X;X;;
预期输出:
FILE1:
0000BBBBBBBB;X;;;
0000CCCCCCCC;;X;;
0000EEEEEEEE;X;X;X;X
0000FFFFFFFF;X;;;
0000GGGGGGGG;X;;X;
0000HHHHHHHH;X;X;;
0000JJJJJJJJ;X;X;;
FILE2:
0000AAAAAAAA;X;;X;
0000AAAAAAAA;X;X;;
FILE3:
0000DDDDDDDD;X;;X;
0000DDDDDDDD;X;X;;
0000DDDDDDDD;X;X;X;X
我使用了 543MB 的随机文本文件来测试它。
import time
from collections import Counter

# Count how often each distinct line occurs in myFile.txt, then write every
# distinct line, in sorted order and followed by its number of extra
# occurrences (occurrences - 1), to one of three files:
#   UniqueList.txt  - lines that occur once            (0 extra copies)
#   dupListaOne.txt - lines that occur twice           (1 extra copy)
#   dupListMore.txt - lines that occur three+ times    (2 or more extra copies)
start = time.time()

# One pass over the file builds the line -> occurrence table, so no per-line
# rescan of the whole input is needed later.
with open("myFile.txt") as f:
    line_counts = Counter(line.replace("\n", "") for line in f)

with open("dupListaOne.txt", "w") as f1, \
        open("dupListMore.txt", "w") as f2, \
        open("UniqueList.txt", "w") as f3:
    for entry in sorted(line_counts):
        extra = line_counts[entry] - 1
        if extra == 1:
            target = f1
        elif extra > 1:
            target = f2
        else:
            target = f3
        # The newline goes last so the count stays on the same output line
        # as the entry it belongs to.
        target.write("%s %d\n" % (entry, extra))

# The with-blocks above have already closed all files at this point.
end = time.time()
print("Time: ", end - start)
经过时间:123.82529425621033 秒。 ~ 2 分钟
我有以下代码,用于检测文件中的重复项,并把结果输出到 3 个单独的文件中:一个存放没有重复的行,一个存放重复两次 (x2) 的行,一个存放重复两次以上 (> x2) 的行。第一个文件只包含原始文件中没有重复的行(也就是说,它不是从每组重复行中保留一行,而是只保留那些本来就只出现一次的行)。
import os
import sys
import time
import collections
# Split the lines of the input file (first command-line argument) into three
# output files according to how many lines share the same key, where the key
# is the first 12 characters of the first ";"-separated field:
#   <input>.proc   - lines whose key occurs exactly once
#   <input>.proc2  - lines whose key occurs exactly twice
#   <input>.proc3  - lines whose key occurs three or more times
file_in = sys.argv[1]
file_ot = str(file_in) + ".proc"
file_ot2 = str(file_in) + ".proc2"
file_ot3 = str(file_in) + ".proc3"

counter = 0
# key -> every input line carrying that key, in input order
dict_in = collections.defaultdict(list)

with open(file_in, "r") as f:
    for line in f:
        counter += 1
        # Only the first field is needed, so cut at the first ";" instead of
        # splitting the whole line into a list of fields.
        key = line.strip().partition(";")[0][:12]
        # NOTE(review): one print per input line is likely a major share of
        # the runtime on large files - consider removing or gating it.
        print(":Key: " + key)
        # A final line without a trailing newline would otherwise be glued to
        # the next record when its group is not the last one written.
        if not line.endswith("\n"):
            line += "\n"
        dict_in[key].append(line)

with open(file_ot, "w") as f1, open(file_ot2, "w") as f2, open(file_ot3, "w") as f3:
    # Group size -> destination file; every size not listed here (three or
    # more, since a group is only created by appending a line) goes to f3.
    selector = {1: f1, 2: f2}
    for values in dict_in.values():
        selector.get(len(values), f3).writelines(values)

print("Read: " + str(counter) + " lines")
上面的代码可以正常工作,但是对于非常大的文件(约 1 GB),在我的系统上处理大约需要十分钟。我想知道是否有办法优化这段代码的速度,或者在这方面有什么建议。提前致谢!
输入数据示例:
0000AAAAAAAA;X;;X;
0000AAAAAAAA;X;X;;
0000BBBBBBBB;X;;;
0000CCCCCCCC;;X;;
0000DDDDDDDD;X;;X;
0000DDDDDDDD;X;X;;
0000DDDDDDDD;X;X;X;X
0000EEEEEEEE;X;X;X;X
0000FFFFFFFF;X;;;
0000GGGGGGGG;X;;X;
0000HHHHHHHH;X;X;;
0000JJJJJJJJ;X;X;;
预期输出:
FILE1:
0000BBBBBBBB;X;;;
0000CCCCCCCC;;X;;
0000EEEEEEEE;X;X;X;X
0000FFFFFFFF;X;;;
0000GGGGGGGG;X;;X;
0000HHHHHHHH;X;X;;
0000JJJJJJJJ;X;X;;
FILE2:
0000AAAAAAAA;X;;X;
0000AAAAAAAA;X;X;;
FILE3:
0000DDDDDDDD;X;;X;
0000DDDDDDDD;X;X;;
0000DDDDDDDD;X;X;X;X
我使用了 543MB 的随机文本文件来测试它。
import time
from collections import Counter

# Count how often each distinct line occurs in myFile.txt, then write every
# distinct line, in sorted order and followed by its number of extra
# occurrences (occurrences - 1), to one of three files:
#   UniqueList.txt  - lines that occur once            (0 extra copies)
#   dupListaOne.txt - lines that occur twice           (1 extra copy)
#   dupListMore.txt - lines that occur three+ times    (2 or more extra copies)
start = time.time()

# One pass over the file builds the line -> occurrence table, so no per-line
# rescan of the whole input is needed later.
with open("myFile.txt") as f:
    line_counts = Counter(line.replace("\n", "") for line in f)

with open("dupListaOne.txt", "w") as f1, \
        open("dupListMore.txt", "w") as f2, \
        open("UniqueList.txt", "w") as f3:
    for entry in sorted(line_counts):
        extra = line_counts[entry] - 1
        if extra == 1:
            target = f1
        elif extra > 1:
            target = f2
        else:
            target = f3
        # The newline goes last so the count stays on the same output line
        # as the entry it belongs to.
        target.write("%s %d\n" % (entry, extra))

# The with-blocks above have already closed all files at this point.
end = time.time()
print("Time: ", end - start)
经过时间:123.82529425621033 秒。 ~ 2 分钟