有向邻接表快速实现
Directional adjacency list fast implementation
import distance
from collections import defaultdict
my_list = ['ACAA', 'TCAA','TCAT','TGAT','TCGA','TGGA','GCGA','AAAA','GGGG','GGGC']
counts = {'ACAA':60, 'TCAA':3,'TCAT':30,'TGAT':8,'TCGA':1,'TGGA':1,'GCGA':8,'AAAA':5,'GGGG':8,'GGGC':1}
adj_list = defaultdict(list)
for strng1 in my_list:
for strng2 in my_list:
if distance.hamming(strng1, strng2) == 1 and counts[strng1] >= (counts[strng2]*2):
adj_list[strng1].append(strng2)
我有这个用于获取定向邻接列表的实现。预期结果:
ACAA: TCAA
TCAA: TCGA
TCAT: TCAA, TGAT
TGAT
TCGA: TGGA
TGGA: TCGA
GCGA: TCGA
AAAA
GGGG: GGGC
GGGC
有没有更快的实现?对于大型数据集,这会变得非常慢。用 cython 重写它会加快速度吗?如果是,有人可以帮助我开始使用 cython 吗?
我不知道 Cython,但您可以避免在内部循环中访问字典项:
adj_list = defaultdict(list)
for strng1 in my_list:
a1 = adj_list[strng1]
c1 = counts[strng1]
for strng2 in my_list:
if distance.hamming(strng1, strng2) == 1 and c1 >= (counts[strng2]*2):
a1.append(strng2)
您甚至可以通过仅在后半部分进行迭代并执行对称附加来削减更多。这样你就可以节省 50% 的距离计算,因为它是对称的。你只在上矩阵三角形上执行它(不包括对角线,我假设一个字符串与自身的距离为0)而不是整个矩阵。
for i,strng1 in enumerate(my_list):
...
for j in range(i+1,len(my_list)):
我的尝试,我不确定,但应该接近:
adj_list = defaultdict(list)
for i,strng1 in enumerate(my_list):
a1 = adj_list[strng1]
c1 = counts[strng1]
for j in range(i+1,len(my_list)):
strng2 = my_list[j]
if distance.hamming(strng1, strng2) == 1:
c2 = counts[strng2]
if c1 >= (c2*2):
a1.append(strng2)
if c2 >= (c1*2):
adj_list[strng1].append(strng2)
crysis405 编辑:
原文:
def adj_lst(my_list, counts):
adj_list = defaultdict(list)
for strng1 in my_list:
a1 = adj_list[strng1]
c1 = counts[strng1]
for strng2 in my_list:
if distance.hamming(strng1, strng2) == 1 and c1 >= (counts[strng2]*2):
adj_list[strng1].append(strng2)
建议改进:
def adj_lst_fast(my_list, counts):
adj_list_fast = defaultdict(list)
for i,strng1 in enumerate(my_list):
a1 = adj_list_fast[strng1]
c1 = counts[strng1]
for j in range(i+1,len(my_list)):
strng2 = my_list[j]
if distnace.hamming(strng1, strng2:
c2 = counts[strng2]
if c1 >= (c2*2):
adj_list_fast[strng1].append(strng2)
elif c2 >= (c1*2):
adj_list_fast[strng2].append(strng1)
性能:
print(timeit.timeit('adj_lst(my_list, counts)', number = 10000,
setup="from __main__ import adj_lst, my_list, counts"))
1.2892486669989012
print(timeit.timeit('adj_lst_fast(my_list, counts)', number = 10000,
setup="from __main__ import adj_lst_fast, my_list, counts"))
0.6437049919986748
import distance
from collections import defaultdict
my_list = ['ACAA', 'TCAA','TCAT','TGAT','TCGA','TGGA','GCGA','AAAA','GGGG','GGGC']
counts = {'ACAA':60, 'TCAA':3,'TCAT':30,'TGAT':8,'TCGA':1,'TGGA':1,'GCGA':8,'AAAA':5,'GGGG':8,'GGGC':1}
adj_list = defaultdict(list)
for strng1 in my_list:
for strng2 in my_list:
if distance.hamming(strng1, strng2) == 1 and counts[strng1] >= (counts[strng2]*2):
adj_list[strng1].append(strng2)
我有这个用于获取定向邻接列表的实现。预期结果:
ACAA: TCAA
TCAA: TCGA
TCAT: TCAA, TGAT
TGAT
TCGA: TGGA
TGGA: TCGA
GCGA: TCGA
AAAA
GGGG: GGGC
GGGC
有没有更快的实现?对于大型数据集,这会变得非常慢。用 cython 重写它会加快速度吗?如果是,有人可以帮助我开始使用 cython 吗?
我不知道 Cython,但您可以避免在内部循环中访问字典项:
adj_list = defaultdict(list)
for strng1 in my_list:
a1 = adj_list[strng1]
c1 = counts[strng1]
for strng2 in my_list:
if distance.hamming(strng1, strng2) == 1 and c1 >= (counts[strng2]*2):
a1.append(strng2)
您甚至可以通过仅在后半部分进行迭代并执行对称附加来削减更多。这样你就可以节省 50% 的距离计算,因为它是对称的。你只在上矩阵三角形上执行它(不包括对角线,我假设一个字符串与自身的距离为0)而不是整个矩阵。
for i,strng1 in enumerate(my_list):
...
for j in range(i+1,len(my_list)):
我的尝试,我不确定,但应该接近:
adj_list = defaultdict(list)
for i,strng1 in enumerate(my_list):
a1 = adj_list[strng1]
c1 = counts[strng1]
for j in range(i+1,len(my_list)):
strng2 = my_list[j]
if distance.hamming(strng1, strng2) == 1:
c2 = counts[strng2]
if c1 >= (c2*2):
a1.append(strng2)
if c2 >= (c1*2):
adj_list[strng1].append(strng2)
crysis405 编辑:
原文:
def adj_lst(my_list, counts):
adj_list = defaultdict(list)
for strng1 in my_list:
a1 = adj_list[strng1]
c1 = counts[strng1]
for strng2 in my_list:
if distance.hamming(strng1, strng2) == 1 and c1 >= (counts[strng2]*2):
adj_list[strng1].append(strng2)
建议改进:
def adj_lst_fast(my_list, counts):
adj_list_fast = defaultdict(list)
for i,strng1 in enumerate(my_list):
a1 = adj_list_fast[strng1]
c1 = counts[strng1]
for j in range(i+1,len(my_list)):
strng2 = my_list[j]
if distnace.hamming(strng1, strng2:
c2 = counts[strng2]
if c1 >= (c2*2):
adj_list_fast[strng1].append(strng2)
elif c2 >= (c1*2):
adj_list_fast[strng2].append(strng1)
性能:
print(timeit.timeit('adj_lst(my_list, counts)', number = 10000,
setup="from __main__ import adj_lst, my_list, counts"))
1.2892486669989012
print(timeit.timeit('adj_lst_fast(my_list, counts)', number = 10000,
setup="from __main__ import adj_lst_fast, my_list, counts"))
0.6437049919986748