Python 中的集群化:如何对唯一 ID 对列表进行分组?
Clusterizaztion in Python: How to group list of unique pairs of ids?
我需要对彼此相交的唯一 ID 对列表进行聚类。
最简单的例子:
data_rows=[
{'clid':1,'uid':'a'},
{'clid':2,'uid':'b'},
{'clid':3,'uid':'b'},
{'clid':4,'uid':'c'},
{'clid':4,'uid':'x'},
{'clid':5,'uid':'d'},
{'clid':5,'uid':'e'},
{'clid':6,'uid':'d'},
{'clid':6,'uid':'e'},
{'clid':6,'uid':'f'},
]
# I transform data to this to this.
clid_dic = {1:['a'], 2:['b'],3:['b'], 4:['c','x'],5:['d','e'],6:['d','e','f']}
uid_dic = {'a':[1], 'b':[2,3], 'c':[4], 'x':[4], 'd':[5,6], 'e':[5,6], 'f':[6]}
# expected result
eid_dic = {
1:{'clids':[1],'uids':['a']},
2:{'clids':[2,3],'uids':['b']},
3:{'clids':[4],'uids':['c','x']},
4:{'clids':[5,6],'uids':['d','e','f']}
}
我总是无法创建适用于更大的对列表 (>500.000) 并且不会陷入递归或其他错误的解决方案。
我的解决方案如下所示:
def crawl(clid):
clids, uids = [],[]
# add current clid to batch
if clid not in clids: clids.append(clid)
#collect its uids
for uid in clid_dic[clid]:
if uid not in uids: uids.append(uid)
#remove from dic
del clid_dic[clid]
# collect new clids for each uid
for uid in uids:
if uid in uid_dic:
for cld in uid_dic[uid]:
if cld not in clids:
clids.append(cld)
# for each collected clid call self
for cld in clids:
if cld in clid_dic:
tmp = crawl(cld)
clids.extend([x for x in tmp[0] if x not in clids])
uids.extend([x for x in tmp[1] if x not in uids])
return(clids,uids)
def loop():
i = 0
eid_dic = {}
while True:
n = len(clid_dic)
if n == 0: break
i += 1
clid = next(iter(clid_dic))
tmp = crawl(clid)
clids, uids= tmp[0], tmp[1]
eid_dic[i] = {'clids': clids, 'uids': uids}
print(eid_dic)
if __name__ == '__main__':
loop()
这似乎是基本任务(对 smth 进行聚类),但缺乏经验和知识使我陷入困境。
如果项目的顺序不是问题,这是一个解决方案(使用 clid_dic
和 uid_dic
值的集合):
data_rows=[
{'clid':1,'uid':'a'},
{'clid':2,'uid':'b'},
{'clid':3,'uid':'b'},
{'clid':4,'uid':'c'},
{'clid':4,'uid':'x'},
{'clid':5,'uid':'d'},
{'clid':5,'uid':'e'},
{'clid':6,'uid':'d'},
{'clid':6,'uid':'e'},
{'clid':6,'uid':'f'},
]
clid_dic = {}
uid_dic = {}
for d in data_rows:
clid_dic.setdefault(d['clid'], set()).add(d['uid'])
uid_dic.setdefault(d['uid'], set()).add(d['clid'])
eid_dic, cnt = {}, 1
while clid_dic:
k, v = clid_dic.popitem()
k = {k}
to_remove = []
for k2, v2 in clid_dic.items():
if v & v2:
v = v.union(v2)
to_remove.append(k2)
for k2 in to_remove:
del clid_dic[k2]
to_remove = []
for k2, v2 in uid_dic.items():
if k & v2:
k = k.union(v2)
to_remove.append(k2)
for k2 in to_remove:
del uid_dic[k2]
eid_dic[cnt] = {'clids':list(k), 'uids':list(v)}
cnt += 1
from pprint import pprint
pprint(eid_dic)
打印:
{1: {'clids': [5, 6], 'uids': ['e', 'd', 'f']},
2: {'clids': [4], 'uids': ['c', 'x']},
3: {'clids': [2, 3], 'uids': ['b']},
4: {'clids': [1], 'uids': ['a']}}
我需要对彼此相交的唯一 ID 对列表进行聚类。 最简单的例子:
data_rows=[
{'clid':1,'uid':'a'},
{'clid':2,'uid':'b'},
{'clid':3,'uid':'b'},
{'clid':4,'uid':'c'},
{'clid':4,'uid':'x'},
{'clid':5,'uid':'d'},
{'clid':5,'uid':'e'},
{'clid':6,'uid':'d'},
{'clid':6,'uid':'e'},
{'clid':6,'uid':'f'},
]
# I transform data to this to this.
clid_dic = {1:['a'], 2:['b'],3:['b'], 4:['c','x'],5:['d','e'],6:['d','e','f']}
uid_dic = {'a':[1], 'b':[2,3], 'c':[4], 'x':[4], 'd':[5,6], 'e':[5,6], 'f':[6]}
# expected result
eid_dic = {
1:{'clids':[1],'uids':['a']},
2:{'clids':[2,3],'uids':['b']},
3:{'clids':[4],'uids':['c','x']},
4:{'clids':[5,6],'uids':['d','e','f']}
}
我总是无法创建适用于更大的对列表 (>500.000) 并且不会陷入递归或其他错误的解决方案。 我的解决方案如下所示:
def crawl(clid):
clids, uids = [],[]
# add current clid to batch
if clid not in clids: clids.append(clid)
#collect its uids
for uid in clid_dic[clid]:
if uid not in uids: uids.append(uid)
#remove from dic
del clid_dic[clid]
# collect new clids for each uid
for uid in uids:
if uid in uid_dic:
for cld in uid_dic[uid]:
if cld not in clids:
clids.append(cld)
# for each collected clid call self
for cld in clids:
if cld in clid_dic:
tmp = crawl(cld)
clids.extend([x for x in tmp[0] if x not in clids])
uids.extend([x for x in tmp[1] if x not in uids])
return(clids,uids)
def loop():
i = 0
eid_dic = {}
while True:
n = len(clid_dic)
if n == 0: break
i += 1
clid = next(iter(clid_dic))
tmp = crawl(clid)
clids, uids= tmp[0], tmp[1]
eid_dic[i] = {'clids': clids, 'uids': uids}
print(eid_dic)
if __name__ == '__main__':
loop()
这似乎是基本任务(对 smth 进行聚类),但缺乏经验和知识使我陷入困境。
如果项目的顺序不是问题,这是一个解决方案(使用 clid_dic
和 uid_dic
值的集合):
data_rows=[
{'clid':1,'uid':'a'},
{'clid':2,'uid':'b'},
{'clid':3,'uid':'b'},
{'clid':4,'uid':'c'},
{'clid':4,'uid':'x'},
{'clid':5,'uid':'d'},
{'clid':5,'uid':'e'},
{'clid':6,'uid':'d'},
{'clid':6,'uid':'e'},
{'clid':6,'uid':'f'},
]
clid_dic = {}
uid_dic = {}
for d in data_rows:
clid_dic.setdefault(d['clid'], set()).add(d['uid'])
uid_dic.setdefault(d['uid'], set()).add(d['clid'])
eid_dic, cnt = {}, 1
while clid_dic:
k, v = clid_dic.popitem()
k = {k}
to_remove = []
for k2, v2 in clid_dic.items():
if v & v2:
v = v.union(v2)
to_remove.append(k2)
for k2 in to_remove:
del clid_dic[k2]
to_remove = []
for k2, v2 in uid_dic.items():
if k & v2:
k = k.union(v2)
to_remove.append(k2)
for k2 in to_remove:
del uid_dic[k2]
eid_dic[cnt] = {'clids':list(k), 'uids':list(v)}
cnt += 1
from pprint import pprint
pprint(eid_dic)
打印:
{1: {'clids': [5, 6], 'uids': ['e', 'd', 'f']},
2: {'clids': [4], 'uids': ['c', 'x']},
3: {'clids': [2, 3], 'uids': ['b']},
4: {'clids': [1], 'uids': ['a']}}