Python 中的集群化:如何对唯一 ID 对列表进行分组?

Clusterizaztion in Python: How to group list of unique pairs of ids?

我需要对彼此相交的唯一 ID 对列表进行聚类。 最简单的例子:

data_rows=[
    {'clid':1,'uid':'a'},
    {'clid':2,'uid':'b'},
    {'clid':3,'uid':'b'},
    {'clid':4,'uid':'c'},
    {'clid':4,'uid':'x'},
    {'clid':5,'uid':'d'},
    {'clid':5,'uid':'e'},
    {'clid':6,'uid':'d'},
    {'clid':6,'uid':'e'},
    {'clid':6,'uid':'f'},
]
# I transform data to this to this.
clid_dic = {1:['a'], 2:['b'],3:['b'], 4:['c','x'],5:['d','e'],6:['d','e','f']}
uid_dic = {'a':[1], 'b':[2,3], 'c':[4], 'x':[4], 'd':[5,6], 'e':[5,6], 'f':[6]}
# expected result
eid_dic = {
    1:{'clids':[1],'uids':['a']},
    2:{'clids':[2,3],'uids':['b']},
    3:{'clids':[4],'uids':['c','x']},
    4:{'clids':[5,6],'uids':['d','e','f']}
}

我总是无法创建适用于更大的对列表 (>500.000) 并且不会陷入递归或其他错误的解决方案。 我的解决方案如下所示:

def crawl(clid):
    clids, uids = [],[]
    # add current clid to batch
    if clid not in clids: clids.append(clid)

    #collect its uids
    for uid in clid_dic[clid]:
        if uid not in uids: uids.append(uid)

    #remove from dic
    del clid_dic[clid]

    # collect new clids for each uid
    for uid in uids:
        if uid in uid_dic:
            for cld in uid_dic[uid]:
                if cld not in clids:
                    clids.append(cld)
    # for each collected clid call self
    for cld in clids:
        if cld in clid_dic:
            tmp  = crawl(cld)
            clids.extend([x for x in tmp[0] if x not in clids])
            uids.extend([x for x in tmp[1] if x not in uids])
    return(clids,uids)

def loop():
    i = 0
    eid_dic = {}
    while True:
        n = len(clid_dic)
        if n == 0: break
        i += 1
        clid = next(iter(clid_dic))
        tmp = crawl(clid)
        clids, uids= tmp[0], tmp[1]

        eid_dic[i] = {'clids': clids, 'uids': uids}
    print(eid_dic)
if __name__ == '__main__':
    loop()

这似乎是基本任务(对 smth 进行聚类),但缺乏经验和知识使我陷入困境。

如果项目的顺序不是问题,这是一个解决方案(使用 clid_dicuid_dic 值的集合):

data_rows=[
    {'clid':1,'uid':'a'},
    {'clid':2,'uid':'b'},
    {'clid':3,'uid':'b'},
    {'clid':4,'uid':'c'},
    {'clid':4,'uid':'x'},
    {'clid':5,'uid':'d'},
    {'clid':5,'uid':'e'},
    {'clid':6,'uid':'d'},
    {'clid':6,'uid':'e'},
    {'clid':6,'uid':'f'},
]

clid_dic = {}
uid_dic = {}
for d in data_rows:
    clid_dic.setdefault(d['clid'], set()).add(d['uid'])
    uid_dic.setdefault(d['uid'], set()).add(d['clid'])

eid_dic, cnt = {}, 1

while clid_dic:
    k, v = clid_dic.popitem()
    k = {k}

    to_remove = []
    for k2, v2 in clid_dic.items():
        if v & v2:
            v = v.union(v2)
            to_remove.append(k2)
    for k2 in to_remove:
        del clid_dic[k2]

    to_remove = []
    for k2, v2 in uid_dic.items():
        if k & v2:
            k = k.union(v2)
            to_remove.append(k2)
    for k2 in to_remove:
        del uid_dic[k2]

    eid_dic[cnt] = {'clids':list(k), 'uids':list(v)}
    cnt += 1

from pprint import pprint
pprint(eid_dic)

打印:

{1: {'clids': [5, 6], 'uids': ['e', 'd', 'f']},
 2: {'clids': [4], 'uids': ['c', 'x']},
 3: {'clids': [2, 3], 'uids': ['b']},
 4: {'clids': [1], 'uids': ['a']}}