Python 两个项目在不同列表中同时出现
Python co-occurrence of two items in different lists
我有一个列表列表,例如:
names = [['cat', 'fish'], ['cat'], ['fish', 'dog', 'cat'],
['cat', 'bird', 'fish'], ['fish', 'bird']]
我想计算每对名字在整个列表中一起提到的次数,输出如下:
{ ['cat', 'fish']: 3, ['cat', 'dog']: 1,['cat','bird']:1
['fish','dog'] : 1, ['fish','bird']:2}
我试过了:
from collections import Counter
from collections import defaultdict

# One Counter per name: for each sub-list, every name in it has its Counter
# updated with all names of that sub-list (the name itself included).
# The result is therefore nested per-name tallies, not a flat
# pair -> count mapping.
co_occurences = defaultdict(Counter)
for tags in names:
    for key in tags:
        co_occurences[key].update(tags)

print(co_occurences)
但它不计算主列表中的共同出现次数。
在这里。我使用 <=
来测试一个集合是否是另一个集合的子集(集合没有顺序,每个元素只出现一次)。
import itertools
from pprint import pprint

names = [
    ['cat', 'fish'],
    ['cat'],
    ['fish', 'dog', 'cat'],
    ['cat', 'bird', 'fish'],
    ['fish', 'bird'],
]

# Every distinct name that appears in any sub-list.
unique_names = set(itertools.chain.from_iterable(names))

# Each unordered pair of distinct names, exactly once.
all_pairs = list(itertools.combinations(unique_names, 2))

# Map every pair to the number of sub-lists that contain both names.
# `<=` between two sets is the subset test, so pairs that never occur
# together still get an entry with a count of 0.
result = {}
for pair in all_pairs:
    result[pair] = len([x for x in names if set(pair) <= set(x)])

pprint(result)
这是输出
{('bird', 'cat'): 1,
('bird', 'dog'): 0,
('bird', 'fish'): 2,
('dog', 'cat'): 1,
('fish', 'cat'): 3,
('fish', 'dog'): 1}
我建议把用于计算字典值的表达式 len([x for x in names if set(pair) <= set(x)])
放到一个专用函数中。
首先计算所有名字的两两组合;如果一对名字同时出现在某个子列表中,就把 result
字典中该对的计数加 1(l
是整个列表中所有不同元素的集合):
from collections import defaultdict
from itertools import combinations, chain

names = [['cat', 'fish'], ['cat'], ['fish', 'dog', 'cat'],
         ['cat', 'bird', 'fish'], ['fish', 'bird']]

# Distinct names across all sub-lists.
l = set(chain.from_iterable(names))  # {'dog', 'bird', 'cat', 'fish'}

# Count, for every unordered pair of names, the sub-lists containing both.
# A pair that never occurs together is never incremented, so it does not
# appear in `result` at all.
result = defaultdict(int)
for x in combinations(l, 2):
    for y in names:
        if x[0] in y and x[1] in y:
            result[x[0], x[1]] += 1

result  # defaultdict(<class 'int'>, {('fish', 'bird'): 2, ('cat', 'dog'): 1, ('cat', 'fish'): 3, ('fish', 'dog'): 1, ('cat', 'bird'): 1})
您可以通过使用 itertools.combinations
和 itertools.chain
来得到所需的结果:
>>> from itertools import combinations, chain
>>> names = [['cat', 'fish'], ['cat'], ['fish', 'dog', 'cat'],
... ['cat', 'bird', 'fish'], ['fish', 'bird']]
>>> uniques = set(chain(*names))
>>> {x: sum(1 for n in names if all(i in n for i in x)) for x in combinations(uniques, 2)}
{('fish', 'dog'): 1, ('dog', 'cat'): 1, ('bird', 'fish'): 2, ('fish', 'cat'): 3, ('bird', 'dog'): 0, ('bird', 'cat'): 1}
您可以在 Python 中对集合使用 & 运算符(求交集),先把各个子列表转换为集合,再进行比较:
>>> set(['cat','dog']) & set(['cat','dog','monkey','horse','fish'])
set(['dog', 'cat'])
您可以利用这个性质得到想要的计数。
def listOccurences(item, names):
    """Count the entries of ``names`` that contain every element of ``item``.

    Args:
        item: Iterable of hashable elements to look for, e.g. ['cat', 'fish'].
        names: Iterable of lists (or other iterables) to search through.

    Returns:
        The number of entries in ``names`` of which ``item`` is a subset.
        An empty ``item`` is a subset of everything, so every entry counts.
    """
    set_of_items = set(item)  # set(['cat', 'fish'])
    # issubset states directly what "intersection == set_of_items" tested.
    return sum(1 for value in names if set_of_items.issubset(value))
import itertools

names = [['cat', 'fish'], ['cat'], ['fish', 'dog', 'cat'],
         ['cat', 'bird', 'fish'], ['fish', 'bird']]

# chain.from_iterable flattens the list of lists, set() removes duplicates,
# and combinations() yields every unordered pair of distinct names once.
permuted_values = list(itertools.combinations(set(itertools.chain.from_iterable(names)), 2))

d = {}
for v in permuted_values:
    # Each v is a tuple, which is hashable and could be used as the key
    # directly; str(v) is kept so the keys remain strings.
    d[str(v)] = listOccurences(v, names)

print(d)
# Example output (the order of the names inside a key can vary between runs):
# {"('fish', 'dog')": 1, "('cat', 'dog')": 1, "('cat', 'fish')": 3,
#  "('cat', 'bird')": 1, "('fish', 'bird')": 2, "('bird', 'dog')": 0}
此处列出的解决方案不适用于我的大型数据集(成千上万),它们太慢了。以下解决方案更快,只需几分之一秒。
请参阅此处的 Counter 类文档:
https://docs.python.org/2/library/collections.html#collections.Counter
import itertools
from collections import Counter

# Generate the pairs for each sub-list separately. Each sub-list is
# de-duplicated and sorted first so that a pair always produces the same
# key no matter in which order the names were listed; otherwise
# ('cat', 'fish') and ('fish', 'cat') would be tallied as different pairs.
# NOTE: sorting assumes the names are mutually comparable (e.g. all strings).
lists_of_pairs = [list(itertools.combinations(sorted(set(sub_list)), 2))
                  for sub_list in names]
# Flatten the lists of pairs into one large list of pairs.
all_pairs = [pair for pairs_list in lists_of_pairs for pair in pairs_list]
# Let the Counter tally how often each pair occurs.
co_occurences_counts = Counter(all_pairs)
我有一个列表列表,例如:
names = [['cat', 'fish'], ['cat'], ['fish', 'dog', 'cat'],
['cat', 'bird', 'fish'], ['fish', 'bird']]
我想计算每对名字在整个列表中一起提到的次数,输出如下:
{ ['cat', 'fish']: 3, ['cat', 'dog']: 1,['cat','bird']:1
['fish','dog'] : 1, ['fish','bird']:2}
我试过了:
from collections import Counter
from collections import defaultdict

# One Counter per name: for each sub-list, every name in it has its Counter
# updated with all names of that sub-list (the name itself included).
# The result is therefore nested per-name tallies, not a flat
# pair -> count mapping.
co_occurences = defaultdict(Counter)
for tags in names:
    for key in tags:
        co_occurences[key].update(tags)

print(co_occurences)
但它不计算主列表中的共同出现次数。
在这里。我使用 <=
来测试一个集合是否是另一个集合的子集(集合没有顺序,每个元素只出现一次)。
import itertools
from pprint import pprint

names = [
    ['cat', 'fish'],
    ['cat'],
    ['fish', 'dog', 'cat'],
    ['cat', 'bird', 'fish'],
    ['fish', 'bird'],
]

# Every distinct name that appears in any sub-list.
unique_names = set(itertools.chain.from_iterable(names))

# Each unordered pair of distinct names, exactly once.
all_pairs = list(itertools.combinations(unique_names, 2))

# Map every pair to the number of sub-lists that contain both names.
# `<=` between two sets is the subset test, so pairs that never occur
# together still get an entry with a count of 0.
result = {}
for pair in all_pairs:
    result[pair] = len([x for x in names if set(pair) <= set(x)])

pprint(result)
这是输出
{('bird', 'cat'): 1,
('bird', 'dog'): 0,
('bird', 'fish'): 2,
('dog', 'cat'): 1,
('fish', 'cat'): 3,
('fish', 'dog'): 1}
我建议把用于计算字典值的表达式 len([x for x in names if set(pair) <= set(x)])
放到一个专用函数中。
首先计算所有名字的两两组合;如果一对名字同时出现在某个子列表中,就把 result
字典中该对的计数加 1(l
是整个列表中所有不同元素的集合):
from collections import defaultdict
from itertools import combinations, chain

names = [['cat', 'fish'], ['cat'], ['fish', 'dog', 'cat'],
         ['cat', 'bird', 'fish'], ['fish', 'bird']]

# Distinct names across all sub-lists.
l = set(chain.from_iterable(names))  # {'dog', 'bird', 'cat', 'fish'}

# Count, for every unordered pair of names, the sub-lists containing both.
# A pair that never occurs together is never incremented, so it does not
# appear in `result` at all.
result = defaultdict(int)
for x in combinations(l, 2):
    for y in names:
        if x[0] in y and x[1] in y:
            result[x[0], x[1]] += 1

result  # defaultdict(<class 'int'>, {('fish', 'bird'): 2, ('cat', 'dog'): 1, ('cat', 'fish'): 3, ('fish', 'dog'): 1, ('cat', 'bird'): 1})
您可以通过使用 itertools.combinations
和 itertools.chain
来得到所需的结果:
>>> from itertools import combinations, chain
>>> names = [['cat', 'fish'], ['cat'], ['fish', 'dog', 'cat'],
... ['cat', 'bird', 'fish'], ['fish', 'bird']]
>>> uniques = set(chain(*names))
>>> {x: sum(1 for n in names if all(i in n for i in x)) for x in combinations(uniques, 2)}
{('fish', 'dog'): 1, ('dog', 'cat'): 1, ('bird', 'fish'): 2, ('fish', 'cat'): 3, ('bird', 'dog'): 0, ('bird', 'cat'): 1}
您可以在 Python 中对集合使用 & 运算符(求交集),先把各个子列表转换为集合,再进行比较:
>>> set(['cat','dog']) & set(['cat','dog','monkey','horse','fish'])
set(['dog', 'cat'])
您可以利用这个性质得到想要的计数。
def listOccurences(item, names):
    """Count the entries of ``names`` that contain every element of ``item``.

    Args:
        item: Iterable of hashable elements to look for, e.g. ['cat', 'fish'].
        names: Iterable of lists (or other iterables) to search through.

    Returns:
        The number of entries in ``names`` of which ``item`` is a subset.
        An empty ``item`` is a subset of everything, so every entry counts.
    """
    set_of_items = set(item)  # set(['cat', 'fish'])
    # issubset states directly what "intersection == set_of_items" tested.
    return sum(1 for value in names if set_of_items.issubset(value))
import itertools

names = [['cat', 'fish'], ['cat'], ['fish', 'dog', 'cat'],
         ['cat', 'bird', 'fish'], ['fish', 'bird']]

# chain.from_iterable flattens the list of lists, set() removes duplicates,
# and combinations() yields every unordered pair of distinct names once.
permuted_values = list(itertools.combinations(set(itertools.chain.from_iterable(names)), 2))

d = {}
for v in permuted_values:
    # Each v is a tuple, which is hashable and could be used as the key
    # directly; str(v) is kept so the keys remain strings.
    d[str(v)] = listOccurences(v, names)

print(d)
# Example output (the order of the names inside a key can vary between runs):
# {"('fish', 'dog')": 1, "('cat', 'dog')": 1, "('cat', 'fish')": 3,
#  "('cat', 'bird')": 1, "('fish', 'bird')": 2, "('bird', 'dog')": 0}
此处列出的解决方案不适用于我的大型数据集(成千上万),它们太慢了。以下解决方案更快,只需几分之一秒。
请参阅此处的 Counter 类文档:
https://docs.python.org/2/library/collections.html#collections.Counter
import itertools
from collections import Counter

# Generate the pairs for each sub-list separately. Each sub-list is
# de-duplicated and sorted first so that a pair always produces the same
# key no matter in which order the names were listed; otherwise
# ('cat', 'fish') and ('fish', 'cat') would be tallied as different pairs.
# NOTE: sorting assumes the names are mutually comparable (e.g. all strings).
lists_of_pairs = [list(itertools.combinations(sorted(set(sub_list)), 2))
                  for sub_list in names]
# Flatten the lists of pairs into one large list of pairs.
all_pairs = [pair for pairs_list in lists_of_pairs for pair in pairs_list]
# Let the Counter tally how often each pair occurs.
co_occurences_counts = Counter(all_pairs)