Defaultdict(defaultdict) 用于文本分析
Defaultdict(defaultdict) for text analysis
从文件中读取并清理的文本:
['the', 'cat', 'chased', 'the', 'dog', 'fled']
挑战是返回一个字典:以每个单词作为键,其值是一个嵌套字典——以可以跟随该单词的单词作为键、以其跟随该单词的次数作为值:
{'the': {'cat': 1, 'dog': 1}, 'chased': {'the': 1}, 'cat': {'chased': 1}, 'dog': {'fled': 1}}
collections.Counter 可以统计每个唯一值出现的频率。然而,我解决这一挑战的算法又长又笨重。如何使用 defaultdict 来简化这个问题的解法?
编辑:这是我解决这个问题的代码。一个缺陷是嵌套字典中的值是一个单词在文本中出现的总次数,而不是它实际跟随该特定单词的次数。
from collections import Counter, defaultdict
# Builds `master`, a mapping from each word to a dict keyed by the words that
# follow it in the text.
# NOTE: `f` and `string` are not defined in this snippet, and the trailing
# `return` implies an enclosing function whose header is not shown.
# NOTE: this is Python 2 code (print statement, indexable dict.keys()/.values()).
wordsFile = f.read()
# Split on whitespace, strip surrounding punctuation and lowercase each token.
words = [x.strip(string.punctuation).lower() for x in wordsFile.split()]
# Total number of occurrences of each word in the whole text.
counter = Counter(words)
# the dict of [unique word]:[index of appearance in 'words']
index = defaultdict(list)
# Appends every position of 'term' to the 'term' key
for pos, term in enumerate(words):
    index[term].append(pos)
# range ends at len(index) - 2 because last word in text has no follower
# NOTE(review): len(index) is the number of unique words, so this bound skips
# the last two entries of `index` rather than the last word of the text.
master = {}
for i in range(0,(len(index)-2)):
    # z will hold the [index of appearance in 'words'] values
    z = []
    z = index.values()[i]
    try:
        # Because I am interested in follower words
        z = [words[a+1] for a in z]
        print z; print
    # Guards against the IndexError raised when a+1 runs past the end of
    # 'words'; in that case z is left holding positions instead of words.
    except Exception:
        pass
    # For each word, build r into the dict that contains each follower word and its frequency.
    # Known flaw: counter[key] is how often `key` occurs anywhere in the text,
    # not how often it follows the current word.
    r = {}
    for key in z:
        r.update({key: counter[key]})
    master.update({index.keys()[i]:r})
return master
无需使用 collections 模块也能实现一个可行的解决方案:
示例 1
import itertools
import pprint
def main():
    """Count word followers for a sample sentence and pretty-print the result."""
    words = ('the', 'cat', 'chased', 'the', 'dog', 'fled')
    followers = {}
    add_frequency(followers, words)
    pprint.pprint(followers)
def add_frequency(frequency, array):
    """Record, in place, how often each item of ``array`` is followed by another.

    ``frequency`` maps an item to a dict of ``{follower: count}``. Counts for
    every consecutive pair in ``array`` are added to whatever it already holds.
    """
    for current, following in pairwise(array):
        # Create the inner dict the first time ``current`` is seen.
        counts = frequency.setdefault(current, {})
        counts[following] = counts.get(following, 0) + 1
def pairwise(iterable):
    """Return overlapping pairs: s -> (s[0], s[1]), (s[1], s[2]), (s[2], s[3]), ..."""
    current, ahead = itertools.tee(iterable)
    # Advance the second copy by one item so zip pairs each item with its
    # successor; the None default keeps an empty iterable from raising.
    next(ahead, None)
    return zip(current, ahead)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
以下代码展示了如何使用 collections.defaultdict 来实现您要求的功能:
示例 2
import collections
import itertools
import pprint
def main():
    """Count word followers for a sample sentence using nested defaultdicts and print them."""
    words = ('the', 'cat', 'chased', 'the', 'dog', 'fled')
    # A missing outer key yields a fresh defaultdict(int); a missing inner
    # key then starts at 0.
    followers = collections.defaultdict(lambda: collections.defaultdict(int))
    add_frequency(followers, words)
    pprint.pprint(followers)
def add_frequency(frequency, array):
    """Increment ``frequency[a][b]`` for every consecutive pair ``(a, b)`` in ``array``.

    Entries are incremented without being created first, so ``frequency``
    must supply defaults for missing keys (e.g. a nested ``defaultdict``).
    """
    for current, following in pairwise(array):
        counts = frequency[current]
        counts[following] += 1
def pairwise(iterable):
    """Return overlapping pairs: s -> (s[0], s[1]), (s[1], s[2]), (s[2], s[3]), ..."""
    current, ahead = itertools.tee(iterable)
    # Advance the second copy by one item so zip pairs each item with its
    # successor; the None default keeps an empty iterable from raising.
    next(ahead, None)
    return zip(current, ahead)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
您也可以在创建 defaultdict 时使用 functools.partial 来代替 lambda。
示例 3
from collections import defaultdict
from functools import partial
from itertools import tee
from pprint import pprint
def main():
    """Count word followers for a sample sentence using partial-built defaultdicts and print them."""
    words = ('the', 'cat', 'chased', 'the', 'dog', 'fled')
    # partial(defaultdict, int) is called with no arguments for each missing
    # outer key, producing a fresh defaultdict(int).
    followers = defaultdict(partial(defaultdict, int))
    add_frequency(followers, words)
    pprint(followers)
def add_frequency(frequency, array):
    """Increment ``frequency[a][b]`` for every consecutive pair ``(a, b)`` in ``array``.

    Entries are incremented without being created first, so ``frequency``
    must supply defaults for missing keys (e.g. a nested ``defaultdict``).
    """
    for current, following in pairwise(array):
        counts = frequency[current]
        counts[following] += 1
def pairwise(iterable):
    """Return overlapping pairs: s -> (s[0], s[1]), (s[1], s[2]), (s[2], s[3]), ..."""
    current, ahead = tee(iterable)
    # Advance the second copy by one item so zip pairs each item with its
    # successor; the None default keeps an empty iterable from raising.
    next(ahead, None)
    return zip(current, ahead)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
我有一个简单的答案,虽然它不使用 defaultdict - 只是标准字典和 setdefault。我可能没有理解您的意图,但这是我所看到的:
def word_analysis(input):
    """Count, for each word, how often every other word immediately follows it.

    Args:
        input: Iterable of words in reading order.

    Returns:
        A dict mapping each word that has a follower to a dict of
        ``{follower: count}``. Empty when ``input`` has fewer than two words.
    """
    from itertools import tee

    current_words, next_words = tee(input)
    # Offset the second copy by one word so zip pairs each word with its
    # successor; the None default avoids StopIteration on empty input.
    next(next_words, None)
    results = {}
    for w1, w2 in zip(current_words, next_words):  # Process works pairwise
        followers = results.setdefault(w1, {})  # Establish/use the first word dict
        followers[w2] = followers.get(w2, 0) + 1  # Increment the counter
    return results


print(word_analysis(['the', 'cat', 'chased', 'the', 'dog', 'fled']))
对我来说,这提供了与您报告的相同的输出:
{'the': {'dog': 1, 'cat': 1}, 'chased': {'the': 1}, 'dog': {'fled': 1}, 'cat': {'chased': 1}}
我是不是漏掉了什么?
使用 defaultdict:
import collections
# Sample input: the cleaned word list from the question.
words = ['the', 'cat', 'chased', 'the', 'dog', 'fled']
# Maps each word to a plain dict of {follower: count}; defaultdict(dict)
# creates the inner dict the first time a word is looked up.
result = collections.defaultdict(dict)
# Pair every word with its successor. zip stops at the shorter slice, so the
# last word (which has no follower) is never used as a key.
for word, follower in zip(words, words[1:]):
    occurs = result[word]  # the dict of words that follow `word` and their freqs
    occurs[follower] = occurs.get(follower, 0) + 1  # update the freq
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(list(result.items()))
从文件中读取并清理的文本:
['the', 'cat', 'chased', 'the', 'dog', 'fled']
挑战是返回一个字典:以每个单词作为键,其值是一个嵌套字典——以可以跟随该单词的单词作为键、以其跟随该单词的次数作为值:
{'the': {'cat': 1, 'dog': 1}, 'chased': {'the': 1}, 'cat': {'chased': 1}, 'dog': {'fled': 1}}
collections.Counter 可以统计每个唯一值出现的频率。然而,我解决这一挑战的算法又长又笨重。如何使用 defaultdict 来简化这个问题的解法?
编辑:这是我解决这个问题的代码。一个缺陷是嵌套字典中的值是一个单词在文本中出现的总次数,而不是它实际跟随该特定单词的次数。
from collections import Counter, defaultdict
# Builds `master`, a mapping from each word to a dict keyed by the words that
# follow it in the text.
# NOTE: `f` and `string` are not defined in this snippet, and the trailing
# `return` implies an enclosing function whose header is not shown.
# NOTE: this is Python 2 code (print statement, indexable dict.keys()/.values()).
wordsFile = f.read()
# Split on whitespace, strip surrounding punctuation and lowercase each token.
words = [x.strip(string.punctuation).lower() for x in wordsFile.split()]
# Total number of occurrences of each word in the whole text.
counter = Counter(words)
# the dict of [unique word]:[index of appearance in 'words']
index = defaultdict(list)
# Appends every position of 'term' to the 'term' key
for pos, term in enumerate(words):
    index[term].append(pos)
# range ends at len(index) - 2 because last word in text has no follower
# NOTE(review): len(index) is the number of unique words, so this bound skips
# the last two entries of `index` rather than the last word of the text.
master = {}
for i in range(0,(len(index)-2)):
    # z will hold the [index of appearance in 'words'] values
    z = []
    z = index.values()[i]
    try:
        # Because I am interested in follower words
        z = [words[a+1] for a in z]
        print z; print
    # Guards against the IndexError raised when a+1 runs past the end of
    # 'words'; in that case z is left holding positions instead of words.
    except Exception:
        pass
    # For each word, build r into the dict that contains each follower word and its frequency.
    # Known flaw: counter[key] is how often `key` occurs anywhere in the text,
    # not how often it follows the current word.
    r = {}
    for key in z:
        r.update({key: counter[key]})
    master.update({index.keys()[i]:r})
return master
无需使用 collections 模块也能实现一个可行的解决方案:
示例 1
import itertools
import pprint
def main():
    """Count word followers for a sample sentence and pretty-print the result."""
    words = ('the', 'cat', 'chased', 'the', 'dog', 'fled')
    followers = {}
    add_frequency(followers, words)
    pprint.pprint(followers)
def add_frequency(frequency, array):
    """Record, in place, how often each item of ``array`` is followed by another.

    ``frequency`` maps an item to a dict of ``{follower: count}``. Counts for
    every consecutive pair in ``array`` are added to whatever it already holds.
    """
    for current, following in pairwise(array):
        # Create the inner dict the first time ``current`` is seen.
        counts = frequency.setdefault(current, {})
        counts[following] = counts.get(following, 0) + 1
def pairwise(iterable):
    """Return overlapping pairs: s -> (s[0], s[1]), (s[1], s[2]), (s[2], s[3]), ..."""
    current, ahead = itertools.tee(iterable)
    # Advance the second copy by one item so zip pairs each item with its
    # successor; the None default keeps an empty iterable from raising.
    next(ahead, None)
    return zip(current, ahead)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
以下代码展示了如何使用 collections.defaultdict 来实现您要求的功能:
示例 2
import collections
import itertools
import pprint
def main():
    """Count word followers for a sample sentence using nested defaultdicts and print them."""
    words = ('the', 'cat', 'chased', 'the', 'dog', 'fled')
    # A missing outer key yields a fresh defaultdict(int); a missing inner
    # key then starts at 0.
    followers = collections.defaultdict(lambda: collections.defaultdict(int))
    add_frequency(followers, words)
    pprint.pprint(followers)
def add_frequency(frequency, array):
    """Increment ``frequency[a][b]`` for every consecutive pair ``(a, b)`` in ``array``.

    Entries are incremented without being created first, so ``frequency``
    must supply defaults for missing keys (e.g. a nested ``defaultdict``).
    """
    for current, following in pairwise(array):
        counts = frequency[current]
        counts[following] += 1
def pairwise(iterable):
    """Return overlapping pairs: s -> (s[0], s[1]), (s[1], s[2]), (s[2], s[3]), ..."""
    current, ahead = itertools.tee(iterable)
    # Advance the second copy by one item so zip pairs each item with its
    # successor; the None default keeps an empty iterable from raising.
    next(ahead, None)
    return zip(current, ahead)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
您也可以在创建 defaultdict 时使用 functools.partial 来代替 lambda。
示例 3
from collections import defaultdict
from functools import partial
from itertools import tee
from pprint import pprint
def main():
    """Count word followers for a sample sentence using partial-built defaultdicts and print them."""
    words = ('the', 'cat', 'chased', 'the', 'dog', 'fled')
    # partial(defaultdict, int) is called with no arguments for each missing
    # outer key, producing a fresh defaultdict(int).
    followers = defaultdict(partial(defaultdict, int))
    add_frequency(followers, words)
    pprint(followers)
def add_frequency(frequency, array):
    """Increment ``frequency[a][b]`` for every consecutive pair ``(a, b)`` in ``array``.

    Entries are incremented without being created first, so ``frequency``
    must supply defaults for missing keys (e.g. a nested ``defaultdict``).
    """
    for current, following in pairwise(array):
        counts = frequency[current]
        counts[following] += 1
def pairwise(iterable):
    """Return overlapping pairs: s -> (s[0], s[1]), (s[1], s[2]), (s[2], s[3]), ..."""
    current, ahead = tee(iterable)
    # Advance the second copy by one item so zip pairs each item with its
    # successor; the None default keeps an empty iterable from raising.
    next(ahead, None)
    return zip(current, ahead)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
我有一个简单的答案,虽然它不使用 defaultdict - 只是标准字典和 setdefault。我可能没有理解您的意图,但这是我所看到的:
def word_analysis(input):
    """Count, for each word, how often every other word immediately follows it.

    Args:
        input: Iterable of words in reading order.

    Returns:
        A dict mapping each word that has a follower to a dict of
        ``{follower: count}``. Empty when ``input`` has fewer than two words.
    """
    from itertools import tee

    current_words, next_words = tee(input)
    # Offset the second copy by one word so zip pairs each word with its
    # successor; the None default avoids StopIteration on empty input.
    next(next_words, None)
    results = {}
    for w1, w2 in zip(current_words, next_words):  # Process works pairwise
        followers = results.setdefault(w1, {})  # Establish/use the first word dict
        followers[w2] = followers.get(w2, 0) + 1  # Increment the counter
    return results


print(word_analysis(['the', 'cat', 'chased', 'the', 'dog', 'fled']))
对我来说,这提供了与您报告的相同的输出:
{'the': {'dog': 1, 'cat': 1}, 'chased': {'the': 1}, 'dog': {'fled': 1}, 'cat': {'chased': 1}}
我是不是漏掉了什么?
使用 defaultdict:
import collections
# Sample input: the cleaned word list from the question.
words = ['the', 'cat', 'chased', 'the', 'dog', 'fled']
# Maps each word to a plain dict of {follower: count}; defaultdict(dict)
# creates the inner dict the first time a word is looked up.
result = collections.defaultdict(dict)
# Pair every word with its successor. zip stops at the shorter slice, so the
# last word (which has no follower) is never used as a key.
for word, follower in zip(words, words[1:]):
    occurs = result[word]  # the dict of words that follow `word` and their freqs
    occurs[follower] = occurs.get(follower, 0) + 1  # update the freq
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(list(result.items()))