使用 MapReduce 的条带(stripes)方法计算单词对的相对频率
Counting relative frequency of word pairs with the stripes approach in MapReduce
我是 Python 新手,想用 mrjob 包计算单词对的相对频率。我编写了下面的代码,但它没有产生正确的输出。
你能帮我找出错误吗?
$$f(A \mid B) = \frac{N(A, B)}{N(B)} = \frac{N(A, B)}{\sum_{A'} N(A', B)}$$
其中 $N(\cdot)$ 表示出现次数。
import re
from collections import defaultdict
from mrjob.job import MRJob
# Tokenizer: matches runs of word characters and apostrophes.
WORD_RE = re.compile(r"[\w']+")


class MRRelativeFreq(MRJob):
    """The asker's original attempt at pairwise relative word frequency.

    This is the code the question is about; it is reported not to produce
    the correct output. The logic is left exactly as posted, and the
    comments below only point out where it looks wrong.
    """

    def mapper(self, _, line):
        """Emit a record for every ordered pair of differing words on a line."""
        for word in WORD_RE.findall(line):
            for wordpair in WORD_RE.findall(line):
                # The comparison happens before lower-casing, so "The" and
                # "the" are treated as different words here.
                if word != wordpair:
                    # NOTE(review): this yields a 3-tuple; mrjob mappers are
                    # documented to yield (key, value) pairs -- confirm, but
                    # this is a likely cause of the wrong output.
                    yield (word.lower(), wordpair.lower(), 1)

    def reducer(self, key, values):
        """Accumulate counts from ``values`` and emit count and ratio."""
        cnts = defaultdict(int)
        total = 0
        # Each value is unpacked as a 2-item (word, count) sequence.
        for (word, count) in values:
            cnt=0  # assigned on every iteration but never read
            total += count
            cnts[word] += count

        # NOTE(review): every key of cnts is unpacked as a (k, kp) pair, so
        # ``word`` above must itself be a 2-item sequence for this to work.
        for (k,kp), v in cnts.items():
            yield (k,kp), (v, float(v) / total)

    def combiner(self, key, values):
        """Sum ``values`` and re-emit them under the key ``None``."""
        # NOTE(review): the incoming key is moved into the value and replaced
        # by None, so every combined record shares the same key.
        yield None, (key, sum(values))


if __name__ == '__main__':
    MRRelativeFreq.run()
您将需要一个中间数据结构,在本例中是一个 defaultdict
来计算单词出现的总次数。
import re
from collections import defaultdict
from itertools import combinations
from mrjob.job import MRJob
from mrjob.step import MRStep
# Tokenizer: matches runs of word characters and apostrophes.
WORD_RE = re.compile(r"[\w']+")


class MRRelativeFreq(MRJob):
    """Compute the relative frequency of word pairs, f(y | x).

    For every word ``x`` and every word ``y`` that appears after it on the
    same line, the job outputs::

        (x, y) -> count(x, y) / sum over y' of count(x, y')

    rounded to two decimals.

    Step 1 counts each (x, y) pair. Step 2 groups the counts by ``x`` into a
    "stripe" and normalizes the stripe by its own total. All data needed for
    the division travels through the shuffle, so the job does not depend on
    state shared between tasks (which only works when every task runs in the
    same process).
    """

    def steps(self):
        """Return the two-step pipeline: count pairs, then normalize per word."""
        return [
            MRStep(
                mapper=self.mapper,
                combiner=self.combiner,
                reducer=self.reducer,
            ),
            MRStep(reducer=self.reducer_s2),
        ]

    def mapper(self, _, line):
        """Yield ``((x, y), 1)`` for each pair of distinct words on the line.

        ``combinations`` keeps input order, so ``x`` always precedes ``y``.
        """
        # Lower-case before comparing so "This this" is not emitted as a pair.
        words = [word.lower() for word in WORD_RE.findall(line)]
        for x, y in combinations(words, 2):
            if x != y:
                yield (x, y), 1

    def combiner(self, pair, counts):
        """Pre-aggregate pair counts on the map side to shrink the shuffle."""
        yield pair, sum(counts)

    def reducer(self, pair, counts):
        """Total one pair's count and re-key it by its first word.

        Yields ``x, (y, count)`` so that step 2 receives every partner of
        ``x`` in a single reducer call.
        """
        x, y = pair
        yield x, (y, sum(counts))

    def reducer_s2(self, x, ycnt):
        """Normalize the stripe of ``x`` into relative frequencies.

        :param x: the first word of the pairs.
        :param ycnt: iterable of ``(y, count)`` records produced by step 1.

        Yields ``((x, y), relative_frequency)`` for every partner ``y``.
        """
        # Intermediate structure: the whole stripe must be held in memory
        # because the total is only known after all partners are seen.
        stripe = defaultdict(int)
        for y, count in ycnt:
            stripe[y] += count

        # Every count is at least 1, so the total is never zero here.
        total = sum(stripe.values())
        for y, count in stripe.items():
            # float() keeps this a true division on Python 2 as well.
            yield (x, y), round(float(count) / total, 2)


if __name__ == '__main__':
    MRRelativeFreq.run()
给定以下结构的文件:
"""
this is something
this is not
or else this is
"""
我得到以下结果:
["or", "else"] 0.33
["or", "is"] 0.33
["or", "this"] 0.33
["this", "is"] 0.6
["this", "not"] 0.2
["this", "something"] 0.2
["is", "not"] 0.5
["is", "something"] 0.5
["else", "is"] 0.5
["else", "this"] 0.5
请将以上内容作为提示。
我是 Python 新手,想用 mrjob 包计算单词对的相对频率。我编写了下面的代码,但它没有产生正确的输出。你能帮我找出错误吗? $f(A \mid B) = \frac{N(A, B)}{N(B)} = \frac{N(A, B)}{\sum_{A'} N(A', B)}$,其中 $N(\cdot)$ 表示出现次数。
import re
from collections import defaultdict
from mrjob.job import MRJob
# Tokenizer: matches runs of word characters and apostrophes.
WORD_RE = re.compile(r"[\w']+")


class MRRelativeFreq(MRJob):
    """The asker's original attempt at pairwise relative word frequency.

    This is the code the question is about; it is reported not to produce
    the correct output. The logic is left exactly as posted, and the
    comments below only point out where it looks wrong.
    """

    def mapper(self, _, line):
        """Emit a record for every ordered pair of differing words on a line."""
        for word in WORD_RE.findall(line):
            for wordpair in WORD_RE.findall(line):
                # The comparison happens before lower-casing, so "The" and
                # "the" are treated as different words here.
                if word != wordpair:
                    # NOTE(review): this yields a 3-tuple; mrjob mappers are
                    # documented to yield (key, value) pairs -- confirm, but
                    # this is a likely cause of the wrong output.
                    yield (word.lower(), wordpair.lower(), 1)

    def reducer(self, key, values):
        """Accumulate counts from ``values`` and emit count and ratio."""
        cnts = defaultdict(int)
        total = 0
        # Each value is unpacked as a 2-item (word, count) sequence.
        for (word, count) in values:
            cnt=0  # assigned on every iteration but never read
            total += count
            cnts[word] += count

        # NOTE(review): every key of cnts is unpacked as a (k, kp) pair, so
        # ``word`` above must itself be a 2-item sequence for this to work.
        for (k,kp), v in cnts.items():
            yield (k,kp), (v, float(v) / total)

    def combiner(self, key, values):
        """Sum ``values`` and re-emit them under the key ``None``."""
        # NOTE(review): the incoming key is moved into the value and replaced
        # by None, so every combined record shares the same key.
        yield None, (key, sum(values))


if __name__ == '__main__':
    MRRelativeFreq.run()
您将需要一个中间数据结构,在本例中是一个 defaultdict
来计算单词出现的总次数。
import re
from collections import defaultdict
from itertools import combinations
from mrjob.job import MRJob
from mrjob.step import MRStep
# Tokenizer: matches runs of word characters and apostrophes.
WORD_RE = re.compile(r"[\w']+")


class MRRelativeFreq(MRJob):
    """Compute the relative frequency of word pairs, f(y | x).

    For every word ``x`` and every word ``y`` that appears after it on the
    same line, the job outputs::

        (x, y) -> count(x, y) / sum over y' of count(x, y')

    rounded to two decimals.

    Step 1 counts each (x, y) pair. Step 2 groups the counts by ``x`` into a
    "stripe" and normalizes the stripe by its own total. All data needed for
    the division travels through the shuffle, so the job does not depend on
    state shared between tasks (which only works when every task runs in the
    same process).
    """

    def steps(self):
        """Return the two-step pipeline: count pairs, then normalize per word."""
        return [
            MRStep(
                mapper=self.mapper,
                combiner=self.combiner,
                reducer=self.reducer,
            ),
            MRStep(reducer=self.reducer_s2),
        ]

    def mapper(self, _, line):
        """Yield ``((x, y), 1)`` for each pair of distinct words on the line.

        ``combinations`` keeps input order, so ``x`` always precedes ``y``.
        """
        # Lower-case before comparing so "This this" is not emitted as a pair.
        words = [word.lower() for word in WORD_RE.findall(line)]
        for x, y in combinations(words, 2):
            if x != y:
                yield (x, y), 1

    def combiner(self, pair, counts):
        """Pre-aggregate pair counts on the map side to shrink the shuffle."""
        yield pair, sum(counts)

    def reducer(self, pair, counts):
        """Total one pair's count and re-key it by its first word.

        Yields ``x, (y, count)`` so that step 2 receives every partner of
        ``x`` in a single reducer call.
        """
        x, y = pair
        yield x, (y, sum(counts))

    def reducer_s2(self, x, ycnt):
        """Normalize the stripe of ``x`` into relative frequencies.

        :param x: the first word of the pairs.
        :param ycnt: iterable of ``(y, count)`` records produced by step 1.

        Yields ``((x, y), relative_frequency)`` for every partner ``y``.
        """
        # Intermediate structure: the whole stripe must be held in memory
        # because the total is only known after all partners are seen.
        stripe = defaultdict(int)
        for y, count in ycnt:
            stripe[y] += count

        # Every count is at least 1, so the total is never zero here.
        total = sum(stripe.values())
        for y, count in stripe.items():
            # float() keeps this a true division on Python 2 as well.
            yield (x, y), round(float(count) / total, 2)


if __name__ == '__main__':
    MRRelativeFreq.run()
给定以下结构的文件:
"""
this is something
this is not
or else this is
"""
我得到以下结果:
["or", "else"] 0.33
["or", "is"] 0.33
["or", "this"] 0.33
["this", "is"] 0.6
["this", "not"] 0.2
["this", "something"] 0.2
["is", "not"] 0.5
["is", "something"] 0.5
["else", "is"] 0.5
["else", "this"] 0.5
请将以上内容作为提示。