使用 pairs 和 stripes 方法在 MapReduce 中计算相对频率

Counting relative frequency with the pairs and stripes approaches in MapReduce

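我是 Python 新手,想使用 mrjob 包来计算单词对的相对频率。我编写了下面的代码,但它没有产生正确的输出。你能帮我找出错误吗?相对频率的定义为:$f(A \mid B) = \dfrac{\mathrm{count}(A, B)}{\mathrm{count}(B)} = \dfrac{\mathrm{count}(A, B)}{\sum_{A'} \mathrm{count}(A', B)}$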

import re
from collections import defaultdict

from mrjob.job import MRJob

# Tokenizer: matches runs of word characters and apostrophes, so a
# contraction such as "don't" is kept as a single token.
WORD_RE = re.compile(r"[\w']+")


class MRRelativeFreq(MRJob):
    """Attempted mrjob job for relative frequencies of co-occurring word pairs.

    NOTE(review): this is the non-working version under discussion; the
    comments below point out where it appears to go wrong.
    """

    def mapper(self, _, line):
        # Pair every token on the line with every token on the same line
        # (the line is tokenized once per outer iteration).
        for word in WORD_RE.findall(line):
          for wordpair in WORD_RE.findall(line):
            # Compared before lower-casing, so "This" and "this" count as
            # different words here.
            if word != wordpair:
               # NOTE(review): this yields a single 3-tuple; mrjob mappers are
               # expected to yield (key, value) pairs, e.g. ((w1, w2), 1).
               yield (word.lower(), wordpair.lower(), 1)

    def reducer(self, key, values):
        # Per-item counts plus a grand total over everything seen for `key`.
        cnts = defaultdict(int)
        total = 0
        for (word, count) in values:
          # NOTE(review): `cnt` is assigned but never read.
          cnt=0
          total += count
          cnts[word] += count

        # NOTE(review): the keys of `cnts` are whatever the combiner emitted as
        # `key`; unpacking them as (k, kp) only works if those are 2-item pairs.
        for (k,kp), v in cnts.items():
            # Emits the raw count together with its share of `total`.
            yield (k,kp), (v, float(v) / total) 

    def combiner(self, key, values):
        # NOTE(review): every record is re-keyed to None, so all counts end up
        # in one reducer group and `total` above is not a per-word total.
        # Presumably the combiner should keep the mapper's key - confirm.
        yield None, (key, sum(values))


# Run the job only when this file is executed as a script.
if __name__ == '__main__':
    MRRelativeFreq.run()

您将需要一个中间数据结构,在本例中是一个 defaultdict 来计算单词出现的总次数。

import re
from collections import defaultdict
from itertools import combinations

from mrjob.job import MRJob
from mrjob.step import MRStep

# Tokenizer: matches runs of word characters and apostrophes, so a
# contraction such as "don't" is kept as a single token.
WORD_RE = re.compile(r"[\w']+")


class MRRelativeFreq(MRJob):
    """Two-step mrjob job computing relative frequencies of ordered word pairs.

    For every pair of distinct words (x, y) where x occurs before y on the
    same input line, the job outputs

        f(y | x) = count(x, y) / sum over y' of count(x, y')

    rounded to two decimal places.

    No state is kept on the class or instance between steps: the denominator
    for each x is computed inside step 2 from the records of step 1. This
    matters because mappers and reducers are only guaranteed to share a
    Python process under the inline runner.
    """

    def steps(self):
        """Return the pipeline: count pairs, then normalise per first word."""
        return [
            MRStep(
                mapper=self.mapper,
                combiner=self.combiner,
                reducer=self.reducer),
            MRStep(
                reducer=self.reducer_s2)
        ]

    def mapper(self, _, line):
        """Yield ``((x, y), 1)`` for each ordered pair of distinct words.

        Words are lower-cased *before* the distinctness check so that the
        same word in different capitalisation is never paired with itself.
        """
        words = [word.lower() for word in WORD_RE.findall(line)]
        for x, y in combinations(words, 2):
            if x != y:
                yield (x, y), 1

    def combiner(self, pair, counts):
        """Pre-sum the counts of one pair; same record shape as the mapper."""
        yield pair, sum(counts)

    def reducer(self, pair, counts):
        """Total one pair's count and re-key it by the first word.

        Yields ``(x, (y, count))`` so that step 2 receives every partner of
        ``x`` in a single group and can compute the denominator locally.
        """
        x, y = pair
        yield x, (y, sum(counts))

    def reducer_s2(self, word, partner_counts):
        """Yield ``((word, partner), relative_frequency)`` for one first word.

        ``partner_counts`` is a one-shot iterator of ``(partner, count)``
        records, so it is folded into a dict before the total is taken.
        """
        per_partner = defaultdict(int)
        for partner, count in partner_counts:
            per_partner[partner] += count

        # Every record carries a count >= 1, so the total is never zero.
        total = sum(per_partner.values())
        for partner, count in per_partner.items():
            yield (word, partner), round(float(count) / total, 2)

# Run the job only when this file is executed as a script.
if __name__ == '__main__':
    MRRelativeFreq.run()

给定以下结构的文件:

"""
this is something
this is not
or else this is
"""

我得到以下结果:

["or", "else"]  0.33
["or", "is"]    0.33
["or", "this"]  0.33
["this", "is"]  0.6
["this", "not"] 0.2
["this", "something"]   0.2
["is", "not"]   0.5
["is", "something"] 0.5
["else", "is"]  0.5
["else", "this"]    0.5

请将以上代码作为参考。