Python mrjob - 查找最长的 10 个单词,但 mrjob 返回了重复的单词
Python mrjob - Finding 10 longest words, but mrjob returns duplicate words
我正在使用 Python mrjob 从文本文件中查找最长的 10 个单词。我得到了一个结果,但是结果包含重复的单词。如何只获取唯一的单词(即删除重复的单词)?
%%file most_chars.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
# Matches one run of word characters and/or apostrophes, i.e. a single word
# token; the mapper's findall() call uses it to pull the words out of a line.
WORD_RE = re.compile(r"[\w']+")
class MostChars(MRJob):
    """MapReduce job that reports the ten longest words found in the input.

    Repeated occurrences of a word are not collapsed, so the same word can
    show up several times in the result.
    """

    def steps(self):
        """Return the job's single step: one mapper followed by one reducer."""
        word_step = MRStep(
            mapper=self.mapper_get_words,
            reducer=self.reducer_find_longest_words,
        )
        return [word_step]

    def mapper_get_words(self, _, line):
        """Emit ``None, (length, word)`` for every word occurrence in ``line``.

        The incoming key is ignored. Every pair is emitted under the same
        key, ``None``.
        """
        for token in WORD_RE.findall(line):
            # The length is taken from the token as matched; only the emitted
            # word itself is lower-cased and stripped.
            normalized = token.lower().strip()
            yield None, (len(token), normalized)

    def reducer_find_longest_words(self, _, word_count_pairs):
        """Yield the ten largest ``(length, word)`` pairs, largest first.

        The key is discarded (it is always ``None``). Each item of
        ``word_count_pairs`` is a ``(length, word)`` pair, so yielding one
        produces key=length, value=word in the job output.
        """
        ranked = list(word_count_pairs)
        # Descending order: longest words first, ties broken by the word text.
        ranked.sort(reverse=True)
        for pair in ranked[:10]:
            yield pair
# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    MostChars.run()
实际输出:
18 "overcapitalization"
18 "overcapitalization"
18 "overcapitalization"
17 "uncomprehendingly"
17 "misunderstandings"
17 "disinterestedness"
17 "disinterestedness"
17 "disinterestedness"
17 "disinterestedness"
17 "conventionalities"
预期输出:
18 "overcapitalization"
17 "uncomprehendingly"
17 "misunderstandings"
17 "disinterestedness"
17 "conventionalities"
还有 5 个不同的词
更新 reducer_find_longest_words 以仅获取唯一元素。注意这里借助 set 去重(即 list(set()) 的思路):先把每个元素转换为 tuple(列表不可哈希,不能直接放入 set),再去重。
def reducer_find_longest_words(self, _, word_count_pairs):
    """Yield the ten longest distinct words as ``(length, word)`` pairs.

    Args:
        _: Shuffle key; unused.
        word_count_pairs: Iterable of ``(length, word)`` pairs, one per word
            occurrence, so the same pair may appear many times.

    Yields:
        Up to ten unique ``(length, word)`` tuples in descending order:
        longest first, equal lengths in reverse alphabetical order. Each
        yielded pair becomes key=length, value=word in the output.
    """
    import heapq  # local import keeps this method usable as a drop-in

    # Pairs may arrive as lists (presumably decoded from JSON by mrjob), and
    # lists are unhashable, so turn each into a tuple before de-duplicating.
    distinct_pairs = {tuple(pair) for pair in word_count_pairs}

    # All pairs are unique now, so the ten largest are well defined and match
    # what a full descending sort followed by [:10] would give.
    for length, word in heapq.nlargest(10, distinct_pairs):
        yield length, word
我正在使用 Python mrjob 从文本文件中查找最长的 10 个单词。我得到了一个结果,但是结果包含重复的单词。如何只获取唯一的单词(即删除重复的单词)?
%%file most_chars.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
# Matches one run of word characters and/or apostrophes, i.e. a single word
# token; the mapper's findall() call uses it to pull the words out of a line.
WORD_RE = re.compile(r"[\w']+")
class MostChars(MRJob):
    """MapReduce job that reports the ten longest words found in the input.

    Repeated occurrences of a word are not collapsed, so the same word can
    show up several times in the result.
    """

    def steps(self):
        """Return the job's single step: one mapper followed by one reducer."""
        word_step = MRStep(
            mapper=self.mapper_get_words,
            reducer=self.reducer_find_longest_words,
        )
        return [word_step]

    def mapper_get_words(self, _, line):
        """Emit ``None, (length, word)`` for every word occurrence in ``line``.

        The incoming key is ignored. Every pair is emitted under the same
        key, ``None``.
        """
        for token in WORD_RE.findall(line):
            # The length is taken from the token as matched; only the emitted
            # word itself is lower-cased and stripped.
            normalized = token.lower().strip()
            yield None, (len(token), normalized)

    def reducer_find_longest_words(self, _, word_count_pairs):
        """Yield the ten largest ``(length, word)`` pairs, largest first.

        The key is discarded (it is always ``None``). Each item of
        ``word_count_pairs`` is a ``(length, word)`` pair, so yielding one
        produces key=length, value=word in the job output.
        """
        ranked = list(word_count_pairs)
        # Descending order: longest words first, ties broken by the word text.
        ranked.sort(reverse=True)
        for pair in ranked[:10]:
            yield pair
# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    MostChars.run()
实际输出:
18 "overcapitalization"
18 "overcapitalization"
18 "overcapitalization"
17 "uncomprehendingly"
17 "misunderstandings"
17 "disinterestedness"
17 "disinterestedness"
17 "disinterestedness"
17 "disinterestedness"
17 "conventionalities"
预期输出:
18 "overcapitalization"
17 "uncomprehendingly"
17 "misunderstandings"
17 "disinterestedness"
17 "conventionalities"
还有 5 个不同的词
更新 reducer_find_longest_words 以仅获取唯一元素。注意这里借助 set 去重(即 list(set()) 的思路):先把每个元素转换为 tuple(列表不可哈希,不能直接放入 set),再去重。
def reducer_find_longest_words(self, _, word_count_pairs):
    """Yield the ten longest distinct words as ``(length, word)`` pairs.

    Args:
        _: Shuffle key; unused.
        word_count_pairs: Iterable of ``(length, word)`` pairs, one per word
            occurrence, so the same pair may appear many times.

    Yields:
        Up to ten unique ``(length, word)`` tuples in descending order:
        longest first, equal lengths in reverse alphabetical order. Each
        yielded pair becomes key=length, value=word in the output.
    """
    import heapq  # local import keeps this method usable as a drop-in

    # Pairs may arrive as lists (presumably decoded from JSON by mrjob), and
    # lists are unhashable, so turn each into a tuple before de-duplicating.
    distinct_pairs = {tuple(pair) for pair in word_count_pairs}

    # All pairs are unique now, so the ten largest are well defined and match
    # what a full descending sort followed by [:10] would give.
    for length, word in heapq.nlargest(10, distinct_pairs):
        yield length, word