如何计算Ruby中的最高词频
How to calculate the highest word frequency in Ruby
我一直在做 Coursera 上 Rails 入门课程的这项作业。我们的任务是编写一个程序来计算文本文件中的最大词频。我们被要求创建一个方法,用来:
- 计算单个单词在给定内容中出现的最大次数并存储在
highest_wf_count
中。
- 找出使用次数最多的词并将其存储在
highest_wf_words
。
当我运行提供给我们的 rspec 测试时,有一个测试失败了。我打印了输出以查看问题所在,但无法修复它。
这是我的代码,rspec 测试,以及我得到的结果:
# Analyzes a single line of text for word frequency.
# This is the asker's original, failing version, reproduced verbatim; the
# comments below point out where its behaviour differs from the intent.
class LineAnalyzer
# Intended to hold the highest number of times any single word occurs.
attr_accessor :highest_wf_count
# Intended to hold the most frequently used word(s).
attr_accessor :highest_wf_words
# The text being analyzed.
attr_accessor :content
# The line number supplied by the caller.
attr_accessor :line_number
# Stores the arguments, resets the result attributes and runs the analysis.
def initialize(content, line_number)
@content = content
@line_number = line_number
@highest_wf_count = 0
# Reads the (still unset) attribute through its own accessor, so this assigns
# nil; calculate_word_frequency replaces it with a Hash right after.
@highest_wf_words = highest_wf_words
calculate_word_frequency
end
# Tallies the words of @content into @highest_wf_words and updates
# @highest_wf_count on every iteration.
def calculate_word_frequency()
# Default value 0 lets `+= 1` work for keys that have not been seen yet.
@highest_wf_words = Hash.new(0)
@content.split.each do |word|
# BUG: downcase! lowercases in place and returns nil when nothing changed,
# so every already-lowercase word is tallied under the nil key.
@highest_wf_words[word.downcase!] += 1
# BUG: counts the same word a second time under its (now lowercased) text,
# so words that contained uppercase letters end up with doubled counts.
if @highest_wf_words.has_key?(word)
@highest_wf_words[word] += 1
else
@highest_wf_words[word] = 1
end
# BUG: sort_by returns a new collection; the result is discarded here, so
# the hash itself is left in insertion order.
@highest_wf_words.sort_by{|word, count| count}
# BUG: max_by on a Hash returns a [key, count] pair rather than the count,
# which is why the printed value is an array such as [nil, 10].
@highest_wf_count = @highest_wf_words.max_by {|word, count| count}
end
end
# Replaces the reader generated by attr_accessor: prints the value with `p`
# (which also returns it) on every access.
def highest_wf_count()
p @highest_wf_count
end
end
这是rspec代码:
require 'rspec'
# Specs for LineAnalyzer: presence of the accessors, constructor behaviour,
# and the word-frequency results for a sample sentence.
describe LineAnalyzer do
# Default subject: a one-word line with line number 1.
subject(:lineAnalyzer) { LineAnalyzer.new("test", 1) }
it "has accessor for highest_wf_count" do
is_expected.to respond_to(:highest_wf_count)
end
it "has accessor for highest_wf_words" do
is_expected.to respond_to(:highest_wf_words)
end
it "has accessor for content" do
is_expected.to respond_to(:content)
end
it "has accessor for line_number" do
is_expected.to respond_to(:line_number)
end
it "has method calculate_word_frequency" do
is_expected.to respond_to(:calculate_word_frequency)
end
# The constructor arguments must be readable back unchanged.
context "attributes and values" do
it "has attributes content and line_number" do
is_expected.to have_attributes(content: "test", line_number: 1)
end
it "content attribute should have value \"test\"" do
expect(lineAnalyzer.content).to eq("test")
end
it "line_number attribute should have value 1" do
expect(lineAnalyzer.line_number).to eq(1)
end
end
# Creating an instance must invoke calculate_word_frequency.
it "calls calculate_word_frequency when created" do
expect_any_instance_of(LineAnalyzer).to receive(:calculate_word_frequency)
LineAnalyzer.new("", 1)
end
# Results for a sentence in which "really" and "you" each occur three times.
context "#calculate_word_frequency" do
subject(:lineAnalyzer) { LineAnalyzer.new("This is a really really really cool cool you you you", 2) }
# Expects a plain integer, not a [word, count] pair.
it "highest_wf_count value is 3" do
expect(lineAnalyzer.highest_wf_count).to eq(3)
end
it "highest_wf_words will include \"really\" and \"you\"" do
expect(lineAnalyzer.highest_wf_words).to include 'really', 'you'
end
it "content attribute will have value \"This is a really really really cool cool you you you\"" do
expect(lineAnalyzer.content).to eq("This is a really really really cool cool you you you")
end
it "line_number attribute will have value 2" do
expect(lineAnalyzer.line_number).to eq(2)
end
end
end
这是 rspec 输出:
13 examples, 1 failure
Failed examples:
rspec ./course01/module02/assignment-Calc-Max-Word-Freq/spec/line_analyzer_spec.rb:42 # LineAnalyzer#calculate_word_frequency highest_wf_count value is 3
我的输出:
#<LineAnalyzer:0x00007fc7f9018858 @content="This is a really really really cool cool you you you", @line_number=2, @highest_wf_count=[nil, 10], @highest_wf_words={"this"=>2, nil=>10, "is"=>1, "a"=>1, "really"=>3, "cool"=>2, "you"=>3}>
- 根据测试字符串,字数不正确。
- "nil" 被包含在哈希中。
- 散列未按应有的值(计数)排序。
我尝试了几种方法来解决这些问题,但都没有效果。我又看了一遍课程讲义,但找不到任何有用的内容,而且讨论区也很少有人关注学生提出的问题。
nil:
nil 来自 downcase!
它会就地修改字符串;如果字符串没有发生任何更改,则返回 nil
。
如果你说 "this is weird",那么你是对的(恕我直言)。
# just use the non destructive variant
word.downcase
排序:
sort_by
返回一个新的数组(对 Hash 调用时,得到的是由 [键, 值] 对组成的数组),并且不会修改接收者。您需要把结果重新赋值给变量;sort_by! 只存在于 Array 上,Hash 没有这个方法。
# Demonstrates that sort returns a new array and leaves the receiver as it
# was, while sort! reorders the receiver itself.
unsorted = [3, 1, 2]
sorted = unsorted.sort
p unsorted # => [3, 1, 2]
p sorted # => [1, 2, 3]
unsorted.sort!
p unsorted # => [1, 2, 3]
字数统计错误:
修正这两个错误之后,结果应该会好一些。请注意,max_by 返回的不是单个整数,而是一个包含单词和计数的双元素数组,因此 @highest_wf_count 看起来会像这样:["really", 6](计数仍然是翻倍的,因为每个单词在循环里被累加了两次)。
简化事情:
如果你能用ruby 2.7,那就有好用的Enumerable#tally
方法了!
%w(foo foo bar foo baz foo).tally
=> {"foo"=>4, "bar"=>1, "baz"=>1}
示例取自
https://medium.com/@baweaver/ruby-2-7-enumerable-tally-a706a5fb11ea
downcase!(*args) public
Downcases the contents of str, returning nil if no changes were made.
由于 .downcase!
方法的这种意外行为,如果单词已经全部小写,您将在该行中增加 nil
的出现次数:
@highest_wf_words[word.downcase!] += 1
测试也失败了,因为 @highest_wf_words.max_by {|word, count| count}
返回的是一个包含单词和计数的数组([单词, 计数]),而我们只想要计数。
通过测试的简化 calculate_word_frequency
方法如下所示:
# Counts how often each word occurs in @content, ignoring case, and records
# the highest count.
#
# Sets @highest_wf_words to a Hash mapping every downcased word to its number
# of occurrences (unknown keys read as 0), and @highest_wf_count to the
# largest of those counts, or 0 when @content contains no words.
#
# NOTE(review): the assignment text asks for only the most frequent words in
# highest_wf_words; this keeps the full word => count hash, as before.
#
# Returns the highest count.
def calculate_word_frequency
  # The default of 0 means a word never has to be checked for existence first.
  @highest_wf_words = Hash.new(0)

  # Use the non-destructive downcase: downcase! returns nil when the word is
  # already lowercase, which would tally such words under a nil key.
  @content.split.each { |word| @highest_wf_words[word.downcase] += 1 }

  # Take the maximum once, after counting, instead of on every iteration.
  # values.max is nil for an empty hash, hence the fallback to 0.
  @highest_wf_count = @highest_wf_words.values.max || 0
end
我一直在做 Coursera 上 Rails 入门课程的这项作业。我们的任务是编写一个程序来计算文本文件中的最大词频。我们被要求创建一个方法,用来:
- 计算单个单词在给定内容中出现的最大次数并存储在
highest_wf_count
中。
- 找出使用次数最多的词并将其存储在
highest_wf_words
。
当我运行提供给我们的 rspec 测试时,有一个测试失败了。我打印了输出以查看问题所在,但无法修复它。
这是我的代码,rspec 测试,以及我得到的结果:
# Analyzes a single line of text for word frequency.
# This is the asker's original, failing version, reproduced verbatim; the
# comments below point out where its behaviour differs from the intent.
class LineAnalyzer
# Intended to hold the highest number of times any single word occurs.
attr_accessor :highest_wf_count
# Intended to hold the most frequently used word(s).
attr_accessor :highest_wf_words
# The text being analyzed.
attr_accessor :content
# The line number supplied by the caller.
attr_accessor :line_number
# Stores the arguments, resets the result attributes and runs the analysis.
def initialize(content, line_number)
@content = content
@line_number = line_number
@highest_wf_count = 0
# Reads the (still unset) attribute through its own accessor, so this assigns
# nil; calculate_word_frequency replaces it with a Hash right after.
@highest_wf_words = highest_wf_words
calculate_word_frequency
end
# Tallies the words of @content into @highest_wf_words and updates
# @highest_wf_count on every iteration.
def calculate_word_frequency()
# Default value 0 lets `+= 1` work for keys that have not been seen yet.
@highest_wf_words = Hash.new(0)
@content.split.each do |word|
# BUG: downcase! lowercases in place and returns nil when nothing changed,
# so every already-lowercase word is tallied under the nil key.
@highest_wf_words[word.downcase!] += 1
# BUG: counts the same word a second time under its (now lowercased) text,
# so words that contained uppercase letters end up with doubled counts.
if @highest_wf_words.has_key?(word)
@highest_wf_words[word] += 1
else
@highest_wf_words[word] = 1
end
# BUG: sort_by returns a new collection; the result is discarded here, so
# the hash itself is left in insertion order.
@highest_wf_words.sort_by{|word, count| count}
# BUG: max_by on a Hash returns a [key, count] pair rather than the count,
# which is why the printed value is an array such as [nil, 10].
@highest_wf_count = @highest_wf_words.max_by {|word, count| count}
end
end
# Replaces the reader generated by attr_accessor: prints the value with `p`
# (which also returns it) on every access.
def highest_wf_count()
p @highest_wf_count
end
end
这是rspec代码:
require 'rspec'
# Specs for LineAnalyzer: presence of the accessors, constructor behaviour,
# and the word-frequency results for a sample sentence.
describe LineAnalyzer do
# Default subject: a one-word line with line number 1.
subject(:lineAnalyzer) { LineAnalyzer.new("test", 1) }
it "has accessor for highest_wf_count" do
is_expected.to respond_to(:highest_wf_count)
end
it "has accessor for highest_wf_words" do
is_expected.to respond_to(:highest_wf_words)
end
it "has accessor for content" do
is_expected.to respond_to(:content)
end
it "has accessor for line_number" do
is_expected.to respond_to(:line_number)
end
it "has method calculate_word_frequency" do
is_expected.to respond_to(:calculate_word_frequency)
end
# The constructor arguments must be readable back unchanged.
context "attributes and values" do
it "has attributes content and line_number" do
is_expected.to have_attributes(content: "test", line_number: 1)
end
it "content attribute should have value \"test\"" do
expect(lineAnalyzer.content).to eq("test")
end
it "line_number attribute should have value 1" do
expect(lineAnalyzer.line_number).to eq(1)
end
end
# Creating an instance must invoke calculate_word_frequency.
it "calls calculate_word_frequency when created" do
expect_any_instance_of(LineAnalyzer).to receive(:calculate_word_frequency)
LineAnalyzer.new("", 1)
end
# Results for a sentence in which "really" and "you" each occur three times.
context "#calculate_word_frequency" do
subject(:lineAnalyzer) { LineAnalyzer.new("This is a really really really cool cool you you you", 2) }
# Expects a plain integer, not a [word, count] pair.
it "highest_wf_count value is 3" do
expect(lineAnalyzer.highest_wf_count).to eq(3)
end
it "highest_wf_words will include \"really\" and \"you\"" do
expect(lineAnalyzer.highest_wf_words).to include 'really', 'you'
end
it "content attribute will have value \"This is a really really really cool cool you you you\"" do
expect(lineAnalyzer.content).to eq("This is a really really really cool cool you you you")
end
it "line_number attribute will have value 2" do
expect(lineAnalyzer.line_number).to eq(2)
end
end
end
这是 rspec 输出:
13 examples, 1 failure
Failed examples:
rspec ./course01/module02/assignment-Calc-Max-Word-Freq/spec/line_analyzer_spec.rb:42 # LineAnalyzer#calculate_word_frequency highest_wf_count value is 3
我的输出:
#<LineAnalyzer:0x00007fc7f9018858 @content="This is a really really really cool cool you you you", @line_number=2, @highest_wf_count=[nil, 10], @highest_wf_words={"this"=>2, nil=>10, "is"=>1, "a"=>1, "really"=>3, "cool"=>2, "you"=>3}>
- 根据测试字符串,字数不正确。
- "nil" 被包含在哈希中。
- 散列未按应有的值(计数)排序。
我尝试了几种方法来解决这些问题,但都没有效果。我又看了一遍课程讲义,但找不到任何有用的内容,而且讨论区也很少有人关注学生提出的问题。
nil:
nil 来自 downcase!
它会就地修改字符串;如果字符串没有发生任何更改,则返回 nil
。
如果你说 "this is weird",那么你是对的(恕我直言)。
# just use the non destructive variant
word.downcase
排序:
sort_by
返回一个新的数组(对 Hash 调用时,得到的是由 [键, 值] 对组成的数组),并且不会修改接收者。您需要把结果重新赋值给变量;sort_by! 只存在于 Array 上,Hash 没有这个方法。
# Demonstrates that sort returns a new array and leaves the receiver as it
# was, while sort! reorders the receiver itself.
unsorted = [3, 1, 2]
sorted = unsorted.sort
p unsorted # => [3, 1, 2]
p sorted # => [1, 2, 3]
unsorted.sort!
p unsorted # => [1, 2, 3]
字数统计错误:
修正这两个错误之后,结果应该会好一些。请注意,max_by 返回的不是单个整数,而是一个包含单词和计数的双元素数组,因此 @highest_wf_count 看起来会像这样:["really", 6](计数仍然是翻倍的,因为每个单词在循环里被累加了两次)。
简化事情:
如果你能用ruby 2.7,那就有好用的Enumerable#tally
方法了!
%w(foo foo bar foo baz foo).tally
=> {"foo"=>4, "bar"=>1, "baz"=>1}
示例取自 https://medium.com/@baweaver/ruby-2-7-enumerable-tally-a706a5fb11ea
downcase!(*args) public
Downcases the contents of str, returning nil if no changes were made.
由于 .downcase!
方法的这种意外行为,如果单词已经全部小写,您将在该行中增加 nil
的出现次数:
@highest_wf_words[word.downcase!] += 1
测试也失败了,因为 @highest_wf_words.max_by {|word, count| count}
返回的是一个包含单词和计数的数组([单词, 计数]),而我们只想要计数。
通过测试的简化 calculate_word_frequency
方法如下所示:
# Counts how often each word occurs in @content, ignoring case, and records
# the highest count.
#
# Sets @highest_wf_words to a Hash mapping every downcased word to its number
# of occurrences (unknown keys read as 0), and @highest_wf_count to the
# largest of those counts, or 0 when @content contains no words.
#
# NOTE(review): the assignment text asks for only the most frequent words in
# highest_wf_words; this keeps the full word => count hash, as before.
#
# Returns the highest count.
def calculate_word_frequency
  # The default of 0 means a word never has to be checked for existence first.
  @highest_wf_words = Hash.new(0)

  # Use the non-destructive downcase: downcase! returns nil when the word is
  # already lowercase, which would tally such words under a nil key.
  @content.split.each { |word| @highest_wf_words[word.downcase] += 1 }

  # Take the maximum once, after counting, instead of on every iteration.
  # values.max is nil for an empty hash, hence the fallback to 0.
  @highest_wf_count = @highest_wf_words.values.max || 0
end