3 个或更多字符串的“组合式”差异/交集
"Combined" difference / intersection of 3 or more strings
[更新问题]
我正在使用将语音转换为文本的服务。
该服务会返回 3 个备选句子,例如:
[
"News update I read a Jones, near record snowfalls...",
"News update I'm Rita Jones, near record snowfalls...",
"News update I am Rita Jones, near record snow-falls..."
]
我希望能够让用户从每个句子中选择最好的部分。
三个选项中相同的句子部分应该作为一个单独的项目呈现,因为没有选择:
["News", "Update", ...]
句子中不同的部分应显示为数组,因为可以做出选择:
[["I", "I'm", "I am"], ["read a", "Rita"]]
最终输出如下所示:
[
["News"],
["update"],
["I", "I'm", "I am"],
["read a", "Rita"],
["Jones,"],
["near"],
["record"],
["snowfalls", "snow-balls"]
]
虽然以下也是可以接受的:
[
["News update"],
["I", "I'm", "I am"],
["read a", "Rita"],
["Jones, near record"],
["snowfalls", "snow-balls"]
]
diff/wdiff 也许能解决这个问题,但我没有找到具体的做法。
我正在使用 Ruby,但也对任何 Linux 命令行工具感到满意。
虽然这看起来像是家庭作业,但我还是上钩了。这里的诀窍在于弄清楚哪些词可以分到同一组、哪些不能。你在问题中没有给出这方面的具体规则,所以我认为这一点可以自行解释。
# Sample input: three alternative transcriptions of the same utterance.
sentences = [
"News update I read a Jones, near record snowfalls...",
"News update I'm Rita Jones, near record snowfalls...",
"News update I am Rita Jones, near record snow-balls..."
]
# Words that push_together appends to the token before them instead of
# keeping them as tokens of their own.
@small_words = %w(a am)
# Merges every "small" word into the token that precedes it.
#
# words - Array of String tokens taken from one sentence.
#
# The list of small words is read from @small_words. A small word that has
# nothing before it (it is the first token) is kept as a token of its own.
#
# Returns a new Array of Strings; neither +words+ nor its strings are modified.
def push_together(words)
  words.each_with_object([]) do |word, merged|
    if @small_words.include?(word) && !merged.empty?
      # Attach to the most recently emitted token. Addressing the result by the
      # position in +words+ drifts as soon as an earlier merge has shortened
      # the result, and has no valid target for a leading small word.
      merged[-1] = "#{merged[-1]} #{word}"
    else
      merged << word
    end
  end
end
# Splits each sentence into tokens on whitespace. Sentences that contain at
# least one entry of @small_words are additionally passed through
# push_together; the others keep their plain token list.
#
# sentences - Array of sentence Strings.
#
# Returns an Array holding one token Array per sentence, in input order.
def words_in_sentences(sentences)
  sentences.map do |sentence|
    tokens = sentence.split(" ")
    has_small_word = tokens.any? { |token| @small_words.include?(token) }
    has_small_word ? push_together(tokens) : tokens
  end
end
# Tokenise every sentence, then collect the tokens column by column:
# slot i gathers the i-th token of each sentence.
new_sentences = words_in_sentences(sentences)
grouped_words = []
new_sentences.each do |sentence|
  sentence.each_with_index do |word, i|
    (grouped_words[i] ||= []) << word
  end
end
# Print every column with repeated tokens removed.
p grouped_words.map(&:uniq)
输出:
[["News"], ["update"], ["I", "I'm", "I am"], ["read a", "Rita"], ["Jones,"], ["near"], ["record"], ["snowfalls...", "snow-balls..."]]
这是一个更通用的解决方案:
# try to sync a list of strings by finding "sections" of identical words

# Maximum lookahead (in case of nonidentical words). find_match multiplies
# this by the number of word lists to get the largest total lookahead it
# distributes among them before giving up.
MAX_DIFF = 5
# Generates every index array that results from increasing exactly one
# position of one of the given index arrays by 1.
#
# arrs - Array of index arrays (Arrays of Integer), all of the same length.
#
# Returns a new Array of index arrays, built position by position, with
# repeated index arrays dropped (the first occurrence is kept). The input
# arrays are not modified. An empty +arrs+ yields an empty Array.
def inc_some(arrs)
  return [] if arrs.empty?

  bumped = arrs.first.each_index.flat_map do |i|
    # increase i-th position of every index array
    arrs.map do |a|
      b = a.clone
      b[i] += 1
      b
    end
  end
  # Different increments reach the same index array ([1, 0] bumped at position
  # 1 and [0, 1] bumped at position 0 both give [1, 1]). Dropping the repeats
  # keeps repeated application from growing as size**steps.
  bumped.uniq
end
# Tells whether an index array marks a "section" of identical words.
#
# sens - Array of word lists; a nil entry stands for the empty string "".
# a    - index array holding one offset per word list.
#
# Returns true when the words found at the given offsets are all the same.
# An offset past the end of a list reads as nil, so lists that are all
# exhausted at their offsets count as a match as well.
def matches(sens, a)
  picked = sens.zip(a).map { |sen, offs| sen ? sen[offs] : "" }
  picked.uniq.length == 1
end
# Finds the nearest point at which all word lists line up again.
#
# sens - Array of word lists (Arrays of String).
#
# Candidates are tried by increasing total lookahead: 0 words (the current
# words), then 1, 2, ... up to MAX_DIFF * sens.size words distributed among
# the lists in every possible way.
#
# Returns the first index array (one offset per word list) accepted by
# matches, or nil when no candidate within the lookahead limit matches.
def find_match(sens)
  len = sens.size
  arrs = [[0] * len] # start with current words
  (0..MAX_DIFF * len).each do |m|
    # Grow last round's candidates by one more word of lookahead instead of
    # rebuilding them from scratch, and drop repeated index arrays. uniq keeps
    # the first occurrence, so candidates are still visited in the same order.
    arrs = inc_some(arrs).uniq unless m.zero?
    # stop search as soon as a match has been found
    found = arrs.find { |a| matches(sens, a) }
    return found if found
  end
  # no match has been found
  nil
end
# Splits the word lists into aligned sections, one section per round.
#
# sens - Array of word lists (Arrays of String) still to be consumed.
# acc  - Array that receives the sections found; it is appended to in place.
#
# Each round asks find_match for the next point where all lists agree. If the
# lists agree right away, that single word is consumed as a section;
# otherwise only the differing words in front of the agreement are consumed.
#
# Returns {:ok => acc} once all lists are used up, or
# {:partial_solution => acc} when find_match gives up.
def sync1(sens, acc)
  loop do
    return { :ok => acc } if sens.join("").empty?

    found = find_match(sens)
    # no match found? Then we only have a partial solution
    return { :partial_solution => acc } if found.nil?

    # pair word lists with their match offsets
    paired = sens.zip(found)
    # immediate match: take the matching word itself; otherwise stop before it
    step = paired.map { |pair| pair[1] }.max == 0 ? 1 : 0

    section = paired.map do |s, offs|
      s.nil? ? "" : s[0, offs + step].join(" ")
    end
    sens = paired.map do |s, offs|
      s.nil? ? [] : s[offs + step, s.size]
    end
    acc << section.uniq
  end
end
# Synchronises a list of sentences.
#
# sens - Array of sentence Strings.
#
# Every sentence is split into words on whitespace and the word lists are
# handed to sync1 together with a fresh, empty accumulator.
#
# Returns whatever sync1 returns.
def sync(sens)
  word_lists = sens.map { |sentence| sentence.split(" ") }
  sync1(word_lists, [])
end
# Sample input: three alternative transcriptions of the same utterance.
sentences = [
"News update I read a Jones, near record snowfalls...",
"News update I'm Rita Jones, near record snowfalls...",
"News update I am Rita Jones, near record snow-falls..."
]
# sync returns a Hash: the sections are stored under :ok when every word was
# consumed, or under :partial_solution when find_match gave up.
puts sync(sentences).inspect
#> {:ok=>[["News"], ["update"], ["I read a", "I'm Rita", "I am Rita"], ["Jones,"], ["near"], ["record"], ["snowfalls...", "snow-falls..."]]}
# (Ruby 3.4 and later print the key as "ok:" instead of ":ok=>".)
[更新问题] 我正在使用将语音转换为文本的服务。
该服务会返回 3 个备选句子,例如:
[
"News update I read a Jones, near record snowfalls...",
"News update I'm Rita Jones, near record snowfalls...",
"News update I am Rita Jones, near record snow-falls..."
]
我希望能够让用户从每个句子中选择最好的部分。
三个选项中相同的句子部分应该作为一个单独的项目呈现,因为没有选择:
["News", "Update", ...]
句子中不同的部分应显示为数组,因为可以做出选择:
[["I", "I'm", "I am"], ["read a", "Rita"]]
最终输出如下所示:
[
["News"],
["update"],
["I", "I'm", "I am"],
["read a", "Rita"],
["Jones,"],
["near"],
["record"],
["snowfalls", "snow-balls"]
]
虽然以下也是可以接受的:
[
["News update"],
["I", "I'm", "I am"],
["read a", "Rita"],
["Jones, near record"],
["snowfalls", "snow-balls"]
]
diff/wdiff 也许能解决这个问题,但我没有找到具体的做法。
我正在使用 Ruby,但也对任何 Linux 命令行工具感到满意。
虽然这看起来像是家庭作业,但我还是上钩了。这里的诀窍在于弄清楚哪些词可以分到同一组、哪些不能。你在问题中没有给出这方面的具体规则,所以我认为这一点可以自行解释。
# Sample input: three alternative transcriptions of the same utterance.
sentences = [
"News update I read a Jones, near record snowfalls...",
"News update I'm Rita Jones, near record snowfalls...",
"News update I am Rita Jones, near record snow-balls..."
]
# Words that push_together appends to the token before them instead of
# keeping them as tokens of their own.
@small_words = %w(a am)
# Merges every "small" word into the token that precedes it.
#
# words - Array of String tokens taken from one sentence.
#
# The list of small words is read from @small_words. A small word that has
# nothing before it (it is the first token) is kept as a token of its own.
#
# Returns a new Array of Strings; neither +words+ nor its strings are modified.
def push_together(words)
  words.each_with_object([]) do |word, merged|
    if @small_words.include?(word) && !merged.empty?
      # Attach to the most recently emitted token. Addressing the result by the
      # position in +words+ drifts as soon as an earlier merge has shortened
      # the result, and has no valid target for a leading small word.
      merged[-1] = "#{merged[-1]} #{word}"
    else
      merged << word
    end
  end
end
# Splits each sentence into tokens on whitespace. Sentences that contain at
# least one entry of @small_words are additionally passed through
# push_together; the others keep their plain token list.
#
# sentences - Array of sentence Strings.
#
# Returns an Array holding one token Array per sentence, in input order.
def words_in_sentences(sentences)
  sentences.map do |sentence|
    tokens = sentence.split(" ")
    has_small_word = tokens.any? { |token| @small_words.include?(token) }
    has_small_word ? push_together(tokens) : tokens
  end
end
# Tokenise every sentence, then collect the tokens column by column:
# slot i gathers the i-th token of each sentence.
new_sentences = words_in_sentences(sentences)
grouped_words = []
new_sentences.each do |sentence|
  sentence.each_with_index do |word, i|
    (grouped_words[i] ||= []) << word
  end
end
# Print every column with repeated tokens removed.
p grouped_words.map(&:uniq)
输出:
[["News"], ["update"], ["I", "I'm", "I am"], ["read a", "Rita"], ["Jones,"], ["near"], ["record"], ["snowfalls...", "snow-balls..."]]
这是一个更通用的解决方案:
# try to sync a list of strings by finding "sections" of identical words

# Maximum lookahead (in case of nonidentical words). find_match multiplies
# this by the number of word lists to get the largest total lookahead it
# distributes among them before giving up.
MAX_DIFF = 5
# Generates every index array that results from increasing exactly one
# position of one of the given index arrays by 1.
#
# arrs - Array of index arrays (Arrays of Integer), all of the same length.
#
# Returns a new Array of index arrays, built position by position, with
# repeated index arrays dropped (the first occurrence is kept). The input
# arrays are not modified. An empty +arrs+ yields an empty Array.
def inc_some(arrs)
  return [] if arrs.empty?

  bumped = arrs.first.each_index.flat_map do |i|
    # increase i-th position of every index array
    arrs.map do |a|
      b = a.clone
      b[i] += 1
      b
    end
  end
  # Different increments reach the same index array ([1, 0] bumped at position
  # 1 and [0, 1] bumped at position 0 both give [1, 1]). Dropping the repeats
  # keeps repeated application from growing as size**steps.
  bumped.uniq
end
# Tells whether an index array marks a "section" of identical words.
#
# sens - Array of word lists; a nil entry stands for the empty string "".
# a    - index array holding one offset per word list.
#
# Returns true when the words found at the given offsets are all the same.
# An offset past the end of a list reads as nil, so lists that are all
# exhausted at their offsets count as a match as well.
def matches(sens, a)
  picked = sens.zip(a).map { |sen, offs| sen ? sen[offs] : "" }
  picked.uniq.length == 1
end
# Finds the nearest point at which all word lists line up again.
#
# sens - Array of word lists (Arrays of String).
#
# Candidates are tried by increasing total lookahead: 0 words (the current
# words), then 1, 2, ... up to MAX_DIFF * sens.size words distributed among
# the lists in every possible way.
#
# Returns the first index array (one offset per word list) accepted by
# matches, or nil when no candidate within the lookahead limit matches.
def find_match(sens)
  len = sens.size
  arrs = [[0] * len] # start with current words
  (0..MAX_DIFF * len).each do |m|
    # Grow last round's candidates by one more word of lookahead instead of
    # rebuilding them from scratch, and drop repeated index arrays. uniq keeps
    # the first occurrence, so candidates are still visited in the same order.
    arrs = inc_some(arrs).uniq unless m.zero?
    # stop search as soon as a match has been found
    found = arrs.find { |a| matches(sens, a) }
    return found if found
  end
  # no match has been found
  nil
end
# Splits the word lists into aligned sections, one section per round.
#
# sens - Array of word lists (Arrays of String) still to be consumed.
# acc  - Array that receives the sections found; it is appended to in place.
#
# Each round asks find_match for the next point where all lists agree. If the
# lists agree right away, that single word is consumed as a section;
# otherwise only the differing words in front of the agreement are consumed.
#
# Returns {:ok => acc} once all lists are used up, or
# {:partial_solution => acc} when find_match gives up.
def sync1(sens, acc)
  loop do
    return { :ok => acc } if sens.join("").empty?

    found = find_match(sens)
    # no match found? Then we only have a partial solution
    return { :partial_solution => acc } if found.nil?

    # pair word lists with their match offsets
    paired = sens.zip(found)
    # immediate match: take the matching word itself; otherwise stop before it
    step = paired.map { |pair| pair[1] }.max == 0 ? 1 : 0

    section = paired.map do |s, offs|
      s.nil? ? "" : s[0, offs + step].join(" ")
    end
    sens = paired.map do |s, offs|
      s.nil? ? [] : s[offs + step, s.size]
    end
    acc << section.uniq
  end
end
# Synchronises a list of sentences.
#
# sens - Array of sentence Strings.
#
# Every sentence is split into words on whitespace and the word lists are
# handed to sync1 together with a fresh, empty accumulator.
#
# Returns whatever sync1 returns.
def sync(sens)
  word_lists = sens.map { |sentence| sentence.split(" ") }
  sync1(word_lists, [])
end
# Sample input: three alternative transcriptions of the same utterance.
sentences = [
"News update I read a Jones, near record snowfalls...",
"News update I'm Rita Jones, near record snowfalls...",
"News update I am Rita Jones, near record snow-falls..."
]
# sync returns a Hash: the sections are stored under :ok when every word was
# consumed, or under :partial_solution when find_match gave up.
puts sync(sentences).inspect
#> {:ok=>[["News"], ["update"], ["I read a", "I'm Rita", "I am Rita"], ["Jones,"], ["near"], ["record"], ["snowfalls...", "snow-falls..."]]}
# (Ruby 3.4 and later print the key as "ok:" instead of ":ok=>".)