使用 Nokogiri 抓取 b 元素之间的所有内容
Grab everything between b elements with Nokogiri
这里是 HTML:
<tr class="level2">
<td>
<b>word</b>
"Text I need"
<b>word</b>
"Text I need"
<b>word</b>
"Text I need"
<b>word</b>
"Text I need"
<i>blabla</i>
"Text I need"
<b>word</b>
"Text I need"
<i>blabla</i>
"Text I need"
<i>blabla</i>
<b>word</b>
</td>
</tr>
我想选取每一对 <b> 元素之间的所有节点,以便稍后逐组遍历。目前我的代码是:
translations = page.xpath('//text()[preceding-sibling::b]')
当 <b> 元素之间只有文本时,它工作正常。但是,当 <b> 元素之间出现一个或多个 <i> 标签时,每个节点只能得到其中的第一段文本,其余文本会被归到下一个节点。
我想要输出:
node 1: Text I need
node 2: Text I need
node 3: Text I need
node 4: Text I need
Text I need
node 5: Text I need
Text I need
这是代码:
# Scrapes dictionary entries from dict.ibs.ee: for every generated letter
# combination it fetches a result page, pairs each <b> headword with a text
# node that follows a <b>, and appends the pairs to CSV files under "words/".
require 'rubygems'
require 'open-uri'
require 'nokogiri' # parse html
require 'csv'

DATA_DIR = "words"
# File.exists? was removed in Ruby 3.2; File.exist? works on every version.
Dir.mkdir(DATA_DIR) unless File.exist?(DATA_DIR) # making directory
BASE_LINK = "http://dict.ibs.ee/translate.cgi?word="
LANGUAGE = "&language=English"
WILDCARD = "*"
SLEEP_TIME = 0.1 # sleep between web requests in seconds

counter = 1 # counter for file name
i = 1
name = "IBSwords"+"#{counter}"+".csv"
alphabet = %w[a b c d e f g h i j k l m n o p q r s t u v w x y z]
# combination from 4 letters
four_letter_combinations = alphabet.product(alphabet, alphabet, alphabet).map(&:join)

four_letter_combinations.each do |combination|
  begin
    i += 1
    # Roll over to a new CSV file name whenever the counter hits a multiple of 150000.
    if (i % 150000) == 0
      counter += 1
      name = "IBSwords"+"#{counter}"+".csv"
    end
    sleep(SLEEP_TIME)
    # NOTE(review): the query word is hard-coded to "about"; `combination` and
    # WILDCARD are never used in the URL. Presumably a debugging leftover - confirm.
    link = BASE_LINK+"about"+LANGUAGE
    # Kernel#open no longer opens URLs on Ruby 3.0+; open-uri exposes URI.open instead.
    page = Nokogiri::HTML(URI.open(link))
  rescue StandardError => e
    # Retry in 60 seconds if the request failed.
    # NOTE(review): the retry is unbounded and re-runs `i += 1` above on every attempt.
    puts "#{e} No Connection, retrying..."
    sleep 60
    retry
  else
    en_words = page.css('blockquote > dl > dd > b')
    # Nokogiri's #css always returns a NodeSet (never nil), so the former
    # `.nil?` guard could not skip anything; test for an empty result instead.
    next if en_words.empty?

    puts "*****************#{i} #{combination}***********"
    #ee_words = page.css('blockquote > dl > dd').to_s.split(/<b>.*<\/b>/)
    ee_words = page.xpath('//text()[preceding-sibling::b]')
    # iterating through
    en_words.zip(ee_words).each do |word, ee_word|
      en_word = word.text.chomp.strip
      ee_trans = ee_word.text.chomp.strip
      #en_desc = word.xpath('td[2]/node()[not(self::strong)]').text
      puts "#{en_word}"
      puts "#{ee_trans}"
      puts "*******************************"
      i += 1
      # writing to csv (append mode, one row per headword/translation pair)
      CSV.open("words/#{name}", "ab") do |row|
        row << [
          en_word,
          #en_desc,
          ee_trans,
          #ee_desc
        ]
      end
    end
  end
end
您可能正在寻找纯 XPath 的解决方案,但这里给出一个使用 Ruby 枚举器的解决方案:
# Walk the children of the <td> in document order: every <b> opens a new
# entry keyed by its first child, and each text node is appended to the most
# recently opened entry.
xml.xpath('//td').children.each_with_object({}) do |node, groups|
  if node.name == 'b'
    groups[node.children.first.to_s] = ""
  elsif node.name == 'text'
    # Text seen before any <b> has no entry to attach to, so it is dropped.
    groups[groups.keys.last] << node.to_s unless groups.empty?
  end
  # Every other node type is skipped.
end
这给出:
#⇒ {
# "word 1" => "\n \"Text I need 1\"\n ",
# "word 2" => "\n \"Text I need 2\"\n ",
# "word 3" => "\n \"Text I need 3\"\n ",
# "word 4" => "\n \"Text I need 41\"\n \n \"Text I need 42\"\n ",
# "word 5" => "\n \"Text I need 51\"\n \n \"Text I need 52\"\n \n ",
# "word 6" => "\n\n "
# }
希望对您有所帮助。
我精简了您的 HTML 以减少冗余;去掉多余的文本后,它仍能演示同样的情况。
我会这样做:
require 'nokogiri'

# Shortened sample markup: <b> headwords followed by the wanted text, with
# <i> elements interleaved.
html = <<EOT
<tr class="level2">
<td>
<b>word</b>
"Text I need"
<b>word</b>
"Text I need"
<i>blabla</i>
"Text I need"
<b>word</b>
"Text I need"
<i>blabla</i>
"Text I need"
<i>blabla</i>
<b>word</b>
</td>
</tr>
EOT

doc = Nokogiri::HTML(html)

# Remove every <i> inside the <td> from the document before grouping.
doc.css('td i').remove
由于不需要 <i> 节点,直接把它们移除即可。移除之后的 doc 如下所示:
# Print the serialized document to inspect the markup after the removal.
puts doc.to_html
# >> <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
# >> <html><body>
# >> <tr class="level2">
# >> <td>
# >> <b>word</b>
# >> "Text I need"
# >> <b>word</b>
# >> "Text I need"
# >>
# >> "Text I need"
# >> <b>word</b>
# >> "Text I need"
# >>
# >> "Text I need"
# >>
# >> <b>word</b>
# >>
# >> </td>
# >> </tr>
# >> </body></html>
一旦 <i> 节点被移除,就可以遍历 <td> 的子节点并处理它们的文本:
# Take the <td> children, discard whitespace-only nodes, start a new group at
# every <b>, and reduce each node in a group to its stripped text.
cells = doc.at('td').children
non_blank = cells.reject { |node| node.text.strip == '' }
text = non_blank.slice_before { |node| node.name == 'b' }
                .map { |group| group.map { |node| node.text.strip } }
此时 text 包含:
text
# => [["word", "\"Text I need\""],
# ["word", "\"Text I need\"", "\"Text I need\""],
# ["word", "\"Text I need\"", "\"Text I need\""],
# ["word"]]
请注意,结果末尾有一个单独的 "word",这与您提供的示例 HTML 一致。如果您确定不需要保留末尾的这一项,可以直接用 pop 将它弹出。如果您认为还会有其他只含单个项的元素,可以遍历列表找出这些单项元素并将其剔除(reject)。具体如何处理,由您决定。
这里是 HTML:
<tr class="level2">
<td>
<b>word</b>
"Text I need"
<b>word</b>
"Text I need"
<b>word</b>
"Text I need"
<b>word</b>
"Text I need"
<i>blabla</i>
"Text I need"
<b>word</b>
"Text I need"
<i>blabla</i>
"Text I need"
<i>blabla</i>
<b>word</b>
</td>
</tr>
我想选取每一对 <b> 元素之间的所有节点,以便稍后逐组遍历。目前我的代码是:
translations = page.xpath('//text()[preceding-sibling::b]')
当 <b> 元素之间只有文本时,它工作正常。但是,当 <b> 元素之间出现一个或多个 <i> 标签时,每个节点只能得到其中的第一段文本,其余文本会被归到下一个节点。
我想要输出:
node 1: Text I need
node 2: Text I need
node 3: Text I need
node 4: Text I need
Text I need
node 5: Text I need
Text I need
这是代码:
# Scrapes dictionary entries from dict.ibs.ee: for every generated letter
# combination it fetches a result page, pairs each <b> headword with a text
# node that follows a <b>, and appends the pairs to CSV files under "words/".
require 'rubygems'
require 'open-uri'
require 'nokogiri' # parse html
require 'csv'

DATA_DIR = "words"
# File.exists? was removed in Ruby 3.2; File.exist? works on every version.
Dir.mkdir(DATA_DIR) unless File.exist?(DATA_DIR) # making directory
BASE_LINK = "http://dict.ibs.ee/translate.cgi?word="
LANGUAGE = "&language=English"
WILDCARD = "*"
SLEEP_TIME = 0.1 # sleep between web requests in seconds

counter = 1 # counter for file name
i = 1
name = "IBSwords"+"#{counter}"+".csv"
alphabet = %w[a b c d e f g h i j k l m n o p q r s t u v w x y z]
# combination from 4 letters
four_letter_combinations = alphabet.product(alphabet, alphabet, alphabet).map(&:join)

four_letter_combinations.each do |combination|
  begin
    i += 1
    # Roll over to a new CSV file name whenever the counter hits a multiple of 150000.
    if (i % 150000) == 0
      counter += 1
      name = "IBSwords"+"#{counter}"+".csv"
    end
    sleep(SLEEP_TIME)
    # NOTE(review): the query word is hard-coded to "about"; `combination` and
    # WILDCARD are never used in the URL. Presumably a debugging leftover - confirm.
    link = BASE_LINK+"about"+LANGUAGE
    # Kernel#open no longer opens URLs on Ruby 3.0+; open-uri exposes URI.open instead.
    page = Nokogiri::HTML(URI.open(link))
  rescue StandardError => e
    # Retry in 60 seconds if the request failed.
    # NOTE(review): the retry is unbounded and re-runs `i += 1` above on every attempt.
    puts "#{e} No Connection, retrying..."
    sleep 60
    retry
  else
    en_words = page.css('blockquote > dl > dd > b')
    # Nokogiri's #css always returns a NodeSet (never nil), so the former
    # `.nil?` guard could not skip anything; test for an empty result instead.
    next if en_words.empty?

    puts "*****************#{i} #{combination}***********"
    #ee_words = page.css('blockquote > dl > dd').to_s.split(/<b>.*<\/b>/)
    ee_words = page.xpath('//text()[preceding-sibling::b]')
    # iterating through
    en_words.zip(ee_words).each do |word, ee_word|
      en_word = word.text.chomp.strip
      ee_trans = ee_word.text.chomp.strip
      #en_desc = word.xpath('td[2]/node()[not(self::strong)]').text
      puts "#{en_word}"
      puts "#{ee_trans}"
      puts "*******************************"
      i += 1
      # writing to csv (append mode, one row per headword/translation pair)
      CSV.open("words/#{name}", "ab") do |row|
        row << [
          en_word,
          #en_desc,
          ee_trans,
          #ee_desc
        ]
      end
    end
  end
end
您可能正在寻找纯 XPath 的解决方案,但这里给出一个使用 Ruby 枚举器的解决方案:
# Walk the children of the <td> in document order: every <b> opens a new
# entry keyed by its first child, and each text node is appended to the most
# recently opened entry.
xml.xpath('//td').children.each_with_object({}) do |node, groups|
  if node.name == 'b'
    groups[node.children.first.to_s] = ""
  elsif node.name == 'text'
    # Text seen before any <b> has no entry to attach to, so it is dropped.
    groups[groups.keys.last] << node.to_s unless groups.empty?
  end
  # Every other node type is skipped.
end
这给出:
#⇒ {
# "word 1" => "\n \"Text I need 1\"\n ",
# "word 2" => "\n \"Text I need 2\"\n ",
# "word 3" => "\n \"Text I need 3\"\n ",
# "word 4" => "\n \"Text I need 41\"\n \n \"Text I need 42\"\n ",
# "word 5" => "\n \"Text I need 51\"\n \n \"Text I need 52\"\n \n ",
# "word 6" => "\n\n "
# }
希望对您有所帮助。
我精简了您的 HTML 以减少冗余;去掉多余的文本后,它仍能演示同样的情况。
我会这样做:
require 'nokogiri'

# Shortened sample markup: <b> headwords followed by the wanted text, with
# <i> elements interleaved.
html = <<EOT
<tr class="level2">
<td>
<b>word</b>
"Text I need"
<b>word</b>
"Text I need"
<i>blabla</i>
"Text I need"
<b>word</b>
"Text I need"
<i>blabla</i>
"Text I need"
<i>blabla</i>
<b>word</b>
</td>
</tr>
EOT

doc = Nokogiri::HTML(html)

# Remove every <i> inside the <td> from the document before grouping.
doc.css('td i').remove
由于不需要 <i> 节点,直接把它们移除即可。移除之后的 doc 如下所示:
# Print the serialized document to inspect the markup after the removal.
puts doc.to_html
# >> <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
# >> <html><body>
# >> <tr class="level2">
# >> <td>
# >> <b>word</b>
# >> "Text I need"
# >> <b>word</b>
# >> "Text I need"
# >>
# >> "Text I need"
# >> <b>word</b>
# >> "Text I need"
# >>
# >> "Text I need"
# >>
# >> <b>word</b>
# >>
# >> </td>
# >> </tr>
# >> </body></html>
一旦 <i> 节点被移除,就可以遍历 <td> 的子节点并处理它们的文本:
# Take the <td> children, discard whitespace-only nodes, start a new group at
# every <b>, and reduce each node in a group to its stripped text.
cells = doc.at('td').children
non_blank = cells.reject { |node| node.text.strip == '' }
text = non_blank.slice_before { |node| node.name == 'b' }
                .map { |group| group.map { |node| node.text.strip } }
此时 text 包含:
text
# => [["word", "\"Text I need\""],
# ["word", "\"Text I need\"", "\"Text I need\""],
# ["word", "\"Text I need\"", "\"Text I need\""],
# ["word"]]
请注意,结果末尾有一个单独的 "word",这与您提供的示例 HTML 一致。如果您确定不需要保留末尾的这一项,可以直接用 pop 将它弹出。如果您认为还会有其他只含单个项的元素,可以遍历列表找出这些单项元素并将其剔除(reject)。具体如何处理,由您决定。