如何使用 Capybara 获取动态页面内容
How to get dynamic page content with capybara
我正在尝试从下面这个元素中获取文本:
<div class="points-text" data-reactid=".2765swfgy68.1.2.2.1:$sony-xperia-z1-compact.2.0.4.0.0.1">29,367 points</div>
我猜该网站使用了 React.js,即使使用 Poltergeist 驱动程序,Capybara 也无法获取到内容。有什么解决方法吗?
这是我的代码:
require 'rubygems'
require 'capybara'
require 'capybara/poltergeist'
# Register a driver named :poltergeist. js_errors: false is passed through as
# a driver option (per the Poltergeist README it keeps JavaScript errors on
# the page from being re-raised in Ruby).
Capybara.register_driver(:poltergeist) do |app|
  Capybara::Poltergeist::Driver.new(app, js_errors: false)
end

# Make the driver registered above the default one.
Capybara.default_driver = :poltergeist
# Small scraper built on the Capybara DSL (visit, page, ...).
class WebScraper
  include Capybara::DSL

  # Visits +url+, parses the current page HTML with Nokogiri and prints the
  # nodes matching the CSS selector '.points-text' using Kernel#p.
  #
  # The HTML is read once, immediately after +visit+; nothing here waits for
  # content to appear.
  # NOTE(review): Nokogiri is not required explicitly in this snippet;
  # presumably it is loaded as a Capybara dependency -- confirm.
  #
  # @param url [String] address of the page to load
  # @return the node set that was printed (Kernel#p returns its argument)
  def get_page_data(url)
    visit url
    parsed = Nokogiri::HTML(page.html)
    matches = parsed.css('.points-text')
    p matches
  end
end
# Load the product page and print whatever get_page_data returns.
target_url = 'http://versus.com/en/sony-xperia-z1-compact'
scraper = WebScraper.new
puts scraper.get_page_data(target_url)
如果您已经在使用 Capybara 访问,则无需使用 Nokogiri 解析 html。
# Visits +url+ and prints the text of the element matching the CSS selector
# '.points-text', located through Capybara's own finder instead of a
# separate Nokogiri parse.
#
# @param url [String] address of the page to load
# @return [String] the text that was printed (Kernel#p returns its argument)
def get_page_data(url)
  visit url
  points_text = find(:css, '.points-text').text
  p points_text
end
这将打印带有 class points-text 的元素中的可见文本。
此代码适用于 selenium 和 webkit,但不适用于 poltergeist 驱动程序。
从屏幕截图我可以看到网站上的 javascript 甚至没有执行。
require 'capybara'
require 'capybara-webkit'
require 'capybara/poltergeist'
# Register a driver named :poltergeist. js_errors: false is passed through as
# a driver option (per the Poltergeist README it keeps JavaScript errors on
# the page from being re-raised in Ruby).
Capybara.register_driver(:poltergeist) do |app|
  Capybara::Poltergeist::Driver.new(app, js_errors: false)
end

# capybara-webkit setting, relevant only when the :webkit driver is selected.
Capybara::Webkit.configure { |config| config.allow_unknown_urls }

# Driver selection: exactly one of these lines should be active.
Capybara.default_driver = :poltergeist
# Capybara.default_driver = :selenium
# Capybara.default_driver = :webkit
# Scraper that re-reads the page repeatedly until the text of the first
# '.points-text' element stops changing.
class WebScraper
  include Capybara::DSL

  # Visits +url+ and then re-parses the page HTML up to 20 times.
  #
  # On each pass the text of the first '.points-text' node (if any) is
  # compared with the text seen on the previous pass. Once it has matched the
  # previous value three times in a row, the next pass stops the loop. Every
  # pass that does not stop the loop saves a screenshot named "<index>.png".
  #
  # NOTE(review): Nokogiri is not required explicitly in this snippet;
  # presumably it is loaded as a Capybara dependency -- confirm.
  #
  # @param url [String] address of the page to load
  # @return [String, nil] the last text read, or nil if the element was
  #   never found
  def get_page_data(url)
    visit url

    required_repeats = 3
    repeats = 0
    last_text = nil

    (0...20).each do |attempt|
      # The page is read and parsed before the stop check.
      nodes = Nokogiri::HTML(page.html).css('.points-text')
      break if repeats == required_repeats

      unless nodes.empty?
        current_text = nodes.first.text
        repeats = last_text == current_text ? repeats + 1 : 0
        last_text = current_text
      end

      page.save_screenshot("#{attempt}.png")
    end

    last_text
  end
end
# Load the product page and print whatever get_page_data returns.
target_url = 'http://versus.com/en/sony-xperia-z1-compact'
scraper = WebScraper.new
puts scraper.get_page_data(target_url)
我正在尝试从下面这个元素中获取文本:
<div class="points-text" data-reactid=".2765swfgy68.1.2.2.1:$sony-xperia-z1-compact.2.0.4.0.0.1">29,367 points</div>
我猜该网站使用了 React.js,即使使用 Poltergeist 驱动程序,Capybara 也无法获取到内容。有什么解决方法吗?
这是我的代码:
require 'rubygems'
require 'capybara'
require 'capybara/poltergeist'
# Register a driver named :poltergeist. js_errors: false is passed through as
# a driver option (per the Poltergeist README it keeps JavaScript errors on
# the page from being re-raised in Ruby).
Capybara.register_driver(:poltergeist) do |app|
  Capybara::Poltergeist::Driver.new(app, js_errors: false)
end

# Make the driver registered above the default one.
Capybara.default_driver = :poltergeist
# Small scraper built on the Capybara DSL (visit, page, ...).
class WebScraper
  include Capybara::DSL

  # Visits +url+, parses the current page HTML with Nokogiri and prints the
  # nodes matching the CSS selector '.points-text' using Kernel#p.
  #
  # The HTML is read once, immediately after +visit+; nothing here waits for
  # content to appear.
  # NOTE(review): Nokogiri is not required explicitly in this snippet;
  # presumably it is loaded as a Capybara dependency -- confirm.
  #
  # @param url [String] address of the page to load
  # @return the node set that was printed (Kernel#p returns its argument)
  def get_page_data(url)
    visit url
    parsed = Nokogiri::HTML(page.html)
    matches = parsed.css('.points-text')
    p matches
  end
end
# Load the product page and print whatever get_page_data returns.
target_url = 'http://versus.com/en/sony-xperia-z1-compact'
scraper = WebScraper.new
puts scraper.get_page_data(target_url)
如果您已经在使用 Capybara 访问,则无需使用 Nokogiri 解析 html。
# Visits +url+ and prints the text of the element matching the CSS selector
# '.points-text', located through Capybara's own finder instead of a
# separate Nokogiri parse.
#
# @param url [String] address of the page to load
# @return [String] the text that was printed (Kernel#p returns its argument)
def get_page_data(url)
  visit url
  points_text = find(:css, '.points-text').text
  p points_text
end
这将打印带有 class points-text 的元素中的可见文本。
此代码适用于 selenium 和 webkit,但不适用于 poltergeist 驱动程序。从屏幕截图可以看出,网站上的 JavaScript 甚至没有执行。
require 'capybara'
require 'capybara-webkit'
require 'capybara/poltergeist'
# Register a driver named :poltergeist. js_errors: false is passed through as
# a driver option (per the Poltergeist README it keeps JavaScript errors on
# the page from being re-raised in Ruby).
Capybara.register_driver(:poltergeist) do |app|
  Capybara::Poltergeist::Driver.new(app, js_errors: false)
end

# capybara-webkit setting, relevant only when the :webkit driver is selected.
Capybara::Webkit.configure { |config| config.allow_unknown_urls }

# Driver selection: exactly one of these lines should be active.
Capybara.default_driver = :poltergeist
# Capybara.default_driver = :selenium
# Capybara.default_driver = :webkit
# Scraper that re-reads the page repeatedly until the text of the first
# '.points-text' element stops changing.
class WebScraper
  include Capybara::DSL

  # Visits +url+ and then re-parses the page HTML up to 20 times.
  #
  # On each pass the text of the first '.points-text' node (if any) is
  # compared with the text seen on the previous pass. Once it has matched the
  # previous value three times in a row, the next pass stops the loop. Every
  # pass that does not stop the loop saves a screenshot named "<index>.png".
  #
  # NOTE(review): Nokogiri is not required explicitly in this snippet;
  # presumably it is loaded as a Capybara dependency -- confirm.
  #
  # @param url [String] address of the page to load
  # @return [String, nil] the last text read, or nil if the element was
  #   never found
  def get_page_data(url)
    visit url

    required_repeats = 3
    repeats = 0
    last_text = nil

    (0...20).each do |attempt|
      # The page is read and parsed before the stop check.
      nodes = Nokogiri::HTML(page.html).css('.points-text')
      break if repeats == required_repeats

      unless nodes.empty?
        current_text = nodes.first.text
        repeats = last_text == current_text ? repeats + 1 : 0
        last_text = current_text
      end

      page.save_screenshot("#{attempt}.png")
    end

    last_text
  end
end
# Load the product page and print whatever get_page_data returns.
target_url = 'http://versus.com/en/sony-xperia-z1-compact'
scraper = WebScraper.new
puts scraper.get_page_data(target_url)