使用 Kimurai gem 进行网页抓取
Web scraping with Kimurai gem
我正在使用 Kimurai Ruby gem 进行一些网络抓取。我有这个脚本,效果很好:
require 'kimurai'
# Spider that walks the TaxJar job list on Workable and prints every job's
# title, link and description to stdout.
class SimpleSpider < Kimurai::Base
  @name = "simple_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://apply.workable.com/taxjar/"]

  # Callback for the start URL: prints each job and a running counter.
  #
  # @param response the response handed in by the framework (not used; the
  #   page is re-read from the browser instead)
  # @param url [String] not used in this method
  # @param data [Hash] not used in this method
  def parse(response, url:, data: {})
    # Optional interaction, left disabled:
    # browser.click_button "Show more"

    # Read the page as the browser currently sees it.
    listing = browser.current_response
    job_items = listing.css('.careers-jobs-list-styles__jobsList--3_v12').css('li')

    count = 0
    job_items.each do |item|
      anchor = item.css('a').first
      title = anchor['aria-label']
      link = "https://apply.workable.com" + anchor['href']

      # Open the job page itself to read its description.
      browser.visit(link)
      description = browser.current_response
                           .xpath('/html/body/div[1]/div/div[1]/div[2]/div[2]/div[2]')
                           .text

      puts '*******', title, link, description
      count += 1
      puts count
    end

    puts "There are #{count} jobs total"
  end
end
# Start the spider.
# NOTE(review): per the question text that follows, the value returned by
# #parse is not handed back by this call — confirm against the Kimurai docs.
SimpleSpider.crawl!
但是,我希望这一切最终能返回一组对象……在本例中是职位(jobs)。我想在 parse 方法中创建一个 jobs 数组,并在 returned_jobs
循环中执行类似 jobs << [title, link, description, company]
的操作,然后在调用 SimpleSpider.crawl!
时得到返回的这个数组,但那不起作用。
感谢任何帮助。
您可以像这样稍微修改您的代码:
# Variant of the spider that collects every job into an array instead of
# printing the fields one at a time.
class SimpleSpider < Kimurai::Base
  @name = "simple_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://apply.workable.com/taxjar/"]

  # Scrapes the job list and each job's detail page.
  #
  # @param response the response handed in by the framework (not used; the
  #   page is re-read from the browser instead)
  # @param url [String] not used in this method
  # @param data [Hash] not used in this method
  # @return [Array<Array>] one [title, link, description] entry per job
  def parse(response, url:, data: {})
    # Optional interaction, left disabled:
    # browser.click_button "Show more"

    # Read the page as the browser currently sees it.
    doc = browser.current_response
    returned_jobs = doc.css('.careers-jobs-list-styles__jobsList--3_v12')

    jobs = []
    returned_jobs.css('li').each do |char_element|
      anchor = char_element.css('a')[0]
      title = anchor['aria-label']
      link = "https://apply.workable.com" + anchor['href']

      # Open the job page itself to read its description.
      browser.visit(link)
      job_page = browser.current_response
      description = job_page.xpath('/html/body/div[1]/div/div[1]/div[2]/div[2]/div[2]').text

      jobs << [title, link, description]
    end

    puts "There are #{jobs.count} jobs total"
    puts jobs

    # Return the array itself; ending on `puts` would make the method return nil.
    jobs
  end
end
我不确定 company 是怎么回事,因为我在您的代码中没有看到这个变量。不过,从上面的代码您可以看出创建数组并对其进行处理的思路。
这是在终端中运行时输出的一部分:
我还写了一篇博客文章(见此处),介绍如何在 Ruby on Rails 应用程序中使用 Kimurai 框架。
原来有一个 parse! 方法允许返回一个值。这是可运行的示例:
require 'open-uri'
require 'nokogiri'
require 'kimurai'
# Spider for the TaxJar job list on Workable whose #parse returns the
# scraped jobs as an array.
class TaxJar < Kimurai::Base
  @name = "tax_jar"
  @engine = :selenium_chrome
  @start_urls = ["https://apply.workable.com/taxjar/"]

  # Scrapes the job list and each job's detail page, logging every job.
  #
  # @param response the response handed in by the framework (not used; the
  #   page is re-read from the browser instead)
  # @param url [String] not used in this method
  # @param data [Hash] not used in this method
  # @return [Array<Array>] one [title, link, description, company] entry per job
  def parse(response, url:, data: {})
    listing = browser.current_response
    job_items = listing.css('.careers-jobs-list-styles__jobsList--3_v12').css('li')

    job_items.map do |item|
      anchor = item.css('a').first
      title = anchor['aria-label']
      link = "https://apply.workable.com" + anchor['href']

      # Open the job page itself to read its description.
      browser.visit(link)
      description = browser.current_response
                           .xpath('/html/body/div[1]/div/div[1]/div[2]/div[2]/div[2]')
                           .text

      puts "title is: #{title}, link is: #{link}, \n description is: #{description}"
      [title, link, description, 'TaxJar']
    end
  end
end
# Run the :parse callback against the listing URL, keep what it returns,
# and print the array in its inspected form.
scraped_jobs = TaxJar.parse!(:parse, url: "https://apply.workable.com/taxjar/")
p scraped_jobs
如果你正在抓取 JS 网站,这个 gem 与我试过的其他工具 (watir/selenium) 相比似乎相当强大。
我正在使用 Kimurai Ruby gem 进行一些网络抓取。我有这个脚本,效果很好:
require 'kimurai'
# Spider that walks the TaxJar job list on Workable and prints every job's
# title, link and description to stdout.
class SimpleSpider < Kimurai::Base
  @name = "simple_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://apply.workable.com/taxjar/"]

  # Callback for the start URL: prints each job and a running counter.
  #
  # @param response the response handed in by the framework (not used; the
  #   page is re-read from the browser instead)
  # @param url [String] not used in this method
  # @param data [Hash] not used in this method
  def parse(response, url:, data: {})
    # Optional interaction, left disabled:
    # browser.click_button "Show more"

    # Read the page as the browser currently sees it.
    listing = browser.current_response
    job_items = listing.css('.careers-jobs-list-styles__jobsList--3_v12').css('li')

    count = 0
    job_items.each do |item|
      anchor = item.css('a').first
      title = anchor['aria-label']
      link = "https://apply.workable.com" + anchor['href']

      # Open the job page itself to read its description.
      browser.visit(link)
      description = browser.current_response
                           .xpath('/html/body/div[1]/div/div[1]/div[2]/div[2]/div[2]')
                           .text

      puts '*******', title, link, description
      count += 1
      puts count
    end

    puts "There are #{count} jobs total"
  end
end
# Start the spider.
# NOTE(review): per the question text that follows, the value returned by
# #parse is not handed back by this call — confirm against the Kimurai docs.
SimpleSpider.crawl!
但是,我希望这一切最终能返回一组对象……在本例中是职位(jobs)。我想在 parse 方法中创建一个 jobs 数组,并在 returned_jobs
循环中执行类似 jobs << [title, link, description, company]
的操作,然后在调用 SimpleSpider.crawl!
时得到返回的这个数组,但那不起作用。
感谢任何帮助。
您可以像这样稍微修改您的代码:
# Variant of the spider that collects every job into an array instead of
# printing the fields one at a time.
class SimpleSpider < Kimurai::Base
  @name = "simple_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://apply.workable.com/taxjar/"]

  # Scrapes the job list and each job's detail page.
  #
  # @param response the response handed in by the framework (not used; the
  #   page is re-read from the browser instead)
  # @param url [String] not used in this method
  # @param data [Hash] not used in this method
  # @return [Array<Array>] one [title, link, description] entry per job
  def parse(response, url:, data: {})
    # Optional interaction, left disabled:
    # browser.click_button "Show more"

    # Read the page as the browser currently sees it.
    doc = browser.current_response
    returned_jobs = doc.css('.careers-jobs-list-styles__jobsList--3_v12')

    jobs = []
    returned_jobs.css('li').each do |char_element|
      anchor = char_element.css('a')[0]
      title = anchor['aria-label']
      link = "https://apply.workable.com" + anchor['href']

      # Open the job page itself to read its description.
      browser.visit(link)
      job_page = browser.current_response
      description = job_page.xpath('/html/body/div[1]/div/div[1]/div[2]/div[2]/div[2]').text

      jobs << [title, link, description]
    end

    puts "There are #{jobs.count} jobs total"
    puts jobs

    # Return the array itself; ending on `puts` would make the method return nil.
    jobs
  end
end
我不确定 company 是怎么回事,因为我在您的代码中没有看到这个变量。不过,从上面的代码您可以看出创建数组并对其进行处理的思路。
这是在终端中运行时输出的一部分:
我还写了一篇博客文章(见此处),介绍如何在 Ruby on Rails 应用程序中使用 Kimurai 框架。
原来有一个 parse! 方法允许返回一个值。这是可运行的示例:
require 'open-uri'
require 'nokogiri'
require 'kimurai'
# Spider for the TaxJar job list on Workable whose #parse returns the
# scraped jobs as an array.
class TaxJar < Kimurai::Base
  @name = "tax_jar"
  @engine = :selenium_chrome
  @start_urls = ["https://apply.workable.com/taxjar/"]

  # Scrapes the job list and each job's detail page, logging every job.
  #
  # @param response the response handed in by the framework (not used; the
  #   page is re-read from the browser instead)
  # @param url [String] not used in this method
  # @param data [Hash] not used in this method
  # @return [Array<Array>] one [title, link, description, company] entry per job
  def parse(response, url:, data: {})
    listing = browser.current_response
    job_items = listing.css('.careers-jobs-list-styles__jobsList--3_v12').css('li')

    job_items.map do |item|
      anchor = item.css('a').first
      title = anchor['aria-label']
      link = "https://apply.workable.com" + anchor['href']

      # Open the job page itself to read its description.
      browser.visit(link)
      description = browser.current_response
                           .xpath('/html/body/div[1]/div/div[1]/div[2]/div[2]/div[2]')
                           .text

      puts "title is: #{title}, link is: #{link}, \n description is: #{description}"
      [title, link, description, 'TaxJar']
    end
  end
end
# Run the :parse callback against the listing URL, keep what it returns,
# and print the array in its inspected form.
scraped_jobs = TaxJar.parse!(:parse, url: "https://apply.workable.com/taxjar/")
p scraped_jobs
如果你正在抓取 JS 网站,这个 gem 与我试过的其他工具 (watir/selenium) 相比似乎相当强大。