需要通过网页抓取获取电子邮件地址和电话号码
Need to fetch the email ID and phone number via web scraping
require 'open-uri'
require 'nokogiri'
# Downloads the page at +url+, prints the text of every <a> element on its
# own line, and returns those texts.
#
# @param url [String] address of the page to download
# @return [Array<String>] text of each <a> element, in the order Nokogiri yields them
def scrape(url)
  # URI.open is used instead of Kernel#open: since Ruby 3.0, open-uri no
  # longer makes a bare `open` accept URLs. The block form also closes the
  # underlying stream once the body has been read.
  html = URI.open(url, &:read)
  link_texts = Nokogiri::HTML(html).search('a').map(&:text)
  link_texts.each { |text| puts text }
  # Return the collected texts explicitly so the result can be assigned.
  link_texts
end
scrape('http://www.infranetsol.com/')
目前我只能获取到 a 标签的内容,但我需要把电子邮件地址和电话号码导出到 Excel 文件中。
你得到的只是文本,因此你能做的就是只保留看起来像电子邮件地址或电话号码的字符串。
例如,如果你把结果保存在一个数组中:
a = scrape('http://www.infranetsol.com/')
你可以筛选出包含电子邮件地址的元素(即带有 '@' 的字符串):
a.select { |s| s.match(/.*@.*/) }
你可以筛选出包含电话号码的元素(即至少包含 5 位连续数字的字符串):
a.select{ |s| s.match(/\d{5}/) }
完整代码:
require 'open-uri'
require 'nokogiri'
# Downloads the page at +url+, prints the text of every <a> element on its
# own line, and returns those texts.
#
# @param url [String] address of the page to download
# @return [Array<String>] text of each <a> element, in the order Nokogiri yields them
def scrape(url)
  # URI.open is used instead of Kernel#open: since Ruby 3.0, open-uri no
  # longer makes a bare `open` accept URLs. The block form also closes the
  # underlying stream once the body has been read.
  html = URI.open(url, &:read)
  link_texts = Nokogiri::HTML(html).search('a').map(&:text)
  link_texts.each { |text| puts text }
  # Return the collected texts explicitly; the caller assigns the result.
  link_texts
end
a = scrape('http://www.infranetsol.com/')
# Link texts containing an '@' are kept as e-mail candidates.
email = a.select { |s| s.match?(/@/) }
# Link texts containing at least five consecutive digits are kept as
# phone-number candidates.
phone = a.select { |s| s.match?(/\d{5}/) }
# Per the original author's note: for this page `email` already holds the
# e-mail text, whereas each `phone` entry is a longer string with the
# number embedded in it.
# scan extracts every run of digits and spaces from an entry, strip drops
# the trailing space such a run can end with, and flat_map merges the
# per-entry results into one flat array (no nested arrays).
# NOTE: this pattern is tailored to the text of this particular page and
# is not a general phone-number matcher.
phone_numbers = phone.flat_map { |elt| elt.scan(/\d[\d ]*/).map(&:strip) }
require 'open-uri'
require 'nokogiri'
# Downloads the page at +url+, prints the text of every <a> element on its
# own line, and returns those texts.
#
# @param url [String] address of the page to download
# @return [Array<String>] text of each <a> element, in the order Nokogiri yields them
def scrape(url)
  # URI.open is used instead of Kernel#open: since Ruby 3.0, open-uri no
  # longer makes a bare `open` accept URLs. The block form also closes the
  # underlying stream once the body has been read.
  html = URI.open(url, &:read)
  link_texts = Nokogiri::HTML(html).search('a').map(&:text)
  link_texts.each { |text| puts text }
  # Return the collected texts explicitly so the result can be assigned.
  link_texts
end
scrape('http://www.infranetsol.com/')
目前我只能获取到 a 标签的内容,但我需要把电子邮件地址和电话号码导出到 Excel 文件中。
你得到的只是文本,因此你能做的就是只保留看起来像电子邮件地址或电话号码的字符串。
例如,如果你把结果保存在一个数组中:
a = scrape('http://www.infranetsol.com/')
你可以筛选出包含电子邮件地址的元素(即带有 '@' 的字符串):
a.select { |s| s.match(/.*@.*/) }
你可以筛选出包含电话号码的元素(即至少包含 5 位连续数字的字符串):
a.select{ |s| s.match(/\d{5}/) }
完整代码:
require 'open-uri'
require 'nokogiri'
# Downloads the page at +url+, prints the text of every <a> element on its
# own line, and returns those texts.
#
# @param url [String] address of the page to download
# @return [Array<String>] text of each <a> element, in the order Nokogiri yields them
def scrape(url)
  # URI.open is used instead of Kernel#open: since Ruby 3.0, open-uri no
  # longer makes a bare `open` accept URLs. The block form also closes the
  # underlying stream once the body has been read.
  html = URI.open(url, &:read)
  link_texts = Nokogiri::HTML(html).search('a').map(&:text)
  link_texts.each { |text| puts text }
  # Return the collected texts explicitly; the caller assigns the result.
  link_texts
end
a = scrape('http://www.infranetsol.com/')
# Link texts containing an '@' are kept as e-mail candidates.
email = a.select { |s| s.match?(/@/) }
# Link texts containing at least five consecutive digits are kept as
# phone-number candidates.
phone = a.select { |s| s.match?(/\d{5}/) }
# Per the original author's note: for this page `email` already holds the
# e-mail text, whereas each `phone` entry is a longer string with the
# number embedded in it.
# scan extracts every run of digits and spaces from an entry, strip drops
# the trailing space such a run can end with, and flat_map merges the
# per-entry results into one flat array (no nested arrays).
# NOTE: this pattern is tailored to the text of this particular page and
# is not a general phone-number matcher.
phone_numbers = phone.flat_map { |elt| elt.scan(/\d[\d ]*/).map(&:strip) }