使用 RSelenium 在多个页面上进行 Web 抓取,并使用正则表达式筛选电子邮件
Web Scraping on multiple pages with RSelenium and select emails with regular expression
我想收集点击此网站上每个名称的电子邮件地址 https://ki.se/en/research/professors-at-ki 我创建了以下循环。由于某种原因,一些电子邮件没有被收集,而且代码很慢......
您有更好的代码创意吗?
非常感谢您
library(RSelenium)
# Use RSelenium to collect each professor's name and e-mail address from
# https://ki.se/en/research/professors-at-ki by clicking through every entry.
rD <- rsDriver(browser = "firefox", port = 4545L, verbose = FALSE)
remDr <- rD[["client"]]

# Visit the index page once to count the entries (one <strong> per professor).
remDr$navigate("https://ki.se/en/research/professors-at-ki")
n <- length(remDr$findElements(using = "xpath", "//strong"))

# Properly preallocate the result frame. The original
# data.frame(NA, nrow = ..., ncol = ...) call does NOT preallocate -- it
# builds a 1-row frame with columns named NA., nrow and ncol. It also
# referenced an undefined variable `name`.
database <- data.frame(
  name = rep(NA_character_, n),
  email = NA_character_,
  university = "Karolinska Institute",
  stringsAsFactors = FALSE
)

for (i in seq_len(n)) {
  # Re-open the index page each iteration: clicking a link navigates away
  # and invalidates the previously found element handles.
  remDr$navigate("https://ki.se/en/research/professors-at-ki")
  elems <- remDr$findElements(using = "xpath", "//strong")
  elem <- elems[[i]]
  database$name[i] <- unlist(elem$getElementText())

  # Follow the link and scan the person's page source for lines with an "@".
  elem$clickElement()
  page <- remDr$getPageSource()
  # Base strsplit replaces the original str_split call, which was used
  # without loading stringr.
  page_lines <- strsplit(as.character(page), "\n")[[1]]
  hits <- grep("@", page_lines, value = TRUE)

  # The original indexed hits[2] unconditionally, yielding NA when only one
  # match exists; guard the length explicitly.
  # NOTE(review): taking the second match assumes the first "@" line is not
  # the personal address -- TODO confirm against the live page layout.
  if (length(hits) >= 2) {
    database$email[i] <- gsub(" ", "", hits[2])
  } else if (length(hits) == 1) {
    database$email[i] <- gsub(" ", "", hits[1])
  }
}
RSelenium 通常不是最快的方法,因为它需要浏览器真正加载页面。在某些情况下 RSelenium
是唯一的选择,但在这种情况下,您可以使用 rvest
库实现同样的需求,而且速度应该更快。至于您遇到的错误:有两位教授的个人页面链接似乎无法访问,这就是错误产生的原因。
library(rvest)
library(tidyverse)

# Collect the links to the professors' personal pages from the KI index page.
index_page <- read_html("https://ki.se/en/research/professors-at-ki")
people_links <- index_page %>%
  html_nodes("a") %>%
  html_attrs() %>%
  as.character() %>%
  str_subset("https://staff.ki.se/people/")

# These two personal pages are not reachable, so they are excluded below.
broken_links <- c(
  "https://staff.ki.se/people/gungra",
  "https://staff.ki.se/people/evryla"
)

# Fetch one personal page and pull the address out of its mailto: link.
extract_mail <- function(link) {
  read_html(link) %>%
    html_nodes("a") %>%
    html_attrs() %>%
    as.character() %>%
    str_subset("mailto:") %>%
    str_remove("mailto:")
}

# One row per reachable link, with the scraped e-mail alongside it.
df <- tibble(people_links) %>%
  filter(!(people_links %in% broken_links)) %>%
  rowwise() %>%
  mutate(mail = extract_mail(people_links))
我想收集点击此网站上每个名称的电子邮件地址 https://ki.se/en/research/professors-at-ki 我创建了以下循环。由于某种原因,一些电子邮件没有被收集,而且代码很慢...... 您有更好的代码创意吗? 非常感谢您
library(RSelenium)
# Use RSelenium to collect each professor's name and e-mail address from
# https://ki.se/en/research/professors-at-ki by clicking through every entry.
rD <- rsDriver(browser = "firefox", port = 4545L, verbose = FALSE)
remDr <- rD[["client"]]

# Visit the index page once to count the entries (one <strong> per professor).
remDr$navigate("https://ki.se/en/research/professors-at-ki")
n <- length(remDr$findElements(using = "xpath", "//strong"))

# Properly preallocate the result frame. The original
# data.frame(NA, nrow = ..., ncol = ...) call does NOT preallocate -- it
# builds a 1-row frame with columns named NA., nrow and ncol. It also
# referenced an undefined variable `name`.
database <- data.frame(
  name = rep(NA_character_, n),
  email = NA_character_,
  university = "Karolinska Institute",
  stringsAsFactors = FALSE
)

for (i in seq_len(n)) {
  # Re-open the index page each iteration: clicking a link navigates away
  # and invalidates the previously found element handles.
  remDr$navigate("https://ki.se/en/research/professors-at-ki")
  elems <- remDr$findElements(using = "xpath", "//strong")
  elem <- elems[[i]]
  database$name[i] <- unlist(elem$getElementText())

  # Follow the link and scan the person's page source for lines with an "@".
  elem$clickElement()
  page <- remDr$getPageSource()
  # Base strsplit replaces the original str_split call, which was used
  # without loading stringr.
  page_lines <- strsplit(as.character(page), "\n")[[1]]
  hits <- grep("@", page_lines, value = TRUE)

  # The original indexed hits[2] unconditionally, yielding NA when only one
  # match exists; guard the length explicitly.
  # NOTE(review): taking the second match assumes the first "@" line is not
  # the personal address -- TODO confirm against the live page layout.
  if (length(hits) >= 2) {
    database$email[i] <- gsub(" ", "", hits[2])
  } else if (length(hits) == 1) {
    database$email[i] <- gsub(" ", "", hits[1])
  }
}
RSelenium 通常不是最快的方法,因为它需要浏览器真正加载页面。在某些情况下 RSelenium
是唯一的选择,但在这种情况下,您可以使用 rvest
库实现同样的需求,而且速度应该更快。至于您遇到的错误:有两位教授的个人页面链接似乎无法访问,这就是错误产生的原因。
library(rvest)
library(tidyverse)

# Collect the links to the professors' personal pages from the KI index page.
index_page <- read_html("https://ki.se/en/research/professors-at-ki")
people_links <- index_page %>%
  html_nodes("a") %>%
  html_attrs() %>%
  as.character() %>%
  str_subset("https://staff.ki.se/people/")

# These two personal pages are not reachable, so they are excluded below.
broken_links <- c(
  "https://staff.ki.se/people/gungra",
  "https://staff.ki.se/people/evryla"
)

# Fetch one personal page and pull the address out of its mailto: link.
extract_mail <- function(link) {
  read_html(link) %>%
    html_nodes("a") %>%
    html_attrs() %>%
    as.character() %>%
    str_subset("mailto:") %>%
    str_remove("mailto:")
}

# One row per reachable link, with the scraped e-mail alongside it.
df <- tibble(people_links) %>%
  filter(!(people_links %in% broken_links)) %>%
  rowwise() %>%
  mutate(mail = extract_mail(people_links))