Download HTML through a password portal
I would like to download HTML pages from www.geocaching.com in order to scrape some information. However, each page I want to download has two versions depending on whether the user is logged in, and the information I want to scrape only appears when the user is logged in.
In the past I used download.file() with mapply() to download the HTML files from a list of URLs (geocache_link_list), taking the file names from a second list (geocache_name_list), like this:
mapply(function(x,y) download.file(x,y), geocache_link_list, geocache_name_list)
But this downloads the logged-out version of each page.
I also tried RCurl, but it fetched the logged-out page as well, so I never got as far as wiring it into the mapply call:
library(RCurl)
baseurl <- geocache_link_list[1]
un  <- readline("Type the username:")
pw  <- readline("Type the password:")
upw <- paste(un, pw, sep = ":")  # "user:password", intended for RCurl's userpwd option; never completed
Is there a way to run a browser from within R, with something like RSelenium or RCurl, enter the login details, and then be redirected to the desired pages and download them?
It's simple!
library(RCurl)
library(xml2)
library(magrittr)  # for %>%

# Return a form's <input> elements as a named list: names are the input
# names, values the input values (e.g. a hidden anti-forgery token)
html_inputs <- function(p, xpath = "//form/input") {
  xml_find_all(p, xpath) %>%
    {setNames(as.list(xml_attr(., "value")), xml_attr(., "name"))}
}
get_header <- function(){
  ## RCurl setup: with a cookie jar attached to the handle, the session
  ## cookies set at login are stored and sent back automatically
  myHttpheader <- c(
    "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71",
    # "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language" = "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    # "Accept-Encoding" = "gzip, deflate",
    "Connection" = "keep-alive",
    DNT = 1,
    "Upgrade-Insecure-Requests" = 1,
    "Host" = "www.geocaching.com")
  file_cookie <- "cookies.txt"
  ch <- getCurlHandle(# cainfo = "pem/cacert.pem",
                      # ssl.verifyhost = FALSE, ssl.verifypeer = FALSE,
                      followlocation = TRUE,
                      verbose = TRUE,
                      cookiejar = file_cookie, cookiefile = file_cookie,
                      httpheader = myHttpheader)  # handle fully equipped; ready to go
  tmp <- curlSetOpt(curl = ch)
  return(ch)
}
ch <- get_header()
h  <- basicHeaderGatherer()  # optional: gathers response headers (unused below)

# input your username and password here
user <- "kongdd"
pwd  <- "****"

p     <- getURL("https://www.geocaching.com/account/login", curl = ch)
token <- html_inputs(read_html(p))[1]  # hidden verification token from the login form

params <- list(Username = user,
               Password = pwd) %>% c(., token)
p2 <- postForm("https://www.geocaching.com/account/login", curl = ch,
               .params = params)
grep("kongdd", p2)  # returns 1 if the username appears in the response, i.e. login succeeded
Once the login succeeds, you can access the data with the same handle by passing it via the curl parameter.
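For example, to reproduce the mapply() workflow from the question with the logged-in handle (a sketch: geocache_link_list and geocache_name_list are the lists defined in the question):
save_page <- function(url, file) {
  html <- getURL(url, curl = ch)  # the handle carries the login cookies
  writeLines(html, file)
}
mapply(save_page, geocache_link_list, geocache_name_list)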
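For completeness, the browser route the question asks about would look roughly like this with RSelenium (a sketch only: it assumes a working Selenium driver started via rsDriver(), and the form field names Username/Password are assumptions to check against the live login page):
library(RSelenium)
drv   <- rsDriver(browser = "firefox")
remDr <- drv$client
remDr$navigate("https://www.geocaching.com/account/login")
remDr$findElement("name", "Username")$sendKeysToElement(list(user))
remDr$findElement("name", "Password")$sendKeysToElement(list(pwd))
remDr$findElement("css", "button[type='submit']")$clickElement()
# once logged in, save each page's source much as before
remDr$navigate(geocache_link_list[[1]])
writeLines(remDr$getPageSource()[[1]], geocache_name_list[[1]])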