R selenium下载不同年份数据的方法
R Selenium method for downloading data for different years
以下代码是我在同一平台上的一个问题中编写的。我必须下载 2020 年的数据,但是一旦在给定网络链接的右侧栏中单击 2020 年,URL 就不会改变。该代码在 Firefox 上打开 2020 页面,但未在系统中下载所需文件(2020 文件)。它正在下载我不需要的 2021 文件。我无法弄清楚这个问题。
我正在处理的 URL 是:https://www.rbi.org.in/scripts/AnnualPublications.aspx?head=Handbook%20of%20Statistics%20on%20Indian%20Economy
代码是:
library(tidyverse)
library(stringr)
library(purrr)
library(rvest)
library(RSelenium)
# Download every PDF linked from the 2020 listing of the RBI "Handbook of
# Statistics on Indian Economy" page.
#
# Clicking a year in the sidebar reloads the listing without changing the URL,
# so the page source must not be scraped until the new listing has arrived;
# reading it too early yields the links of the default (latest) year.

# Return the href of every link on the current page that contains ".PDF".
get_pdf_urls <- function(driver) {
  hrefs <- driver$getPageSource()[[1]] %>%
    read_html() %>%
    html_nodes("a") %>%
    html_attr("href")
  # Anchors without an href yield NA; drop them explicitly before matching.
  # The dot is escaped with a double backslash: "\." is not a valid R escape.
  str_subset(hrefs[!is.na(hrefs)], "\\.PDF")
}

rD <- rsDriver(browser = "firefox", port = 4567L, verbose = FALSE)
remDr <- rD[["client"]]
remDr$navigate("https://www.rbi.org.in/scripts/AnnualPublications.aspx?head=Handbook+of+Statistics+on+Indian+Economy")

# Links of the default listing, used below to detect that the reload finished.
urls_before <- get_pdf_urls(remDr)

elem <- remDr$findElement(using = "link text", "2020")
elem$clickElement()

# Poll until a non-empty link list that differs from the pre-click one shows
# up, giving up after 30 seconds.
urls <- urls_before
deadline <- Sys.time() + 30
while ((length(urls) == 0 || setequal(urls, urls_before)) &&
         Sys.time() < deadline) {
  Sys.sleep(0.5)
  urls <- get_pdf_urls(remDr)
}

if (length(urls) == 0 || setequal(urls, urls_before)) {
  # Downloading now would fetch the wrong year, so shut down and abort.
  remDr$close()
  rD$server$stop()
  stop("The page did not switch to the 2020 listing within 30 seconds.",
       call. = FALSE)
}

# The last path component of each URL is the file name; basename() keeps the
# names aligned one-to-one with `urls`.
filenames <- basename(urls)

# seq_along() makes the loop a no-op for an empty vector (1:0 would not be).
for (u in seq_along(urls)) {
  cat(paste("downloading: ", u, " of ", length(urls)), "\n", sep = "")
  download.file(urls[u], filenames[u], mode = "wb")
}

# Close the browser session and stop the Selenium server.
remDr$close()
rD$server$stop()
# Fallback for Windows, where the Java process may survive server$stop().
# NOTE: this kills every java.exe on the machine, not only Selenium's.
if (.Platform$OS.type == "windows") {
  system("taskkill /im java.exe /f", intern = FALSE, ignore.stdout = FALSE)
}
您可以通过获取手册的名称来检查您是否在正确的页面上,
此时页面显示的是 2021 年(2020-21 版)的数据
# Open the handbook landing page, then print the heading text of the edition
# that the page shows by default.
remDr$navigate("https://www.rbi.org.in/scripts/AnnualPublications.aspx?head=Handbook+of+Statistics+on+Indian+Economy")
html_text(
  html_nodes(
    read_html(remDr$getPageSource()[[1]]),
    xpath = '//*[@id="accordion"]/table[2]/tbody/tr[2]/td[1]/text()[1]'
  )
)
[1] "Handbook of Statistics on the Indian Economy, 2020-21 "
现在是 2020 年
# Switch the listing to 2020 and print the heading of the edition now shown.
#
# The click reloads the listing in place, so the heading is polled until it
# changes instead of being read immediately (which can return the old page).

# Extract the handbook heading text from the page currently in the browser.
handbook_heading <- function(driver) {
  driver$getPageSource()[[1]] %>%
    read_html() %>%
    html_nodes(
      xpath = '//*[@id="accordion"]/table[2]/tbody/tr[2]/td[1]/text()[1]'
    ) %>%
    html_text()
}

heading_before <- handbook_heading(remDr)

elem <- remDr$findElement(using = "link text", "2020")
elem$clickElement()

# Wait (up to 30 seconds) for a non-empty heading that differs from the one
# displayed before the click.
heading <- heading_before
deadline <- Sys.time() + 30
while ((length(heading) == 0 || identical(heading, heading_before)) &&
         Sys.time() < deadline) {
  Sys.sleep(0.5)
  heading <- handbook_heading(remDr)
}

if (length(heading) == 0 || identical(heading, heading_before)) {
  warning("The heading did not change after clicking 2020; ",
          "the page may still show the previous year.", call. = FALSE)
}

heading
[1] "Handbook of Statistics on Indian Economy 2019-20 "
编辑:-
library(tidyverse)
library(rvest)
library(RSelenium)
# Download all PDFs of the 2020 listing of the RBI handbook page.

# Parse the page currently displayed in the browser.
read_current_page <- function(driver) {
  read_html(driver$getPageSource()[[1]])
}

# Extract the handbook heading text from a parsed page.
handbook_heading <- function(page) {
  page %>%
    html_nodes(
      xpath = '//*[@id="accordion"]/table[2]/tbody/tr[2]/td[1]/text()[1]'
    ) %>%
    html_text()
}

# Launch the browser
rD <- rsDriver(browser = "firefox", port = 4567L, verbose = FALSE)
remDr <- rD[["client"]]

# Load the web page
remDr$navigate("https://www.rbi.org.in/scripts/AnnualPublications.aspx?head=Handbook+of+Statistics+on+Indian+Economy")
heading_before <- handbook_heading(read_current_page(remDr))

# Click on year 2020 and make sure the data of the right year is shown.
# The click reloads the listing without changing the URL, so poll (up to 30
# seconds) until a non-empty heading different from the default one appears.
remDr$findElement(using = "link text", "2020")$clickElement()
heading <- heading_before
deadline <- Sys.time() + 30
while ((length(heading) == 0 || identical(heading, heading_before)) &&
         Sys.time() < deadline) {
  Sys.sleep(0.5)
  heading <- handbook_heading(read_current_page(remDr))
}
if (length(heading) == 0 || identical(heading, heading_before)) {
  stop("The page did not switch to the 2020 listing within 30 seconds.",
       call. = FALSE)
}
print(heading)
# Should print the 2019-20 heading, e.g.
# [1] "Handbook of Statistics on Indian Economy 2019-20 "
# If it still prints the 2020-21 heading, the listing has not switched yet.

# Get the PDF URLs and file names, then start downloading
hrefs <- read_current_page(remDr) %>%
  html_nodes("a") %>%
  html_attr("href")
# Anchors without an href yield NA; drop them explicitly before matching.
# The dot is escaped with a double backslash: "\." is not a valid R escape.
urls <- str_subset(hrefs[!is.na(hrefs)], "\\.PDF")
# basename() keeps exactly one file name per URL, aligned with `urls`.
filenames <- basename(urls)

# seq_along() makes the loop a no-op for an empty vector (1:0 would not be).
for (u in seq_along(urls)) {
  cat(paste("downloading: ", u, " of ", length(urls)), "\n", sep = "")
  download.file(urls[u], filenames[u], mode = "wb")
}
以下代码是我在同一平台上的一个问题中编写的。我必须下载 2020 年的数据,但是一旦在给定网络链接的右侧栏中单击 2020 年,URL 就不会改变。该代码在 Firefox 上打开 2020 页面,但未在系统中下载所需文件(2020 文件)。它正在下载我不需要的 2021 文件。我无法弄清楚这个问题。 我正在处理的 URL 是:https://www.rbi.org.in/scripts/AnnualPublications.aspx?head=Handbook%20of%20Statistics%20on%20Indian%20Economy 代码是:
library(tidyverse)
library(stringr)
library(purrr)
library(rvest)
library(RSelenium)
# Download every PDF linked from the 2020 listing of the RBI "Handbook of
# Statistics on Indian Economy" page.
#
# Clicking a year in the sidebar reloads the listing without changing the URL,
# so the page source must not be scraped until the new listing has arrived;
# reading it too early yields the links of the default (latest) year.

# Return the href of every link on the current page that contains ".PDF".
get_pdf_urls <- function(driver) {
  hrefs <- driver$getPageSource()[[1]] %>%
    read_html() %>%
    html_nodes("a") %>%
    html_attr("href")
  # Anchors without an href yield NA; drop them explicitly before matching.
  # The dot is escaped with a double backslash: "\." is not a valid R escape.
  str_subset(hrefs[!is.na(hrefs)], "\\.PDF")
}

rD <- rsDriver(browser = "firefox", port = 4567L, verbose = FALSE)
remDr <- rD[["client"]]
remDr$navigate("https://www.rbi.org.in/scripts/AnnualPublications.aspx?head=Handbook+of+Statistics+on+Indian+Economy")

# Links of the default listing, used below to detect that the reload finished.
urls_before <- get_pdf_urls(remDr)

elem <- remDr$findElement(using = "link text", "2020")
elem$clickElement()

# Poll until a non-empty link list that differs from the pre-click one shows
# up, giving up after 30 seconds.
urls <- urls_before
deadline <- Sys.time() + 30
while ((length(urls) == 0 || setequal(urls, urls_before)) &&
         Sys.time() < deadline) {
  Sys.sleep(0.5)
  urls <- get_pdf_urls(remDr)
}

if (length(urls) == 0 || setequal(urls, urls_before)) {
  # Downloading now would fetch the wrong year, so shut down and abort.
  remDr$close()
  rD$server$stop()
  stop("The page did not switch to the 2020 listing within 30 seconds.",
       call. = FALSE)
}

# The last path component of each URL is the file name; basename() keeps the
# names aligned one-to-one with `urls`.
filenames <- basename(urls)

# seq_along() makes the loop a no-op for an empty vector (1:0 would not be).
for (u in seq_along(urls)) {
  cat(paste("downloading: ", u, " of ", length(urls)), "\n", sep = "")
  download.file(urls[u], filenames[u], mode = "wb")
}

# Close the browser session and stop the Selenium server.
remDr$close()
rD$server$stop()
# Fallback for Windows, where the Java process may survive server$stop().
# NOTE: this kills every java.exe on the machine, not only Selenium's.
if (.Platform$OS.type == "windows") {
  system("taskkill /im java.exe /f", intern = FALSE, ignore.stdout = FALSE)
}
您可以通过获取手册的名称来检查您是否在正确的页面上,
此时页面显示的是 2021 年(2020-21 版)的数据
# Open the handbook landing page, then print the heading text of the edition
# that the page shows by default.
remDr$navigate("https://www.rbi.org.in/scripts/AnnualPublications.aspx?head=Handbook+of+Statistics+on+Indian+Economy")
html_text(
  html_nodes(
    read_html(remDr$getPageSource()[[1]]),
    xpath = '//*[@id="accordion"]/table[2]/tbody/tr[2]/td[1]/text()[1]'
  )
)
[1] "Handbook of Statistics on the Indian Economy, 2020-21 "
现在是 2020 年
# Switch the listing to 2020 and print the heading of the edition now shown.
#
# The click reloads the listing in place, so the heading is polled until it
# changes instead of being read immediately (which can return the old page).

# Extract the handbook heading text from the page currently in the browser.
handbook_heading <- function(driver) {
  driver$getPageSource()[[1]] %>%
    read_html() %>%
    html_nodes(
      xpath = '//*[@id="accordion"]/table[2]/tbody/tr[2]/td[1]/text()[1]'
    ) %>%
    html_text()
}

heading_before <- handbook_heading(remDr)

elem <- remDr$findElement(using = "link text", "2020")
elem$clickElement()

# Wait (up to 30 seconds) for a non-empty heading that differs from the one
# displayed before the click.
heading <- heading_before
deadline <- Sys.time() + 30
while ((length(heading) == 0 || identical(heading, heading_before)) &&
         Sys.time() < deadline) {
  Sys.sleep(0.5)
  heading <- handbook_heading(remDr)
}

if (length(heading) == 0 || identical(heading, heading_before)) {
  warning("The heading did not change after clicking 2020; ",
          "the page may still show the previous year.", call. = FALSE)
}

heading
[1] "Handbook of Statistics on Indian Economy 2019-20 "
编辑:-
library(tidyverse)
library(rvest)
library(RSelenium)
# Download all PDFs of the 2020 listing of the RBI handbook page.

# Parse the page currently displayed in the browser.
read_current_page <- function(driver) {
  read_html(driver$getPageSource()[[1]])
}

# Extract the handbook heading text from a parsed page.
handbook_heading <- function(page) {
  page %>%
    html_nodes(
      xpath = '//*[@id="accordion"]/table[2]/tbody/tr[2]/td[1]/text()[1]'
    ) %>%
    html_text()
}

# Launch the browser
rD <- rsDriver(browser = "firefox", port = 4567L, verbose = FALSE)
remDr <- rD[["client"]]

# Load the web page
remDr$navigate("https://www.rbi.org.in/scripts/AnnualPublications.aspx?head=Handbook+of+Statistics+on+Indian+Economy")
heading_before <- handbook_heading(read_current_page(remDr))

# Click on year 2020 and make sure the data of the right year is shown.
# The click reloads the listing without changing the URL, so poll (up to 30
# seconds) until a non-empty heading different from the default one appears.
remDr$findElement(using = "link text", "2020")$clickElement()
heading <- heading_before
deadline <- Sys.time() + 30
while ((length(heading) == 0 || identical(heading, heading_before)) &&
         Sys.time() < deadline) {
  Sys.sleep(0.5)
  heading <- handbook_heading(read_current_page(remDr))
}
if (length(heading) == 0 || identical(heading, heading_before)) {
  stop("The page did not switch to the 2020 listing within 30 seconds.",
       call. = FALSE)
}
print(heading)
# Should print the 2019-20 heading, e.g.
# [1] "Handbook of Statistics on Indian Economy 2019-20 "
# If it still prints the 2020-21 heading, the listing has not switched yet.

# Get the PDF URLs and file names, then start downloading
hrefs <- read_current_page(remDr) %>%
  html_nodes("a") %>%
  html_attr("href")
# Anchors without an href yield NA; drop them explicitly before matching.
# The dot is escaped with a double backslash: "\." is not a valid R escape.
urls <- str_subset(hrefs[!is.na(hrefs)], "\\.PDF")
# basename() keeps exactly one file name per URL, aligned with `urls`.
filenames <- basename(urls)

# seq_along() makes the loop a no-op for an empty vector (1:0 would not be).
for (u in seq_along(urls)) {
  cat(paste("downloading: ", u, " of ", length(urls)), "\n", sep = "")
  download.file(urls[u], filenames[u], mode = "wb")
}