RSelenium 抓取带有空格的动态 Table
RSelenium Scraping a Dynamic Table with Empty Spaces
我正在尝试抓取此网站 table 中的数据:https://www.iea.org/data-and-statistics/data-tables?country=WORLD
我正在使用 RSelenium,我正在获取我正在寻找的信息。问题是网站上的 table 有空元素,这些元素在我最终得到的文本输出中没有指明。结果,我无法在 R.
中复制原始 table
你能想出我可以抓取 table 并将其复制到 R 中的任何方法吗?
感谢您的支持。原代码如下。
library(RSelenium)
library(tidyverse)
driver <- RSelenium::rsDriver(browser = "chrome",
chromever =
system2(command = "wmic",
args = 'datafile where name="C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe" get Version /value',
stdout = TRUE,
stderr = TRUE) %>%
stringr::str_extract(pattern = "(?<=Version=)\d+\.\d+\.\d+\.") %>%
magrittr::extract(!is.na(.)) %>%
stringr::str_replace_all(pattern = "\.",
replacement = "\\.") %>%
paste0("^", .) %>%
stringr::str_subset(string =
binman::list_versions(appname = "chromedriver") %>%
dplyr::last()) %>%
as.numeric_version() %>%
max() %>%
as.character())
remote_driver <- driver[["client"]]
remote_driver$navigate("https://www.iea.org/data-and-statistics/data-tables?country=WORLD")
out <- remote_driver$findElement(using = "class", value="m-data-table")
data <- out$getElementText()
data <- gsub ("\n", ";", data)
data <- strsplit(data,";")
data <- gsub ("ktoe","Ktoe", data[[1]])
data <- gsub (pattern="\s+([a-z])", replacement="\_\U\1", perl=TRUE, data)
data <- strsplit(data," ")
data
您可能想要抓取内部 HTML,而不是内部文本:
dtab <- out$getElementAttribute("innerHTML")
result <- dtab[[1]] %>%
# convert from html table to data frame
xml2::read_html() %>%
rvest::html_table() %>%
as.data.frame() %>%
# remove the "ktoe" row
filter(row_number() != 1) %>%
# convert to ASCII encoding
mutate(across(everything(),
~iconv(.x, "utf-8", "ASCII", sub = ""))) %>%
# convert all except first column to integers
mutate(across(-one_of("Var.1"),
as.integer))
结果:
> tibble::glimpse(result)
Rows: 28
Columns: 12
$ Var.1 <chr> "Production", "Imports", "Exports", "International mari...
$ Coal <int> 3893679, 829289, -879689, NA, NA, -4954, 3838326, -1156...
$ Crude.oil <int> 4552548, 2479487, -2440069, NA, NA, -6450, 4585516, -23...
$ Oil.products <int> NA, 1396839, -1488672, NA, NA, 3316, -88517, 261068, 95...
$ Natural.gas <int> 3293124, 985018, -1019823, NA, NA, 3276, 3261595, NA, -...
$ Nuclear <int> 706814, NA, NA, NA, NA, NA, 706814, NA, NA, -703393, -3...
$ Hydro <int> 362332, NA, NA, NA, NA, NA, 362332, NA, NA, -362332, NA...
$ Wind..solar..etc. <int> 286377, NA, NA, NA, NA, NA, 286377, NA, -42, -233459, -...
$ Biofuels.and.waste <int> 1324214, 29259, -25250, NA, NA, -1096, 1327127, NA, -11...
$ Electricity <int> NA, 62626, -62373, NA, NA, NA, 253, NA, -5139, 1931734,...
$ Heat <int> 2062, 6, -2, NA, NA, NA, 2066, NA, 3966, -894, 255095, ...
$ Total <int> 14421151, 5782524, -5915877, NA, NA, -5909, 14281889, 2...
我正在尝试抓取此网站 table 中的数据:https://www.iea.org/data-and-statistics/data-tables?country=WORLD
我正在使用 RSelenium,我正在获取我正在寻找的信息。问题是网站上的 table 有空元素,这些元素在我最终得到的文本输出中没有指明。结果,我无法在 R.
中复制原始 table你能想出我可以抓取 table 并将其复制到 R 中的任何方法吗?
感谢您的支持。原代码如下。
library(RSelenium)
library(tidyverse)
driver <- RSelenium::rsDriver(browser = "chrome",
chromever =
system2(command = "wmic",
args = 'datafile where name="C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe" get Version /value',
stdout = TRUE,
stderr = TRUE) %>%
stringr::str_extract(pattern = "(?<=Version=)\d+\.\d+\.\d+\.") %>%
magrittr::extract(!is.na(.)) %>%
stringr::str_replace_all(pattern = "\.",
replacement = "\\.") %>%
paste0("^", .) %>%
stringr::str_subset(string =
binman::list_versions(appname = "chromedriver") %>%
dplyr::last()) %>%
as.numeric_version() %>%
max() %>%
as.character())
remote_driver <- driver[["client"]]
remote_driver$navigate("https://www.iea.org/data-and-statistics/data-tables?country=WORLD")
out <- remote_driver$findElement(using = "class", value="m-data-table")
data <- out$getElementText()
data <- gsub ("\n", ";", data)
data <- strsplit(data,";")
data <- gsub ("ktoe","Ktoe", data[[1]])
data <- gsub (pattern="\s+([a-z])", replacement="\_\U\1", perl=TRUE, data)
data <- strsplit(data," ")
data
您可能想要抓取内部 HTML,而不是内部文本:
dtab <- out$getElementAttribute("innerHTML")
result <- dtab[[1]] %>%
# convert from html table to data frame
xml2::read_html() %>%
rvest::html_table() %>%
as.data.frame() %>%
# remove the "ktoe" row
filter(row_number() != 1) %>%
# convert to ASCII encoding
mutate(across(everything(),
~iconv(.x, "utf-8", "ASCII", sub = ""))) %>%
# convert all except first column to integers
mutate(across(-one_of("Var.1"),
as.integer))
结果:
> tibble::glimpse(result)
Rows: 28
Columns: 12
$ Var.1 <chr> "Production", "Imports", "Exports", "International mari...
$ Coal <int> 3893679, 829289, -879689, NA, NA, -4954, 3838326, -1156...
$ Crude.oil <int> 4552548, 2479487, -2440069, NA, NA, -6450, 4585516, -23...
$ Oil.products <int> NA, 1396839, -1488672, NA, NA, 3316, -88517, 261068, 95...
$ Natural.gas <int> 3293124, 985018, -1019823, NA, NA, 3276, 3261595, NA, -...
$ Nuclear <int> 706814, NA, NA, NA, NA, NA, 706814, NA, NA, -703393, -3...
$ Hydro <int> 362332, NA, NA, NA, NA, NA, 362332, NA, NA, -362332, NA...
$ Wind..solar..etc. <int> 286377, NA, NA, NA, NA, NA, 286377, NA, -42, -233459, -...
$ Biofuels.and.waste <int> 1324214, 29259, -25250, NA, NA, -1096, 1327127, NA, -11...
$ Electricity <int> NA, 62626, -62373, NA, NA, NA, 253, NA, -5139, 1931734,...
$ Heat <int> 2062, 6, -2, NA, NA, NA, 2066, NA, 3966, -894, 255095, ...
$ Total <int> 14421151, 5782524, -5915877, NA, NA, -5909, 14281889, 2...