使用 R 进行网络抓取 - 我想从网站中提取一些 table 之类的数据
Web-Scraping using R - I want to extract some table like data from a website
我在从网站抓取数据时遇到了一些问题。我在网络抓取方面没有太多经验。我的计划是使用 R 从以下网站抓取一些数据:https://www.fatf-gafi.org/countries/
更准确地说,我想提取受到某种制裁的国家/地区列表
library(XML)
# Address of the FATF countries overview page.
url <- "https://www.fatf-gafi.org/countries/"

# Read the page line by line, then hand the lines to the XML package's
# HTML parser.
source <- readLines(con = url, encoding = "UTF-8")
parsed_doc <- htmlParse(file = source, encoding = "UTF-8")
但这不会显示预期的信息,因为它不在 table 下,而是嵌套的 div。
这是一项棘手的解析工作。您需要的信息并不在通过 readLines 获得的 html 中,而是由页面通过 XHR 请求动态加载的。通常,这类 XHR 请求会返回一个 json 字符串,但在您的情况下,它返回的是 javascript,其中的信息存储在一个变量里,该变量包含一个由 json 片段组成的数组,每个国家对应一个片段。可以通过一些字符串操作和 json 解析来得到最终结果:
library(httr)
library(rvest)
# Download the JavaScript file that holds the country data (the page HTML
# itself does not contain it).
url <- paste0("https://www.fatf-gafi.org/media/fatf/fatfv20/",
              "js/country-data-multi-lang.js")
js <- content(GET(url), "text")

# Keep only the text that follows the `var countries = ` assignment.
vars <- strsplit(js, "var countries = ")[[1]][2]

# Split the array literal at every `},{` boundary, strip the leading `[{`
# from the first piece and re-wrap each piece in braces so that it is a
# standalone JSON object. Backslashes are doubled: inside an R string
# literal a regex backslash must be written as `\\`, and a single `\[`,
# `\{` or `\}` is an unrecognized escape that stops the script from parsing.
vars <- paste0("{", sub("^\\[\\{", "", strsplit(vars, "\\},\\{")[[1]]), "}")

# Parse the first 209 fragments and stack them into one data frame.
# NOTE(review): 209 is hard-coded; presumably it is the number of country
# entries at the time of writing -- confirm against the current file.
countries <- do.call(
  rbind,
  lapply(vars[1:209], function(x) as.data.frame(jsonlite::parse_json(x)))
)

# Keep column 1 and columns 4 to 13 (selected by position).
countries <- countries[c(1, 4:13)]

# Drop everything up to and including the last dot in each column name.
names(countries) <- sub("^.*\\.", "", names(countries))

dplyr::tibble(countries)
#> # A tibble: 209 x 11
#> name FATF APG CFATF EAG ESAAMLG GABAC GAFILAT GIABA MENAFATF MONEYVAL
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Afghani~ "" "mbr" "" "obs" "" "" "" "" "" ""
#> 2 Albania "" "" "" "" "" "" "" "" "" "mbr"
#> 3 Algeria "" "" "" "" "" "" "" "" "mbr" ""
#> 4 Andorra "" "" "" "" "" "" "" "" "" "mbr"
#> 5 Angola "" "" "" "" "mbr" "" "" "" "" ""
#> 6 Anguilla "" "" "mbr" "" "" "" "" "" "" ""
#> 7 Antigua~ "" "" "mbr" "" "" "" "" "" "" ""
#> 8 Argenti~ "mbr" "non" "non" "non" "non" "" "mbr" "non" "non" "non"
#> 9 Armenia "" "" "" "obs" "" "" "" "" "" "mbr"
#> 10 Aruba K~ "els" "" "mbr" "" "" "" "" "" "" ""
#> # ... with 199 more rows
只是为了测试如何用 V8(R 的嵌入式 JavaScript 和 WebAssembly 引擎)来执行 JavaScript。
https://cran.r-project.org/web/packages/V8/vignettes/v8_intro.html
创建 V8 上下文引擎,执行下载得到的 JavaScript,并从 V8 中取出 countries 变量的值(它会变成嵌套数据框,因此需要 unnest());最后一行全部是 NA,因此需要用 filter() 将其过滤掉。
library(httr)
library(V8)
library(dplyr)
library(tidyr)
# Download the JavaScript source that defines the `countries` variable.
url <- paste0(
  "https://www.fatf-gafi.org/media/fatf/fatfv20/",
  "js/country-data-multi-lang.js"
)
js_content <- content(GET(url), "text")

# Evaluate the script inside a fresh V8 context.
ct <- v8()
ct$eval(js_content)

# Pull `countries` back into R, flatten the nested `groups` column, keep the
# wanted columns by position, and drop rows whose `name` is NA.
countries_raw <- ct$get("countries")
countries_flat <- unnest(countries_raw, cols = c(groups))
countries_kept <- select(countries_flat, c(1:2, 4:14, 16))
filter(countries_kept, !is.na(name))
#> # A tibble: 209 × 14
#> name code FATF APG CFATF EAG ESAAMLG GABAC GAFILAT GIABA MENAFATF
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Afghanist… AF "" "mbr" "" "obs" "" "" "" "" ""
#> 2 Albania AL "" "" "" "" "" "" "" "" ""
#> 3 Algeria DZ "" "" "" "" "" "" "" "" "mbr"
#> 4 Andorra AD "" "" "" "" "" "" "" "" ""
#> 5 Angola AO "" "" "" "" "mbr" "" "" "" ""
#> 6 Anguilla AI "" "" "mbr" "" "" "" "" "" ""
#> 7 Antigua a… AG "" "" "mbr" "" "" "" "" "" ""
#> 8 Argentina AR "mbr" "non" "non" "non" "non" "" "mbr" "non" "non"
#> 9 Armenia AM "" "" "" "obs" "" "" "" "" ""
#> 10 Aruba Kin… AW "els" "" "mbr" "" "" "" "" "" ""
#> # … with 200 more rows, and 3 more variables: MONEYVAL <chr>,
#> # jurisdiction <chr>, id <chr>
我在从网站抓取数据时遇到了一些问题。我在网络抓取方面没有太多经验。我的计划是使用 R 从以下网站抓取一些数据:https://www.fatf-gafi.org/countries/
更准确地说,我想提取受到某种制裁的国家/地区列表
library(XML)
# Address of the FATF countries overview page.
url <- "https://www.fatf-gafi.org/countries/"

# Read the page line by line, then hand the lines to the XML package's
# HTML parser.
source <- readLines(con = url, encoding = "UTF-8")
parsed_doc <- htmlParse(file = source, encoding = "UTF-8")
但这不会显示预期的信息,因为它不在 table 下,而是嵌套的 div。
这是一项棘手的解析工作。您需要的信息并不在通过 readLines 获得的 html 中,而是由页面通过 XHR 请求动态加载的。通常,这类 XHR 请求会返回一个 json 字符串,但在您的情况下,它返回的是 javascript,其中的信息存储在一个变量里,该变量包含一个由 json 片段组成的数组,每个国家对应一个片段。可以通过一些字符串操作和 json 解析来得到最终结果:
library(httr)
library(rvest)
# Download the JavaScript file that holds the country data (the page HTML
# itself does not contain it).
url <- paste0("https://www.fatf-gafi.org/media/fatf/fatfv20/",
              "js/country-data-multi-lang.js")
js <- content(GET(url), "text")

# Keep only the text that follows the `var countries = ` assignment.
vars <- strsplit(js, "var countries = ")[[1]][2]

# Split the array literal at every `},{` boundary, strip the leading `[{`
# from the first piece and re-wrap each piece in braces so that it is a
# standalone JSON object. Backslashes are doubled: inside an R string
# literal a regex backslash must be written as `\\`, and a single `\[`,
# `\{` or `\}` is an unrecognized escape that stops the script from parsing.
vars <- paste0("{", sub("^\\[\\{", "", strsplit(vars, "\\},\\{")[[1]]), "}")

# Parse the first 209 fragments and stack them into one data frame.
# NOTE(review): 209 is hard-coded; presumably it is the number of country
# entries at the time of writing -- confirm against the current file.
countries <- do.call(
  rbind,
  lapply(vars[1:209], function(x) as.data.frame(jsonlite::parse_json(x)))
)

# Keep column 1 and columns 4 to 13 (selected by position).
countries <- countries[c(1, 4:13)]

# Drop everything up to and including the last dot in each column name.
names(countries) <- sub("^.*\\.", "", names(countries))

dplyr::tibble(countries)
#> # A tibble: 209 x 11
#> name FATF APG CFATF EAG ESAAMLG GABAC GAFILAT GIABA MENAFATF MONEYVAL
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Afghani~ "" "mbr" "" "obs" "" "" "" "" "" ""
#> 2 Albania "" "" "" "" "" "" "" "" "" "mbr"
#> 3 Algeria "" "" "" "" "" "" "" "" "mbr" ""
#> 4 Andorra "" "" "" "" "" "" "" "" "" "mbr"
#> 5 Angola "" "" "" "" "mbr" "" "" "" "" ""
#> 6 Anguilla "" "" "mbr" "" "" "" "" "" "" ""
#> 7 Antigua~ "" "" "mbr" "" "" "" "" "" "" ""
#> 8 Argenti~ "mbr" "non" "non" "non" "non" "" "mbr" "non" "non" "non"
#> 9 Armenia "" "" "" "obs" "" "" "" "" "" "mbr"
#> 10 Aruba K~ "els" "" "mbr" "" "" "" "" "" "" ""
#> # ... with 199 more rows
只是为了测试如何用 V8(R 的嵌入式 JavaScript 和 WebAssembly 引擎)来执行 JavaScript。
https://cran.r-project.org/web/packages/V8/vignettes/v8_intro.html
创建 V8 上下文引擎,执行下载得到的 JavaScript,并从 V8 中取出 countries 变量的值(它会变成嵌套数据框,因此需要 unnest());最后一行全部是 NA,因此需要用 filter() 将其过滤掉。
library(httr)
library(V8)
library(dplyr)
library(tidyr)
# Download the JavaScript source that defines the `countries` variable.
url <- paste0(
  "https://www.fatf-gafi.org/media/fatf/fatfv20/",
  "js/country-data-multi-lang.js"
)
js_content <- content(GET(url), "text")

# Evaluate the script inside a fresh V8 context.
ct <- v8()
ct$eval(js_content)

# Pull `countries` back into R, flatten the nested `groups` column, keep the
# wanted columns by position, and drop rows whose `name` is NA.
countries_raw <- ct$get("countries")
countries_flat <- unnest(countries_raw, cols = c(groups))
countries_kept <- select(countries_flat, c(1:2, 4:14, 16))
filter(countries_kept, !is.na(name))
#> # A tibble: 209 × 14
#> name code FATF APG CFATF EAG ESAAMLG GABAC GAFILAT GIABA MENAFATF
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Afghanist… AF "" "mbr" "" "obs" "" "" "" "" ""
#> 2 Albania AL "" "" "" "" "" "" "" "" ""
#> 3 Algeria DZ "" "" "" "" "" "" "" "" "mbr"
#> 4 Andorra AD "" "" "" "" "" "" "" "" ""
#> 5 Angola AO "" "" "" "" "mbr" "" "" "" ""
#> 6 Anguilla AI "" "" "mbr" "" "" "" "" "" ""
#> 7 Antigua a… AG "" "" "mbr" "" "" "" "" "" ""
#> 8 Argentina AR "mbr" "non" "non" "non" "non" "" "mbr" "non" "non"
#> 9 Armenia AM "" "" "" "obs" "" "" "" "" ""
#> 10 Aruba Kin… AW "els" "" "mbr" "" "" "" "" "" ""
#> # … with 200 more rows, and 3 more variables: MONEYVAL <chr>,
#> # jurisdiction <chr>, id <chr>