(R) 使用直接下载 link 下载一个在浏览器中有效但在 R 中无效的文件
(R) Download a file with direct download link that works in browser, but won't in R
我正在尝试从 WorldPop UK 网站下载数据集中许多国家/地区的大量文件(不仅仅是小示例)。下载每个文件将非常耗时且乏味。
我相当熟悉 R 中的下载方法,但我无法使这些下载工作。我知道这是因为通过 html 以某种方式下载 links 运行,但我不擅长 html 或 java。
我阅读了大量有关 httr、RCurl 和 RSelenium 的资料。我更喜欢避免使用 RSelenium 的解决方案,因为我对其他软件包更加熟悉并且可能会共享代码并且不想每次都必须设置浏览器(至少这是我的理解)
有人可以帮我解决这个问题吗?
直接下载 link 到一个在浏览器中运行良好的小文本 (.txt) 文件,但在使用 download.file 或 curl_download 的 R 中则不行:
http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=download&file=60
带有尼日利亚文件索引的站点(您可以在 html 代码中看到 href= links):http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=dir
在 chrome,查看源代码:http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=dir
下载 link 在我的开发者控制台上的第 558 行和第 559 行之间。
提前致谢!
好吧,他们当然不会让这变得容易。除了令人费解的 "web app" 他们还尝试做正确的事情并使用 sha1 对源 javascript 资源进行验证,但未能保持这些资源的维护(即安全浏览器将无法使用那个网站)。
无论如何,这是你必须做的以避免splashr
或RSelenium
/seleniumPipes
。我使用了您的 "README" 示例并且有很多评论。
我的建议是将一个或多个位包装到一个函数中以便于使用,并考虑将各种调用包装在 purrr
帮助程序中,例如 safely
(也有 "retry" 示例 oot并启动)。
library(httr)
library(rvest)
library(tidyverse)
# Need to "prime" the session with a cookie
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
# Find the summary links
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
# extract the table cells & href so we can make a data frame
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
glimpse(world_pop_data)
## Observations: 462
## Variables: 5
## $ continent <chr> "Africa", "Africa", "Africa", "Africa", "Africa", "Africa", "Afri...
## $ country <chr> "Algeria", "Angola", "Benin", "Botswana", "Burkina Faso", "Burund...
## $ resolution <chr> "100m", "100m", "100m", "100m", "100m", "100m", "100m", "100m", "...
## $ data_type <chr> "Population", "Population", "Population", "Population", "Populati...
## $ summary_link <chr> "http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP0000...
# just see "Nigeria" data
filter(world_pop_data, country=="Nigeria")
## # A tibble: 8 x 5
## continent country resolution data_type summary_link
## <chr> <chr> <chr> <chr> <chr>
## 1 Africa Nigeria 100m Population http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00196
## 2 Africa Nigeria 1km Births http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00357
## 3 Africa Nigeria 1km Pregnancies http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00465
## 4 Africa Nigeria 1km Contraceptive Use http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00198
## 5 Africa Nigeria 1km Literacy http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00199
## 6 Africa Nigeria 1km Poverty http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00200
## 7 Africa Nigeria 1km Stunting http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00201
## 8 Africa Nigeria 100m Age structures http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00194
我相当确定您可以从其中一个 ^^ URL 开始任何文件下载会话尝试,但您需要测试它,因为您可能需要始终从"main" 页面(如前所述,它根据 cookie 维护会话位置)。
# get nigeria population URL
filter(world_pop_data, country=="Nigeria") %>%
filter(data_type=="Population") %>%
pull(summary_link) -> nigeria_pop
nigeria_pop
# [1] "http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00196"
# follow it
GET(url=nigeria_pop) -> res2
pg2 <- content(res2)
该页面上总是有一个 <form>
,所以我们需要 "submit" 表示带有 POST
的表格:
# extract "form" fields (that page does a POST request)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
str(as.list((fields))) # just to show what it looks like
## List of 4
## $ zip_id : chr "140"
## $ zip_title: chr "Nigeria 100m Population"
## $ decoy : chr "website"
## $ website : chr NA
# submit the form with the field data.
# NOTE we need to add the `Referer` (the faux page we're on)
POST(
url = "http://www.worldpop.org.uk/data/download/",
add_headers(`Referer` = nigeria_pop),
body = list(
client_first_name = "",
client_last_name = "",
client_organization = "",
client_country = "",
client_email = "",
client_message = "",
zip_id = fields["zip_id"],
zip_title = fields["zip_title"],
decoy = fields["decoy"],
website = "",
download = "Browse Individual Files"
),
encode = "form"
) -> res3
结果页面的某处是 "Switch to file list" link,因此我们需要找到它并按照它进行操作:
# find the link that has the file list
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
html_attr("href") -> file_list_query_string # just to see the format
## [1] "?dataset=140&action=dir"
# follow that link (we need to use some of the previous captured fields)
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = list(
dataset = fields["zip_id"],
action = "dir"
)
) -> res4
现在,我们构建该页面上所有 link 的数据框:
pg4 <- content(res4)
data_frame(
group_name = html_nodes(pg4, "a.dl") %>% html_text(),
href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads
## # A tibble: 60 x 2
## group_name href
## <chr> <chr>
## 1 Licence.txt ?dataset=140&action=download&file=1
## 2 NGA_metadata.html ?dataset=140&action=download&file=2
## 3 NGA_pph_v2c_2006.tfw ?dataset=140&action=download&file=3
## 4 NGA_pph_v2c_2006.tif ?dataset=140&action=download&file=4
## 5 NGA_pph_v2c_2006.tif.aux.xml ?dataset=140&action=download&file=5
## 6 NGA_pph_v2c_2006.tif.xml ?dataset=140&action=download&file=6
## 7 NGA_pph_v2c_2010.tfw ?dataset=140&action=download&file=7
## 8 NGA_pph_v2c_2010.tif ?dataset=140&action=download&file=8
## 9 NGA_pph_v2c_2010.tif.aux.xml ?dataset=140&action=download&file=9
## 10 NGA_pph_v2c_2010.tif.xml ?dataset=140&action=download&file=10
## # ... with 50 more rows
虽然我之前指出您可能需要始终从头开始或从上一个 link 页面开始,但您也可以按顺序下载所有这些内容。您需要进行测试。这是一个很难处理的网站。
filter(downloads, str_detect(group_name, "README")) %>%
pull(href) -> readme_query_string # we need this below
readme_query_string
## [1] "?dataset=140&action=download&file=60"
# THERE IS A RLY GD CHANCE YOU'LL NEED TO USE timeout() for
# some of these calls. That server takes a while
# right here is where that modal "preparing the data" is shown.
# I'm 99% certain this is there to slow down crawlers/scrapers.
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = parse_url(readme_query_string)$query,
verbose()
) -> res5
这不是你真正要做的。您可能希望 content(res5, as="raw")
和 writeBin()
因为有些(大多数)不是纯文本。但是,这是为了表明以上所有的工作:
content(res5, as="text") %>%
cat()
## WorldPop Africa dataset details
## _______________________
##
## DATASET: Alpha version 2010, 2015 and 2020 estimates of numbers of people per pixel ('ppp') and people per hectare ('pph'), with national totals adjusted to match UN population division estimates (http://esa.un.org/wpp/) and remaining unadjusted.
## REGION: Africa
## SPATIAL RESOLUTION: 0.000833333 decimal degrees (approx 100m at the equator)
## PROJECTION: Geographic, WGS84
## UNITS: Estimated persons per grid square
## MAPPING APPROACH: Random Forest
## FORMAT: Geotiff (zipped using 7-zip (open access tool): www.7-zip.org)
## FILENAMES: Example - NGA_ppp_v2b_2010_UNadj.tif = Nigeria (NGA) population per pixel (ppp), mapped using WorldPOP modelling version 2b (v2b) for 2010 (2010) adjusted to match UN national estimates (UNadj).
## DATE OF PRODUCTION: February 2017
##
## Also included: (i) Metadata html file, (ii) Google Earth file, (iii) Population datasets produced using original census year data (2006).
如果您坚持不懈,请考虑添加您最终所做的答案,或者将其打包成一个包,以便其他人也可以使用。
HEY Whosebug DELETERS,看看这个。它本来是要与我的其他答案结合使用的,但我限制在 30K 个字符以内,而且有很多代码。
目前可以下载名为 spaceheater 的 github 软件包。再次感谢@hrbrmstr,没有他,这是不可能的。
library(devtools)
install_github("nbarsch/spaceheater")
我将其分为三个功能:
getWPdatatypes("country")
getWPoptions("country","datatype")
getWPdownload("country","datatype",c ("options"),year)
###takes 0, 1 or 2 options. 0 is ""
__
getWPdatatypes("country")
#Example
getWPdatatypes("Nigeria")
是基础级别,returns 是 worldpop 中该国家/地区可用数据类型的数据框。示例数据类型为 "Population"、"Births"、"UrbanChange".
_
getWPoptions("country", "datatype")
#Example
getWPoptions("Ghana", "Population")
旨在成为 运行 在 getWPdatatypes 和 returns 之后的一个数据框,其中包含可用于给定数据类型的国家/地区的所有选项和年份。
getWPdownload("country", "datatype", c("options"), year)
#Example
getWPdownload("Benin", "Pregnancies", "pp", 2015)
下载集感谢 hrbrmstr
如果文件名中没有标注年份,则年份为 9999。世界流行网站列出了年份。他们的文件命名方案很残酷,大部分代码是由于文件名不一致造成的。
话虽这么说,我喜欢他们的方法和他们构建的网格栅格集,所以如果有人使用代码,请引用 WorldPop。 :)
这现在是 github 上 spaceheater 包的一部分,请参阅其他答案以获取说明 https://github.com/nbarsch/spaceheater。下面是我所做的参考代码。
图书馆:
library(httr)
library(rvest)
library(tidyverse)
library(raster)
library(rgdal)
library(reshape2)
library(rangeBuilder)
library(stringr)
library(foreach)
library(dplyr)
函数(由于字符限制所以我不得不删除代码中的大部分#notes。请参阅下面的答案以获取说明。大量代码是由于文件名不一致):
获取 WP 数据类型:
getWPdatatypes <- function (country) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\s*\([^\)]+\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
print("It appears this country is not in the WorldPop set, please check and try again")
break
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
print(countryreference[1,])
WPdata.types <<- countryreference[1,]
print("The above table has also been added to your working environment as dataframe: WPdata.types")
}
getWPoptions:
#Example
getWPdatatypes("Nigeria", "Population")
getWPoptions <- function (country, datatype) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\s*\([^\)]+\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
print("It appears this country is not in the WorldPop set, please check and try again")
break
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
world_pop_data <- filter(world_pop_data, CountryStandard %in% countryreference$CountryStandard)
world_pop_data <- filter(world_pop_data, data_type==paste(datatype))
dataset_link <- as.character(world_pop_data[1,"summary_link"])
GET(url=dataset_link) -> res2
pg2 <- content(res2)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
POST(
url = "http://www.worldpop.org.uk/data/download/",
add_headers(`Referer` = dataset_link),
body = list(
client_first_name = "",
client_last_name = "",
client_organization = "",
client_country = "",
client_email = "",
client_message = "",
zip_id = fields["zip_id"],
zip_title = fields["zip_title"],
decoy = fields["decoy"],
website = "",
download = "Browse Individual Files"
),
encode = "form"
) -> res3
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
html_attr("href") -> file_list_query_string
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = list(
dataset = fields["zip_id"],
action = "dir"
)
) -> res4
pg4 <- content(res4)
data_frame(
group_name = html_nodes(pg4, "a.dl") %>% html_text(),
href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads$istif <- str_sub(downloads$group_name,-4,-1)
#Some such as senegal are inexplicably .TIF
downloads$istif <- tolower(downloads$istif)
downloads <- filter(downloads, istif==".tif")
pg4charfile <- as.character(downloads[1,"group_name"])
pg4charfile <- gsub(' {1,}','',pg4charfile)
if(substr(pg4charfile,1,6)!="popmap"){
if(grepl("\d", pg4charfile)==TRUE){
char4 <- substr(pg4charfile,4,4)
char6 <-substr(pg4charfile,6,6)
char9 <-substr(pg4charfile,9,9)
char11 <-substr(pg4charfile,11,11)
char4num <- suppressWarnings(!is.na(as.numeric(char4)))
char6num <- suppressWarnings(!is.na(as.numeric(char6)))
char9num <- suppressWarnings(!is.na(as.numeric(char9)))
char11num <- suppressWarnings(!is.na(as.numeric(char11)))
if(char4num==TRUE & char6num==TRUE){
downloads$years <-substr(downloads$group_name,4,7)
}
if(char4num==TRUE & char6num==FALSE){
downloads$years <-substr(downloads$group_name,4,5)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char9num==FALSE & char11num==TRUE){
downloads$years <-substr(downloads$group_name,11,12)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char4num==FALSE & char6num==FALSE & char9num==TRUE){
downloads$years <- str_extract(downloads$group_name, "\d{4}")
}
if(char4num==FALSE & char6num==FALSE & char9num==FALSE & char11num==FALSE){
downloads$years <- str_extract(downloads$group_name, "\d{4}")
}
}else{downloads$years <- 9999}
}else{
downloads$years<- as.numeric(substr(downloads$group_name,7,8))
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
downloads <- downloads[!is.na(downloads$years),]
###Possible Options due to the inexplicable nature of their inconsistent file names
possopt <- c("_pph_", "_ppp_", "_pp_", "uncert", "adj","_M.", "_M_","_F.", "_F_", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
opttext <- c("Persons per hectare", "Persons per pixel", "per pixel", " uncertainty dataset showing 95% credible intervals",
"adjusted to match UN estimates", "MALE", "MALE", "FEMALE", "FEMALE", "Uncertainty map", "poverty standard deviation map", ".25/day",
".00/day", "mean wealth index", "mean likelihood of living in poverty per grid square", "Income estimate USD per grid square", "%poverty by Multidimensional Poverty Index",
"prob of four or more antenatal care visits at time of delivery", "prob of skilled birth attendance during delivery", "prob of postnatal care received within 48 hours of delivery")
possoptdf <- data.frame(possopt, opttext, stringsAsFactors = FALSE)
groupsubstr <- str_sub(downloads$group_name,4,-4)
optionsforchoice<-foreach(a=1:nrow(downloads), .combine=rbind)%do%{
theoptions<- foreach(b=1:length(opttext), .combine=cbind)%do%{
matchoopt <- str_detect(downloads[a,"group_name"],coll(possopt[b]))
if(matchoopt==TRUE){result <-possopt[b]}
if(matchoopt==FALSE){result<- NA}
if(b==13){
mistake <- str_detect(downloads[a,"group_name"],"\d{4}")
if(mistake==TRUE){result <- NA}
}
result
}
}
optionsforchoice<-do.call(rbind,lapply(1:nrow(optionsforchoice),function(x) t(matrix(optionsforchoice[x,order(is.na(optionsforchoice[x,]))])) ))
optionsforchoice <- as.data.frame(optionsforchoice, stringsAsFactors=FALSE)
optionsforchoice <- optionsforchoice[,colSums(is.na(optionsforchoice))<nrow(optionsforchoice)]
downloads <- cbind(downloads,optionsforchoice)
optiters <- as.data.frame(optionsforchoice)
if(length(optionsforchoice)==1){colnames(downloads)[5]<-"V1"}
if(ncol(as.data.frame(optionsforchoice))==1){colnames(downloads)[5]<-"V1"}
optiters <- ncol(optiters)
###join all the options so they can be displayed
foreach(a=1:optiters)%do%{
downloads <- merge(downloads, possoptdf, by.x=paste0("V",a), by.y="possopt", all.x=TRUE)
coltochange <- ncol(downloads)
colnames(downloads)[coltochange] <- paste0("possopt",a)
}
downloads$years <- as.numeric(downloads$years)
downloads <- downloads[order(downloads$years),]
##Subsetting downloads to columns that only contain possopt
downpossopt <- downloads[ , grepl( "possopt" , colnames( downloads ) ) ]
downpossopt <- as.data.frame(downpossopt)
downpossopt$code <- c(1:nrow(downpossopt))
downloads$code <- c(1:nrow(downloads))
rownames(downloads) <- c(1:nrow(downloads))
rownames(downpossopt) <- c(1:nrow(downpossopt))
if(ncol(downpossopt)==1){
colnames(downpossopt) <- "possopt1"}
downpossopt <- cbind(downloads$years, downpossopt)
names(downpossopt)[names(downpossopt) == 'downloads$years'] <- 'years'
names(downpossopt)[names(downpossopt) == 'V1'] <- 'years'
possoptcodes <- c("pph", "ppp", "pp", "uncert", "adj","M", "M","F", "F", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
possoptcodes <- as.data.frame(cbind(possoptcodes, opttext))
possoptcodes <- possoptcodes[c(1:6,8,10:nrow(possoptcodes)),]
names(downpossopt)[names(downpossopt) == 'downpossopt'] <- 'possopt1'
foreach(a=1:optiters)%do%{
downpossopt <- merge(downpossopt, possoptcodes, by.x=paste0("possopt",a), by.y="opttext", all.x=TRUE)
coltochange <- ncol(downpossopt)
colnames(downpossopt)[coltochange] <- paste0("OptionCode",a)
}
downpossopt <- as.data.frame(downpossopt)
downpossopt <- downpossopt[order(downpossopt$code),]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="possopt1"),which(colnames(downpossopt)!="possopt1"))]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="years"),which(colnames(downpossopt)!="years"))]
downpossopt$country <- countryreference[1,"CountryStandard"]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="country"),which(colnames(downpossopt)!="country"))]
row.names(downpossopt) <- c(1:nrow(downpossopt))
downpossoptcodes <- downpossopt[ , grepl( "OptionCode" , colnames( downpossopt ) ) ]
downpossoptcodes <- as.data.frame(downpossoptcodes)
print(downpossopt)
WP.options <<- downpossopt
print("The above table has also been added to your working environment as dataframe: WP.options")
}
获取WP下载:
#Example
getWPdownload("Benin", "Pregnancies", "pp", 2015)
#if the set is missing a year type 9999, 9999 will be returned for sets with missing years in getWPoptions.
getWPdownload <- function (country, datatype, options, year) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
year <- as.numeric(year)
optionschosen <- paste(options, collapse = '-')
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
###Filter country names so they match the desired country
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\s*\([^\)]+\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
print("It appears this country is not in the WorldPop set, please check and try again")
break
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
world_pop_data <- filter(world_pop_data, CountryStandard %in% countryreference$CountryStandard)
world_pop_data <- filter(world_pop_data, data_type==paste(datatype))
dataset_link <- as.character(world_pop_data[1,"summary_link"])
GET(url=dataset_link) -> res2
pg2 <- content(res2)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
POST(
url = "http://www.worldpop.org.uk/data/download/",
add_headers(`Referer` = dataset_link),
body = list(
client_first_name = "",
client_last_name = "",
client_organization = "",
client_country = "",
client_email = "",
client_message = "",
zip_id = fields["zip_id"],
zip_title = fields["zip_title"],
decoy = fields["decoy"],
website = "",
download = "Browse Individual Files"
),
encode = "form"
) -> res3
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
html_attr("href") -> file_list_query_string
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = list(
dataset = fields["zip_id"],
action = "dir"
)
) -> res4
pg4 <- content(res4)
data_frame(
group_name = html_nodes(pg4, "a.dl") %>% html_text(),
href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads$istif <- str_sub(downloads$group_name,-4,-1)
#Some such as senegal are inexplicably .TIF
downloads$istif <- tolower(downloads$istif)
downloads <- filter(downloads, istif==".tif")
pg4charfile <- as.character(downloads[1,"group_name"])
pg4charfile <- gsub(' {1,}','',pg4charfile)
if(substr(pg4charfile,1,6)!="popmap"){
if(grepl("\d", pg4charfile)==TRUE){
char4 <- substr(pg4charfile,4,4)
char6 <-substr(pg4charfile,6,6)
char9 <-substr(pg4charfile,9,9)
char11 <-substr(pg4charfile,11,11)
char4num <- suppressWarnings(!is.na(as.numeric(char4)))
char6num <- suppressWarnings(!is.na(as.numeric(char6)))
char9num <- suppressWarnings(!is.na(as.numeric(char9)))
char11num <- suppressWarnings(!is.na(as.numeric(char11)))
if(char4num==TRUE & char6num==TRUE){
downloads$years <-substr(downloads$group_name,4,7)
}
if(char4num==TRUE & char6num==FALSE){
downloads$years <-substr(downloads$group_name,4,5)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char9num==FALSE & char11num==TRUE){
downloads$years <-substr(downloads$group_name,11,12)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char4num==FALSE & char6num==FALSE & char9num==TRUE){
downloads$years <- str_extract(downloads$group_name, "\d{4}")
}
if(char4num==FALSE & char6num==FALSE & char9num==FALSE & char11num==FALSE){
downloads$years <- str_extract(downloads$group_name, "\d{4}")
}
}else{downloads$years <- 9999}
}else{
downloads$years<- as.numeric(substr(downloads$group_name,7,8))
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
downloads <- downloads[!is.na(downloads$years),]
possopt <- c("_pph_", "_ppp_", "_pp_", "uncert", "adj","_M.", "_M_","_F.", "_F_", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
opttext <- c("Persons per hectare", "Persons per pixel", "per pixel", " uncertainty dataset showing 95% credible intervals",
"adjusted to match UN estimates", "MALE", "MALE", "FEMALE", "FEMALE", "Uncertainty map", "poverty standard deviation map", ".25/day",
".00/day", "mean wealth index", "mean likelihood of living in poverty per grid square", "Income estimate USD per grid square", "%poverty by Multidimensional Poverty Index",
"prob of four or more antenatal care visits at time of delivery", "prob of skilled birth attendance during delivery", "prob of postnatal care received within 48 hours of delivery")
possoptdf <- data.frame(possopt, opttext, stringsAsFactors = FALSE)
groupsubstr <- str_sub(downloads$group_name,4,-4)
###get options for each file from the worldpop selected country and datatype###
optionsforchoice<-foreach(a=1:nrow(downloads), .combine=rbind)%do%{
theoptions<- foreach(b=1:length(opttext), .combine=cbind)%do%{
matchoopt <- str_detect(downloads[a,"group_name"],coll(possopt[b]))
if(matchoopt==TRUE){result <-possopt[b]}
if(matchoopt==FALSE){result<- NA}
if(b==13){
mistake <- str_detect(downloads[a,"group_name"],"\d{4}")
if(mistake==TRUE){result <- NA}
}
result
}
}
optionsforchoice<-do.call(rbind,lapply(1:nrow(optionsforchoice),function(x) t(matrix(optionsforchoice[x,order(is.na(optionsforchoice[x,]))])) ))
optionsforchoice <- as.data.frame(optionsforchoice, stringsAsFactors=FALSE)
optionsforchoice <- optionsforchoice[,colSums(is.na(optionsforchoice))<nrow(optionsforchoice)]
downloads <- cbind(downloads,optionsforchoice)
optiters <- as.data.frame(optionsforchoice)
if(length(optionsforchoice)==1){colnames(downloads)[5]<-"V1"}
if(ncol(as.data.frame(optionsforchoice))==1){colnames(downloads)[5]<-"V1"}
optiters <- ncol(optiters)
foreach(a=1:optiters)%do%{
downloads <- merge(downloads, possoptdf, by.x=paste0("V",a), by.y="possopt", all.x=TRUE)
coltochange <- ncol(downloads)
colnames(downloads)[coltochange] <- paste0("possopt",a)
}
downloads$years <- as.numeric(downloads$years)
downloads <- downloads[order(downloads$years),]
##Subsetting downloads to columns that only contain possopt
downpossopt <- downloads[ , grepl( "possopt" , colnames( downloads ) ) ]
downpossopt <- as.data.frame(downpossopt)
downpossopt$code <- c(1:nrow(downpossopt))
downloads$code <- c(1:nrow(downloads))
rownames(downloads) <- c(1:nrow(downloads))
rownames(downpossopt) <- c(1:nrow(downpossopt))
if(ncol(downpossopt)==1){
colnames(downpossopt) <- "possopt1"}
downpossopt <- cbind(downloads$years, downpossopt)
names(downpossopt)[names(downpossopt) == 'downloads$years'] <- 'years'
names(downpossopt)[names(downpossopt) == 'V1'] <- 'years'
###get the right codes for the function
possoptcodes <- c("pph", "ppp", "pp", "uncert", "adj","M", "M","F", "F", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
possoptcodes <- as.data.frame(cbind(possoptcodes, opttext))
possoptcodes <- possoptcodes[c(1:6,8,10:nrow(possoptcodes)),]
names(downpossopt)[names(downpossopt) == 'downpossopt'] <- 'possopt1'
foreach(a=1:optiters)%do%{
downpossopt <- merge(downpossopt, possoptcodes, by.x=paste0("possopt",a), by.y="opttext", all.x=TRUE)
coltochange <- ncol(downpossopt)
colnames(downpossopt)[coltochange] <- paste0("OptionCode",a)
}
downpossopt <- as.data.frame(downpossopt)
downpossopt <- downpossopt[order(downpossopt$code),]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="possopt1"),which(colnames(downpossopt)!="possopt1"))]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="years"),which(colnames(downpossopt)!="years"))]
downpossopt$country <- countryreference[1,"CountryStandard"]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="country"),which(colnames(downpossopt)!="country"))]
row.names(downpossopt) <- c(1:nrow(downpossopt))
downpossoptcodes <- downpossopt[ , grepl( "OptionCode" , colnames( downpossopt ) ) ]
downpossoptcodes <- as.data.frame(downpossoptcodes)
if(ncol(downpossoptcodes)>1){
downpossoptcodes <- data.frame(x=apply(downpossoptcodes,1,function(x) {paste(x[!is.na(x)],collapse='-')}))
}
colnames(downpossoptcodes) <- "optionspossible"
downloads <- as.data.frame(cbind(downloads, downpossoptcodes))
downloads <- filter(downloads, downloads$years==year)
if(optionschosen!=""){
downloads2 <- filter(downloads, downloads$optionspossible==optionschosen)
if(is.na(downloads2[1,1]) & length(options)>1){
optionschosen <- paste0(options[2],"-", options[1], collapse='')
downloads2 <- filter(downloads, downloads$optionspossible==optionschosen)
}
}else{downloads2 <- filter(downloads, is.na(downloads$optionspossible))}
readme_query_stringdownload <- as.character(downloads2[1,"href"])
filenamedest <- as.character(paste0(country,downloads2[1,"group_name"]))
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = parse_url(readme_query_stringdownload)$query,
progress(),
verbose(),
write_disk(paste0(filenamedest), overwrite=TRUE)
)-> res5
}
我正在尝试从 WorldPop UK 网站下载数据集中许多国家/地区的大量文件(不仅仅是小示例)。下载每个文件将非常耗时且乏味。
我相当熟悉 R 中的下载方法,但我无法使这些下载工作。我知道这是因为通过 html 以某种方式下载 links 运行,但我不擅长 html 或 java。
我阅读了大量有关 httr、RCurl 和 RSelenium 的资料。我更喜欢避免使用 RSelenium 的解决方案,因为我对其他软件包更加熟悉并且可能会共享代码并且不想每次都必须设置浏览器(至少这是我的理解)
有人可以帮我解决这个问题吗?
直接下载 link 到一个在浏览器中运行良好的小文本 (.txt) 文件,但在使用 download.file 或 curl_download 的 R 中则不行: http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=download&file=60
带有尼日利亚文件索引的站点(您可以在 html 代码中看到 href= links):http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=dir
在 chrome,查看源代码:http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=dir
下载 link 在我的开发者控制台上的第 558 行和第 559 行之间。
提前致谢!
好吧,他们当然不会让这变得容易。除了令人费解的 "web app" 他们还尝试做正确的事情并使用 sha1 对源 javascript 资源进行验证,但未能保持这些资源的维护(即安全浏览器将无法使用那个网站)。
无论如何,这是你必须做的以避免splashr
或RSelenium
/seleniumPipes
。我使用了您的 "README" 示例并且有很多评论。
我的建议是将一个或多个位包装到一个函数中以便于使用,并考虑将各种调用包装在 purrr
帮助程序中,例如 safely
(也有 "retry" 示例 oot并启动)。
library(httr)
library(rvest)
library(tidyverse)
# Need to "prime" the session with a cookie
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
# Find the summary links
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
# extract the table cells & href so we can make a data frame
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
glimpse(world_pop_data)
## Observations: 462
## Variables: 5
## $ continent <chr> "Africa", "Africa", "Africa", "Africa", "Africa", "Africa", "Afri...
## $ country <chr> "Algeria", "Angola", "Benin", "Botswana", "Burkina Faso", "Burund...
## $ resolution <chr> "100m", "100m", "100m", "100m", "100m", "100m", "100m", "100m", "...
## $ data_type <chr> "Population", "Population", "Population", "Population", "Populati...
## $ summary_link <chr> "http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP0000...
# just see "Nigeria" data
filter(world_pop_data, country=="Nigeria")
## # A tibble: 8 x 5
## continent country resolution data_type summary_link
## <chr> <chr> <chr> <chr> <chr>
## 1 Africa Nigeria 100m Population http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00196
## 2 Africa Nigeria 1km Births http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00357
## 3 Africa Nigeria 1km Pregnancies http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00465
## 4 Africa Nigeria 1km Contraceptive Use http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00198
## 5 Africa Nigeria 1km Literacy http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00199
## 6 Africa Nigeria 1km Poverty http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00200
## 7 Africa Nigeria 1km Stunting http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00201
## 8 Africa Nigeria 100m Age structures http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00194
我相当确定您可以从其中一个 ^^ URL 开始任何文件下载会话尝试,但您需要测试它,因为您可能需要始终从"main" 页面(如前所述,它根据 cookie 维护会话位置)。
# get nigeria population URL
filter(world_pop_data, country=="Nigeria") %>%
filter(data_type=="Population") %>%
pull(summary_link) -> nigeria_pop
nigeria_pop
# [1] "http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00196"
# follow it
GET(url=nigeria_pop) -> res2
pg2 <- content(res2)
该页面上总是有一个 <form>
,所以我们需要 "submit" 表示带有 POST
的表格:
# extract "form" fields (that page does a POST request)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
str(as.list((fields))) # just to show what it looks like
## List of 4
## $ zip_id : chr "140"
## $ zip_title: chr "Nigeria 100m Population"
## $ decoy : chr "website"
## $ website : chr NA
# submit the form with the field data.
# NOTE we need to add the `Referer` (the faux page we're on)
POST(
url = "http://www.worldpop.org.uk/data/download/",
add_headers(`Referer` = nigeria_pop),
body = list(
client_first_name = "",
client_last_name = "",
client_organization = "",
client_country = "",
client_email = "",
client_message = "",
zip_id = fields["zip_id"],
zip_title = fields["zip_title"],
decoy = fields["decoy"],
website = "",
download = "Browse Individual Files"
),
encode = "form"
) -> res3
结果页面的某处是 "Switch to file list" link,因此我们需要找到它并按照它进行操作:
# find the link that has the file list
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
html_attr("href") -> file_list_query_string # just to see the format
## [1] "?dataset=140&action=dir"
# follow that link (we need to use some of the previous captured fields)
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = list(
dataset = fields["zip_id"],
action = "dir"
)
) -> res4
现在,我们构建该页面上所有 link 的数据框:
pg4 <- content(res4)
data_frame(
group_name = html_nodes(pg4, "a.dl") %>% html_text(),
href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads
## # A tibble: 60 x 2
## group_name href
## <chr> <chr>
## 1 Licence.txt ?dataset=140&action=download&file=1
## 2 NGA_metadata.html ?dataset=140&action=download&file=2
## 3 NGA_pph_v2c_2006.tfw ?dataset=140&action=download&file=3
## 4 NGA_pph_v2c_2006.tif ?dataset=140&action=download&file=4
## 5 NGA_pph_v2c_2006.tif.aux.xml ?dataset=140&action=download&file=5
## 6 NGA_pph_v2c_2006.tif.xml ?dataset=140&action=download&file=6
## 7 NGA_pph_v2c_2010.tfw ?dataset=140&action=download&file=7
## 8 NGA_pph_v2c_2010.tif ?dataset=140&action=download&file=8
## 9 NGA_pph_v2c_2010.tif.aux.xml ?dataset=140&action=download&file=9
## 10 NGA_pph_v2c_2010.tif.xml ?dataset=140&action=download&file=10
## # ... with 50 more rows
虽然我之前指出您可能需要始终从头开始或从上一个 link 页面开始,但您也可以按顺序下载所有这些内容。您需要进行测试。这是一个很难处理的网站。
filter(downloads, str_detect(group_name, "README")) %>%
pull(href) -> readme_query_string # we need this below
readme_query_string
## [1] "?dataset=140&action=download&file=60"
# THERE IS A RLY GD CHANCE YOU'LL NEED TO USE timeout() for
# some of these calls. That server takes a while
# right here is where that modal "preparing the data" is shown.
# I'm 99% certain this is there to slow down crawlers/scrapers.
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = parse_url(readme_query_string)$query,
verbose()
) -> res5
这不是你真正要做的。您可能希望 content(res5, as="raw")
和 writeBin()
因为有些(大多数)不是纯文本。但是,这是为了表明以上所有的工作:
content(res5, as="text") %>%
cat()
## WorldPop Africa dataset details
## _______________________
##
## DATASET: Alpha version 2010, 2015 and 2020 estimates of numbers of people per pixel ('ppp') and people per hectare ('pph'), with national totals adjusted to match UN population division estimates (http://esa.un.org/wpp/) and remaining unadjusted.
## REGION: Africa
## SPATIAL RESOLUTION: 0.000833333 decimal degrees (approx 100m at the equator)
## PROJECTION: Geographic, WGS84
## UNITS: Estimated persons per grid square
## MAPPING APPROACH: Random Forest
## FORMAT: Geotiff (zipped using 7-zip (open access tool): www.7-zip.org)
## FILENAMES: Example - NGA_ppp_v2b_2010_UNadj.tif = Nigeria (NGA) population per pixel (ppp), mapped using WorldPOP modelling version 2b (v2b) for 2010 (2010) adjusted to match UN national estimates (UNadj).
## DATE OF PRODUCTION: February 2017
##
## Also included: (i) Metadata html file, (ii) Google Earth file, (iii) Population datasets produced using original census year data (2006).
如果您坚持不懈,请考虑添加您最终所做的答案,或者将其打包成一个包,以便其他人也可以使用。
HEY Whosebug DELETERS,看看这个。它本来是要与我的其他答案结合使用的,但我限制在 30K 个字符以内,而且有很多代码。
目前可以下载名为 spaceheater 的 github 软件包。再次感谢@hrbrmstr,没有他,这是不可能的。
library(devtools)
install_github("nbarsch/spaceheater")
我将其分为三个功能:
getWPdatatypes("country")
getWPoptions("country","datatype")
getWPdownload("country","datatype",c ("options"),year)
###takes 0, 1 or 2 options. 0 is ""
__
getWPdatatypes("country")
#Example
getWPdatatypes("Nigeria")
是基础级别,returns 是 worldpop 中该国家/地区可用数据类型的数据框。示例数据类型为 "Population"、"Births"、"UrbanChange".
_
getWPoptions("country", "datatype")
#Example
getWPoptions("Ghana", "Population")
旨在成为 运行 在 getWPdatatypes 和 returns 之后的一个数据框,其中包含可用于给定数据类型的国家/地区的所有选项和年份。
getWPdownload("country", "datatype", c("options"), year)
#Example
getWPdownload("Benin", "Pregnancies", "pp", 2015)
下载集感谢 hrbrmstr
如果文件名中没有标注年份,则年份为 9999。世界流行网站列出了年份。他们的文件命名方案很残酷,大部分代码是由于文件名不一致造成的。
话虽这么说,我喜欢他们的方法和他们构建的网格栅格集,所以如果有人使用代码,请引用 WorldPop。 :)
这现在是 github 上 spaceheater 包的一部分,请参阅其他答案以获取说明 https://github.com/nbarsch/spaceheater。下面是我所做的参考代码。
图书馆:
library(httr)
library(rvest)
library(tidyverse)
library(raster)
library(rgdal)
library(reshape2)
library(rangeBuilder)
library(stringr)
library(foreach)
library(dplyr)
函数(由于字符限制所以我不得不删除代码中的大部分#notes。请参阅下面的答案以获取说明。大量代码是由于文件名不一致):
获取 WP 数据类型:
getWPdatatypes <- function (country) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\s*\([^\)]+\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
print("It appears this country is not in the WorldPop set, please check and try again")
break
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
print(countryreference[1,])
WPdata.types <<- countryreference[1,]
print("The above table has also been added to your working environment as dataframe: WPdata.types")
}
getWPoptions:
#Example
getWPdatatypes("Nigeria", "Population")
getWPoptions <- function (country, datatype) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\s*\([^\)]+\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
print("It appears this country is not in the WorldPop set, please check and try again")
break
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
world_pop_data <- filter(world_pop_data, CountryStandard %in% countryreference$CountryStandard)
world_pop_data <- filter(world_pop_data, data_type==paste(datatype))
dataset_link <- as.character(world_pop_data[1,"summary_link"])
GET(url=dataset_link) -> res2
pg2 <- content(res2)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
POST(
url = "http://www.worldpop.org.uk/data/download/",
add_headers(`Referer` = dataset_link),
body = list(
client_first_name = "",
client_last_name = "",
client_organization = "",
client_country = "",
client_email = "",
client_message = "",
zip_id = fields["zip_id"],
zip_title = fields["zip_title"],
decoy = fields["decoy"],
website = "",
download = "Browse Individual Files"
),
encode = "form"
) -> res3
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
html_attr("href") -> file_list_query_string
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = list(
dataset = fields["zip_id"],
action = "dir"
)
) -> res4
pg4 <- content(res4)
data_frame(
group_name = html_nodes(pg4, "a.dl") %>% html_text(),
href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads$istif <- str_sub(downloads$group_name,-4,-1)
#Some such as senegal are inexplicably .TIF
downloads$istif <- tolower(downloads$istif)
downloads <- filter(downloads, istif==".tif")
pg4charfile <- as.character(downloads[1,"group_name"])
pg4charfile <- gsub(' {1,}','',pg4charfile)
if(substr(pg4charfile,1,6)!="popmap"){
if(grepl("\d", pg4charfile)==TRUE){
char4 <- substr(pg4charfile,4,4)
char6 <-substr(pg4charfile,6,6)
char9 <-substr(pg4charfile,9,9)
char11 <-substr(pg4charfile,11,11)
char4num <- suppressWarnings(!is.na(as.numeric(char4)))
char6num <- suppressWarnings(!is.na(as.numeric(char6)))
char9num <- suppressWarnings(!is.na(as.numeric(char9)))
char11num <- suppressWarnings(!is.na(as.numeric(char11)))
if(char4num==TRUE & char6num==TRUE){
downloads$years <-substr(downloads$group_name,4,7)
}
if(char4num==TRUE & char6num==FALSE){
downloads$years <-substr(downloads$group_name,4,5)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char9num==FALSE & char11num==TRUE){
downloads$years <-substr(downloads$group_name,11,12)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char4num==FALSE & char6num==FALSE & char9num==TRUE){
downloads$years <- str_extract(downloads$group_name, "\d{4}")
}
if(char4num==FALSE & char6num==FALSE & char9num==FALSE & char11num==FALSE){
downloads$years <- str_extract(downloads$group_name, "\d{4}")
}
}else{downloads$years <- 9999}
}else{
downloads$years<- as.numeric(substr(downloads$group_name,7,8))
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
downloads <- downloads[!is.na(downloads$years),]
###Possible Options due to the inexplicable nature of their inconsistent file names
possopt <- c("_pph_", "_ppp_", "_pp_", "uncert", "adj","_M.", "_M_","_F.", "_F_", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
opttext <- c("Persons per hectare", "Persons per pixel", "per pixel", " uncertainty dataset showing 95% credible intervals",
"adjusted to match UN estimates", "MALE", "MALE", "FEMALE", "FEMALE", "Uncertainty map", "poverty standard deviation map", ".25/day",
".00/day", "mean wealth index", "mean likelihood of living in poverty per grid square", "Income estimate USD per grid square", "%poverty by Multidimensional Poverty Index",
"prob of four or more antenatal care visits at time of delivery", "prob of skilled birth attendance during delivery", "prob of postnatal care received within 48 hours of delivery")
possoptdf <- data.frame(possopt, opttext, stringsAsFactors = FALSE)
groupsubstr <- str_sub(downloads$group_name,4,-4)
optionsforchoice<-foreach(a=1:nrow(downloads), .combine=rbind)%do%{
theoptions<- foreach(b=1:length(opttext), .combine=cbind)%do%{
matchoopt <- str_detect(downloads[a,"group_name"],coll(possopt[b]))
if(matchoopt==TRUE){result <-possopt[b]}
if(matchoopt==FALSE){result<- NA}
if(b==13){
mistake <- str_detect(downloads[a,"group_name"],"\d{4}")
if(mistake==TRUE){result <- NA}
}
result
}
}
optionsforchoice<-do.call(rbind,lapply(1:nrow(optionsforchoice),function(x) t(matrix(optionsforchoice[x,order(is.na(optionsforchoice[x,]))])) ))
optionsforchoice <- as.data.frame(optionsforchoice, stringsAsFactors=FALSE)
optionsforchoice <- optionsforchoice[,colSums(is.na(optionsforchoice))<nrow(optionsforchoice)]
downloads <- cbind(downloads,optionsforchoice)
optiters <- as.data.frame(optionsforchoice)
if(length(optionsforchoice)==1){colnames(downloads)[5]<-"V1"}
if(ncol(as.data.frame(optionsforchoice))==1){colnames(downloads)[5]<-"V1"}
optiters <- ncol(optiters)
###join all the options so they can be displayed
foreach(a=1:optiters)%do%{
downloads <- merge(downloads, possoptdf, by.x=paste0("V",a), by.y="possopt", all.x=TRUE)
coltochange <- ncol(downloads)
colnames(downloads)[coltochange] <- paste0("possopt",a)
}
downloads$years <- as.numeric(downloads$years)
downloads <- downloads[order(downloads$years),]
##Subsetting downloads to columns that only contain possopt
downpossopt <- downloads[ , grepl( "possopt" , colnames( downloads ) ) ]
downpossopt <- as.data.frame(downpossopt)
downpossopt$code <- c(1:nrow(downpossopt))
downloads$code <- c(1:nrow(downloads))
rownames(downloads) <- c(1:nrow(downloads))
rownames(downpossopt) <- c(1:nrow(downpossopt))
if(ncol(downpossopt)==1){
colnames(downpossopt) <- "possopt1"}
downpossopt <- cbind(downloads$years, downpossopt)
names(downpossopt)[names(downpossopt) == 'downloads$years'] <- 'years'
names(downpossopt)[names(downpossopt) == 'V1'] <- 'years'
possoptcodes <- c("pph", "ppp", "pp", "uncert", "adj","M", "M","F", "F", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
possoptcodes <- as.data.frame(cbind(possoptcodes, opttext))
possoptcodes <- possoptcodes[c(1:6,8,10:nrow(possoptcodes)),]
names(downpossopt)[names(downpossopt) == 'downpossopt'] <- 'possopt1'
foreach(a=1:optiters)%do%{
downpossopt <- merge(downpossopt, possoptcodes, by.x=paste0("possopt",a), by.y="opttext", all.x=TRUE)
coltochange <- ncol(downpossopt)
colnames(downpossopt)[coltochange] <- paste0("OptionCode",a)
}
downpossopt <- as.data.frame(downpossopt)
downpossopt <- downpossopt[order(downpossopt$code),]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="possopt1"),which(colnames(downpossopt)!="possopt1"))]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="years"),which(colnames(downpossopt)!="years"))]
downpossopt$country <- countryreference[1,"CountryStandard"]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="country"),which(colnames(downpossopt)!="country"))]
row.names(downpossopt) <- c(1:nrow(downpossopt))
downpossoptcodes <- downpossopt[ , grepl( "OptionCode" , colnames( downpossopt ) ) ]
downpossoptcodes <- as.data.frame(downpossoptcodes)
print(downpossopt)
WP.options <<- downpossopt
print("The above table has also been added to your working environment as dataframe: WP.options")
}
获取WP下载:
#Example
getWPdownload("Benin", "Pregnancies", "pp", 2015)
#if the set is missing a year type 9999, 9999 will be returned for sets with missing years in getWPoptions.
getWPdownload <- function (country, datatype, options, year) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
year <- as.numeric(year)
optionschosen <- paste(options, collapse = '-')
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
###Filter country names so they match the desired country
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\s*\([^\)]+\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
print("It appears this country is not in the WorldPop set, please check and try again")
break
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
world_pop_data <- filter(world_pop_data, CountryStandard %in% countryreference$CountryStandard)
world_pop_data <- filter(world_pop_data, data_type==paste(datatype))
dataset_link <- as.character(world_pop_data[1,"summary_link"])
GET(url=dataset_link) -> res2
pg2 <- content(res2)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
POST(
url = "http://www.worldpop.org.uk/data/download/",
add_headers(`Referer` = dataset_link),
body = list(
client_first_name = "",
client_last_name = "",
client_organization = "",
client_country = "",
client_email = "",
client_message = "",
zip_id = fields["zip_id"],
zip_title = fields["zip_title"],
decoy = fields["decoy"],
website = "",
download = "Browse Individual Files"
),
encode = "form"
) -> res3
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
html_attr("href") -> file_list_query_string
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = list(
dataset = fields["zip_id"],
action = "dir"
)
) -> res4
pg4 <- content(res4)
data_frame(
group_name = html_nodes(pg4, "a.dl") %>% html_text(),
href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads$istif <- str_sub(downloads$group_name,-4,-1)
#Some such as senegal are inexplicably .TIF
downloads$istif <- tolower(downloads$istif)
downloads <- filter(downloads, istif==".tif")
pg4charfile <- as.character(downloads[1,"group_name"])
pg4charfile <- gsub(' {1,}','',pg4charfile)
if(substr(pg4charfile,1,6)!="popmap"){
if(grepl("\d", pg4charfile)==TRUE){
char4 <- substr(pg4charfile,4,4)
char6 <-substr(pg4charfile,6,6)
char9 <-substr(pg4charfile,9,9)
char11 <-substr(pg4charfile,11,11)
char4num <- suppressWarnings(!is.na(as.numeric(char4)))
char6num <- suppressWarnings(!is.na(as.numeric(char6)))
char9num <- suppressWarnings(!is.na(as.numeric(char9)))
char11num <- suppressWarnings(!is.na(as.numeric(char11)))
if(char4num==TRUE & char6num==TRUE){
downloads$years <-substr(downloads$group_name,4,7)
}
if(char4num==TRUE & char6num==FALSE){
downloads$years <-substr(downloads$group_name,4,5)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char9num==FALSE & char11num==TRUE){
downloads$years <-substr(downloads$group_name,11,12)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char4num==FALSE & char6num==FALSE & char9num==TRUE){
downloads$years <- str_extract(downloads$group_name, "\d{4}")
}
if(char4num==FALSE & char6num==FALSE & char9num==FALSE & char11num==FALSE){
downloads$years <- str_extract(downloads$group_name, "\d{4}")
}
}else{downloads$years <- 9999}
}else{
downloads$years<- as.numeric(substr(downloads$group_name,7,8))
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
downloads <- downloads[!is.na(downloads$years),]
possopt <- c("_pph_", "_ppp_", "_pp_", "uncert", "adj","_M.", "_M_","_F.", "_F_", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
opttext <- c("Persons per hectare", "Persons per pixel", "per pixel", " uncertainty dataset showing 95% credible intervals",
"adjusted to match UN estimates", "MALE", "MALE", "FEMALE", "FEMALE", "Uncertainty map", "poverty standard deviation map", ".25/day",
".00/day", "mean wealth index", "mean likelihood of living in poverty per grid square", "Income estimate USD per grid square", "%poverty by Multidimensional Poverty Index",
"prob of four or more antenatal care visits at time of delivery", "prob of skilled birth attendance during delivery", "prob of postnatal care received within 48 hours of delivery")
possoptdf <- data.frame(possopt, opttext, stringsAsFactors = FALSE)
groupsubstr <- str_sub(downloads$group_name,4,-4)
###get options for each file from the worldpop selected country and datatype###
optionsforchoice<-foreach(a=1:nrow(downloads), .combine=rbind)%do%{
theoptions<- foreach(b=1:length(opttext), .combine=cbind)%do%{
matchoopt <- str_detect(downloads[a,"group_name"],coll(possopt[b]))
if(matchoopt==TRUE){result <-possopt[b]}
if(matchoopt==FALSE){result<- NA}
if(b==13){
mistake <- str_detect(downloads[a,"group_name"],"\d{4}")
if(mistake==TRUE){result <- NA}
}
result
}
}
optionsforchoice<-do.call(rbind,lapply(1:nrow(optionsforchoice),function(x) t(matrix(optionsforchoice[x,order(is.na(optionsforchoice[x,]))])) ))
optionsforchoice <- as.data.frame(optionsforchoice, stringsAsFactors=FALSE)
optionsforchoice <- optionsforchoice[,colSums(is.na(optionsforchoice))<nrow(optionsforchoice)]
downloads <- cbind(downloads,optionsforchoice)
optiters <- as.data.frame(optionsforchoice)
if(length(optionsforchoice)==1){colnames(downloads)[5]<-"V1"}
if(ncol(as.data.frame(optionsforchoice))==1){colnames(downloads)[5]<-"V1"}
optiters <- ncol(optiters)
foreach(a=1:optiters)%do%{
downloads <- merge(downloads, possoptdf, by.x=paste0("V",a), by.y="possopt", all.x=TRUE)
coltochange <- ncol(downloads)
colnames(downloads)[coltochange] <- paste0("possopt",a)
}
downloads$years <- as.numeric(downloads$years)
downloads <- downloads[order(downloads$years),]
##Subsetting downloads to columns that only contain possopt
downpossopt <- downloads[ , grepl( "possopt" , colnames( downloads ) ) ]
downpossopt <- as.data.frame(downpossopt)
downpossopt$code <- c(1:nrow(downpossopt))
downloads$code <- c(1:nrow(downloads))
rownames(downloads) <- c(1:nrow(downloads))
rownames(downpossopt) <- c(1:nrow(downpossopt))
if(ncol(downpossopt)==1){
colnames(downpossopt) <- "possopt1"}
downpossopt <- cbind(downloads$years, downpossopt)
names(downpossopt)[names(downpossopt) == 'downloads$years'] <- 'years'
names(downpossopt)[names(downpossopt) == 'V1'] <- 'years'
###get the right codes for the function
possoptcodes <- c("pph", "ppp", "pp", "uncert", "adj","M", "M","F", "F", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
possoptcodes <- as.data.frame(cbind(possoptcodes, opttext))
possoptcodes <- possoptcodes[c(1:6,8,10:nrow(possoptcodes)),]
names(downpossopt)[names(downpossopt) == 'downpossopt'] <- 'possopt1'
foreach(a=1:optiters)%do%{
downpossopt <- merge(downpossopt, possoptcodes, by.x=paste0("possopt",a), by.y="opttext", all.x=TRUE)
coltochange <- ncol(downpossopt)
colnames(downpossopt)[coltochange] <- paste0("OptionCode",a)
}
downpossopt <- as.data.frame(downpossopt)
downpossopt <- downpossopt[order(downpossopt$code),]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="possopt1"),which(colnames(downpossopt)!="possopt1"))]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="years"),which(colnames(downpossopt)!="years"))]
downpossopt$country <- countryreference[1,"CountryStandard"]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="country"),which(colnames(downpossopt)!="country"))]
row.names(downpossopt) <- c(1:nrow(downpossopt))
downpossoptcodes <- downpossopt[ , grepl( "OptionCode" , colnames( downpossopt ) ) ]
downpossoptcodes <- as.data.frame(downpossoptcodes)
if(ncol(downpossoptcodes)>1){
downpossoptcodes <- data.frame(x=apply(downpossoptcodes,1,function(x) {paste(x[!is.na(x)],collapse='-')}))
}
colnames(downpossoptcodes) <- "optionspossible"
downloads <- as.data.frame(cbind(downloads, downpossoptcodes))
downloads <- filter(downloads, downloads$years==year)
if(optionschosen!=""){
downloads2 <- filter(downloads, downloads$optionspossible==optionschosen)
if(is.na(downloads2[1,1]) & length(options)>1){
optionschosen <- paste0(options[2],"-", options[1], collapse='')
downloads2 <- filter(downloads, downloads$optionspossible==optionschosen)
}
}else{downloads2 <- filter(downloads, is.na(downloads$optionspossible))}
readme_query_stringdownload <- as.character(downloads2[1,"href"])
filenamedest <- as.character(paste0(country,downloads2[1,"group_name"]))
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = parse_url(readme_query_stringdownload)$query,
progress(),
verbose(),
write_disk(paste0(filenamedest), overwrite=TRUE)
)-> res5
}