Scraping data off site using 4 urls for one day using R
I'm trying to scrape all the historical Air Pollutant Index data from the Malaysian Department of Environment website, which splits the readings for all stations into 4 hourly links per day, like so:
http://apims.doe.gov.my/apims/hourly1.php?date=20130701
http://apims.doe.gov.my/apims/hourly2.php?date=20130701
and likewise with 'hourly3.php?' and 'hourly4.php?'
I'm not very familiar with R, so what would be the easiest way to do this using the XML or scrapeR packages?
You can use R's readHTMLTable function to extract the HTML tables from the Malaysian DOE URLs you listed above. Using the first URL as an example:
# Make sure you have the XML package installed
library(XML)
url <- "http://apims.doe.gov.my/apims/hourly1.php?date=20130701"
all.tables <- readHTMLTable(url)
# the URL you gave only has one <table> tag
table <- all.tables[[1]]
# and now you have a data frame 'table' which contains the contents
# of the air pollutant table
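Since each day's readings are split across four pages, the same call can be run in a loop and the four partial tables merged on their shared columns. A minimal sketch, assuming each page contains exactly one <table> and that the first two columns (state and area) are the join keys:

library(XML)

date <- "20130701"
tabs <- lapply(1:4, function(i) {
  url <- sprintf("http://apims.doe.gov.my/apims/hourly%d.php?date=%s", i, date)
  readHTMLTable(url, stringsAsFactors = FALSE)[[1]]
})

# merge the four partial tables on the shared state/area columns
day.tab <- Reduce(function(x, y) merge(x, y, by = names(x)[1:2]), tabs)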
You can use list operations to turn all of the tables into a single wide data frame:
library(rvest)
library(magrittr)
library(dplyr)
date <- 20130701
rng <- 1:4
my_tabs <- lapply(rng, function(i) {
  # build the URL for page i of the chosen day
  url <- sprintf("http://apims.doe.gov.my/apims/hourly%d.php?date=%s", i, date)
  # read_html() replaces the now-defunct rvest::html()
  pg <- read_html(url)
  # take the first <table> on the page and parse it into a data frame
  pg %>% html_nodes("table") %>% extract2(1) %>% html_table(header=TRUE)
})
glimpse(plyr::join_all(my_tabs, by=colnames(my_tabs[[1]][1:2])))
## Observations: 52
## Variables:
## $ NEGERI / STATE (chr) "Johor", "Johor", "Johor", "Johor", "Kedah...
## $ KAWASAN/AREA (chr) "Kota Tinggi", "Larkin Lama", "Muar", "Pas...
## $ MASA/TIME12:00AM (chr) "63*", "53*", "51*", "55*", "37*", "48*", ...
## $ MASA/TIME01:00AM (chr) "62*", "52*", "52*", "55*", "36*", "48*", ...
## $ MASA/TIME02:00AM (chr) "61*", "51*", "53*", "55*", "35*", "48*", ...
## $ MASA/TIME03:00AM (chr) "60*", "50*", "54*", "55*", "35*", "48*", ...
## $ MASA/TIME04:00AM (chr) "59*", "49*", "54*", "54*", "34*", "47*", ...
## $ MASA/TIME05:00AM (chr) "58*", "48*", "54*", "54*", "34*", "45*", ...
## $ MASA/TIME06:00AM (chr) "57*", "47*", "53*", "53*", "33*", "45*", ...
## $ MASA/TIME07:00AM (chr) "57*", "46*", "52*", "53*", "32*", "45*", ...
## $ MASA/TIME08:00AM (chr) "56*", "45*", "52*", "52*", "32*", "44*", ...
## ...
I actually rarely load/use plyr anymore because of its naming conflicts with dplyr, but join_all is a great fit for this situation.
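If you would rather not load plyr at all, the same join can be written with base Reduce() and dplyr's left_join(), under the same assumption that the first two columns are the keys:

# plyr-free equivalent of join_all(), using Reduce() and dplyr::left_join()
wide <- Reduce(function(x, y) left_join(x, y, by = colnames(my_tabs[[1]])[1:2]), my_tabs)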
You may also want this data in long format:
plyr::join_all(my_tabs, by=colnames(my_tabs[[1]][1:2])) %>%
  tidyr::gather(masa, nilai, -1, -2) %>%
  # better column names
  rename(negeri=`NEGERI / STATE`, kawasan=`KAWASAN/AREA`) %>%
  # cleanup & convert time (using local timezone)
  # make readings numeric; NA will sub for #
  mutate(masa=gsub("MASA/TIME", "", masa),
         # NB: %p is only honored alongside %I, so with %H "12:00AM" parses as hour 12
         masa=as.POSIXct(sprintf("%s %s", date, masa), format="%Y%m%d %H:%M%p", tz="Asia/Kuala_Lumpur"),
         nilai=as.numeric(gsub("[[:punct:]]+", "", nilai))) -> pollut
head(pollut)
## negeri kawasan masa nilai
## 1 Johor Kota Tinggi 2013-07-01 12:00:00 63
## 2 Johor Larkin Lama 2013-07-01 12:00:00 53
## 3 Johor Muar 2013-07-01 12:00:00 51
## 4 Johor Pasir Gudang 2013-07-01 12:00:00 55
## 5 Kedah Alor Setar 2013-07-01 12:00:00 37
## 6 Kedah Bakar Arang, Sg. Petani 2013-07-01 12:00:00 48
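And since the goal is all of the historical data, the same logic can be wrapped in a function of the date and mapped over a range of days. A sketch, assuming the date=YYYYMMDD URL scheme holds across the archive; scrape_day() and the tarikh column are illustrative names of my own, and the Sys.sleep() pause is just there to be polite to the server:

# hypothetical helper: fetch and join the four tables for a single day
scrape_day <- function(d) {
  tabs <- lapply(1:4, function(i) {
    url <- sprintf("http://apims.doe.gov.my/apims/hourly%d.php?date=%s", i, d)
    read_html(url) %>% html_nodes("table") %>% extract2(1) %>% html_table(header=TRUE)
  })
  plyr::join_all(tabs, by=colnames(tabs[[1]][1:2])) %>% mutate(tarikh=d)
}

# map over a month of days, pausing between requests
dates <- format(seq(as.Date("2013-07-01"), as.Date("2013-07-31"), by="day"), "%Y%m%d")
all_days <- bind_rows(lapply(dates, function(d) { Sys.sleep(1); scrape_day(d) }))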