求麻烦:POST提交

rvest trouble: POST submission

我正在尝试使用 rvest 从 USGS post 表格下载数据。我做错了什么?

make_url = function(base_url, parameter_list)
  parameter_list %>%
  names %>%
  paste(parameter_list, sep = "=", collapse = "&") %>%
  paste(base_url, ., sep = "")

session = 
  list(sn = "01170000") %>%
  make_url("http://ida.water.usgs.gov/ida/available_records.cfm?", .) %>%
  html_session

test = 
  session %>%
  html_form %>%
  .[[1]] %>%
  set_values(fromdate = "1990-10-01") %>%
  set_values(todate = "2007-09-30") %>%
  set_values(rtype = "3") %>%
  submit_form(session, .)

不需要 rvest 或会话。以下函数将接收站和日期以及 returns 一个数据框,其中包含 USGS 在每次下载时吐出的数据文件注释。

它使用 "download compressed file" 选项来节省带宽并加快下载速度。它创建临时文件来读取数据,但会自行清理。列被转换为正确的类型(不过,如果您愿意,可以省略该部分代码)。如果你不需要它,你也可以省略附加的评论(它似乎对我有用)。

readr::read_lines() 用于提高速度,如果您不想依赖 readr 包,可以使用 readLines()

转换成tibble版本的data.frame主要是为了更好的打印,但它还有其他潜在的优势,所以如果你不想依赖的话你也可以忽略它tibble 包。

有一个硬编码的 99 秒超时,但您可以根据需要对其进行参数化。

library(httr)
library(readr)
library(tibble)

#' Retrieve IDA Station Data
#'
#' @param site_no site id
#' @param date_from records from date YYYY-mm-dd
#' @param date_to records to date YYYY-mm-dd
#' @return a parsed, type-converted data frame with a comments attribute. 
#' @example
#' deerfield <- get_ida("01170000", "1990-10-01", "2007-09-30")
#'
#' head(deerfield)
#'
#' cat(comment(deerfield))

get_ida <- function(site_no, date_from, date_to) {

  date_from_time <- sprintf("%s 00:15:00.0", date_from)
  date_to_time <- sprintf("%s 23:45:00.0", date_to)

  ida_referer <- sprintf("http://ida.water.usgs.gov/ida/available_records.cfm?sn=%s", site_no)

  tf <- tempfile(".zip")

  res <- POST(url = "http://ida.water.usgs.gov/ida/available_records_process.cfm",
              body = list(fromdate = date_from,
                          todate = date_to,
                          mindatetime = date_from_time,
                          maxdatetime = date_to_time,
                          site_no = site_no,
                          rtype = "2",
                          submit1 = "Retrieve+Data"),
              add_headers(Origin="http://ida.water.usgs.gov",
                          Referer=ida_referer),
              write_disk(tf),
              timeout(99),
              encode = "form")

  fils <- unzip(tf, exdir=tempdir())
  tmp <- read_lines(fils)

  unlink(tf)
  unlink(fils)

  comments <- grep("^#", tmp, value=TRUE)
  records <- grep("^#", tmp, value=TRUE, invert=TRUE)
  header <- records[1:2]
  records <- records[-(1:2)]
  cols <- strsplit(header[1], "[[:space:]]+")[[1]]

  comments <- paste0(comments, collapse="\n")
  records <- paste0(records, collapse="\n")

  df <- read_tsv(records, col_names=cols, "cccnnnnc")
  df$date_time <- as.POSIXct(df$date_time, format="%Y%m%d%H%M%S")
  df <- as_tibble(df)

  comment(df) <- comments

  df

}

有效证明:

deerfield <- get_ida("01170000", "1990-10-01", "2007-09-30")

dplyr::glimpse(deerfield)
## Observations: 550,917
## Variables: 8
## $ site_no     <chr> "01170000", "01170000", "01170000", "01170000", "0117000...
## $ date_time   <time> 1990-10-01 00:15:00, 1990-10-01 00:30:00, 1990-10-01 00...
## $ tz_cd       <chr> "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", "EDT", ...
## $ dd          <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,...
## $ accuracy_cd <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ value       <dbl> 146, 139, 135, 143, 154, 166, 171, 175, 171, 166, 162, 1...
## $ prec        <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ remark      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...

head(deerfield)
## # A tibble: 6 x 8
##    site_no           date_time tz_cd    dd accuracy_cd value  prec remark
##      <chr>              <time> <chr> <dbl>       <dbl> <dbl> <dbl>  <chr>
## 1 01170000 1990-10-01 00:15:00   EDT     7           1   146     3   <NA>
## 2 01170000 1990-10-01 00:30:00   EDT     7           1   139     3   <NA>
## 3 01170000 1990-10-01 00:45:00   EDT     7           1   135     3   <NA>
## 4 01170000 1990-10-01 01:00:00   EDT     7           1   143     3   <NA>
## 5 01170000 1990-10-01 01:15:00   EDT     7           1   154     3   <NA>
## 6 01170000 1990-10-01 01:30:00   EDT     7           1   166     3   <NA>

cat(comment(deerfield))
# retrieved: 2016-09-12 05:32:34 CST
#
# Data for the following station is contained in this file
# ---------------------------------------------------------
#  USGS 01170000 DEERFIELD RIVER NEAR WEST DEERFIELD, MA
#
# This data file was retrieved from the USGS
# instantaneous data archive at
# http://ida.water.usgs.gov
#
# ---------------------WARNING---------------------
# The instantaneous data you have obtained from
# this automated U.S. Geological Survey database
# may or may not have been the basis for the published
# daily mean discharges for this station. Although
# automated filtering has been used to compare these
# data to the published daily mean values and to remove
# obviously bad data, there may still be significant
# error in individual values. Users are strongly
# encouraged to review all data carefully prior to use.
# These data are released on the condition that neither
# the USGS nor the United States Government may be held
# liable for any damages resulting from its use.
#
# This file consists of tab-separated columns of the
# following fields.
#
# column       column definition
# -----------  -----------------------------------------
# site_no      USGS site identification number
# date_time     date and time in format (YYYYMMDDhhmmss)
# tz_cd        time zone
# dd           internal USGS sensor designation (''data descriptor'')
# accuracy_cd  accuracy code
#                   0 - A daily mean discharge calculated from the instantaneous
#                       data on this day is 0.01 cubic feet per second
#                       or less and the published daily mean is zero.
#                   1 - A daily mean discharge calculated from the instantaneous
#                       data on this day matches the published daily mean
#                       within 1 percent.
#                   2 - A daily mean discharge calculated from the instantaneous
#                       data on this day matches the published daily mean
#                       from greater than 1 to 5 percent.
#                   3 - A daily mean discharge calculated from the instantaneous
#                       values on this day matches the published daily mean
#                       from greater than 5 to 10 percent.
#                   9 - The instantaneous value is considered correct by the
#                       collecting USGS Water Science Center. A published daily
#                       mean value does not exist and/or no comparison was made.
# value        discharge in cubic feet per second
# precision    digits of precision in the discharge
# remark       optional remark code
#                 Remark  Explanation
#                   <     Actual value is known to be less than reported value.
#                   >     Actual value is known to be greater than reported value.
#                   &     Value is affected by unspecified reasons.
#                   A     Value is affected by ice at the measurement site.
#                   B     Value is affected by backwater at the measurement site.
#                   e     Value has been estimated by USGS personnel.
#                   E     Value was computed from an estimated value.
#                   F     Value was modified due to automated filtering.
#                   K     Value is affected by instrument calibration drift.
#                   R     Rating is undefined for this value.
#
#

好的,这是让 rvest 发挥作用的方法:

library(magrittr)

make_url = function(base_url, parameter_list = list(), ...) {
  together_list = 
    parameter_list %>%
    c(list(...) )

  together_list %>%
    names %>%
    paste(together_list, sep = "=", collapse = "&") %>%
    paste(base_url, ., sep = "?")
}

download_ida = function(site_no, 
                        fromdate = "1990-10-01", 
                        todate = "2007-09-30", 
                        dir = ".",
                        filename = paste(site_no, "txt", sep = ".") ) {

  session = 
    "http://ida.water.usgs.gov/ida/available_records.cfm" %>%
    make_url(sn = "01170000") %>%
    html_session

  form = 
    session %>%
    html_form %>%
    .[[1]] %>%
    set_values(fromdate = fromdate,
               todate = todate,
               rtype = "2")

  tempfile = tempfile(".zip")

  submit_form(session, form, submit = NULL,
              httr::write_disk(tempfile,
                               overwrite = TRUE),
              httr::add_headers(Referer = session$url) )

  filename = file.path(dir, filename)

  tempfile %>%
    unzip(exdir = dir) %>%
    file.rename(filename)

  filename
}

read_ida = function(filename) {

  col_names = 
    filename %>%
    readr::read_tsv(comment = "#", n_max = 1, col_names = FALSE)

  filename %>%
    readr::read_tsv(comment = "#", skip= 2, col_names = FALSE, na = "Ice",
                    col_types = cols(X2 = col_datetime(format = "%Y%m%d%H%M%S"))) %>%
    stats::setNames(col_names)
}

deerfield = 
  "01170000" %>%
  download_ida %>%
  read_ida

但有一个警告:rvest 目前有一个开放的拉取请求 https://github.com/hadley/rvest/pull/161 ,它是让它工作所必需的。为此,有必要重新定义 submit_request 和 submit_form 集成新的拉取请求:

submit_request = function(form, submit = NULL) {
  is_submit <- function(x)
    if ( is.null(x$type) ) FALSE else
      tolower(x$type) %in% c("submit", "image", "button")

  submits <- Filter(is_submit, form$fields)

  if (length(submits) == 0) {
    stop("Could not find possible submission target.", call. = FALSE)
  }
  if (is.null(submit)) {
    submit <- names(submits)[[1]]
    message("Submitting with '", submit, "'")
  }
  if (!(submit %in% names(submits))) {
    stop("Unknown submission name '", submit, "'.\n", "Possible values: ", 
         paste0(names(submits), collapse = ", "), call. = FALSE)
  }
  other_submits <- setdiff(names(submits), submit)
  method <- form$method
  if (!(method %in% c("POST", "GET"))) {
    warning("Invalid method (", method, "), defaulting to GET", 
            call. = FALSE)
    method <- "GET"
  }
  url <- form$url
  fields <- form$fields
  fields <- Filter(function(x) length(x$value) > 0, fields)
  fields <- fields[setdiff(names(fields), other_submits)]
  values <- pluck(fields, "value")
  names(values) <- names(fields)
  list(method = method, encode = form$enctype, url = url, values = values)
}

submit_form = function(session, form, submit = NULL, ...) {
  request <- submit_request(form, submit)
  url <- xml2::url_absolute(form$url, session$url)
  if (request$method == "GET") {
    rvest:::request_GET(session, url = url, query = request$values, ...)
  } else if (request$method == "POST") {
    rvest:::request_POST(session, url = url, body = request$values, 
                         encode = request$encode, ...)
  } else {
    stop("Unknown method: ", request$method, call. = FALSE)
  }
}

希望拉取请求能尽快合并。