将正确的参数传递给 RCurl/postForm
Passing correct params to RCurl/postForm
我正在尝试通过 RCurl 从国家信息中心(National Information Center)下载 pdf,但遇到了一些问题。对于这个示例 URL,我想要的是默认设置对应的 pdf,只是 "Report Format" 应该选 "PDF"。当我运行以下脚本时,它保存的却是选择了其他按钮("Parent(s) of..."/HMDA,即非默认值)时对应的文件。我尝试过把这些输入元素添加到 params 中,但没有任何变化。有人可以帮我找出问题所在吗?谢谢。
library(RCurl)
# Request the FFIEC NIC search form with RCurl and try to download the report
# as a PDF. One curl handle is reused so that cookies set by the GET are sent
# back with the POST.
curl <- getCurlHandle()
curlSetOpt(cookiejar = 'cookies.txt', curl = curl)

# NOTE(review): the httr version of this request submits the format choice as
# grpRptFormat = "rbRptFormatPDF"; the key used here is presumably the radio
# button's id rather than its form field name -- confirm against the page.
params <- list(rbRptFormatPDF = 'rbRptFormatPDF')
url <- 'https://www.ffiec.gov/nicpubweb/nicweb/OrgHierarchySearchForm.aspx?parID_RSSD=2162966&parDT_END=99991231'

# Fetch the form once so the hidden state fields can be echoed back.
html <- getURL(url, curl = curl)

# '\\1' is the back-reference to the captured value. A single-backslash '\1'
# is parsed by R as the control character \001, so the captured text would be
# lost.
viewstate <- sub('.*id="__VIEWSTATE" value="([0-9a-zA-Z+/=]*).*', '\\1', html)
event <- sub('.*id="__EVENTVALIDATION" value="([0-9a-zA-Z+/=]*).*', '\\1', html)

params[['__VIEWSTATE']] <- viewstate
params[['__EVENTVALIDATION']] <- event
params[['btnSubmit']] <- 'Submit'

# Submit the form and write the response body to disk.
result <- postForm(url, .params = params, curl = curl, style = 'POST')
writeBin(as.vector(result), 'test.pdf')
下面的代码能得到正确的 PDF 吗?
library(httr)
library(rvest)
library(purrr)
# Download the organization hierarchy report as a PDF.
# The form is an ASP.NET page: its hidden state fields (e.g. __VIEWSTATE) must
# be read from a GET response and echoed back in the POST.
form_url <- "https://www.ffiec.gov/nicpubweb/nicweb/OrgHierarchySearchForm.aspx"
form_query <- list(parID_RSSD = 2162966, parDT_END = 99991231)

res <- GET(url = form_url, query = form_query)
# Fail early instead of parsing an error page for form fields.
stop_for_status(res)

# Collect every hidden <input> of the form as a named list (name -> value).
# html_attr() is rvest's own accessor; xml_attr() is only on the search path
# when xml2 is attached, which library(rvest) no longer does in rvest >= 1.0.
pg <- content(res, as = "parsed")
hidden <- html_nodes(pg, xpath = ".//form/input[@type='hidden']")
params <- setNames(
  as.list(html_attr(hidden, "value")),
  html_attr(hidden, "name")
)

# Add the visible form fields; grpRptFormat selects the PDF report format.
params <- c(
  params,
  grpInstitution = "rbCurInst",
  lbTopHolders = "2961897",
  grpHMDA = "rbNonHMDA",
  lbTypeOfInstitution = "-99",
  txtAsOfDate = "12/28/2016",
  txtAsOfDateErrMsg = "",
  lbHMDAYear = "2015",
  grpRptFormat = "rbRptFormatPDF",
  btnSubmit = "Submit"
)

# Submit the form and stream the response body straight to disk.
res2 <- POST(
  url = form_url,
  query = form_query,
  add_headers(Origin = "https://www.ffiec.gov"),
  body = params,
  encode = "form",
  write_disk("/tmp/output.pdf")
)
stop_for_status(res2)
我正在尝试通过 RCurl 从国家信息中心(National Information Center)下载 pdf,但遇到了一些问题。对于这个示例 URL,我想要的是默认设置对应的 pdf,只是 "Report Format" 应该选 "PDF"。当我运行以下脚本时,它保存的却是选择了其他按钮("Parent(s) of..."/HMDA,即非默认值)时对应的文件。我尝试过把这些输入元素添加到 params 中,但没有任何变化。有人可以帮我找出问题所在吗?谢谢。
library(RCurl)
# Request the FFIEC NIC search form with RCurl and try to download the report
# as a PDF. One curl handle is reused so that cookies set by the GET are sent
# back with the POST.
curl <- getCurlHandle()
curlSetOpt(cookiejar = 'cookies.txt', curl = curl)

# NOTE(review): the httr version of this request submits the format choice as
# grpRptFormat = "rbRptFormatPDF"; the key used here is presumably the radio
# button's id rather than its form field name -- confirm against the page.
params <- list(rbRptFormatPDF = 'rbRptFormatPDF')
url <- 'https://www.ffiec.gov/nicpubweb/nicweb/OrgHierarchySearchForm.aspx?parID_RSSD=2162966&parDT_END=99991231'

# Fetch the form once so the hidden state fields can be echoed back.
html <- getURL(url, curl = curl)

# '\\1' is the back-reference to the captured value. A single-backslash '\1'
# is parsed by R as the control character \001, so the captured text would be
# lost.
viewstate <- sub('.*id="__VIEWSTATE" value="([0-9a-zA-Z+/=]*).*', '\\1', html)
event <- sub('.*id="__EVENTVALIDATION" value="([0-9a-zA-Z+/=]*).*', '\\1', html)

params[['__VIEWSTATE']] <- viewstate
params[['__EVENTVALIDATION']] <- event
params[['btnSubmit']] <- 'Submit'

# Submit the form and write the response body to disk.
result <- postForm(url, .params = params, curl = curl, style = 'POST')
writeBin(as.vector(result), 'test.pdf')
下面的代码能得到正确的 PDF 吗?
library(httr)
library(rvest)
library(purrr)
# Download the organization hierarchy report as a PDF.
# The form is an ASP.NET page: its hidden state fields (e.g. __VIEWSTATE) must
# be read from a GET response and echoed back in the POST.
form_url <- "https://www.ffiec.gov/nicpubweb/nicweb/OrgHierarchySearchForm.aspx"
form_query <- list(parID_RSSD = 2162966, parDT_END = 99991231)

res <- GET(url = form_url, query = form_query)
# Fail early instead of parsing an error page for form fields.
stop_for_status(res)

# Collect every hidden <input> of the form as a named list (name -> value).
# html_attr() is rvest's own accessor; xml_attr() is only on the search path
# when xml2 is attached, which library(rvest) no longer does in rvest >= 1.0.
pg <- content(res, as = "parsed")
hidden <- html_nodes(pg, xpath = ".//form/input[@type='hidden']")
params <- setNames(
  as.list(html_attr(hidden, "value")),
  html_attr(hidden, "name")
)

# Add the visible form fields; grpRptFormat selects the PDF report format.
params <- c(
  params,
  grpInstitution = "rbCurInst",
  lbTopHolders = "2961897",
  grpHMDA = "rbNonHMDA",
  lbTypeOfInstitution = "-99",
  txtAsOfDate = "12/28/2016",
  txtAsOfDateErrMsg = "",
  lbHMDAYear = "2015",
  grpRptFormat = "rbRptFormatPDF",
  btnSubmit = "Submit"
)

# Submit the form and stream the response body straight to disk.
res2 <- POST(
  url = form_url,
  query = form_query,
  add_headers(Origin = "https://www.ffiec.gov"),
  body = params,
  encode = "form",
  write_disk("/tmp/output.pdf")
)
stop_for_status(res2)