如何在 R 中使用 httr 验证 shibboleth 多主机名网站

how to authenticate a shibboleth multi-hostname website with httr in R

注意:ipums international 和 ipums usa 可能使用相同的系统。 ipums usa 允许更快的注册。如果您想测试您的代码,请尝试 https://usa.ipums.org/usa-action/users/request_access 注册!

我正在尝试以编程方式从 https://international.ipums.org/ with the R language and httr. i need to use httr and not RCurl because i need to post-authentication download large files not into RAM but directly to disk. this is currently only possible with httr as far as i know

下载文件

下面的可重现代码记录了我从登录页面 (https://international.ipums.org/international-action/users/login) 到主 post 身份验证页面的最大努力。任何提示或提示将不胜感激!谢谢!

my_email <- "email@address.com"
my_password <- "password"

tf <- tempfile()

# use httr, because i need to download a large file after authentication
# and only httr supports that with its `write_disk()` option
library(httr)

# turn off ssl verify, otherwise the subsequent GET command will fail
set_config( config( ssl_verifypeer = 0L ) )

GET( "https://international.ipums.org/Shibboleth.sso/Login?target=https%3A%2F%2Finternational.ipums.org%2Finternational-action%2Fmenu" )

# connect to the starting login page of the website
( a <- GET( "https://international.ipums.org/international-action/users/login" , verbose( info = TRUE ) ) )

# which takes me through to a lot of websites, but ultimately (in my browser) lands at
shibboleth_url <- "https://live.identity.popdata.org:443/idp/Authn/UserPassword"

# construct authentication information?
base_values <- list( "j_username" = my_email , "j_password" = my_password )
idp_values <- list( "j_username" = my_email , "j_password" = my_password ,  "_idp_authn_lc_key"=subset( a$cookies , domain == "live.identity.popdata.org" )$value , "JSESSIONID" = subset( a$cookies , domain == "#HttpOnly_live.identity.popdata.org" )$value )
ipums_values <- list( "j_username" = my_email , "j_password" = my_password ,  "_idp_authn_lc_key"=subset( a$cookies , domain == "live.identity.popdata.org" )$value , "JSESSIONID" = subset( a$cookies , domain == "international.ipums.org" )$value)

# i believe this is where the main login should happen, but it looks like it's failing
GET( shibboleth_url , query = idp_values )
POST( shibboleth_url , body = base_values )
writeBin( GET( shibboleth_url , query = idp_values )$content , tf )

readLines( tf )
# The MPC account authentication system has encountered an error
# This error can sometimes occur if you did not close your browser after logging out of an application previously.  It may also occur for other reasons.  Please close your browser and try your action again."                                                                      

writeBin( GET( "https://live.identity.popdata.org/idp/profile/SAML2/Redirect/SSO" , query = idp_values )$content , tf )
POST( "https://live.identity.popdata.org/idp/profile/SAML2/Redirect/SSO" , body = idp_values )
readLines( tf )
# same error as above

# return to the main login page..
writeBin( GET( "https://international.ipums.org/international-action/menu" , query = ipums_values )$content , tf )
readLines( tf )
# ..not logged in

您必须使用 set_cookies() 将您的 cookie 发送到服务器:

library(httr)
library(rvest)
#my_email <- "xxx"
#my_password <- "yyy"
tf <- tempfile()
set_config( config( ssl_verifypeer = 0L ) )

# Get first page
p1 <- GET( "https://international.ipums.org/international-action/users/login" , verbose( info = TRUE ) )

# Post Login credentials
b2 <- list( "j_username" = my_email , "j_password" = my_password )
c2 <- c(JSESSIONID=p1$cookies[p1$cookies$domain=="#HttpOnly_live.identity.popdata.org",]$value,
           `_idp_authn_lc_key`=p1$cookies[p1$cookies$domain=="live.identity.popdata.org",]$value)
p2 <- POST(p1$url,body = b2, set_cookies(.cookies = c2), encode="form" )

# Parse hidden fields
h2 <- read_html(p2$content)
form <-  h2 %>% html_form() 

# Post hidden fields
b3 <- list( "RelayState"=form[[1]]$fields[[1]]$value, "SAMLResponse"=form[[1]]$fields[[2]]$value)
c3 <- c(JSESSIONID=p1$cookies[p1$cookies$domain=="#HttpOnly_live.identity.popdata.org",]$value,
           `_idp_session`=p2$cookies[p2$cookies$name=="_idp_session",]$value,
           `_idp_authn_lc_key`=p2$cookies[p2$cookies$name=="_idp_authn_lc_key",]$value)
p3 <- POST( form[[1]]$url , body=b3, set_cookies(.cookies = c3), encode = "form")

# Get interesting page
c4 <- c(JSESSIONID=p3$cookies[p1$cookies$domain=="international.ipums.org" && p3$cookies$name=="JSESSIONID",]$value,
           `_idp_session`=p3$cookies[p3$cookies$name=="_idp_session",]$value,
           `_idp_authn_lc_key`=p3$cookies[p3$cookies$name=="_idp_authn_lc_key",]$value)
p4 <- GET( "https://international.ipums.org/international-action/menu", set_cookies(.cookies = c4) )
writeBin(p4$content , tf )
readLines( tf )[55]

因为结果是

[1] "    <li class=\"lastItem\"><a href=\"/international-action/users/logout\">Logout</a></li>"

我认为您已登录...

@HubertL 在正确的方向上做了很多步骤,但是,我认为他的回答并不完整。

首先,在实施自动 Web 授权时要注意的主要事情是 'normal' 手动工作流程中使用的 cookie。您可以在任何现代浏览器中使用开发工具轻松监视它们:

在这里,我们看到 JSESSIONID_shibsession* cookie,第一个保存网站的 JSP 会话 ID,第二个很可能仅用于 shibboleth 授权。服务器可能以某种方式绑定了它们,但 JSESSIONID 不需要授权,您打开网站后立即获得授权。因此,我们必须获得 _shibsession* cookie 才能使我们的 JSESSIONID 获得授权。这就是带有许多重定向的 Shibboleth 授权过程的内容。查看代码中的注释。

login_ipums = function(user, password)
{
  require(httr)
  require(rvest)

  set_config( config( ssl_verifypeer = 0L ) )

  #important - httr preserves cookies on subsequent requests to the same host, we don't need that because of sessions expiration
  handle_reset("https://usa.ipums.org/")

  #set login and password
  login1 = GET( "https://usa.ipums.org/usa-action/users/login" )
  form_auth = list( "j_username" = user , "j_password" = password )

  l1_cookies=login1$cookies$value
  names(l1_cookies)=login1$cookies$name

  #receive auth tokens as html hidden fields in a form
  login2 = POST(login1$url, body = form_auth, set_cookies(.cookies=l1_cookies), encode="form")
  login2_form = read_html(login2$content) %>% html_form() 

  l2_cookies=login2$cookies$value
  names(l2_cookies)=login2$cookies$name

  #submit the form back (browser submits it back automatically with JS)
  login3 = POST(login2_form[[1]]$url, body=list(RelayState=login2_form[[1]]$fields$RelayState$value, 
                                                SAMLResponse=login2_form[[1]]$fields$SAMLResponse$value), 
                set_cookies(.cookies=l2_cookies), 
                encode="form")

  #now we have what we came for - _shibsession_* and JSESSION id cookie
  login_cookies = login3$cookies$value
  names(login_cookies)=login3$cookies$name

  return=login_cookies
}

调用 login_ipums 后,我们将获得以下 cookie:

> cookies=login_ipums(my_email, my_password)
> names(cookies)
[1] "JSESSIONID"      
[2] "_idp_authn_lc_key"             
[3] "_shibsession_7573612e69..."

在这里,我们有JSESSIONID_shibsession_*用于site-wide授权。 _idp_authn_lc_key 可能不需要,但留下它也无妨。

现在,您可以轻松下载这样的文件:

cookies=login_ipums(my_email, my_password)
target = GET("https://usa.ipums.org/usa-action/downloads/extract_files/usa_00001.dat.gz",
         set_cookies(.cookies=cookies),
         write_disk("file.bin", overwrite = TRUE))

重要提示:如您所见,我使用的是 IPUMS USA,而不是 International。要使用您的帐户检查该代码,请将所有地方的 usa 替换为 international,包括网址中的 *-action