使用 selenium 进行无头 javascript 下载

Headless javascript download with selenium

我正在尝试在无头(headless)环境中从 http://www.oracle.com/technetwork/server-storage/developerstudio/downloads/index.html 下载文件。我有一个帐户(帐户是免费的),但该站点并不好对付,它显然使用了一连串 javascript 表单/重定向。使用 Firefox 时,我可以在下载开始时用元素检查器把文件的 url 复制为 cURL 命令,然后在无头机器上用它下载文件;但到目前为止,我仅在无头机器上获取文件的所有尝试都失败了。

我已经成功登录:

#!/usr/bin/env python3
# Log in to the Oracle download page with headless PhantomJS and dump the
# resulting browser state (cookies, current url, page source) for inspection.

username="<my username>"
password="<my password>"

import requests
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Work on a copy of the capability template and hand it to the driver
# explicitly, instead of mutating the shared class-level dictionary and
# relying on the driver picking that same object up as its default.
caps = DesiredCapabilities.PHANTOMJS.copy()
caps["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"
driver = webdriver.PhantomJS("/usr/local/bin/phantomjs", desired_capabilities=caps)
try:
  driver.set_window_size(1120, 550)
  driver.get("http://www.oracle.com/technetwork/server-storage/developerstudio/downloads/index.html")
  print("loaded")
  # the download link is only usable after the license agreement is accepted
  driver.find_element_by_name("agreement").click()
  print("clicked agreement")
  driver.find_element_by_partial_link_text("RPM installer").click()
  print("clicked link")
  # clicking the link leads to the sign-in form; fill it in and submit it
  driver.find_element_by_id("sso_username").send_keys(username)
  driver.find_element_by_id("ssopassword").send_keys(password)
  driver.find_element_by_xpath("//input[contains(@title,'Please click here to sign in')]").click()
  print("submitted")

  print(driver.get_cookies())

  print(driver.current_url)
  print(driver.page_source)
finally:
  # always shut the PhantomJS process down, even when a step above raises
  driver.quit()

我怀疑登录成功了,因为在 cookie 中我看到了一些与我的用户名相关的数据;但是在 Firefox 中提交表单后,经过 3-4 次重定向就会开始下载,而在这里我什么也得不到:page_source 和 current_url 仍然属于登录页面。

也许网站正在积极阻止这种使用,或者我做错了什么。知道如何实际下载文件吗?

多亏了 TheChetan 的评论,我才让它工作起来。不过我没有走 javascript-blob 这条路,而是采用了 Tarun Lalwani 建议的 requests 方法。我花了一段时间才意识到,我还必须修改请求中的用户代理(user agent)。最后下面这段代码对我有用:

#!/usr/bin/env python3

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from requests import Session
from urllib.parse import urlparse
from os.path import basename
from hashlib import sha256
import sys

# page that lists the downloads, and the text of the link to follow on it
index_url = "http://www.oracle.com/technetwork/server-storage/developerstudio/downloads/index.html"
link_text = "RPM installer"
username="<my username>"
password="<my password>"
# the same user agent is used for the browser and for the later plain request
user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"

# set up browser
# Work on a copy of the capability template and hand it to the driver
# explicitly, instead of mutating the shared class-level dictionary and
# relying on the driver picking that same object up as its default.
caps = DesiredCapabilities.PHANTOMJS.copy()
caps["phantomjs.page.settings.userAgent"] = user_agent
driver = webdriver.PhantomJS("/usr/local/bin/phantomjs", desired_capabilities=caps)

try:
  driver.set_window_size(800,600)

  # load index page and click through
  driver.get(index_url)
  print("loaded")
  # the download link is only usable after the license agreement is accepted
  driver.find_element_by_name("agreement").click()
  print("clicked agreement")
  link = driver.find_element_by_partial_link_text(link_text)
  # the expected checksum is the sibling that follows the first "sum:" label
  # found after the element containing the link text
  sha = driver.find_element_by_xpath("//*[contains(text(), '{0}')]/following::*[contains(text(), 'sum:')]/following-sibling::*".format(link_text)).text
  # remember the target before clicking: it is requested again later on
  file_url = link.get_attribute("href")
  filename = basename(urlparse(file_url).path)
  print("filename: {0}".format(filename))
  print("checksum: {0}".format(sha))
  link.click()
  print("clicked link")
  # clicking the link leads to the sign-in form; fill it in and submit it
  driver.find_element_by_id("sso_username").send_keys(username)
  driver.find_element_by_id("ssopassword").send_keys(password)
  driver.find_element_by_xpath("//input[contains(@title,'Please click here to sign in')]").click()
  print("submitted")
except Exception:
  # do not leave an orphaned PhantomJS process behind when a step fails
  driver.quit()
  raise

# we should be logged in now

def progressBar(title, value, endvalue, bar_length=60):
  """Redraw a one-line text progress bar on stdout.

  The line starts with a carriage return, so successive calls overwrite the
  previous bar instead of scrolling the terminal.

  Args:
    title: label printed in front of the bar.
    value: amount of work done so far.
    endvalue: total amount of work; a value of 0 or less is shown as complete.
    bar_length: width of the bar between the brackets, in characters.
  """
  if endvalue > 0:
    # clamp so that overshooting the announced total cannot stretch the bar
    # past bar_length or report more than 100%
    percent = min(max(float(value) / endvalue, 0.0), 1.0)
  else:
    # nothing to transfer: avoid dividing by zero and show a full bar
    percent = 1.0
  arrow = '-' * int(round(percent * bar_length)-1) + '>'
  spaces = ' ' * (bar_length - len(arrow))
  sys.stdout.write("\r{0}: [{1}] {2}%".format(title, arrow + spaces, int(round(percent * 100))))
  sys.stdout.flush()

# transfer the cookies to a new session and request the file
session = Session()
# the plain request has to present the same user agent as the browser did
session.headers = {"user-agent": user_agent}
for cookie in driver.get_cookies():
  session.cookies.set(cookie["name"], cookie["value"])
# the browser is no longer needed once its cookies have been copied
driver.quit()

# The first request is only used to learn the final url (with its
# parameters); its body is never read, so close it right away instead of
# leaving the streamed connection open.
r = session.get(file_url, stream=True)
try:
  # now we should have gotten the url with param
  new_url = r.url
finally:
  r.close()
print("final url {0}".format(new_url))

sha_file = sha256()
chunk_size = 2048
done = 0
r = session.get(new_url, stream=True)
try:
  print("requested")
  # fail before anything is written if the server answered with an error
  r.raise_for_status()
  # the header may be missing; 0 then stands for "size unknown"
  length = int(r.headers.get('Content-Length', 0))
  title = "Downloading ({0})".format(length)
  with open(filename, "wb") as f:
    for chunk in r.iter_content(chunk_size):
      f.write(chunk)
      # hash while streaming so the file does not have to be read again
      sha_file.update(chunk)
      done = done+len(chunk)
      # a progress bar can only be drawn when the total size is known
      if length > 0:
        progressBar(title, done, length)
finally:
  r.close()
print()

# check integrity
# the expected value was scraped from the page: ignore surrounding
# whitespace and letter case when comparing it with the hex digest
if sha_file.hexdigest() == sha.strip().lower():
  print("checksums match")
  sys.exit(0)
else:
  print("checksums do NOT match!")
  sys.exit(1)

所以最后的想法是使用 selenium+phantomjs 进行登录,然后使用 cookie 进行普通请求。