Selenium 将文件保存到当前工作目录
Selenium save file to current working directory
我有一个网站,我在解决 CAPTCHA
问题后正在查询该网站。
解决 CAPTCHA
后,我的查询会下载一个 PDF
文件。我的问题是我无法让 Firefox
在没有用户交互的情况下自动将文件下载到当前工作目录。
我也不知道如何判断文件是否已经存在;如果文件已存在,我希望代码能显示一个对话框或提示消息。
这是我当前的代码,在出现文件下载弹出窗口之前,它都能正确执行所有操作。
import os
import logging
import argparse
import requests
from time import sleep
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Target page of the query form and the anti-captcha.com client key.
# NOTE(review): the API key is hard-coded in the source; consider loading it
# from the environment or a config file instead of committing it.
mainurl = "https://cndt-certidao.tst.jus.br/inicio.faces"
ckey = "f1a382ddd51949057324a7fc7c8ccf8a"

# File handler for the debug log; mode 'w' truncates the file on every run.
file_handler = logging.FileHandler(
    filename='tst-log-query.log', mode='w', encoding='utf-8')
formatter = logging.Formatter(
    fmt='%(asctime)s - %(name)s - %(levelname)-5.5s - %(message)s',
    datefmt="%Y-%m-%d %H:%M:%S")
file_handler.setFormatter(formatter)

# Module logger: everything from DEBUG upwards goes to the file handler.
logger = logging.getLogger('tst-log-query')
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)
def solver(captcha, max_create_attempts=5, max_polls=60, poll_interval=5):
    """Solve an image CAPTCHA through the anti-captcha.com HTTP API.

    Creates an ``ImageToTextTask`` for the given image and polls
    ``getTaskResult`` until the task is reported as ``ready``.

    Args:
        captcha: Base64-encoded image body sent as the task's ``body``.
        max_create_attempts: How many times ``createTask`` is tried before
            giving up.
        max_polls: How many times ``getTaskResult`` is polled before
            giving up.
        poll_interval: Seconds to sleep before each poll and between failed
            ``createTask`` attempts.

    Returns:
        The recognised CAPTCHA text, or ``None`` when the task could not be
        created, the API reported an error, or polling ran out of attempts.
    """
    print("[*] - Please wait while CAPTCHA is solved ")
    create_payload = {
        "clientKey": ckey,
        "task": {
            "type": "ImageToTextTask",
            "body": captcha,
        },
    }
    with requests.Session() as req:
        task_id = None
        for _ in range(max_create_attempts):
            reply = req.post(
                'https://api.anti-captcha.com/createTask',
                json=create_payload).json()
            task_id = reply.get('taskId')
            if task_id is not None:
                break
            logger.debug(reply.get("errorDescription"))
            # Pause before retrying so a persistent error does not turn
            # into a tight request loop against the API.
            sleep(poll_interval)
        if task_id is None:
            logger.error(
                "createTask failed %s times, giving up.", max_create_attempts)
            return None

        result_payload = {"clientKey": ckey, "taskId": task_id}
        for _ in range(max_polls):
            sleep(poll_interval)
            logger.info("Slept %s Seconds!", poll_interval)
            status = req.post(
                'https://api.anti-captcha.com/getTaskResult',
                json=result_payload).json()
            # An error reply may not carry a 'status' key at all, so use
            # .get() instead of indexing and stop polling on errors.
            if status.get("errorId"):
                logger.error(
                    "getTaskResult error: %s", status.get("errorDescription"))
                return None
            logger.debug("Status: {}".format(status.get("status")))
            if status.get("status") == "ready":
                print("[*] - CAPTCHA Solved!")
                return status['solution']['text']

        logger.error("CAPTCHA was not solved after %s polls.", max_polls)
        return None
def main(pat):
    """Query the certificate site for *pat* and trigger the PDF download.

    Starts Firefox with preferences that save PDF downloads into the current
    working directory without prompting, fills in the (CNPJ/CPF) number,
    solves the CAPTCHA through ``solver`` and submits the form.

    Args:
        pat: The (CNPJ/CPF) number typed into the query form.
    """
    # saving to current working directory
    options = Options()
    # folderList=2 tells Firefox to use the directory in browser.download.dir.
    options.set_preference('browser.download.folderList', 2)
    options.set_preference('browser.download.manager.showWhenStarting', False)
    options.set_preference('browser.download.dir', os.getcwd())
    # neverAsk.saveToDisk is matched against MIME types, so it must be
    # 'application/pdf' rather than the bare extension 'pdf'.
    options.set_preference(
        'browser.helperApps.neverAsk.saveToDisk', 'application/pdf')
    # Disable the built-in PDF viewer so the file is saved, not rendered.
    options.set_preference('pdfjs.disabled', True)

    driver = webdriver.Firefox(options=options)
    print(f"Checking (CNPJ/CPF)# {pat}")
    waiter = WebDriverWait(driver, 60)
    while True:
        try:
            driver.get(mainurl)
            waiter.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "input[value=Regularização]"))
            ).click()
            # Raw string: the backslash escapes ':' for the CSS selector and
            # must not be read as a (invalid) Python escape sequence.
            waiter.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, r"#consultarRegularizacaoForm\:cpfCnpj"))
            ).send_keys(pat)
            captcha_img = waiter.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "img[src^=data]")))
            # The src is a data URI; keep only the part after the first comma.
            cap = captcha_img.get_attribute('src').split(',', 1)[1]
            break
        except exceptions.TimeoutException:
            logger.error('[*] - Unable to found elements, Refreshing Request.')
            continue

    capso = solver(cap)
    if not capso:
        logger.error('[*] - No CAPTCHA solution available, form not submitted.')
        return
    driver.find_element(By.ID, 'idCaptcha').send_keys(capso)
    driver.find_element(
        By.ID, 'consultarRegularizacaoForm:btnEmitirCertidao').click()
    # NOTE(review): the driver is not quit here; the download starts after
    # the click, so quitting immediately could interrupt it - confirm the
    # intended browser lifetime.
if __name__ == "__main__":
    # Command-line entry point: the (CNPJ/CPF) number is the only argument.
    parser = argparse.ArgumentParser(description='Download PDF File!')
    parser.add_argument(
        'pattern', metavar="(CNPJ/CPF) Number", help="(CNPJ/CPF) Number", type=str)
    args = parser.parse_args()
    try:
        main(args.pattern)
    except KeyboardInterrupt:
        # `exit()` is injected by the `site` module and is missing under
        # `python -S`; raising SystemExit gives the same message and status.
        raise SystemExit("Good Bye!")
用法:python script.py 15436940000103
# Firefox options that save PDFs straight into the current working directory.
options = Options()
# The `headless` property setter was deprecated and later removed in
# Selenium 4; passing the command-line flag works across versions.
options.add_argument("-headless")
# Save responses with the PDF MIME type to disk without asking.
options.set_preference(
    "browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# folderList=2 tells Firefox to use the directory in browser.download.dir.
options.set_preference("browser.download.folderList", 2)
options.set_preference("browser.download.dir", os.getcwd())
# Disable the built-in PDF viewer so the file is downloaded, not rendered.
options.set_preference("pdfjs.disabled", True)
driver = webdriver.Firefox(options=options)
使用上面的代码解决了这个问题。
我有一个网站,我在解决 CAPTCHA
问题后正在查询该网站。
解决 CAPTCHA
后,我的查询会下载一个 PDF
文件。我的问题是我无法让 Firefox
在没有用户交互的情况下自动将文件下载到当前工作目录。
我也不知道如何判断文件是否已经存在;如果文件已存在,我希望代码能显示一个对话框或提示消息。
这是我当前的代码,在出现文件下载弹出窗口之前,它都能正确执行所有操作。
import os
import logging
import argparse
import requests
from time import sleep
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Target page of the query form and the anti-captcha.com client key.
# NOTE(review): the API key is hard-coded in the source; consider loading it
# from the environment or a config file instead of committing it.
mainurl = "https://cndt-certidao.tst.jus.br/inicio.faces"
ckey = "f1a382ddd51949057324a7fc7c8ccf8a"

# File handler for the debug log; mode 'w' truncates the file on every run.
file_handler = logging.FileHandler(
    filename='tst-log-query.log', mode='w', encoding='utf-8')
formatter = logging.Formatter(
    fmt='%(asctime)s - %(name)s - %(levelname)-5.5s - %(message)s',
    datefmt="%Y-%m-%d %H:%M:%S")
file_handler.setFormatter(formatter)

# Module logger: everything from DEBUG upwards goes to the file handler.
logger = logging.getLogger('tst-log-query')
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)
def solver(captcha, max_create_attempts=5, max_polls=60, poll_interval=5):
    """Solve an image CAPTCHA through the anti-captcha.com HTTP API.

    Creates an ``ImageToTextTask`` for the given image and polls
    ``getTaskResult`` until the task is reported as ``ready``.

    Args:
        captcha: Base64-encoded image body sent as the task's ``body``.
        max_create_attempts: How many times ``createTask`` is tried before
            giving up.
        max_polls: How many times ``getTaskResult`` is polled before
            giving up.
        poll_interval: Seconds to sleep before each poll and between failed
            ``createTask`` attempts.

    Returns:
        The recognised CAPTCHA text, or ``None`` when the task could not be
        created, the API reported an error, or polling ran out of attempts.
    """
    print("[*] - Please wait while CAPTCHA is solved ")
    create_payload = {
        "clientKey": ckey,
        "task": {
            "type": "ImageToTextTask",
            "body": captcha,
        },
    }
    with requests.Session() as req:
        task_id = None
        for _ in range(max_create_attempts):
            reply = req.post(
                'https://api.anti-captcha.com/createTask',
                json=create_payload).json()
            task_id = reply.get('taskId')
            if task_id is not None:
                break
            logger.debug(reply.get("errorDescription"))
            # Pause before retrying so a persistent error does not turn
            # into a tight request loop against the API.
            sleep(poll_interval)
        if task_id is None:
            logger.error(
                "createTask failed %s times, giving up.", max_create_attempts)
            return None

        result_payload = {"clientKey": ckey, "taskId": task_id}
        for _ in range(max_polls):
            sleep(poll_interval)
            logger.info("Slept %s Seconds!", poll_interval)
            status = req.post(
                'https://api.anti-captcha.com/getTaskResult',
                json=result_payload).json()
            # An error reply may not carry a 'status' key at all, so use
            # .get() instead of indexing and stop polling on errors.
            if status.get("errorId"):
                logger.error(
                    "getTaskResult error: %s", status.get("errorDescription"))
                return None
            logger.debug("Status: {}".format(status.get("status")))
            if status.get("status") == "ready":
                print("[*] - CAPTCHA Solved!")
                return status['solution']['text']

        logger.error("CAPTCHA was not solved after %s polls.", max_polls)
        return None
def main(pat):
    """Query the certificate site for *pat* and trigger the PDF download.

    Starts Firefox with preferences that save PDF downloads into the current
    working directory without prompting, fills in the (CNPJ/CPF) number,
    solves the CAPTCHA through ``solver`` and submits the form.

    Args:
        pat: The (CNPJ/CPF) number typed into the query form.
    """
    # saving to current working directory
    options = Options()
    # folderList=2 tells Firefox to use the directory in browser.download.dir.
    options.set_preference('browser.download.folderList', 2)
    options.set_preference('browser.download.manager.showWhenStarting', False)
    options.set_preference('browser.download.dir', os.getcwd())
    # neverAsk.saveToDisk is matched against MIME types, so it must be
    # 'application/pdf' rather than the bare extension 'pdf'.
    options.set_preference(
        'browser.helperApps.neverAsk.saveToDisk', 'application/pdf')
    # Disable the built-in PDF viewer so the file is saved, not rendered.
    options.set_preference('pdfjs.disabled', True)

    driver = webdriver.Firefox(options=options)
    print(f"Checking (CNPJ/CPF)# {pat}")
    waiter = WebDriverWait(driver, 60)
    while True:
        try:
            driver.get(mainurl)
            waiter.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "input[value=Regularização]"))
            ).click()
            # Raw string: the backslash escapes ':' for the CSS selector and
            # must not be read as a (invalid) Python escape sequence.
            waiter.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, r"#consultarRegularizacaoForm\:cpfCnpj"))
            ).send_keys(pat)
            captcha_img = waiter.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "img[src^=data]")))
            # The src is a data URI; keep only the part after the first comma.
            cap = captcha_img.get_attribute('src').split(',', 1)[1]
            break
        except exceptions.TimeoutException:
            logger.error('[*] - Unable to found elements, Refreshing Request.')
            continue

    capso = solver(cap)
    if not capso:
        logger.error('[*] - No CAPTCHA solution available, form not submitted.')
        return
    driver.find_element(By.ID, 'idCaptcha').send_keys(capso)
    driver.find_element(
        By.ID, 'consultarRegularizacaoForm:btnEmitirCertidao').click()
    # NOTE(review): the driver is not quit here; the download starts after
    # the click, so quitting immediately could interrupt it - confirm the
    # intended browser lifetime.
if __name__ == "__main__":
    # Command-line entry point: the (CNPJ/CPF) number is the only argument.
    parser = argparse.ArgumentParser(description='Download PDF File!')
    parser.add_argument(
        'pattern', metavar="(CNPJ/CPF) Number", help="(CNPJ/CPF) Number", type=str)
    args = parser.parse_args()
    try:
        main(args.pattern)
    except KeyboardInterrupt:
        # `exit()` is injected by the `site` module and is missing under
        # `python -S`; raising SystemExit gives the same message and status.
        raise SystemExit("Good Bye!")
用法:python script.py 15436940000103
# Firefox options that save PDFs straight into the current working directory.
options = Options()
# The `headless` property setter was deprecated and later removed in
# Selenium 4; passing the command-line flag works across versions.
options.add_argument("-headless")
# Save responses with the PDF MIME type to disk without asking.
options.set_preference(
    "browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# folderList=2 tells Firefox to use the directory in browser.download.dir.
options.set_preference("browser.download.folderList", 2)
options.set_preference("browser.download.dir", os.getcwd())
# Disable the built-in PDF viewer so the file is downloaded, not rendered.
options.set_preference("pdfjs.disabled", True)
driver = webdriver.Firefox(options=options)
使用上面的代码解决了这个问题。