Is it possible to download a PDF file with Scrapy?
I want the following code (developed by F.Hoque) to download the PDF file from that website.
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


class TestSpider(scrapy.Spider):
    name = 'test'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.ons.gov.uk',
            callback=self.parse,
            wait_time=3,
            screenshot=True
        )

    def parse(self, response):
        driver = response.meta['driver']
        driver.save_screenshot('screenshot.png')
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys("Education and childcare")
        driver.save_screenshot('screenshot_1.png')
        click_button = driver.find_element_by_xpath('//*[@id="nav-search-submit"]').click()
        driver.save_screenshot('screenshot_2.png')
        click_button = driver.find_element_by_xpath('//*[@id="results"]/div[1]/div[2]/div[1]/h3/a/span').click()
        click_button = driver.find_element_by_xpath('//*[@id="main"]/div[2]/div[1]/section/div/div[1]/div/div[2]/h3/a/span').click()
        click_button = driver.find_element_by_xpath('//*[@id="main"]/div[2]/div/div[1]/div[2]/p[2]/a').click()
Also, I am not sure which settings.py file I should add the following to (the code needs it in order to run):
# Middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}

# Selenium
from shutil import which
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
SELENIUM_DRIVER_ARGUMENTS = ['--headless']
I use Spyder via Anaconda 3, and I have five different settings.py files. Here are their locations:
"C:\Users\David\anaconda3\Lib\site-packages\scrapy\commands\settings.py"
"C:\Users\David\anaconda3\pkgs\bokeh-2.3.2-py38haa95532_0\Lib\site-packages\bokeh\settings.py"
"C:\Users\David\anaconda3\Lib\site-packages\bokeh\settings.py"
"C:\Users\David\anaconda3\pkgs\isort-5.8.0-pyhd3eb1b0_0\site-packages\isort\settings.py"
"C:\Users\David\anaconda3\Lib\site-packages\isort\settings.py"
Which of these settings.py files should I save the second code block to?
Scrapy can download PDF files/images using its media/image pipelines. Look at the output below: it contains only the link to the PDF, not the file itself. You will notice that the URL does not end with a .pdf extension; it is just a link. If it ended in .pdf, it would point to a file, and only then could I download the PDF from there with Scrapy's media pipeline. If you click the output URL, the download starts manually. I don't know whether the /pdf endpoint can be turned into .pdf and then downloaded.
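For reference, a minimal sketch of the media-pipeline approach mentioned above, assuming a direct file URL that actually ends in .pdf (the URL below is hypothetical; the settings use Scrapy's built-in FilesPipeline):

# In the Scrapy project's settings.py -- enable the built-in Files pipeline
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
FILES_STORE = 'downloads'  # directory where downloaded files are saved

# In a spider callback, yield items with a 'file_urls' list; the pipeline
# fetches each URL and stores the file under FILES_STORE.
def parse(self, response):
    yield {'file_urls': ['https://example.com/some-report.pdf']}  # hypothetical direct .pdf link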
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


class TestSpider(scrapy.Spider):
    name = 'test'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.ons.gov.uk',
            callback=self.parse,
            wait_time=3,
            screenshot=True
        )

    def parse(self, response):
        driver = response.meta['driver']
        # driver.save_screenshot('screenshot.png')
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys("Education and childcare")
        # driver.save_screenshot('screenshot_1.png')
        click_button = driver.find_element_by_xpath('//*[@id="nav-search-submit"]').click()
        # driver.save_screenshot('screenshot_2.png')
        click_button = driver.find_element_by_xpath('//*[@id="results"]/div[1]/div[2]/div[1]/h3/a/span').click()
        click_button = driver.find_element_by_xpath('//*[@id="main"]/div[2]/div[1]/section/div/div[1]/div/div[2]/h3/a/span').click()
        # No need to click the download link; clicking it cannot save the file here
        # click_button = driver.find_element_by_xpath('//*[@id="main"]/div[2]/div/div[1]/div[2]/p[2]/a').click()
        # driver.save_screenshot('screenshot_pdf.png')
        pdf_url = driver.find_element_by_xpath('//*[@class="link-complex js-pdf-dl-link"]').get_attribute('href')
        yield {'url': pdf_url}
Output:
{'url': 'https://www.ons.gov.uk/peoplepopulationandcommunity/educationandchildcare/articles/remoteschoolingthroughthecoronaviruscovid19pandemicengland/april2020tojune2021/pdf'}
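If you want to test whether that /pdf endpoint actually returns a PDF document, one possible (unverified) follow-up is to request the extracted URL with a plain scrapy.Request and write the response body to disk instead of yielding the URL as an item; the save_pdf callback and the output file name below are hypothetical:

    def parse(self, response):
        driver = response.meta['driver']
        # ... same navigation steps as in the spider above ...
        pdf_url = driver.find_element_by_xpath('//*[@class="link-complex js-pdf-dl-link"]').get_attribute('href')
        # Hand the link back to Scrapy's downloader instead of yielding it as an item.
        yield scrapy.Request(pdf_url, callback=self.save_pdf)

    def save_pdf(self, response):
        # Write whatever the endpoint returns to a local file; this assumes
        # (but does not guarantee) that the response body really is a PDF.
        with open('report.pdf', 'wb') as f:
            f.write(response.body)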