Scrapy crawl spider does not download files?
So I made a crawl spider that scrapes this website (https://minerals.usgs.gov/science/mineral-deposit-database/#products), following every link on that page. It scrapes the title from each linked page and should download the files as well. But that does not happen, and there is no error indication in the logs!
Log sample:
2018-11-19 18:20:12 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.sciencebase.gov/catalog/item/5a1492c3e4b09fc93dcfd574>
{'date': [datetime.datetime(2018, 11, 19, 18, 20, 12, 209865)],
'file':
['https://www.sciencebase.gov/catalog/file/get/5a1492c3e4b09fc93dcfd574?
f=__disk__d7%2F26%2Fdb%2Fd726dbd9030e7554a4ef13cb56f53983f407eb7d',
'https://www.sciencebase.gov/catalog/file/get/5a1492c3e4b09fc93dcfd574?
f=__disk__d7%2F26%2Fdb%2Fd726dbd9030e7554a
4ef13cb56f53983f407eb7d&transform=1',
'https://www.sciencebase.gov/catalog/file/get/5a1492c3e4b09fc93dcfd574?
f=__disk__72%2F6b%2F7d%2F726b7dd547ce9805a97e2464dc1f4646b2a16cfb',
'https://www.sciencebase.gov/catalog/file/get/5a1492c3e4b09fc93dcfd574?
f=__disk__d4%2F87%2F6b%2Fd4876b385bc9ac2af3c9221aee4ff7a5a88f201a',
'https://www.sciencebase.gov/catalog/file/get/5a1492c3e4b09fc93dcfd574?
f=__disk__12%2Fd9%2F4f%2F12d94f844998c4a4eaf1cedd80b70f36ed960a2c',
'https://www.sciencebase.gov/catalog/file/get/5a1492c3e4b09fc93dcfd574?
f=__disk__12%2Fd9%2F4f%2F12d94f844998c4a4eaf1cedd80b70f36ed960a2c',
'https://www.sciencebase.gov/catalog/file/get/5a1492c3e4b09fc93dcfd574?
f=__disk__e3%2Ff0%2F95%2Fe3f0958d05c1240724b58709196a87492b85d8d4',
'https://www.sciencebase.gov/catalog/file/get/5a1492c3e4b09fc93dcfd574?
f=__disk__e3%2Ff0%2F95%2Fe3f0958d05c1240724b58709196a87492b85d8d4',
'https://www.sciencebase.gov/catalog/file/get/5a1492c3e4b09fc93dcfd574?
facet=USGS_TopoMineSymbols_ver2_mapservice.sd',
'https://www.sciencebase.gov/catalog/file/get/5a1492c3e4b09fc93dcfd574?
f=__disk__b0%2F64%2Fd3%2Fb064d3465149780209ef624db57830e40edb9115'],
'name': ['Prospect- and Mine-Related Features from U.S. Geological Survey '
'7.5- and 15-Minute Topographic Quadrangle Maps of the United '
'States'],
'project': ['us_deposits'],
'server': ['DESKTOP-9CUE746'],
'spider': ['deposits'],
'url':
['https://www.sciencebase.gov/catalog/item/5a1492c3e4b09fc93dcfd574']}
2018-11-19 18:20:12 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 7312,
'downloader/request_count': 23,
'downloader/request_method_count/GET': 23,
'downloader/response_bytes': 615330,
'downloader/response_count': 23,
'downloader/response_status_count/200': 13,
'downloader/response_status_count/301': 1,
'downloader/response_status_count/302': 9,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 11, 19, 17, 20, 12, 397317),
'item_scraped_count': 9,
'log_count/DEBUG': 34,
'log_count/INFO': 7,
'offsite/domains': 1,
'offsite/filtered': 2,
'request_depth_max': 1,
'response_received_count': 13,
'scheduler/dequeued': 19,
'scheduler/dequeued/memory': 19,
'scheduler/enqueued': 19,
'scheduler/enqueued/memory': 19,
'start_time': datetime.datetime(2018, 11, 19, 17, 20, 7, 541186)}
2018-11-19 18:20:12 [scrapy.core.engine] INFO: Spider closed (finished)
Spider:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import datetime
import socket
from us_deposits.items import DepositsusaItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose
from urllib.parse import urlparse
from urllib.parse import urljoin
class DepositsSpider(CrawlSpider):
    name = 'deposits'
    allowed_domains = ['doi.org']
    start_urls = ['https://minerals.usgs.gov/science/mineral-deposit-database/#products', ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[@id="products"][1]/p/a'),
             callback='parse_x'),
    )

    def parse_x(self, response):
        i = ItemLoader(item=DepositsusaItem(), response=response)
        i.add_xpath('name', '//*[@class="container"][1]/header/h1/text()')
        i.add_xpath('file', '//span[starts-with(@data-url, "/catalog/file/get/")]/@data-url',
                    MapCompose(lambda i: urljoin(response.url, i)))
        i.add_value('url', response.url)
        i.add_value('project', self.settings.get('BOT_NAME'))
        i.add_value('spider', self.name)
        i.add_value('server', socket.gethostname())
        i.add_value('date', datetime.datetime.now())

        return i.load_item()
Settings:
BOT_NAME = 'us_deposits'
SPIDER_MODULES = ['us_deposits.spiders']
NEWSPIDER_MODULE = 'us_deposits.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
    'us_deposits.pipelines.UsDepositsPipeline': 1,
}
FILES_STORE = {
    'C:/Users/User/Documents/Python WebCrawling Learning Projects'
}
Any ideas?
Take a closer look at the Files Pipeline documentation:
In a Spider, you scrape an item and put the URLs of the desired into a
file_urls field.
You need to store the URLs of the files you want to download in a field named file_urls, not file.
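For example, here is a minimal sketch of that change applied to your loader-based spider (assuming DepositsusaItem is a plain scrapy.Item; the files field, the pipeline entry and the priorities shown follow the standard FilesPipeline conventions and are illustrative, not taken from your project):

import scrapy

class DepositsusaItem(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    project = scrapy.Field()
    spider = scrapy.Field()
    server = scrapy.Field()
    date = scrapy.Field()
    file_urls = scrapy.Field()  # URLs the FilesPipeline reads and downloads
    files = scrapy.Field()      # download results written back by the pipeline

Then in parse_x, load the URLs into file_urls instead of file:

        i.add_xpath('file_urls',
                    '//span[starts-with(@data-url, "/catalog/file/get/")]/@data-url',
                    MapCompose(lambda u: urljoin(response.url, u)))

and enable the built-in pipeline next to your own in settings.py:

ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
    'us_deposits.pipelines.UsDepositsPipeline': 300,
}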
This minimal spider works for me:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class MySpider(CrawlSpider):
    name = 'usgs.gov'
    allowed_domains = ['doi.org']
    start_urls = ['https://minerals.usgs.gov/science/mineral-deposit-database/#products']

    custom_settings = {
        'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1},
        'FILES_STORE': '/my/valid/path/',
    }

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@id="products"]/p/a'), callback='parse_x'),
    )

    def parse_x(self, response):
        yield {
            'file_urls': [response.urljoin(u) for u in response.xpath('//span[starts-with(@data-url, "/catalog/file/get/")]/@data-url').extract()],
        }
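One more thing to check: FILES_STORE here is a plain string path, while in your settings it is wrapped in braces, which makes it a Python set rather than a path; the files pipeline expects a string pointing at a writable directory.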