Why does my CrawlerProcess not have the function "crawl"?
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from items import BackpageItem, CityvibeItem
from scrapy.shell import inspect_response
import re
import time
import sys

class MySpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['www.example.com']

    # Set last_page to decide how many pages are crawled
    last_page = 10
    start_urls = ['http://www.example.com/washington/?page=%s' % page for page in xrange(1, last_page)]

    rules = (
        # Follow all links inside <div class="cat"> and call parse_item on each link
        Rule(LinkExtractor(
            restrict_xpaths=('//a[@name="listing_link"]')),
            callback='parse_item'),
    )

    # Extract relevant text from the website into an ExampleItem
    def parse_item(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('string(//h2[@class="post-title"]/text())').extract()
        item['desc'] = response.xpath('string(//div[@class="section post-body"]/text())').extract()
        item['url'] = response.url
        item['location'] = response.xpath('string(//div[@class="posting"]/div[2]/text())').extract()
        item['posted_date'] = response.xpath('string(//div[@class="post-date"]/span/text())').extract()#.re("(?<=Posted\s*).*")
        item['crawled_date'] = time.strftime("%c")
        # not sure how to get the other image urls right now
        item['image_urls'] = response.xpath('string(//div[@class="section post-contact-container"]/div/div/img/@src)').extract()
        # I can't find this section on any pages right now
        item['other_ad_urls'] = response.xpath('//a[@name="listing_link"]/@href').extract()
        item['phone_number'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Phone")]/following-sibling::a/text()').extract())
        item['email'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Email")]/following-sibling::a/text()').extract())
        item['website'] = "".join(response.xpath('//div[@class="post-info limit"]/span[contains(text(), "Website")]/following-sibling::a/text()').extract())
        item['name'] = response.xpath('//div[@class="post-name"]/text()').extract()
        # uncomment for debugging
        #inspect_response(response, self)
        return item

# process1 = CrawlerProcess({
#     'ITEM_PIPELINES': {
#         #'scrapy.contrib.pipeline.images.ImagesPipeline': 1
#         'backpage.pipelines.GeolocationPipeline': 4,
#         'backpage.pipelines.LocationExtractionPipeline': 3,
#         'backpage.pipelines.BackpagePipeline': 5
#     }
# });

process1 = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process1.crawl(MySpider)
process1.start()
My spider works fine when I run it from the command line with

scrapy crawl example

but I need to run multiple spiders, so I want to put them all in one script and use CrawlerProcess. When I try to run the script above, I get the error:

AttributeError: 'CrawlerProcess' object has no attribute 'crawl'

This is Scrapy version 0.24.6. All the items and pipelines are correct, because the spider works from the command line.
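For reference, what I am ultimately trying to do is drive several spiders from one script. A minimal sketch, assuming a Scrapy release (1.0 or later) where CrawlerProcess does expose crawl; SpiderOne and SpiderTwo are placeholder spider classes, not part of the project above:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Placeholder spiders; in a real project these would be classes like MySpider.
from myproject.spiders import SpiderOne, SpiderTwo

process = CrawlerProcess(get_project_settings())
process.crawl(SpiderOne)   # schedule the first spider
process.crawl(SpiderTwo)   # schedule the second spider
process.start()            # blocks until all scheduled crawls finish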
There is (or was?) a compatibility problem between Scrapy and Scrapyd. I needed to run Scrapy 0.24 with Scrapyd 1.0.1.
Here is the issue on GitHub:
https://github.com/scrapy/scrapyd/issues/100#issuecomment-115268880
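Since the problem comes down to which Scrapy release is actually installed alongside Scrapyd, it can help to confirm the version the script really imports. A quick check (not part of the original answer) is:

import scrapy
print(scrapy.__version__)   # reports the Scrapy release the script is actually using

or, equivalently, run "scrapy version" from the command line inside the same environment.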