Scrapy pipeline extracting in the wrong csv format
My Hacker News spider is outputting all the results on one line, instead of one result per line, as seen here.
Here is my code.
import scrapy
import string
import urlparse
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors import LinkExtractor


class HnItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    score = scrapy.Field()


class HnSpider(scrapy.Spider):
    name = 'hackernews'
    allowed_domains = ["news.ycombinator.com"]
    start_urls = ["https://news.ycombinator.com/"]

    def parse(self, response):
        sel = response
        selector_list = response.xpath('.//table[@class="itemlist"]')

        for sel in selector_list:
            item = HnItem()
            item['title'] = sel.xpath('.//td[@class="title"]/text()').extract()
            item['link'] = sel.xpath('.//tr[@class="athing"]/td[3]/a/@href').extract()
            item['score'] = sel.xpath('.//td[@class="subtext"]/span/text()').extract()
            yield item
And my settings.py file:
BOT_NAME = 'hnews'
SPIDER_MODULES = ['hnews.spiders']
NEWSPIDER_MODULE = 'hnews.spiders'
USER_AGENT = 'hnews (+http://www.yourdomain.com)'
FEED_URI = '/used/scrapy/hnews/%(name)s/%(time)s.csv'
FEED_FORMAT = 'csv'
I've tried implementing this among many other solutions, but no luck so far. I'm still very new at this, so bear with me if possible.
This is happening because your item pipeline is getting all the lists at once. For example, item['title'] grabs the list of every title on the page in one go; that whole list is then handed to the item pipeline and written straight into the csv file.
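Roughly speaking, your parse() yields a single item that looks like the sketch below (the values are made up purely for illustration). The CSV feed exporter then writes that one item as one row, squeezing each list into a single cell:

# Illustrative only -- the values are invented, not real Hacker News data.
item = HnItem()
item['title'] = ['Story A', 'Story B', 'Story C']       # every title on the page at once
item['link'] = ['http://a', 'http://b', 'http://c']     # every link on the page at once
item['score'] = ['10 points', '20 points', '5 points']  # every score on the page at once
yield item  # exported as a single csv row, one cell per field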
The solution is to iterate over the lists and yield the values to the item pipeline one at a time. Here is the modified code:
import scrapy
from scrapy.selector import Selector


class HnItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    score = scrapy.Field()


class HnSpider(scrapy.Spider):
    name = 'hackernews'
    allowed_domains = ["news.ycombinator.com"]
    start_urls = ["https://news.ycombinator.com/"]

    def parse(self, response):
        sel = Selector(response)
        item = HnItem()
        title_list = sel.xpath('.//td[@class="title"]/a/text()').extract()[:-2]
        link_list = sel.xpath('.//tr[@class="athing"]/td[3]/a/@href').extract()
        score_list = sel.xpath('.//td[@class="subtext"]/span/text()').extract()

        for x in range(0, len(title_list)):
            item['title'] = title_list[x]
            item['link'] = link_list[x]
            item['score'] = score_list[x]
            yield item
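For what it's worth, the same one-item-per-row idea reads a little more naturally with zip() and a fresh HnItem per iteration. This is only a sketch under the same XPath assumptions as above, not something I've run against the live site:

    def parse(self, response):
        sel = Selector(response)
        titles = sel.xpath('.//td[@class="title"]/a/text()').extract()[:-2]
        links = sel.xpath('.//tr[@class="athing"]/td[3]/a/@href').extract()
        scores = sel.xpath('.//td[@class="subtext"]/span/text()').extract()

        # zip() stops at the shortest list, so a missing score can't raise an IndexError
        for title, link, score in zip(titles, links, scores):
            item = HnItem()  # a fresh item per story -> one csv row per story
            item['title'] = title
            item['link'] = link
            item['score'] = score
            yield item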