Scrapy Spider 部分抓取内容并留下其他内容
Scrapy spider scrapes only part of the content and leaves out the rest
我定义了一个 scrapy spider,它可以抓取所有的名字和一些故事,而定义的 xpath 无法捕获故事,来自 https://www.cancercarenorthwest.com/survivor-stories,
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.selector import XmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from cancerstories.items import CancerstoriesItem
class LungcancerSpider(CrawlSpider):
    """Crawl survivor stories starting from coloncancercoalition.org.

    Links matching the dated-post URL pattern in ``rules`` are followed and
    handed to ``parse_page``, which loads one ``CancerstoriesItem`` with a
    ``name`` and a ``story`` field per page.
    """

    name = "lungcancer"
    allowed_domains = ["coloncancercoalition.org"]
    start_urls = (
        'http://www.coloncancercoalition.org/community/stories/survivor-stories/',
    )

    rules = (
        # URLs of the form /<digits>/<digits>/<digits>/<slug> are parsed by
        # parse_page and also followed to discover further links.
        Rule(
            SgmlLinkExtractor(allow=[r'http://www.coloncancercoalition.org/\d+/\d+/\d+/\w+']),
            callback='parse_page',
            follow=True,
        ),
    )

    def parse_page(self, response):
        """Yield one item with the page heading and the joined story text.

        Args:
            response: the downloaded page for a URL matched by ``rules``.

        Yields:
            The item produced by the loader, with ``name`` and ``story``
            populated from the XPath expressions below.
        """
        # Imported locally so this block does not depend on a file-level
        # import of Join being present.
        from scrapy.loader.processors import Join

        loader = ItemLoader(item=CancerstoriesItem(), response=response)
        loader.add_xpath('name', '/html/body/div[4]/div[1]/div[1]/div/h1/text()')
        # The former '//../div/div/p/text()' was not anchored to the post
        # body and missed stories; select the paragraphs under the
        # post-content container and join them into a single string.
        loader.add_xpath(
            'story',
            '//div[@class="post-content"]/div/p/text()',
            Join(" "),
        )
        yield loader.load_item()
我觉得你需要把 post 内容下所有段落的文字拼接起来:
Li.add_xpath('story', '//div[@class="post-content"]/div/p/text()', Join(" "))
其中 Join() 是一个需要导入的输出处理器:
from scrapy.loader.processors import Join
我定义了一个 scrapy spider,它可以抓取所有的名字和一些故事,而定义的 xpath 无法捕获故事,来自 https://www.cancercarenorthwest.com/survivor-stories,
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.selector import XmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from cancerstories.items import CancerstoriesItem
class LungcancerSpider(CrawlSpider):
    """Crawl survivor stories starting from coloncancercoalition.org.

    Links matching the dated-post URL pattern in ``rules`` are followed and
    handed to ``parse_page``, which loads one ``CancerstoriesItem`` with a
    ``name`` and a ``story`` field per page.
    """

    name = "lungcancer"
    allowed_domains = ["coloncancercoalition.org"]
    start_urls = (
        'http://www.coloncancercoalition.org/community/stories/survivor-stories/',
    )

    rules = (
        # URLs of the form /<digits>/<digits>/<digits>/<slug> are parsed by
        # parse_page and also followed to discover further links.
        Rule(
            SgmlLinkExtractor(allow=[r'http://www.coloncancercoalition.org/\d+/\d+/\d+/\w+']),
            callback='parse_page',
            follow=True,
        ),
    )

    def parse_page(self, response):
        """Yield one item with the page heading and the joined story text.

        Args:
            response: the downloaded page for a URL matched by ``rules``.

        Yields:
            The item produced by the loader, with ``name`` and ``story``
            populated from the XPath expressions below.
        """
        # Imported locally so this block does not depend on a file-level
        # import of Join being present.
        from scrapy.loader.processors import Join

        loader = ItemLoader(item=CancerstoriesItem(), response=response)
        loader.add_xpath('name', '/html/body/div[4]/div[1]/div[1]/div/h1/text()')
        # The former '//../div/div/p/text()' was not anchored to the post
        # body and missed stories; select the paragraphs under the
        # post-content container and join them into a single string.
        loader.add_xpath(
            'story',
            '//div[@class="post-content"]/div/p/text()',
            Join(" "),
        )
        yield loader.load_item()
我觉得你需要把 post 内容下所有段落的文字拼接起来:
Li.add_xpath('story', '//div[@class="post-content"]/div/p/text()', Join(" "))
其中 Join() 是一个需要导入的输出处理器:
from scrapy.loader.processors import Join