scrapy 解析第一页

scrapy parsing first page

我正在使用 Scrapy 0.24.4，尝试从 threatexpert 抓取一些信息。我几乎已经成功了：除了第一页（即 start_url）之外，其他所有页面上的信息我都能抓取到。我试过 parse_start_url 并添加规则，但都无法正常工作。我确定这只是我忽略了什么小问题，但我整个周末都在研究它，需要休息一下了。如果有人有任何建议，我将不胜感激。哦，我确实让它在 start_url 范围内工作过，但看起来有点不雅，我正在努力学习正确的做法。非常感谢！！

import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse

# Spider for threatexpert.com report listings.
# NOTE(review): this pasted snippet lost its indentation — `name`,
# `start_urls`, and the `parse*` methods below were originally indented
# inside the class body; as pasted they sit at module level.
class ThreatExpertSpider(scrapy.Spider):
# Spider identifier used by `scrapy crawl threatexpert`.
name = 'threatexpert'
# Listing page the crawl starts from (tf=2: reports from the last 2 days).
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]

def parse(self, response):
    print '++++++++++++++++++++++++pull all page links+++++++++++++++++++++++'
    urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
    for url in urls:
        url = urlparse.urljoin(response.url, url)
        self.log('Found follow url: %s' % url)
        yield scrapy.Request(url, callback = self.parse_links)


def parse_links(self, response):
    print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
    urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
    for url in urls:
        url = urlparse.urljoin(response.url, url)
        self.log('Found follow url: %s' % url)
        yield scrapy.Request(url, callback = self.parse_items)


def parse_items(self, response):
    """Extract the sample's MD5 from a single report page and yield an item."""
    self.log("Hi, this is an item page! %s" % response.url)
    report = ThreatExpert()
    # The MD5 lives in the first list entry of the report body; pull it
    # out of the "File MD5: ..." text with a regex.
    report['md5'] = response.xpath(
        '//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
    yield report

请参考下面的代码，这对我有用。如果您有任何疑问，请在评论中告知，我会更新答案。

from scrapy.spider import BaseSpider
from scrapy.http import Request
import re
from urlparse import urljoin
from scrapy.selector import HtmlXPathSelector
from threatexpert.items import ThreatExpert
import inspect
# Answer's spider: same crawl as the question's, built on the legacy
# BaseSpider API (Scrapy 0.24 era).
# NOTE(review): the `parse*` methods below lost their class indentation
# in the paste; they belong inside this class body.
class usmallspider(BaseSpider):
    # Spider identifier used by `scrapy crawl threatexpert`.
    name = 'threatexpert'
    # Listing page the crawl starts from.
    start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
    for url in urls:
        url = urljoin(response.url, url)
        print url
        if url:
            yield Request(url, callback=self.parse_links)

def parse_links(self, response):
    """Queue a request for every per-sample (md5) report link on the page.

    Fix: removed the unused ``hxs = HtmlXPathSelector(response)`` local —
    extraction uses ``response.xpath`` directly.
    """
    urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
    for url in urls:
        url = urljoin(response.url, url)
        # Guard kept for behavior parity (urljoin result is always truthy).
        if url:
            yield Request(url, callback = self.parse_items)


def parse_items(self, response):
    """Extract the report's MD5 and return it in a one-item list.

    Bug fix: the original instantiated ``MallUk1Item``, a class that is
    never imported or defined in this file (copy-paste leftover from
    another project) and raises ``NameError`` at runtime. The imported
    ``ThreatExpert`` item class is the intended one — it is what every
    other snippet on this page populates. Also removed the unused
    ``hxs = HtmlXPathSelector(response)`` local.
    """
    itm = []
    item = ThreatExpert()
    item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
    itm.append(item)
    return itm

非常感谢您的回复，它让我得以继续我的工作！问题其实只是基类用错了：我没有使用 class ThreatExpertSpider(scrapy.Spider)，而是改用了 class ThreatExpertSpider(CrawlSpider)。我仍然不完全确定它的工作原理，但它确实能正常运行。我知道应该去好好读文档（RTFM，哈哈），我还在学习中。下面是对我有效的代码，供有同样需求的人参考。

import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse

class ThreatExpertSpider(CrawlSpider):
    name = 'threatexpert'
    start_urls = ["http://www.threatexpert.com/reports.aspx?tf=3&sl=1"]

    rules = (
        Rule(SgmlLinkExtractor(allow=r'page=\d'), callback='parse_links', follow=True),
        )

    def parse_start_url(self, response):
        print '++++++++++++++++++++++++parse_start_url+++++++++++++++++++++++'
        return self.parse_items(response)
        # urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
        # for url in urls:
        #     url = urlparse.urljoin(response.url, url)
        #     self.log('Found follow url: %s' % url)
        #     yield scrapy.Request(url, callback = self.parse_links)


    def parse_links(self, response):
        print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
        urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
        for url in urls:
            url = urlparse.urljoin(response.url, url)
            self.log('Found follow url: %s' % url)
            yield scrapy.Request(url, callback = self.parse_items)


    def parse_items(self, response):
        self.log("Hi, this is an item page! %s" % response.url)
        item = ThreatExpert()
        item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")

        # item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
        # if item['callback']:
        #     item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
        # else:
        #     del item['callback']
        yield item