使用 Regex 仅 Select 索引页面

Question

抓取特定网站时，索引页上是否有正则表达式 select？我是 select 某些页面，但还只需要在这些页面之上的索引页面。

我似乎想不出正确的表达方式。基本上，我想抓取索引页面、联系页面、关于页面和广告页面来查找联系信息。

这是代码。

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse

class MailItem(Item):
    desc = Field()
    title = Field()
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "marksey"

    parsed_hostnames= set()

rules = [
    Rule(SgmlLinkExtractor(allow=(r'/contact',r'/about',r'/advertise',)), callback='parse_item', follow=True)
]

###r'^https?://[^/]+(/(\?.*|index\.php(\?.*)?)?)?$',

start_urls = []
allowed_domains = []
with open('C:\Users\Vasily\MyStuff\emailtest\emailtest\scraped_data.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    next(reader)
    for row in reader:
        url = row[0].strip()
        if (url.strip() != ""):
            start_urls.append(url)
            hostname = urlparse(url).hostname
            allowed_domains.append(hostname)

def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    items = []

for sel in response.xpath('//html/head'):
    item = MailItem()
    item['title'] = sel.xpath('title/text()').extract()
    item['desc'] = sel.xpath('//meta[@name=\'description\']/@content').extract()
    item['url'] = response.url
    item['mail'] = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')

    if not item['mail']:
        item['mail'] = item['url']

    items.append(item)
    hostname = urlparse(response.url).hostname
    self.parsed_hostnames.add(hostname)

return items

def process_links(self, links):
    return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]

Answer 1

你需要做的是从 parse_start_url() 调用 parse_item() 回调 - 这样你也可以解析来自 start_urls 的 url，这是我我假设是一个索引页：

class MailSpider(CrawlSpider):
    ...

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        ...

另请参阅：

Scrapy CrawlSpider doesn't crawl the first landing page

使用 Regex 仅 Select 索引页面

Using Regex to Select Index Page Only

python

regex

scrapy