使用 Regex 仅 Select 索引页面
Using Regex to Select Index Page Only
抓取特定网站时,能否用正则表达式来 select 索引页面?我已经可以 select 某些页面,但还需要抓取这些页面之上的索引页面。
我似乎想不出正确的表达方式。基本上,我想抓取索引页面、联系页面、关于页面和广告页面来查找联系信息。
这是代码。
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse
class MailItem(Item):
    """Contact information collected from a single page."""
    url = Field()    # page the data came from
    title = Field()  # <title> text of the page
    desc = Field()   # content of the meta "description" tag
    mail = Field()   # e-mail addresses found in the page body
class MailSpider(CrawlSpider):
    """Crawl contact/about/advertise pages of each site listed in a CSV
    file (plus the start/index pages themselves) and extract e-mail
    addresses from them."""
    name = "marksey"
    # Hostnames whose pages have already been parsed once; process_links()
    # consults this to stop following further links into the same site.
    parsed_hostnames = set()

    # Rules only fire on links *extracted* from responses, never on the
    # start_urls responses themselves — those are handled by
    # parse_start_url() below.
    rules = [
        Rule(SgmlLinkExtractor(allow=(r'/contact', r'/about', r'/advertise',)),
             callback='parse_item', follow=True)
    ]

    start_urls = []
    allowed_domains = []
    # Raw string so the backslashes in the Windows path can never be
    # interpreted as escape sequences (identical bytes in Python 2, and a
    # SyntaxError avoided under Python 3).
    with open(r'C:\Users\Vasily\MyStuff\emailtest\emailtest\scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)  # skip the header row
        for row in reader:
            url = row[0].strip()
            if url != "":  # url is already stripped; no second strip() needed
                start_urls.append(url)
                hostname = urlparse(url).hostname
                # Guard: urlparse() returns hostname=None for malformed
                # URLs; don't let None into allowed_domains.
                if hostname:
                    allowed_domains.append(hostname)

    def parse_start_url(self, response):
        """Route start-URL (index page) responses through parse_item().

        Bug fix for the question: CrawlSpider's rules only apply to links
        extracted from pages, so without this override the index pages
        listed in start_urls were crawled but never scraped.
        """
        return self.parse_item(response)

    def parse_item(self, response):
        """Extract title, meta description and e-mail addresses from a page.

        Returns a list of MailItem; records the response hostname in
        parsed_hostnames as a side effect.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        for sel in response.xpath('//html/head'):
            item = MailItem()
            item['title'] = sel.xpath('title/text()').extract()
            item['desc'] = sel.xpath('//meta[@name=\'description\']/@content').extract()
            item['url'] = response.url
            item['mail'] = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
            if not item['mail']:
                # Fall back to the page URL so the 'mail' field is never empty.
                item['mail'] = item['url']
            items.append(item)
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)
        return items

    def process_links(self, links):
        """Drop links pointing at hosts that have already been parsed."""
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
你需要做的是在 parse_start_url() 回调中调用 parse_item()——这样来自 start_urls 的 url(我假设它们是索引页)也会被解析:
class MailSpider(CrawlSpider):
    ...

    # CrawlSpider calls parse_start_url() for every response coming from
    # start_urls; delegating to parse_item() means the index pages get
    # scraped with the same logic as the rule-matched pages.
    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        ...
另请参阅:
- Scrapy CrawlSpider doesn't crawl the first landing page
抓取特定网站时,能否用正则表达式来 select 索引页面?我已经可以 select 某些页面,但还需要抓取这些页面之上的索引页面。
我似乎想不出正确的表达方式。基本上,我想抓取索引页面、联系页面、关于页面和广告页面来查找联系信息。
这是代码。
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse
class MailItem(Item):
    """Contact information collected from a single page."""
    url = Field()    # page the data came from
    title = Field()  # <title> text of the page
    desc = Field()   # content of the meta "description" tag
    mail = Field()   # e-mail addresses found in the page body
class MailSpider(CrawlSpider):
    """Crawl contact/about/advertise pages of each site listed in a CSV
    file (plus the start/index pages themselves) and extract e-mail
    addresses from them."""
    name = "marksey"
    # Hostnames whose pages have already been parsed once; process_links()
    # consults this to stop following further links into the same site.
    parsed_hostnames = set()

    # Rules only fire on links *extracted* from responses, never on the
    # start_urls responses themselves — those are handled by
    # parse_start_url() below.
    rules = [
        Rule(SgmlLinkExtractor(allow=(r'/contact', r'/about', r'/advertise',)),
             callback='parse_item', follow=True)
    ]

    start_urls = []
    allowed_domains = []
    # Raw string so the backslashes in the Windows path can never be
    # interpreted as escape sequences (identical bytes in Python 2, and a
    # SyntaxError avoided under Python 3).
    with open(r'C:\Users\Vasily\MyStuff\emailtest\emailtest\scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)  # skip the header row
        for row in reader:
            url = row[0].strip()
            if url != "":  # url is already stripped; no second strip() needed
                start_urls.append(url)
                hostname = urlparse(url).hostname
                # Guard: urlparse() returns hostname=None for malformed
                # URLs; don't let None into allowed_domains.
                if hostname:
                    allowed_domains.append(hostname)

    def parse_start_url(self, response):
        """Route start-URL (index page) responses through parse_item().

        Bug fix for the question: CrawlSpider's rules only apply to links
        extracted from pages, so without this override the index pages
        listed in start_urls were crawled but never scraped.
        """
        return self.parse_item(response)

    def parse_item(self, response):
        """Extract title, meta description and e-mail addresses from a page.

        Returns a list of MailItem; records the response hostname in
        parsed_hostnames as a side effect.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        for sel in response.xpath('//html/head'):
            item = MailItem()
            item['title'] = sel.xpath('title/text()').extract()
            item['desc'] = sel.xpath('//meta[@name=\'description\']/@content').extract()
            item['url'] = response.url
            item['mail'] = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
            if not item['mail']:
                # Fall back to the page URL so the 'mail' field is never empty.
                item['mail'] = item['url']
            items.append(item)
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)
        return items

    def process_links(self, links):
        """Drop links pointing at hosts that have already been parsed."""
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
你需要做的是在 parse_start_url() 回调中调用 parse_item()——这样来自 start_urls 的 url(我假设它们是索引页)也会被解析:
class MailSpider(CrawlSpider):
    ...

    # CrawlSpider calls parse_start_url() for every response coming from
    # start_urls; delegating to parse_item() means the index pages get
    # scraped with the same logic as the rule-matched pages.
    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        ...
另请参阅:
- Scrapy CrawlSpider doesn't crawl the first landing page