Example of two Scrapy spiders, one has a memory leak and I can't find it
This is driving me insane. It has pushed me to consolidate and simplify a lot of my code, but I just can't pin down the problem. Here are two spiders I wrote. The top one has a memory leak that makes memory grow slowly until it is full.
They are almost identical and they use the same items and everything else outside of the spider, so I don't think the problem is anywhere else in my code. I have also isolated bits of code here and there and tried deleting variables towards the end. I've looked over the Scrapy docs and I'm still stumped. Does anyone have any magic that works?
import scrapy
from wordscrape.items import WordScrapeItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import json


class EnglishWikiSpider(CrawlSpider):
    name = 'englishwiki'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        'http://en.wikipedia.org/wiki/'
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=('/wiki/', )), callback='parse_it', follow=True),
    )

    def parse_it(self, response):
        the_item = WordScrapeItem()

        # This takes all the text that is in that div and extracts it, only the text, not html tags (see: //text()),
        # if it meets the conditions of my regex
        english_text = response.xpath('//*[@id="mw-content-text"]//text()').re(ur'[a-zA-Z\'-]+')

        english_dict = {}
        for i in english_text:
            if len(i) > 1:
                word = i.lower()
                if word in english_dict:
                    english_dict[word] += 1
                else:
                    english_dict[word] = 1

        # Dump into a json string and put it in the word item, it will be ['word': {<<jsondict>>}, 'site': url, ...]
        jsondump = json.dumps(english_dict)
        the_item['word'] = jsondump
        the_item['site'] = response.url
        return the_item
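As a side note, the counting loop in parse_it is the pattern that collections.Counter from the standard library implements; a minimal sketch of the same logic (illustrative only, not the original code):

from collections import Counter

def count_words(tokens):
    # Lower-case every token longer than one character and count occurrences,
    # mirroring the dict-based loop in parse_it above.
    return Counter(t.lower() for t in tokens if len(t) > 1)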
The second spider, the stable one:
import scrapy
from wordscrape.items import WordScrapeItem
import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import json


class NaverNewsSpider(CrawlSpider):
    name = 'navernews'
    allowed_domains = ['news.naver.com']
    start_urls = [
        'http://news.naver.com',
        'http://news.naver.com/main/read.nhn?oid=001&sid1=102&aid=0007354749&mid=shm&cid=428288&mode=LSD&nh=20150114125510',
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=('main/read\.nhn', )), callback='parse_it', follow=True),
    )

    def parse_it(self, response):
        the_item = WordScrapeItem()

        # Gets all the text from the listed div and then applies the regex to find all words in the hangul range
        hangul_syllables = response.xpath('//*[@id="articleBodyContents"]//text()').re(ur'[\uac00-\ud7af]+')

        # Go through all hangul syllables found and either bump the value or add the key
        hangul_dict = {}
        for i in hangul_syllables:
            if i in hangul_dict:
                hangul_dict[i] += 1
            else:
                hangul_dict[i] = 1

        jsondump = json.dumps(hangul_dict)
        the_item['word'] = jsondump
        the_item['site'] = response.url
        return the_item
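For tracking down where memory goes in a crawl like this, the "Debugging memory leaks" section of the Scrapy docs describes a live-object tracker; a minimal sketch, assuming a Scrapy version that exposes scrapy.utils.trackref (the same report is available as prefs() in the telnet console):

# Print counts of the live Requests, Responses, and Items that Scrapy tracks,
# which shows whether pending requests are what is piling up in memory.
from scrapy.utils.trackref import print_live_refs

print_live_refs()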
I think Jepio's comment is right: the spider finds far more links to follow than it can crawl right away, so it has to store all of them somewhere in the meantime.
EDIT: So the problem is that it stores all of those pending links in memory instead of on disk, and that eventually fills all of my memory. The solution is to run Scrapy with a job directory, which forces them to be stored on disk, where there is plenty of space:
$ scrapy crawl spider -s JOBDIR=somedirname
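The same behaviour can also be turned on from the project's settings.py, so every crawl uses the disk-backed scheduler queues (the directory name below is just an example):

# settings.py
# Persist the scheduler's pending-request queues on disk instead of in memory.
JOBDIR = 'crawls/englishwiki-run1'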