Webkit_server (called from python's dryscrape) uses more and more memory with each page visited. How do I reduce the memory used?
I'm writing a scraper in python3 using dryscrape. During a scraping session I'm trying to visit a few hundred different urls and click through about 10 ajax pages on each url (without visiting a different url per ajax page). I need something like dryscrape because I need to be able to interact with javascript components. The classes I wrote for my needs do work, but after I've visited about 50 or 100 pages I run out of memory (all 4 GB of RAM and 4 GB of swap space are effectively 100% full). I looked into what was eating up the memory, and it appears the webkit_server process is responsible for all of it. Why does this happen, and how can I avoid it?
Below are the relevant snippets of my class and my main method.
Here's the class which uses dryscrape, so you can see exactly which settings I used.
import dryscrape
from lxml import html
from time import sleep
from webkit_server import InvalidResponseError
import re
from utils import unugly, my_strip, cleanhtml, stringify_children
from Profile import Profile, Question

class ExampleSession():

    def __init__(self, settings):
        self.settings = settings
        # dryscrape.start_xvfb()
        self.br = self.getBrowser()

    def getBrowser(self):
        session = dryscrape.Session()
        session.set_attribute('auto_load_images', False)
        session.set_header('User-agent', 'Google Chrome')
        return session

    def login(self):
        try:
            print('Trying to log in... ')
            self.br.visit('https://www.example.com/login')
            self.br.at_xpath('//*[@id="login_username"]').set(self.settings['myUsername'])
            self.br.at_xpath('//*[@id="login_password"]').set(self.settings['myPassword'])
            q = self.br.at_xpath('//*[@id="loginbox_form"]')
            q.submit()
        except Exception as e:
            print(str(e))
            print('\tException and couldn\'t log in!')
            return
        print('Logged in as %s' % (str(self.settings['myUsername'])))

    def getProfileQuestionsByUrl(self, url, thread_id=0):
        self.br.visit(str(url.rstrip()) + '/questions')
        tree = html.fromstring(self.br.body())
        questions = []
        num_pages = int(my_strip(tree.xpath('//*[@id="questions_pages"]//*[@class="last"]')[0].text))
        page = 0
        while (page < num_pages):
            sleep(0.5)
            # Do something with each ajax page
            # Next try-except tries to click the 'next' button
            try:
                next_button = self.br.at_xpath('//*[@id="questions_pages"]//*[@class="next"]')
                next_button.click()
            except Exception as e:
                pass
            page = page + 1
        return questions

    def getProfileByUrl(self, url, thread_id=0):
        missing = 'NA'
        try:
            try:
                # Visit a unique url
                self.br.visit(url.rstrip())
            except Exception as e:
                print(str(e))
                return None
            tree = html.fromstring(self.br.body())
            map = {}
            # Fill up the dictionary with some things I find on the page
            profile = Profile(map)
            return profile
        except Exception as e:
            print(str(e))
            return None
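For context, a sketch of how the class above is driven; the settings keys match what the class reads, and the credentials and url are placeholders rather than real values:

# Hypothetical driver code for ExampleSession; keys match what the class reads.
settings = {
    'myUsername': 'someuser',        # placeholder credentials
    'myPassword': 'somepassword',
    'scrapeQuestions': 'yes',
}
sess = ExampleSession(settings)
sess.login()
profile = sess.getProfileByUrl('https://www.example.com/profile/someone')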
Here's the main method (snippet):
from socket import error as SocketError  # needed for the except clause below

def getProfiles(settings, urls, thread_id):
    exampleSess = ExampleSession(settings)
    exampleSess.login()
    profiles = []
    '''
    I want to visit at most a thousand unique urls (but I don't care if it
    will take 2 hours or 2 days as long as the session doesn't fatally break
    and my laptop doesn't run out of memory)
    '''
    for url in urls:
        try:
            profile = exampleSess.getProfileByUrl(url, thread_id)
            if (profile is not None):
                profiles.append(profile)
                try:
                    if (settings['scrapeQuestions'] == 'yes'):
                        profile_questions = exampleSess.getProfileQuestionsByUrl(url, thread_id)
                        if (profile_questions is not None):
                            profile.add_questions(profile_questions)
                except SocketError as e:
                    print(str(e))
                    print('\t[Thread %d] SocketError in getProfileQuestionsByUrl of profile...' % (thread_id))
        except Exception as e:
            print(str(e))
            print('\t[Thread %d] Exception while getting profile %s' % (thread_id, str(url.rstrip())))
            exampleSess.br.reset()
    exampleSess = None  # Does this kill my dryscrape session and prevent webkit_server from running?
    return profiles
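An aside on the comment in the last line above: rebinding the variable to None only drops the Python reference; it does not by itself terminate the webkit_server child process. A commonly suggested workaround is to spawn the server explicitly so it can be killed and respawned every N pages. A minimal sketch, assuming webkit_server's Server/ServerConnection classes and dryscrape's webkit Driver accept these arguments:

import dryscrape
import webkit_server
from dryscrape.driver.webkit import Driver

# Spawn the webkit_server process explicitly so we keep a handle to it.
server = webkit_server.Server()
conn = webkit_server.ServerConnection(server=server)
session = dryscrape.Session(driver=Driver(connection=conn))

# ... visit a batch of urls with session.visit(...) ...

# Kill the child process outright to reclaim its memory, then build a
# fresh server/session pair before scraping the next batch.
server.kill()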
Do I have dryscrape set up correctly? How does dryscrape's webkit_server end up using more than 4 GB of memory as I visit more and more urls with getProfileByUrl and getProfileQuestionsByUrl? Are there any settings I'm missing that might add to the memory usage?
I was not able to fix the memory issue (and I could reproduce it on a separate laptop). I ended up switching from dryscrape to selenium (and then to phantomjs). PhantomJS has been superior in my opinion, and it doesn't take up a lot of memory either.
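For anyone making the same switch, a minimal sketch of the selenium + PhantomJS setup, using selenium's (since-deprecated) PhantomJS driver; the url, element ids, and credentials are placeholders, not the real site's:

from selenium import webdriver

# PhantomJS is headless, so no xvfb is needed; each driver owns one
# phantomjs child process, and quit() terminates it, releasing its memory.
driver = webdriver.PhantomJS()
driver.get('https://www.example.com/login')
driver.find_element_by_id('login_username').send_keys('someuser')
driver.find_element_by_id('login_password').send_keys('somepassword')
driver.find_element_by_id('loginbox_form').submit()
# ... visit urls and interact with javascript components ...
driver.quit()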