使用 Python 时 PhantomJS 和 GreenPile 崩溃
PhantomJS and GreenPile crash using Python
我正在尝试使用多个 PhantomJS 实例,并在线程之间复用驱动程序,而不是一次又一次地销毁它并重新创建进程:
import sys
from datetime import datetime
import eventlet
from helpers import log, make_request
import settings
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import redis
import lxml.html as html
# Green-thread pool sized by settings.max_threads, and a GreenPile that spawns into that pool.
pool = eventlet.GreenPool(settings.max_threads)
pile = eventlet.GreenPile(pool)
# NOTE: this rebinds the name `redis` from the imported module to the client
# instance, so the module itself is no longer reachable under that name.
redis = redis.StrictRedis(host='localhost', port=6379, db=0)
# Pop one URL from the Redis set "queue"; if one was returned, start a
# PhantomJS driver and hand both to process_urlv2.
def begin_crawl_phantomJSV2():
    url = redis.spop("queue")
    if None == url:
        # Nothing was popped, so no driver is started.
        return
    driver = webdriver.PhantomJS();
    process_urlv2(driver, url)
# Load `url` in the given driver, then pop the next URL from "queue".
# When nothing is left the driver is closed and quit; otherwise the same
# driver is passed on together with the next URL.
def process_urlv2(driver, url):
    driver.get(url)
    ## some work
    url = redis.spop("queue")
    if None == url:
        driver.close()
        driver.quit()
        return
    # NOTE(review): process_urlv2(driver, url) is evaluated right here, and its
    # return value (None on every path of this function) is what pile.spawn
    # receives, rather than the function object plus its arguments.
    pile.spawn(process_urlv2(driver, url))
# Script entry point: log the start time, seed the queue, spawn the crawler and wait.
if __name__ == '__main__':
    timea = datetime.now()
    log("Beginning crawl at {}".format(timea))
    # Seed the "queue" set with two start URLs.
    redis.sadd("queue", "http://linka.com")
    redis.sadd("queue", "http://linkb.com")
    # range(1) means a single crawler is spawned; the list comprehension is
    # used only for its side effect and its result is discarded.
    [pile.spawn(begin_crawl_phantomJSV2) for _ in range(1)]
    pool.waitall()
并出现以下错误:
Traceback (most recent call last):
File "C:\Python27\lib\site-packages\eventlet\hubs\hub.py", line 457, in fire_timers
timer()
File "C:\Python27\lib\site-packages\eventlet\hubs\timer.py", line 58, in __call__
cb(*args, **kw)
File "C:\Python27\lib\site-packages\eventlet\greenthread.py", line 214, in main
result = function(*args, **kwargs)
TypeError: 'NoneType' object is not callable
将方法更改为如下形式时:
# Variant that starts a new PhantomJS driver on every call, processes one URL
# popped from "queue", and then closes and quits that driver.
def begin_crawl_phantomJS():
    driver = webdriver.PhantomJS();
    url = redis.spop("queue")
    if None == url:
        # NOTE(review): the driver started above is neither closed nor quit on this path.
        return
    process_url(driver, url)
    driver.close()
    driver.quit()
    # NOTE(review): indentation was lost in this listing; this line is assumed
    # to sit inside the function (re-spawning itself for the next URL) - confirm.
    pile.spawn(begin_crawl_phantomJS)
# Load `url` in the supplied driver; the per-page work is elided in this listing.
def process_url(driver, url):
    driver.get(url)
    ## some work
它工作得很好,但每次启动 phantomjs 进程都会浪费一些时间,你知道我该怎么做吗?
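必须把下面这一行:
pile.spawn(process_urlv2(driver, url))
改为:
pile.spawn(process_urlv2, driver, url)
原来的写法会先立即调用 process_urlv2,再把它的返回值 None 传给 pile.spawn,因此才会出现 TypeError: 'NoneType' object is not callable;spawn 需要接收函数本身以及它的参数。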
我正在尝试使用多个 PhantomJS 实例,并在线程之间复用驱动程序,而不是一次又一次地销毁它并重新创建进程:
import sys
from datetime import datetime
import eventlet
from helpers import log, make_request
import settings
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import redis
import lxml.html as html
# Green-thread pool sized by settings.max_threads, and a GreenPile that spawns into that pool.
pool = eventlet.GreenPool(settings.max_threads)
pile = eventlet.GreenPile(pool)
# NOTE: this rebinds the name `redis` from the imported module to the client
# instance, so the module itself is no longer reachable under that name.
redis = redis.StrictRedis(host='localhost', port=6379, db=0)
# Pop one URL from the Redis set "queue"; if one was returned, start a
# PhantomJS driver and hand both to process_urlv2.
def begin_crawl_phantomJSV2():
    url = redis.spop("queue")
    if None == url:
        # Nothing was popped, so no driver is started.
        return
    driver = webdriver.PhantomJS();
    process_urlv2(driver, url)
# Load `url` in the given driver, then pop the next URL from "queue".
# When nothing is left the driver is closed and quit; otherwise the same
# driver is passed on together with the next URL.
def process_urlv2(driver, url):
    driver.get(url)
    ## some work
    url = redis.spop("queue")
    if None == url:
        driver.close()
        driver.quit()
        return
    # NOTE(review): process_urlv2(driver, url) is evaluated right here, and its
    # return value (None on every path of this function) is what pile.spawn
    # receives, rather than the function object plus its arguments.
    pile.spawn(process_urlv2(driver, url))
# Script entry point: log the start time, seed the queue, spawn the crawler and wait.
if __name__ == '__main__':
    timea = datetime.now()
    log("Beginning crawl at {}".format(timea))
    # Seed the "queue" set with two start URLs.
    redis.sadd("queue", "http://linka.com")
    redis.sadd("queue", "http://linkb.com")
    # range(1) means a single crawler is spawned; the list comprehension is
    # used only for its side effect and its result is discarded.
    [pile.spawn(begin_crawl_phantomJSV2) for _ in range(1)]
    pool.waitall()
并出现以下错误:
Traceback (most recent call last):
File "C:\Python27\lib\site-packages\eventlet\hubs\hub.py", line 457, in fire_timers
timer()
File "C:\Python27\lib\site-packages\eventlet\hubs\timer.py", line 58, in __call__
cb(*args, **kw)
File "C:\Python27\lib\site-packages\eventlet\greenthread.py", line 214, in main
result = function(*args, **kwargs)
TypeError: 'NoneType' object is not callable
将方法更改为如下形式时:
# Variant that starts a new PhantomJS driver on every call, processes one URL
# popped from "queue", and then closes and quits that driver.
def begin_crawl_phantomJS():
    driver = webdriver.PhantomJS();
    url = redis.spop("queue")
    if None == url:
        # NOTE(review): the driver started above is neither closed nor quit on this path.
        return
    process_url(driver, url)
    driver.close()
    driver.quit()
    # NOTE(review): indentation was lost in this listing; this line is assumed
    # to sit inside the function (re-spawning itself for the next URL) - confirm.
    pile.spawn(begin_crawl_phantomJS)
# Load `url` in the supplied driver; the per-page work is elided in this listing.
def process_url(driver, url):
    driver.get(url)
    ## some work
它工作得很好,但每次启动 phantomjs 进程都会浪费一些时间,你知道我该怎么做吗?
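必须把下面这一行:
pile.spawn(process_urlv2(driver, url))
改为:
pile.spawn(process_urlv2, driver, url)
原来的写法会先立即调用 process_urlv2,再把它的返回值 None 传给 pile.spawn,因此才会出现 TypeError: 'NoneType' object is not callable;spawn 需要接收函数本身以及它的参数。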