如何提高 django haystack 搜索的速度
how to improve speed of django haystack search
我想在我的 django 环境中为简单的数据结构创建一个搜索引擎:
| id | company name |
|:-----------|-----------------:|
| 12345678 | company A's name |
| 12345687 | peoples pizza a/s|
| 87654321 | sub's for pugs |
大约有 800,000 家公司,我只想按名称搜索。
找到名称后,我的 django 中会返回 ID。
我已经尝试了 haystack、whoosh 等各种设置,但是当我从 ~500 的测试数据集增加到 800,000 时,我的搜索结果一直非常缓慢。
有时搜索需要将近一个小时。
我正在使用 PaaS Heroku,所以我想我会尝试一个集成的付费服务(Searchly 的 elasticsearch 实现)。这很有帮助,但当我到达大约 80,000 家公司时,它又开始变得非常慢了。
已安装的应用程序
# Django applications enabled for this project.
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.sites',

    # Search integration added on top of the stock Django apps.
    'haystack',

    # Project-specific apps follow here.
]
更多settings.py
import os
from urlparse import urlparse
# Parse the Elasticsearch endpoint from the SEARCHBOX_URL environment variable,
# falling back to a local node when the variable is unset or empty.
es = urlparse(os.environ.get('SEARCHBOX_URL') or 'http://127.0.0.1:9200/')

# Use port 80 when the URL does not carry an explicit port.
port = es.port or 80

HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'haystack.backends.elasticsearch_backend.ElasticsearchSearchEngine',
        'URL': es.scheme + '://' + es.hostname + ':' + str(port),
        'INDEX_NAME': 'documents',
    },
}  # The original snippet never closed this outer dict, which is a syntax error.

# Forward HTTP basic-auth credentials only when the URL contains a username.
if es.username:
    # ``es.password`` is None when the URL has a username but no password;
    # fall back to an empty string so the concatenation cannot raise TypeError.
    HAYSTACK_CONNECTIONS['default']['KWARGS'] = {
        "http_auth": es.username + ':' + (es.password or ''),
    }
search_indexes.py
from haystack import indexes
from hello.models import Article
class ArticleIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index definition for the ``Article`` model."""

    # Primary document field; its content is rendered from a template.
    text = indexes.CharField(document=True, use_template=True)
    # Publication date, read from the model's ``pub_date`` attribute.
    # (This field was commented out in an earlier revision.)
    pub_date = indexes.DateTimeField(model_attr='pub_date')
    # Edge n-gram field built from the model's ``title`` attribute.
    content_auto = indexes.EdgeNgramField(model_attr='title')

    def get_model(self):
        """Return the model class covered by this index."""
        return Article

    def index_queryset(self, using=None):
        """Return the queryset used when the entire index for the model is updated."""
        model = self.get_model()
        return model.objects.all()
article_text.txt
{{ object.title }}
{{ object.user.get_full_name }}
{{ object.body }}
urls.py
url(r'^search/$', views.search_titles, name='search'),
views.py
def search_titles(request):
    """Render autocomplete matches for the POSTed ``search_text`` value.

    Queries shorter than four characters (or missing entirely) yield an empty
    result list instead of hitting the search backend.
    """
    query = request.POST.get('search_text', '')

    if not query or len(query) < 4:
        # An empty or too-short POST returns nothing; this prevents an
        # internal server error with the jQuery caller.
        articles = []
    else:
        articles = SearchQuerySet().autocomplete(content_auto=query)

    context = {'articles': articles}
    return render_to_response('scripts/ajax_search.html', context)
search.html
{% if articles.count > 0 %}
    <!-- Print one link per matching result (the CVR numbers), capped at 15 entries. -->
    <!-- The loop must iterate over the results themselves: looping over a padded
         string yields single characters, for which article.object never resolves. -->
    {% for article in articles|slice:":15" %}
        <li><a href="{{ article.object.get_absolute_url }}">{{ article.object.title }}</a></li>
    {% endfor %}
{% else %}
    <li>Try again, or try CVR + ⏎</li>
{% endif %}
index.html(我称之为搜索引擎)
{% csrf_token %}
<input type="text" id="search" name="search" />
<!-- All matching company names end up in this <ul>. -->
<ul id ="search-results"></ul>
我将 views.py 搜索方法更改为:
# Revised search logic: cap the number of hits fetched from the backend and
# offer a spelling suggestion alongside the results.
txt = request.POST.get('search_text', '')
articles = []
suggestedSearchTerm = ""
if txt and len(txt) >= 4:
    sqs = SearchQuerySet()
    # Restrict the backend query to the first 8 results.
    sqs.query.set_limits(low=0, high=8)
    sqs = sqs.filter(content=txt)
    articles = sqs.query.get_results()
    suggestedSearchTerm = SearchQuerySet().spelling_suggestion(txt)
    # spelling_suggestion() can return None when the backend has nothing to
    # suggest; treat that like "no suggestion" instead of calling .lower() on it.
    if not suggestedSearchTerm or suggestedSearchTerm == txt:
        suggestedSearchTerm = ''
    else:
        suggestedSearchTerm = suggestedSearchTerm.lower()
我想在我的 django 环境中为简单的数据结构创建一个搜索引擎:
| id | company name |
|:-----------|-----------------:|
| 12345678 | company A's name |
| 12345687 | peoples pizza a/s|
| 87654321 | sub's for pugs |
大约有 800,000 家公司,我只想按名称搜索。 找到名称后,我的 django 中会返回 ID。
我已经尝试了 haystack、whoosh 等各种设置,但是当我从 ~500 的测试数据集增加到 800,000 时,我的搜索结果一直非常缓慢。 有时搜索需要将近一个小时。
我正在使用 PaaS Heroku,所以我想我会尝试一个集成的付费服务(Searchly 的 elasticsearch 实现)。这很有帮助,但当我到达大约 80,000 家公司时,它又开始变得非常慢了。
已安装的应用程序
# Django applications enabled for this project.
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.sites',

    # Search integration added on top of the stock Django apps.
    'haystack',

    # Project-specific apps follow here.
]
更多settings.py
import os
from urlparse import urlparse
# Parse the Elasticsearch endpoint from the SEARCHBOX_URL environment variable,
# falling back to a local node when the variable is unset or empty.
es = urlparse(os.environ.get('SEARCHBOX_URL') or 'http://127.0.0.1:9200/')

# Use port 80 when the URL does not carry an explicit port.
port = es.port or 80

HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'haystack.backends.elasticsearch_backend.ElasticsearchSearchEngine',
        'URL': es.scheme + '://' + es.hostname + ':' + str(port),
        'INDEX_NAME': 'documents',
    },
}  # The original snippet never closed this outer dict, which is a syntax error.

# Forward HTTP basic-auth credentials only when the URL contains a username.
if es.username:
    # ``es.password`` is None when the URL has a username but no password;
    # fall back to an empty string so the concatenation cannot raise TypeError.
    HAYSTACK_CONNECTIONS['default']['KWARGS'] = {
        "http_auth": es.username + ':' + (es.password or ''),
    }
search_indexes.py
from haystack import indexes
from hello.models import Article
class ArticleIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index definition for the ``Article`` model."""

    # Primary document field; its content is rendered from a template.
    text = indexes.CharField(document=True, use_template=True)
    # Publication date, read from the model's ``pub_date`` attribute.
    # (This field was commented out in an earlier revision.)
    pub_date = indexes.DateTimeField(model_attr='pub_date')
    # Edge n-gram field built from the model's ``title`` attribute.
    content_auto = indexes.EdgeNgramField(model_attr='title')

    def get_model(self):
        """Return the model class covered by this index."""
        return Article

    def index_queryset(self, using=None):
        """Return the queryset used when the entire index for the model is updated."""
        model = self.get_model()
        return model.objects.all()
article_text.txt
{{ object.title }}
{{ object.user.get_full_name }}
{{ object.body }}
urls.py
url(r'^search/$', views.search_titles, name='search'),
views.py
def search_titles(request):
    """Render autocomplete matches for the POSTed ``search_text`` value.

    Queries shorter than four characters (or missing entirely) yield an empty
    result list instead of hitting the search backend.
    """
    query = request.POST.get('search_text', '')

    if not query or len(query) < 4:
        # An empty or too-short POST returns nothing; this prevents an
        # internal server error with the jQuery caller.
        articles = []
    else:
        articles = SearchQuerySet().autocomplete(content_auto=query)

    context = {'articles': articles}
    return render_to_response('scripts/ajax_search.html', context)
search.html
{% if articles.count > 0 %}
    <!-- Print one link per matching result (the CVR numbers), capped at 15 entries. -->
    <!-- The loop must iterate over the results themselves: looping over a padded
         string yields single characters, for which article.object never resolves. -->
    {% for article in articles|slice:":15" %}
        <li><a href="{{ article.object.get_absolute_url }}">{{ article.object.title }}</a></li>
    {% endfor %}
{% else %}
    <li>Try again, or try CVR + ⏎</li>
{% endif %}
index.html(我称之为搜索引擎)
{% csrf_token %}
<input type="text" id="search" name="search" />
<!-- All matching company names end up in this <ul>. -->
<ul id ="search-results"></ul>
我将 views.py 搜索方法更改为:
# Revised search logic: cap the number of hits fetched from the backend and
# offer a spelling suggestion alongside the results.
txt = request.POST.get('search_text', '')
articles = []
suggestedSearchTerm = ""
if txt and len(txt) >= 4:
    sqs = SearchQuerySet()
    # Restrict the backend query to the first 8 results.
    sqs.query.set_limits(low=0, high=8)
    sqs = sqs.filter(content=txt)
    articles = sqs.query.get_results()
    suggestedSearchTerm = SearchQuerySet().spelling_suggestion(txt)
    # spelling_suggestion() can return None when the backend has nothing to
    # suggest; treat that like "no suggestion" instead of calling .lower() on it.
    if not suggestedSearchTerm or suggestedSearchTerm == txt:
        suggestedSearchTerm = ''
    else:
        suggestedSearchTerm = suggestedSearchTerm.lower()