在 Django 模型中使用 Trigram (gin_trgm_ops) 创建 Gin 索引
Creating a Gin Index with Trigram (gin_trgm_ops) in Django model
django.contrib.postgres 的新 TrigramSimilarity 功能非常适合我遇到的问题。我将它用于搜索栏以查找难以拼写的拉丁名称。问题是有超过 200 万个名字,搜索时间比我想要的要长。
我想按照 postgres 文档中描述的那样在三元组上创建索引,但我不确定如何以 Django API 会利用它的方式执行此操作。对于 postgres 文本搜索,有关于如何创建索引的描述,但对于三元组相似度(trigram similarity)却没有相应的说明。
这是我现在拥有的:
class NCBI_names(models.Model):
    """Name record that points at an ``NCBI_nodes`` row."""

    # Foreign key to the node this name belongs to; rows are removed with it.
    tax_id = models.ForeignKey(
        NCBI_nodes,
        on_delete=models.CASCADE,
        default=0,
    )
    # Text of the name.
    name_txt = models.CharField(max_length=255, default='')
    # Class of the name; has its own regular database index.
    name_class = models.CharField(max_length=32, db_index=True, default='')

    class Meta:
        # GIN index on the name text, declared without an operator class.
        indexes = [
            GinIndex(fields=['name_txt']),
        ]
在视图的 get_queryset
方法中:
class TaxonSearchListView(ListView):
    """Paginated list view that searches ``NCBI_names`` via the ``q`` GET parameter."""

    # form_class = TaxonSearchForm
    model = NCBI_names
    template_name = 'collectie/taxon_list.html'
    context_object_name = 'taxon_list'
    paginate_by = 20

    def dispatch(self, request, *args, **kwargs):
        """Redirect to the detail page when ``q`` matches exactly one name.

        When ``q`` is missing, or the case-insensitive exact lookup finds
        no row or several rows, the request falls through to the regular
        list-view dispatch.
        """
        search_term = request.GET.get('q')
        if search_term:
            names = self.model
            try:
                tax_id = names.objects.get(name_txt__iexact=search_term).tax_id.tax_id
                return redirect('collectie:taxon_detail', tax_id)
            except (names.DoesNotExist, names.MultipleObjectsReturned):
                # Zero or ambiguous matches: show the result list instead.
                pass
        return super(TaxonSearchListView, self).dispatch(request, *args, **kwargs)

    def get_queryset(self):
        """Return names ordered by descending trigram similarity to ``q``.

        Without ``q`` the default queryset is returned unchanged.
        """
        names = super(TaxonSearchListView, self).get_queryset()
        search_term = self.request.GET.get('q')
        if not search_term:
            return names
        similarity = TrigramSimilarity('name_txt', search_term)
        return (
            names.exclude(name_txt__icontains='sp.')
            .annotate(similarity=similarity)
            .filter(similarity__gt=0.3)
            .order_by('-similarity')
        )
我遇到了类似的问题,当时试图使用 pg_trgm 扩展来支持高效的 contains 和 icontains Django 字段查找。
可能有更优雅的方法,但像这样定义一个新的索引类型对我有用:
from django.contrib.postgres.indexes import GinIndex
class TrigramIndex(GinIndex):
    """GIN index that declares every column with the ``gin_trgm_ops`` class."""

    def get_sql_create_template_values(self, model, schema_editor, using):
        """Build the values substituted into the CREATE INDEX template.

        Returns a dict with the keys ``table``, ``name``, ``columns``,
        ``using`` and ``extra``.
        """
        quote = schema_editor.quote_name
        model_fields = [
            model._meta.get_field(name) for name, _ in self.fields_orders
        ]
        tablespace_sql = schema_editor._get_index_tablespace_sql(model, model_fields)

        column_sql = []
        for model_field, (_, order) in zip(model_fields, self.fields_orders):
            quoted = ('%s %s' % (quote(model_field.column), order)).strip()
            # The operator class is appended to each column definition.
            column_sql.append(quoted + ' gin_trgm_ops')

        return {
            'table': quote(model._meta.db_table),
            'name': quote(self.name),
            'columns': ', '.join(column_sql),
            'using': using,
            'extra': tablespace_sql,
        }
方法 get_sql_create_template_values
是从 Index.get_sql_create_template_values()
复制而来的,只有一个修改:添加 + ' gin_trgm_ops'
.
对于您的用例,您将使用此 TrigramIndex
而不是 GinIndex
在 name_txt
上定义索引。然后 运行 makemigrations
,这将产生一个迁移,生成所需的 CREATE INDEX
SQL.
更新:
我看到您也在使用 icontains 进行查询:
result.exclude(name_txt__icontains = 'sp.')
Postgresql 后端会把它变成这样的东西:
UPPER("NCBI_names"."name_txt"::text) LIKE UPPER('sp.')
然后,由于 UPPER() 的存在,三元组索引将不会被使用。
我遇到了同样的问题,最终将数据库后端子类化以解决它:
from django.db.backends.postgresql import base, operations
class DatabaseFeatures(base.DatabaseFeatures):
    """Backend features; nothing is overridden from the parent class."""
class DatabaseOperations(operations.DatabaseOperations):
    """Backend operations with a ``lookup_cast`` that never applies UPPER()."""

    def lookup_cast(self, lookup_type, internal_type=None):
        """Return the format string applied to the column for ``lookup_type``.

        Text lookups are cast to text (or passed through ``HOST()`` for IP
        address fields); every other lookup leaves the column untouched.
        """
        text_lookups = (
            'iexact', 'contains', 'icontains', 'startswith',
            'istartswith', 'endswith', 'iendswith', 'regex', 'iregex',
        )
        if lookup_type not in text_lookups:
            return '%s'
        if internal_type in ('IPAddressField', 'GenericIPAddressField'):
            return "HOST(%s)"
        # Cast text lookups to text to allow things like filter(x__contains=4)
        return "%s::text"
class DatabaseWrapper(base.DatabaseWrapper):
    """
    Override the defaults where needed to allow use of trigram index
    """
    ops_class = DatabaseOperations

    # Build per-class copies rather than calling ``.update()`` on the
    # inherited dicts: those dicts are attributes of the parent class, so
    # mutating them in ``__init__`` would also change the stock backend.
    operators = dict(
        base.DatabaseWrapper.operators,
        icontains='ILIKE %s',
        istartswith='ILIKE %s',
        iendswith='ILIKE %s',
    )
    pattern_ops = dict(
        base.DatabaseWrapper.pattern_ops,
        icontains="ILIKE '%%' || {} || '%%'",
        istartswith="ILIKE {} || '%%'",
        iendswith="ILIKE '%%' || {}",
    )
如果有人想在多个列上创建以空格连接(串联)的索引,可以使用我对内置索引的修改。它会创建类似 gin (("column1" || ' ' || "column2" || ' ' || ...) gin_trgm_ops) 的索引:
class GinSpaceConcatIndex(GinIndex):
    """GIN trigram index over all fields joined into one space-separated expression."""

    def get_sql_create_template_values(self, model, schema_editor, using):
        """Build the values substituted into the CREATE INDEX template.

        The ``columns`` entry is a single parenthesised expression that
        concatenates the quoted columns with ``' '`` and is followed by
        ``gin_trgm_ops``.
        """
        quote = schema_editor.quote_name
        model_fields = [
            model._meta.get_field(name) for name, _ in self.fields_orders
        ]
        tablespace_sql = schema_editor._get_index_tablespace_sql(model, model_fields)

        quoted_columns = []
        for model_field, (_, order) in zip(model_fields, self.fields_orders):
            quoted_columns.append(
                ('%s %s' % (quote(model_field.column), order)).strip()
            )
        concatenated = " || ' ' || ".join(quoted_columns)

        return {
            'table': quote(model._meta.db_table),
            'name': quote(self.name),
            'columns': "({}) gin_trgm_ops".format(concatenated),
            'using': using,
            'extra': tablespace_sql,
        }
我找到了一篇 2020 年 12 月的文章,它使用了最新版本的 Django ORM:
class Author(models.Model):
    """Author model whose ``last_name`` gets a GIN index using ``gin_trgm_ops``."""

    first_name = models.CharField(max_length=100)
    last_name = models.CharField(max_length=100)

    class Meta:
        indexes = [
            GinIndex(
                fields=['last_name'],
                opclasses=['gin_trgm_ops'],
                # An explicit name is passed together with opclasses.
                name='review_author_ln_gin_idx',
            ),
        ]
如果像最初的发帖者一样,您希望创建一个与 icontains 一起使用的索引,则必须索引列的 UPPER(),这需要 OpClass 的特殊处理:
from django.db.models.functions import Upper
from django.contrib.postgres.indexes import GinIndex, OpClass
class Author(models.Model):
    """Author model with a trigram GIN index on ``UPPER(last_name)``."""

    first_name = models.CharField(max_length=100)
    # Column referenced by the expression index below.
    last_name = models.CharField(max_length=100)

    class Meta:
        # Index definitions are read from ``Meta.indexes``; a bare
        # ``indexes`` attribute on the model body is not picked up.
        indexes = [
            GinIndex(
                # OpClass attaches the operator class to the expression.
                OpClass(Upper('last_name'), name='gin_trgm_ops'),
                name='review_author_ln_gin_idx',
            ),
        ]
受一篇关于此主题的旧文章启发,我找到了一篇较新的文章,它为 GistIndex 提供了以下解决方案:
更新:
从 Django-1.11 开始,事情似乎更简单了,正如 Django 文档所建议的:
from django.contrib.postgres.indexes import GinIndex
class MyModel(models.Model):
    """Example model with a GIN index declared on ``the_field``."""

    # The field also keeps its regular index through ``db_index=True``.
    the_field = models.CharField(max_length=512, db_index=True)

    class Meta:
        indexes = [
            GinIndex(fields=['the_field']),
        ]
为此,从 Django-2.2 开始,class Index(fields=(), name=None, db_tablespace=None, opclasses=()) 中将提供 opclasses 属性。
from django.contrib.postgres.indexes import GistIndex
class GistIndexTrgrmOps(GistIndex):
    """GiST index whose CREATE INDEX statement uses ``gist_trgm_ops``."""

    def create_sql(self, model, schema_editor, using='', **kwargs):
        """Return the parent's statement with its template replaced.

        ``using`` and any extra keyword arguments are forwarded unchanged
        to the parent implementation, so callers that pass them keep
        working; calling with only ``model`` and ``schema_editor`` behaves
        as before.
        """
        # - this Statement is instantiated by the _create_index_sql()
        #   method of django.db.backends.base.schema.BaseDatabaseSchemaEditor.
        #   using sql_create_index template from
        #   django.db.backends.postgresql.schema.DatabaseSchemaEditor
        # - the template has original value:
        #   "CREATE INDEX %(name)s ON %(table)s%(using)s (%(columns)s)%(extra)s"
        statement = super().create_sql(model, schema_editor, using=using, **kwargs)
        # - however, we want to use a GIST index to accelerate trigram
        #   matching, so we want to add the gist_trgm_ops index operator
        #   class
        # - so we replace the template with:
        #   "CREATE INDEX %(name)s ON %(table)s%(using)s (%(columns)s gist_trgm_ops)%(extra)s"
        # NOTE: the operator class follows the whole column list, so with
        # several fields only the last column receives it.
        statement.template = (
            "CREATE INDEX %(name)s ON %(table)s%(using)s "
            "(%(columns)s gist_trgm_ops)%(extra)s"
        )
        return statement
然后你可以像这样在模型类中使用它:
class YourModel(models.Model):
    """Example model that indexes ``some_field`` with ``GistIndexTrgrmOps``."""

    some_field = models.TextField(...)

    class Meta:
        indexes = [GistIndexTrgrmOps(fields=['some_field'])]
要使 Django 2.2 使用索引进行 icontains
和类似搜索:
子类 GinIndex 以创建不区分大小写的索引(大写所有字段值):
from django.contrib.postgres.indexes import GinIndex
class UpperGinIndex(GinIndex):
    """GIN index built on ``UPPER(column)`` instead of the raw column."""

    def create_sql(self, model, schema_editor, using=''):
        """Return the parent's statement with column quoting wrapped in UPPER()."""
        sql = super().create_sql(model, schema_editor, using=using)
        columns = sql.parts['columns']
        # Keep the original quoting function so the wrapper can delegate to it.
        plain_quote = columns.quote_name
        columns.quote_name = lambda column: f'UPPER({plain_quote(column)})'
        return sql
像这样将索引添加到您的模型中,包括使用 opclasses 时必需的关键字参数 name:
class MyModel(Model):
    """Example model whose ``name`` is indexed through ``UpperGinIndex``."""

    name = TextField(...)

    class Meta:
        indexes = [
            UpperGinIndex(
                fields=['name'],
                name='mymodel_name_gintrgm',
                opclasses=['gin_trgm_ops'],
            ),
        ]
生成迁移并编辑生成的文件:
# Generated by Django 2.2.3 on 2019-07-15 10:46
from django.contrib.postgres.operations import TrigramExtension # <<< add this
from django.db import migrations
import myapp.models
class Migration(migrations.Migration):
    """Run ``TrigramExtension`` and then add the ``UpperGinIndex`` to ``mymodel``."""

    operations = [
        TrigramExtension(),  # <<< add this
        migrations.AddIndex(
            model_name='mymodel',
            index=myapp.models.UpperGinIndex(
                fields=['name'],
                name='mymodel_name_gintrgm',
                opclasses=['gin_trgm_ops'],
            ),
        ),
    ]
这已经有了答案,但在 Django 2.2 中你可以更容易地做到这一点:
class MyModel(models.Model):
    """Example model with a GiST index on ``name`` using ``gist_trgm_ops``."""

    name = models.TextField()

    class Meta:
        indexes = [
            GistIndex(
                name="gist_trgm_idx",
                fields=("name",),
                opclasses=("gist_trgm_ops",),
            ),
        ]
或者您可以使用 GinIndex
。
django.contrib.postgres 的新 TrigramSimilarity 功能非常适合我遇到的问题。我将它用于搜索栏以查找难以拼写的拉丁名称。问题是有超过 200 万个名字,搜索时间比我想要的要长。
我想按照 postgres 文档中描述的那样在三元组上创建索引,但我不确定如何以 Django API 会利用它的方式执行此操作。对于 postgres 文本搜索,有关于如何创建索引的描述,但对于三元组相似度(trigram similarity)却没有相应的说明。
这是我现在拥有的:
class NCBI_names(models.Model):
    """Name record that points at an ``NCBI_nodes`` row."""

    # Foreign key to the node this name belongs to; rows are removed with it.
    tax_id = models.ForeignKey(
        NCBI_nodes,
        on_delete=models.CASCADE,
        default=0,
    )
    # Text of the name.
    name_txt = models.CharField(max_length=255, default='')
    # Class of the name; has its own regular database index.
    name_class = models.CharField(max_length=32, db_index=True, default='')

    class Meta:
        # GIN index on the name text, declared without an operator class.
        indexes = [
            GinIndex(fields=['name_txt']),
        ]
在视图的 get_queryset
方法中:
class TaxonSearchListView(ListView):
    """Paginated list view that searches ``NCBI_names`` via the ``q`` GET parameter."""

    # form_class = TaxonSearchForm
    model = NCBI_names
    template_name = 'collectie/taxon_list.html'
    context_object_name = 'taxon_list'
    paginate_by = 20

    def dispatch(self, request, *args, **kwargs):
        """Redirect to the detail page when ``q`` matches exactly one name.

        When ``q`` is missing, or the case-insensitive exact lookup finds
        no row or several rows, the request falls through to the regular
        list-view dispatch.
        """
        search_term = request.GET.get('q')
        if search_term:
            names = self.model
            try:
                tax_id = names.objects.get(name_txt__iexact=search_term).tax_id.tax_id
                return redirect('collectie:taxon_detail', tax_id)
            except (names.DoesNotExist, names.MultipleObjectsReturned):
                # Zero or ambiguous matches: show the result list instead.
                pass
        return super(TaxonSearchListView, self).dispatch(request, *args, **kwargs)

    def get_queryset(self):
        """Return names ordered by descending trigram similarity to ``q``.

        Without ``q`` the default queryset is returned unchanged.
        """
        names = super(TaxonSearchListView, self).get_queryset()
        search_term = self.request.GET.get('q')
        if not search_term:
            return names
        similarity = TrigramSimilarity('name_txt', search_term)
        return (
            names.exclude(name_txt__icontains='sp.')
            .annotate(similarity=similarity)
            .filter(similarity__gt=0.3)
            .order_by('-similarity')
        )
我遇到了类似的问题,当时试图使用 pg_trgm 扩展来支持高效的 contains 和 icontains Django 字段查找。
可能有更优雅的方法,但像这样定义一个新的索引类型对我有用:
from django.contrib.postgres.indexes import GinIndex
class TrigramIndex(GinIndex):
    """GIN index that declares every column with the ``gin_trgm_ops`` class."""

    def get_sql_create_template_values(self, model, schema_editor, using):
        """Build the values substituted into the CREATE INDEX template.

        Returns a dict with the keys ``table``, ``name``, ``columns``,
        ``using`` and ``extra``.
        """
        quote = schema_editor.quote_name
        model_fields = [
            model._meta.get_field(name) for name, _ in self.fields_orders
        ]
        tablespace_sql = schema_editor._get_index_tablespace_sql(model, model_fields)

        column_sql = []
        for model_field, (_, order) in zip(model_fields, self.fields_orders):
            quoted = ('%s %s' % (quote(model_field.column), order)).strip()
            # The operator class is appended to each column definition.
            column_sql.append(quoted + ' gin_trgm_ops')

        return {
            'table': quote(model._meta.db_table),
            'name': quote(self.name),
            'columns': ', '.join(column_sql),
            'using': using,
            'extra': tablespace_sql,
        }
方法 get_sql_create_template_values
是从 Index.get_sql_create_template_values()
复制而来的,只有一个修改:添加 + ' gin_trgm_ops'
.
对于您的用例,您将使用此 TrigramIndex
而不是 GinIndex
在 name_txt
上定义索引。然后 运行 makemigrations
,这将产生一个迁移,生成所需的 CREATE INDEX
SQL.
更新:
我看到您也在使用 icontains 进行查询:
result.exclude(name_txt__icontains = 'sp.')
Postgresql 后端会把它变成这样的东西:
UPPER("NCBI_names"."name_txt"::text) LIKE UPPER('sp.')
然后,由于 UPPER() 的存在,三元组索引将不会被使用。
我遇到了同样的问题,最终将数据库后端子类化以解决它:
from django.db.backends.postgresql import base, operations
class DatabaseFeatures(base.DatabaseFeatures):
    """Backend features; nothing is overridden from the parent class."""
class DatabaseOperations(operations.DatabaseOperations):
    """Backend operations with a ``lookup_cast`` that never applies UPPER()."""

    def lookup_cast(self, lookup_type, internal_type=None):
        """Return the format string applied to the column for ``lookup_type``.

        Text lookups are cast to text (or passed through ``HOST()`` for IP
        address fields); every other lookup leaves the column untouched.
        """
        text_lookups = (
            'iexact', 'contains', 'icontains', 'startswith',
            'istartswith', 'endswith', 'iendswith', 'regex', 'iregex',
        )
        if lookup_type not in text_lookups:
            return '%s'
        if internal_type in ('IPAddressField', 'GenericIPAddressField'):
            return "HOST(%s)"
        # Cast text lookups to text to allow things like filter(x__contains=4)
        return "%s::text"
class DatabaseWrapper(base.DatabaseWrapper):
    """
    Override the defaults where needed to allow use of trigram index
    """
    ops_class = DatabaseOperations

    # Build per-class copies rather than calling ``.update()`` on the
    # inherited dicts: those dicts are attributes of the parent class, so
    # mutating them in ``__init__`` would also change the stock backend.
    operators = dict(
        base.DatabaseWrapper.operators,
        icontains='ILIKE %s',
        istartswith='ILIKE %s',
        iendswith='ILIKE %s',
    )
    pattern_ops = dict(
        base.DatabaseWrapper.pattern_ops,
        icontains="ILIKE '%%' || {} || '%%'",
        istartswith="ILIKE {} || '%%'",
        iendswith="ILIKE '%%' || {}",
    )
如果有人想在多个列上创建以空格连接(串联)的索引,可以使用我对内置索引的修改。它会创建类似 gin (("column1" || ' ' || "column2" || ' ' || ...) gin_trgm_ops) 的索引:
class GinSpaceConcatIndex(GinIndex):
    """GIN trigram index over all fields joined into one space-separated expression."""

    def get_sql_create_template_values(self, model, schema_editor, using):
        """Build the values substituted into the CREATE INDEX template.

        The ``columns`` entry is a single parenthesised expression that
        concatenates the quoted columns with ``' '`` and is followed by
        ``gin_trgm_ops``.
        """
        quote = schema_editor.quote_name
        model_fields = [
            model._meta.get_field(name) for name, _ in self.fields_orders
        ]
        tablespace_sql = schema_editor._get_index_tablespace_sql(model, model_fields)

        quoted_columns = []
        for model_field, (_, order) in zip(model_fields, self.fields_orders):
            quoted_columns.append(
                ('%s %s' % (quote(model_field.column), order)).strip()
            )
        concatenated = " || ' ' || ".join(quoted_columns)

        return {
            'table': quote(model._meta.db_table),
            'name': quote(self.name),
            'columns': "({}) gin_trgm_ops".format(concatenated),
            'using': using,
            'extra': tablespace_sql,
        }
我找到了一篇 2020 年 12 月的文章,它使用了最新版本的 Django ORM:
class Author(models.Model):
    """Author model whose ``last_name`` gets a GIN index using ``gin_trgm_ops``."""

    first_name = models.CharField(max_length=100)
    last_name = models.CharField(max_length=100)

    class Meta:
        indexes = [
            GinIndex(
                fields=['last_name'],
                opclasses=['gin_trgm_ops'],
                # An explicit name is passed together with opclasses.
                name='review_author_ln_gin_idx',
            ),
        ]
如果像最初的发帖者一样,您希望创建一个与 icontains 一起使用的索引,则必须索引列的 UPPER(),这需要 OpClass 的特殊处理:
from django.db.models.functions import Upper
from django.contrib.postgres.indexes import GinIndex, OpClass
class Author(models.Model):
    """Author model with a trigram GIN index on ``UPPER(last_name)``."""

    first_name = models.CharField(max_length=100)
    # Column referenced by the expression index below.
    last_name = models.CharField(max_length=100)

    class Meta:
        # Index definitions are read from ``Meta.indexes``; a bare
        # ``indexes`` attribute on the model body is not picked up.
        indexes = [
            GinIndex(
                # OpClass attaches the operator class to the expression.
                OpClass(Upper('last_name'), name='gin_trgm_ops'),
                name='review_author_ln_gin_idx',
            ),
        ]
受一篇关于此主题的旧文章启发,我找到了一篇较新的文章,它为 GistIndex 提供了以下解决方案:
更新:
从 Django-1.11 开始,事情似乎更简单了,正如 Django 文档所建议的:
from django.contrib.postgres.indexes import GinIndex
class MyModel(models.Model):
    """Example model with a GIN index declared on ``the_field``."""

    # The field also keeps its regular index through ``db_index=True``.
    the_field = models.CharField(max_length=512, db_index=True)

    class Meta:
        indexes = [
            GinIndex(fields=['the_field']),
        ]
为此,从 Django-2.2 开始,class Index(fields=(), name=None, db_tablespace=None, opclasses=()) 中将提供 opclasses 属性。
from django.contrib.postgres.indexes import GistIndex
class GistIndexTrgrmOps(GistIndex):
    """GiST index whose CREATE INDEX statement uses ``gist_trgm_ops``."""

    def create_sql(self, model, schema_editor, using='', **kwargs):
        """Return the parent's statement with its template replaced.

        ``using`` and any extra keyword arguments are forwarded unchanged
        to the parent implementation, so callers that pass them keep
        working; calling with only ``model`` and ``schema_editor`` behaves
        as before.
        """
        # - this Statement is instantiated by the _create_index_sql()
        #   method of django.db.backends.base.schema.BaseDatabaseSchemaEditor.
        #   using sql_create_index template from
        #   django.db.backends.postgresql.schema.DatabaseSchemaEditor
        # - the template has original value:
        #   "CREATE INDEX %(name)s ON %(table)s%(using)s (%(columns)s)%(extra)s"
        statement = super().create_sql(model, schema_editor, using=using, **kwargs)
        # - however, we want to use a GIST index to accelerate trigram
        #   matching, so we want to add the gist_trgm_ops index operator
        #   class
        # - so we replace the template with:
        #   "CREATE INDEX %(name)s ON %(table)s%(using)s (%(columns)s gist_trgm_ops)%(extra)s"
        # NOTE: the operator class follows the whole column list, so with
        # several fields only the last column receives it.
        statement.template = (
            "CREATE INDEX %(name)s ON %(table)s%(using)s "
            "(%(columns)s gist_trgm_ops)%(extra)s"
        )
        return statement
然后你可以像这样在模型类中使用它:
class YourModel(models.Model):
    """Example model that indexes ``some_field`` with ``GistIndexTrgrmOps``."""

    some_field = models.TextField(...)

    class Meta:
        indexes = [GistIndexTrgrmOps(fields=['some_field'])]
要使 Django 2.2 使用索引进行 icontains
和类似搜索:
子类 GinIndex 以创建不区分大小写的索引(大写所有字段值):
from django.contrib.postgres.indexes import GinIndex
class UpperGinIndex(GinIndex):
    """GIN index built on ``UPPER(column)`` instead of the raw column."""

    def create_sql(self, model, schema_editor, using=''):
        """Return the parent's statement with column quoting wrapped in UPPER()."""
        sql = super().create_sql(model, schema_editor, using=using)
        columns = sql.parts['columns']
        # Keep the original quoting function so the wrapper can delegate to it.
        plain_quote = columns.quote_name
        columns.quote_name = lambda column: f'UPPER({plain_quote(column)})'
        return sql
像这样将索引添加到您的模型中,包括使用 opclasses 时必需的关键字参数 name:
class MyModel(Model):
    """Example model whose ``name`` is indexed through ``UpperGinIndex``."""

    name = TextField(...)

    class Meta:
        indexes = [
            UpperGinIndex(
                fields=['name'],
                name='mymodel_name_gintrgm',
                opclasses=['gin_trgm_ops'],
            ),
        ]
生成迁移并编辑生成的文件:
# Generated by Django 2.2.3 on 2019-07-15 10:46
from django.contrib.postgres.operations import TrigramExtension # <<< add this
from django.db import migrations
import myapp.models
class Migration(migrations.Migration):
    """Run ``TrigramExtension`` and then add the ``UpperGinIndex`` to ``mymodel``."""

    operations = [
        TrigramExtension(),  # <<< add this
        migrations.AddIndex(
            model_name='mymodel',
            index=myapp.models.UpperGinIndex(
                fields=['name'],
                name='mymodel_name_gintrgm',
                opclasses=['gin_trgm_ops'],
            ),
        ),
    ]
这已经有了答案,但在 Django 2.2 中你可以更容易地做到这一点:
class MyModel(models.Model):
    """Example model with a GiST index on ``name`` using ``gist_trgm_ops``."""

    name = models.TextField()

    class Meta:
        indexes = [
            GistIndex(
                name="gist_trgm_idx",
                fields=("name",),
                opclasses=("gist_trgm_ops",),
            ),
        ]
或者您可以使用 GinIndex
。