Empty response while querying Elasticsearch multiple times
I wrote a script to fetch some data with nested queries using elasticsearch-dsl-py.
Everything worked fine until I added the while ids_left > 0 loop to fetch the data from Elasticsearch in chunks.
Now I only get data in response for the first chunk; for every subsequent chunk the response is empty, <Response: []>.
Why is that, and how can I get the response data for every chunk?
import re
import sys

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

ES_HOST = 'es0.dev.lombardia'
ES_PORT = 9200

data = {'organizations': [[{'db_name': u'lombardia0', 'id': 10}]], 'ids': ['726GZWQ65682D,506GBBO25953J,977ENPZ91770F']}

# Keep only the IDs that are plain alphanumeric strings.
ids = filter(
    lambda x: x != None,
    map(lambda x: x.strip() if re.match("^[a-zA-Z0-9_]*$", x.strip()) else None, data['ids'][0].split(','))
)
if len(ids) == 0:
    sys.exit("No valid IDs.")

organizations = data['organizations'][0]
total_num_of_ids = len(ids)
offset, chunk, ids_left = 0, 10, total_num_of_ids
root_path = 'Demographic_Details'

es = Elasticsearch(hosts=[{'host': ES_HOST, 'port': ES_PORT}])

for organization in organizations:
    index = 'logic_{0}'.format(organization['db_name'])
    while ids_left > 0:
        print('OFFSET %s' % str(offset))
        if (offset + chunk) <= total_num_of_ids:
            limit = offset + chunk
        else:
            limit = total_num_of_ids
        # Build a fresh nested bool/should query for the current chunk of IDs.
        search = Search(using=es).index(index).source(include=[root_path])
        q = Q('bool', must=[Q('nested', path=root_path, query=Q('bool', should=[], minimum_should_match=1))])
        search = search.query(q)
        for i in xrange(offset, limit):
            q = Q('match', **{'{0}.ID'.format(root_path): ids[i]})
            search.query.must[0].query.should.append(q)
        print(search.to_dict())
        search = search[offset:limit]
        response = search.execute()
        for hit in response:
            print(hit[root_path][0]['id'], hit[root_path][0]['match'])
        offset += chunk
        ids_left -= chunk
Printed output:
OFFSET 0
Query:
{'query': {'bool': {'must': [{'nested': {'path': 'Demographic_Details', 'query': {'bool': {'minimum_should_match': 1, 'should': [{'match': {'Demographic_Details.ID': u'726GZWQ65682D'}}, {'match': {'Demographic_Details.ID': u'506GBBO25953J'}}, {'match': {'Demographic_Details.ID': u'977ENPZ91770F'}}, {'match': {'Demographic_Details.ID': u'250GDPU44147B'}}, {'match': {'Demographic_Details.ID': u'528FAOH03019V'}}, {'match': {'Demographic_Details.ID': u'827GNXH29227B'}}, {'match': {'Demographic_Details.ID': u'836GWCX91596A'}}, {'match': {'Demographic_Details.ID': u'482VURG98816U'}}, {'match': {'Demographic_Details.ID': u'989VKQX13983W'}}, {'match': {'Demographic_Details.ID': u'900GJVU10735D'}}]}}}}]}}, '_source': {'include': ['Demographic_Details']}}
Response data:
-> for hit in response:
(Pdb) cont
(u'827GNXH29227B', u'Y')
(u'250GDPU44147B', u'Y')
(u'836GWCX91596A', u'Y')
(u'482VURG98816U', u'Y')
(u'977ENPZ91770F', u'Y')
(u'989VKQX13983W', u'Y')
(u'528FAOH03019V', u'Y')
(u'900GJVU10735D', u'Y')
(u'726GZWQ65682D', u'Y')
(u'506GBBO25953J', u'Y')
OFFSET 10
Query:
{'query': {'bool': {'must': [{'nested': {'path': 'Demographic_Details', 'query': {'bool': {'minimum_should_match': 1, 'should': [{'match': {'Demographic_Details.ID': u'731NBER88448A'}}, {'match': {'Demographic_Details.ID': u'963WLQD56637O'}}, {'match': {'Demographic_Details.ID': u'880RFWM18773C'}}, {'match': {'Demographic_Details.ID': u'037BASP48376D'}}, {'match': {'Demographic_Details.ID': u'554XZQP10563T'}}, {'match': {'Demographic_Details.ID': u'305KTYG96669R'}}, {'match': {'Demographic_Details.ID': u'056XZQI88874A'}}, {'match': {'Demographic_Details.ID': u'294OKUR30033G'}}, {'match': {'Demographic_Details.ID': u'404DDCN87823H'}}, {'match': {'Demographic_Details.ID': u'333UQAN69783V'}}]}}}}]}}, '_source': {'include': ['Demographic_Details']}}
There is a .scan() method that gives access to all matching documents. The search[offset:limit] slice is no longer needed: a separate query is already built for each chunk, and that query matches at most chunk documents, so slicing it by the global offset asked for a page that does not exist in that query's own result set, which is why every chunk after the first came back as <Response: []>. All I need is to fetch every result of each chunk's query.
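For completeness, keeping execute() would also have worked as long as each chunk is sliced from zero, since in elasticsearch-dsl the slice maps to Elasticsearch's from/size. A minimal sketch of that alternative, reusing the variables from the script above:

        # Alternative to scan(): keep execute(), but page from zero within
        # the chunk query's own result set (the slice sets from/size).
        search = search[0:chunk]
        response = search.execute()
        for hit in response:
            print(hit[root_path][0]['id'], hit[root_path][0]['match'])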
The code now looks like this:
...
while ids_left > 0:
    print('OFFSET %s' % str(offset))
    if (offset + chunk) <= total_num_of_ids:
        limit = offset + chunk
    else:
        limit = total_num_of_ids
    search = Search(using=es).index(index).source(include=[root_path])
    q = Q('bool', must=[Q('nested', path=root_path, query=Q('bool', should=[], minimum_should_match=1))])
    search = search.query(q)
    for i in xrange(offset, limit):
        q = Q('match', **{'{0}.ID'.format(root_path): ids[i]})
        search.query.must[0].query.should.append(q)
    print(search.to_dict())
    # scan() iterates over every document matching this chunk's query,
    # so no [offset:limit] slice is needed here.
    for hit in search.scan():
        print(hit[root_path][0]['id'], hit[root_path][0]['match'])
    offset += chunk
    ids_left -= chunk
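Note that scan() goes through the scroll API, so it streams every matching document, ignores any slice on the Search object, and does not sort hits by relevance. As a further, purely hypothetical simplification (only valid if Demographic_Details.ID is mapped as a keyword / not-analyzed field), the should-list of match clauses could be replaced by a single nested terms query per chunk:

# Hypothetical sketch: assumes Demographic_Details.ID is a keyword field,
# so an exact-value terms query can replace the analyzed match clauses.
chunk_ids = ids[offset:limit]
q = Q('nested', path=root_path,
      query=Q('terms', **{'{0}.ID'.format(root_path): chunk_ids}))
search = Search(using=es).index(index).source(include=[root_path]).query(q)
for hit in search.scan():
    print(hit[root_path][0]['id'], hit[root_path][0]['match'])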