优化 Python 中的嵌套 for 循环
Optimize a nested for loop in Python
我将数据作为键值对存储在句子的 leveldb database. The values are the laser 向量嵌入中,键是这些句子的意图。当输入一个新句子时,我将该句子的向量嵌入与 leveldb 数据库中的值进行比较,以识别意图。在这里,我使用了一个 嵌套的 for 循环 并且这需要超过 5 秒的时间来执行。有人可以建议一种优化此循环/代码段的方法吗?
expose.py
import plyvel
from flask import Flask
from flask_restful import Api
from laserembeddings import Laser
from getters.getIntents import *
from getters.getEntities import *
app = Flask(__name__)
api = Api(app)
si_data_vec = plyvel.DB('levelDB/si_data_vec', create_if_missing=False)
path_to_bpe_codes = 'data/laser_models/93langs.fcodes'
path_to_bpe_vocab = 'data/laser_models/93langs.fvocab'
path_to_encoder = 'data/laser_models/bilstm.93langs.2018-12-26.pt'
laser = Laser(path_to_bpe_codes, path_to_bpe_vocab, path_to_encoder)
@app.route('/lang/si/<keylist>', methods=['GET'])
def get_si(keylist):
intent = get_intents(keylist, si_data_vec, laser)
return intent
# Initialize and start the web application
if __name__ == "__main__":
app.run()
getIntents.py
这包含要优化的循环
import io
from itertools import combinations
import numpy as np
def get_intents(key_list, si_data_vec, laser):
avg = laser.embed_sentences([key_list], lang='si')[0]
minimum_dist = 1
intent = ''
### LOOP TO BE OPTIMIZED
for key, value in si_data_vec:
bio = io.BytesIO(value)
vec = np.load(bio)
for pair in combinations([avg, vec], 2):
dist = distance(list(pair[0]), list(pair[1]))
if dist < minimum_dist:
minimum_dist = dist
intent = key.decode()
return intent
def distance(list1, list2):
"""Distance between two vectors."""
squares = [(p-q) ** 2 for p, q in zip(list1, list2)]
return sum(squares) ** .5
Updated getIntents.py as per the comment
import io
import numpy as np
def get_intents(key_list, si_data_vec, laser):
avg = laser.embed_sentences([key_list], lang='si')[0]
minimum_dist = 1
intent = ''
for key, value in si_data_vec:
bio = io.BytesIO(value)
vec = np.load(bio)
dist = distance(avg, vec)
if dist < minimum_dist:
minimum_dist = dist
intent = key.decode()
return intent
def distance(list1, list2):
"""Distance between two vectors."""
squares = [(p-q) ** 2 for p, q in zip(list1, list2)]
return sum(squares) ** .5
我唯一能想到的就是使用 numpy
进行距离计算(反正你已经导入了 numpy);我不确定这是否会给你带来很大的加速。
avg = np.array(laser.embed_sentences([key_list], lang='si')[0])
for key, value in si_data_vec:
bio = io.BytesIO(value)
vec = np.load(bio)
dist = np.linalg.norm(avg-vec)
另见 How can the Euclidean distance be calculated with NumPy?
我将数据作为键值对存储在句子的 leveldb database. The values are the laser 向量嵌入中,键是这些句子的意图。当输入一个新句子时,我将该句子的向量嵌入与 leveldb 数据库中的值进行比较,以识别意图。在这里,我使用了一个 嵌套的 for 循环 并且这需要超过 5 秒的时间来执行。有人可以建议一种优化此循环/代码段的方法吗?
expose.py
import plyvel
from flask import Flask
from flask_restful import Api
from laserembeddings import Laser
from getters.getIntents import *
from getters.getEntities import *
app = Flask(__name__)
api = Api(app)
si_data_vec = plyvel.DB('levelDB/si_data_vec', create_if_missing=False)
path_to_bpe_codes = 'data/laser_models/93langs.fcodes'
path_to_bpe_vocab = 'data/laser_models/93langs.fvocab'
path_to_encoder = 'data/laser_models/bilstm.93langs.2018-12-26.pt'
laser = Laser(path_to_bpe_codes, path_to_bpe_vocab, path_to_encoder)
@app.route('/lang/si/<keylist>', methods=['GET'])
def get_si(keylist):
intent = get_intents(keylist, si_data_vec, laser)
return intent
# Initialize and start the web application
if __name__ == "__main__":
app.run()
getIntents.py
这包含要优化的循环
import io
from itertools import combinations
import numpy as np
def get_intents(key_list, si_data_vec, laser):
avg = laser.embed_sentences([key_list], lang='si')[0]
minimum_dist = 1
intent = ''
### LOOP TO BE OPTIMIZED
for key, value in si_data_vec:
bio = io.BytesIO(value)
vec = np.load(bio)
for pair in combinations([avg, vec], 2):
dist = distance(list(pair[0]), list(pair[1]))
if dist < minimum_dist:
minimum_dist = dist
intent = key.decode()
return intent
def distance(list1, list2):
"""Distance between two vectors."""
squares = [(p-q) ** 2 for p, q in zip(list1, list2)]
return sum(squares) ** .5
Updated getIntents.py as per the comment
import io
import numpy as np
def get_intents(key_list, si_data_vec, laser):
avg = laser.embed_sentences([key_list], lang='si')[0]
minimum_dist = 1
intent = ''
for key, value in si_data_vec:
bio = io.BytesIO(value)
vec = np.load(bio)
dist = distance(avg, vec)
if dist < minimum_dist:
minimum_dist = dist
intent = key.decode()
return intent
def distance(list1, list2):
"""Distance between two vectors."""
squares = [(p-q) ** 2 for p, q in zip(list1, list2)]
return sum(squares) ** .5
我唯一能想到的就是使用 numpy
进行距离计算(反正你已经导入了 numpy);我不确定这是否会给你带来很大的加速。
avg = np.array(laser.embed_sentences([key_list], lang='si')[0])
for key, value in si_data_vec:
bio = io.BytesIO(value)
vec = np.load(bio)
dist = np.linalg.norm(avg-vec)
另见 How can the Euclidean distance be calculated with NumPy?