Tensorflow:高效的多项式采样(Theano x50 更快?)
Tensorflow: Efficient multinomial sampling (Theano x50 faster?)
我希望能够非常高效地从多项式分布中进行采样,但显然我的 TensorFlow 代码非常……非常慢……
我的想法是:
- 一个向量:例如
counts = [40, 50, 26, ..., 19]
- 概率矩阵:
probs = [[0.1, ..., 0.5], ... [0.3, ..., 0.02]]
使得 np.sum(probs, axis=1) = 1
假设 len(counts) = N,且 probs 的形状为 (N, 50)。我想做的是(在我们的例子中):
- 从矩阵 probs 的第一个概率向量中采样 40 次
- 从矩阵 probs 的第二个概率向量中采样 50 次
- ...
- 从矩阵 probs 的第 N 个概率向量中采样 19 次
这样我的最终矩阵看起来像(例如):
A = [[22, ... 13], ..., [12, ..., 3]]
其中 np.sum(A, axis=1) == counts
(即每行的总和 = counts
向量对应行中的数字)
这是我的 TensorFlow 代码示例:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.distributions as ds
import time

nb_distribution = 100  # number of probability distributions
# Number of draws per distribution: 100 ints drawn from [2000, 3500).
counts = np.random.randint(2000, 3500, size=nb_distribution)
# print(counts[:40])  # should match print(np.sum(res, 1)[:40]) in the tf.Session()

# probsn is a matrix of probabilities:
# each row of probsn is a vector of size 30 that sums to 1
probsn = np.random.uniform(size=(nb_distribution, 30))
probsn /= np.sum(probsn, axis=1)[:, None]

# NOTE: from here on `counts` names the TF variable, not the NumPy array.
counts = tf.Variable(counts, dtype=tf.float32)
probs = tf.Variable(tf.convert_to_tensor(probsn.astype(np.float32)))

# sample from the multinomial
dist = ds.Multinomial(total_count=counts, probs=probs)
out = dist.sample()

# The timed region covers session creation, variable initialisation and the
# first run of the sampling op, not just the sampling itself.
start = time.time()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    res = sess.run(out)
    # print(np.sum(res, 1)[:40])
print(time.time() - start)
已用时间:0.12秒
我在 Theano 中的等效代码:
import time  # required by the timing code below; it was missing from this snippet

import numpy as np
import theano
from theano.tensor import _shared
from theano.tensor.shared_randomstreams import RandomStreams

nb_distribution = 100  # number of probability distributions
# Number of draws per distribution: 100 ints drawn from [2000, 3500).
counts = np.random.randint(2000, 3500, size=nb_distribution)
# print(counts[:40])  # should match print(np.sum(v_sample(), 1)[:40])
counts = _shared(counts)

# probsn is a matrix of probabilities:
# each row of probsn is a vector that sums to 1
probsn = np.random.uniform(size=(nb_distribution, 30))
probsn /= np.sum(probsn, axis=1)[:, None]
probsn = _shared(probsn)

np_rng = np.random.RandomState(12345)
theano_rng = RandomStreams(np_rng.randint(2 ** 30))
# The function is compiled here, outside the timed region; only the call
# to v_sample() below is measured.
v_sample = theano.function(
    inputs=[],
    outputs=theano_rng.multinomial(n=counts, pvals=probsn),
)

start_t = time.time()
out = np.sum(v_sample(), 1)[:40]
# print(out)
print(time.time() - start_t)
已用时间:0.0025秒
Theano 快了 100 倍...我的 TensorFlow 代码有问题吗?如何在 TensorFlow 中有效地从多项式分布中采样?
问题在于 TensorFlow 的 Multinomial 分布的 sample() 方法实际上调用了 _sample_n() 方法(其定义见 TensorFlow 源码)。从该方法的采样代码中可以看到,它为每一行生成一个 one_hot 矩阵,然后通过对行求和将该矩阵缩减为一个向量:
math_ops.reduce_sum(array_ops.one_hot(x, depth=k), axis=-2)
这样做效率低下,因为它占用了额外的内存。为了避免这种情况,我使用了 tf.scatter_nd 函数。下面是一个完整可运行的示例:
import tensorflow as tf
import numpy as np
import tensorflow.contrib.distributions as ds
import time

tf.reset_default_graph()

nb_distribution = 100  # number of probability distributions
# Number of draws per distribution: 100 ints drawn from [2000, 3500).
u = np.random.randint(2000, 3500, size=nb_distribution)

# probsn is a matrix of probabilities:
# each row of probsn is a vector of size 30 that sums to 1
probsn = np.random.uniform(size=(nb_distribution, 30))
probsn /= np.sum(probsn, axis=1)[:, None]

counts = tf.Variable(u, dtype=tf.float32)
probs = tf.Variable(tf.convert_to_tensor(probsn.astype(np.float32)))

# sample from the multinomial
dist = ds.Multinomial(total_count=counts, probs=probs)
out = dist.sample()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Warm-up run: without it the timed run below is slower.
    res = sess.run(out)
    start = time.time()
    res = sess.run(out)
    print(time.time() - start)
    # Sanity check: every row of the sample must sum to its requested count.
    print(np.all(u == np.sum(res, axis=1)))
这段代码用了 0.05 秒来计算
def vmultinomial_sampling(counts, pvals, seed=None):
    """Draw one multinomial count vector per row of ``pvals``.

    Row ``i`` of the result holds how many of the ``counts[i]`` draws landed
    in each category when sampling from the probabilities ``pvals[i]``.
    Category indices are accumulated with ``tf.scatter_nd`` rather than by
    summing a one-hot matrix.

    Args:
        counts: 1-D sequence/tensor giving the number of draws for each row.
        pvals: 2-D array/tensor of probabilities, one distribution per row.
            Entries are passed through ``tf.log``, so a zero probability
            becomes a ``-inf`` logit.
        seed: optional seed forwarded to ``tf.multinomial``.

    Returns:
        A float32 tensor with one row per entry of ``counts`` and
        ``tf.shape(pvals)[1]`` columns.
    """
    k = tf.shape(pvals)[1]
    # tf.multinomial expects a 2-D batch of logits, so give every row its
    # own leading batch axis of size 1 before mapping over the rows.
    logits = tf.expand_dims(tf.log(pvals), 1)

    def sample_single(args):
        """Sample ``n_draw_`` category indices for one row and histogram them."""
        logits_, n_draw_ = args[0], args[1]
        x = tf.multinomial(logits_, n_draw_, seed=seed)
        indices = tf.cast(tf.reshape(x, [-1, 1]), tf.int32)
        updates = tf.ones(n_draw_)  # tf.shape(indices)[0]
        # scatter_nd sums the updates that share an index, which yields the
        # per-category counts directly.
        return tf.scatter_nd(indices, updates, [k])

    x = tf.map_fn(sample_single, [logits, counts], dtype=tf.float32)

    return x
# Build the sampling op from the NumPy inputs u (draw counts) and probsn
# (row-wise probabilities).
xx = vmultinomial_sampling(u, probsn)
# check = tf.expand_dims(counts, 1) * probs
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Warm-up run: without it the timed run below is slower.
    res = sess.run(xx)
    start_t = time.time()
    res = sess.run(xx)
    print(time.time() - start_t)
    # print(np.sum(res, axis=1))
    # Sanity check: every row of the sample must sum to its requested count.
    print(np.all(u == np.sum(res, axis=1)))
这段代码用了 0.016 秒
缺点是我的代码实际上并没有并行化计算(尽管 map_fn 的 parallel_iterations 参数默认为 10,把它设置为 1 也不会带来任何变化……)。
也许有人能找到更好的方案,因为与 Theano 的实现相比它仍然很慢(因为它没有利用并行化……而在这里并行化是有意义的,因为各行的采样彼此独立)。
我希望能够非常高效地从多项式分布中进行采样,但显然我的 TensorFlow 代码非常……非常慢……
我的想法是:
- 一个向量:例如
counts = [40, 50, 26, ..., 19]
- 概率矩阵:
probs = [[0.1, ..., 0.5], ... [0.3, ..., 0.02]]
使得np.sum(probs, axis=1) = 1
假设 len(counts) = N,且 probs 的形状为 (N, 50)。我想做的是(在我们的例子中):
- 从矩阵 probs 的第一个概率向量中采样 40 次
- 从矩阵 probs 的第二个概率向量中采样 50 次
- ...
- 从矩阵 probs 的第 N 个概率向量中采样 19 次
这样我的最终矩阵看起来像(例如):
A = [[22, ... 13], ..., [12, ..., 3]]
其中 np.sum(A, axis=1) == counts
(即每行的总和 = counts
向量对应行中的数字)
这是我的 TensorFlow 代码示例:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.distributions as ds
import time

nb_distribution = 100  # number of probability distributions
# Number of draws per distribution: 100 ints drawn from [2000, 3500).
counts = np.random.randint(2000, 3500, size=nb_distribution)
# print(counts[:40])  # should match print(np.sum(res, 1)[:40]) in the tf.Session()

# probsn is a matrix of probabilities:
# each row of probsn is a vector of size 30 that sums to 1
probsn = np.random.uniform(size=(nb_distribution, 30))
probsn /= np.sum(probsn, axis=1)[:, None]

# NOTE: from here on `counts` names the TF variable, not the NumPy array.
counts = tf.Variable(counts, dtype=tf.float32)
probs = tf.Variable(tf.convert_to_tensor(probsn.astype(np.float32)))

# sample from the multinomial
dist = ds.Multinomial(total_count=counts, probs=probs)
out = dist.sample()

# The timed region covers session creation, variable initialisation and the
# first run of the sampling op, not just the sampling itself.
start = time.time()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    res = sess.run(out)
    # print(np.sum(res, 1)[:40])
print(time.time() - start)
已用时间:0.12秒
我在 Theano 中的等效代码:
import time  # required by the timing code below; it was missing from this snippet

import numpy as np
import theano
from theano.tensor import _shared
from theano.tensor.shared_randomstreams import RandomStreams

nb_distribution = 100  # number of probability distributions
# Number of draws per distribution: 100 ints drawn from [2000, 3500).
counts = np.random.randint(2000, 3500, size=nb_distribution)
# print(counts[:40])  # should match print(np.sum(v_sample(), 1)[:40])
counts = _shared(counts)

# probsn is a matrix of probabilities:
# each row of probsn is a vector that sums to 1
probsn = np.random.uniform(size=(nb_distribution, 30))
probsn /= np.sum(probsn, axis=1)[:, None]
probsn = _shared(probsn)

np_rng = np.random.RandomState(12345)
theano_rng = RandomStreams(np_rng.randint(2 ** 30))
# The function is compiled here, outside the timed region; only the call
# to v_sample() below is measured.
v_sample = theano.function(
    inputs=[],
    outputs=theano_rng.multinomial(n=counts, pvals=probsn),
)

start_t = time.time()
out = np.sum(v_sample(), 1)[:40]
# print(out)
print(time.time() - start_t)
已用时间:0.0025秒
Theano 快了 100 倍...我的 TensorFlow 代码有问题吗?如何在 TensorFlow 中有效地从多项式分布中采样?
问题在于 TensorFlow 的 Multinomial 分布的 sample() 方法实际上调用了 _sample_n() 方法(其定义见 TensorFlow 源码)。从该方法的采样代码中可以看到,它为每一行生成一个 one_hot 矩阵,然后通过对行求和将该矩阵缩减为一个向量:
math_ops.reduce_sum(array_ops.one_hot(x, depth=k), axis=-2)
这样做效率低下,因为它占用了额外的内存。为了避免这种情况,我使用了 tf.scatter_nd 函数。下面是一个完整可运行的示例:
import tensorflow as tf
import numpy as np
import tensorflow.contrib.distributions as ds
import time

tf.reset_default_graph()

nb_distribution = 100  # number of probability distributions
# Number of draws per distribution: 100 ints drawn from [2000, 3500).
u = np.random.randint(2000, 3500, size=nb_distribution)

# probsn is a matrix of probabilities:
# each row of probsn is a vector of size 30 that sums to 1
probsn = np.random.uniform(size=(nb_distribution, 30))
probsn /= np.sum(probsn, axis=1)[:, None]

counts = tf.Variable(u, dtype=tf.float32)
probs = tf.Variable(tf.convert_to_tensor(probsn.astype(np.float32)))

# sample from the multinomial
dist = ds.Multinomial(total_count=counts, probs=probs)
out = dist.sample()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Warm-up run: without it the timed run below is slower.
    res = sess.run(out)
    start = time.time()
    res = sess.run(out)
    print(time.time() - start)
    # Sanity check: every row of the sample must sum to its requested count.
    print(np.all(u == np.sum(res, axis=1)))
这段代码用了 0.05 秒来计算
def vmultinomial_sampling(counts, pvals, seed=None):
    """Draw one multinomial count vector per row of ``pvals``.

    Row ``i`` of the result holds how many of the ``counts[i]`` draws landed
    in each category when sampling from the probabilities ``pvals[i]``.
    Category indices are accumulated with ``tf.scatter_nd`` rather than by
    summing a one-hot matrix.

    Args:
        counts: 1-D sequence/tensor giving the number of draws for each row.
        pvals: 2-D array/tensor of probabilities, one distribution per row.
            Entries are passed through ``tf.log``, so a zero probability
            becomes a ``-inf`` logit.
        seed: optional seed forwarded to ``tf.multinomial``.

    Returns:
        A float32 tensor with one row per entry of ``counts`` and
        ``tf.shape(pvals)[1]`` columns.
    """
    k = tf.shape(pvals)[1]
    # tf.multinomial expects a 2-D batch of logits, so give every row its
    # own leading batch axis of size 1 before mapping over the rows.
    logits = tf.expand_dims(tf.log(pvals), 1)

    def sample_single(args):
        """Sample ``n_draw_`` category indices for one row and histogram them."""
        logits_, n_draw_ = args[0], args[1]
        x = tf.multinomial(logits_, n_draw_, seed=seed)
        indices = tf.cast(tf.reshape(x, [-1, 1]), tf.int32)
        updates = tf.ones(n_draw_)  # tf.shape(indices)[0]
        # scatter_nd sums the updates that share an index, which yields the
        # per-category counts directly.
        return tf.scatter_nd(indices, updates, [k])

    x = tf.map_fn(sample_single, [logits, counts], dtype=tf.float32)

    return x
# Build the sampling op from the NumPy inputs u (draw counts) and probsn
# (row-wise probabilities).
xx = vmultinomial_sampling(u, probsn)
# check = tf.expand_dims(counts, 1) * probs
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Warm-up run: without it the timed run below is slower.
    res = sess.run(xx)
    start_t = time.time()
    res = sess.run(xx)
    print(time.time() - start_t)
    # print(np.sum(res, axis=1))
    # Sanity check: every row of the sample must sum to its requested count.
    print(np.all(u == np.sum(res, axis=1)))
这段代码用了 0.016 秒
缺点是我的代码实际上并没有并行化计算(尽管 map_fn 的 parallel_iterations 参数默认为 10,把它设置为 1 也不会带来任何变化……)。
也许有人能找到更好的方案,因为与 Theano 的实现相比它仍然很慢(因为它没有利用并行化……而在这里并行化是有意义的,因为各行的采样彼此独立)。