"nearest" 插值方法的 NumPy 百分位数和 TensorFlow 百分位数的不同结果
Different results with NumPy percentile and TensorFlow percentile for "nearest" interpolation method
我注意到,尽管 NumPy 的 numpy.percentile 和 TensorFlow Probability 的 tfp.stats.percentile 对 "nearest" 插值方法给出了相同的文档字符串说明:
This optional parameter specifies the interpolation method to use when the desired percentile lies between two data points i < j:
...
‘nearest’: i or j, whichever is nearest.
但它们给出了不同的结果。下面用一个最小工作示例来说明我的意思。
环境
$ "$(which python3)" --version
Python 3.7.5
$ python3 -m venv "${HOME}/.venvs/question"
$ . "${HOME}/.venvs/question/bin/activate"
(question) $ cat requirements.txt
numpy~=1.18
tensorflow~=2.1
tensorflow-probability~=0.9
black
(question) $ python -m pip install -r requirements.txt
代码
# question.py
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
def main():
    """Compare NumPy and TFP 50th percentiles on one small 2x3 array.

    Prints the flattened input, then the "linear" and "nearest" results from
    ``np.percentile``, followed by the same two from ``tfp.stats.percentile``.
    """
    a = np.array([[10.0, 7.0, 4.0], [3.0, 2.0, 1.0]])
    q = 50
    print(f"Flattened array: {a.flatten()}")

    # NOTE(review): newer NumPy releases spell this keyword ``method=``;
    # ``interpolation=`` matches the numpy~=1.18 pin this script was run with.
    print("NumPy:")
    print(f"\t{q}th percentile (linear): {np.percentile(a, q, interpolation='linear')}")
    print(f"\t{q}th percentile (nearest): {np.percentile(a, q, interpolation='nearest')}")

    # Feed the very same values to TFP so only the library differs.
    b = tf.convert_to_tensor(a)
    print("TensorFlow:")
    print(f"\t{q}th percentile (linear): {tfp.stats.percentile(b, q, interpolation='linear')}")
    print(f"\t{q}th percentile (nearest): {tfp.stats.percentile(b, q, interpolation='nearest')}")


if __name__ == '__main__':
    main()
运行后,"nearest" 插值方法给出了不同的结果:
(question) $ python question.py
Flattened array: [10. 7. 4. 3. 2. 1.]
NumPy:
50th percentile (linear): 3.5
50th percentile (nearest): 3.0
TensorFlow:
50th percentile (linear): 3.5
50th percentile (nearest): 4.0
在翻阅了 numpy.percentile 所调用函数的 NumPy v1.18.2 源码之后,我仍然不明白原因。这似乎是由舍入方式的选择造成的(因为 NumPy 使用 numpy.around,而 TFP 使用 tf.round)。
谁能给我解释一下是什么导致了这种差异?我想为这个函数写一个垫片(shim),但需要先弄清楚它的返回行为。
逐步跟踪两者的源码后发现,这似乎并不是我最初以为的舍入问题,而是因为 numpy.percentile 在升序排序的 ndarray 上进行最终取值,而 tfp.stats.percentile 则在降序排序的张量上进行。
# answer.py
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability.python.internal import tensorshape_util
from tensorflow_probability.python.internal import distribution_util
def numpy_src(input, q, axis=0, out=None):
    """Replay the "nearest" path of NumPy 1.18's percentile step by step.

    Local names (``ap``, ``Nx``, ``indices``) are kept as in the walkthrough
    so each line can be compared against the NumPy source. The trailing
    comments show the intermediate values for the example input
    ``[[10, 7, 4], [3, 2, 1]]`` with ``q=50``.

    Args:
        input: ndarray to take the percentile of. It is flattened into a
            copy first, so the caller's array is not modified. (The name
            shadows the builtin but is kept so existing callers still work.)
        q: Percentile on a 0-100 scale; it is divided by 100 internally.
        axis: Axis of the flattened (1-D) copy used for the shape lookup,
            the partition and the final take.
        out: Optional output array forwarded to ``np.take``.

    Returns:
        The selected element. It is also printed, as before.
    """
    a = input
    q = np.true_divide(q, 100)  # 0.5
    q = np.asanyarray(q)  # array(0.5)
    q = q[None]  # array([0.5])

    ap = a.flatten()  # array([10., 7., 4., 3., 2., 1.])
    Nx = ap.shape[axis]  # 6
    indices = q * (Nx - 1)  # array([2.5])
    # np.around rounds halves to the nearest even value, so 2.5 -> 2.
    indices = np.around(indices).astype(np.intp)  # array([2])

    # Partitioning puts the k-th smallest element at index k, i.e. the
    # lookup below happens on ascending order.
    ap.partition(indices, axis=axis)  # array([ 1., 2., 3., 4., 7., 10.])
    indices = indices[0]  # 2
    r = np.take(ap, indices, axis=axis, out=out)  # 3.0

    print(f"Result of np.percentile source: {r}")
    return r
def tensorflow_src(input, q=50, axis=None):
    """Replay the "nearest" path of ``tfp.stats.percentile`` step by step.

    Only the flattened (``axis=None``) case with ``interpolation="nearest"``
    is traced. The trailing comments show the intermediate values for the
    example tensor ``[[10, 7, 4], [3, 2, 1]]`` with ``q=50``.

    Args:
        input: Tensor to take the percentile of; it is reshaped to 1-D.
            (The name shadows the builtin but is kept so existing callers
            still work.)
        q: Percentile on a 0-100 scale; it is cast to float64.
        axis: Must be ``None``.

    Returns:
        The gathered value after ``rotate_transpose``. It is also printed,
        as before.

    Raises:
        NotImplementedError: If ``axis`` is not ``None``.
    """
    if axis is not None:
        # Fail loudly instead of continuing with `y` never assigned.
        raise NotImplementedError("tensorflow_src only traces the axis=None path")

    q = tf.cast(q, tf.float64)  # tf.Tensor(50.0, shape=(), dtype=float64)
    y = tf.reshape(input, [-1])  # tf.Tensor([10. 7. 4. 3. 2. 1.], shape=(6,), dtype=float64)
    frac_at_q_or_above = 1.0 - q / 100.0  # tf.Tensor(0.5, shape=(), dtype=float64)

    # _sort_tensor(y)
    # N.B. Here is the difference. top_k returns the values in descending
    # order and that order is never changed afterwards.
    sorted_y, _ = tf.math.top_k(
        y, k=tf.shape(y)[-1]
    )  # tf.Tensor([10. 7. 4. 3. 2. 1.], shape=(6,), dtype=float64), _
    tensorshape_util.set_shape(sorted_y, y.shape)  # sorted_y values unchanged
    d = tf.cast(tf.shape(y)[-1], tf.float64)  # tf.Tensor(6.0, shape=(), dtype=float64)

    # _get_indices(interpolation)
    indices = tf.round((d - 1) * frac_at_q_or_above)  # tf.Tensor(2.0, shape=(), dtype=float64)
    indices = tf.clip_by_value(
        tf.cast(indices, tf.int32), 0, tf.shape(y)[-1] - 1
    )  # tf.Tensor(2, shape=(), dtype=int32)

    # N.B. The sort order here is descending, causing a difference: index 2
    # picks the third-largest value rather than the third-smallest.
    gathered_y = tf.gather(sorted_y, indices, axis=-1)  # tf.Tensor(4.0, shape=(), dtype=float64)
    result = distribution_util.rotate_transpose(gathered_y, tf.rank(q))  # 4.0

    print(f"Result of tf.percentile source: {result}")
    return result
def main():
    """Run both source walkthroughs on the same 2x3 example with q=50."""
    example = np.array([[10.0, 7.0, 4.0], [3.0, 2.0, 1.0]])
    numpy_src(example, q=50)

    # Convert the identical values so only the library under trace differs.
    example_tensor = tf.convert_to_tensor(example)
    tensorflow_src(example_tensor, q=50)


if __name__ == "__main__":
    main()
运行后输出:
$ python answer.py
Result of np.percentile source: 3.0
Result of tf.percentile source: 4.0
如果改为在 TensorFlow Probability 的 percentile 中加入以下内容,使取值时的排序顺序变为升序:
# Flip the last axis so the values run in ascending order before indexing.
sorted_y = tf.reverse(
sorted_y, [-1]
) # tf.Tensor([ 1. 2. 3. 4. 7. 10.], shape=(6,), dtype=float64)
那么这两个结果是一样的
$ python answer.py
Result of np.percentile source: 3.0
Result of tf.percentile source: 3.0
鉴于 TensorFlow Probability 的 docstring 说
Given a vector x, the q-th percentile of x is the value q / 100 of the way from the minimum to the maximum in a sorted copy of x.
这种行为似乎是错误的,因为它给出了相反的结果。我已经提交了 TensorFlow Probability Issue 864 来讨论这个问题。
我注意到,尽管 NumPy 的 numpy.percentile 和 TensorFlow Probability 的 tfp.stats.percentile 对 "nearest" 插值方法给出了相同的文档字符串说明:
This optional parameter specifies the interpolation method to use when the desired percentile lies between two data points i < j:
...
‘nearest’: i or j, whichever is nearest.
但它们给出了不同的结果。下面用一个最小工作示例来说明我的意思。
环境
$ "$(which python3)" --version
Python 3.7.5
$ python3 -m venv "${HOME}/.venvs/question"
$ . "${HOME}/.venvs/question/bin/activate"
(question) $ cat requirements.txt
numpy~=1.18
tensorflow~=2.1
tensorflow-probability~=0.9
black
(question) $ python -m pip install -r requirements.txt
代码
# question.py
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
def main():
    """Compare NumPy and TFP 50th percentiles on one small 2x3 array.

    Prints the flattened input, then the "linear" and "nearest" results from
    ``np.percentile``, followed by the same two from ``tfp.stats.percentile``.
    """
    a = np.array([[10.0, 7.0, 4.0], [3.0, 2.0, 1.0]])
    q = 50
    print(f"Flattened array: {a.flatten()}")

    # NOTE(review): newer NumPy releases spell this keyword ``method=``;
    # ``interpolation=`` matches the numpy~=1.18 pin this script was run with.
    print("NumPy:")
    print(f"\t{q}th percentile (linear): {np.percentile(a, q, interpolation='linear')}")
    print(f"\t{q}th percentile (nearest): {np.percentile(a, q, interpolation='nearest')}")

    # Feed the very same values to TFP so only the library differs.
    b = tf.convert_to_tensor(a)
    print("TensorFlow:")
    print(f"\t{q}th percentile (linear): {tfp.stats.percentile(b, q, interpolation='linear')}")
    print(f"\t{q}th percentile (nearest): {tfp.stats.percentile(b, q, interpolation='nearest')}")


if __name__ == '__main__':
    main()
运行后,"nearest" 插值方法给出了不同的结果:
(question) $ python question.py
Flattened array: [10. 7. 4. 3. 2. 1.]
NumPy:
50th percentile (linear): 3.5
50th percentile (nearest): 3.0
TensorFlow:
50th percentile (linear): 3.5
50th percentile (nearest): 4.0
在翻阅了 numpy.percentile 所调用函数的 NumPy v1.18.2 源码之后,我仍然不明白原因。这似乎是由舍入方式的选择造成的(因为 NumPy 使用 numpy.around,而 TFP 使用 tf.round)。
谁能给我解释一下是什么导致了这种差异?我想为这个函数写一个垫片(shim),但需要先弄清楚它的返回行为。
逐步跟踪两者的源码后发现,这似乎并不是我最初以为的舍入问题,而是因为 numpy.percentile 在升序排序的 ndarray 上进行最终取值,而 tfp.stats.percentile 则在降序排序的张量上进行。
# answer.py
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability.python.internal import tensorshape_util
from tensorflow_probability.python.internal import distribution_util
def numpy_src(input, q, axis=0, out=None):
    """Replay the "nearest" path of NumPy 1.18's percentile step by step.

    Local names (``ap``, ``Nx``, ``indices``) are kept as in the walkthrough
    so each line can be compared against the NumPy source. The trailing
    comments show the intermediate values for the example input
    ``[[10, 7, 4], [3, 2, 1]]`` with ``q=50``.

    Args:
        input: ndarray to take the percentile of. It is flattened into a
            copy first, so the caller's array is not modified. (The name
            shadows the builtin but is kept so existing callers still work.)
        q: Percentile on a 0-100 scale; it is divided by 100 internally.
        axis: Axis of the flattened (1-D) copy used for the shape lookup,
            the partition and the final take.
        out: Optional output array forwarded to ``np.take``.

    Returns:
        The selected element. It is also printed, as before.
    """
    a = input
    q = np.true_divide(q, 100)  # 0.5
    q = np.asanyarray(q)  # array(0.5)
    q = q[None]  # array([0.5])

    ap = a.flatten()  # array([10., 7., 4., 3., 2., 1.])
    Nx = ap.shape[axis]  # 6
    indices = q * (Nx - 1)  # array([2.5])
    # np.around rounds halves to the nearest even value, so 2.5 -> 2.
    indices = np.around(indices).astype(np.intp)  # array([2])

    # Partitioning puts the k-th smallest element at index k, i.e. the
    # lookup below happens on ascending order.
    ap.partition(indices, axis=axis)  # array([ 1., 2., 3., 4., 7., 10.])
    indices = indices[0]  # 2
    r = np.take(ap, indices, axis=axis, out=out)  # 3.0

    print(f"Result of np.percentile source: {r}")
    return r
def tensorflow_src(input, q=50, axis=None):
    """Replay the "nearest" path of ``tfp.stats.percentile`` step by step.

    Only the flattened (``axis=None``) case with ``interpolation="nearest"``
    is traced. The trailing comments show the intermediate values for the
    example tensor ``[[10, 7, 4], [3, 2, 1]]`` with ``q=50``.

    Args:
        input: Tensor to take the percentile of; it is reshaped to 1-D.
            (The name shadows the builtin but is kept so existing callers
            still work.)
        q: Percentile on a 0-100 scale; it is cast to float64.
        axis: Must be ``None``.

    Returns:
        The gathered value after ``rotate_transpose``. It is also printed,
        as before.

    Raises:
        NotImplementedError: If ``axis`` is not ``None``.
    """
    if axis is not None:
        # Fail loudly instead of continuing with `y` never assigned.
        raise NotImplementedError("tensorflow_src only traces the axis=None path")

    q = tf.cast(q, tf.float64)  # tf.Tensor(50.0, shape=(), dtype=float64)
    y = tf.reshape(input, [-1])  # tf.Tensor([10. 7. 4. 3. 2. 1.], shape=(6,), dtype=float64)
    frac_at_q_or_above = 1.0 - q / 100.0  # tf.Tensor(0.5, shape=(), dtype=float64)

    # _sort_tensor(y)
    # N.B. Here is the difference. top_k returns the values in descending
    # order and that order is never changed afterwards.
    sorted_y, _ = tf.math.top_k(
        y, k=tf.shape(y)[-1]
    )  # tf.Tensor([10. 7. 4. 3. 2. 1.], shape=(6,), dtype=float64), _
    tensorshape_util.set_shape(sorted_y, y.shape)  # sorted_y values unchanged
    d = tf.cast(tf.shape(y)[-1], tf.float64)  # tf.Tensor(6.0, shape=(), dtype=float64)

    # _get_indices(interpolation)
    indices = tf.round((d - 1) * frac_at_q_or_above)  # tf.Tensor(2.0, shape=(), dtype=float64)
    indices = tf.clip_by_value(
        tf.cast(indices, tf.int32), 0, tf.shape(y)[-1] - 1
    )  # tf.Tensor(2, shape=(), dtype=int32)

    # N.B. The sort order here is descending, causing a difference: index 2
    # picks the third-largest value rather than the third-smallest.
    gathered_y = tf.gather(sorted_y, indices, axis=-1)  # tf.Tensor(4.0, shape=(), dtype=float64)
    result = distribution_util.rotate_transpose(gathered_y, tf.rank(q))  # 4.0

    print(f"Result of tf.percentile source: {result}")
    return result
def main():
    """Run both source walkthroughs on the same 2x3 example with q=50."""
    example = np.array([[10.0, 7.0, 4.0], [3.0, 2.0, 1.0]])
    numpy_src(example, q=50)

    # Convert the identical values so only the library under trace differs.
    example_tensor = tf.convert_to_tensor(example)
    tensorflow_src(example_tensor, q=50)


if __name__ == "__main__":
    main()
运行后输出:
$ python answer.py
Result of np.percentile source: 3.0
Result of tf.percentile source: 4.0
如果改为在 TensorFlow Probability 的 percentile 中加入以下内容,使取值时的排序顺序变为升序:
# Flip the last axis so the values run in ascending order before indexing.
sorted_y = tf.reverse(
sorted_y, [-1]
) # tf.Tensor([ 1. 2. 3. 4. 7. 10.], shape=(6,), dtype=float64)
那么这两个结果是一样的
$ python answer.py
Result of np.percentile source: 3.0
Result of tf.percentile source: 3.0
鉴于 TensorFlow Probability 的 docstring 说
Given a vector x, the q-th percentile of x is the value q / 100 of the way from the minimum to the maximum in a sorted copy of x.
这种行为似乎是错误的,因为它给出了相反的结果。我已经提交了 TensorFlow Probability Issue 864 来讨论这个问题。