scipy gaussian_kde 和循环数据
scipy gaussian_kde and circular data
我正在使用 scipys gaussian_kde 来获取一些双峰数据的概率密度。但是,由于我的数据是 angular(以度为单位的方向),当值出现在极限附近时,我会遇到问题。下面的代码给出了两个 kde 示例,当域为 0-360 时,它估计不足,因为它无法处理数据的循环性质。 pdf 需要在单位圆上定义,但我在 scipy.stats 中找不到适合此类数据的任何内容(那里有 von mises 分布,但仅适用于单峰数据)。有人 运行 以前参加过这个吗?是否有任何东西(最好基于 python)可用于估计单位圆上的双峰 pdf?
import numpy as np
import scipy as sp
from pylab import plot,figure,subplot,show,hist
from scipy import stats
baz = np.array([-92.29061004, -85.42607874, -85.42607874, -70.01689348,
-63.43494882, -63.43494882, -70.01689348, -70.01689348,
-59.93141718, -63.43494882, -59.93141718, -63.43494882,
-63.43494882, -63.43494882, -57.52880771, -53.61564818,
-57.52880771, -63.43494882, -63.43494882, -92.29061004,
-16.92751306, -99.09027692, -99.09027692, -16.92751306,
-99.09027692, -16.92751306, -9.86580694, -8.74616226,
-9.86580694, -8.74616226, -8.74616226, -2.20259816,
-2.20259816, -2.20259816, -9.86580694, -2.20259816,
-2.48955292, -2.48955292, -2.48955292, -2.48955292,
4.96974073, 4.96974073, 4.96974073, 4.96974073,
-2.48955292, -2.48955292, -2.48955292, -2.48955292,
-2.48955292, -9.86580694, -9.86580694, -9.86580694,
-16.92751306, -19.29004622, -19.29004622, -26.56505118,
-19.29004622, -19.29004622, -19.29004622, -19.29004622])
xx = np.linspace(-180, 180, 181)
scipy_kde = stats.gaussian_kde(baz)
print scipy_kde.integrate_box_1d(-180,180)
figure()
plot(xx, scipy_kde(xx), c='green')
baz[baz<0] += 360
xx = np.linspace(0, 360, 181)
scipy_kde = stats.gaussian_kde(baz)
print scipy_kde.integrate_box_1d(-180,180)
plot(xx, scipy_kde(xx), c='red')
所以我有一个我认为合理的解决方案。基本上,我使用 Von Mises 分布作为核密度估计的基函数。代码如下,以防对其他人有用。
def vonmises_KDE(data, kappa, plot=None):
"""
Create a kernal densisity estimate of circular data using the von mises
distribution as the basis function.
"""
# imports
from scipy.stats import vonmises
from scipy.interpolate import interp1d
# convert to radians
data = np.radians(data)
# set limits for von mises
vonmises.a = -np.pi
vonmises.b = np.pi
x_data = np.linspace(-np.pi, np.pi, 100)
kernels = []
for d in data:
# Make the basis function as a von mises PDF
kernel = vonmises(kappa, loc=d)
kernel = kernel.pdf(x_data)
kernels.append(kernel)
if plot:
# For plotting
kernel /= kernel.max()
kernel *= .2
plt.plot(x_data, kernel, "grey", alpha=.5)
vonmises_kde = np.sum(kernels, axis=0)
vonmises_kde = vonmises_kde / np.trapz(vonmises_kde, x=x_data)
f = interp1d( x_data, vonmises_kde )
if plot:
plt.plot(x_data, vonmises_kde, c='red')
return x_data, vonmises_kde, f
baz = np.array([-92.29061004, -85.42607874, -85.42607874, -70.01689348,
-63.43494882, -63.43494882, -70.01689348, -70.01689348,
-59.93141718, -63.43494882, -59.93141718, -63.43494882,
-63.43494882, -63.43494882, -57.52880771, -53.61564818,
-57.52880771, -63.43494882, -63.43494882, -92.29061004,
-16.92751306, -99.09027692, -99.09027692, -16.92751306,
-99.09027692, -16.92751306, -9.86580694, -8.74616226,
-9.86580694, -8.74616226, -8.74616226, -2.20259816,
-2.20259816, -2.20259816, -9.86580694, -2.20259816,
-2.48955292, -2.48955292, -2.48955292, -2.48955292,
4.96974073, 4.96974073, 4.96974073, 4.96974073,
-2.48955292, -2.48955292, -2.48955292, -2.48955292,
-2.48955292, -9.86580694, -9.86580694, -9.86580694,
-16.92751306, -19.29004622, -19.29004622, -26.56505118,
-19.29004622, -19.29004622, -19.29004622, -19.29004622])
kappa = 12
x_data, vonmises_kde, f = vonmises_KDE(baz, kappa, plot=1)
Dave 的回答不正确,因为 scipy
的 vonmises
没有环绕 [-pi, pi]
.
您可以使用下面的代码,它基于相同的原理。它基于 numpy.
中描述的等式
def vonmises_kde(data, kappa, n_bins=100):
from scipy.special import i0
bins = np.linspace(-np.pi, np.pi, n_bins)
x = np.linspace(-np.pi, np.pi, n_bins)
# integrate vonmises kernels
kde = np.exp(kappa*np.cos(x[:, None]-data[None, :])).sum(1)/(2*np.pi*i0(kappa))
kde /= np.trapz(kde, x=bins)
return bins, kde
这是一个例子
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import vonmises
# generate complex circular distribution
data = np.r_[vonmises(-1, 5, 1000), vonmises(2, 10, 500), vonmises(3, 20, 100)]
# plot data histogram
fig, axes = plt.subplots(2, 1)
axes[0].hist(data, 100)
# plot kernel density estimates
x, kde = vonmises_kde(data, 20)
axes[1].plot(x, kde)
Histogram and kernel density plots
这是对@kingjr 更准确答案的快速近似:
def vonmises_pdf(x, mu, kappa):
return np.exp(kappa * np.cos(x - mu)) / (2. * np.pi * scipy.special.i0(kappa))
def vonmises_fft_kde(data, kappa, n_bins):
bins = np.linspace(-np.pi, np.pi, n_bins + 1, endpoint=True)
hist_n, bin_edges = np.histogram(data, bins=bins)
bin_centers = np.mean([bin_edges[1:], bin_edges[:-1]], axis=0)
kernel = vonmises_pdf(
x=bin_centers,
mu=0,
kappa=kappa
)
kde = np.fft.fftshift(np.fft.irfft(np.fft.rfft(kernel) * np.fft.rfft(hist_n)))
kde /= np.trapz(kde, x=bin_centers)
return bin_centers, kde
测试(使用tqdm做进度条和计时,matplotlib验证结果):
import numpy as np
from tqdm import tqdm
import scipy.stats
import matplotlib.pyplot as plt
n_runs = 1000
n_bins = 100
kappa = 10
for _ in tqdm(xrange(n_runs)):
bins1, kde1 = vonmises_kde(
data=np.r_[
np.random.vonmises(-1, 5, 1000),
np.random.vonmises(2, 10, 500),
np.random.vonmises(3, 20, 100)
],
kappa=kappa,
n_bins=n_bins
)
for _ in tqdm(xrange(n_runs)):
bins2, kde2 = vonmises_fft_kde(
data=np.r_[
np.random.vonmises(-1, 5, 1000),
np.random.vonmises(2, 10, 500),
np.random.vonmises(3, 20, 100)
],
kappa=kappa,
n_bins=n_bins
)
plt.figure()
plt.plot(bins1, kde1, label="kingjr's solution")
plt.plot(bins2, kde2, label="dolf's FFT solution")
plt.legend()
plt.show()
结果:
100%|██████████| 1000/1000 [00:07<00:00, 135.29it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1945.14it/s]
(1945 / 135 = 快 14 倍)
要获得更快的速度,请使用 2 的整数次方作为 bin 的数量。它还具有更好的扩展性(即,它可以快速处理许多 bin 和大量数据)。在我的电脑上,它比 n_bins=1024.
的原始答案快 118 倍
为什么有效?
两个信号(没有零填充)的 FFT 的乘积等于 circular (or cyclic) convolution of the two signals. A kernel density estimation 基本上是一个与具有脉冲的信号卷积的内核在每个数据点的位置。
为什么不准确?
由于我使用直方图space数据均匀,所以我丢失了每个样本的确切位置,只使用它所属的bin的中心。每个 bin 中的样本数用作该点的脉冲幅度。 例如: 暂时忽略规范化,如果您有一个从 0 到 1 的容器,并且该容器中有两个样本,分别为 0.1 和 0.2,exact
KDE将是 the kernel centred around 0.1
+ the kernel centred around 0.2
。近似值将是 2x`以 0.5 为中心的内核,这是 bin 的中心。
我正在使用 scipys gaussian_kde 来获取一些双峰数据的概率密度。但是,由于我的数据是 angular(以度为单位的方向),当值出现在极限附近时,我会遇到问题。下面的代码给出了两个 kde 示例,当域为 0-360 时,它估计不足,因为它无法处理数据的循环性质。 pdf 需要在单位圆上定义,但我在 scipy.stats 中找不到适合此类数据的任何内容(那里有 von mises 分布,但仅适用于单峰数据)。有人 运行 以前参加过这个吗?是否有任何东西(最好基于 python)可用于估计单位圆上的双峰 pdf?
import numpy as np
import scipy as sp
from pylab import plot,figure,subplot,show,hist
from scipy import stats
baz = np.array([-92.29061004, -85.42607874, -85.42607874, -70.01689348,
-63.43494882, -63.43494882, -70.01689348, -70.01689348,
-59.93141718, -63.43494882, -59.93141718, -63.43494882,
-63.43494882, -63.43494882, -57.52880771, -53.61564818,
-57.52880771, -63.43494882, -63.43494882, -92.29061004,
-16.92751306, -99.09027692, -99.09027692, -16.92751306,
-99.09027692, -16.92751306, -9.86580694, -8.74616226,
-9.86580694, -8.74616226, -8.74616226, -2.20259816,
-2.20259816, -2.20259816, -9.86580694, -2.20259816,
-2.48955292, -2.48955292, -2.48955292, -2.48955292,
4.96974073, 4.96974073, 4.96974073, 4.96974073,
-2.48955292, -2.48955292, -2.48955292, -2.48955292,
-2.48955292, -9.86580694, -9.86580694, -9.86580694,
-16.92751306, -19.29004622, -19.29004622, -26.56505118,
-19.29004622, -19.29004622, -19.29004622, -19.29004622])
xx = np.linspace(-180, 180, 181)
scipy_kde = stats.gaussian_kde(baz)
print scipy_kde.integrate_box_1d(-180,180)
figure()
plot(xx, scipy_kde(xx), c='green')
baz[baz<0] += 360
xx = np.linspace(0, 360, 181)
scipy_kde = stats.gaussian_kde(baz)
print scipy_kde.integrate_box_1d(-180,180)
plot(xx, scipy_kde(xx), c='red')
所以我有一个我认为合理的解决方案。基本上,我使用 Von Mises 分布作为核密度估计的基函数。代码如下,以防对其他人有用。
def vonmises_KDE(data, kappa, plot=None):
"""
Create a kernal densisity estimate of circular data using the von mises
distribution as the basis function.
"""
# imports
from scipy.stats import vonmises
from scipy.interpolate import interp1d
# convert to radians
data = np.radians(data)
# set limits for von mises
vonmises.a = -np.pi
vonmises.b = np.pi
x_data = np.linspace(-np.pi, np.pi, 100)
kernels = []
for d in data:
# Make the basis function as a von mises PDF
kernel = vonmises(kappa, loc=d)
kernel = kernel.pdf(x_data)
kernels.append(kernel)
if plot:
# For plotting
kernel /= kernel.max()
kernel *= .2
plt.plot(x_data, kernel, "grey", alpha=.5)
vonmises_kde = np.sum(kernels, axis=0)
vonmises_kde = vonmises_kde / np.trapz(vonmises_kde, x=x_data)
f = interp1d( x_data, vonmises_kde )
if plot:
plt.plot(x_data, vonmises_kde, c='red')
return x_data, vonmises_kde, f
baz = np.array([-92.29061004, -85.42607874, -85.42607874, -70.01689348,
-63.43494882, -63.43494882, -70.01689348, -70.01689348,
-59.93141718, -63.43494882, -59.93141718, -63.43494882,
-63.43494882, -63.43494882, -57.52880771, -53.61564818,
-57.52880771, -63.43494882, -63.43494882, -92.29061004,
-16.92751306, -99.09027692, -99.09027692, -16.92751306,
-99.09027692, -16.92751306, -9.86580694, -8.74616226,
-9.86580694, -8.74616226, -8.74616226, -2.20259816,
-2.20259816, -2.20259816, -9.86580694, -2.20259816,
-2.48955292, -2.48955292, -2.48955292, -2.48955292,
4.96974073, 4.96974073, 4.96974073, 4.96974073,
-2.48955292, -2.48955292, -2.48955292, -2.48955292,
-2.48955292, -9.86580694, -9.86580694, -9.86580694,
-16.92751306, -19.29004622, -19.29004622, -26.56505118,
-19.29004622, -19.29004622, -19.29004622, -19.29004622])
kappa = 12
x_data, vonmises_kde, f = vonmises_KDE(baz, kappa, plot=1)
Dave 的回答不正确,因为 scipy
的 vonmises
没有环绕 [-pi, pi]
.
您可以使用下面的代码,它基于相同的原理。它基于 numpy.
中描述的等式def vonmises_kde(data, kappa, n_bins=100):
from scipy.special import i0
bins = np.linspace(-np.pi, np.pi, n_bins)
x = np.linspace(-np.pi, np.pi, n_bins)
# integrate vonmises kernels
kde = np.exp(kappa*np.cos(x[:, None]-data[None, :])).sum(1)/(2*np.pi*i0(kappa))
kde /= np.trapz(kde, x=bins)
return bins, kde
这是一个例子
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import vonmises
# generate complex circular distribution
data = np.r_[vonmises(-1, 5, 1000), vonmises(2, 10, 500), vonmises(3, 20, 100)]
# plot data histogram
fig, axes = plt.subplots(2, 1)
axes[0].hist(data, 100)
# plot kernel density estimates
x, kde = vonmises_kde(data, 20)
axes[1].plot(x, kde)
Histogram and kernel density plots
这是对@kingjr 更准确答案的快速近似:
def vonmises_pdf(x, mu, kappa):
return np.exp(kappa * np.cos(x - mu)) / (2. * np.pi * scipy.special.i0(kappa))
def vonmises_fft_kde(data, kappa, n_bins):
bins = np.linspace(-np.pi, np.pi, n_bins + 1, endpoint=True)
hist_n, bin_edges = np.histogram(data, bins=bins)
bin_centers = np.mean([bin_edges[1:], bin_edges[:-1]], axis=0)
kernel = vonmises_pdf(
x=bin_centers,
mu=0,
kappa=kappa
)
kde = np.fft.fftshift(np.fft.irfft(np.fft.rfft(kernel) * np.fft.rfft(hist_n)))
kde /= np.trapz(kde, x=bin_centers)
return bin_centers, kde
测试(使用tqdm做进度条和计时,matplotlib验证结果):
import numpy as np
from tqdm import tqdm
import scipy.stats
import matplotlib.pyplot as plt
n_runs = 1000
n_bins = 100
kappa = 10
for _ in tqdm(xrange(n_runs)):
bins1, kde1 = vonmises_kde(
data=np.r_[
np.random.vonmises(-1, 5, 1000),
np.random.vonmises(2, 10, 500),
np.random.vonmises(3, 20, 100)
],
kappa=kappa,
n_bins=n_bins
)
for _ in tqdm(xrange(n_runs)):
bins2, kde2 = vonmises_fft_kde(
data=np.r_[
np.random.vonmises(-1, 5, 1000),
np.random.vonmises(2, 10, 500),
np.random.vonmises(3, 20, 100)
],
kappa=kappa,
n_bins=n_bins
)
plt.figure()
plt.plot(bins1, kde1, label="kingjr's solution")
plt.plot(bins2, kde2, label="dolf's FFT solution")
plt.legend()
plt.show()
结果:
100%|██████████| 1000/1000 [00:07<00:00, 135.29it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1945.14it/s]
(1945 / 135 = 快 14 倍)
要获得更快的速度,请使用 2 的整数次方作为 bin 的数量。它还具有更好的扩展性(即,它可以快速处理许多 bin 和大量数据)。在我的电脑上,它比 n_bins=1024.
的原始答案快 118 倍为什么有效?
两个信号(没有零填充)的 FFT 的乘积等于 circular (or cyclic) convolution of the two signals. A kernel density estimation 基本上是一个与具有脉冲的信号卷积的内核在每个数据点的位置。
为什么不准确?
由于我使用直方图space数据均匀,所以我丢失了每个样本的确切位置,只使用它所属的bin的中心。每个 bin 中的样本数用作该点的脉冲幅度。 例如: 暂时忽略规范化,如果您有一个从 0 到 1 的容器,并且该容器中有两个样本,分别为 0.1 和 0.2,exact
KDE将是 the kernel centred around 0.1
+ the kernel centred around 0.2
。近似值将是 2x`以 0.5 为中心的内核,这是 bin 的中心。