参数 PDF 估计:直方图与似然
parametric PDF estimation: histogram vs likelihood
给定分布样本并假设它是高斯分布(具有未知 mu、sigma 的正态分布),任务是找到最能描述它的参数均值和标准差。
数学 有什么不同,为什么会产生不同的结果?
如果不同,为什么以及何时使用哪种方法?我认为两者都是参数化的,可以用于相同的情况。
- 对参数 mu、sigma 进行正态分布的最小二乘曲线拟合
- 最大化样本的概率 = PDF(mu,sigma) 之和
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm
# define true mean and std for Gaussian normal distribution
mean = 5.0
std = 2.0
# generate random variets (samples) and get histogram
samples = np.random.normal(loc=mean, scale=std, size=100)
hist, bin_edges = np.histogram(samples, density=True)
midpoints = (bin_edges[:-1] + bin_edges[1:])/2.
# fit the Gaussian do find mean and std
def func(x, mean, std):
return norm.pdf(x, loc=mean, scale=std)
from scipy.optimize import curve_fit
popt, pcov = curve_fit(func, midpoints, hist)
fit_mean, fit_std = popt
print("fitted mean,std:",fit_mean,fit_std)
print("sample mean,std:",np.mean(samples),np.std(samples))
# negative log likelihood approach
def normaldistribution_negloglikelihood(params):
mu, sigma = params
return -np.log(np.sum(norm.pdf(samples, loc=mu, scale=sigma)))
#return -np.sum(norm.pdf(samples, loc=mu, scale=sigma))
from scipy.optimize import minimize
result = minimize(normaldistribution_negloglikelihood, x0=[0,1] , bounds=((None,None), (1e-5,None)) )#, method='Nelder-Mead')
if result.success:
fitted_params = result.x
#print("fitted_params", fitted_params)
else:
raise ValueError(result.message)
nll_mean, nll_std = fitted_params
print("neg LL mean,std:",nll_mean, nll_std)
# plot
plt.plot(midpoints, hist, label="sample histogram")
x = np.linspace(-5,15,500)
plt.plot(x, norm.pdf(x, loc=mean, scale=std), label="true PDF")
plt.plot(x, norm.pdf(x, loc=fit_mean, scale=fit_std), label="fitted PDF")
plt.plot(x, norm.pdf(x, loc=nll_mean, scale=nll_std), label="neg LL estimator")
plt.legend()
plt.show()
你的可能性是错误的,你应该对pdf的日志求和,而不是你所做的,所以:
def normaldistribution_negloglikelihood(params):
mu, sigma = params
return -np.sum(norm.logpdf(samples, loc=mu, scale=sigma))
result = minimize(normaldistribution_negloglikelihood, x0=[0,1] ,
bounds=((None,None), (1e-5,None)) )#, method='Nelder-Mead')
nll_mean, nll_std = result.x
x = np.linspace(-5,15,500)
plt.plot(x, norm.pdf(x, loc=mean, scale=std), label="true PDF")
plt.plot(x, norm.pdf(x, loc=nll_mean, scale=nll_std), label="neg LL estimator")
你应该最大化可能性。直方图方法取决于您的 bin 大小并且不会使用您的所有数据。
给定分布样本并假设它是高斯分布(具有未知 mu、sigma 的正态分布),任务是找到最能描述它的参数均值和标准差。
数学 有什么不同,为什么会产生不同的结果? 如果不同,为什么以及何时使用哪种方法?我认为两者都是参数化的,可以用于相同的情况。
- 对参数 mu、sigma 进行正态分布的最小二乘曲线拟合
- 最大化样本的概率 = PDF(mu,sigma) 之和
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm
# define true mean and std for Gaussian normal distribution
mean = 5.0
std = 2.0
# generate random variets (samples) and get histogram
samples = np.random.normal(loc=mean, scale=std, size=100)
hist, bin_edges = np.histogram(samples, density=True)
midpoints = (bin_edges[:-1] + bin_edges[1:])/2.
# fit the Gaussian do find mean and std
def func(x, mean, std):
return norm.pdf(x, loc=mean, scale=std)
from scipy.optimize import curve_fit
popt, pcov = curve_fit(func, midpoints, hist)
fit_mean, fit_std = popt
print("fitted mean,std:",fit_mean,fit_std)
print("sample mean,std:",np.mean(samples),np.std(samples))
# negative log likelihood approach
def normaldistribution_negloglikelihood(params):
mu, sigma = params
return -np.log(np.sum(norm.pdf(samples, loc=mu, scale=sigma)))
#return -np.sum(norm.pdf(samples, loc=mu, scale=sigma))
from scipy.optimize import minimize
result = minimize(normaldistribution_negloglikelihood, x0=[0,1] , bounds=((None,None), (1e-5,None)) )#, method='Nelder-Mead')
if result.success:
fitted_params = result.x
#print("fitted_params", fitted_params)
else:
raise ValueError(result.message)
nll_mean, nll_std = fitted_params
print("neg LL mean,std:",nll_mean, nll_std)
# plot
plt.plot(midpoints, hist, label="sample histogram")
x = np.linspace(-5,15,500)
plt.plot(x, norm.pdf(x, loc=mean, scale=std), label="true PDF")
plt.plot(x, norm.pdf(x, loc=fit_mean, scale=fit_std), label="fitted PDF")
plt.plot(x, norm.pdf(x, loc=nll_mean, scale=nll_std), label="neg LL estimator")
plt.legend()
plt.show()
你的可能性是错误的,你应该对pdf的日志求和,而不是你所做的,所以:
def normaldistribution_negloglikelihood(params):
mu, sigma = params
return -np.sum(norm.logpdf(samples, loc=mu, scale=sigma))
result = minimize(normaldistribution_negloglikelihood, x0=[0,1] ,
bounds=((None,None), (1e-5,None)) )#, method='Nelder-Mead')
nll_mean, nll_std = result.x
x = np.linspace(-5,15,500)
plt.plot(x, norm.pdf(x, loc=mean, scale=std), label="true PDF")
plt.plot(x, norm.pdf(x, loc=nll_mean, scale=nll_std), label="neg LL estimator")
你应该最大化可能性。直方图方法取决于您的 bin 大小并且不会使用您的所有数据。