参数 PDF 估计:直方图与似然

parametric PDF estimation: histogram vs likelihood

给定分布样本并假设它是高斯分布(具有未知 mu、sigma 的正态分布),任务是找到最能描述它的参数均值和标准差。

数学 有什么不同,为什么会产生不同的结果? 如果不同,为什么以及何时使用哪种方法?我认为两者都是参数化的,可以用于相同的情况。

  1. 对参数 mu、sigma 进行正态分布的最小二乘曲线拟合
  2. 最大化样本的概率 = PDF(mu,sigma) 之和
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm

# define true mean and std for Gaussian normal distribution

mean = 5.0
std = 2.0

# generate random variets (samples) and get histogram

samples = np.random.normal(loc=mean, scale=std, size=100)
hist, bin_edges = np.histogram(samples, density=True)
midpoints = (bin_edges[:-1] + bin_edges[1:])/2.

# fit the Gaussian do find mean and std

def func(x, mean, std):
    return norm.pdf(x, loc=mean, scale=std)

from scipy.optimize import curve_fit

popt, pcov = curve_fit(func, midpoints, hist)
fit_mean, fit_std = popt

print("fitted mean,std:",fit_mean,fit_std)
print("sample mean,std:",np.mean(samples),np.std(samples))

# negative log likelihood approach

def normaldistribution_negloglikelihood(params):
    mu, sigma = params
    return -np.log(np.sum(norm.pdf(samples, loc=mu, scale=sigma)))
    #return -np.sum(norm.pdf(samples, loc=mu, scale=sigma))
    
from scipy.optimize import minimize

result = minimize(normaldistribution_negloglikelihood, x0=[0,1] , bounds=((None,None), (1e-5,None)) )#, method='Nelder-Mead')

if result.success:
    fitted_params = result.x
    #print("fitted_params", fitted_params)
else:
    raise ValueError(result.message)
    

nll_mean, nll_std = fitted_params
print("neg LL mean,std:",nll_mean, nll_std)
    

# plot

plt.plot(midpoints, hist, label="sample histogram")

x = np.linspace(-5,15,500)
plt.plot(x, norm.pdf(x, loc=mean, scale=std), label="true PDF")
plt.plot(x, norm.pdf(x, loc=fit_mean, scale=fit_std), label="fitted PDF")
plt.plot(x, norm.pdf(x, loc=nll_mean, scale=nll_std), label="neg LL estimator")


plt.legend()
plt.show()

你的可能性是错误的,你应该对pdf的日志求和,而不是你所做的,所以:

def normaldistribution_negloglikelihood(params):
    mu, sigma = params
    return -np.sum(norm.logpdf(samples, loc=mu, scale=sigma))
    
result = minimize(normaldistribution_negloglikelihood, x0=[0,1] ,
                  bounds=((None,None), (1e-5,None)) )#, method='Nelder-Mead')

nll_mean, nll_std = result.x
x = np.linspace(-5,15,500)

plt.plot(x, norm.pdf(x, loc=mean, scale=std), label="true PDF")
plt.plot(x, norm.pdf(x, loc=nll_mean, scale=nll_std), label="neg LL estimator")

你应该最大化可能性。直方图方法取决于您的 bin 大小并且不会使用您的所有数据。