正态分布曲线与使用 matplotlib 的子图中的直方图不太吻合

normal distribution curve doesn't fit well over histogram in subplots using matplotlib

我正在使用“plt.subplots(2, 2, sharex=True, sharey=True)”绘制 2*2 子图。每个子图都有两个 Y 轴,并包含直方图上的正态分布曲线。请注意,我在这里特别设置了“sharex=True,sharey=True”,以使所有子图共享相同的 X 轴和 Y 轴。

在运行我的代码之后,一切都很好,除了第二个、第三个和第四个子图,它们的正态分布曲线不太符合直方图(请看这里的图)

我用谷歌搜索但没能解决这个问题。但是,如果我在我的代码中设置“sharex=True, sharey=False”,那么该图看起来是正确的,但所有子图都使用它们自己的 Y 轴,这不是我想要的。请看这里的图

希望 Stack Overflow 的专家能帮忙解决这个问题。非常感谢!

下面是我的代码:

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

def align_yaxis(ax1, v1, ax2, v2):
    """Shift the y-limits of ax2 so that v2 on ax2 sits level with v1 on ax1.

    Both limits of ax2 are moved by the same amount, so the span of its
    y-range is preserved. ax1 is only read, never modified.

    Args:
        ax1: Reference axes.
        v1: Data value on ax1 to align against.
        ax2: Axes whose y-limits are shifted.
        v2: Data value on ax2 that should end up level with v1.
    """
    # Vertical position of each reference value after applying transData.
    anchor_y = ax1.transData.transform((0, v1))[1]
    moving_y = ax2.transData.transform((0, v2))[1]

    # Convert the gap between the two positions back into ax2 data units.
    to_data = ax2.transData.inverted()
    origin = to_data.transform((0, 0))
    displaced = to_data.transform((0, anchor_y - moving_y))
    shift = (origin - displaced)[1]

    low, high = ax2.get_ylim()
    ax2.set_ylim(low + shift, high + shift)
    
def drawSingle(myax, mydf, title, offset):
    """Draw a histogram of mydf["gap"] on myax and a normal pdf on a twin y-axis.

    The left axis shows raw bin counts; the right (twin) axis shows the
    probability density of a normal distribution fitted to the samples.

    NOTE: the twin axis is created here per subplot and keeps its own y-scale,
    so curve height and bar height are only visually matched per panel.

    Args:
        myax: Axes that receives the histogram.
        mydf: Table-like object with a "gap" column of numeric samples.
        title: Accepted for call compatibility; not used.
        offset: Accepted for call compatibility; not used.
    """
    num_bins = 200
    xs = mydf["gap"]

    # Fit the curve to the samples themselves. The earlier code took the mean
    # of a fixed linspace(-1, 1) grid, which is always ~0 whatever the data is.
    mu = np.mean(xs)
    sigma = np.std(xs)
    _, bins, _ = myax.hist(xs, num_bins, alpha=0.8, facecolor='blue', density=False)

    myax.set_ylabel('frequency', color="black", fontsize=12, weight="bold")
    myax.set_xlabel('X', fontsize=12, weight="bold", horizontalalignment='center')

    ax_twin = myax.twinx()
    y_normcurve = norm.pdf(bins, mu, sigma)
    ax_twin.plot(bins, y_normcurve, 'r--')

    # Make y=0 of the density axis coincide with y=0 of the count axis.
    align_yaxis(myax, 0, ax_twin, 0)

    # Mark the mean up to the peak of the pdf, explicitly on the density axis
    # instead of relying on pyplot's notion of the current axes.
    peakpoint = norm.pdf(mu, loc=mu, scale=sigma)
    ax_twin.vlines(mu, 0, peakpoint, 'y', '--', label='example')

    ax_twin.set_ylabel("probability density", color="black", fontsize=12, weight="bold")
    
         
def drawSubplots(mydf1, mydf2, mydf3, mydf4, pos1, pos2, pos3, pos4, title, filename):
    """Draw four histogram/normal-curve panels in a 2x2 grid with shared axes.

    Args:
        mydf1, mydf2, mydf3, mydf4: Data for the four panels, in row-major
            order; each is handed to drawSingle.
        pos1, pos2, pos3, pos4: Per-panel offsets forwarded to drawSingle.
        title: Text drawn via plt.text at data position (-1, -1).
        filename: Accepted for call compatibility; nothing is written to disk.
    """
    plt.rcParams['figure.figsize'] = (18, 15)

    rows, cols = 2, 2
    fig, ax = plt.subplots(rows, cols, sharex=True, sharey=True)

    frames = (mydf1, mydf2, mydf3, mydf4)
    offsets = (pos1, pos2, pos3, pos4)
    # ax.ravel() walks the grid row by row, matching the Subplot1..4 numbering.
    for index, (panel, frame, offset) in enumerate(zip(ax.ravel(), frames, offsets), start=1):
        drawSingle(panel, frame, f"Subplot{index}", offset)

    plt.text(-1, -1, title, horizontalalignment='center', fontsize=18)

    plt.show()

    
# Render the 2x2 figure. df1..df4 are not created in this snippet; each one is
# read through its "gap" column by drawSingle.
drawSubplots(
    df1, df2, df3, df4,
    3.2, 3.1, 2.7, 2.85,
    "test9", "test9",
)

这是一个尝试:

  • 将左侧 y 轴设置为“频率”(在当前的 bin 宽度下,这个刻度本身参考价值不大),并让 4 个子图共享该轴
  • 将右侧的 y 轴设置为“概率密度”;注意所有高斯曲线的顶点都落在 y=0.02 附近(孪生 y 轴只能在最后创建并设置,因为共享的 y 轴范围可能会被后面绘制的子图更新)
  • 使直方图和正态曲线对齐
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import norm

def drawSingle(myax, mydf, title):
    """Plot a count histogram of mydf["gap"] and a matching normal curve on myax.

    The normal pdf is multiplied by ``len(xs) * bin_width`` so that it is in
    the same units as the histogram (counts per bin) and can be drawn on the
    same y-axis.

    Args:
        myax: Axes to draw on.
        mydf: Table-like object with a "gap" column of numeric samples.
        title: Accepted for call compatibility; not used.

    Returns:
        The factor that converts probability density into counts per bin.
    """
    num_bins = 200
    xs = mydf["gap"]
    x = np.linspace(-1, 1, 1000)  # grid on which the fitted curve is drawn

    # Fit to the samples. Taking the mean of the plotting grid `x` would always
    # give ~0 and misplace the curve for data that is not centred on zero.
    mu = np.mean(xs)
    sigma = np.std(xs)
    _, bins, _ = myax.hist(xs, num_bins, alpha=0.8, facecolor='blue', density=False)

    myax.set_ylabel('frequency', color="black", fontsize=12, weight="bold")
    myax.set_xlabel('X', fontsize=12, weight="bold", horizontalalignment='center')

    # density * (number of samples) * (bin width) = expected count per bin
    normalization_factor = len(xs) * (bins[1] - bins[0])
    y_normcurve = norm.pdf(x, mu, sigma) * normalization_factor
    myax.plot(x, y_normcurve, 'r--')
    # A single, unambiguous colour/linestyle pair for the mean marker.
    myax.vlines(mu, 0, y_normcurve.max(), colors='lime', linestyles='--', label='example')
    return normalization_factor

def drawSubplots(mydf1, mydf2, mydf3, mydf4, title):
    """Draw four histogram/normal-curve panels on a shared 2x2 grid.

    After all panels are drawn, each one gets a twin y-axis whose upper limit
    is the panel's count-axis upper limit divided by that panel's
    normalization factor, i.e. the count scale expressed as a density.

    Args:
        mydf1, mydf2, mydf3, mydf4: Data for the panels in row-major order;
            each is passed to drawSingle.
        title: Figure-level title shown via plt.suptitle.
    """
    plt.rcParams['figure.figsize'] = (18, 15)

    fig, ax = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    panels = ax.ravel()
    frames = (mydf1, mydf2, mydf3, mydf4)
    labels = ("Subplot1", "Subplot2", "Subplot3", "Subplot4")

    # First pass: draw every panel on the shared primary axes.
    scale_factors = []
    for panel, frame, label in zip(panels, frames, labels):
        scale_factors.append(drawSingle(panel, frame, label))

    # Second pass: add the twin axes only once all panels have been drawn.
    for panel, factor in zip(panels, scale_factors):
        density_axis = panel.twinx()
        upper = panel.get_ylim()[1]
        density_axis.set_ylim(0, upper / factor)

    plt.suptitle(title, fontsize=18)
    plt.tight_layout()
    plt.show()

# Four synthetic samples drawn from N(0, 0.2) with decreasing sizes, one per panel.
df1, df2, df3, df4 = (
    pd.DataFrame({"gap": np.random.normal(0, 0.2, sample_size)})
    for sample_size in (6000, 4000, 1800, 1200)
)
drawSubplots(df1, df2, df3, df4, "Title")

非常感谢 JohanC,你太棒了。

根据您的代码,我只在 drawSubplots 函数中添加了几行,想把每个子图中下限和上限之间、占高斯曲线下方 95% 面积的区域涂上阴影。以下是我的尝试。但 ax_twin.fill_between 在这里似乎不能正常工作:从图中可以看出,阴影区域超出了高斯曲线(见图)。我想要的只是给下限和上限之间、高斯曲线下方的区域加上阴影。如果您不介意,能否帮我检查一下哪里出错了?非常感谢!

import matplotlib.pyplot as plt
import math

from scipy.stats import norm

def align_yaxis(ax1, v1, ax2, v2):
    """Move ax2's y-range so that value v2 on ax2 lines up with value v1 on ax1.

    The lower and upper limits of ax2 are shifted by the same amount; ax1 is
    left unchanged.

    Args:
        ax1: Reference axes.
        v1: Data value on ax1 used as the alignment target.
        ax2: Axes whose y-limits are adjusted.
        v2: Data value on ax2 to bring level with v1.
    """
    def _transformed_y(axes, value):
        # y-component of (0, value) after the axes' data transform.
        return axes.transData.transform((0, value))[1]

    gap = _transformed_y(ax1, v1) - _transformed_y(ax2, v2)

    # Map the gap back into ax2 data units and shift both limits by it.
    back_to_data = ax2.transData.inverted()
    zero_point = back_to_data.transform((0, 0))
    gap_point = back_to_data.transform((0, gap))
    offset = zero_point[1] - gap_point[1]

    bottom, top = ax2.get_ylim()
    ax2.set_ylim(bottom + offset, top + offset)
    
    

def drawSingle(myax, mydf, title):
    """Plot a count histogram of mydf["gap"] and a fitted normal curve on myax.

    The normal pdf is scaled by ``len(xs) * bin_width`` so it is expressed in
    counts per bin, the same unit as the histogram.

    Args:
        myax: Axes to draw on.
        mydf: Table-like object with a "gap" column of numeric samples.
        title: Text used as the x-axis label of this panel.

    Returns:
        Tuple ``(normalization_factor, mu, sigma)``: the density-to-count
        factor, and the mean and standard deviation of the samples.
    """
    num_bins = 200
    xs = mydf["gap"]
    x = np.linspace(-1, 1, 1000)  # grid on which the fitted curve is drawn

    mu = np.mean(xs)
    sigma = np.std(xs)

    _, bins, _ = myax.hist(xs, num_bins, alpha=0.8, facecolor='blue', density=False)

    myax.set_ylabel('Frequency', color="black", fontsize=12, weight="bold")
    myax.set_xlabel(title, fontsize=12, weight="bold", horizontalalignment='center')

    # density * (number of samples) * (bin width) = expected count per bin
    normalization_factor = len(xs) * (bins[1] - bins[0])
    y_normcurve = norm.pdf(x, mu, sigma) * normalization_factor
    myax.plot(x, y_normcurve, 'r--')

    # A single, unambiguous colour/linestyle pair for the mean marker.
    myax.vlines(mu, 0, y_normcurve.max(), colors='lime', linestyles='--', label='example')

    # Configure the axes that was passed in. plt.xlim/plt.xticks would act on
    # pyplot's current axes, which is not necessarily `myax`.
    myax.set_xlim(-0.8, 0.8)
    myax.set_xticks(np.arange(-0.8, 0.8, 0.1))

    return normalization_factor, mu, sigma
    
    
def drawSubplots(mydf1, mydf2, mydf3, mydf4, title):
    """Plot four histograms with fitted normal curves and shade each central 95% band.

    Every panel gets a twin y-axis in probability-density units. Its upper
    limit is the panel's count-axis upper limit divided by the panel's
    normalization factor, so (assuming the count axis starts at 0) a density
    drawn on the twin axis coincides with the count-scaled curve that
    drawSingle draws on the primary axis.

    Args:
        mydf1, mydf2, mydf3, mydf4: Data for the panels in row-major order;
            each is passed to drawSingle, which reads its "gap" column.
        title: Figure-level title shown via plt.suptitle.
    """
    plt.rcParams['figure.figsize'] = (18, 15)

    rows, cols = 2, 2
    fig, ax = plt.subplots(nrows=rows, ncols=cols, sharex=True, sharey=True)

    dfs = [mydf1, mydf2, mydf3, mydf4]
    subplot_titles = ["Subplot1", "Subplot2", "Subplot3", "Subplot4"]

    # Draw all histograms before creating any twin axis: the twin limits below
    # are derived from the primary y-limits, which later panels may still
    # change because the y-axis is shared.
    # The loop variable is deliberately not named `title`; a plain for-loop
    # with that name would overwrite the figure title passed by the caller.
    fits = [drawSingle(ax_i, df, subplot_title)
            for ax_i, df, subplot_title in zip(ax.ravel(), dfs, subplot_titles)]

    for ax_i, (norm_factor, mu, sigma) in zip(ax.ravel(), fits):
        ax_twin = ax_i.twinx()

        # Count scale -> density scale. This must use the *y* limit; using the
        # x limit here puts the twin axis on an unrelated scale.
        density_top = ax_i.get_ylim()[1] / norm_factor
        ax_twin.set_ylim(0, density_top)
        ax_twin.set_ylabel("probability density", color="black", fontsize=12, weight="bold")

        CI_95_lower = mu - (1.96 * sigma)
        CI_95_upper = mu + (1.96 * sigma)

        # The twin axis is already in density units, so the pdf is plotted
        # unscaled (multiplying by norm_factor overshoots the curve). A dense
        # grid keeps the shaded outline on the curve instead of cutting corners.
        px_shaded = np.linspace(CI_95_lower, CI_95_upper, 200)
        ax_twin.fill_between(px_shaded, norm.pdf(px_shaded, loc=mu, scale=sigma),
                             alpha=0.75, color='pink')
        area_shaded_95_CI = (norm.cdf(x=CI_95_upper, loc=mu, scale=sigma)
                             - norm.cdf(x=CI_95_lower, loc=mu, scale=sigma))

        # Label heights are tied to the axis range rather than hard-coded
        # density values, so they stay inside the plot on any scale.
        label_y = 0.5 * density_top
        ax_twin.text(-0.06, label_y, str(round(area_shaded_95_CI * 100, 1)) + "%", fontsize=20)

        # The annotation text is passed positionally: the keyword was renamed
        # from `s` to `text`, so a positional argument works with either name.
        for bound_name, bound, text_x in (("lower", CI_95_lower, -0.75),
                                          ("upper", CI_95_upper, 0.28)):
            ax_twin.annotate(
                f'{bound_name} bound= {bound:.3f}',
                xy=(bound, norm.pdf(bound, loc=mu, scale=sigma)),
                xytext=(text_x, label_y),
                weight='bold',
                color='blue',
                arrowprops=dict(arrowstyle='-|>', connectionstyle='arc3', color='green'),
                fontsize=12,
            )

        # Fit summary just above the panel, positioned in axes coordinates.
        # Raw string: "\m" and "\s" are not valid Python escapes.
        ax_twin.text(0.5, 1.01,
                     rf"$\mu={mu:.6f}, \sigma={sigma:.6f}$, confidence interval=95%",
                     transform=ax_twin.transAxes,
                     horizontalalignment='center', verticalalignment='bottom',
                     fontsize=15)

    plt.suptitle(title, fontsize=18)
    plt.tight_layout()
    plt.show()


# Four synthetic samples drawn from N(0, 0.2) with decreasing sizes, one per panel.
df1, df2, df3, df4 = (
    pd.DataFrame({"gap": np.random.normal(0, 0.2, sample_size)})
    for sample_size in (6000, 4000, 1800, 1200)
)

drawSubplots(df1, df2, df3, df4, "Title")