如何归一化 matplotlib 直方图中的概率分布值?
How to normalize probability distribution values in the matplotlib histogram plot?
我试图在同一个图上同时显示累积分布和 non-cumulative 分布。
fig, ax = plt.subplots(figsize=(10, 5))

# Keyword arguments common to both histograms.
shared_style = dict(density=True, stacked=True, histtype='step')

# Cumulative histogram; keep the returned bin edges so that the overlay
# below is drawn on exactly the same bins.
n, bins, patches = ax.hist(
    x, n_bins, cumulative=True, label='Empirical cumulative', **shared_style
)

# Overlay a non-cumulative histogram.
ax.hist(
    x, bins=bins, cumulative=False, label='Empirical non-cumulative', **shared_style
)
plt.show()
"Empirical cumulative" 曲线看起来很好,值不超过 1。但是,"Empirical non-cumulative" 曲线的 Y 值高于 1。我如何将它们归一化?
更新:
示例数据:
# Number of histogram bins used for the example.
n_bins = 20

# Sample observations (16 values) used to reproduce the plot.
x = [
    0.0051055006412772065,
    0.09770815865459548,
    0.20666651037049322,
    0.5433266733820051,
    0.5717169069724539,
    0.5421114013759187,
    0.4994941193115986,
    0.4391978276380223,
    0.3673067648294034,
    0.3150259778098451,
    0.4072059689437963,
    0.5781929593356039,
    0.6494934859266276,
    0.620882081680377,
    0.5845829440637116,
    0.515705471234385,
]
请看橙色曲线。
使用 probability(概率)而不是 probability density(概率密度)创建直方图的最简单方法是使用 seaborn 的 sns.histplot(..., stat='probability')。使用 density=True 时,归一化的是柱形的面积(高度 × bin 宽度之和为 1),因此当 bin 宽度小于 1 时,高度可以大于 1。
要用标准的 matplotlib 模拟这一点,您可以手动计算所有值。例如:
import matplotlib.pyplot as plt
import numpy as np
# Stacked cumulative and non-cumulative histograms of several datasets,
# normalised to probabilities (not probability densities).
n_bins = 20
x = np.random.normal(0, 1, (1000, 3))

# One common set of bin edges so every column is binned identically.
bin_edges = np.linspace(x.min(), x.max(), n_bins + 1)
# Row i holds the per-bin counts of column i of x.
bin_values = np.array([np.histogram(x[:, i], bins=bin_edges)[0] for i in range(x.shape[1])])
# cumsum(axis=1) accumulates over the bins, cumsum(axis=0) stacks the
# datasets on top of each other.
cum_values = bin_values.cumsum(axis=1).cumsum(axis=0)
# The maximum is the total sample count, so the topmost curve ends at 1.
cum_values = cum_values / cum_values.max()

fig, ax = plt.subplots(figsize=(10, 5))

# The last edge is repeated so each outline can be closed with a vertical
# segment at the right border.
step_x = np.append(bin_edges, bin_edges[-1])

# Cumulative curves (solid).
prev = 0
for c in cum_values:
    ax.step(step_x, np.concatenate([[0], c, [prev]]))
    prev = c[-1]

# Restart the colour cycle so each dashed curve reuses the colour of the
# matching cumulative curve.
ax.set_prop_cycle(None)

# Non-cumulative curves (dashed).
prev = 0
for c in cum_values:
    # prepend=0 keeps the first bin: a plain np.diff() would drop it and
    # shift every remaining value one bin to the left.
    c = np.diff(c, prepend=0)
    ax.step(step_x, np.concatenate([[0], c, [prev]]), ls='--')
    prev = c[-1]
plt.show()
如果您只有一个分布(即只有一组数据),stacked=True 不会产生任何区别。代码会更简单:
import matplotlib.pyplot as plt
import numpy as np
# Cumulative and non-cumulative histogram of a single dataset, normalised
# to probabilities (not probability densities).
n_bins = 20
x = np.random.normal(0, 1, 1000)

bin_edges = np.linspace(x.min(), x.max(), n_bins + 1)
bin_values = np.histogram(x, bins=bin_edges)[0]
cum_values = bin_values.cumsum()
# The maximum is the total sample count, so the cumulative curve ends at 1.
cum_values = cum_values / cum_values.max()

fig, ax = plt.subplots(figsize=(10, 5))

# The last edge is repeated so each outline can be closed with a vertical
# segment down to 0 at the right border.
step_x = np.append(bin_edges, bin_edges[-1])

# Cumulative curve (solid).
ax.step(step_x, np.concatenate([[0], cum_values, [0]]))

# Restart the colour cycle so the dashed curve reuses the same colour.
ax.set_prop_cycle(None)

# Per-bin probabilities. prepend=0 keeps the first bin: a plain np.diff()
# would drop it and shift every remaining value one bin to the left.
c = np.diff(cum_values, prepend=0)
ax.step(step_x, np.concatenate([[0], c, [0]]), ls='--')
plt.show()
我试图在同一个图上同时显示累积分布和 non-cumulative 分布。
fig, ax = plt.subplots(figsize=(10, 5))

# Keyword arguments common to both histograms.
shared_style = dict(density=True, stacked=True, histtype='step')

# Cumulative histogram; keep the returned bin edges so that the overlay
# below is drawn on exactly the same bins.
n, bins, patches = ax.hist(
    x, n_bins, cumulative=True, label='Empirical cumulative', **shared_style
)

# Overlay a non-cumulative histogram.
ax.hist(
    x, bins=bins, cumulative=False, label='Empirical non-cumulative', **shared_style
)
plt.show()
"Empirical cumulative" 曲线看起来很好,值不超过 1。但是,"Empirical non-cumulative" 曲线的 Y 值高于 1。我如何将它们归一化?
更新:
示例数据:
# Number of histogram bins used for the example.
n_bins = 20

# Sample observations (16 values) used to reproduce the plot.
x = [
    0.0051055006412772065,
    0.09770815865459548,
    0.20666651037049322,
    0.5433266733820051,
    0.5717169069724539,
    0.5421114013759187,
    0.4994941193115986,
    0.4391978276380223,
    0.3673067648294034,
    0.3150259778098451,
    0.4072059689437963,
    0.5781929593356039,
    0.6494934859266276,
    0.620882081680377,
    0.5845829440637116,
    0.515705471234385,
]
请看橙色曲线。
使用 probability(概率)而不是 probability density(概率密度)创建直方图的最简单方法是使用 seaborn 的 sns.histplot(..., stat='probability')。使用 density=True 时,归一化的是柱形的面积(高度 × bin 宽度之和为 1),因此当 bin 宽度小于 1 时,高度可以大于 1。
要用标准的 matplotlib 模拟这一点,您可以手动计算所有值。例如:
import matplotlib.pyplot as plt
import numpy as np
# Stacked cumulative and non-cumulative histograms of several datasets,
# normalised to probabilities (not probability densities).
n_bins = 20
x = np.random.normal(0, 1, (1000, 3))

# One common set of bin edges so every column is binned identically.
bin_edges = np.linspace(x.min(), x.max(), n_bins + 1)
# Row i holds the per-bin counts of column i of x.
bin_values = np.array([np.histogram(x[:, i], bins=bin_edges)[0] for i in range(x.shape[1])])
# cumsum(axis=1) accumulates over the bins, cumsum(axis=0) stacks the
# datasets on top of each other.
cum_values = bin_values.cumsum(axis=1).cumsum(axis=0)
# The maximum is the total sample count, so the topmost curve ends at 1.
cum_values = cum_values / cum_values.max()

fig, ax = plt.subplots(figsize=(10, 5))

# The last edge is repeated so each outline can be closed with a vertical
# segment at the right border.
step_x = np.append(bin_edges, bin_edges[-1])

# Cumulative curves (solid).
prev = 0
for c in cum_values:
    ax.step(step_x, np.concatenate([[0], c, [prev]]))
    prev = c[-1]

# Restart the colour cycle so each dashed curve reuses the colour of the
# matching cumulative curve.
ax.set_prop_cycle(None)

# Non-cumulative curves (dashed).
prev = 0
for c in cum_values:
    # prepend=0 keeps the first bin: a plain np.diff() would drop it and
    # shift every remaining value one bin to the left.
    c = np.diff(c, prepend=0)
    ax.step(step_x, np.concatenate([[0], c, [prev]]), ls='--')
    prev = c[-1]
plt.show()
如果您只有一个分布(即只有一组数据),stacked=True 不会产生任何区别。代码会更简单:
import matplotlib.pyplot as plt
import numpy as np
# Cumulative and non-cumulative histogram of a single dataset, normalised
# to probabilities (not probability densities).
n_bins = 20
x = np.random.normal(0, 1, 1000)

bin_edges = np.linspace(x.min(), x.max(), n_bins + 1)
bin_values = np.histogram(x, bins=bin_edges)[0]
cum_values = bin_values.cumsum()
# The maximum is the total sample count, so the cumulative curve ends at 1.
cum_values = cum_values / cum_values.max()

fig, ax = plt.subplots(figsize=(10, 5))

# The last edge is repeated so each outline can be closed with a vertical
# segment down to 0 at the right border.
step_x = np.append(bin_edges, bin_edges[-1])

# Cumulative curve (solid).
ax.step(step_x, np.concatenate([[0], cum_values, [0]]))

# Restart the colour cycle so the dashed curve reuses the same colour.
ax.set_prop_cycle(None)

# Per-bin probabilities. prepend=0 keeps the first bin: a plain np.diff()
# would drop it and shift every remaining value one bin to the left.
c = np.diff(cum_values, prepend=0)
ax.step(step_x, np.concatenate([[0], c, [0]]), ls='--')
plt.show()