如何绘制标准偏差
How to plot Standard Deviations
最近我开始学习数据科学中的概率和统计。我正在尝试为以下分布 X
绘制标准差区间,例如 68-95-99.7 规则。
生成绘图的代码:
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
# Wrap printed NumPy arrays at 130 characters per output line.
np.set_printoptions(linewidth=130)
sns.set_context("paper", font_scale=1.5)

# Sample data: a small, symmetric, discrete data set (not a true normal sample).
X = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9]

# np.var / np.std use the population formulas (ddof=0) by default.
mean = np.mean(X)
var = np.var(X)
std = np.std(X)
print("Mean:", mean)
print("Variance:", var)
print("Standard Deviation:", std)
# Expected output:
#   Mean: 5.0
#   Variance: 4.0
#   Standard Deviation: 2.0

plt.figure(figsize=(10, 5))
# `fill=True` replaces the deprecated `shade=True` keyword (seaborn >= 0.11).
ax = sns.kdeplot(X, fill=True)

# Shade the area under the normal PDF within one standard deviation of the mean.
x = np.linspace(mean - std, mean + std)
y = norm.pdf(x, mean, std)
ax.fill_between(x, y, alpha=0.5)

plt.xlabel("Random variable X")
plt.ylabel("Probability Density Function")
plt.xticks(ticks=range(0, 10))
plt.grid()
plt.show()
此代码生成以下图:
问题:
- 根据均值绘制 1 个标准差的代码有什么问题?
- 我无法理解为什么 kde 图上方有一个小峰?
- 如何绘制 1-std、2-std 和 3-std?
您的代码没有错:mean 是 5,std 是 2,因此您填充(着色)的是 5 - 2 = 3 和 5 + 2 = 7 之间的区域。
kde 图中有一个小峰,因为它表示的是您用 X 给出的数据的分布,而实际上 X 并不是正态分布。您可以用一个真正的正态分布样本来验证这一点:
# Target parameters of the synthetic sample.
mean, std = 5, 2
# Draw 10000 standard-normal values, then standardize them so the sample's own
# mean and (population) standard deviation match the targets.
sample = np.random.standard_normal(10000)
z_scores = (sample - sample.mean()) / sample.std()
X = z_scores * std + mean
您可以对 i 使用 for 循环来绘制其他倍数的标准差区间。x1 是分布的左侧部分,x2 是中间部分(随后会被设置为 np.nan),x3 是分布的右侧部分。然后必须把需要排除的区域(对应于 x2)设置为 np.nan:
# Number of sample points per segment (left band, middle, right band).
N = 10
# Draw one shaded layer per multiple of the standard deviation: i = 1, 2, 3.
# Expects `mean`, `std` and the Axes `ax` to be defined by the surrounding script.
for i in [1, 2, 3]:
    # Left band: from mean - i*std up to mean - (i - 1)*std.
    x1 = np.linspace(mean - i*std, mean - (i - 1)*std, N)
    # Middle part, already covered by the previous layers.
    x2 = np.linspace(mean - (i - 1)*std, mean + (i - 1)*std, N)
    # Right band: from mean + (i - 1)*std up to mean + i*std.
    x3 = np.linspace(mean + (i - 1)*std, mean + i*std, N)
    x = np.concatenate((x1, x2, x3))
    # Blank out points strictly inside the middle part; for i == 1 nothing is
    # blanked because the interval is empty.
    x = np.where((mean - (i - 1)*std < x) & (x < mean + (i - 1)*std), np.nan, x)
    # norm.pdf propagates NaN, so the blanked points leave a gap in the fill.
    y = norm.pdf(x, mean, std)
    ax.fill_between(x, y, alpha=0.5)
完整代码
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
# Wrap printed NumPy arrays at 130 characters per output line.
np.set_printoptions(linewidth=130)
sns.set_context("paper", font_scale=1.5)

# Sample data: a small, symmetric, discrete data set (not a true normal sample).
X = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9]

# np.var / np.std use the population formulas (ddof=0) by default.
mean = np.mean(X)
var = np.var(X)
std = np.std(X)
print("Mean:", mean)
print("Variance:", var)
print("Standard Deviation:", std)
# Expected output:
#   Mean: 5.0
#   Variance: 4.0
#   Standard Deviation: 2.0

plt.figure(figsize=(10, 5))
# `fill=True` replaces the deprecated `shade=True` keyword (seaborn >= 0.11).
ax = sns.kdeplot(X, fill=True)

# Number of sample points per segment (left band, middle, right band).
N = 10
# Draw one shaded layer per multiple of the standard deviation: i = 1, 2, 3.
for i in [1, 2, 3]:
    # Left band: from mean - i*std up to mean - (i - 1)*std.
    x1 = np.linspace(mean - i*std, mean - (i - 1)*std, N)
    # Middle part, already covered by the previous layers.
    x2 = np.linspace(mean - (i - 1)*std, mean + (i - 1)*std, N)
    # Right band: from mean + (i - 1)*std up to mean + i*std.
    x3 = np.linspace(mean + (i - 1)*std, mean + i*std, N)
    x = np.concatenate((x1, x2, x3))
    # Blank out points strictly inside the middle part; for i == 1 nothing is
    # blanked because the interval is empty.
    x = np.where((mean - (i - 1)*std < x) & (x < mean + (i - 1)*std), np.nan, x)
    # norm.pdf propagates NaN, so the blanked points leave a gap in the fill.
    y = norm.pdf(x, mean, std)
    ax.fill_between(x, y, alpha=0.5)

plt.xlabel("Random variable X")
plt.ylabel("Probability Density Function")
plt.xticks(ticks=range(0, 10))
plt.grid()
plt.show()
绘图结果:
最近我开始学习数据科学中的概率和统计。我正在尝试为以下分布 X
绘制标准差区间,例如 68-95-99.7 规则。
生成绘图的代码:
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
# Wrap printed NumPy arrays at 130 characters per output line.
np.set_printoptions(linewidth=130)
sns.set_context("paper", font_scale=1.5)

# Sample data: a small, symmetric, discrete data set (not a true normal sample).
X = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9]

# np.var / np.std use the population formulas (ddof=0) by default.
mean = np.mean(X)
var = np.var(X)
std = np.std(X)
print("Mean:", mean)
print("Variance:", var)
print("Standard Deviation:", std)
# Expected output:
#   Mean: 5.0
#   Variance: 4.0
#   Standard Deviation: 2.0

plt.figure(figsize=(10, 5))
# `fill=True` replaces the deprecated `shade=True` keyword (seaborn >= 0.11).
ax = sns.kdeplot(X, fill=True)

# Shade the area under the normal PDF within one standard deviation of the mean.
x = np.linspace(mean - std, mean + std)
y = norm.pdf(x, mean, std)
ax.fill_between(x, y, alpha=0.5)

plt.xlabel("Random variable X")
plt.ylabel("Probability Density Function")
plt.xticks(ticks=range(0, 10))
plt.grid()
plt.show()
此代码生成以下图:
问题:
- 根据均值绘制 1 个标准差的代码有什么问题?
- 我无法理解为什么 kde 图上方有一个小峰?
- 如何绘制 1-std、2-std 和 3-std?
您的代码没有错:mean 是 5,std 是 2,因此您填充(着色)的是 5 - 2 = 3 和 5 + 2 = 7 之间的区域。kde 图中有一个小峰,因为它表示的是您用 X 给出的数据的分布,而实际上 X 并不是正态分布。您可以用一个真正的正态分布样本来验证这一点:
mean = 5
std = 2
X = np.random.randn(10000)
X = (X - X.mean())/X.std()*std + mean
您可以对 i 使用 for 循环来绘制其他倍数的标准差区间。x1 是分布的左侧部分,x2 是中间部分(随后会被设置为 np.nan),x3 是分布的右侧部分。然后必须把需要排除的区域(对应于 x2)设置为 np.nan:
N = 10
for i in [1, 2, 3]:
    x1 = np.linspace(mean - i*std, mean - (i - 1)*std, N)
    x2 = np.linspace(mean - (i - 1)*std, mean + (i - 1)*std, N)
    x3 = np.linspace(mean + (i - 1)*std, mean + i*std, N)
    x = np.concatenate((x1, x2, x3))
    x = np.where((mean - (i - 1)*std < x) & (x < mean + (i - 1)*std), np.nan, x)
    y = norm.pdf(x, mean, std)
    ax.fill_between(x, y, alpha=0.5)
完整代码
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
# Wrap printed NumPy arrays at 130 characters per output line.
np.set_printoptions(linewidth=130)
sns.set_context("paper", font_scale=1.5)

# Sample data: a small, symmetric, discrete data set (not a true normal sample).
X = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9]

# np.var / np.std use the population formulas (ddof=0) by default.
mean = np.mean(X)
var = np.var(X)
std = np.std(X)
print("Mean:", mean)
print("Variance:", var)
print("Standard Deviation:", std)
# Expected output:
#   Mean: 5.0
#   Variance: 4.0
#   Standard Deviation: 2.0

plt.figure(figsize=(10, 5))
# `fill=True` replaces the deprecated `shade=True` keyword (seaborn >= 0.11).
ax = sns.kdeplot(X, fill=True)

# Number of sample points per segment (left band, middle, right band).
N = 10
# Draw one shaded layer per multiple of the standard deviation: i = 1, 2, 3.
for i in [1, 2, 3]:
    # Left band: from mean - i*std up to mean - (i - 1)*std.
    x1 = np.linspace(mean - i*std, mean - (i - 1)*std, N)
    # Middle part, already covered by the previous layers.
    x2 = np.linspace(mean - (i - 1)*std, mean + (i - 1)*std, N)
    # Right band: from mean + (i - 1)*std up to mean + i*std.
    x3 = np.linspace(mean + (i - 1)*std, mean + i*std, N)
    x = np.concatenate((x1, x2, x3))
    # Blank out points strictly inside the middle part; for i == 1 nothing is
    # blanked because the interval is empty.
    x = np.where((mean - (i - 1)*std < x) & (x < mean + (i - 1)*std), np.nan, x)
    # norm.pdf propagates NaN, so the blanked points leave a gap in the fill.
    y = norm.pdf(x, mean, std)
    ax.fill_between(x, y, alpha=0.5)

plt.xlabel("Random variable X")
plt.ylabel("Probability Density Function")
plt.xticks(ticks=range(0, 10))
plt.grid()
plt.show()