scipy 的两个正态分布的重叠概率
Overlapping probability of two normal distribution with scipy
我有两条 scipy.stats.norm(mean, std).pdf(0) 正态分布曲线,我试图找出两条曲线的微分(重叠)。
如何用 Python 中的 scipy 计算它?谢谢
可以使用answer。 @duhalme 建议获取相交点,然后使用该点定义积分限制的范围,
这里的代码看起来像,
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
norm.cdf(1.96)
def solve(m1,m2,std1,std2):
a = 1/(2*std1**2) - 1/(2*std2**2)
b = m2/(std2**2) - m1/(std1**2)
c = m1**2 /(2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)
return np.roots([a,b,c])
m1 = 2.5
std1 = 1.0
m2 = 5.0
std2 = 1.0
#Get point of intersect
result = solve(m1,m2,std1,std2)
#Get point on surface
x = np.linspace(-5,9,10000)
plot1=plt.plot(x,norm.pdf(x,m1,std1))
plot2=plt.plot(x,norm.pdf(x,m2,std2))
plot3=plt.plot(result,norm.pdf(result,m1,std1),'o')
#Plots integrated area
r = result[0]
olap = plt.fill_between(x[x>r], 0, norm.pdf(x[x>r],m1,std1),alpha=0.3)
olap = plt.fill_between(x[x<r], 0, norm.pdf(x[x<r],m2,std2),alpha=0.3)
# integrate
area = norm.cdf(r,m2,std2) + (1.-norm.cdf(r,m1,std1))
print("Area under curves ", area)
plt.show()
这里使用 cdf 来获得高斯的积分,尽管可以定义高斯的符号版本并 scipy.quad
使用(或其他)。或者,您可以使用像这样的 Monte Carlo 方法 link (即生成随机数并拒绝任何超出您想要的范围的数字)。
Ed 的回答很棒。但是,我注意到当有两个或无限(完全重叠)的接触点时它不起作用。这也是处理这两种情况的代码版本。
如果您还想继续查看分布图,可以使用 Ed 的代码。
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
def solve(m1,m2,std1,std2):
a = 1./(2.*std1**2) - 1./(2.*std2**2)
b = m2/(std2**2) - m1/(std1**2)
c = m1**2 /(2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)
return np.roots([a,b,c])
m1 = 2.5
std1 = 1.0
m2 = 5.0
std2 = 1.0
result = solve(m1,m2,std1,std2)
# 'lower' and 'upper' represent the lower and upper bounds of the space within which we are computing the overlap
if(len(result)==0): # Completely non-overlapping
overlap = 0.0
elif(len(result)==1): # One point of contact
r = result[0]
if(m1>m2):
tm,ts=m2,std2
m2,std2=m1,std1
m1,std1=tm,ts
if(r<lower): # point of contact is less than the lower boundary. order: r-l-u
overlap = (norm.cdf(upper,m1,std1)-norm.cdf(lower,m1,std1))
elif(r<upper): # point of contact is more than the upper boundary. order: l-u-r
overlap = (norm.cdf(r,m2,std2)-norm.cdf(lower,m2,std2))+(norm.cdf(upper,m1,std1)-norm.cdf(r,m1,std1))
else: # point of contact is within the upper and lower boundaries. order: l-r-u
overlap = (norm.cdf(upper,m2,std2)-norm.cdf(lower,m2,std2))
elif(len(result)==2): # Two points of contact
r1 = result[0]
r2 = result[1]
if(r1>r2):
temp=r2
r2=r1
r1=temp
if(std1>std2):
tm,ts=m2,std2
m2,std2=m1,std1
m1,std1=tm,ts
if(r1<lower):
if(r2<lower): # order: r1-r2-l-u
overlap = (norm.cdf(upper,m1,std1)-norm.cdf(lower,m1,std1))
elif(r2<upper): # order: r1-l-r2-u
overlap = (norm.cdf(r2,m2,std2)-norm.cdf(lower,m2,std2))+(norm.cdf(upper,m1,std1)-norm.cdf(r2,m1,std1))
else: # order: r1-l-u-r2
overlap = (norm.cdf(upper,m2,std2)-norm.cdf(lower,m2,std2))
elif(r1<upper):
if(r2<upper): # order: l-r1-r2-u
print norm.cdf(r1,m1,std1), "-", norm.cdf(lower,m1,std1), "+", norm.cdf(r2,m2,std2), "-", norm.cdf(r1,m2,std2), "+", norm.cdf(upper,m1,std1), "-", norm.cdf(r2,m1,std1)
overlap = (norm.cdf(r1,m1,std1)-norm.cdf(lower,m1,std1))+(norm.cdf(r2,m2,std2)-norm.cdf(r1,m2,std2))+(norm.cdf(upper,m1,std1)-norm.cdf(r2,m1,std1))
else: # order: l-r1-u-r2
overlap = (norm.cdf(r1,m1,std1)-norm.cdf(lower,m1,std1))+(norm.cdf(upper,m2,std2)-norm.cdf(r1,m2,std2))
else: # l-u-r1-r2
overlap = (norm.cdf(upper,m1,std1)-norm.cdf(lower,m1,std1))
开始 Python 3.8
,标准库提供 NormalDist
object as part of the statistics
模块。
NormalDist
可用于计算两个正态分布之间的 重叠系数 (OVL) NormalDist.overlap(other)
方法 returns 一个介于 0.0 和 1.0 之间的值给出两个概率密度函数的重叠区域:
from statistics import NormalDist
NormalDist(mu=2.5, sigma=1).overlap(NormalDist(mu=5.0, sigma=1))
# 0.2112995473337106
我有两条 scipy.stats.norm(mean, std).pdf(0) 正态分布曲线,我试图找出两条曲线的微分(重叠)。
如何用 Python 中的 scipy 计算它?谢谢
可以使用answer。 @duhalme 建议获取相交点,然后使用该点定义积分限制的范围,
这里的代码看起来像,
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
norm.cdf(1.96)
def solve(m1,m2,std1,std2):
a = 1/(2*std1**2) - 1/(2*std2**2)
b = m2/(std2**2) - m1/(std1**2)
c = m1**2 /(2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)
return np.roots([a,b,c])
m1 = 2.5
std1 = 1.0
m2 = 5.0
std2 = 1.0
#Get point of intersect
result = solve(m1,m2,std1,std2)
#Get point on surface
x = np.linspace(-5,9,10000)
plot1=plt.plot(x,norm.pdf(x,m1,std1))
plot2=plt.plot(x,norm.pdf(x,m2,std2))
plot3=plt.plot(result,norm.pdf(result,m1,std1),'o')
#Plots integrated area
r = result[0]
olap = plt.fill_between(x[x>r], 0, norm.pdf(x[x>r],m1,std1),alpha=0.3)
olap = plt.fill_between(x[x<r], 0, norm.pdf(x[x<r],m2,std2),alpha=0.3)
# integrate
area = norm.cdf(r,m2,std2) + (1.-norm.cdf(r,m1,std1))
print("Area under curves ", area)
plt.show()
这里使用 cdf 来获得高斯的积分,尽管可以定义高斯的符号版本并 scipy.quad
使用(或其他)。或者,您可以使用像这样的 Monte Carlo 方法 link (即生成随机数并拒绝任何超出您想要的范围的数字)。
Ed 的回答很棒。但是,我注意到当有两个或无限(完全重叠)的接触点时它不起作用。这也是处理这两种情况的代码版本。
如果您还想继续查看分布图,可以使用 Ed 的代码。
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
def solve(m1,m2,std1,std2):
a = 1./(2.*std1**2) - 1./(2.*std2**2)
b = m2/(std2**2) - m1/(std1**2)
c = m1**2 /(2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)
return np.roots([a,b,c])
m1 = 2.5
std1 = 1.0
m2 = 5.0
std2 = 1.0
result = solve(m1,m2,std1,std2)
# 'lower' and 'upper' represent the lower and upper bounds of the space within which we are computing the overlap
if(len(result)==0): # Completely non-overlapping
overlap = 0.0
elif(len(result)==1): # One point of contact
r = result[0]
if(m1>m2):
tm,ts=m2,std2
m2,std2=m1,std1
m1,std1=tm,ts
if(r<lower): # point of contact is less than the lower boundary. order: r-l-u
overlap = (norm.cdf(upper,m1,std1)-norm.cdf(lower,m1,std1))
elif(r<upper): # point of contact is more than the upper boundary. order: l-u-r
overlap = (norm.cdf(r,m2,std2)-norm.cdf(lower,m2,std2))+(norm.cdf(upper,m1,std1)-norm.cdf(r,m1,std1))
else: # point of contact is within the upper and lower boundaries. order: l-r-u
overlap = (norm.cdf(upper,m2,std2)-norm.cdf(lower,m2,std2))
elif(len(result)==2): # Two points of contact
r1 = result[0]
r2 = result[1]
if(r1>r2):
temp=r2
r2=r1
r1=temp
if(std1>std2):
tm,ts=m2,std2
m2,std2=m1,std1
m1,std1=tm,ts
if(r1<lower):
if(r2<lower): # order: r1-r2-l-u
overlap = (norm.cdf(upper,m1,std1)-norm.cdf(lower,m1,std1))
elif(r2<upper): # order: r1-l-r2-u
overlap = (norm.cdf(r2,m2,std2)-norm.cdf(lower,m2,std2))+(norm.cdf(upper,m1,std1)-norm.cdf(r2,m1,std1))
else: # order: r1-l-u-r2
overlap = (norm.cdf(upper,m2,std2)-norm.cdf(lower,m2,std2))
elif(r1<upper):
if(r2<upper): # order: l-r1-r2-u
print norm.cdf(r1,m1,std1), "-", norm.cdf(lower,m1,std1), "+", norm.cdf(r2,m2,std2), "-", norm.cdf(r1,m2,std2), "+", norm.cdf(upper,m1,std1), "-", norm.cdf(r2,m1,std1)
overlap = (norm.cdf(r1,m1,std1)-norm.cdf(lower,m1,std1))+(norm.cdf(r2,m2,std2)-norm.cdf(r1,m2,std2))+(norm.cdf(upper,m1,std1)-norm.cdf(r2,m1,std1))
else: # order: l-r1-u-r2
overlap = (norm.cdf(r1,m1,std1)-norm.cdf(lower,m1,std1))+(norm.cdf(upper,m2,std2)-norm.cdf(r1,m2,std2))
else: # l-u-r1-r2
overlap = (norm.cdf(upper,m1,std1)-norm.cdf(lower,m1,std1))
开始 Python 3.8
,标准库提供 NormalDist
object as part of the statistics
模块。
NormalDist
可用于计算两个正态分布之间的 重叠系数 (OVL) NormalDist.overlap(other)
方法 returns 一个介于 0.0 和 1.0 之间的值给出两个概率密度函数的重叠区域:
from statistics import NormalDist
NormalDist(mu=2.5, sigma=1).overlap(NormalDist(mu=5.0, sigma=1))
# 0.2112995473337106