正态分布计算器
A Normal Distribution Calculator
所以我试图制作一个程序来解决各种纯 python(除了 math
之外没有其他模块)的正态分布问题,只针对 A Level 到小数点后 4 位,并且存在这个问题出现在函数 get_z_less_than_a_equal(0.75):
中。显然,如果在 except 子句中没有 assert 语句,变量都会变得混乱并发生变化。我发现的错误是递归错误。不管怎样,如果有更简单高效的方法,我们将不胜感激。
import math
mean = 0
standard_dev = 1
percentage_points = {0.5000: 0.0000, 0.4000: 0.2533, 0.3000: 0.5244, 0.2000: 0.8416, 0.1000: 1.2816, 0.0500: 1.6440, 0.0250: 1.9600, 0.0100: 2.3263, 0.0050: 2.5758, 0.0010: 3.0902, 0.0005: 3.2905}
def get_z_less_than(x):
"""
P(Z < x)
"""
return round(0.5 * (1 + math.erf((x - mean)/math.sqrt(2 * standard_dev**2))), 4)
def get_z_greater_than(x):
"""
P(Z > x)
"""
return round(1 - get_z_less_than(x), 4)
def get_z_in_range(lower_bound, upper_bound):
"""
P(lower_bound < Z < upper_bound)
"""
return round(get_z_less_than(upper_bound) - get_z_less_than(lower_bound), 4)
def get_z_less_than_a_equal(x):
"""
P(Z < a) = x
acquires a, given x
"""
# first trial: brute forcing
for i in range(401):
a = i/100
p = get_z_less_than(a)
if x == p:
return a
elif p > x:
break
# second trial: using symmetry
try:
res = -get_z_less_than_a_equal(1 - x)
except:
# third trial: using estimation
assert a, "error"
prev = get_z_less_than(a-0.01)
p = get_z_less_than(a)
if abs(x - prev) > abs(x - p):
res = a
else:
res = a - 0.01
return res
def get_z_greater_than_a_equal(x):
"""
P(Z > a) = x
"""
if x in percentage_points:
return percentage_points[x]
else:
return get_z_less_than_a_equal(1-x)
print(get_z_in_range(-1.20, 1.40))
print(get_z_less_than_a_equal(0.7517))
print(get_z_greater_than_a_equal(0.1000))
print(get_z_greater_than_a_equal(0.0322))
print(get_z_less_than_a_equal(0.1075))
print(get_z_less_than_a_equal(0.75))
因为python3.8,标准库中的statistics
模块有一个NormalDist
class,所以我们可以用它来实现我们的功能“纯python" 或至少用于测试:
import math
from statistics import NormalDist
normal_dist = NormalDist(mu=0, sigma=1)
for i in range(-2000, 2000):
test_val = i / 1000
assert get_z_less_than(test_val) == round(normal_dist.cdf(test_val), 4)
不会抛出错误,因此该部分可能工作正常
您的 get_z_less_than_a_equal
似乎相当于 NormalDist.inv_cdf
有非常有效的方法可以使用误差函数的反函数来准确计算它(参见 Wikipedia and Python implementation),但我们在标准库中没有这种方法
由于您只关心前几位,而get_z_less_than
是monotonic, we can use a simple bisection method来找到我们的解决方案
Newton's method 会快得多,而且实现起来也不难,因为我们知道 cdf 的导数就是 pdf,但可能仍然比我们需要的更复杂
def get_z_less_than_a_equal(x):
"""
P(Z < a) = x
acquires a, given x
"""
if x <= 0.0 or x >= 1.0:
raise ValueError("x must be >0.0 and <1.0")
min_res, max_res = -10, 10
while max_res - min_res > 1e-7:
mid = (max_res + min_res) / 2
if get_z_less_than(mid) < x:
min_res = mid
else:
max_res = mid
return round((max_res + min_res) / 2, 4)
让我们测试一下:
for i in range(1, 2000):
test_val = i / 2000
left_val = get_z_less_than_a_equal(test_val)
right_val = round(normal_dist.inv_cdf(test_val), 4)
assert left_val == right_val, f"{left_val} != {right_val}"
# AssertionError: -3.3201 != -3.2905
我们发现我们正在失去一些精度,那是因为 get_z_less_than
引入的误差(四舍五入到 4 位数)在我们使用它来估计其逆时被传播和放大(参见 Wikipedia - error propagation 了解详情)
所以让我们向 get_z_less_than
添加一个“digits”参数并稍微更改我们的函数:
def get_z_less_than(x, digits=4):
"""
P(Z < x)
"""
res = 0.5 * (1 + math.erf((x - mean) / math.sqrt(2 * standard_dev ** 2)))
return round(res, digits)
def get_z_less_than_a_equal(x, digits=4):
"""
P(Z < a) = x
acquires a, given x
"""
if x <= 0.0 or x >= 1.0:
raise ValueError("x must be >0.0 and <1.0")
min_res, max_res = -10, 10
while max_res - min_res > 10 ** -(digits * 2):
mid = (max_res + min_res) / 2
if get_z_less_than(mid, digits * 2) < x:
min_res = mid
else:
max_res = mid
return round((max_res + min_res) / 2, digits)
现在我们可以再次尝试相同的测试并查看它是否通过
所以我试图制作一个程序来解决各种纯 python(除了 math
之外没有其他模块)的正态分布问题,只针对 A Level 到小数点后 4 位,并且存在这个问题出现在函数 get_z_less_than_a_equal(0.75):
中。显然,如果在 except 子句中没有 assert 语句,变量都会变得混乱并发生变化。我发现的错误是递归错误。不管怎样,如果有更简单高效的方法,我们将不胜感激。
import math
mean = 0
standard_dev = 1
percentage_points = {0.5000: 0.0000, 0.4000: 0.2533, 0.3000: 0.5244, 0.2000: 0.8416, 0.1000: 1.2816, 0.0500: 1.6440, 0.0250: 1.9600, 0.0100: 2.3263, 0.0050: 2.5758, 0.0010: 3.0902, 0.0005: 3.2905}
def get_z_less_than(x):
"""
P(Z < x)
"""
return round(0.5 * (1 + math.erf((x - mean)/math.sqrt(2 * standard_dev**2))), 4)
def get_z_greater_than(x):
"""
P(Z > x)
"""
return round(1 - get_z_less_than(x), 4)
def get_z_in_range(lower_bound, upper_bound):
"""
P(lower_bound < Z < upper_bound)
"""
return round(get_z_less_than(upper_bound) - get_z_less_than(lower_bound), 4)
def get_z_less_than_a_equal(x):
"""
P(Z < a) = x
acquires a, given x
"""
# first trial: brute forcing
for i in range(401):
a = i/100
p = get_z_less_than(a)
if x == p:
return a
elif p > x:
break
# second trial: using symmetry
try:
res = -get_z_less_than_a_equal(1 - x)
except:
# third trial: using estimation
assert a, "error"
prev = get_z_less_than(a-0.01)
p = get_z_less_than(a)
if abs(x - prev) > abs(x - p):
res = a
else:
res = a - 0.01
return res
def get_z_greater_than_a_equal(x):
"""
P(Z > a) = x
"""
if x in percentage_points:
return percentage_points[x]
else:
return get_z_less_than_a_equal(1-x)
print(get_z_in_range(-1.20, 1.40))
print(get_z_less_than_a_equal(0.7517))
print(get_z_greater_than_a_equal(0.1000))
print(get_z_greater_than_a_equal(0.0322))
print(get_z_less_than_a_equal(0.1075))
print(get_z_less_than_a_equal(0.75))
因为python3.8,标准库中的statistics
模块有一个NormalDist
class,所以我们可以用它来实现我们的功能“纯python" 或至少用于测试:
import math
from statistics import NormalDist
normal_dist = NormalDist(mu=0, sigma=1)
for i in range(-2000, 2000):
test_val = i / 1000
assert get_z_less_than(test_val) == round(normal_dist.cdf(test_val), 4)
不会抛出错误,因此该部分可能工作正常
您的 get_z_less_than_a_equal
似乎相当于 NormalDist.inv_cdf
有非常有效的方法可以使用误差函数的反函数来准确计算它(参见 Wikipedia and Python implementation),但我们在标准库中没有这种方法
由于您只关心前几位,而get_z_less_than
是monotonic, we can use a simple bisection method来找到我们的解决方案
Newton's method 会快得多,而且实现起来也不难,因为我们知道 cdf 的导数就是 pdf,但可能仍然比我们需要的更复杂
def get_z_less_than_a_equal(x):
"""
P(Z < a) = x
acquires a, given x
"""
if x <= 0.0 or x >= 1.0:
raise ValueError("x must be >0.0 and <1.0")
min_res, max_res = -10, 10
while max_res - min_res > 1e-7:
mid = (max_res + min_res) / 2
if get_z_less_than(mid) < x:
min_res = mid
else:
max_res = mid
return round((max_res + min_res) / 2, 4)
让我们测试一下:
for i in range(1, 2000):
test_val = i / 2000
left_val = get_z_less_than_a_equal(test_val)
right_val = round(normal_dist.inv_cdf(test_val), 4)
assert left_val == right_val, f"{left_val} != {right_val}"
# AssertionError: -3.3201 != -3.2905
我们发现我们正在失去一些精度,那是因为 get_z_less_than
引入的误差(四舍五入到 4 位数)在我们使用它来估计其逆时被传播和放大(参见 Wikipedia - error propagation 了解详情)
所以让我们向 get_z_less_than
添加一个“digits”参数并稍微更改我们的函数:
def get_z_less_than(x, digits=4):
"""
P(Z < x)
"""
res = 0.5 * (1 + math.erf((x - mean) / math.sqrt(2 * standard_dev ** 2)))
return round(res, digits)
def get_z_less_than_a_equal(x, digits=4):
"""
P(Z < a) = x
acquires a, given x
"""
if x <= 0.0 or x >= 1.0:
raise ValueError("x must be >0.0 and <1.0")
min_res, max_res = -10, 10
while max_res - min_res > 10 ** -(digits * 2):
mid = (max_res + min_res) / 2
if get_z_less_than(mid, digits * 2) < x:
min_res = mid
else:
max_res = mid
return round((max_res + min_res) / 2, digits)
现在我们可以再次尝试相同的测试并查看它是否通过