为什么 _joint_log_likelihood 有很大的负值
why _joint_log_likelihood has large negative values
如何解释 _joint_log_likelihood 的大负值?假设数据只有 T/F(布尔)类别变量。
# Programming assignment 2
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
import numpy as np
def _true_class_jll_sum(model, X, y):
    """Return the summed joint log-likelihood of each row's own label.

    For every row of ``X`` the value is taken from the column of
    ``model._joint_log_likelihood(X)`` that belongs to that row's label in
    ``y``; the per-row values are then added up.

    Args:
        model: A fitted ``BernoulliNB`` instance.
        X: Feature rows to score.
        y: Labels aligned row-for-row with ``X``.

    Returns:
        The sum over all rows, as a NumPy float scalar.
    """
    jll = model._joint_log_likelihood(X)
    labels = np.asarray(y)
    # Map labels to columns through classes_ rather than hard-coding 0/1 and
    # testing ``label is True``: an identity test is always False for NumPy
    # bool_ scalars, which silently sent every row to column 0.
    columns = np.searchsorted(model.classes_, labels)
    return jll[np.arange(len(labels)), columns].sum()


print("Loading datasets...")
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('binarized_xs.pkl', 'rb') as xs_file:
    Xs = pickle.load(xs_file)
with open('binarized_ys.pkl', 'rb') as ys_file:
    ys = pickle.load(ys_file)
print("Done Loading...")

# Smoothing strengths 1e-7 .. 1e7, one per power of ten.
alphaValues = [10 ** power for power in range(-7, 8)]
print(alphaValues)

# One row per dataset, one column per alpha (sized from the data instead of
# the previous hard-coded (10, 15)).
train_jll = np.zeros((len(Xs), len(alphaValues)))
test_jll = np.zeros((len(Xs), len(alphaValues)))

for i, (features, labels) in enumerate(zip(Xs, ys)):
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=1./3, random_state=3922)  # CWID - A 20413922
    for alpha, alpha_value in enumerate(alphaValues):
        model = BernoulliNB(alpha=alpha_value, binarize=0.0,
                            class_prior=None, fit_prior=True)
        model.fit(X_train, y_train)
        train_jll[i][alpha] = _true_class_jll_sum(model, X_train, y_train)
        # The test rows must be scored against y_test; the earlier version
        # indexed y_train here and so used the wrong labels.
        test_jll[i][alpha] = _true_class_jll_sum(model, X_test, y_test)

# NOTE(review): the values printed below are summed joint log-likelihoods,
# not accuracies; the labels are kept unchanged so the output format is stable.
print("Train set accuracies")
for row in train_jll:
    print("\t".join("{0:.4f}".format(n) for n in row))
print("\nTest set accuracies")
for row in test_jll:
    print("\t".join("{0:.4f}".format(n) for n in row))

with open('result.pkl', 'wb') as result_file:
    pickle.dump((train_jll, test_jll), result_file)
输出-
正在加载数据集...
加载完成...
[1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]
训练集精度
-2076.9291 -1865.0913 -1653.2535 -1441.4161 -1229.5828 -1017.7904 -806.4090 -599.2828 -433.6816 -393.0386 -393.9880 -394.3349 -394.37134 -3777.694
-32191.9825 -32125.2076 -32058.4325 -31991.6569 -31924.8758 -31858.0396 -31790.7286 -31723.9835 -31809.0404 -34359.6845 -44846.6500 -50567.2306 -51394.1834 -51480.5729 -51489.2505
-9399.8784 -8674.5640 -7949.2493 -7223.9305 -6498.5714 -5772.8111 -5043.1794 -4285.4285 -3557.9619 -4441.6394 -7156.1042 -7949.3959 -8043.5842 -8053.1794 -8054.1407
-353033.8920 -353017.7734 -353001.6555 -352985.5397 -352969.4446 -352953.5568 -352939.8490 -352954.5902 -353374.1657 -358382.4072 -409460.4788 -733413.0483 -1191967.4850 -1306874.0175 -1320147.6708
-3180.1231 -3180.1231 -3180.1231 -3180.1227 -3180.1193 -3180.0847 -3179.7394 -3176.3400 -3147.0704 -3059.4605 -3189.5939 -3280.1913 -3292.5986 -3293.8829 -3294.0118
-4574.8093 -4574.8093 -4574.8092 -4574.8085 -4574.8016 -4574.7321 -4574.0420 -4567.5747 -4534.8631 -4827.2279 -5720.8408 -5979.6362 -6010.0678 -6013.1640 -6013.4741
-12062.2160 -11525.7137 -10989.2111 -10452.7065 -9916.1810 -9379.4479 -8840.6448 -8281.8848 -7575.0972 -6582.4242 -6736.2181 -7272.9840 -7364.0583 -7373.7458 -7374.7206
-5259.2360 -5259.2360 -5259.2360 -5259.2361 -5259.2374 -5259.2503 -5259.3793 -5260.7374
-3514.5228 -3489.1943 -3463.8658 -3438.5362 -3413.1960 -3387.7507 -3361.3059 -3328.3483 -3321.7284 -4067.7458 -5939.6312 -6559.4446 -6635.7135 -6643.5165 -6644.2986
-16439.2193 -16439.2192 -16439.2186 -16439.2126 -16439.1520 -16438.5472 -16432.5838 -16380.1945 -16158.3379 -16977.7163 -21690.7497 -23938.6506 -24244.2920 -24275.9518 -24279.1292
测试集准确度
-1053.7924 -945.5709 -837.3494 -729.1282 -620.9093 -512.7138 -404.7524 -299.1838 -215.8272 -196.2691 -196.9655 -197.1646 -197.1862 -197.17813 -197.1862 -197.18813
-15989.8827 -15957.6465 -15925.4103 -15893.1734 -15860.9307 -15828.6299 -15795.8012 -15761.3437 -15802.3473 -17104.9942 -22402.1020 -25280.9583 -25696.8189 -25740.2591 -25744.6225
-4534.7085 -4177.8078 -3820.9070 -3464.0050 -3107.0909 -2750.0567 -2391.8688 -2025.7431 -1695.5868 -2189.5685 -3580.8558 -3984.4531 -4032.3297 -4037.2065 -4037.6951
-177068.0250 -177042.6961 -177017.3678 -176992.0408 -176966.7274 -176941.5493 -176917.7628 -176910.2919 -177109.2109 -179594.7458 -205083.1285 -366890.6006 -596016.9983 -653440.6438 -660074.2024
-1620.5941 -1620.5941 -1620.5940 -1620.5938 -1620.5919 -1620.5727 -1620.3805 -1618.4869 -1602.0804 -1548.8017 -1601.9069 -1644.1081 -1649.9448 -1650.5496 -1650.6103
-2300.3443 -2300.3443 -2300.3443 -2300.3440 -2300.3405 -2300.3063 -2299.9661 -2296.7773 -2280.6516 -2426.8969 -2874.4986 -3004.2282 -3019.4848 -3021.0371 -3021.1926
-6679.6097 -6357.2478 -6034.8857 -5712.5225 -5390.1476 -5067.6560 -4744.0017 -4409.1390 -3991.1972 -3405.6240 -3400.4154 -3643.0063 -3685.1813 -3689.6778 -3690.1304
-2566.8863 -2566.8863 -2566.8863 -2566.8864 -2566.8872 -2566.8959 -2566.9826 -2567.8859 -2579.5423 -2735.3703 -3228.0735 -3432.8776 -3459.7340 -3462.5032 -3462.7809
-1767.5406 -1758.3303 -1749.1199 -1739.9090 -1730.6931 -1721.4272 -1711.6835 -1698.6756 -1696.0342 -2056.3261 -2977.0952 -3283.5231 -3321.2606 -3325.1219 -3325.5089
-8463.1185 -8460.8158 -8458.5130 -8456.2073 -8453.8744 -8451.2691 -8445.9788 -8417.2311 -8295.8823 -8635.8290 -10888.2128 -11982.1214 -12131.2829 -12146.7387 -12148.2898
这更像是一个统计学问题:对数似然取很大的负值,意味着对应观测出现的概率非常小。
联合对数似然(joint log-likelihood)取到非常大的负值是很常见的,因为它是所有观测的对数概率之和。假设您得到的值是 $-x$,将其除以数据集中的观测数(假设数据集有 $n$ 行),那么每个观测的平均概率(几何平均)就是 $e^{-x/n}$。
如何解释 _joint_log_likelihood 的大负值?假设数据只有 T/F(布尔)类别变量。
# Programming assignment 2
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
import numpy as np
def _true_class_jll_sum(model, X, y):
    """Return the summed joint log-likelihood of each row's own label.

    For every row of ``X`` the value is taken from the column of
    ``model._joint_log_likelihood(X)`` that belongs to that row's label in
    ``y``; the per-row values are then added up.

    Args:
        model: A fitted ``BernoulliNB`` instance.
        X: Feature rows to score.
        y: Labels aligned row-for-row with ``X``.

    Returns:
        The sum over all rows, as a NumPy float scalar.
    """
    jll = model._joint_log_likelihood(X)
    labels = np.asarray(y)
    # Map labels to columns through classes_ rather than hard-coding 0/1 and
    # testing ``label is True``: an identity test is always False for NumPy
    # bool_ scalars, which silently sent every row to column 0.
    columns = np.searchsorted(model.classes_, labels)
    return jll[np.arange(len(labels)), columns].sum()


print("Loading datasets...")
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('binarized_xs.pkl', 'rb') as xs_file:
    Xs = pickle.load(xs_file)
with open('binarized_ys.pkl', 'rb') as ys_file:
    ys = pickle.load(ys_file)
print("Done Loading...")

# Smoothing strengths 1e-7 .. 1e7, one per power of ten.
alphaValues = [10 ** power for power in range(-7, 8)]
print(alphaValues)

# One row per dataset, one column per alpha (sized from the data instead of
# the previous hard-coded (10, 15)).
train_jll = np.zeros((len(Xs), len(alphaValues)))
test_jll = np.zeros((len(Xs), len(alphaValues)))

for i, (features, labels) in enumerate(zip(Xs, ys)):
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=1./3, random_state=3922)  # CWID - A 20413922
    for alpha, alpha_value in enumerate(alphaValues):
        model = BernoulliNB(alpha=alpha_value, binarize=0.0,
                            class_prior=None, fit_prior=True)
        model.fit(X_train, y_train)
        train_jll[i][alpha] = _true_class_jll_sum(model, X_train, y_train)
        # The test rows must be scored against y_test; the earlier version
        # indexed y_train here and so used the wrong labels.
        test_jll[i][alpha] = _true_class_jll_sum(model, X_test, y_test)

# NOTE(review): the values printed below are summed joint log-likelihoods,
# not accuracies; the labels are kept unchanged so the output format is stable.
print("Train set accuracies")
for row in train_jll:
    print("\t".join("{0:.4f}".format(n) for n in row))
print("\nTest set accuracies")
for row in test_jll:
    print("\t".join("{0:.4f}".format(n) for n in row))

with open('result.pkl', 'wb') as result_file:
    pickle.dump((train_jll, test_jll), result_file)
输出-
正在加载数据集... 加载完成... [1e-07、1e-06、1e-05、0.0001、0.001、0.01、0.1、1、10、100、1000、10000、100000、1000000、10000000] 训练集精度 -2076.9291 -1865.0913 -1653.2535 -1441.4161 -1229.5828 -1017.7904 -806.4090 -599.2828 -433.6816 -393.0386 -393.9880 -394.3349 -394.37134 -3777.694 -32191.9825 -32125.2076 -32058.4325 -31991.6569 -31924.8758 -31858.0396 -31790.7286 -31723.9835 -31809.0404 -34359.6845 -44846.6500 -50567.2306 -51394.1834 -51480.5729 -51489.2505 -9399.8784 -8674.5640 -7949.2493 -7223.9305 -6498.5714 -5772.8111 -5043.1794 -4285.4285 -3557.9619 -4441.6394 -7156.1042 -7949.3959 -8043.5842 -8053.1794 -8054.1407 -353033.8920 -353017.7734 -353001.6555 -352985.5397 -352969.4446 -352953.5568 -352939.8490 -352954.5902 -353374.1657 -358382.4072 -409460.4788 -733413.0483 -1191967.4850 -1306874.0175 -1320147.6708 -3180.1231 -3180.1231 -3180.1231 -3180.1227 -3180.1193 -3180.0847 -3179.7394 -3176.3400 -3147.0704 -3059.4605 -3189.5939 -3280.1913 -3292.5986 -3293.8829 -3294.0118 -4574.8093 -4574.8093 -4574.8092 -4574.8085 -4574.8016 -4574.7321 -4574.0420 -4567.5747 -4534.8631 -4827.2279 -5720.8408 -5979.6362 -6010.0678 -6013.1640 -6013.4741 -12062.2160 -11525.7137 -10989.2111 -10452.7065 -9916.1810 -9379.4479 -8840.6448 -8281.8848 -7575.0972 -6582.4242 -6736.2181 -7272.9840 -7364.0583 -7373.7458 -7374.7206 -5259.2360 -5259.2360 -5259.2360 -5259.2361 -5259.2374 -5259.2503 -5259.3793 -5260.7374 -3514.5228 -3489.1943 -3463.8658 -3438.5362 -3413.1960 -3387.7507 -3361.3059 -3328.3483 -3321.7284 -4067.7458 -5939.6312 -6559.4446 -6635.7135 -6643.5165 -6644.2986 -16439.2193 -16439.2192 -16439.2186 -16439.2126 -16439.1520 -16438.5472 -16432.5838 -16380.1945 -16158.3379 -16977.7163 -21690.7497 -23938.6506 -24244.2920 -24275.9518 -24279.1292
测试集准确度 -1053.7924 -945.5709 -837.3494 -729.1282 -620.9093 -512.7138 -404.7524 -299.1838 -215.8272 -196.2691 -196.9655 -197.1646 -197.1862 -197.17813 -197.1862 -197.18813 -15989.8827 -15957.6465 -15925.4103 -15893.1734 -15860.9307 -15828.6299 -15795.8012 -15761.3437 -15802.3473 -17104.9942 -22402.1020 -25280.9583 -25696.8189 -25740.2591 -25744.6225 -4534.7085 -4177.8078 -3820.9070 -3464.0050 -3107.0909 -2750.0567 -2391.8688 -2025.7431 -1695.5868 -2189.5685 -3580.8558 -3984.4531 -4032.3297 -4037.2065 -4037.6951 -177068.0250 -177042.6961 -177017.3678 -176992.0408 -176966.7274 -176941.5493 -176917.7628 -176910.2919 -177109.2109 -179594.7458 -205083.1285 -366890.6006 -596016.9983 -653440.6438 -660074.2024 -1620.5941 -1620.5941 -1620.5940 -1620.5938 -1620.5919 -1620.5727 -1620.3805 -1618.4869 -1602.0804 -1548.8017 -1601.9069 -1644.1081 -1649.9448 -1650.5496 -1650.6103 -2300.3443 -2300.3443 -2300.3443 -2300.3440 -2300.3405 -2300.3063 -2299.9661 -2296.7773 -2280.6516 -2426.8969 -2874.4986 -3004.2282 -3019.4848 -3021.0371 -3021.1926 -6679.6097 -6357.2478 -6034.8857 -5712.5225 -5390.1476 -5067.6560 -4744.0017 -4409.1390 -3991.1972 -3405.6240 -3400.4154 -3643.0063 -3685.1813 -3689.6778 -3690.1304 -2566.8863 -2566.8863 -2566.8863 -2566.8864 -2566.8872 -2566.8959 -2566.9826 -2567.8859 -2579.5423 -2735.3703 -3228.0735 -3432.8776 -3459.7340 -3462.5032 -3462.7809 -1767.5406 -1758.3303 -1749.1199 -1739.9090 -1730.6931 -1721.4272 -1711.6835 -1698.6756 -1696.0342 -2056.3261 -2977.0952 -3283.5231 -3321.2606 -3325.1219 -3325.5089 -8463.1185 -8460.8158 -8458.5130 -8456.2073 -8453.8744 -8451.2691 -8445.9788 -8417.2311 -8295.8823 -8635.8290 -10888.2128 -11982.1214 -12131.2829 -12146.7387 -12148.2898
这更像是一个统计学问题:对数似然取很大的负值,意味着对应观测出现的概率非常小。
联合对数似然(joint log-likelihood)取到非常大的负值是很常见的,因为它是所有观测的对数概率之和。假设您得到的值是 $-x$,将其除以数据集中的观测数(假设数据集有 $n$ 行),那么每个观测的平均概率(几何平均)就是 $e^{-x/n}$。