Different results obtained with GMM
I want to cluster the classic iris dataset using a GMM. I got my dataset from:
https://gist.github.com/netj/8836201
My program so far is as follows:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture as mix
from sklearn.cross_validation import StratifiedKFold

def main():
    data=pd.read_csv("iris.csv",header=None)
    data=data.iloc[1:]
    data[4]=data[4].astype("category")
    data[4]=data[4].cat.codes
    target=np.array(data.pop(4))
    X=np.array(data).astype(float)
    kf=StratifiedKFold(target,n_folds=10,shuffle=True,random_state=1234)
    train_ind,test_ind=next(iter(kf))
    X_train=X[train_ind]
    y_train=target[train_ind]
    gmm_calc(X_train,"full",y_train)

def gmm_calc(X_train,cov,y_train):
    print X_train
    print y_train
    n_classes = len(np.unique(y_train))
    model=mix(n_components=n_classes,covariance_type="full")
    model.means_ = np.array([X_train[y_train == i].mean(axis=0) for i in
                             xrange(n_classes)])
    model.fit(X_train)
    y_predict=model.predict(X_train)
    print cov," ",y_train
    print cov," ",y_predict
    print (np.mean(y_predict==y_train))*100
The problem comes when I try to count the number of coincidences y_predict == y_train, because every time I run the program I get different results. For example:
First run:
full [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
full [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2
2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.0
Second run:
full [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
full [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
33.33333333333333
Third run:
full [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
full [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
98.51851851851852
So, as you can see, the results differ on every run. I found some code on the Internet, located at:
https://scikit-learn.org/0.16/auto_examples/mixture/plot_gmm_classifier.html
but there they get a training-set accuracy of about 82% with full covariance. What am I doing wrong in this case?
Thanks
Update: I found that the Internet example uses the old GMM rather than the new GaussianMixture. I also found that, in that example, the GMM parameters were initialized in a supervised manner:

classifier.means_ = np.array([X_train[y_train == i].mean(axis=0)
                              for i in xrange(n_classes)])
I have put the modified code above, but the results still change every time I run it, whereas with the GMM library that does not happen.
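(As a side note: the new GaussianMixture accepts a means_init argument at construction time, which is the supported way to seed the component means; assigning model.means_ before fit() has no lasting effect, because fit() re-initializes the parameters. A minimal sketch of that supervised initialization, reusing X_train and y_train from the code above:)

import numpy as np
from sklearn.mixture import GaussianMixture as mix

# Start EM from the per-class means instead of a random initialization.
n_classes = len(np.unique(y_train))
init_means = np.array([X_train[y_train == i].mean(axis=0)
                       for i in range(n_classes)])
model = mix(n_components=n_classes, covariance_type="full",
            means_init=init_means)
model.fit(X_train)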
1) The GMM classifier uses the Expectation–Maximization algorithm to fit a mixture of Gaussian models: the Gaussian components are centered randomly on data points, and the algorithm then moves them until it converges to a local optimum. Because of the random initialization, results can vary from run to run. You therefore also have to use the random_state parameter of GMM (or try to set a higher number of initializations n_init and expect more similar results).
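For example, a minimal sketch that pins the runs down (both random_state and n_init are real GaussianMixture parameters):

from sklearn.mixture import GaussianMixture as mix

# Fixing random_state makes each run start EM from the same place;
# n_init > 1 keeps the best of several restarts, which further
# stabilizes the fitted components.
model = mix(n_components=3, covariance_type="full",
            random_state=1234, n_init=10)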
2) The accuracy problem arises because GMM (just like kmeans) simply fits n Gaussians and reports the "number" of the Gaussian component each point belongs to; this number differs from run to run. You can see in your predictions that the clusters are the same, but their labels are swapped: (1,2,0) -> (1,0,2) -> (0,1,2), and the last combination happens to coincide with the correct classes, so you get a 98% score. If you plot them, you will see that the Gaussians themselves tend to stay the same across runs in this case. You can use some clustering metrics that take this into account:
>>> from sklearn import metrics
>>> [round(i, 5) for i in (metrics.homogeneity_score(y_predict, y_train),
...                        metrics.completeness_score(y_predict, y_train),
...                        metrics.v_measure_score(y_predict, y_train),
...                        metrics.adjusted_rand_score(y_predict, y_train),
...                        metrics.adjusted_mutual_info_score(y_predict, y_train))]
[0.86443, 0.8575, 0.86095, 0.84893, 0.85506]
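Alternatively, if you want an accuracy figure that ignores the arbitrary label permutation, you can first match predicted cluster labels to true labels. Here is a sketch (not from the original answer) that does the matching with the Hungarian algorithm via scipy:

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def permutation_invariant_accuracy(y_true, y_pred):
    # Count agreements for every (true label, predicted label) pair,
    # then pick the label permutation that maximizes the matches.
    cm = confusion_matrix(y_true, y_pred)
    row_ind, col_ind = linear_sum_assignment(-cm)
    return cm[row_ind, col_ind].sum() / float(cm.sum())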
Plotting code, adapted from https://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_covariances.html. Note that the code differs between scikit-learn versions; if you use the old one, you need to replace the make_ellipses function:
model = mix(n_components=len(np.unique(y_train)), covariance_type="full",
            verbose=0, n_init=100)
X_train = X_train.astype(float)
model.fit(X_train)
y_predict = model.predict(X_train)

import matplotlib as mpl
import matplotlib.pyplot as plt

def make_ellipses(gmm, ax):
    # Draw one ellipse per fitted component, projected onto the
    # first two features.
    for n, color in enumerate(['navy', 'turquoise', 'darkorange']):
        if gmm.covariance_type == 'full':
            covariances = gmm.covariances_[n][:2, :2]
        elif gmm.covariance_type == 'tied':
            covariances = gmm.covariances_[:2, :2]
        elif gmm.covariance_type == 'diag':
            covariances = np.diag(gmm.covariances_[n][:2])
        elif gmm.covariance_type == 'spherical':
            covariances = np.eye(gmm.means_.shape[1]) * gmm.covariances_[n]
        v, w = np.linalg.eigh(covariances)
        u = w[0] / np.linalg.norm(w[0])
        angle = np.arctan2(u[1], u[0])
        angle = 180 * angle / np.pi  # convert to degrees
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1],
                                  180 + angle, color=color)
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)

def plot(model, X, y, y_predict):
    h = plt.subplot(1, 1, 1)
    plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05,
                        left=.01, right=.99)
    make_ellipses(model, h)
    # Scatter the points of each true class in its own color.
    for n, color in enumerate(['navy', 'turquoise', 'darkorange']):
        plt.scatter(X[y == n][:, 0], X[y == n][:, 1], color=color, marker='x')
    plt.text(0.05, 0.9, 'Accuracy: %.1f' % ((np.mean(y_predict == y)) * 100),
             transform=h.transAxes)
    plt.show()

plot(model, X_train, y_train, y_predict)
This reply comes late to your query, but it may benefit others. As @hellpanderr posted, use "random_state=1" in GMM.
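In the code from the question, that would be, for example (random_state is a real GaussianMixture parameter):

model = mix(n_components=n_classes, covariance_type="full", random_state=1)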