Scikit-learn:如何在一维数组上运行 KMeans?
Scikit-learn: How to run KMeans on a one-dimensional array?
我有一个包含 13.876(13,876) 个值的数组,值介于 0 和 1 之间。我想仅将 sklearn.cluster.KMeans
应用于此向量以查找值分组的不同簇。但是,KMeans 似乎适用于多维数组而不是一维数组。我想有一个技巧可以让它发挥作用,但我不知道怎么做。我看到 KMeans.fit() 接受 "X : array-like or sparse matrix, shape=(n_samples, n_features)",但它想要n_samples
大于 1
我尝试把数组放进一个 np.zeros() 矩阵中再运行 KMeans,但结果是所有非空值都被归入 class 1,其余值被归入 class 0。
有人能帮我在一维数组上运行这个算法吗?
您有 1 个特征的多个样本,因此您可以使用 numpy 的 reshape:
将数组重塑为形状 (13876, 1):
from sklearn.cluster import KMeans
import numpy as np
# One feature, 13876 samples: start from a flat vector of random values.
x = np.random.random(13876)
# KMeans.fit wants a 2-D (n_samples, n_features) array, so pass the vector as
# a single column; the -1 lets NumPy infer the row count (13876 here).
km = KMeans().fit(x.reshape(-1, 1))
可以了解一下 Jenks Natural Breaks(自然断点法)。下面的 Python 函数来自该文章中给出的链接:
def get_jenks_breaks(data_list, number_class):
    """Return Jenks natural-breaks class boundaries for one-dimensional data.

    The values are split into ``number_class`` contiguous groups (in sorted
    order) so that the total within-group sum of squared deviations is
    minimal, using dynamic programming over every (prefix, class count) pair.

    Args:
        data_list: Mutable sequence of numbers providing ``sort()`` (a list
            or a NumPy array). It is sorted IN PLACE; this side effect is
            kept for backward compatibility.
        number_class: Number of classes to split the data into.

    Returns:
        A list of ``number_class + 1`` break values in ascending order. The
        first entry is the minimum, the last entry is the maximum converted
        to ``float``, and each entry in between is the largest value of the
        class below it. When more classes are requested than the data can
        fill, the surplus lower breaks stay at the minimum.

    Raises:
        ValueError: If ``data_list`` is empty.

    Note:
        The three nested loops make the run time grow with
        ``len(data_list) ** 2 * number_class``, so large inputs are slow.
    """
    if len(data_list) == 0:
        raise ValueError("data_list must contain at least one value")

    data_list.sort()
    n_values = len(data_list)

    # lower_limits[l][j]: 1-based index of the first element of the last class
    # in the best split of the first l values into j classes.
    # variances[l][j]: the summed squared deviation of that best split.
    lower_limits = [[0] * (number_class + 1) for _ in range(n_values + 1)]
    variances = [[0] * (number_class + 1) for _ in range(n_values + 1)]
    for j in range(1, number_class + 1):
        lower_limits[1][j] = 1
        variances[1][j] = 0
        for l in range(2, n_values + 1):
            variances[l][j] = float('inf')

    for l in range(2, n_values + 1):
        s1 = 0.0  # running sum of the candidate last class
        s2 = 0.0  # running sum of squares of the candidate last class
        w = 0.0   # number of values in the candidate last class
        ssd = 0.0
        # Grow the candidate last class backwards from element l to element 1.
        for m in range(1, l + 1):
            i3 = l - m + 1  # 1-based index where the candidate class starts
            val = float(data_list[i3 - 1])
            s2 += val * val
            s1 += val
            w += 1
            ssd = s2 - (s1 * s1) / w  # squared deviation of values i3..l
            i4 = i3 - 1  # number of values left for the preceding classes
            if i4 != 0:
                for j in range(2, number_class + 1):
                    candidate = ssd + variances[i4][j - 1]
                    if variances[l][j] >= candidate:
                        lower_limits[l][j] = i3
                        variances[l][j] = candidate
        # After the inner loop ssd covers values 1..l: the one-class case.
        lower_limits[l][1] = 1
        variances[l][1] = ssd

    # The data is sorted, so the first element is the minimum.
    kclass = [data_list[0]] * (number_class + 1)
    kclass[number_class] = float(data_list[n_values - 1])

    # Walk back through the table, from the top class downwards.
    k = n_values
    count_num = number_class
    while count_num >= 2:
        start = int(lower_limits[k][count_num])
        if start <= 1:
            # The class already begins at the first value, so nothing is left
            # for the lower classes; their breaks stay at the minimum. This
            # also prevents a negative index wrapping to the end of the data.
            break
        kclass[count_num - 1] = data_list[start - 2]
        k = start - 1
        count_num -= 1
    return kclass
使用和可视化:
import numpy as np
import matplotlib.pyplot as plt
# get_jenks_breaks(data_list, number_class) is the function defined above.
x = np.random.random(30)
# NOTE: get_jenks_breaks sorts its argument in place, so x is sorted after
# this call and the curve plotted below is ascending.
breaks = get_jenks_breaks(x, 5)

# Draw every break as a dashed horizontal line across the width of the data.
for line in breaks:
    plt.plot([line] * len(x), 'k--')

plt.plot(x)
plt.grid(True)
plt.show()
结果:
我有一个包含 13.876(13,876) 个值的数组,值介于 0 和 1 之间。我想仅将 sklearn.cluster.KMeans
应用于此向量以查找值分组的不同簇。但是,KMeans 似乎适用于多维数组而不是一维数组。我想有一个技巧可以让它发挥作用,但我不知道怎么做。我看到 KMeans.fit() 接受 "X : array-like or sparse matrix, shape=(n_samples, n_features)",但它想要n_samples
大于 1
我尝试把数组放进一个 np.zeros() 矩阵中再运行 KMeans,但结果是所有非空值都被归入 class 1,其余值被归入 class 0。
有人能帮我在一维数组上运行这个算法吗?
您有 1 个特征的多个样本,因此您可以使用 numpy 的 reshape:
将数组重塑为形状 (13876, 1):
from sklearn.cluster import KMeans
import numpy as np
# One feature, 13876 samples: start from a flat vector of random values.
x = np.random.random(13876)
# KMeans.fit wants a 2-D (n_samples, n_features) array, so pass the vector as
# a single column; the -1 lets NumPy infer the row count (13876 here).
km = KMeans().fit(x.reshape(-1, 1))
可以了解一下 Jenks Natural Breaks(自然断点法)。下面的 Python 函数来自该文章中给出的链接:
def get_jenks_breaks(data_list, number_class):
    """Return Jenks natural-breaks class boundaries for one-dimensional data.

    The values are split into ``number_class`` contiguous groups (in sorted
    order) so that the total within-group sum of squared deviations is
    minimal, using dynamic programming over every (prefix, class count) pair.

    Args:
        data_list: Mutable sequence of numbers providing ``sort()`` (a list
            or a NumPy array). It is sorted IN PLACE; this side effect is
            kept for backward compatibility.
        number_class: Number of classes to split the data into.

    Returns:
        A list of ``number_class + 1`` break values in ascending order. The
        first entry is the minimum, the last entry is the maximum converted
        to ``float``, and each entry in between is the largest value of the
        class below it. When more classes are requested than the data can
        fill, the surplus lower breaks stay at the minimum.

    Raises:
        ValueError: If ``data_list`` is empty.

    Note:
        The three nested loops make the run time grow with
        ``len(data_list) ** 2 * number_class``, so large inputs are slow.
    """
    if len(data_list) == 0:
        raise ValueError("data_list must contain at least one value")

    data_list.sort()
    n_values = len(data_list)

    # lower_limits[l][j]: 1-based index of the first element of the last class
    # in the best split of the first l values into j classes.
    # variances[l][j]: the summed squared deviation of that best split.
    lower_limits = [[0] * (number_class + 1) for _ in range(n_values + 1)]
    variances = [[0] * (number_class + 1) for _ in range(n_values + 1)]
    for j in range(1, number_class + 1):
        lower_limits[1][j] = 1
        variances[1][j] = 0
        for l in range(2, n_values + 1):
            variances[l][j] = float('inf')

    for l in range(2, n_values + 1):
        s1 = 0.0  # running sum of the candidate last class
        s2 = 0.0  # running sum of squares of the candidate last class
        w = 0.0   # number of values in the candidate last class
        ssd = 0.0
        # Grow the candidate last class backwards from element l to element 1.
        for m in range(1, l + 1):
            i3 = l - m + 1  # 1-based index where the candidate class starts
            val = float(data_list[i3 - 1])
            s2 += val * val
            s1 += val
            w += 1
            ssd = s2 - (s1 * s1) / w  # squared deviation of values i3..l
            i4 = i3 - 1  # number of values left for the preceding classes
            if i4 != 0:
                for j in range(2, number_class + 1):
                    candidate = ssd + variances[i4][j - 1]
                    if variances[l][j] >= candidate:
                        lower_limits[l][j] = i3
                        variances[l][j] = candidate
        # After the inner loop ssd covers values 1..l: the one-class case.
        lower_limits[l][1] = 1
        variances[l][1] = ssd

    # The data is sorted, so the first element is the minimum.
    kclass = [data_list[0]] * (number_class + 1)
    kclass[number_class] = float(data_list[n_values - 1])

    # Walk back through the table, from the top class downwards.
    k = n_values
    count_num = number_class
    while count_num >= 2:
        start = int(lower_limits[k][count_num])
        if start <= 1:
            # The class already begins at the first value, so nothing is left
            # for the lower classes; their breaks stay at the minimum. This
            # also prevents a negative index wrapping to the end of the data.
            break
        kclass[count_num - 1] = data_list[start - 2]
        k = start - 1
        count_num -= 1
    return kclass
使用和可视化:
import numpy as np
import matplotlib.pyplot as plt
# get_jenks_breaks(data_list, number_class) is the function defined above.
x = np.random.random(30)
# NOTE: get_jenks_breaks sorts its argument in place, so x is sorted after
# this call and the curve plotted below is ascending.
breaks = get_jenks_breaks(x, 5)

# Draw every break as a dashed horizontal line across the width of the data.
for line in breaks:
    plt.plot([line] * len(x), 'k--')

plt.plot(x)
plt.grid(True)
plt.show()
结果: