Run Different Scikit-learn Clustering Algorithms on Dataset

I have a dataframe as shown below. Its shape is (24, 7):

Name   x1   x2   x3    x4    x5    x6
Harry  102  204  0.43  0.21  1.02  0.39
James  242  500  0.31  0.11  0.03  0.73
.
.
.
Mike   3555 4002 0.12  0.03  0.52  0.11
Henry  532  643  0.01  0.02  0.33  0.10

I want to run the different Scikit-learn clustering algorithms from the script below on this dataframe. However, the input data in the example looks confusing and I am not sure how to feed my dataframe into it:

https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py

PS : replace data = X_data.iloc[:20000] with your own X (your dataframe).

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

from sklearn import preprocessing, metrics, mixture
from sklearn.cluster import KMeans, DBSCAN

comp_model = pd.DataFrame(columns=['Model', 'Score_Silhouette',
                                   'num_clusters', 'size_clusters',
                                   'parameters'])

K-Means :

def k_means(X_data, nb_clusters, model_comp):

    ks = nb_clusters
    inertias = []
    data = X_data.iloc[:20000]
    X = data.values
    X_scaled = preprocessing.StandardScaler().fit_transform(X)

    for num_clusters in ks:
        # Create a KMeans instance with k clusters: model
        model = KMeans(n_clusters=num_clusters, n_init=1)

        # Fit model to samples
        model.fit(X_scaled)

        # Append the inertia to the list of inertias
        inertias.append(model.inertia_)

        silh = metrics.silhouette_score(X_scaled, model.labels_)

        # Counting the amount of data in each cluster
        taille_clusters = Counter(model.labels_)

        data = [{'Model': 'kMeans',
                 'Score_Silhouette': silh,
                 'num_clusters': num_clusters,
                 'size_clusters': taille_clusters,
                 'parameters': 'nb_clusters :'+str(num_clusters)}]

        model_comp = pd.concat([model_comp, pd.DataFrame(data)],
                               ignore_index=True, sort=False)

    # Plot ks vs inertias
    plt.plot(ks, inertias, '-o')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.xticks(ks)
    plt.show()
    return model_comp

comp_model = k_means(X_data=df,
                     nb_clusters=np.arange(2, 11, 1),
                     model_comp=comp_model)

DBSCAN :

def dbscan_grid_search(X_data, model_comp, eps_space=(0.5,),
                       min_samples_space=(5,), min_clust=0, max_clust=10):

    data = X_data.iloc[:20000]
    X = data.values
    X_scaled = preprocessing.StandardScaler().fit_transform(X)

    # Starting a tally of total iterations
    n_iterations = 0

    # Looping over each combination of hyperparameters
    for eps_val in eps_space:
        for samples_val in min_samples_space:

            dbscan_grid = DBSCAN(eps=eps_val,
                                 min_samples=samples_val)

            # Fit and predict cluster labels
            clusters = dbscan_grid.fit_predict(X=X_scaled)

            # Counting the amount of data in each cluster
            cluster_count = Counter(clusters)

            #n_clusters = sum(abs(pd.np.unique(clusters))) - 1
            n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)

            # Increasing the iteration tally with each run of the loop
            n_iterations += 1

            # Appending the lst each time n_clusters criteria is reached
            if n_clusters >= min_clust and n_clusters <= max_clust:

                silh = metrics.silhouette_score(X_scaled, clusters)

                data = [{'Model': 'Dbscan',
                         'Score_Silhouette': silh,
                         'num_clusters': n_clusters,
                         'size_clusters': cluster_count,
                         'parameters': 'eps :'+str(eps_val)+'+ samples_val :'+str(samples_val)}]

                model_comp = pd.concat([model_comp, pd.DataFrame(data)],
                                       ignore_index=True, sort=False)

    return model_comp
comp_model = dbscan_grid_search(X_data=df,
                                model_comp=comp_model,
                                eps_space=np.arange(0.1, 5, 0.6),
                                min_samples_space=np.arange(1, 30, 3),
                                min_clust=2,
                                max_clust=10)

GMM :

def gmm(X_data, nb_clusters, model_comp):

    ks = nb_clusters
    data = X_data.iloc[:20000]
    X = data.values
    X_scaled = preprocessing.StandardScaler().fit_transform(X)

    for num_clusters in ks:
        # Create a GaussianMixture instance with num_clusters components
        gmm = mixture.GaussianMixture(n_components=num_clusters)

        # Fit model to samples and predict cluster labels
        gmm.fit(X_scaled)
        pred = gmm.predict(X_scaled)

        cluster_count = Counter(pred)

        silh = metrics.silhouette_score(X_scaled, pred)

        data = [{'Model': 'GMM',
                 'Score_Silhouette': silh,
                 'num_clusters': num_clusters,
                 'size_clusters': cluster_count,
                 'parameters': 'nb_clusters :'+str(num_clusters)}]

        model_comp = pd.concat([model_comp, pd.DataFrame(data)],
                               ignore_index=True, sort=False)

    return model_comp

comp_model = gmm(X_data=df,
                 nb_clusters=np.arange(2, 11, 1),
                 model_comp=comp_model
                 )

In the end you will get comp_model, which contains the results of all your algorithms. Here I used three algorithms; once they have run, you pick the one that fits your data best (based on the silhouette score and the number of clusters).
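
For example, a quick way to compare the runs stored in comp_model (a minimal sketch, assuming the three functions above have already been called) is to sort by silhouette score:

# Sort all runs by silhouette score (higher is better) and look at the best ones
best_runs = comp_model.sort_values(by='Score_Silhouette', ascending=False)
print(best_runs[['Model', 'num_clusters', 'Score_Silhouette', 'parameters']].head(10))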

You should also check how the points are distributed across each cluster: https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py
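
Following that example, here is a short sketch of a per-cluster silhouette check for one chosen configuration (hypothetical: KMeans with 3 clusters on the same scaled features; adapt it to whichever model you picked from comp_model):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler

# Re-fit the configuration selected from comp_model (here: KMeans, 3 clusters)
X_scaled = StandardScaler().fit_transform(df[['x1', 'x2', 'x3', 'x4', 'x5', 'x6']].values)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X_scaled)

print('mean silhouette:', silhouette_score(X_scaled, labels))

# Per-sample silhouette values, averaged per cluster, show which clusters are well separated
sample_silh = silhouette_samples(X_scaled, labels)
for k in sorted(set(labels)):
    print('cluster', k, 'mean silhouette =', round(sample_silh[labels == k].mean(), 3))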

There are two main differences between your scenario and the scikit-learn example you link to:

  1. You only have a single dataset, not several different datasets to compare.
  2. You have six features instead of just two.

The first point lets you simplify the example code by removing the loop over the different datasets and the calculations tied to it. The second point means you cannot easily plot your results. Instead, you can add the predicted cluster labels found by each algorithm to your dataset.

So you can modify the example code like this:

import time
import warnings

import numpy as np
import pandas as pd

from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

np.random.seed(0)

# ============
# Introduce your dataset
# ============
my_df = ...  # Insert your data here, as a pandas DataFrame.
features = [f'x{i}' for i in range(1, 7)]
X = my_df[features].values

# ============
# Set up cluster parameters
# ============
params = {
    "quantile": 0.3,
    "eps": 0.3,
    "damping": 0.9,
    "preference": -200,
    "n_neighbors": 3,
    "n_clusters": 3,
    "min_samples": 7,
    "xi": 0.05,
    "min_cluster_size": 0.1,
}

# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)

# estimate bandwidth for mean shift
bandwidth = max(cluster.estimate_bandwidth(X, quantile=params["quantile"]),
                0.001)  # arbitrary correction to avoid 0

# connectivity matrix for structured Ward
connectivity = kneighbors_graph(
    X, n_neighbors=params["n_neighbors"], include_self=False
)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)

# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
ward = cluster.AgglomerativeClustering(
    n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
)
spectral = cluster.SpectralClustering(
    n_clusters=params["n_clusters"],
    eigen_solver="arpack",
    affinity="nearest_neighbors",
)
dbscan = cluster.DBSCAN(eps=params["eps"])
optics = cluster.OPTICS(
    min_samples=params["min_samples"],
    xi=params["xi"],
    min_cluster_size=params["min_cluster_size"],
)
affinity_propagation = cluster.AffinityPropagation(
    damping=params["damping"], preference=params["preference"], random_state=0
)
average_linkage = cluster.AgglomerativeClustering(
    linkage="average",
    affinity="cityblock",
    n_clusters=params["n_clusters"],
    connectivity=connectivity,
)
birch = cluster.Birch(n_clusters=params["n_clusters"])
gmm = mixture.GaussianMixture(
    n_components=params["n_clusters"], covariance_type="full"
)

clustering_algorithms = (
    ("MiniBatch\nKMeans", two_means),
    ("Affinity\nPropagation", affinity_propagation),
    ("MeanShift", ms),
    ("Spectral\nClustering", spectral),
    ("Ward", ward),
    ("Agglomerative\nClustering", average_linkage),
    ("DBSCAN", dbscan),
    ("OPTICS", optics),
    ("BIRCH", birch),
    ("Gaussian\nMixture", gmm),
)

for name, algorithm in clustering_algorithms:
    t0 = time.time()

    # catch warnings related to kneighbors_graph
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the "
            + "connectivity matrix is [0-9]{1,2}"
            + " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning,
        )
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding"
            + " may not work as expected.",
            category=UserWarning,
        )
        algorithm.fit(X)

    t1 = time.time()
    if hasattr(algorithm, "labels_"):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)

    # Add cluster labels to the dataset
    my_df[name] = y_pred
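
After the loop, my_df has one extra column per algorithm containing its cluster assignments; a quick way to compare them (nothing assumed beyond the code above) is to look at the cluster sizes per algorithm:

# Compare how each algorithm partitioned the rows
for name, _ in clustering_algorithms:
    print(name.replace('\n', ' '), my_df[name].value_counts().to_dict())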