使用阈值在层次聚类中自动化聚类
Automating Clusters in Hierarchical clustering using threshold
我想在层次聚类过程中自动确定阈值。具体来说,我不想手动输入阈值,而是想用代码检查得到的簇数是否落在 30 到 50 的范围内;如果簇数不在 30-50 范围内,就在 Python 中通过代码以 0.1 或 0.2 的步长调整阈值。我该如何实现?
import pickle
import re
import string
import sys
# import gensim
# from gensim import corpora
from time import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_word_complaints import complaint_stop_words
# TF-IDF -> cosine distances -> average-linkage hierarchy -> dendrogram ->
# flat clusters.
# NOTE(review): n_features, stop, corpus, str_path and the initial t0 are not
# defined in the visible code -- presumably set earlier in the script; confirm.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=1, token_pattern=r'\b\w+\b',
                                   max_features=n_features, stop_words=list(stop), analyzer='word')
X = tfidf_vectorizer.fit_transform(corpus).toarray()
# Indices of rows whose TF-IDF weights do not sum to zero; only these rows
# are passed on to the distance computation.
non_zero_features = np.where(np.sum(X, axis=1) != 0)[0]
print("done in %0.3fs." % (time() - t0))

print("pdist ...")
t0 = time()
# Condensed (1-D) vector of pairwise cosine distances.
cos_dist = pdist(X[non_zero_features, :], 'cosine')
print("done in %0.3fs." % (time() - t0))
# Replace undefined distances with 1 in the condensed form, so that both the
# saved square matrix and the linkage below see the same patched values.
cos_dist[np.isnan(cos_dist)] = 1
dists = np.asarray(squareform(cos_dist))

print("linkage ...")
np.savetxt(str_path + "_dist_1.csv", dists, delimiter=',')
t0 = time()
# linkage() must be given the condensed distance vector. A square 2-D array
# is interpreted as a set of observation vectors, which would cluster on
# Euclidean distances between rows of the distance matrix instead of on the
# cosine distances themselves.
linkage_matrix = linkage(cos_dist, "average")
print("done in %0.3fs." % (time() - t0))
np.savetxt(str_path + "linkage_matrix.csv", linkage_matrix, delimiter=',')

# Draw the dendrogram once, on an explicitly created figure, so that no
# untitled extra figure is produced and the layout is not computed twice.
fig, ax = plt.subplots(nrows=1, ncols=1)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(linkage_matrix, ax=ax)
# Save before showing so the file is written regardless of how the
# interactive window is handled afterwards.
fig.savefig(str_path + 'Agglo_Heirachy_dendo.png')
plt.show()

# Column 2 of the linkage matrix holds the merge distances; their extremes
# bracket the threshold search performed by get_clusters.
min_th = linkage_matrix[:, 2].min()
max_th = linkage_matrix[:, 2].max()
clusters = get_clusters(linkage_matrix, min_th, max_th)
我终于找到了解决方案:我定义了一个新函数,它用二分法反复调整阈值,直到得到的簇数落在所需范围(30 到 50)内。
def get_clusters(linkage_matrix, min_th, max_th, min_clusters=30, max_clusters=50, max_iter=100):
    """Bisect the distance threshold until the flat-cluster count is in range.

    The hierarchy is repeatedly cut with ``fcluster(..., 'distance')`` at the
    midpoint of ``[min_th, max_th]``. The highest cluster label is used as
    the cluster count: too many clusters raises the lower bound, too few
    lowers the upper bound.

    Args:
        linkage_matrix: Linkage matrix as accepted by ``sch.fcluster``.
        min_th: Lower bound of the threshold search interval.
        max_th: Upper bound of the threshold search interval.
        min_clusters: Smallest acceptable number of clusters (inclusive).
        max_clusters: Largest acceptable number of clusters (inclusive).
        max_iter: Maximum number of bisection steps before giving up.

    Returns:
        The array of flat cluster labels returned by ``sch.fcluster`` for the
        first threshold whose cluster count lies in the requested range.

    Raises:
        ValueError: If no threshold in the interval produces a cluster count
            in range, e.g. when there are fewer samples than ``min_clusters``
            or tied merge distances make the count jump over the range.
            (The unbounded loop this replaces would never terminate here.)
    """
    for _ in range(max_iter):
        print("----------------\n")
        th = min_th + (max_th - min_th) / 2
        clusters = sch.fcluster(linkage_matrix, th, 'distance')
        n_clusters = clusters.max()
        print("Clusters found: %d" % n_clusters)

        if min_clusters <= n_clusters <= max_clusters:
            return clusters

        # The midpoint can no longer move (interval collapsed to floating
        # point resolution), so further iterations would repeat this cut.
        if th == min_th or th == max_th:
            break

        if n_clusters > max_clusters:
            # Too many clusters: the cut is too low, search higher thresholds.
            min_th = th
        else:
            # Too few clusters: the cut is too high, search lower thresholds.
            max_th = th

    raise ValueError(
        "no threshold in the given interval yields between %d and %d clusters"
        % (min_clusters, max_clusters)
    )
我想在层次聚类过程中自动确定阈值。具体来说,我不想手动输入阈值,而是想用代码检查得到的簇数是否落在 30 到 50 的范围内;如果簇数不在 30-50 范围内,就在 Python 中通过代码以 0.1 或 0.2 的步长调整阈值。我该如何实现?
import pickle
import re
import string
import sys
# import gensim
# from gensim import corpora
from time import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_word_complaints import complaint_stop_words
# TF-IDF -> cosine distances -> average-linkage hierarchy -> dendrogram ->
# flat clusters.
# NOTE(review): n_features, stop, corpus, str_path and the initial t0 are not
# defined in the visible code -- presumably set earlier in the script; confirm.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=1, token_pattern=r'\b\w+\b',
                                   max_features=n_features, stop_words=list(stop), analyzer='word')
X = tfidf_vectorizer.fit_transform(corpus).toarray()
# Indices of rows whose TF-IDF weights do not sum to zero; only these rows
# are passed on to the distance computation.
non_zero_features = np.where(np.sum(X, axis=1) != 0)[0]
print("done in %0.3fs." % (time() - t0))

print("pdist ...")
t0 = time()
# Condensed (1-D) vector of pairwise cosine distances.
cos_dist = pdist(X[non_zero_features, :], 'cosine')
print("done in %0.3fs." % (time() - t0))
# Replace undefined distances with 1 in the condensed form, so that both the
# saved square matrix and the linkage below see the same patched values.
cos_dist[np.isnan(cos_dist)] = 1
dists = np.asarray(squareform(cos_dist))

print("linkage ...")
np.savetxt(str_path + "_dist_1.csv", dists, delimiter=',')
t0 = time()
# linkage() must be given the condensed distance vector. A square 2-D array
# is interpreted as a set of observation vectors, which would cluster on
# Euclidean distances between rows of the distance matrix instead of on the
# cosine distances themselves.
linkage_matrix = linkage(cos_dist, "average")
print("done in %0.3fs." % (time() - t0))
np.savetxt(str_path + "linkage_matrix.csv", linkage_matrix, delimiter=',')

# Draw the dendrogram once, on an explicitly created figure, so that no
# untitled extra figure is produced and the layout is not computed twice.
fig, ax = plt.subplots(nrows=1, ncols=1)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(linkage_matrix, ax=ax)
# Save before showing so the file is written regardless of how the
# interactive window is handled afterwards.
fig.savefig(str_path + 'Agglo_Heirachy_dendo.png')
plt.show()

# Column 2 of the linkage matrix holds the merge distances; their extremes
# bracket the threshold search performed by get_clusters.
min_th = linkage_matrix[:, 2].min()
max_th = linkage_matrix[:, 2].max()
clusters = get_clusters(linkage_matrix, min_th, max_th)
我终于找到了解决方案:我定义了一个新函数,它用二分法反复调整阈值,直到得到的簇数落在所需范围(30 到 50)内。
def get_clusters(linkage_matrix, min_th, max_th, min_clusters=30, max_clusters=50, max_iter=100):
    """Bisect the distance threshold until the flat-cluster count is in range.

    The hierarchy is repeatedly cut with ``fcluster(..., 'distance')`` at the
    midpoint of ``[min_th, max_th]``. The highest cluster label is used as
    the cluster count: too many clusters raises the lower bound, too few
    lowers the upper bound.

    Args:
        linkage_matrix: Linkage matrix as accepted by ``sch.fcluster``.
        min_th: Lower bound of the threshold search interval.
        max_th: Upper bound of the threshold search interval.
        min_clusters: Smallest acceptable number of clusters (inclusive).
        max_clusters: Largest acceptable number of clusters (inclusive).
        max_iter: Maximum number of bisection steps before giving up.

    Returns:
        The array of flat cluster labels returned by ``sch.fcluster`` for the
        first threshold whose cluster count lies in the requested range.

    Raises:
        ValueError: If no threshold in the interval produces a cluster count
            in range, e.g. when there are fewer samples than ``min_clusters``
            or tied merge distances make the count jump over the range.
            (The unbounded loop this replaces would never terminate here.)
    """
    for _ in range(max_iter):
        print("----------------\n")
        th = min_th + (max_th - min_th) / 2
        clusters = sch.fcluster(linkage_matrix, th, 'distance')
        n_clusters = clusters.max()
        print("Clusters found: %d" % n_clusters)

        if min_clusters <= n_clusters <= max_clusters:
            return clusters

        # The midpoint can no longer move (interval collapsed to floating
        # point resolution), so further iterations would repeat this cut.
        if th == min_th or th == max_th:
            break

        if n_clusters > max_clusters:
            # Too many clusters: the cut is too low, search higher thresholds.
            min_th = th
        else:
            # Too few clusters: the cut is too high, search lower thresholds.
            max_th = th

    raise ValueError(
        "no threshold in the given interval yields between %d and %d clusters"
        % (min_clusters, max_clusters)
    )