sklearn 聚类:计算 TF-IDF 加权数据的轮廓系数(silhouette coefficient)
sklearn clustering: calculate silhouette coefficient on TF-IDF-weighted data
我想像 scikit-learn 示例 silhouette_analysis 那样计算 silhouette_score。
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus to vectorize: a list holding one string per document.
sampleText = ["Some text for document clustering"]

# Fit the vectorizer on the corpus and build the TF-IDF document-term matrix.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(sampleText)
我该如何转换我的 tfidf_matrix 来做这样的事情:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Number of documents (rows of the matrix). len() raises TypeError on a SciPy
# sparse matrix, so the row count is read from .shape instead.
n_samples = tfidf_matrix.shape[0]

for num_clusters in range(2, 6):
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, n_samples + (num_clusters + 1) * 10])

    km = KMeans(
        n_clusters=num_clusters,
        n_init=10,  # number of runs with different centroid seeds
        random_state=1,  # fixes the seed
    )
    cluster_labels = km.fit_predict(tfidf_matrix)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(tfidf_matrix, cluster_labels)
TF-IDF 矩阵是高维的,必须先降到二维。可以把 TF-IDF 特征投影到方差最大的两个主成分上来实现;这里我使用 PCA 对 TF-IDF 进行降维。完整示例:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score

# Corpus to cluster: one string per document.
# NOTE(review): the loop below tries up to 5 clusters and silhouette_score
# needs more samples than clusters, so append more documents here before
# running -- a single document is only a placeholder.
sampleText = []
sampleText.append("Some text for document clustering")

# Fit the vectorizer once and keep the TF-IDF document-term matrix.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(sampleText)

# PCA needs dense input. toarray() yields a plain ndarray, whereas todense()
# yields np.matrix, which current scikit-learn estimators reject.
X = tfidf_matrix.toarray()

# Project the TF-IDF features onto the two principal components.
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)

for num_clusters in range(2, 6):
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(data2D) + (num_clusters + 1) * 10])

    km = KMeans(
        n_clusters=num_clusters,
        n_init=10,  # number of runs with different centroid seeds
        random_state=1,  # fixes the seed
    )
    cluster_labels = km.fit_predict(data2D)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(data2D, cluster_labels)
我想像 scikit-learn 示例 silhouette_analysis 那样计算 silhouette_score。
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus to vectorize: a list holding one string per document.
sampleText = ["Some text for document clustering"]

# Fit the vectorizer on the corpus and build the TF-IDF document-term matrix.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(sampleText)
我该如何转换我的 tfidf_matrix 来做这样的事情:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Number of documents (rows of the matrix). len() raises TypeError on a SciPy
# sparse matrix, so the row count is read from .shape instead.
n_samples = tfidf_matrix.shape[0]

for num_clusters in range(2, 6):
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, n_samples + (num_clusters + 1) * 10])

    km = KMeans(
        n_clusters=num_clusters,
        n_init=10,  # number of runs with different centroid seeds
        random_state=1,  # fixes the seed
    )
    cluster_labels = km.fit_predict(tfidf_matrix)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(tfidf_matrix, cluster_labels)
TF-IDF 矩阵是高维的,必须先降到二维。可以把 TF-IDF 特征投影到方差最大的两个主成分上来实现;这里我使用 PCA 对 TF-IDF 进行降维。完整示例:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score

# Corpus to cluster: one string per document.
# NOTE(review): the loop below tries up to 5 clusters and silhouette_score
# needs more samples than clusters, so append more documents here before
# running -- a single document is only a placeholder.
sampleText = []
sampleText.append("Some text for document clustering")

# Fit the vectorizer once and keep the TF-IDF document-term matrix.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(sampleText)

# PCA needs dense input. toarray() yields a plain ndarray, whereas todense()
# yields np.matrix, which current scikit-learn estimators reject.
X = tfidf_matrix.toarray()

# Project the TF-IDF features onto the two principal components.
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)

for num_clusters in range(2, 6):
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(data2D) + (num_clusters + 1) * 10])

    km = KMeans(
        n_clusters=num_clusters,
        n_init=10,  # number of runs with different centroid seeds
        random_state=1,  # fixes the seed
    )
    cluster_labels = km.fit_predict(data2D)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(data2D, cluster_labels)