在 python 中通过 DBSCAN 对空间(纬度、经度)数据进行聚类后,我可以获得聚类质心吗
Can I get Cluster Centroids after Clustering the Spatial (latitude, longitude) data by DBSCAN in python
DBSCAN 与假设簇应具有特定形状的 k-means 或 k-medoids 有很大不同。它假设聚类是一组彼此靠近的点,形成数据中点的密集邻域 space.
我可以计算每个簇的数据点的平均值以获得每个簇的质心。但这是一个正确的过程吗?因为质心的概念似乎仅适用于具有特定形状(如超球面)的簇。
请回答DBSCAN计算的簇计算质心是否正确?
试试这个:
# import necessary modules
import pandas as pd, numpy as np, matplotlib.pyplot as plt, time
from sklearn.cluster import DBSCAN
from sklearn import metrics
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
# define the number of kilometers in one radian
kms_per_radian = 6371.0088
# load the data set
df = pd.read_csv('C:\your_path\summer-travel-gps-full.csv', encoding = "ISO-8859-1")
df.head()
# how many rows are in this data set?
len(df)
# scatterplot it to get a sense of what it looks like
df = df.sort_values(by=['lat', 'lon'])
ax = df.plot(kind='scatter', x='lon', y='lat', alpha=0.5, linewidth=0)
# represent points consistently as (lat, lon)
# coords = df.as_matrix(columns=['lat', 'lon'])
df_coords = df[['lat', 'lon']]
# coords = df.to_numpy(df_coords)
# define epsilon as 10 kilometers, converted to radians for use by haversine
epsilon = 10 / kms_per_radian
start_time = time.time()
db = DBSCAN(eps=epsilon, min_samples=10, algorithm='ball_tree', metric='haversine').fit(np.radians(df_coords))
cluster_labels = db.labels_
unique_labels = set(cluster_labels)
# get the number of clusters
num_clusters = len(set(cluster_labels))
# get colors and plot all the points, color-coded by cluster (or gray if not in any cluster, aka noise)
fig, ax = plt.subplots()
colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))
# for each cluster label and color, plot the cluster's points
for cluster_label, color in zip(unique_labels, colors):
size = 150
if cluster_label == -1: #make the noise (which is labeled -1) appear as smaller gray points
color = 'gray'
size = 30
# plot the points that match the current cluster label
# X.iloc[:-1]
# df.iloc[:, 0]
x_coords = df_coords.iloc[:, 0]
y_coords = df_coords.iloc[:, 1]
ax.scatter(x=x_coords, y=y_coords, c=color, edgecolor='k', s=size, alpha=0.5)
ax.set_title('Number of clusters: {}'.format(num_clusters))
plt.show()
coefficient = metrics.silhouette_score(df_coords, cluster_labels)
print('Silhouette coefficient: {:0.03f}'.format(metrics.silhouette_score(df_coords, cluster_labels)))
# set eps low (1.5km) so clusters are only formed by very close points
epsilon = 1.5 / kms_per_radian
# set min_samples to 1 so we get no noise - every point will be in a cluster even if it's a cluster of 1
start_time = time.time()
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(df_coords))
cluster_labels = db.labels_
unique_labels = set(cluster_labels)
# get the number of clusters
num_clusters = len(set(cluster_labels))
# all done, print the outcome
message = 'Clustered {:,} points down to {:,} clusters, for {:.1f}% compression in {:,.2f} seconds'
print(message.format(len(df), num_clusters, 100*(1 - float(num_clusters) / len(df)), time.time()-start_time))
coefficient = metrics.silhouette_score(df_coords, cluster_labels)
print('Silhouette coefficient: {:0.03f}'.format(metrics.silhouette_score(df_coords, cluster_labels)))
# number of clusters, ignoring noise if present
num_clusters = len(set(cluster_labels)) #- (1 if -1 in labels else 0)
print('Number of clusters: {}'.format(num_clusters))
# create a series to contain the clusters - each element in the series is the points that compose each cluster
clusters = pd.Series([df_coords[cluster_labels == n] for n in range(num_clusters)])
clusters.tail()
结果:
133 lat lon
662 50.37369 18.889205
134 lat lon
561 50.448704 19.0...
135 lat lon
661 50.462271 19.0...
136 lat lon
559 50.489304 19.0...
137 lat lon
1 51.474005 -0.450999
您可以在此处获取源数据:
https://github.com/gboeing/2014-summer-travels/tree/master/data
DBSCAN 与假设簇应具有特定形状的 k-means 或 k-medoids 有很大不同。它假设聚类是一组彼此靠近的点,形成数据中点的密集邻域 space.
我可以计算每个簇的数据点的平均值以获得每个簇的质心。但这是一个正确的过程吗?因为质心的概念似乎仅适用于具有特定形状(如超球面)的簇。
请回答DBSCAN计算的簇计算质心是否正确?
试试这个:
# import necessary modules
import pandas as pd, numpy as np, matplotlib.pyplot as plt, time
from sklearn.cluster import DBSCAN
from sklearn import metrics
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
# define the number of kilometers in one radian
kms_per_radian = 6371.0088
# load the data set
df = pd.read_csv('C:\your_path\summer-travel-gps-full.csv', encoding = "ISO-8859-1")
df.head()
# how many rows are in this data set?
len(df)
# scatterplot it to get a sense of what it looks like
df = df.sort_values(by=['lat', 'lon'])
ax = df.plot(kind='scatter', x='lon', y='lat', alpha=0.5, linewidth=0)
# represent points consistently as (lat, lon)
# coords = df.as_matrix(columns=['lat', 'lon'])
df_coords = df[['lat', 'lon']]
# coords = df.to_numpy(df_coords)
# define epsilon as 10 kilometers, converted to radians for use by haversine
epsilon = 10 / kms_per_radian
start_time = time.time()
db = DBSCAN(eps=epsilon, min_samples=10, algorithm='ball_tree', metric='haversine').fit(np.radians(df_coords))
cluster_labels = db.labels_
unique_labels = set(cluster_labels)
# get the number of clusters
num_clusters = len(set(cluster_labels))
# get colors and plot all the points, color-coded by cluster (or gray if not in any cluster, aka noise)
fig, ax = plt.subplots()
colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))
# for each cluster label and color, plot the cluster's points
for cluster_label, color in zip(unique_labels, colors):
size = 150
if cluster_label == -1: #make the noise (which is labeled -1) appear as smaller gray points
color = 'gray'
size = 30
# plot the points that match the current cluster label
# X.iloc[:-1]
# df.iloc[:, 0]
x_coords = df_coords.iloc[:, 0]
y_coords = df_coords.iloc[:, 1]
ax.scatter(x=x_coords, y=y_coords, c=color, edgecolor='k', s=size, alpha=0.5)
ax.set_title('Number of clusters: {}'.format(num_clusters))
plt.show()
coefficient = metrics.silhouette_score(df_coords, cluster_labels)
print('Silhouette coefficient: {:0.03f}'.format(metrics.silhouette_score(df_coords, cluster_labels)))
# set eps low (1.5km) so clusters are only formed by very close points
epsilon = 1.5 / kms_per_radian
# set min_samples to 1 so we get no noise - every point will be in a cluster even if it's a cluster of 1
start_time = time.time()
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(df_coords))
cluster_labels = db.labels_
unique_labels = set(cluster_labels)
# get the number of clusters
num_clusters = len(set(cluster_labels))
# all done, print the outcome
message = 'Clustered {:,} points down to {:,} clusters, for {:.1f}% compression in {:,.2f} seconds'
print(message.format(len(df), num_clusters, 100*(1 - float(num_clusters) / len(df)), time.time()-start_time))
coefficient = metrics.silhouette_score(df_coords, cluster_labels)
print('Silhouette coefficient: {:0.03f}'.format(metrics.silhouette_score(df_coords, cluster_labels)))
# number of clusters, ignoring noise if present
num_clusters = len(set(cluster_labels)) #- (1 if -1 in labels else 0)
print('Number of clusters: {}'.format(num_clusters))
# create a series to contain the clusters - each element in the series is the points that compose each cluster
clusters = pd.Series([df_coords[cluster_labels == n] for n in range(num_clusters)])
clusters.tail()
结果:
133 lat lon
662 50.37369 18.889205
134 lat lon
561 50.448704 19.0...
135 lat lon
661 50.462271 19.0...
136 lat lon
559 50.489304 19.0...
137 lat lon
1 51.474005 -0.450999
您可以在此处获取源数据:
https://github.com/gboeing/2014-summer-travels/tree/master/data