优化以找到最小距离
Optimise to find minimum distance
我希望能够将 n 个点的数据集划分为可变数量的簇 (2-10),以便点之间的距离在约束条件下最小,例如每个点只有 5 个点簇。我不需要使用整个数据集,目前正在使用 or-tools。
这是我现在使用 or-tools 的结果:
# Create data
Point = collections.namedtuple("Point", ["x", "y"])
################################
points = [Point(1,2), Point(1,2), ..., ..., ...]
################################
# Create my matrix of all points in each cluster
variables = {(i, j): solver.BoolVar("") for i in range(len(points)) for j in range(maxClusters)}
for i in range(len(points)):
solver.Maximize(***Not sure what to put here***)
# Max number of blue points in each cluster
for j in range(maxClusters):
solver.Add(sum(variables[i, j] for i in range(len(points))) <= maxBlue)
# Minimum number of blue points in each cluster
for j in range(maxClusters):
solver.Add(sum(variables[i, j] for i in range(len(points))) >= minBlue)
# Solve
status = solver.Solve()
我的问题是我不知道该写什么作为 objective 函数来最小化每个簇的直径(以确保该簇中的点靠近在一起)并最大化每个簇中包含的点数簇。
我认为这个问题有两个独立的部分:为聚类问题选择一个 objective 函数;并使用 OR-Tools 实现它。
关于 objective 功能:您的选择将对您获得的解决方案类型和您选择的权衡产生重大影响,因此最终取决于您要解决的问题。话虽如此,一种可能的方法是尝试直接优化集群质量的测量。例如:最大化Dunn Index,即最小簇间距离和最大簇内距离之间的比率。
这是一个使用 OR-Tools CP-SAT 求解器的实现:
from ortools.sat.python import cp_model
model = cp_model.CpModel()
import collections
import math
# auclidean distance
def int_ad(p1, p2):
return int(math.sqrt((p1.x-p2.x) ** 2 + (p1.y-p2.y) ** 2))
Point = collections.namedtuple("Point", ["x", "y"])
### DATA
points = [Point(1,2), Point(1,3), Point(2,2), Point(4,3), Point(4,4), Point(6,4)]
distances = [[int_ad(p1, p2) for p1 in points] for p2 in points]
### Parameters
maxClusters = 3
max_in_cluster = 5
# max distance between points
DIST_BOUND = max([i for lst in distances for i in lst]) + 1
### variables
# p,c is true iff point p is a member of cluster c
cluster_points = {(p, c): model.NewBoolVar("cluster_point_%i_%i" % (p, c)) for p in range(len(points)) for c in range(maxClusters)}
#are each 2 points on the same cluster, c
same_cluster_points_helper = {(p1, p2, c): model.NewBoolVar('same_cluster_points_helper_%i_%i_%i' % (p1, p2, c))
for p1 in range(len(points))
for p2 in range(len(points))
for c in range(maxClusters)}
# are each 2 points on the same cluster (any cluster)
same_cluster_points = {(p1, p2): model.NewBoolVar('same_cluster_points_%i_%i' % (p1, p2))
for p1 in range(len(points))
for p2 in range(len(points))}
# distance between p1 and p2 if they are not in the same cluster, 0 if they do
inter_cluster_dist = {(p1, p2): model.NewIntVar(0, DIST_BOUND, 'inter_cluster_%i_%i' % (p1, p2))
for p1 in range(len(points))
for p2 in range(len(points))}
# distance between p1 and p2 if they are in the same cluster, MAX_DIST if not
intra_cluster_dist = {(p1, p2): model.NewIntVar(0, DIST_BOUND, 'intra_cluster_%i_%i' % (p1, p2))
for p1 in range(len(points))
for p2 in range(len(points))}
### Constraints
# Max number of points in each cluster
for c in range(maxClusters):
model.Add(sum(cluster_points[p, c] for p in range(len(points))) <= max_in_cluster)
#each point must be at a single cluster
for p in range(len(points)):
model.Add(sum(cluster_points[p, c] for c in range(maxClusters)) == 1)
# p1 and p2 are in the same cluster, if they are together in any scpecific cluster c
for p1 in range(len(points)):
for p2 in range(len(points)):
model.AddMaxEquality(same_cluster_points[p1, p2], (same_cluster_points_helper[p1, p2, c] for c in range(maxClusters)))
# calculate inter- and intra-distance between points
for c in range(maxClusters):
for p1 in range(len(points)):
for p2 in range(len(points)):
# p1 and p2 are in cluster c?
model.AddMultiplicationEquality(same_cluster_points_helper[p1, p2, c], (cluster_points[p1,c], cluster_points[p2,c]))
model.Add(inter_cluster_dist[p1, p2] == distances[p1][p2]).OnlyEnforceIf(same_cluster_points[p1, p2].Not())
model.Add(inter_cluster_dist[p1, p2] == DIST_BOUND).OnlyEnforceIf(same_cluster_points[p1, p2])
model.Add(intra_cluster_dist[p1, p2] == distances[p1][p2]).OnlyEnforceIf(same_cluster_points[p1, p2])
model.Add(intra_cluster_dist[p1, p2] == 0).OnlyEnforceIf(same_cluster_points[p1, p2].Not())
# prevent a cluster from having a single element only (it'll cause issues with the Dunn formula)
for c in range(maxClusters):
model.Add(sum(cluster_points[p, c] for p in range(len(points))) != 1)
min_inter = model.NewIntVar(0, DIST_BOUND, 'min_inter')
max_intra = model.NewIntVar(0, DIST_BOUND, 'max_intra')
# since we're going to use integer division, make sure we don't get rounded to zero
scale = 100
min_inter_scaled = model.NewIntVar(0, DIST_BOUND * scale , 'min_inter_scaled')
model.AddMinEquality(min_inter, [inter_cluster_dist[p1, p2] for p1 in range(len(points)) for p2 in range(len(points)) if p1 != p2])
model.AddMaxEquality(max_intra, [intra_cluster_dist[p1, p2] for p1 in range(len(points)) for p2 in range(len(points))] + [1])
model.AddMultiplicationEquality(min_inter_scaled, (min_inter, scale))
# score
score = model.NewIntVar(0, DIST_BOUND * scale, 'score')
model.AddDivisionEquality(score, min_inter_scaled, max_intra)
model.Maximize(score)
### Solve
solver = cp_model.CpSolver()
# optional
# solver.parameters.log_search_progress = True
# solver.parameters.num_search_workers = 8
# solver.parameters.max_time_in_seconds = 5
result_status = solver.Solve(model)
if (result_status == cp_model.INFEASIBLE):
print('No feasible solution under constraints')
elif (result_status == cp_model.OPTIMAL):
print('Optimal result found, score=%i' % (solver.ObjectiveValue()))
elif (result_status == cp_model.FEASIBLE):
print('Feasible (non optimal) result found')
else:
print('No feasible solution found under constraints within time')
for c in range(maxClusters):
for p in range(len(points)):
if (solver.Value(cluster_points[p, c]) > 0):
print('%i in cluster %i' % (p, c))
生成输出:
Optimal result found, score=100
0 in cluster 1
1 in cluster 1
2 in cluster 1
3 in cluster 2
4 in cluster 2
5 in cluster 2
如果您认为您的问题是距离特定的,因为您确保集群中各点之间的距离最小,那么您应该 KMeans
:
from sklearn.cluster import KMeans
res = KMeans(n_clusters = 2).fit(points)
res.labels_
array([0, 0, 0, 1, 1, 1]) # also [1,1,1,0,0,0] is acceptable since it is a label for a cluster
我希望能够将 n 个点的数据集划分为可变数量的簇 (2-10),以便点之间的距离在约束条件下最小,例如每个点只有 5 个点簇。我不需要使用整个数据集,目前正在使用 or-tools。
这是我现在使用 or-tools 的结果:
# Create data
Point = collections.namedtuple("Point", ["x", "y"])
################################
points = [Point(1,2), Point(1,2), ..., ..., ...]
################################
# Create my matrix of all points in each cluster
variables = {(i, j): solver.BoolVar("") for i in range(len(points)) for j in range(maxClusters)}
for i in range(len(points)):
solver.Maximize(***Not sure what to put here***)
# Max number of blue points in each cluster
for j in range(maxClusters):
solver.Add(sum(variables[i, j] for i in range(len(points))) <= maxBlue)
# Minimum number of blue points in each cluster
for j in range(maxClusters):
solver.Add(sum(variables[i, j] for i in range(len(points))) >= minBlue)
# Solve
status = solver.Solve()
我的问题是我不知道该写什么作为 objective 函数来最小化每个簇的直径(以确保该簇中的点靠近在一起)并最大化每个簇中包含的点数簇。
我认为这个问题有两个独立的部分:为聚类问题选择一个 objective 函数;并使用 OR-Tools 实现它。
关于 objective 功能:您的选择将对您获得的解决方案类型和您选择的权衡产生重大影响,因此最终取决于您要解决的问题。话虽如此,一种可能的方法是尝试直接优化集群质量的测量。例如:最大化Dunn Index,即最小簇间距离和最大簇内距离之间的比率。
这是一个使用 OR-Tools CP-SAT 求解器的实现:
from ortools.sat.python import cp_model
model = cp_model.CpModel()
import collections
import math
# auclidean distance
def int_ad(p1, p2):
return int(math.sqrt((p1.x-p2.x) ** 2 + (p1.y-p2.y) ** 2))
Point = collections.namedtuple("Point", ["x", "y"])
### DATA
points = [Point(1,2), Point(1,3), Point(2,2), Point(4,3), Point(4,4), Point(6,4)]
distances = [[int_ad(p1, p2) for p1 in points] for p2 in points]
### Parameters
maxClusters = 3
max_in_cluster = 5
# max distance between points
DIST_BOUND = max([i for lst in distances for i in lst]) + 1
### variables
# p,c is true iff point p is a member of cluster c
cluster_points = {(p, c): model.NewBoolVar("cluster_point_%i_%i" % (p, c)) for p in range(len(points)) for c in range(maxClusters)}
#are each 2 points on the same cluster, c
same_cluster_points_helper = {(p1, p2, c): model.NewBoolVar('same_cluster_points_helper_%i_%i_%i' % (p1, p2, c))
for p1 in range(len(points))
for p2 in range(len(points))
for c in range(maxClusters)}
# are each 2 points on the same cluster (any cluster)
same_cluster_points = {(p1, p2): model.NewBoolVar('same_cluster_points_%i_%i' % (p1, p2))
for p1 in range(len(points))
for p2 in range(len(points))}
# distance between p1 and p2 if they are not in the same cluster, 0 if they do
inter_cluster_dist = {(p1, p2): model.NewIntVar(0, DIST_BOUND, 'inter_cluster_%i_%i' % (p1, p2))
for p1 in range(len(points))
for p2 in range(len(points))}
# distance between p1 and p2 if they are in the same cluster, MAX_DIST if not
intra_cluster_dist = {(p1, p2): model.NewIntVar(0, DIST_BOUND, 'intra_cluster_%i_%i' % (p1, p2))
for p1 in range(len(points))
for p2 in range(len(points))}
### Constraints
# Max number of points in each cluster
for c in range(maxClusters):
model.Add(sum(cluster_points[p, c] for p in range(len(points))) <= max_in_cluster)
#each point must be at a single cluster
for p in range(len(points)):
model.Add(sum(cluster_points[p, c] for c in range(maxClusters)) == 1)
# p1 and p2 are in the same cluster, if they are together in any scpecific cluster c
for p1 in range(len(points)):
for p2 in range(len(points)):
model.AddMaxEquality(same_cluster_points[p1, p2], (same_cluster_points_helper[p1, p2, c] for c in range(maxClusters)))
# calculate inter- and intra-distance between points
for c in range(maxClusters):
for p1 in range(len(points)):
for p2 in range(len(points)):
# p1 and p2 are in cluster c?
model.AddMultiplicationEquality(same_cluster_points_helper[p1, p2, c], (cluster_points[p1,c], cluster_points[p2,c]))
model.Add(inter_cluster_dist[p1, p2] == distances[p1][p2]).OnlyEnforceIf(same_cluster_points[p1, p2].Not())
model.Add(inter_cluster_dist[p1, p2] == DIST_BOUND).OnlyEnforceIf(same_cluster_points[p1, p2])
model.Add(intra_cluster_dist[p1, p2] == distances[p1][p2]).OnlyEnforceIf(same_cluster_points[p1, p2])
model.Add(intra_cluster_dist[p1, p2] == 0).OnlyEnforceIf(same_cluster_points[p1, p2].Not())
# prevent a cluster from having a single element only (it'll cause issues with the Dunn formula)
for c in range(maxClusters):
model.Add(sum(cluster_points[p, c] for p in range(len(points))) != 1)
min_inter = model.NewIntVar(0, DIST_BOUND, 'min_inter')
max_intra = model.NewIntVar(0, DIST_BOUND, 'max_intra')
# since we're going to use integer division, make sure we don't get rounded to zero
scale = 100
min_inter_scaled = model.NewIntVar(0, DIST_BOUND * scale , 'min_inter_scaled')
model.AddMinEquality(min_inter, [inter_cluster_dist[p1, p2] for p1 in range(len(points)) for p2 in range(len(points)) if p1 != p2])
model.AddMaxEquality(max_intra, [intra_cluster_dist[p1, p2] for p1 in range(len(points)) for p2 in range(len(points))] + [1])
model.AddMultiplicationEquality(min_inter_scaled, (min_inter, scale))
# score
score = model.NewIntVar(0, DIST_BOUND * scale, 'score')
model.AddDivisionEquality(score, min_inter_scaled, max_intra)
model.Maximize(score)
### Solve
solver = cp_model.CpSolver()
# optional
# solver.parameters.log_search_progress = True
# solver.parameters.num_search_workers = 8
# solver.parameters.max_time_in_seconds = 5
result_status = solver.Solve(model)
if (result_status == cp_model.INFEASIBLE):
print('No feasible solution under constraints')
elif (result_status == cp_model.OPTIMAL):
print('Optimal result found, score=%i' % (solver.ObjectiveValue()))
elif (result_status == cp_model.FEASIBLE):
print('Feasible (non optimal) result found')
else:
print('No feasible solution found under constraints within time')
for c in range(maxClusters):
for p in range(len(points)):
if (solver.Value(cluster_points[p, c]) > 0):
print('%i in cluster %i' % (p, c))
生成输出:
Optimal result found, score=100
0 in cluster 1
1 in cluster 1
2 in cluster 1
3 in cluster 2
4 in cluster 2
5 in cluster 2
如果您认为您的问题是距离特定的,因为您确保集群中各点之间的距离最小,那么您应该 KMeans
:
from sklearn.cluster import KMeans
res = KMeans(n_clusters = 2).fit(points)
res.labels_
array([0, 0, 0, 1, 1, 1]) # also [1,1,1,0,0,0] is acceptable since it is a label for a cluster