优化以找到最小距离

Optimise to find minimum distance

我希望能够将 n 个点的数据集划分为可变数量的簇 (2-10),以便点之间的距离在约束条件下最小,例如每个点只有 5 个点簇。我不需要使用整个数据集,目前正在使用 or-tools。

这是我现在使用 or-tools 的结果:

# Create data
Point = collections.namedtuple("Point", ["x", "y"])
################################
points = [Point(1,2), Point(1,2), ..., ..., ...]
################################

# Create my matrix of all points in each cluster
variables = {(i, j): solver.BoolVar("") for i in range(len(points)) for j in range(maxClusters)}

for i in range(len(points)):
    solver.Maximize(***Not sure what to put here***)

# Max number of blue points in each cluster
for j in range(maxClusters):
    solver.Add(sum(variables[i, j] for i in range(len(points))) <= maxBlue)

# Minimum number of blue points in each cluster
for j in range(maxClusters):
    solver.Add(sum(variables[i, j] for i in range(len(points))) >= minBlue)

# Solve
status = solver.Solve()

我的问题是我不知道该写什么作为 objective 函数来最小化每个簇的直径(以确保该簇中的点靠近在一起)并最大化每个簇中包含的点数簇。

我认为这个问题有两个独立的部分:为聚类问题选择一个 objective 函数;并使用 OR-Tools 实现它。

关于 objective 功能:您的选择将对您获得的解决方案类型和您选择的权衡产生重大影响,因此最终取决于您要解决的问题。话虽如此,一种可能的方法是尝试直接优化集群质量的测量。例如:最大化Dunn Index,即最小簇间距离和最大簇内距离之间的比率。

这是一个使用 OR-Tools CP-SAT 求解器的实现:

from ortools.sat.python import cp_model
model = cp_model.CpModel()

import collections
import math

# auclidean distance
def int_ad(p1, p2):
    return int(math.sqrt((p1.x-p2.x) ** 2 + (p1.y-p2.y) ** 2))

Point = collections.namedtuple("Point", ["x", "y"])

### DATA
points = [Point(1,2), Point(1,3), Point(2,2), Point(4,3), Point(4,4), Point(6,4)]

distances = [[int_ad(p1, p2) for p1 in points] for p2 in points]

### Parameters
maxClusters = 3
max_in_cluster = 5

# max distance between points
DIST_BOUND = max([i for lst in distances for i in lst]) + 1

### variables

# p,c is true iff point p is a member of cluster c
cluster_points = {(p, c): model.NewBoolVar("cluster_point_%i_%i" % (p, c)) for p in range(len(points)) for c in range(maxClusters)}


#are each 2 points on the same cluster, c
same_cluster_points_helper = {(p1, p2, c): model.NewBoolVar('same_cluster_points_helper_%i_%i_%i' % (p1, p2, c)) 
                       for p1 in range(len(points)) 
                       for p2 in range(len(points))
                       for c in range(maxClusters)} 

# are each 2 points on the same cluster (any cluster)
same_cluster_points = {(p1, p2): model.NewBoolVar('same_cluster_points_%i_%i' % (p1, p2)) 
                       for p1 in range(len(points)) 
                       for p2 in range(len(points))} 


# distance between p1 and p2 if they are not in the same cluster, 0 if they do
inter_cluster_dist = {(p1, p2): model.NewIntVar(0, DIST_BOUND, 'inter_cluster_%i_%i' % (p1, p2)) 
                      for p1 in range(len(points)) 
                      for p2 in range(len(points))}


# distance between p1 and p2 if they are in the same cluster, MAX_DIST if not
intra_cluster_dist = {(p1, p2): model.NewIntVar(0, DIST_BOUND, 'intra_cluster_%i_%i' % (p1, p2)) 
                      for p1 in range(len(points)) 
                      for p2 in range(len(points))}
                      

### Constraints
    
# Max number of points in each cluster
for c in range(maxClusters):
    model.Add(sum(cluster_points[p, c] for p in range(len(points))) <= max_in_cluster)

#each point must be at a single cluster
for p in range(len(points)):
    model.Add(sum(cluster_points[p, c] for c in range(maxClusters)) == 1)

# p1 and p2 are in the same cluster, if they are together in any scpecific cluster c
for p1 in range(len(points)):
        for p2 in range(len(points)):
            model.AddMaxEquality(same_cluster_points[p1, p2], (same_cluster_points_helper[p1, p2, c] for c in range(maxClusters)))

# calculate inter- and intra-distance between points
for c in range(maxClusters):
    for p1 in range(len(points)):
        for p2 in range(len(points)):
            # p1 and p2 are in cluster c?            
            model.AddMultiplicationEquality(same_cluster_points_helper[p1, p2, c], (cluster_points[p1,c], cluster_points[p2,c]))
            
            model.Add(inter_cluster_dist[p1, p2] == distances[p1][p2]).OnlyEnforceIf(same_cluster_points[p1, p2].Not())
            model.Add(inter_cluster_dist[p1, p2] == DIST_BOUND).OnlyEnforceIf(same_cluster_points[p1, p2])
            
            model.Add(intra_cluster_dist[p1, p2] == distances[p1][p2]).OnlyEnforceIf(same_cluster_points[p1, p2])
            model.Add(intra_cluster_dist[p1, p2] == 0).OnlyEnforceIf(same_cluster_points[p1, p2].Not())
                      
# prevent a cluster from having a single element only (it'll cause issues with the Dunn formula)    
for c in range(maxClusters):
    model.Add(sum(cluster_points[p, c] for p in range(len(points))) != 1)

                                                                                     
min_inter = model.NewIntVar(0, DIST_BOUND, 'min_inter')
max_intra = model.NewIntVar(0, DIST_BOUND, 'max_intra')

# since we're going to use integer division, make sure we don't get rounded to zero
scale = 100
min_inter_scaled = model.NewIntVar(0, DIST_BOUND * scale , 'min_inter_scaled')

model.AddMinEquality(min_inter, [inter_cluster_dist[p1, p2] for p1 in range(len(points)) for p2 in range(len(points)) if p1 != p2])
model.AddMaxEquality(max_intra, [intra_cluster_dist[p1, p2] for p1 in range(len(points)) for p2 in range(len(points))] + [1])

model.AddMultiplicationEquality(min_inter_scaled, (min_inter, scale))

# score
score = model.NewIntVar(0, DIST_BOUND * scale, 'score')
model.AddDivisionEquality(score, min_inter_scaled, max_intra)                                                                                   
model.Maximize(score)


### Solve
solver = cp_model.CpSolver()

# optional
# solver.parameters.log_search_progress = True     
# solver.parameters.num_search_workers = 8
# solver.parameters.max_time_in_seconds = 5

result_status = solver.Solve(model)


if (result_status == cp_model.INFEASIBLE): 
    print('No feasible solution under constraints')
elif (result_status == cp_model.OPTIMAL):
    print('Optimal result found, score=%i' % (solver.ObjectiveValue()))
elif (result_status == cp_model.FEASIBLE):                        
    print('Feasible (non optimal) result found')
else:
    print('No feasible solution found under constraints within time')  
    
    
for c in range(maxClusters):
    for p in range(len(points)):
        if (solver.Value(cluster_points[p, c]) > 0):
            print('%i in cluster %i' % (p, c))
            

生成输出:

Optimal result found, score=100
0 in cluster 1
1 in cluster 1
2 in cluster 1
3 in cluster 2
4 in cluster 2
5 in cluster 2

    

如果您认为您的问题是距离特定的,因为您确保集群中各点之间的距离最小,那么您应该 KMeans:

from sklearn.cluster import KMeans

res = KMeans(n_clusters = 2).fit(points)
res.labels_
array([0, 0, 0, 1, 1, 1]) # also [1,1,1,0,0,0] is acceptable since it is a label for a cluster