Python:尝试将元组添加到不同的列表时,元组被重复添加

Python: Tuples getting duplicated when attempting to add them to different lists

我有一段对一组点进行聚类的代码,它出现的问题让我非常困惑。

我正在使用一种算法:先为一定数量的簇(例如 3 个)选取种子点,然后把其余的点逐个添加到各个簇中。然而,调试了一会儿之后,我注意到每一个点都被添加到了每一个簇中。我多次检查了代码,很确定自己没有忘记从原始列表中删除元素,但我强烈怀疑问题出在按引用传递上。我刚接触 Python 不久,以前主要使用 Java,所以我有一种预感:我是被 Python 处理参数的方式绊倒了。

import random
import math

class Cluster(object):
    """A group of points together with a running centroid.

    NOTE(review): ``points`` is declared on the class and no method ever
    assigns ``self.points``; ``__init__`` and ``addPoint`` only call
    ``self.points.append(...)``. Every append therefore lands in the single
    list object owned by the class, so all ``Cluster`` instances see the
    same points.
    """
    # Class-level list, only ever mutated in place -> shared by all instances.
    points = []
    # Class-level defaults; __init__ rebinds each of these on the instance.
    centroid = ()
    dimensions = 0
    color = 'k'

    def __init__(self, init_pt, color='k'):
        """Seed the cluster with ``init_pt``.

        ``init_pt`` becomes the initial centroid and its length fixes the
        number of dimensions accepted by ``addPoint``. ``color`` is stored
        unchanged on the instance.
        """
        # Appends to the class-level list (no instance attribute exists yet).
        self.points.append(init_pt)
        self.dimensions = len(init_pt)
        self.centroid = init_pt
        self.color = color

    def addPoint(self, pt):
        """Add ``pt`` to the cluster and update the centroid.

        A point whose length differs from ``self.dimensions`` is not added:
        the ArithmeticError raised for it is caught right here and only its
        message is printed.
        """
        try:
            if len(pt) != self.dimensions:
                raise ArithmeticError("Wrong number of dimensions on new point, ignoring")
            else:
                centroid_dim_list = []
                for dim in range(0, self.dimensions):
                    # Incremental mean per coordinate: old centroid weighted by
                    # the current length of self.points, plus the new value.
                    # Because self.points is the shared class-level list, that
                    # length counts points appended through every Cluster.
                    centroid_dim_list.append((self.centroid[dim] * len(self.points) + pt[dim]) / float(len(self.points) + 1))
                self.centroid = tuple(centroid_dim_list)
                self.points.append(pt)
        except ArithmeticError as ae:
            # Python 2 print statement; the bad point is dropped.
            print ae.message
            pass

class KMeans(object):
    """Partition a list of point tuples into ``k`` clusters in a single pass.

    Seeds are chosen by ``initializeClusters``; every remaining point is then
    handed to the cluster whose current centroid is nearest (Euclidean
    distance over the first ``dimensions`` coordinates).
    """
    # Immutable class-level defaults only. The mutable containers
    # (``clusters`` and ``unassignedPoints``) are created per instance in
    # __init__ so that separate KMeans objects never share state.
    dimensions = 0
    k = 0

    def __init__(self, _k, _points):
        """Store the cluster count and a private copy of the points.

        _k      -- number of clusters to build; must be greater than zero.
        _points -- sequence of equal-length point tuples. It is copied, so
                   the caller's sequence is not modified by the run.

        Raises ValueError (an Exception subclass) when ``_k`` is not positive
        and IndexError when ``_points`` is empty.
        """
        if _k <= 0:
            raise ValueError("k must be greater than zero, got %r" % (_k,))
        self.k = _k
        self.clusters = []
        # Copy: runKMeans/initializeClusters pop and remove from this list.
        self.unassignedPoints = list(_points)
        self.dimensions = len(self.unassignedPoints[0])

    def runKMeans(self):
        """Seed the clusters, then assign every remaining point.

        Points are taken from the end of ``unassignedPoints`` until the list
        is empty; each goes to the cluster with the nearest centroid (the
        first such cluster on ties).
        """
        self.initializeClusters()
        while self.unassignedPoints:
            pt = self.unassignedPoints.pop()
            closest_cluster = min(
                self.clusters,
                key=lambda cluster: self.getDistance(pt, cluster.centroid))
            closest_cluster.addPoint(pt)

    def initializeClusters(self):
        """Create ``k`` clusters, each seeded with one point.

        The first seed is drawn at random. Each further seed is the
        unassigned point with the largest distance to any already chosen
        seed (the first such point on ties). Seeds are removed from
        ``unassignedPoints``.

        Raises ValueError when the points run out before ``k`` seeds exist.
        """
        cluster_seeds = []
        new_point = self.getRandomPoint()
        print("New cluster seed: " + str(new_point))
        self.clusters.append(Cluster(new_point))
        cluster_seeds.append(new_point)
        self.unassignedPoints.remove(new_point)
        for _ in range(1, self.k):
            if not self.unassignedPoints:
                raise ValueError(
                    "Not enough points to seed %d clusters" % self.k)
            # max() keeps the first maximum and accepts a distance of zero,
            # so a point coinciding with a seed can still become a seed
            # instead of leaving no candidate selected.
            farthest_point = max(
                self.unassignedPoints,
                key=lambda pt1: max(self.getDistance(pt1, pt2)
                                    for pt2 in cluster_seeds))
            self.unassignedPoints.remove(farthest_point)
            self.clusters.append(Cluster(farthest_point))
            print("New cluster seed: " + str(farthest_point))
            cluster_seeds.append(farthest_point)

    def getRandomPoint(self):
        """Return a randomly chosen element of ``unassignedPoints``.

        Raises IndexError when the list is empty.
        """
        return random.choice(self.unassignedPoints)

    def getDistance(self, pt1, pt2):
        """Return the Euclidean distance between ``pt1`` and ``pt2``.

        Only the first ``self.dimensions`` coordinates are compared.
        """
        return math.sqrt(sum(math.pow(pt1[dim] - pt2[dim], 2)
                             for dim in range(self.dimensions)))

# Demo run: cluster six points on the diagonal into three groups, then
# report where every point ended up.
kmeans = KMeans(3, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)])
kmeans.runKMeans()

# One line per point stored in each cluster.
for cluster in kmeans.clusters:
    assigned_lines = ["Assigned: " + str(pt) for pt in cluster.points]
    for line in assigned_lines:
        print(line)

# Anything still left in the work list after the run.
unassigned_lines = ["Unassigned: " + str(pt) for pt in kmeans.unassignedPoints]
for line in unassigned_lines:
    print(line)

我在这里错过了什么?

您的问题很可能出在下面这一行:points 被定义为类属性,而不是实例属性,因此所有 Cluster 实例共享同一个列表,向任意一个簇添加的点都会出现在每一个簇中:

class Cluster(object):
    points = []  # class attribute: one list object shared by every Cluster instance

考虑将 points 的初始化移动到您的 __init__ 方法中:

class Cluster(object):
    """Cluster whose point list belongs to the instance, not the class."""

    def __init__(self, init_pt, color='k'):
        """Start the cluster from ``init_pt``.

        The seed point is the first stored point and the initial centroid;
        its length sets ``dimensions``. ``color`` is kept as given.
        """
        self.points = [init_pt]     # <---- fresh list for each instance
        self.centroid = init_pt
        self.dimensions = len(init_pt)
        self.color = color