Python:元组在尝试将它们添加到不同列表时重复
Python: Tuples getting duplicated when attempting to add them to different lists
我对一些代码对一组点进行聚类所遇到的问题感到非常困惑。
我正在使用一种算法来播种一定数量的集群(例如 3 个),然后向每个集群添加点。然而,在玩了一会儿之后,我注意到每个单独的点都被添加到每个单独的集群中。我多次检查代码,我很确定我没有忘记从原始列表中删除内容,但我强烈怀疑通过引用传递存在问题。我对 Python 有点陌生,我的经验主要是 Java,所以我有一种预感,我被 Python 处理参数的方式绊倒了。
import random
import math
class Cluster(object):
points = []
centroid = ()
dimensions = 0
color = 'k'
def __init__(self, init_pt, color='k'):
self.points.append(init_pt)
self.dimensions = len(init_pt)
self.centroid = init_pt
self.color = color
def addPoint(self, pt):
try:
if len(pt) != self.dimensions:
raise ArithmeticError("Wrong number of dimensions on new point, ignoring")
else:
centroid_dim_list = []
for dim in range(0, self.dimensions):
centroid_dim_list.append((self.centroid[dim] * len(self.points) + pt[dim]) / float(len(self.points) + 1))
self.centroid = tuple(centroid_dim_list)
self.points.append(pt)
except ArithmeticError as ae:
print ae.message
pass
class KMeans(object):
clusters = []
unassignedPoints = []
dimensions = 0
k = 0
def __init__(self, _k, _points):
if _k > 0:
self.k = _k
self.unassignedPoints = _points
self.dimensions = len(_points[0])
else:
raise Exception()
def runKMeans(self):
self.initializeClusters()
while len(self.unassignedPoints) > 0:
pt = self.unassignedPoints.pop()
dist_to_cent = float("+inf")
closest_cluster = ()
for cluster in self.clusters:
new_dist = self.getDistance(pt, cluster.centroid)
if new_dist < dist_to_cent:
closest_cluster = cluster
dist_to_cent = new_dist
closest_cluster.addPoint(pt)
def initializeClusters(self):
cluster_seeds = []
new_point = self.getRandomPoint()
print "New cluster seed: " + str(new_point)
self.clusters.append(Cluster(new_point))
cluster_seeds.append(new_point)
self.unassignedPoints.remove(new_point)
for i in range(1, self.k):
farthest_point = ()
farthest_dist = 0
for pt1 in self.unassignedPoints:
for pt2 in cluster_seeds:
curr_dist = self.getDistance(pt1, pt2)
if curr_dist > farthest_dist:
farthest_dist = curr_dist
farthest_point = pt1
self.unassignedPoints.remove(farthest_point)
self.clusters.append(Cluster(farthest_point))
print "New cluster seed: " + str(farthest_point)
cluster_seeds.append(farthest_point)
def getRandomPoint(self):
return self.unassignedPoints[int(math.floor(len(self.unassignedPoints) * random.random()))]
def getDistance(self, pt1, pt2):
return math.sqrt(sum([math.pow(pt1[dim] - pt2[dim], 2) for dim in range(0, self.dimensions)]))
kmeans = KMeans(3, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5,5)])
kmeans.runKMeans()
for cluster in kmeans.clusters:
for pt in cluster.points:
print "Assigned: " + str(pt)
for pt in kmeans.unassignedPoints:
print "Unassigned: " + str(pt)
我在这里错过了什么?
您的问题可能是由于:
class Cluster(object):
points = []
考虑将 points
的初始化移动到您的 __init__
方法中:
class Cluster(object):
def __init__(self, init_pt, color='k'):
self.points = [] # <----
self.points.append(init_pt)
self.dimensions = len(init_pt)
self.centroid = init_pt
self.color = color
我对一些代码对一组点进行聚类所遇到的问题感到非常困惑。
我正在使用一种算法来播种一定数量的集群(例如 3 个),然后向每个集群添加点。然而,在玩了一会儿之后,我注意到每个单独的点都被添加到每个单独的集群中。我多次检查代码,我很确定我没有忘记从原始列表中删除内容,但我强烈怀疑通过引用传递存在问题。我对 Python 有点陌生,我的经验主要是 Java,所以我有一种预感,我被 Python 处理参数的方式绊倒了。
import random
import math
class Cluster(object):
points = []
centroid = ()
dimensions = 0
color = 'k'
def __init__(self, init_pt, color='k'):
self.points.append(init_pt)
self.dimensions = len(init_pt)
self.centroid = init_pt
self.color = color
def addPoint(self, pt):
try:
if len(pt) != self.dimensions:
raise ArithmeticError("Wrong number of dimensions on new point, ignoring")
else:
centroid_dim_list = []
for dim in range(0, self.dimensions):
centroid_dim_list.append((self.centroid[dim] * len(self.points) + pt[dim]) / float(len(self.points) + 1))
self.centroid = tuple(centroid_dim_list)
self.points.append(pt)
except ArithmeticError as ae:
print ae.message
pass
class KMeans(object):
clusters = []
unassignedPoints = []
dimensions = 0
k = 0
def __init__(self, _k, _points):
if _k > 0:
self.k = _k
self.unassignedPoints = _points
self.dimensions = len(_points[0])
else:
raise Exception()
def runKMeans(self):
self.initializeClusters()
while len(self.unassignedPoints) > 0:
pt = self.unassignedPoints.pop()
dist_to_cent = float("+inf")
closest_cluster = ()
for cluster in self.clusters:
new_dist = self.getDistance(pt, cluster.centroid)
if new_dist < dist_to_cent:
closest_cluster = cluster
dist_to_cent = new_dist
closest_cluster.addPoint(pt)
def initializeClusters(self):
cluster_seeds = []
new_point = self.getRandomPoint()
print "New cluster seed: " + str(new_point)
self.clusters.append(Cluster(new_point))
cluster_seeds.append(new_point)
self.unassignedPoints.remove(new_point)
for i in range(1, self.k):
farthest_point = ()
farthest_dist = 0
for pt1 in self.unassignedPoints:
for pt2 in cluster_seeds:
curr_dist = self.getDistance(pt1, pt2)
if curr_dist > farthest_dist:
farthest_dist = curr_dist
farthest_point = pt1
self.unassignedPoints.remove(farthest_point)
self.clusters.append(Cluster(farthest_point))
print "New cluster seed: " + str(farthest_point)
cluster_seeds.append(farthest_point)
def getRandomPoint(self):
return self.unassignedPoints[int(math.floor(len(self.unassignedPoints) * random.random()))]
def getDistance(self, pt1, pt2):
return math.sqrt(sum([math.pow(pt1[dim] - pt2[dim], 2) for dim in range(0, self.dimensions)]))
kmeans = KMeans(3, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5,5)])
kmeans.runKMeans()
for cluster in kmeans.clusters:
for pt in cluster.points:
print "Assigned: " + str(pt)
for pt in kmeans.unassignedPoints:
print "Unassigned: " + str(pt)
我在这里错过了什么?
您的问题可能是由于:
class Cluster(object):
points = []
考虑将 points
的初始化移动到您的 __init__
方法中:
class Cluster(object):
def __init__(self, init_pt, color='k'):
self.points = [] # <----
self.points.append(init_pt)
self.dimensions = len(init_pt)
self.centroid = init_pt
self.color = color