MemoryError: is it just because of RAM or the nested for loops?

I am doing clustering research with DBSCAN on the NSL-KDD data in Python. When I try to run the program limited to 10,000 rows it raises a MemoryError, and when I run it on all the data (NSL-KDD has 125,973 rows and 41 columns) it says the maximum dimension has been reached. Is this just a matter of my computer's specs (I have 8 GB of RAM) or a problem with the code? How can I fix it? Finally, how do I update the clustering result of each row so it is saved back into MySQL? I am new to Python, so I apologize if this is a silly question.

import numpy
import scipy.spatial.distance
import pymysql
import matplotlib.pyplot as plt


def set2List(NumpyArray):
    # Convert a numpy array of indices into a plain Python list.
    result = []
    for item in NumpyArray:
        result.append(item.tolist())
    return result

def GenerateData():
    # Load the (limited) NSL-KDD rows from MySQL into a numpy array.
    mydb = pymysql.connect(
        host="localhost", user="root", password="", database="ta")
    mycursor = mydb.cursor()
    mycursor.execute("SELECT * FROM data_trans LIMIT 10000")
    myresult = mycursor.fetchall()
    final_result = numpy.array(myresult)
    return final_result

def DBSCAN(Dataset, Epsilon, MinumumPoints, DistanceMethod='euclidean'):
    # Naive DBSCAN: precomputes the full pairwise distance matrix,
    # which needs O(m^2) memory -- this is where the MemoryError comes from.
    m, n = Dataset.shape
    Visited = numpy.zeros(m, 'int')
    Type = numpy.zeros(m)

    ClustersList = []
    Cluster = []
    PointClusterNumber = numpy.zeros(m)
    PointClusterNumberIndex = 1
    PointNeighbors = []
    DistanceMatrix = scipy.spatial.distance.squareform(
        scipy.spatial.distance.pdist(Dataset, DistanceMethod))
    for i in xrange(m):
        if Visited[i] == 0:
            Visited[i] = 1
            PointNeighbors = numpy.where(DistanceMatrix[i] < Epsilon)[0]
            if len(PointNeighbors) < MinumumPoints:
                # Too few neighbours: mark the point as noise for now.
                Type[i] = -1
            else:
                # Start a new cluster seeded at point i.
                for k in xrange(len(Cluster)):
                    Cluster.pop()
                Cluster.append(i)
                PointClusterNumber[i] = PointClusterNumberIndex

                PointNeighbors = set2List(PointNeighbors)
                ExpandClsuter(Dataset[i], PointNeighbors, Cluster, MinumumPoints,
                              Epsilon, Visited, DistanceMatrix,
                              PointClusterNumber, PointClusterNumberIndex)
                Cluster.append(PointNeighbors[:])
                ClustersList.append(Cluster[:])
                PointClusterNumberIndex = PointClusterNumberIndex + 1

    return PointClusterNumber

def ExpandClsuter(PointToExapnd, PointNeighbors, Cluster, MinumumPoints,
                  Epsilon, Visited, DistanceMatrix, PointClusterNumber,
                  PointClusterNumberIndex):
    # Grow the current cluster by visiting every neighbour of the seed point.
    Neighbors = []
    for i in PointNeighbors:
        if Visited[i] == 0:
            Visited[i] = 1
            Neighbors = numpy.where(DistanceMatrix[i] < Epsilon)[0]
            if len(Neighbors) >= MinumumPoints:
                # i is a core point: queue any of its neighbours not seen yet.
                for j in Neighbors:
                    try:
                        PointNeighbors.index(j)
                    except ValueError:
                        PointNeighbors.append(j)

            if PointClusterNumber[i] == 0:
                Cluster.append(i)
                PointClusterNumber[i] = PointClusterNumberIndex
    return

Data = GenerateData()

fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)  # row, column, figure number

ax1.scatter(Data[:, 0], Data[:, 1], alpha=0.5)

Epsilon = 300
MinumumPoints = 50
result = DBSCAN(Data, Epsilon, MinumumPoints)
print result
plt.show()

Error message:

Traceback (most recent call last):

File "<ipython-input-8-20458e6efb7c>", line 1, in <module>
runfile('C:/Users/Ji Min/Downloads/oprek.py', wdir='C:/Users/Ji Min/Downloads')

File "C:\Users\Ji Min\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)

File "C:\Users\Ji Min\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)

File "C:/Users/Ji Min/Downloads/oprek.py", line 95, in <module>
result =DBSCAN(Data,Epsilon,MinumumPoints)

File "C:/Users/Ji Min/Downloads/oprek.py", line 44, in DBSCAN
DistanceMatrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(Dataset, DistanceMethod))

File "C:\Users\Ji Min\Anaconda2\lib\site-packages\scipy\spatial\distance.py", line 1652, in pdist
dm = np.empty((m * (m - 1)) // 2, dtype=np.double)

MemoryError

The key point is to not compute the distance matrix at all.

The distance matrix needs far too much memory.
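
To see roughly why, here is a quick back-of-the-envelope check (my addition, not part of the original answer) of what the call in your traceback allocates: pdist builds a condensed vector of n*(n-1)/2 doubles, and squareform then builds the full n*n matrix on top of it.

n_limit = 10000            # the LIMIT 10000 run
n_full = 125973            # all NSL-KDD rows

for n in (n_limit, n_full):
    condensed_gb = n * (n - 1) // 2 * 8 / 1e9   # doubles allocated by pdist
    square_gb = n * n * 8 / 1e9                 # doubles allocated by squareform
    print("n=%d: condensed=%.1f GB, square=%.1f GB" % (n, condensed_gb, square_gb))

# n=10000:  condensed=0.4 GB, square=0.8 GB
# n=125973: condensed=63.5 GB, square=127.0 GB

The full dataset would need well over 100 GB, so it can never fit in 8 GB of RAM. The 10,000-row run is only about 1.2 GB in total, which would normally fit on a 64-bit build; if it still fails, your Anaconda2 install may be 32-bit (roughly a 2 GB per-process limit), but either way this approach does not scale.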

But that dataset is of little use anyway: the distances you are computing are not meaningful, so don't expect the clustering to get much better than this...
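
If you still want to cluster this data, a more memory-friendly route is scikit-learn's DBSCAN, which answers neighbourhood queries through a tree index instead of materialising the full pairwise matrix, and it gives you one label per row that you can write back to MySQL. The sketch below is only an outline under assumptions that are not in the question: it assumes data_trans has a numeric primary key (called id here), that the remaining columns are numeric features, and that you add a cluster_label column to store the result; both column names are hypothetical.

import numpy
import pymysql
from sklearn.cluster import DBSCAN   # tree-based neighbour search, no m x m matrix

mydb = pymysql.connect(host="localhost", user="root", password="", database="ta")
mycursor = mydb.cursor()
mycursor.execute("SELECT * FROM data_trans LIMIT 10000")
rows = numpy.array(mycursor.fetchall())

ids = rows[:, 0]                      # assumption: first column is the primary key
features = rows[:, 1:].astype(float)  # assumption: the other columns are numeric

# Neighbourhoods are found with a ball tree, so memory stays roughly linear in m.
# Noise points get the label -1.
labels = DBSCAN(eps=300, min_samples=50, metric='euclidean',
                algorithm='ball_tree').fit_predict(features)

# Write each row's cluster label back to MySQL.
# "cluster_label" is a hypothetical column you would have to add first.
mycursor.executemany(
    "UPDATE data_trans SET cluster_label = %s WHERE id = %s",
    [(int(label), int(row_id)) for label, row_id in zip(labels, ids)])
mydb.commit()

Even then, the point above still stands: scale or encode the 41 NSL-KDD attributes before clustering, otherwise the Euclidean distances (and hence eps=300) remain fairly arbitrary.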