迭代嵌套聚类

Iterative nested clustering

我有这样一个数据集:

n = 50
g = 6
set.seed(g)
d <- data.frame(x = unlist(lapply(1:g, function(i) rnorm(n/g, runif(1)*i^2))), 
            y = unlist(lapply(1:g, function(i) rnorm(n/g, runif(1)*i^2))))
plot(d)

我可以这样聚类:

library(cluster)
library(dplyr)

knb <- clusGap(d, kmeans, 10, B = 10, verbose = interactive())
k <- maxSE(knb$Tab[, "gap"], knb$Tab[, "SE.sim"], method="Tibs2001SEmax")
d_dist <- dist(as.matrix(d))   # find distance matrix 
plot(hclust(d_dist))
rect.hclust(hclust(d_dist), k=k)
j <- rect.hclust(hclust(d_dist), k=k)

集群组存储在j中,如:

j

[[1]]
 [1] 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
[[2]]
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32

如何在生成的两个集群上迭代重复该过程? (我不知道每一步会有多少簇,也不知道需要多少步才能达到clusGap函数给出的“无簇可能性”

非常感谢你的帮助,我被卡住了;)

这是我的问题的解决方案草案。它无疑coult/must得到改进。如果有谁能提出更好的工作流程,欢迎大家提出!

library(dplyr)
library(cluster)

n = 500
g = 30
set.seed(g)
d <- data.frame(x = unlist(lapply(1:g, function(i) rnorm(n/g, runif(1)*i^1.5))), 
                y = unlist(lapply(1:g, function(i) rnorm(n/g, runif(1)*i^1.5))))
rm(g,n)
plot(d)

flattenlist <- function(x){  
  morelists <- sapply(x, function(xprime) class(xprime)[1]=="list")
  out <- c(x[!morelists], unlist(x[morelists], recursive=FALSE))
  if(sum(morelists)){ 
    Recall(out)
  }else{
    return(out)
  }
}

clusters <- list()

for (zz in 1:2){
  if(exists("j")){
    for (i in 1:10000){
      if(iteration == 1){
        lg <- length(j)
        for (z in 1:lg){
          assign(paste0("subd"), d%>% filter(row.names(d) %in% j[[z]]))
          try(knb <- clusGap(subd, kmeans, 10, B = 10, verbose = interactive()))
          k <- maxSE(knb$Tab[, "gap"], knb$Tab[, "SE.sim"], method="Tibs2001SEmax")
          if(k>1){
          d_dist <- dist(as.matrix(subd))   # find distance matrix 
          plot(hclust(d_dist))
          rect.hclust(hclust(d_dist), k=k)
          assign(paste0("j", z), rect.hclust(hclust(d_dist), k=k))
          }else{
            clusters <- do.call(c, list(clusters, list(row.names(subd))))
          }
          if(z == lg){
            gset <- ls(pat = "j")
            gset <- gset[-1]
            j <- do.call(c, list(mget(gset)))
            j <- flattenlist(j)
          iteration <- iteration + 1
          rm(list=(gset))
          }
          rm(knb, subd, d_dist, k)
        }
      }else{
        for(f in 1:10000){
          #gset <- ls(pat = "j")
          #gset <- gset[-1]
          assign(paste0("subd"), d%>% filter(row.names(d) %in% j[[f]]))
          try(knb <- clusGap(subd, kmeans, 10, B = 10, verbose = interactive()))
          if (exists("knb")){
            k <- maxSE(knb$Tab[, "gap"], knb$Tab[, "SE.sim"], method="Tibs2001SEmax")
            if(k>1){
            d_dist <- dist(as.matrix(subd))   # find distance matrix 
            plot(hclust(d_dist))
            rect.hclust(hclust(d_dist), k=k)
            assign(paste0("j", f), rect.hclust(hclust(d_dist), k=k))
            tmp <- flattenlist(get(paste0("j", f)))
            j <- do.call(c, list(j, tmp))
            iteration <- iteration + 1
            }else{
              clusters <- do.call(c, list(clusters, list(row.names(subd))))
            }
          }else{
            clusters <- do.call(c, list(clusters, list(row.names(subd))))
          }
        }
      }
    }
  }else{
    knb <- clusGap(d, kmeans, 10, B = 10, verbose = interactive())
    k <- maxSE(knb$Tab[, "gap"], knb$Tab[, "SE.sim"], method="Tibs2001SEmax")
    d_dist <- dist(as.matrix(d))   # find distance matrix 
    plot(hclust(d_dist))
    rect.hclust(hclust(d_dist), k=k)
    j <- rect.hclust(hclust(d_dist), k=k)
    iteration <- c(1)
    rm(k, d_dist, knb)
  }
}

欢迎任何改进;)