如何取消嵌套并将复杂的嵌套列表重新组织为 运行 kmeans

How to unnest and reorganize a complex nested list to run kmeans

我有一个包含多个数据框列表的列表。 在 Rviewer 中,这是您所看到的示例:

image1

注意:数据帧 B-F 具有相同的命名变量。

我的问题是:

  1. 是否有一种简单的方法来转换信息,使这将是一个 table 以便:

A:F 将成为新变量(例如“alphabet”)下的字符值,并且所有嵌套变量将被合并,因此没有任何重复的变量名称?例如,列表 1 将被分解为: (img请点击link->)

image2

注意:所有的变量都会被填充,这里我只是留空了。

我正在尝试对 运行 kmeans 执行此操作,具体针对以下示例代码中的三个变量 bm1、bm2 和 ls。

  1. 在这样做之后,是否有一种简单的方法可以通过一些额外的变量(例如,簇)将其恢复到其原始结构?

这里是 dput(data) 示例代码:

list(A = structure(list(r = c(0, 0, 0, 0, 0, 0), x = c(4300, 
4800, 5300, 4300, 4800, 5300), y = c(4400, 4400, 4400, 4800, 
4800, 4800), fm1 = c(3800, 4400, 5000, 3600, 4200, 5200), fm2 = 
c(3900, 
4600, 5300, 3900, 4400, 5600), bm1 = c(400, 400, 400, 400, 400, 
400), bm2 = c(300, 300, 400, 300, 300, 400), ns = c(3600, 4200, 
4900, 3600, 4100, 5200), sn = c(0, 0, 0, 0, 0, 0), ls = c(0, 
0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), sln = c(0, 0, 0, 0, 
0, 0)), row.names = c(NA, 6L), class = "data.frame"), B = 
structure(list(
r = c(0, 0, 0, 0, 0, 0), x = c(4300, 4800, 5300, 4300, 4800, 
5300), y = c(4500, 4500, 4500, 4900, 4900, 4900), fm1 = c(1300, 
1400, 1500, 1100, 1200, 1200), fm2 = c(1400, 1500, 1500, 
1200, 1300, 1300), bm1 = c(100, 100, 100, 100, 100, 100), 
bm2 = c(100, 100, 100, 100, 100, 100), ns = c(1200, 1400, 
1400, 1100, 1100, 1200), sn = c(0, 0, 100, 100, 0, 100), 
ls = c(0, 0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), sln = c(0, 
0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame"), 
C = structure(list(r = c(0, 0, 0, 0, 0, 0), x = c(4300, 4800, 
5300, 4300, 4800, 5300), y = c(4400, 4400, 4400, 4800, 4800, 
4800), fm1 = c(4100, 4400, 4600, 3700, 4100, 3900), fm2 = c(4400, 
4600, 4900, 4000, 4400, 4300), bm1 = c(200, 200, 200, 200, 
200, 200), bm2 = c(200, 200, 200, 200, 200, 200), ns = c(4200, 
4500, 4700, 3800, 4200, 4100), sn = c(0, 100, 100, 0, 0, 
200), ls = c(0, 0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), 
sln = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = 
"data.frame"), 
D = structure(list(r = c(0, 0, 0, 0, 0, 0), x = c(4400, 4900, 
5400, 4400, 4900, 5400), y = c(4500, 4500, 4500, 4900, 4900, 
4900), fm1 = c(3000, 3200, 3300, 2500, 2600, 2600), fm2 = c(3400, 
3600, 3600, 2700, 2900, 2900), bm1 = c(300, 300, 300, 300, 
300, 200), bm2 = c(300, 200, 200, 200, 200, 200), ns = c(3100, 
3400, 3400, 2500, 2700, 2700), sn = c(0, 0, 0, 0, 0, 0), 
ls = c(0, 0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), sln = c(0, 
0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame"), 
E = structure(list(r = c(0, 0, 0, 0, 0, 0), x = c(4400, 4900, 
5400, 4400, 4900, 5400), y = c(4500, 4500, 4500, 4900, 4900, 
4900), fm1 = c(2500, 2300, 2400, 2700, 2400, 2300), fm2 = c(2600, 
2400, 2600, 2900, 2600, 2500), bm1 = c(200, 200, 200, 200, 
200, 200), bm2 = c(200, 200, 200, 200, 200, 200), ns = c(2400, 
2200, 2400, 2700, 2400, 2300), sn = c(0, 100, 100, 0, 100, 
100), ls = c(0, 0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), 
sln = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = 
"data.frame"), 
F = structure(list(r = c(0, 0, 0, 0, 0, 0), x = c(4300, 4800, 
5300, 4300, 4800, 5300), y = c(4400, 4400, 4400, 4800, 4800, 
4800), fm1 = c(3300, 3500, 3400, 2700, 3100, 3100), fm2 = c(3500, 
3700, 3700, 2900, 3300, 3400), bm1 = c(200, 200, 200, 200, 
200, 200), bm2 = c(200, 200, 200, 200, 200, 200), ns = c(3300, 
3600, 3500, 2700, 3100, 3200), sn = c(0, 100, 100, 0, 0, 
0), ls = c(0, 0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), sln = c(0, 
0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame"))

是不是就这么简单:

library(dplyr)
dat <- lapply(dat, function(x){
  tmp <- x %>% select(where(is.numeric))
  k <- kmeans(tmp, centers=2)
  x$cluster <- k$cluster
  x
})
dat
#> $A
#>   r    x    y  fm1  fm2 bm1 bm2   ns sn ls fa sln cluster
#> 1 0 4300 4400 3800 3900 400 300 3600  0  0  0   0       1
#> 2 0 4800 4400 4400 4600 400 300 4200  0  0  0   0       1
#> 3 0 5300 4400 5000 5300 400 400 4900  0  0  0   0       2
#> 4 0 4300 4800 3600 3900 400 300 3600  0  0  0   0       1
#> 5 0 4800 4800 4200 4400 400 300 4100  0  0  0   0       1
#> 6 0 5300 4800 5200 5600 400 400 5200  0  0  0   0       2
#> 
#> $B
#>   r    x    y  fm1  fm2 bm1 bm2   ns  sn ls fa sln cluster
#> 1 0 4300 4500 1300 1400 100 100 1200   0  0  0   0       1
#> 2 0 4800 4500 1400 1500 100 100 1400   0  0  0   0       2
#> 3 0 5300 4500 1500 1500 100 100 1400 100  0  0   0       2
#> 4 0 4300 4900 1100 1200 100 100 1100 100  0  0   0       1
#> 5 0 4800 4900 1200 1300 100 100 1100   0  0  0   0       1
#> 6 0 5300 4900 1200 1300 100 100 1200 100  0  0   0       2
#> 
#> $C
#>   r    x    y  fm1  fm2 bm1 bm2   ns  sn ls fa sln cluster
#> 1 0 4300 4400 4100 4400 200 200 4200   0  0  0   0       2
#> 2 0 4800 4400 4400 4600 200 200 4500 100  0  0   0       1
#> 3 0 5300 4400 4600 4900 200 200 4700 100  0  0   0       1
#> 4 0 4300 4800 3700 4000 200 200 3800   0  0  0   0       2
#> 5 0 4800 4800 4100 4400 200 200 4200   0  0  0   0       2
#> 6 0 5300 4800 3900 4300 200 200 4100 200  0  0   0       2
#> 
#> $D
#>   r    x    y  fm1  fm2 bm1 bm2   ns sn ls fa sln cluster
#> 1 0 4400 4500 3000 3400 300 300 3100  0  0  0   0       1
#> 2 0 4900 4500 3200 3600 300 200 3400  0  0  0   0       1
#> 3 0 5400 4500 3300 3600 300 200 3400  0  0  0   0       1
#> 4 0 4400 4900 2500 2700 300 200 2500  0  0  0   0       2
#> 5 0 4900 4900 2600 2900 300 200 2700  0  0  0   0       2
#> 6 0 5400 4900 2600 2900 200 200 2700  0  0  0   0       2
#> 
#> $E
#>   r    x    y  fm1  fm2 bm1 bm2   ns  sn ls fa sln cluster
#> 1 0 4400 4500 2500 2600 200 200 2400   0  0  0   0       1
#> 2 0 4900 4500 2300 2400 200 200 2200 100  0  0   0       2
#> 3 0 5400 4500 2400 2600 200 200 2400 100  0  0   0       2
#> 4 0 4400 4900 2700 2900 200 200 2700   0  0  0   0       1
#> 5 0 4900 4900 2400 2600 200 200 2400 100  0  0   0       2
#> 6 0 5400 4900 2300 2500 200 200 2300 100  0  0   0       2
#> 
#> $F
#>   r    x    y  fm1  fm2 bm1 bm2   ns  sn ls fa sln cluster
#> 1 0 4300 4400 3300 3500 200 200 3300   0  0  0   0       1
#> 2 0 4800 4400 3500 3700 200 200 3600 100  0  0   0       2
#> 3 0 5300 4400 3400 3700 200 200 3500 100  0  0   0       2
#> 4 0 4300 4800 2700 2900 200 200 2700   0  0  0   0       1
#> 5 0 4800 4800 3100 3300 200 200 3100   0  0  0   0       1
#> 6 0 5300 4800 3100 3400 200 200 3200   0  0  0   0       2

此代码会将所有数据与数据集所来自的适当标识符放在一起:

dat <- lapply(1:length(dat), function(i){
  x <- dat[[i]]
  x$alphabet <- names(dat)[i]
  x
})

all_dat <- dplyr::bind_rows(dat)