重复聚类观察并为重复的聚类创建唯一标识符

Duplicate clustered observations and create a unique identifiers for the duplicated clusters

考虑小型数据集 df1。有 5 个集群由 ID 标识,row_numbers 包含每个观察的唯一值,权重标识我们想要每个集群的副本数。

df1 <-data.frame(ID=c("10","20","30","30","30", "40", "40","50","50","50","50"), row_numbers = c(1,2,3,4,5,6,7,8,9,10,11),weights=c(4,3,2,2,2,3,3,2,2,2,2))
df1
#>    ID row_numbers weights
#> 1  10           1       4
#> 2  20           2       3
#> 3  30           3       2
#> 4  30           4       2
#> 5  30           5       2
#> 6  40           6       3
#> 7  40           7       3
#> 8  50           8       2
#> 9  50           9       2
#> 10 50          10       2
#> 11 50          11       2

预期输出为 df2 df2 最重要的部分是新变量“newID”。重复集群的唯一标识符存储在 newID 中,其中 newID 将使用从 1 开始的连续整数来标识集群。

df2 <-data.frame(ID=c("10","10","10","10","20","20","20","30","30","30","30","30","30", "40", "40","40", "40","40", "40","50","50","50","50","50","50","50","50"), row_numbers = c(1,1,1,1,2,2,2,3,3,4,4,5,5,6,6,6,7,7,7,8,8,9,9,10,10,11,11),weights=c(4,4,4,4,3,3,3,2,2,2,2,2,2,3,3,3,3,3,3,2,2,2,2,2,2,2,2), newID= c(1,2,3,4,5,6,7,8,8,8,9,9,9,10,10,11,11,12,12,13,13,13,13,14,14,14,14))
df2
#>    ID row_numbers weights newID
#> 1  10           1       4     1
#> 2  10           1       4     2
#> 3  10           1       4     3
#> 4  10           1       4     4
#> 5  20           2       3     5
#> 6  20           2       3     6
#> 7  20           2       3     7
#> 8  30           3       2     8
#> 9  30           3       2     8
#> 10 30           4       2     8
#> 11 30           4       2     9
#> 12 30           5       2     9
#> 13 30           5       2     9
#> 14 40           6       3    10
#> 15 40           6       3    10
#> 16 40           6       3    11
#> 17 40           7       3    11
#> 18 40           7       3    12
#> 19 40           7       3    12
#> 20 50           8       2    13
#> 21 50           8       2    13
#> 22 50           9       2    13
#> 23 50           9       2    13
#> 24 50          10       2    14
#> 25 50          10       2    14
#> 26 50          11       2    14
#> 27 50          11       2    14

这是一个使用拆分-应用-绑定方法的解决方案:

df3 <- do.call(rbind, lapply(split(df1, df1$ID), function(x) 
{
  group_size   <- nrow(x)
  n_groups     <- x$weights[1]
  if(is.na(n_groups)) n_groups <- 1
  if (n_groups < 1)   n_groups <- 1
  
  group_labels <- rep(paste(x$ID[1], seq(n_groups)), each = group_size)

  x <- x[rep(seq(group_size), each = n_groups), ]
  x$newID <- group_labels
  x
}))

df3$newID <- as.numeric(as.factor(df3$newID))
df3 <- `rownames<-`(df3, seq(nrow(df3)))

符合您的预期输出:

df3
#>    ID row_numbers weights newID
#> 1  10           1       4     1
#> 2  10           1       4     2
#> 3  10           1       4     3
#> 4  10           1       4     4
#> 5  20           2       3     5
#> 6  20           2       3     6
#> 7  20           2       3     7
#> 8  30           3       2     8
#> 9  30           3       2     8
#> 10 30           4       2     8
#> 11 30           4       2     9
#> 12 30           5       2     9
#> 13 30           5       2     9
#> 14 40           6       3    10
#> 15 40           6       3    10
#> 16 40           6       3    11
#> 17 40           7       3    11
#> 18 40           7       3    12
#> 19 40           7       3    12
#> 20 50           8       2    13
#> 21 50           8       2    13
#> 22 50           9       2    13
#> 23 50           9       2    13
#> 24 50          10       2    14
#> 25 50          10       2    14
#> 26 50          11       2    14
#> 27 50          11       2    14

我们可以证明这与您想要的结果相同:

identical(df2, df3)
#> [1] TRUE

data.table 的解决方案:

library(data.table)
df1 <-data.frame(ID=c("10","20","30","30","30", "40", "40","50","50","50","50"), row_numbers = c(1,2,3,4,5,6,7,8,9,10,11),weights=c(4,3,2,2,2,3,3,2,2,2,2))
dt1 <- data.table(df1)

# with .x a data.table with cols : ID, row_numbers (integer), weight (integer) 
duplicate_weight <- function(.x) {
  # get the part to keep unchanged
  untouched <- list(
    .x[is.na(weights), .(ID, row_numbers, weights = 1, repetition = ID)] , 
    .x[weights == 0, .(ID, row_numbers, weights = 1, repetition = ID)],
    .x[weights == 1, .(ID, row_numbers, weights = 1, repetition = ID)]
  )
  # list of the weights > 1
  weights_list <- sort(unique(.x[['weights']]))
  weights_list <- weights_list[weights_list > 1]
  # repeat accordingly to weights
  repeated <- lapply(weights_list, # for each weight
                     function(.y) { 
                       rbindlist( # make a data.table
                         lapply(1:.y, # repetead .y times
                                function(.z) { 
                                  .x[weights == .y, .(ID, row_numbers, weights = 1, repetition_position = .z)])
                                }
                         )
                       )
                     } 
  )
  
  result <- rbindlist(c(untouched, repeated))
  setorder(result, ID, repetition_position)
  result[, new_id := .GRP, by = .(ID, repetition_position)]
  result[, repetition_position := NULL]
  
  result
}

duplicate_weight(dt1)

它类似于@Allan Cameron