重复聚类观察并为重复的聚类创建唯一标识符
Duplicate clustered observations and create a unique identifiers for the duplicated clusters
考虑小型数据集 df1。有 5 个集群由 ID 标识,row_numbers 包含每个观察的唯一值,权重标识我们想要每个集群的副本数。
df1 <-data.frame(ID=c("10","20","30","30","30", "40", "40","50","50","50","50"), row_numbers = c(1,2,3,4,5,6,7,8,9,10,11),weights=c(4,3,2,2,2,3,3,2,2,2,2))
df1
#> ID row_numbers weights
#> 1 10 1 4
#> 2 20 2 3
#> 3 30 3 2
#> 4 30 4 2
#> 5 30 5 2
#> 6 40 6 3
#> 7 40 7 3
#> 8 50 8 2
#> 9 50 9 2
#> 10 50 10 2
#> 11 50 11 2
预期输出为 df2
df2 最重要的部分是新变量“newID”。重复集群的唯一标识符存储在 newID 中,其中 newID 将使用从 1 开始的连续整数来标识集群。
df2 <-data.frame(ID=c("10","10","10","10","20","20","20","30","30","30","30","30","30", "40", "40","40", "40","40", "40","50","50","50","50","50","50","50","50"), row_numbers = c(1,1,1,1,2,2,2,3,3,4,4,5,5,6,6,6,7,7,7,8,8,9,9,10,10,11,11),weights=c(4,4,4,4,3,3,3,2,2,2,2,2,2,3,3,3,3,3,3,2,2,2,2,2,2,2,2), newID= c(1,2,3,4,5,6,7,8,8,8,9,9,9,10,10,11,11,12,12,13,13,13,13,14,14,14,14))
df2
#> ID row_numbers weights newID
#> 1 10 1 4 1
#> 2 10 1 4 2
#> 3 10 1 4 3
#> 4 10 1 4 4
#> 5 20 2 3 5
#> 6 20 2 3 6
#> 7 20 2 3 7
#> 8 30 3 2 8
#> 9 30 3 2 8
#> 10 30 4 2 8
#> 11 30 4 2 9
#> 12 30 5 2 9
#> 13 30 5 2 9
#> 14 40 6 3 10
#> 15 40 6 3 10
#> 16 40 6 3 11
#> 17 40 7 3 11
#> 18 40 7 3 12
#> 19 40 7 3 12
#> 20 50 8 2 13
#> 21 50 8 2 13
#> 22 50 9 2 13
#> 23 50 9 2 13
#> 24 50 10 2 14
#> 25 50 10 2 14
#> 26 50 11 2 14
#> 27 50 11 2 14
这是一个使用拆分-应用-绑定方法的解决方案:
df3 <- do.call(rbind, lapply(split(df1, df1$ID), function(x)
{
group_size <- nrow(x)
n_groups <- x$weights[1]
if(is.na(n_groups)) n_groups <- 1
if (n_groups < 1) n_groups <- 1
group_labels <- rep(paste(x$ID[1], seq(n_groups)), each = group_size)
x <- x[rep(seq(group_size), each = n_groups), ]
x$newID <- group_labels
x
}))
df3$newID <- as.numeric(as.factor(df3$newID))
df3 <- `rownames<-`(df3, seq(nrow(df3)))
符合您的预期输出:
df3
#> ID row_numbers weights newID
#> 1 10 1 4 1
#> 2 10 1 4 2
#> 3 10 1 4 3
#> 4 10 1 4 4
#> 5 20 2 3 5
#> 6 20 2 3 6
#> 7 20 2 3 7
#> 8 30 3 2 8
#> 9 30 3 2 8
#> 10 30 4 2 8
#> 11 30 4 2 9
#> 12 30 5 2 9
#> 13 30 5 2 9
#> 14 40 6 3 10
#> 15 40 6 3 10
#> 16 40 6 3 11
#> 17 40 7 3 11
#> 18 40 7 3 12
#> 19 40 7 3 12
#> 20 50 8 2 13
#> 21 50 8 2 13
#> 22 50 9 2 13
#> 23 50 9 2 13
#> 24 50 10 2 14
#> 25 50 10 2 14
#> 26 50 11 2 14
#> 27 50 11 2 14
我们可以证明这与您想要的结果相同:
identical(df2, df3)
#> [1] TRUE
data.table 的解决方案:
library(data.table)
df1 <-data.frame(ID=c("10","20","30","30","30", "40", "40","50","50","50","50"), row_numbers = c(1,2,3,4,5,6,7,8,9,10,11),weights=c(4,3,2,2,2,3,3,2,2,2,2))
dt1 <- data.table(df1)
# with .x a data.table with cols : ID, row_numbers (integer), weight (integer)
duplicate_weight <- function(.x) {
# get the part to keep unchanged
untouched <- list(
.x[is.na(weights), .(ID, row_numbers, weights = 1, repetition = ID)] ,
.x[weights == 0, .(ID, row_numbers, weights = 1, repetition = ID)],
.x[weights == 1, .(ID, row_numbers, weights = 1, repetition = ID)]
)
# list of the weights > 1
weights_list <- sort(unique(.x[['weights']]))
weights_list <- weights_list[weights_list > 1]
# repeat accordingly to weights
repeated <- lapply(weights_list, # for each weight
function(.y) {
rbindlist( # make a data.table
lapply(1:.y, # repetead .y times
function(.z) {
.x[weights == .y, .(ID, row_numbers, weights = 1, repetition_position = .z)])
}
)
)
}
)
result <- rbindlist(c(untouched, repeated))
setorder(result, ID, repetition_position)
result[, new_id := .GRP, by = .(ID, repetition_position)]
result[, repetition_position := NULL]
result
}
duplicate_weight(dt1)
它类似于@Allan Cameron
考虑小型数据集 df1。有 5 个集群由 ID 标识,row_numbers 包含每个观察的唯一值,权重标识我们想要每个集群的副本数。
df1 <-data.frame(ID=c("10","20","30","30","30", "40", "40","50","50","50","50"), row_numbers = c(1,2,3,4,5,6,7,8,9,10,11),weights=c(4,3,2,2,2,3,3,2,2,2,2))
df1
#> ID row_numbers weights
#> 1 10 1 4
#> 2 20 2 3
#> 3 30 3 2
#> 4 30 4 2
#> 5 30 5 2
#> 6 40 6 3
#> 7 40 7 3
#> 8 50 8 2
#> 9 50 9 2
#> 10 50 10 2
#> 11 50 11 2
预期输出为 df2 df2 最重要的部分是新变量“newID”。重复集群的唯一标识符存储在 newID 中,其中 newID 将使用从 1 开始的连续整数来标识集群。
df2 <-data.frame(ID=c("10","10","10","10","20","20","20","30","30","30","30","30","30", "40", "40","40", "40","40", "40","50","50","50","50","50","50","50","50"), row_numbers = c(1,1,1,1,2,2,2,3,3,4,4,5,5,6,6,6,7,7,7,8,8,9,9,10,10,11,11),weights=c(4,4,4,4,3,3,3,2,2,2,2,2,2,3,3,3,3,3,3,2,2,2,2,2,2,2,2), newID= c(1,2,3,4,5,6,7,8,8,8,9,9,9,10,10,11,11,12,12,13,13,13,13,14,14,14,14))
df2
#> ID row_numbers weights newID
#> 1 10 1 4 1
#> 2 10 1 4 2
#> 3 10 1 4 3
#> 4 10 1 4 4
#> 5 20 2 3 5
#> 6 20 2 3 6
#> 7 20 2 3 7
#> 8 30 3 2 8
#> 9 30 3 2 8
#> 10 30 4 2 8
#> 11 30 4 2 9
#> 12 30 5 2 9
#> 13 30 5 2 9
#> 14 40 6 3 10
#> 15 40 6 3 10
#> 16 40 6 3 11
#> 17 40 7 3 11
#> 18 40 7 3 12
#> 19 40 7 3 12
#> 20 50 8 2 13
#> 21 50 8 2 13
#> 22 50 9 2 13
#> 23 50 9 2 13
#> 24 50 10 2 14
#> 25 50 10 2 14
#> 26 50 11 2 14
#> 27 50 11 2 14
这是一个使用拆分-应用-绑定方法的解决方案:
df3 <- do.call(rbind, lapply(split(df1, df1$ID), function(x)
{
group_size <- nrow(x)
n_groups <- x$weights[1]
if(is.na(n_groups)) n_groups <- 1
if (n_groups < 1) n_groups <- 1
group_labels <- rep(paste(x$ID[1], seq(n_groups)), each = group_size)
x <- x[rep(seq(group_size), each = n_groups), ]
x$newID <- group_labels
x
}))
df3$newID <- as.numeric(as.factor(df3$newID))
df3 <- `rownames<-`(df3, seq(nrow(df3)))
符合您的预期输出:
df3
#> ID row_numbers weights newID
#> 1 10 1 4 1
#> 2 10 1 4 2
#> 3 10 1 4 3
#> 4 10 1 4 4
#> 5 20 2 3 5
#> 6 20 2 3 6
#> 7 20 2 3 7
#> 8 30 3 2 8
#> 9 30 3 2 8
#> 10 30 4 2 8
#> 11 30 4 2 9
#> 12 30 5 2 9
#> 13 30 5 2 9
#> 14 40 6 3 10
#> 15 40 6 3 10
#> 16 40 6 3 11
#> 17 40 7 3 11
#> 18 40 7 3 12
#> 19 40 7 3 12
#> 20 50 8 2 13
#> 21 50 8 2 13
#> 22 50 9 2 13
#> 23 50 9 2 13
#> 24 50 10 2 14
#> 25 50 10 2 14
#> 26 50 11 2 14
#> 27 50 11 2 14
我们可以证明这与您想要的结果相同:
identical(df2, df3)
#> [1] TRUE
data.table 的解决方案:
library(data.table)
df1 <-data.frame(ID=c("10","20","30","30","30", "40", "40","50","50","50","50"), row_numbers = c(1,2,3,4,5,6,7,8,9,10,11),weights=c(4,3,2,2,2,3,3,2,2,2,2))
dt1 <- data.table(df1)
# with .x a data.table with cols : ID, row_numbers (integer), weight (integer)
duplicate_weight <- function(.x) {
# get the part to keep unchanged
untouched <- list(
.x[is.na(weights), .(ID, row_numbers, weights = 1, repetition = ID)] ,
.x[weights == 0, .(ID, row_numbers, weights = 1, repetition = ID)],
.x[weights == 1, .(ID, row_numbers, weights = 1, repetition = ID)]
)
# list of the weights > 1
weights_list <- sort(unique(.x[['weights']]))
weights_list <- weights_list[weights_list > 1]
# repeat accordingly to weights
repeated <- lapply(weights_list, # for each weight
function(.y) {
rbindlist( # make a data.table
lapply(1:.y, # repetead .y times
function(.z) {
.x[weights == .y, .(ID, row_numbers, weights = 1, repetition_position = .z)])
}
)
)
}
)
result <- rbindlist(c(untouched, repeated))
setorder(result, ID, repetition_position)
result[, new_id := .GRP, by = .(ID, repetition_position)]
result[, repetition_position := NULL]
result
}
duplicate_weight(dt1)
它类似于@Allan Cameron