根据ID随机删除和添加行

Randomly delete and add rows based on ID

有谁知道如何根据ID随机删除和添加行?这是一个可重现的例子:

> y <- rnorm(20)
> x <- rnorm(20)
> z <- rep(1:5, 4)
> w <- rep(1:4, each=5)
> data.frame(id=z,cluster=w,x=x,y=y) #this is a balanced dataset
   id cluster           x           y
1   1       1  0.30003855  0.65325768
2   2       1 -1.00563626 -0.12270866
3   3       1  0.01925927 -0.41367651
4   4       1 -1.07742065 -2.64314895
5   5       1  0.71270333 -0.09294102
6   1       2  1.08477509  0.43028470
7   2       2 -2.22498770  0.53539884
8   3       2  1.23569346 -0.55527835
9   4       2 -1.24104450  1.77950291
10  5       2  0.45476927  0.28642442
11  1       3  0.65990264  0.12631586
12  2       3 -0.19988983  1.27226678
13  3       3 -0.64511396 -0.71846622
14  4       3  0.16532102 -0.45033862
15  5       3  0.43881870  2.39745248
16  1       4  0.88330282  0.01112919
17  2       4 -2.05233698  1.63356842
18  3       4 -1.63637927 -1.43850664
19  4       4  1.43040234 -0.19051680
20  5       4  1.04662885  0.37842390

根据ID随机添加和删除一些数据后,数据集如下所示,观察总数应与上面的一致:

            id   cluster   x     y
       1     1       1  0.895 -0.659 
       2     2       1 -0.160 -0.366 
       3     1       2 -0.528 -0.294 
       4     2       2 -0.919  0.362 
       5     3       2 -0.901 -0.467 
       6     1       3  0.275  0.134 
       7     2       3  0.423  0.534 
       8     3       3  0.929 -0.953 
       9     4       3  1.67   0.668 
      10     5       3  0.286  0.0872
      11     1       4 -0.373 -0.109 
      12     2       4  0.289  0.299 
      13     3       4 -1.43  -0.677 
      14     4       4 -0.884  1.70  
      15     5       4  1.12   0.386 
      16     1       5 -0.723  0.247 
      17     2       5  0.463 -2.59  
      18     3       5  0.234  0.893 
      19     4       5 -0.313 -1.96  
      20     5       5  0.848 -0.0613

我们可以通过从新的 id-cluster ID id2 中获取 nrow(d) 大小的 sample 来从每个集群中采样行数 delete/add。然后,我们只需根据该样本大小添加一些行,并将簇数增加一个。

通过检查您显示的预期输出,您可能希望结果中每个簇至少有 2 个 nob。我们可以在函数参数中处理它,并用几个 stopifnot 来防止无意义的组​​合。 repeat 循环 break 然后当条件满足时。

FUN <- function(d, cl.obs=2, min.cl=NA) {
  l.cl <- length(unique(d$cluster))
  if (is.na(min.cl)) min.cl <- l.cl
  stopifnot(cl.obs <= min(table(d$cluster)))
  stopifnot(min.cl <= l.cl + 1)
  stopifnot(cl.obs*min.cl <= nrow(d))
  d$id2 <- Reduce(paste, d[c("id", "cluster")])
  repeat({
    samp <- sample(d$id2, sample(1:nrow(d), 1))
    l <- length(samp)
    if (l == 0) {
      return(d[,-5])
    }
    else {
      a <- cbind(id=1:l, cluster= max(d$cluster) + 1, 
                 matrix(rnorm(l*2),,2, dimnames=list(NULL, letters[24:25])))
      o <- rbind(d[!d$id2 %in% samp, -5], a)
      (cl.tb <- table(o$cluster))
      if (all(cl.tb >= cl.obs) & length(cl.tb) >= min.cl) break
    }
  })
  return(`rownames<-`(o, NULL))
}

set.seed(42)
FUN(d)
#    id cluster           x           y
# 1   1       1 -0.30663859  1.37095845
# 2   2       1 -1.78130843 -0.56469817
# 3   3       1 -0.17191736  0.36312841
# 4   4       1  1.21467470  0.63286260
# 5   1       2 -0.43046913 -0.10612452
# 6   2       2 -0.25726938  1.51152200
# 7   3       2 -1.76316309 -0.09465904
# 8   4       2  0.46009735  2.01842371
# 9   5       2 -0.63999488 -0.06271410
# 10  1       3  0.45545012  1.30486965
# 11  2       3  0.70483734  2.28664539
# 12  3       3  1.03510352 -1.38886070
# 13  5       3  0.50495512 -0.13332134
# 14  2       4 -0.78445901 -0.28425292
# 15  3       4 -0.85090759 -2.65645542
# 16  4       4 -2.41420765 -2.44046693
# 17  5       4  0.03612261  1.32011335
# 18  1       5 -0.43144620 -0.78383894
# 19  2       5  0.65564788  1.57572752
# 20  3       5  0.32192527  0.64289931

使用参数:

set.seed(666)
FUN(d, cl.obs=1)
#    id cluster           x           y
# 1   4       1  1.21467470  0.63286260  ## just one obs in cl. 1
# 2   2       2 -0.25726938  1.51152200
# 3   3       2 -1.76316309 -0.09465904
# 4   5       2 -0.63999488 -0.06271410
# 5   1       3  0.45545012  1.30486965
# 6   3       3  1.03510352 -1.38886070
# 7   5       3  0.50495512 -0.13332134
# 8   1       4 -1.71700868  0.63595040
# 9   3       4 -0.85090759 -2.65645542
# 10  1       5 -0.08365711  0.07771005
# 11  2       5  0.25683143  2.12925556
# 12  3       5 -1.07362365  0.63895459
# 13  4       5 -0.62286788  0.26934743
# 14  5       5  0.28499111  2.29896933
# 15  6       5  1.05156653 -1.37464590
# 16  7       5 -0.25952120  0.66236713
# 17  8       5  0.02230428  0.48351632
# 18  9       5 -0.01440929  1.23229183
# 19 10       5  1.33285534 -1.77762517
# 20 11       5  0.14842679  0.88552740

数据:

d <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), cluster = c(1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L), x = c(-0.306638594078475, -1.78130843398, -0.171917355759621, 
1.2146746991726, 1.89519346126497, -0.4304691316062, -0.25726938276893, 
-1.76316308519478, 0.460097354831271, -0.639994875960119, 0.455450123241219, 
0.704837337228819, 1.03510352196992, -0.608926375407211, 0.50495512329797, 
-1.71700867907334, -0.784459008379496, -0.850907594176518, -2.41420764994663, 
0.0361226068922556), y = c(1.37095844714667, -0.564698171396089, 
0.363128411337339, 0.63286260496104, 0.404268323140999, -0.106124516091484, 
1.51152199743894, -0.0946590384130976, 2.01842371387704, -0.062714099052421, 
1.30486965422349, 2.28664539270111, -1.38886070111234, -0.278788766817371, 
-0.133321336393658, 0.635950398070074, -0.284252921416072, -2.65645542090478, 
-2.44046692857552, 1.32011334573019)), class = "data.frame", row.names = c(NA, 
-20L))