根据ID随机删除和添加行
Randomly delete and add rows based on ID
有谁知道如何根据ID随机删除和添加行?这是一个可重现的例子:
> y <- rnorm(20)
> x <- rnorm(20)
> z <- rep(1:5, 4)
> w <- rep(1:4, each=5)
> data.frame(id=z,cluster=w,x=x,y=y) #this is a balanced dataset
id cluster x y
1 1 1 0.30003855 0.65325768
2 2 1 -1.00563626 -0.12270866
3 3 1 0.01925927 -0.41367651
4 4 1 -1.07742065 -2.64314895
5 5 1 0.71270333 -0.09294102
6 1 2 1.08477509 0.43028470
7 2 2 -2.22498770 0.53539884
8 3 2 1.23569346 -0.55527835
9 4 2 -1.24104450 1.77950291
10 5 2 0.45476927 0.28642442
11 1 3 0.65990264 0.12631586
12 2 3 -0.19988983 1.27226678
13 3 3 -0.64511396 -0.71846622
14 4 3 0.16532102 -0.45033862
15 5 3 0.43881870 2.39745248
16 1 4 0.88330282 0.01112919
17 2 4 -2.05233698 1.63356842
18 3 4 -1.63637927 -1.43850664
19 4 4 1.43040234 -0.19051680
20 5 4 1.04662885 0.37842390
根据ID随机添加和删除一些数据后,数据集如下所示,观察总数应与上面的一致:
id cluster x y
1 1 1 0.895 -0.659
2 2 1 -0.160 -0.366
3 1 2 -0.528 -0.294
4 2 2 -0.919 0.362
5 3 2 -0.901 -0.467
6 1 3 0.275 0.134
7 2 3 0.423 0.534
8 3 3 0.929 -0.953
9 4 3 1.67 0.668
10 5 3 0.286 0.0872
11 1 4 -0.373 -0.109
12 2 4 0.289 0.299
13 3 4 -1.43 -0.677
14 4 4 -0.884 1.70
15 5 4 1.12 0.386
16 1 5 -0.723 0.247
17 2 5 0.463 -2.59
18 3 5 0.234 0.893
19 4 5 -0.313 -1.96
20 5 5 0.848 -0.0613
我们可以通过从新的 id-cluster ID id2
中获取 nrow(d)
大小的 sample
来从每个集群中采样行数 delete/add。然后,我们只需根据该样本大小添加一些行,并将簇数增加一个。
通过检查您显示的预期输出,您可能希望结果中每个簇至少有 2 个 nob。我们可以在函数参数中处理它,并用几个 stopifnot
来防止无意义的组合。 repeat
循环 break
然后当条件满足时。
FUN <- function(d, cl.obs=2, min.cl=NA) {
l.cl <- length(unique(d$cluster))
if (is.na(min.cl)) min.cl <- l.cl
stopifnot(cl.obs <= min(table(d$cluster)))
stopifnot(min.cl <= l.cl + 1)
stopifnot(cl.obs*min.cl <= nrow(d))
d$id2 <- Reduce(paste, d[c("id", "cluster")])
repeat({
samp <- sample(d$id2, sample(1:nrow(d), 1))
l <- length(samp)
if (l == 0) {
return(d[,-5])
}
else {
a <- cbind(id=1:l, cluster= max(d$cluster) + 1,
matrix(rnorm(l*2),,2, dimnames=list(NULL, letters[24:25])))
o <- rbind(d[!d$id2 %in% samp, -5], a)
(cl.tb <- table(o$cluster))
if (all(cl.tb >= cl.obs) & length(cl.tb) >= min.cl) break
}
})
return(`rownames<-`(o, NULL))
}
set.seed(42)
FUN(d)
# id cluster x y
# 1 1 1 -0.30663859 1.37095845
# 2 2 1 -1.78130843 -0.56469817
# 3 3 1 -0.17191736 0.36312841
# 4 4 1 1.21467470 0.63286260
# 5 1 2 -0.43046913 -0.10612452
# 6 2 2 -0.25726938 1.51152200
# 7 3 2 -1.76316309 -0.09465904
# 8 4 2 0.46009735 2.01842371
# 9 5 2 -0.63999488 -0.06271410
# 10 1 3 0.45545012 1.30486965
# 11 2 3 0.70483734 2.28664539
# 12 3 3 1.03510352 -1.38886070
# 13 5 3 0.50495512 -0.13332134
# 14 2 4 -0.78445901 -0.28425292
# 15 3 4 -0.85090759 -2.65645542
# 16 4 4 -2.41420765 -2.44046693
# 17 5 4 0.03612261 1.32011335
# 18 1 5 -0.43144620 -0.78383894
# 19 2 5 0.65564788 1.57572752
# 20 3 5 0.32192527 0.64289931
使用参数:
set.seed(666)
FUN(d, cl.obs=1)
# id cluster x y
# 1 4 1 1.21467470 0.63286260 ## just one obs in cl. 1
# 2 2 2 -0.25726938 1.51152200
# 3 3 2 -1.76316309 -0.09465904
# 4 5 2 -0.63999488 -0.06271410
# 5 1 3 0.45545012 1.30486965
# 6 3 3 1.03510352 -1.38886070
# 7 5 3 0.50495512 -0.13332134
# 8 1 4 -1.71700868 0.63595040
# 9 3 4 -0.85090759 -2.65645542
# 10 1 5 -0.08365711 0.07771005
# 11 2 5 0.25683143 2.12925556
# 12 3 5 -1.07362365 0.63895459
# 13 4 5 -0.62286788 0.26934743
# 14 5 5 0.28499111 2.29896933
# 15 6 5 1.05156653 -1.37464590
# 16 7 5 -0.25952120 0.66236713
# 17 8 5 0.02230428 0.48351632
# 18 9 5 -0.01440929 1.23229183
# 19 10 5 1.33285534 -1.77762517
# 20 11 5 0.14842679 0.88552740
数据:
d <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), cluster = c(1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L), x = c(-0.306638594078475, -1.78130843398, -0.171917355759621,
1.2146746991726, 1.89519346126497, -0.4304691316062, -0.25726938276893,
-1.76316308519478, 0.460097354831271, -0.639994875960119, 0.455450123241219,
0.704837337228819, 1.03510352196992, -0.608926375407211, 0.50495512329797,
-1.71700867907334, -0.784459008379496, -0.850907594176518, -2.41420764994663,
0.0361226068922556), y = c(1.37095844714667, -0.564698171396089,
0.363128411337339, 0.63286260496104, 0.404268323140999, -0.106124516091484,
1.51152199743894, -0.0946590384130976, 2.01842371387704, -0.062714099052421,
1.30486965422349, 2.28664539270111, -1.38886070111234, -0.278788766817371,
-0.133321336393658, 0.635950398070074, -0.284252921416072, -2.65645542090478,
-2.44046692857552, 1.32011334573019)), class = "data.frame", row.names = c(NA,
-20L))
有谁知道如何根据ID随机删除和添加行?这是一个可重现的例子:
> y <- rnorm(20)
> x <- rnorm(20)
> z <- rep(1:5, 4)
> w <- rep(1:4, each=5)
> data.frame(id=z,cluster=w,x=x,y=y) #this is a balanced dataset
id cluster x y
1 1 1 0.30003855 0.65325768
2 2 1 -1.00563626 -0.12270866
3 3 1 0.01925927 -0.41367651
4 4 1 -1.07742065 -2.64314895
5 5 1 0.71270333 -0.09294102
6 1 2 1.08477509 0.43028470
7 2 2 -2.22498770 0.53539884
8 3 2 1.23569346 -0.55527835
9 4 2 -1.24104450 1.77950291
10 5 2 0.45476927 0.28642442
11 1 3 0.65990264 0.12631586
12 2 3 -0.19988983 1.27226678
13 3 3 -0.64511396 -0.71846622
14 4 3 0.16532102 -0.45033862
15 5 3 0.43881870 2.39745248
16 1 4 0.88330282 0.01112919
17 2 4 -2.05233698 1.63356842
18 3 4 -1.63637927 -1.43850664
19 4 4 1.43040234 -0.19051680
20 5 4 1.04662885 0.37842390
根据ID随机添加和删除一些数据后,数据集如下所示,观察总数应与上面的一致:
id cluster x y
1 1 1 0.895 -0.659
2 2 1 -0.160 -0.366
3 1 2 -0.528 -0.294
4 2 2 -0.919 0.362
5 3 2 -0.901 -0.467
6 1 3 0.275 0.134
7 2 3 0.423 0.534
8 3 3 0.929 -0.953
9 4 3 1.67 0.668
10 5 3 0.286 0.0872
11 1 4 -0.373 -0.109
12 2 4 0.289 0.299
13 3 4 -1.43 -0.677
14 4 4 -0.884 1.70
15 5 4 1.12 0.386
16 1 5 -0.723 0.247
17 2 5 0.463 -2.59
18 3 5 0.234 0.893
19 4 5 -0.313 -1.96
20 5 5 0.848 -0.0613
我们可以通过从新的 id-cluster ID id2
中获取 nrow(d)
大小的 sample
来从每个集群中采样行数 delete/add。然后,我们只需根据该样本大小添加一些行,并将簇数增加一个。
通过检查您显示的预期输出,您可能希望结果中每个簇至少有 2 个 nob。我们可以在函数参数中处理它,并用几个 stopifnot
来防止无意义的组合。 repeat
循环 break
然后当条件满足时。
FUN <- function(d, cl.obs=2, min.cl=NA) {
l.cl <- length(unique(d$cluster))
if (is.na(min.cl)) min.cl <- l.cl
stopifnot(cl.obs <= min(table(d$cluster)))
stopifnot(min.cl <= l.cl + 1)
stopifnot(cl.obs*min.cl <= nrow(d))
d$id2 <- Reduce(paste, d[c("id", "cluster")])
repeat({
samp <- sample(d$id2, sample(1:nrow(d), 1))
l <- length(samp)
if (l == 0) {
return(d[,-5])
}
else {
a <- cbind(id=1:l, cluster= max(d$cluster) + 1,
matrix(rnorm(l*2),,2, dimnames=list(NULL, letters[24:25])))
o <- rbind(d[!d$id2 %in% samp, -5], a)
(cl.tb <- table(o$cluster))
if (all(cl.tb >= cl.obs) & length(cl.tb) >= min.cl) break
}
})
return(`rownames<-`(o, NULL))
}
set.seed(42)
FUN(d)
# id cluster x y
# 1 1 1 -0.30663859 1.37095845
# 2 2 1 -1.78130843 -0.56469817
# 3 3 1 -0.17191736 0.36312841
# 4 4 1 1.21467470 0.63286260
# 5 1 2 -0.43046913 -0.10612452
# 6 2 2 -0.25726938 1.51152200
# 7 3 2 -1.76316309 -0.09465904
# 8 4 2 0.46009735 2.01842371
# 9 5 2 -0.63999488 -0.06271410
# 10 1 3 0.45545012 1.30486965
# 11 2 3 0.70483734 2.28664539
# 12 3 3 1.03510352 -1.38886070
# 13 5 3 0.50495512 -0.13332134
# 14 2 4 -0.78445901 -0.28425292
# 15 3 4 -0.85090759 -2.65645542
# 16 4 4 -2.41420765 -2.44046693
# 17 5 4 0.03612261 1.32011335
# 18 1 5 -0.43144620 -0.78383894
# 19 2 5 0.65564788 1.57572752
# 20 3 5 0.32192527 0.64289931
使用参数:
set.seed(666)
FUN(d, cl.obs=1)
# id cluster x y
# 1 4 1 1.21467470 0.63286260 ## just one obs in cl. 1
# 2 2 2 -0.25726938 1.51152200
# 3 3 2 -1.76316309 -0.09465904
# 4 5 2 -0.63999488 -0.06271410
# 5 1 3 0.45545012 1.30486965
# 6 3 3 1.03510352 -1.38886070
# 7 5 3 0.50495512 -0.13332134
# 8 1 4 -1.71700868 0.63595040
# 9 3 4 -0.85090759 -2.65645542
# 10 1 5 -0.08365711 0.07771005
# 11 2 5 0.25683143 2.12925556
# 12 3 5 -1.07362365 0.63895459
# 13 4 5 -0.62286788 0.26934743
# 14 5 5 0.28499111 2.29896933
# 15 6 5 1.05156653 -1.37464590
# 16 7 5 -0.25952120 0.66236713
# 17 8 5 0.02230428 0.48351632
# 18 9 5 -0.01440929 1.23229183
# 19 10 5 1.33285534 -1.77762517
# 20 11 5 0.14842679 0.88552740
数据:
d <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), cluster = c(1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L), x = c(-0.306638594078475, -1.78130843398, -0.171917355759621,
1.2146746991726, 1.89519346126497, -0.4304691316062, -0.25726938276893,
-1.76316308519478, 0.460097354831271, -0.639994875960119, 0.455450123241219,
0.704837337228819, 1.03510352196992, -0.608926375407211, 0.50495512329797,
-1.71700867907334, -0.784459008379496, -0.850907594176518, -2.41420764994663,
0.0361226068922556), y = c(1.37095844714667, -0.564698171396089,
0.363128411337339, 0.63286260496104, 0.404268323140999, -0.106124516091484,
1.51152199743894, -0.0946590384130976, 2.01842371387704, -0.062714099052421,
1.30486965422349, 2.28664539270111, -1.38886070111234, -0.278788766817371,
-0.133321336393658, 0.635950398070074, -0.284252921416072, -2.65645542090478,
-2.44046692857552, 1.32011334573019)), class = "data.frame", row.names = c(NA,
-20L))