如何在 R 中按组删除前导和尾随 NA 的行
How to delete rows for leading and trailing NAs by group in R
我需要删除包含 NA 的行,但前提是它们是前导(尾随),即在变量出现任何数据之前(之后)。这非常类似于:
和:
How delete leading and trailing rows by condition in R?
但是,我需要按变量 "ID" 分组执行此过程。我将在后面的步骤中估算两者之间的 NA 数据。
同样适用于尾随的 NA。
最初的data.frame是这样的:
df1<-data.frame(ID=(rep(c("C1001","C1008","C1009","C1012"),each=17)),
Year=(rep(c(1996:2012),4)),x1=(floor(runif(68,20,75))),x2=
(floor(runif(68,1,100))))
#Introduce leading / tailing NAs
df1[1:5,3]<-NA
df1[18:23,4]<-NA
df1[35:42,4]<-NA
df1[49:51,3]<-NA
df1[66:68,3]<-NA
#introduce "gap"- NAs
set.seed(123)
df1$x1[rbinom(68,1,0.1)==1]<-NA
df1$x2[rbinom(68,1,0.1)==1]<-NA
输出很长。这是为了正确区分 "gap"-NAs 和 "leading/trailing" NAs possible
head(df1,10)
ID Year x1 x2
1 C1001 1996 NA 40
2 C1001 1997 NA 88
3 C1001 1998 NA 37
4 C1001 1999 NA 29
5 C1001 2000 NA 17
6 C1001 2001 42 18
7 C1001 2002 20 48
8 C1001 2003 30 26
9 C1001 2004 66 22
10 C1001 2005 32 67
输出应按 ID 组删除前导 NA(参见上面的行 1:5)。或者下面示例中的行 18:23:
df1[18:28,]
ID Year x1 x2
18 C1008 1996 33 NA
19 C1008 1997 26 NA
20 C1008 1998 NA NA
21 C1008 1999 51 NA
22 C1008 2000 31 NA
23 C1008 2001 44 NA
24 C1008 2002 NA 56
25 C1008 2003 47 70
26 C1008 2004 39 91
27 C1008 2005 55 62
28 C1008 2006 40 43
最终输出应如下所示(当然取决于随机输入的 NA!):
ID Year x1 x2
6 C1001 2001 42 18
7 C1001 2002 20 48
8 C1001 2003 30 26
9 C1001 2004 66 22
10 C1001 2005 32 67
11 C1001 2006 NA 5
12 C1001 2007 24 70
13 C1001 2008 33 35
14 C1001 2009 60 41
15 C1001 2010 66 82
16 C1001 2011 47 91
17 C1001 2012 41 28
24 C1008 2002 NA 56
25 C1008 2003 47 70
26 C1008 2004 39 91
27 C1008 2005 55 62
28 C1008 2006 40 43
29 C1008 2007 39 54
30 C1008 2008 49 6
31 C1008 2009 NA 26
32 C1008 2010 NA 40
33 C1008 2011 42 20
34 C1008 2012 34 83
44 C1009 2005 51 96
45 C1009 2006 66 96
46 C1009 2007 37 NA
47 C1009 2008 58 26
48 C1009 2009 34 22
52 C1012 1996 51 78
53 C1012 1997 70 17
54 C1012 1998 69 41
55 C1012 1999 35 47
56 C1012 2000 37 86
57 C1012 2001 74 92
58 C1012 2002 54 NA
59 C1012 2003 71 67
60 C1012 2004 45 95
61 C1012 2005 42 52
62 C1012 2006 56 58
63 C1012 2007 28 34
64 C1012 2008 51 35
65 C1012 2009 33 2
非常感谢!
这是一个 data.table 解决方案,它依赖 rleid
来仅删除前导 NA:
library(data.table)
dt <- as.data.table(df1)
dt[,
.SD[!(rleid(x1) %in% c(1, max(rleid(x1))) & is.na(x1)) &
!(rleid(x2) %in% c(1, max(rleid(x2))) & is.na(x2))],
by = ID
]
要自动处理多个列,假设它们都以 x
开头,您可以这样做:
dt[dt[, Reduce('&',
lapply(.SD, function(x) !(rleid(x) %in% c(1, max(rleid(x1))) & is.na(x)))),
by = ID,
.SDcols = grep('x', names(dt))]$V1
]
# or using .SD as before
dt[,
.SD[Reduce('&', lapply(.SD, function(x) !(rleid(x) %in% c(1, max(rleid(x1))) & is.na(x)))),
.SDcols = grep('x', names(dt))],
by = ID
]
或与dplyr相同的想法:
library(dplyr)
library(data.table)
df1%>%
group_by(ID)%>%
filter_at(vars(starts_with('x')), all_vars(!(is.na(.) & rleid(.) %in% c(1, max(rleid(.))))))
结果在 42 行中:
# A tibble: 42 x 4
# Groups: ID [4]
ID Year x1 x2
<fct> <int> <dbl> <dbl>
1 C1001 2001 25 54
2 C1001 2002 28 50
3 C1001 2003 35 94
4 C1001 2004 52 34
5 C1001 2005 60 47
6 C1001 2006 NA 9
7 C1001 2007 67 86
8 C1001 2008 58 40
9 C1001 2009 61 73
10 C1001 2010 28 18
# ... with 32 more rows
这是一种使用 filter_at()
的方法,它用 cumsum()
识别前导 NA
值,并用相同的想法识别尾随值,但向量相反。
library(dplyr)
df1 %>%
group_by(ID) %>%
filter_at(vars(-ID, -Year), all_vars(pmin(cumsum(!is.na(.)), rev(cumsum(!is.na(rev(.))))) != 0))
# A tibble: 42 x 4
# Groups: ID [4]
ID Year x1 x2
<fct> <int> <dbl> <dbl>
1 C1001 2001 42 18
2 C1001 2002 20 48
3 C1001 2003 30 26
4 C1001 2004 66 22
5 C1001 2005 32 67
6 C1001 2006 NA 5
7 C1001 2007 24 70
8 C1001 2008 33 35
9 C1001 2009 60 41
10 C1001 2010 66 82
# ... with 32 more rows
使用 data.table 的另一个选项:
f4 <- function(DT) {
setindex(DT, ID)
DT[, rn := .I]
uid <- DT[,.(ID=unique(ID), V=TRUE)]
rows <- rbindlist(lapply(cols, function(x) {
merge(
DT[, V := !is.na(get(x))][uid, on=c("ID", "V"), mult="first", .(ID, S=rn)],
DT[uid, on=c("ID", "V"), mult="last", .(ID, E=rn)],
by="ID")
}))[, .(S=max(S), E=min(E)) , ID]
DT[rows, on=.(ID, rn>=S, rn<=E), .SD]
}
f4(df1)
输出:
ID Year V1 V2 rn V
1: C1001 2001 45 70 6 TRUE
2: C1001 2002 74 78 6 TRUE
3: C1001 2003 48 9 6 TRUE
4: C1001 2004 27 32 6 TRUE
5: C1001 2005 39 3 6 TRUE
6: C1001 2006 NA 89 6 TRUE
7: C1001 2007 22 2 6 TRUE
8: C1001 2008 56 12 6 TRUE
9: C1001 2009 29 34 6 TRUE
10: C1001 2010 30 53 6 TRUE
11: C1001 2011 61 46 6 TRUE
12: C1001 2012 23 42 6 TRUE
13: C1008 2002 NA 95 24 TRUE
14: C1008 2003 71 64 24 TRUE
15: C1008 2004 41 92 24 TRUE
16: C1008 2005 45 28 24 TRUE
17: C1008 2006 74 59 24 TRUE
18: C1008 2007 45 16 24 TRUE
19: C1008 2008 57 64 24 TRUE
20: C1008 2009 NA 35 24 TRUE
21: C1008 2010 NA 2 24 TRUE
22: C1008 2011 32 27 24 TRUE
23: C1008 2012 69 41 24 TRUE
24: C1009 2005 30 24 44 TRUE
25: C1009 2006 43 49 44 TRUE
26: C1009 2007 50 NA 44 FALSE
27: C1009 2008 28 72 44 TRUE
28: C1009 2009 43 20 44 TRUE
29: C1012 1996 36 73 52 TRUE
30: C1012 1997 52 4 52 TRUE
31: C1012 1998 67 14 52 TRUE
32: C1012 1999 39 59 52 TRUE
33: C1012 2000 56 12 52 TRUE
34: C1012 2001 25 92 52 TRUE
35: C1012 2002 26 NA 52 FALSE
36: C1012 2003 73 11 52 TRUE
37: C1012 2004 39 50 52 TRUE
38: C1012 2005 65 89 52 TRUE
39: C1012 2006 70 21 52 TRUE
40: C1012 2007 54 86 52 TRUE
41: C1012 2008 37 70 52 TRUE
42: C1012 2009 66 22 52 TRUE
ID Year V1 V2 rn V
数据:
library(data.table)
df1 <- data.frame(ID=(rep(c("C1001","C1008","C1009","C1012"),each=17)),
Year=(rep(c(1996:2012),4)), V1=(floor(runif(68,20,75))), V2=
(floor(runif(68,1,100))))
df1[1:5,3]<-NA
df1[18:23,4]<-NA
df1[35:42,4]<-NA
df1[49:51,3]<-NA
df1[66:68,3]<-NA
set.seed(123)
df1$V1[rbinom(68,1,0.1)==1]<-NA
df1$V2[rbinom(68,1,0.1)==1]<-NA
setDT(df1)[, rn := .I]
cols <- paste0("V", 1:5)
行数多、组数多的数据时序码:
set.seed(0L)
if ((BIGDATA <- TRUE)) {
nr <- 1e7
nc <- 5
nid <- 1e5
dat <- data.table(ID=sample(nid, nr, TRUE),
as.data.table(matrix(sample(c(1, NA), nr*nc, TRUE), ncol=nc)),
key="ID")
cols <- paste0("V", 1:5)
} else {
df1 <- data.frame(ID=(rep(c("C1001","C1008","C1009","C1012"),each=17)),
Year=(rep(c(1996:2012),4)), V1=(floor(runif(68,20,75))), V2=
(floor(runif(68,1,100))))
df1[1:5,3]<-NA
df1[18:23,4]<-NA
df1[35:42,4]<-NA
df1[49:51,3]<-NA
df1[66:68,3]<-NA
set.seed(123)
df1$V1[rbinom(68,1,0.1)==1]<-NA
df1$V2[rbinom(68,1,0.1)==1]<-NA
dat <- setDT(df1)[, rn := .I]
cols <- paste0("V", 1:2)
}
DT0 <- copy(dat)
DT1 <- copy(dat)
DT2 <- copy(dat)
DT3 <- copy(dat)
DT4 <- copy(dat)
f0 <- function(DT) {
DT[DT[, Reduce('&',
lapply(.SD, function(x) {
r <- rleid(x)
!(r %in% c(1, max(r)) & is.na(x))
})),
ID,
.SDcols=cols]$V1]
}
f1 <- function(DT) {
DT[, c("rn", "rid") := .(.I, rowid(ID))][.N:1L, rev_rid := rowid(ID)]
for (x in cols) {
idx <- DT[is.na(get(x)) & ID %in% DT[is.na(get(x)) & (rid==1L | rev_rid==1L), ID],
if (rid[1L]==1L || rev_rid[.N]==1L) rn,
cumsum(c(0L, diff(rn) > 1L))]$V1
DT <- DT[!rn %in% idx]
}
DT
}
f2 <- function(DT) {
DT[, c("rn", "rid") := .(.I, rowid(ID))][.N:1L, rev_rid := rowid(ID)]
for (x in cols) {
DT <- DT[!rn %in% DT[is.na(get(x)),
if (rid[1L]==1L || rev_rid[.N]==1L) rn,
cumsum(c(0L, diff(rn) > 1L))]$V1]
}
DT
}
f3 <- function(DT) {
DT[, rn := .I]
rows <- DT[, transpose(lapply(.SD, function(x) c(rn[match(TRUE, !is.na(x))],
rev(rn)[match(TRUE, !is.na(rev(x)))]))),
ID, .SDcols=cols][, .(S=max(V1), E=min(V2)) , ID]
DT[rows, on=.(ID, rn>=S, rn<=E), .SD]
}
f4 <- function(DT) {
setindex(DT, ID)
DT[, rn := .I]
uid <- DT[,.(ID=unique(ID), V=TRUE)]
rows <- rbindlist(lapply(cols, function(x) {
merge(
DT[, V := !is.na(get(x))][uid, on=c("ID", "V"), mult="first", .(ID, S=rn)],
DT[uid, on=c("ID", "V"), mult="last", .(ID, E=rn)],
by="ID")
}))[, .(S=max(S), E=min(E)) , ID]
DT[rows, on=.(ID, rn>=S, rn<=E), .SD]
}
microbenchmark::microbenchmark(f0(DT0), f1(DT1), f2(DT2), f3(DT3), f4(DT4), times=3L)
时间:
Unit: seconds
expr min lq mean median uq max neval
f0(DT0) 8.874985 8.950951 8.993281 9.026917 9.052429 9.077942 3
f1(DT1) 16.249656 16.337013 16.657910 16.424370 16.862038 17.299706 3
f2(DT2) 18.225748 18.284212 18.391198 18.342676 18.473922 18.605169 3
f3(DT3) 10.361079 10.612313 10.698897 10.863548 10.867806 10.872063 3
f4(DT4) 3.106936 3.137846 3.173174 3.168755 3.206293 3.243830 3
另一个行数相同但组数少得多的测试:
set.seed(0L)
nr <- 1e7
nc <- 5
nid <- 1e2
dat <- data.table(ID=sample(nid, nr, TRUE),
as.data.table(matrix(sample(c(1, NA), nr*nc, TRUE), ncol=nc)),
key="ID")
cols <- paste0("V", 1:5)
DT0 <- copy(dat)
DT3 <- copy(dat)
microbenchmark::microbenchmark(f0(DT0), f3(DT3), f4(DT4), times=3L)
时间:
Unit: seconds
expr min lq mean median uq max neval
f0(DT0) 2.317905 2.331944 2.358256 2.345983 2.378431 2.410880 3
f3(DT3) 2.108385 2.123889 2.132315 2.139392 2.144280 2.149168 3
f4(DT4) 2.050805 2.079687 2.101211 2.108569 2.126414 2.144258 3
我需要删除包含 NA 的行,但前提是它们是前导(尾随),即在变量出现任何数据之前(之后)。这非常类似于:
但是,我需要按变量 "ID" 分组执行此过程。我将在后面的步骤中估算两者之间的 NA 数据。
同样适用于尾随的 NA。
最初的data.frame是这样的:
df1<-data.frame(ID=(rep(c("C1001","C1008","C1009","C1012"),each=17)),
Year=(rep(c(1996:2012),4)),x1=(floor(runif(68,20,75))),x2=
(floor(runif(68,1,100))))
#Introduce leading / tailing NAs
df1[1:5,3]<-NA
df1[18:23,4]<-NA
df1[35:42,4]<-NA
df1[49:51,3]<-NA
df1[66:68,3]<-NA
#introduce "gap"- NAs
set.seed(123)
df1$x1[rbinom(68,1,0.1)==1]<-NA
df1$x2[rbinom(68,1,0.1)==1]<-NA
输出很长。这是为了正确区分 "gap"-NAs 和 "leading/trailing" NAs possible
head(df1,10)
ID Year x1 x2
1 C1001 1996 NA 40
2 C1001 1997 NA 88
3 C1001 1998 NA 37
4 C1001 1999 NA 29
5 C1001 2000 NA 17
6 C1001 2001 42 18
7 C1001 2002 20 48
8 C1001 2003 30 26
9 C1001 2004 66 22
10 C1001 2005 32 67
输出应按 ID 组删除前导 NA(参见上面的行 1:5)。或者下面示例中的行 18:23:
df1[18:28,]
ID Year x1 x2
18 C1008 1996 33 NA
19 C1008 1997 26 NA
20 C1008 1998 NA NA
21 C1008 1999 51 NA
22 C1008 2000 31 NA
23 C1008 2001 44 NA
24 C1008 2002 NA 56
25 C1008 2003 47 70
26 C1008 2004 39 91
27 C1008 2005 55 62
28 C1008 2006 40 43
最终输出应如下所示(当然取决于随机输入的 NA!):
ID Year x1 x2
6 C1001 2001 42 18
7 C1001 2002 20 48
8 C1001 2003 30 26
9 C1001 2004 66 22
10 C1001 2005 32 67
11 C1001 2006 NA 5
12 C1001 2007 24 70
13 C1001 2008 33 35
14 C1001 2009 60 41
15 C1001 2010 66 82
16 C1001 2011 47 91
17 C1001 2012 41 28
24 C1008 2002 NA 56
25 C1008 2003 47 70
26 C1008 2004 39 91
27 C1008 2005 55 62
28 C1008 2006 40 43
29 C1008 2007 39 54
30 C1008 2008 49 6
31 C1008 2009 NA 26
32 C1008 2010 NA 40
33 C1008 2011 42 20
34 C1008 2012 34 83
44 C1009 2005 51 96
45 C1009 2006 66 96
46 C1009 2007 37 NA
47 C1009 2008 58 26
48 C1009 2009 34 22
52 C1012 1996 51 78
53 C1012 1997 70 17
54 C1012 1998 69 41
55 C1012 1999 35 47
56 C1012 2000 37 86
57 C1012 2001 74 92
58 C1012 2002 54 NA
59 C1012 2003 71 67
60 C1012 2004 45 95
61 C1012 2005 42 52
62 C1012 2006 56 58
63 C1012 2007 28 34
64 C1012 2008 51 35
65 C1012 2009 33 2
非常感谢!
这是一个 data.table 解决方案,它依赖 rleid
来仅删除前导 NA:
library(data.table)
dt <- as.data.table(df1)
dt[,
.SD[!(rleid(x1) %in% c(1, max(rleid(x1))) & is.na(x1)) &
!(rleid(x2) %in% c(1, max(rleid(x2))) & is.na(x2))],
by = ID
]
要自动处理多个列,假设它们都以 x
开头,您可以这样做:
dt[dt[, Reduce('&',
lapply(.SD, function(x) !(rleid(x) %in% c(1, max(rleid(x1))) & is.na(x)))),
by = ID,
.SDcols = grep('x', names(dt))]$V1
]
# or using .SD as before
dt[,
.SD[Reduce('&', lapply(.SD, function(x) !(rleid(x) %in% c(1, max(rleid(x1))) & is.na(x)))),
.SDcols = grep('x', names(dt))],
by = ID
]
或与dplyr相同的想法:
library(dplyr)
library(data.table)
df1%>%
group_by(ID)%>%
filter_at(vars(starts_with('x')), all_vars(!(is.na(.) & rleid(.) %in% c(1, max(rleid(.))))))
结果在 42 行中:
# A tibble: 42 x 4
# Groups: ID [4]
ID Year x1 x2
<fct> <int> <dbl> <dbl>
1 C1001 2001 25 54
2 C1001 2002 28 50
3 C1001 2003 35 94
4 C1001 2004 52 34
5 C1001 2005 60 47
6 C1001 2006 NA 9
7 C1001 2007 67 86
8 C1001 2008 58 40
9 C1001 2009 61 73
10 C1001 2010 28 18
# ... with 32 more rows
这是一种使用 filter_at()
的方法,它用 cumsum()
识别前导 NA
值,并用相同的想法识别尾随值,但向量相反。
library(dplyr)
df1 %>%
group_by(ID) %>%
filter_at(vars(-ID, -Year), all_vars(pmin(cumsum(!is.na(.)), rev(cumsum(!is.na(rev(.))))) != 0))
# A tibble: 42 x 4
# Groups: ID [4]
ID Year x1 x2
<fct> <int> <dbl> <dbl>
1 C1001 2001 42 18
2 C1001 2002 20 48
3 C1001 2003 30 26
4 C1001 2004 66 22
5 C1001 2005 32 67
6 C1001 2006 NA 5
7 C1001 2007 24 70
8 C1001 2008 33 35
9 C1001 2009 60 41
10 C1001 2010 66 82
# ... with 32 more rows
使用 data.table 的另一个选项:
f4 <- function(DT) {
setindex(DT, ID)
DT[, rn := .I]
uid <- DT[,.(ID=unique(ID), V=TRUE)]
rows <- rbindlist(lapply(cols, function(x) {
merge(
DT[, V := !is.na(get(x))][uid, on=c("ID", "V"), mult="first", .(ID, S=rn)],
DT[uid, on=c("ID", "V"), mult="last", .(ID, E=rn)],
by="ID")
}))[, .(S=max(S), E=min(E)) , ID]
DT[rows, on=.(ID, rn>=S, rn<=E), .SD]
}
f4(df1)
输出:
ID Year V1 V2 rn V
1: C1001 2001 45 70 6 TRUE
2: C1001 2002 74 78 6 TRUE
3: C1001 2003 48 9 6 TRUE
4: C1001 2004 27 32 6 TRUE
5: C1001 2005 39 3 6 TRUE
6: C1001 2006 NA 89 6 TRUE
7: C1001 2007 22 2 6 TRUE
8: C1001 2008 56 12 6 TRUE
9: C1001 2009 29 34 6 TRUE
10: C1001 2010 30 53 6 TRUE
11: C1001 2011 61 46 6 TRUE
12: C1001 2012 23 42 6 TRUE
13: C1008 2002 NA 95 24 TRUE
14: C1008 2003 71 64 24 TRUE
15: C1008 2004 41 92 24 TRUE
16: C1008 2005 45 28 24 TRUE
17: C1008 2006 74 59 24 TRUE
18: C1008 2007 45 16 24 TRUE
19: C1008 2008 57 64 24 TRUE
20: C1008 2009 NA 35 24 TRUE
21: C1008 2010 NA 2 24 TRUE
22: C1008 2011 32 27 24 TRUE
23: C1008 2012 69 41 24 TRUE
24: C1009 2005 30 24 44 TRUE
25: C1009 2006 43 49 44 TRUE
26: C1009 2007 50 NA 44 FALSE
27: C1009 2008 28 72 44 TRUE
28: C1009 2009 43 20 44 TRUE
29: C1012 1996 36 73 52 TRUE
30: C1012 1997 52 4 52 TRUE
31: C1012 1998 67 14 52 TRUE
32: C1012 1999 39 59 52 TRUE
33: C1012 2000 56 12 52 TRUE
34: C1012 2001 25 92 52 TRUE
35: C1012 2002 26 NA 52 FALSE
36: C1012 2003 73 11 52 TRUE
37: C1012 2004 39 50 52 TRUE
38: C1012 2005 65 89 52 TRUE
39: C1012 2006 70 21 52 TRUE
40: C1012 2007 54 86 52 TRUE
41: C1012 2008 37 70 52 TRUE
42: C1012 2009 66 22 52 TRUE
ID Year V1 V2 rn V
数据:
library(data.table)
df1 <- data.frame(ID=(rep(c("C1001","C1008","C1009","C1012"),each=17)),
Year=(rep(c(1996:2012),4)), V1=(floor(runif(68,20,75))), V2=
(floor(runif(68,1,100))))
df1[1:5,3]<-NA
df1[18:23,4]<-NA
df1[35:42,4]<-NA
df1[49:51,3]<-NA
df1[66:68,3]<-NA
set.seed(123)
df1$V1[rbinom(68,1,0.1)==1]<-NA
df1$V2[rbinom(68,1,0.1)==1]<-NA
setDT(df1)[, rn := .I]
cols <- paste0("V", 1:5)
行数多、组数多的数据时序码:
set.seed(0L)
if ((BIGDATA <- TRUE)) {
nr <- 1e7
nc <- 5
nid <- 1e5
dat <- data.table(ID=sample(nid, nr, TRUE),
as.data.table(matrix(sample(c(1, NA), nr*nc, TRUE), ncol=nc)),
key="ID")
cols <- paste0("V", 1:5)
} else {
df1 <- data.frame(ID=(rep(c("C1001","C1008","C1009","C1012"),each=17)),
Year=(rep(c(1996:2012),4)), V1=(floor(runif(68,20,75))), V2=
(floor(runif(68,1,100))))
df1[1:5,3]<-NA
df1[18:23,4]<-NA
df1[35:42,4]<-NA
df1[49:51,3]<-NA
df1[66:68,3]<-NA
set.seed(123)
df1$V1[rbinom(68,1,0.1)==1]<-NA
df1$V2[rbinom(68,1,0.1)==1]<-NA
dat <- setDT(df1)[, rn := .I]
cols <- paste0("V", 1:2)
}
DT0 <- copy(dat)
DT1 <- copy(dat)
DT2 <- copy(dat)
DT3 <- copy(dat)
DT4 <- copy(dat)
f0 <- function(DT) {
DT[DT[, Reduce('&',
lapply(.SD, function(x) {
r <- rleid(x)
!(r %in% c(1, max(r)) & is.na(x))
})),
ID,
.SDcols=cols]$V1]
}
f1 <- function(DT) {
DT[, c("rn", "rid") := .(.I, rowid(ID))][.N:1L, rev_rid := rowid(ID)]
for (x in cols) {
idx <- DT[is.na(get(x)) & ID %in% DT[is.na(get(x)) & (rid==1L | rev_rid==1L), ID],
if (rid[1L]==1L || rev_rid[.N]==1L) rn,
cumsum(c(0L, diff(rn) > 1L))]$V1
DT <- DT[!rn %in% idx]
}
DT
}
f2 <- function(DT) {
DT[, c("rn", "rid") := .(.I, rowid(ID))][.N:1L, rev_rid := rowid(ID)]
for (x in cols) {
DT <- DT[!rn %in% DT[is.na(get(x)),
if (rid[1L]==1L || rev_rid[.N]==1L) rn,
cumsum(c(0L, diff(rn) > 1L))]$V1]
}
DT
}
f3 <- function(DT) {
DT[, rn := .I]
rows <- DT[, transpose(lapply(.SD, function(x) c(rn[match(TRUE, !is.na(x))],
rev(rn)[match(TRUE, !is.na(rev(x)))]))),
ID, .SDcols=cols][, .(S=max(V1), E=min(V2)) , ID]
DT[rows, on=.(ID, rn>=S, rn<=E), .SD]
}
f4 <- function(DT) {
setindex(DT, ID)
DT[, rn := .I]
uid <- DT[,.(ID=unique(ID), V=TRUE)]
rows <- rbindlist(lapply(cols, function(x) {
merge(
DT[, V := !is.na(get(x))][uid, on=c("ID", "V"), mult="first", .(ID, S=rn)],
DT[uid, on=c("ID", "V"), mult="last", .(ID, E=rn)],
by="ID")
}))[, .(S=max(S), E=min(E)) , ID]
DT[rows, on=.(ID, rn>=S, rn<=E), .SD]
}
microbenchmark::microbenchmark(f0(DT0), f1(DT1), f2(DT2), f3(DT3), f4(DT4), times=3L)
时间:
Unit: seconds
expr min lq mean median uq max neval
f0(DT0) 8.874985 8.950951 8.993281 9.026917 9.052429 9.077942 3
f1(DT1) 16.249656 16.337013 16.657910 16.424370 16.862038 17.299706 3
f2(DT2) 18.225748 18.284212 18.391198 18.342676 18.473922 18.605169 3
f3(DT3) 10.361079 10.612313 10.698897 10.863548 10.867806 10.872063 3
f4(DT4) 3.106936 3.137846 3.173174 3.168755 3.206293 3.243830 3
另一个行数相同但组数少得多的测试:
set.seed(0L)
nr <- 1e7
nc <- 5
nid <- 1e2
dat <- data.table(ID=sample(nid, nr, TRUE),
as.data.table(matrix(sample(c(1, NA), nr*nc, TRUE), ncol=nc)),
key="ID")
cols <- paste0("V", 1:5)
DT0 <- copy(dat)
DT3 <- copy(dat)
microbenchmark::microbenchmark(f0(DT0), f3(DT3), f4(DT4), times=3L)
时间:
Unit: seconds
expr min lq mean median uq max neval
f0(DT0) 2.317905 2.331944 2.358256 2.345983 2.378431 2.410880 3
f3(DT3) 2.108385 2.123889 2.132315 2.139392 2.144280 2.149168 3
f4(DT4) 2.050805 2.079687 2.101211 2.108569 2.126414 2.144258 3