Individuals across time with missing data and gaps
I'm looking for a way to clean up data that may or may not exist in a given time period. I want to follow individuals over time; they may be present from the first period or start in some later period. An individual may have no data after a certain point, or may have gaps in the data. A gap may not show up as a row of NAs; the rows may be missing from the dataset entirely. I want to be able to keep individuals that appear 'n' consecutive times with gaps shorter than 'n' (or by a specific column name).
Drop variable in panel data in R conditional based on a defined number of consecutive observations
The question above is similar to mine. However, for some periods I simply have no data rather than rows of NAs, which is why counting NAs is not enough. I looked into measuring the distance in time instead; it has to reset for every group, and it is tricky for groups that do not start at t = 1.
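As a rough sketch of what I mean by measuring the distance in time (base R; "df" stands for any data set with "id" and "time" columns like the example data built below):

# Lagged time differences computed per group; diff() restarts inside each id,
# so groups do not need to start at t = 1.
step <- ave(df$time, df$id, FUN = function(t) c(NA, diff(t)))
# A difference > 1 marks a row that is missing from the data entirely.
gaps_per_id <- tapply(step > 1, df$id, sum, na.rm = TRUE)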
library(data.table)
library(plyr)    # for count() below

set.seed(5)
data <- data.table(y = rnorm(100))
data[sample(1:100, 40), ] <- NA                 # scatter NAs through y
data1 <- data.table(id = rep(1:10, each = 10),
                    time = seq(1, 10),
                    x = rnorm(100),
                    z = rnorm(100))
data2 <- cbind(data1, data)
data2$row <- 1:nrow(data2)
data2a <- subset(data2, row < 55 | row > 62)    # rows 55-62 vanish entirely
data3 <- data2a[-sample(nrow(data2a), 5)]       # drop 5 more rows at random
View(data3)
count(data3$id)
x freq
1 1 10
2 2 10
3 3 10
4 4 8
5 5 10
6 6 4
7 7 7
8 8 9
9 9 10
10 10 9
Say I want gaps = 0 and at least 5 observations per id. Then I would keep only ids 1, 2, 3, 5, 7, 9 and 10, since those are the groups with gaps = 0, and I would also drop id 6 because it has only 4 observations.
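For reference, the two per-id quantities such a filter needs could be computed with data.table grouping (just a sketch of the criteria, not a solution I have working):

# Number of gaps and number of observations per id.
data3[, .(gaps = sum(diff(time) > 1), N = .N), by = id]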
Please also let me know where you learned the approach, so that I can follow it and learn more.
Desired output:
set.seed(5)
library(data.table)
library(plyr)
data <- data.table(y = rnorm(100))
data[sample(1:100, 40), ] <- NA
data1 <- data.table(id = rep(1:10, each = 10),
                    time = seq(1, 10),
                    x = rnorm(100),
                    z = rnorm(100))
data2 <- cbind(data1, data)
data2$row <- 1:nrow(data2)
data2a <- subset(data2, row < 55 | row > 62)
data3 <- data2a[-sample(nrow(data2a), 5)]
View(data3)
dt <- data.table(count(data3$id))
dt2 <- subset(dt, x != 6 & x != 4)        # drop ids 4 and 6 by hand
View(dt2)
dta <- data3[data3$id %in% dt2$x, ]
dt3 <- subset(dta, id != 8 | time < 7)    # trim id 8 by hand
View(dt3)
print(dt3)
id time x z y row
1: 1 1 1.17085642 0.21083288 -0.84085548 1
2: 1 2 0.88484486 -0.03329921 NA 2
3: 1 3 -1.31788860 2.02519699 NA 3
4: 1 4 -1.64325094 -0.37078675 0.07014277 4
5: 1 5 1.05925039 -1.57823445 NA 5
6: 1 6 0.29008358 -0.12157195 NA 6
7: 1 7 -0.40003350 -1.79667682 NA 7
8: 1 8 1.24309578 -0.47559154 -0.63537131 8
9: 1 9 -1.36641052 -0.88410232 -0.28577363 9
10: 1 10 -1.44141330 -3.49805898 NA 10
11: 2 1 1.34854906 -0.38198337 NA 11
12: 2 2 -1.97852834 0.97768813 NA 12
13: 2 3 -1.24095058 -0.55804095 NA 13
14: 2 4 -0.10403913 -0.62645515 NA 14
15: 2 5 0.73297296 -0.53045123 -1.07176004 15
16: 2 6 0.45567962 1.89762159 -0.13898614 16
17: 2 7 0.28807955 1.39554068 -0.59731309 17
18: 2 8 -1.07369091 -0.74602587 NA 18
19: 2 9 0.64874254 -0.30557308 NA 19
20: 2 10 0.29916228 1.16967817 -0.25935541 20
21: 3 1 -0.79599499 0.30438718 0.90051195 21
22: 3 2 -0.02935340 -0.11749825 0.94186939 22
23: 3 3 2.18023570 -0.06008553 1.46796190 23
24: 3 4 0.95741847 1.47093895 NA 24
25: 3 5 -0.30504863 -1.47814761 0.81900893 25
26: 3 6 -0.41840334 -0.68361295 -0.29348185 26
27: 3 7 0.09995405 0.46054060 NA 27
28: 3 8 -0.22980962 -0.18150193 NA 28
29: 3 9 -1.41521488 -1.15881631 -0.65708209 29
30: 3 10 -0.39259886 0.40901892 -0.85279544 30
31: 5 1 -2.62134481 -1.45565758 1.55006037 41
32: 5 2 2.24625462 0.09378492 NA 42
33: 5 3 0.09343168 0.98234922 NA 43
34: 5 4 1.62728009 -0.59671016 NA 44
35: 5 5 -0.51091755 0.07480485 NA 45
36: 5 6 -0.65938084 2.19742943 0.56222336 46
37: 5 7 -0.04019016 0.79502321 -0.88700851 47
38: 5 8 -0.11869400 -0.53894221 -0.46024458 48
39: 5 9 -0.01965686 -1.60128318 -0.72432849 49
40: 5 10 -0.48567849 -0.73137357 NA 50
41: 7 4 0.97438263 0.96691960 0.49636154 64
42: 7 5 -1.26447348 -0.42332730 -0.76005793 65
43: 7 6 -0.27742142 -0.83159945 -0.34138627 66
44: 7 7 -0.18939869 1.39995727 -2.10232912 67
45: 7 8 -0.38402495 0.01701396 NA 68
46: 7 9 0.74058802 1.84749695 NA 69
47: 7 10 -1.16833839 -0.68633938 -0.27966611 70
48: 8 1 0.66753870 -0.21872403 -0.20409732 71
49: 8 2 0.36623695 0.68259291 -0.22561419 72
50: 8 3 -0.51494299 0.52413002 NA 73
51: 8 4 0.45056824 0.08054998 NA 74
52: 8 5 -0.18772038 0.05378554 NA 75
53: 8 6 1.33906937 -0.73725899 NA 76
54: 9 1 -0.11367818 1.21014609 NA 81
55: 9 2 -0.29510083 0.18865716 NA 82
56: 9 3 0.98916847 1.96249867 0.97552910 83
57: 9 4 -0.77513181 0.13871194 NA 84
58: 9 5 0.27589827 -1.57862735 0.67568448 85
59: 9 6 0.41078165 -0.79702127 NA 86
60: 9 7 0.61118316 1.22435388 2.38723265 87
61: 9 8 0.93657072 -0.36533356 -0.47343201 88
62: 9 9 -0.36754170 -0.16259028 -0.07577256 89
63: 9 10 0.74037676 0.56047918 NA 90
64: 10 2 0.62913443 1.23863449 -1.06241117 92
65: 10 3 0.52774631 0.76743575 0.55703387 93
66: 10 4 -0.47225530 -1.08740911 0.90073058 94
67: 10 5 0.82371516 0.06750377 0.98994568 95
68: 10 6 -0.42778825 1.60514057 0.38360809 96
69: 10 7 -0.14264393 1.23222943 -0.34658381 97
70: 10 8 1.41878305 -0.37911379 -0.54018925 98
71: 10 9 0.48713390 -1.34986658 -0.18255559 99
72: 10 10 0.60344145 0.36491810 NA 100
"lapply" might be useful:
ID <- unique(data3$id)
n  <- lapply(ID, function(i){ which(data3$id == i) })   # row numbers of individual i
tn <- lapply(n,  function(i){ data3$time[i] })          # observed times of individual i
gapCount <- lapply(tn, function(ti){ sum(diff(ti) > 1) })
maxPeriod <- lapply(tn, function(ti){
  gap_idx <- which(diff(ti) > 1)
  ends    <- c(ti[gap_idx], max(ti))      # last time of each connected run
  starts  <- c(min(ti), ti[gap_idx + 1])  # first time of each connected run
  max(ends - starts + 1)                  # length of the longest run
})
obsCount <- lapply(n, length)
#------------------------------------------------------------------------
# Example 1: Remove all individuals with
#            at least one gap or
#            at most 4 observations.
keepTheseIDs_Ex1 <- ID[ which( unlist(gapCount) == 0 & unlist(obsCount) > 4 ) ]
data_Ex1 <- data3[which(data3$id %in% keepTheseIDs_Ex1), ]
#------------------------------------------------------------------------
# Example 2: Remove all individuals with
#            at most 8 observations or
#            no connected period of length at least 5.
keepTheseIDs_Ex2 <- ID[ which( unlist(obsCount) > 8 & unlist(maxPeriod) >= 5 ) ]
data_Ex2 <- data3[which(data3$id %in% keepTheseIDs_Ex2), ]
For each individual "ID[i]":

- "n[[i]]" is the list of row numbers, and
- "tn[[i]]" is the list of times associated with that individual.

If "tn[[i]]" has a gap, i.e. "tn[[i]][j+1] - tn[[i]][j] > 1", then "diff(tn[[i]])" jumps at index "j" to the length of the gap plus 1. This is how the number of gaps is counted and collected in the list "gapCount".

A connected run ends at each index in "which(diff(ti)>1)" and the next run starts at the following index, "which(diff(ti)>1)+1". The differences of the corresponding start and end times (plus 1) therefore give the lengths of the connected runs, and the maximum run length of individual "ID[i]" is the "i"-th entry of the list "maxPeriod".

Individual "ID[i]" has "obsCount[[i]]" observations.
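A quick check of these mechanics on a single id (in the data above, id 8 is observed at times 1-7, 9 and 10):

ti <- c(1:7, 9, 10)                    # observed times of id 8
diff(ti)                               # 1 1 1 1 1 1 2 1 : the 2 marks the gap
sum(diff(ti) > 1)                      # gapCount entry: 1
gap_idx <- which(diff(ti) > 1)         # the gap sits after index 7
ends   <- c(ti[gap_idx], max(ti))      # run ends:   7 10
starts <- c(min(ti), ti[gap_idx + 1])  # run starts: 1  9
max(ends - starts + 1)                 # maxPeriod entry: 7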
Try the package dplyr and use this script:
library(dplyr)

data3 %>%
  data.frame() %>%   # it seems that with data.tables the group_by is lost after mutate
  group_by(id) %>%
  mutate(time_lag_1 = lag(time),
         time_diff = time - time_lag_1,
         N = n()) %>%
  summarise(max_time_diff = max(time_diff, na.rm = TRUE),
            N = unique(N)) %>%
  filter(max_time_diff == 1 &
         N >= 5)
Some explanation of how it works.
The first part:
data3 %>%
  data.frame() %>%
  group_by(id) %>%
  mutate(time_lag_1 = lag(time),
         time_diff = time - time_lag_1,
         N = n())
computes the column "time_lag_1" (the "time" column shifted down by one row) so that you can compare the times of two consecutive rows (the difference is stored in "time_diff"), and counts the number of observations for each "id". Of course, you have to group by "id" first:
# id time x z y row time_lag_1 time_diff N
# 1 1 1 1.17085642 0.21083288 -0.84085548 1 NA NA 10
# 2 1 2 0.88484486 -0.03329921 NA 2 1 1 10
# 3 1 3 -1.31788860 2.02519699 NA 3 2 1 10
# 4 1 4 -1.64325094 -0.37078675 0.07014277 4 3 1 10
# 5 1 5 1.05925039 -1.57823445 NA 5 4 1 10
# 6 1 6 0.29008358 -0.12157195 NA 6 5 1 10
# 7 1 7 -0.40003350 -1.79667682 NA 7 6 1 10
# 8 1 8 1.24309578 -0.47559154 -0.63537131 8 7 1 10
# 9 1 9 -1.36641052 -0.88410232 -0.28577363 9 8 1 10
# 10 1 10 -1.44141330 -3.49805898 NA 10 9 1 10
# 11 2 1 1.34854906 -0.38198337 NA 11 NA NA 10
# 12 2 2 -1.97852834 0.97768813 NA 12 1 1 10
# 13 2 3 -1.24095058 -0.55804095 NA 13 2 1 10
# 14 2 4 -0.10403913 -0.62645515 NA 14 3 1 10
# 15 2 5 0.73297296 -0.53045123 -1.07176004 15 4 1 10
# 16 2 6 0.45567962 1.89762159 -0.13898614 16 5 1 10
# 17 2 7 0.28807955 1.39554068 -0.59731309 17 6 1 10
# 18 2 8 -1.07369091 -0.74602587 NA 18 7 1 10
# 19 2 9 0.64874254 -0.30557308 NA 19 8 1 10
# 20 2 10 0.29916228 1.16967817 -0.25935541 20 9 1 10
# 21 3 1 -0.79599499 0.30438718 0.90051195 21 NA NA 10
# 22 3 2 -0.02935340 -0.11749825 0.94186939 22 1 1 10
# 23 3 3 2.18023570 -0.06008553 1.46796190 23 2 1 10
# 24 3 4 0.95741847 1.47093895 NA 24 3 1 10
# 25 3 5 -0.30504863 -1.47814761 0.81900893 25 4 1 10
# 26 3 6 -0.41840334 -0.68361295 -0.29348185 26 5 1 10
# 27 3 7 0.09995405 0.46054060 NA 27 6 1 10
# 28 3 8 -0.22980962 -0.18150193 NA 28 7 1 10
# 29 3 9 -1.41521488 -1.15881631 -0.65708209 29 8 1 10
# 30 3 10 -0.39259886 0.40901892 -0.85279544 30 9 1 10
# 31 4 1 0.94608855 -0.25820706 0.31591504 31 NA NA 8
# 32 4 2 0.75177087 -0.26689944 1.10969417 32 1 1 8
# 33 4 4 0.80833598 -0.39345895 NA 34 2 2 8
# 34 4 5 -0.61453522 -1.84373725 NA 35 4 1 8
# 35 4 6 1.23825893 -1.54228827 0.95157383 36 5 1 8
# 36 4 7 -0.33809514 -0.58624036 NA 37 6 1 8
# 37 4 8 1.19636636 -0.85213891 -2.00047274 38 7 1 8
# 38 4 9 -0.44331838 0.77832456 -1.76218587 39 8 1 8
# 39 5 1 -2.62134481 -1.45565758 1.55006037 41 NA NA 10
# 40 5 2 2.24625462 0.09378492 NA 42 1 1 10
# 41 5 3 0.09343168 0.98234922 NA 43 2 1 10
# 42 5 4 1.62728009 -0.59671016 NA 44 3 1 10
# 43 5 5 -0.51091755 0.07480485 NA 45 4 1 10
# 44 5 6 -0.65938084 2.19742943 0.56222336 46 5 1 10
# 45 5 7 -0.04019016 0.79502321 -0.88700851 47 6 1 10
# 46 5 8 -0.11869400 -0.53894221 -0.46024458 48 7 1 10
# 47 5 9 -0.01965686 -1.60128318 -0.72432849 49 8 1 10
# 48 5 10 -0.48567849 -0.73137357 NA 50 9 1 10
# 49 6 1 -1.44014752 -0.35574079 NA 51 NA NA 4
# 50 6 2 0.14376888 -0.98541432 0.18772610 52 1 1 4
# 51 6 3 -1.23458665 -0.73117064 1.02202286 53 2 1 4
# 52 6 4 -1.75250121 1.46532408 -0.59183483 54 3 1 4
# 53 7 4 0.97438263 0.96691960 0.49636154 64 NA NA 7
# 54 7 5 -1.26447348 -0.42332730 -0.76005793 65 4 1 7
# 55 7 6 -0.27742142 -0.83159945 -0.34138627 66 5 1 7
# 56 7 7 -0.18939869 1.39995727 -2.10232912 67 6 1 7
# 57 7 8 -0.38402495 0.01701396 NA 68 7 1 7
# 58 7 9 0.74058802 1.84749695 NA 69 8 1 7
# 59 7 10 -1.16833839 -0.68633938 -0.27966611 70 9 1 7
# 60 8 1 0.66753870 -0.21872403 -0.20409732 71 NA NA 9
# 61 8 2 0.36623695 0.68259291 -0.22561419 72 1 1 9
# 62 8 3 -0.51494299 0.52413002 NA 73 2 1 9
# 63 8 4 0.45056824 0.08054998 NA 74 3 1 9
# 64 8 5 -0.18772038 0.05378554 NA 75 4 1 9
# 65 8 6 1.33906937 -0.73725899 NA 76 5 1 9
# 66 8 7 0.81621918 0.96643806 0.97348539 77 6 1 9
# 67 8 9 -0.65086272 0.18729094 0.18917369 79 7 2 9
# 68 8 10 0.72640902 0.27298575 -0.56288507 80 9 1 9
# 69 9 1 -0.11367818 1.21014609 NA 81 NA NA 10
# 70 9 2 -0.29510083 0.18865716 NA 82 1 1 10
# 71 9 3 0.98916847 1.96249867 0.97552910 83 2 1 10
# 72 9 4 -0.77513181 0.13871194 NA 84 3 1 10
# 73 9 5 0.27589827 -1.57862735 0.67568448 85 4 1 10
# 74 9 6 0.41078165 -0.79702127 NA 86 5 1 10
# 75 9 7 0.61118316 1.22435388 2.38723265 87 6 1 10
# 76 9 8 0.93657072 -0.36533356 -0.47343201 88 7 1 10
# 77 9 9 -0.36754170 -0.16259028 -0.07577256 89 8 1 10
# 78 9 10 0.74037676 0.56047918 NA 90 9 1 10
# 79 10 2 0.62913443 1.23863449 -1.06241117 92 NA NA 9
# 80 10 3 0.52774631 0.76743575 0.55703387 93 2 1 9
# 81 10 4 -0.47225530 -1.08740911 0.90073058 94 3 1 9
# 82 10 5 0.82371516 0.06750377 0.98994568 95 4 1 9
# 83 10 6 -0.42778825 1.60514057 0.38360809 96 5 1 9
# 84 10 7 -0.14264393 1.23222943 -0.34658381 97 6 1 9
# 85 10 8 1.41878305 -0.37911379 -0.54018925 98 7 1 9
# 86 10 9 0.48713390 -1.34986658 -0.18255559 99 8 1 9
# 87 10 10 0.60344145 0.36491810 NA 100 9 1 9
The second part:
summarise(max_time_diff = max(time_diff, na.rm = TRUE),
          N = unique(N))
computes, for each "id", the maximum difference between consecutive times (this is what detects your gaps) and keeps "N" (its unique value, since "N" is the same for every row of a given "id"):
# Source: local data frame [10 x 3]
#
# id max_time_diff N
# 1 1 1 10
# 2 2 1 10
# 3 3 1 10
# 4 4 2 8
# 5 5 1 10
# 6 6 1 4
# 7 7 1 7
# 8 8 2 9
# 9 9 1 10
# 10 10 1 9
The last part then simply does the filtering, and you get:
# Source: local data frame [7 x 3]
#
# id max_time_diff N
# 1 1 1 10
# 2 2 1 10
# 3 3 1 10
# 4 5 1 10
# 5 7 1 7
# 6 9 1 10
# 7 10 1 9
You can add %>% select(id) at the end to keep only the ids that satisfy your filter conditions.
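If you want the filtered rows rather than just the ids, one option (a sketch, not part of the pipeline above) is to join the kept ids back to the data, for example with dplyr's semi_join:

kept_ids <- data3 %>%
  data.frame() %>%
  group_by(id) %>%
  mutate(time_diff = time - lag(time),
         N = n()) %>%
  summarise(max_time_diff = max(time_diff, na.rm = TRUE),
            N = unique(N)) %>%
  filter(max_time_diff == 1 & N >= 5) %>%
  select(id)

# Keep only the rows of data3 whose id appears in kept_ids.
data3 %>% data.frame() %>% semi_join(kept_ids, by = "id")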