删除未出现在每个时间段和数据框中的组
Deleting groups that don't appear in every time period and data frame
我有多个数据框,每个数据框包含一年的数据,并且每个数据框内有多个时间段;我正在清理这些数据。我想删除没有在(数据框内)每个时间段都出现的组,并删除没有在每个数据框中都出现的组。换句话说,我只想保留在每个数据框的每个时间段内都存在的组。下面我创建了示例数据,其中包含 ID、时间变量以及两个代表我的数据的变量。我的真实数据还会有更多的数据框、ID、组和变量。
# Example data for the question: three data frames (df, df2, df3), one per
# year, each with an id column, a time-period column t and two random value
# columns x1 and x2. Not every id is present in every time period.
# (A stray interactive `?df` help lookup was removed from the middle of this
# setup: it opened a help page and played no part in building the data.)

# Year 1: ids 200 and 300 occur in all four periods; 100 and 400 do not.
t <- c(1, 1, 2, 2, 3, 3, 3, 4, 4, 4)
id <- c(200, 300, 200, 300, 100, 200, 300, 200, 300, 400)
x1 <- rnorm(10)
x2 <- rnorm(10)
df <- data.frame(id, t, x1, x2)

# Year 2: id 400 occurs only in periods 1 and 3.
t <- c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4)
id <- c(200, 300, 400, 200, 300, 200, 300, 400, 200, 300)
x1 <- rnorm(10)
x2 <- rnorm(10)
df2 <- data.frame(id, t, x1, x2)

# Year 3: id 600 occurs only in period 2 and id 100 only in period 4.
id <- c(200, 300, 200, 300, 600, 200, 300, 100, 200, 300)
t <- c(1, 1, 2, 2, 2, 3, 3, 4, 4, 4)
x1 <- rnorm(10)
x2 <- rnorm(10)
df3 <- data.frame(id, t, x1, x2)

# Stack the three years row-wise (30 rows, 4 columns) and print the result.
rb <- rbind(df, df2, df3)
rb

# Put the three years side by side (10 rows, 12 columns) and print the result.
# Note that the column names id, t, x1, x2 are repeated three times.
cb <- cbind(df, df2, df3)
cb
id t x1 x2 id t x1 x2 id t x1 x2
1 200 1 0.37223136 -0.04918183 200 1 0.6489171399 -0.1324335 200 1 -0.41387676 -0.4566678425
2 300 1 -0.22062416 0.05150952 300 1 -0.3669090613 3.0826144 300 1 0.48237987 -0.0325861333
3 200 2 0.32912208 1.03922999 400 1 0.9347859735 0.1026632 200 2 -0.31308242 -0.3021501845
4 300 2 -0.18172302 -1.41669927 200 2 0.4814364147 -0.1087465 300 2 -1.52273626 0.6357750776
5 100 3 -0.81072008 0.64522238 300 2 -0.5676866296 0.2371230 600 2 -0.09687669 2.2883585934
6 200 3 0.45175343 0.64197622 200 3 0.0006852893 0.5830704 200 3 0.01726120 -0.5905109745
7 300 3 0.40465989 -0.70796588 300 3 -0.0008717189 -1.1969493 300 3 -0.18603781 0.3722390396
8 200 4 0.09852108 -1.76958443 400 3 0.9343534507 -1.3671447 100 4 -0.57308316 0.4749221706
9 300 4 -0.53951022 0.97306346 200 4 1.9176422485 0.9879788 200 4 0.40222133 0.3278821640
10 400 4 0.24271562 -1.37269617 300 4 1.4298971045 1.6095265 300 4 0.85799186 0.0006593401
我的最终输出是这样的:
id t x1 x2
200 1 0.37223136 -0.04918183
300 1 -0.22062416 0.05150952
200 2 0.32912208 1.03922999
300 2 -0.18172302 -1.41669927
200 3 0.45175343 0.64197622
300 3 0.40465989 -0.70796588
200 4 0.09852108 -1.76958443
300 4 -0.53951022 0.97306346
200 1 0.6489171399 -0.1324335
300 1 -0.3669090613 3.0826144
200 2 0.4814364147 -0.1087465
300 2 -0.5676866296 0.2371230
200 3 0.0006852893 0.5830704
300 3 -0.0008717189 -1.1969493
200 4 1.9176422485 0.9879788
300 4 1.4298971045 1.6095265
200 1 -0.41387676 -0.4566678425
300 1 0.48237987 -0.0325861333
200 2 -0.31308242 -0.3021501845
300 2 -1.52273626 0.6357750776
200 3 0.01726120 -0.5905109745
300 3 -0.18603781 0.3722390396
200 4 0.40222133 0.3278821640
300 4 0.85799186 0.0006593401
一种策略是统计每个 `id` 与 `t` 的组合出现的次数:如果某个 ID 的组合数等于可能的最大值,就保留该 ID。(我用 `max` 来得到最大可能的组合数,但这只有在至少有一个 `id` 出现在每个 `t` 中时才有效。)
这里我用 `plyr` 包中的 `adply` 来代替您的 `rbind` 步骤,因为 `adply` 会保留每一行来自哪个数据框的信息(在 `X1` 列中)。
library(plyr)
# Stack the three data frames. adply() is used instead of rbind() because the
# result carries an X1 column recording which list element each row came from.
rb <- adply(.data = list(df, df2, df3), .margins = 1)

# Reduce to one row per distinct (data frame, id, time period) combination.
unique_combo <- rb[, c("X1", "id", "t")]
unique_combo <- unique_combo[!duplicated(unique_combo), , drop = FALSE]
## X1 id t
## 1 1 200 1
## 2 1 300 1
## 3 1 200 2
## 4 1 300 2
## 5 1 100 3
## 6 1 200 3
## 7 1 300 3
## 8 1 200 4
## 9 1 300 4
## 10 1 400 4
## 11 2 200 1
## 12 2 300 1 etc.

# Count, for every id, how many distinct combinations it appears in.
combos_per_id <- aggregate(t ~ id, data = unique_combo, FUN = length)
## id t
## 1 100 2
## 2 200 12
## 3 300 12
## 4 400 3
## 5 600 1

# Keep the ids whose count equals the largest count observed.
ids_you_want <- combos_per_id[
  which(combos_per_id$t == max(combos_per_id$t)),
]
## id t
## 2 200 12
## 3 300 12

# Print every row of the stacked data that belongs to one of those ids.
rb[is.element(rb$id, ids_you_want$id), ]
## X1 id t x1 x2
## 1 1 200 1 0.41800060 -0.729280896
## 2 1 300 1 -1.26310444 0.649438361
## 3 1 200 2 1.75130801 0.340464369
## 4 1 300 2 -0.47751518 -1.396611139
## 6 1 200 3 -0.11537438 -1.483654622
## 7 1 300 3 -1.33689508 -1.219725112 etc.
编辑:处理额外的一列
library(plyr)
# Rebuild the three example data frames with an additional character column r.
# Only the first row of the first data frame has r == "b"; every other row
# has r == "a".

# First data frame.
t <- c(1, 1, 2, 2, 3, 3, 3, 4, 4, 4)
id <- c(200, 300, 200, 300, 100, 200, 300, 200, 300, 400)
x1 <- rnorm(10)
x2 <- rnorm(10)
r <- c("b", rep("a", 9))
df <- data.frame(id, t, x1, x2, r)

# Second data frame.
t <- c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4)
id <- c(200, 300, 400, 200, 300, 200, 300, 400, 200, 300)
x1 <- rnorm(10)
x2 <- rnorm(10)
r <- rep("a", 10)
df2 <- data.frame(id, t, x1, x2, r)

# Third data frame.
id <- c(200, 300, 200, 300, 600, 200, 300, 100, 200, 300)
t <- c(1, 1, 2, 2, 2, 3, 3, 4, 4, 4)
x1 <- rnorm(10)
x2 <- rnorm(10)
r <- rep("a", 10)
df3 <- data.frame(id, t, x1, x2, r)
# Stack the three data frames; the X1 column of the result identifies the
# list element each row came from.
rb <- adply(.data = list(df, df2, df3), .margins = 1)

# One row per distinct (data frame, id, time period, r) combination.
unique_combo <- rb[, c("X1", "id", "t", "r")]
unique_combo <- unique_combo[!duplicated(unique_combo), , drop = FALSE]

# Count the combinations for every (id, r) pair and print the counts.
combos_per_id <- aggregate(t ~ id + r, data = unique_combo, FUN = length)
print(combos_per_id)

# Keep the (id, r) pairs whose count equals the largest count observed.
ids_you_want <- combos_per_id[
  which(combos_per_id$t == max(combos_per_id$t)),
]

# Print every row of the stacked data whose id is among those kept.
rb[is.element(rb$id, ids_you_want$id), ]
这有点蛮力,但应该可行:
# Find the ids that are present in every time period of every data frame.
# Each data frame in `frames` must have an `id` column and a `t` column.
frames <- list(df, df2, df3)

# Number of distinct values in a vector.
lu <- function(x) {
  length(unique(x))
}

# Number of distinct time periods in each data frame. The count is taken on
# the `t` column; calling lu() on a whole data frame would count its columns.
timePeriodsPerDataframe <- vapply(frames, function(x) lu(x$t), integer(1))

# For each data frame, the ids (as character names) that were observed in as
# many distinct periods as that data frame contains.
idsPerFrame <- lapply(seq_along(frames), function(i) {
  # Group the periods by id, so the result is named by id.
  periodsPerId <- tapply(frames[[i]]$t, frames[[i]]$id, lu)
  # which() drops NA counts (e.g. unused factor levels of id).
  names(periodsPerId)[which(periodsPerId == timePeriodsPerDataframe[i])]
})

# Ids that qualify in every data frame, converted back to numbers.
IDsInEveryTimePeriod <- as.numeric(Reduce(intersect, idsPerFrame))
我有多个数据框,每个数据框包含一年的数据,并且每个数据框内有多个时间段;我正在清理这些数据。我想删除没有在(数据框内)每个时间段都出现的组,并删除没有在每个数据框中都出现的组。换句话说,我只想保留在每个数据框的每个时间段内都存在的组。下面我创建了示例数据,其中包含 ID、时间变量以及两个代表我的数据的变量。我的真实数据还会有更多的数据框、ID、组和变量。
# Example data for the question: three data frames (df, df2, df3), one per
# year, each with an id column, a time-period column t and two random value
# columns x1 and x2. Not every id is present in every time period.
# (A stray interactive `?df` help lookup was removed from the middle of this
# setup: it opened a help page and played no part in building the data.)

# Year 1: ids 200 and 300 occur in all four periods; 100 and 400 do not.
t <- c(1, 1, 2, 2, 3, 3, 3, 4, 4, 4)
id <- c(200, 300, 200, 300, 100, 200, 300, 200, 300, 400)
x1 <- rnorm(10)
x2 <- rnorm(10)
df <- data.frame(id, t, x1, x2)

# Year 2: id 400 occurs only in periods 1 and 3.
t <- c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4)
id <- c(200, 300, 400, 200, 300, 200, 300, 400, 200, 300)
x1 <- rnorm(10)
x2 <- rnorm(10)
df2 <- data.frame(id, t, x1, x2)

# Year 3: id 600 occurs only in period 2 and id 100 only in period 4.
id <- c(200, 300, 200, 300, 600, 200, 300, 100, 200, 300)
t <- c(1, 1, 2, 2, 2, 3, 3, 4, 4, 4)
x1 <- rnorm(10)
x2 <- rnorm(10)
df3 <- data.frame(id, t, x1, x2)

# Stack the three years row-wise (30 rows, 4 columns) and print the result.
rb <- rbind(df, df2, df3)
rb

# Put the three years side by side (10 rows, 12 columns) and print the result.
# Note that the column names id, t, x1, x2 are repeated three times.
cb <- cbind(df, df2, df3)
cb
id t x1 x2 id t x1 x2 id t x1 x2
1 200 1 0.37223136 -0.04918183 200 1 0.6489171399 -0.1324335 200 1 -0.41387676 -0.4566678425
2 300 1 -0.22062416 0.05150952 300 1 -0.3669090613 3.0826144 300 1 0.48237987 -0.0325861333
3 200 2 0.32912208 1.03922999 400 1 0.9347859735 0.1026632 200 2 -0.31308242 -0.3021501845
4 300 2 -0.18172302 -1.41669927 200 2 0.4814364147 -0.1087465 300 2 -1.52273626 0.6357750776
5 100 3 -0.81072008 0.64522238 300 2 -0.5676866296 0.2371230 600 2 -0.09687669 2.2883585934
6 200 3 0.45175343 0.64197622 200 3 0.0006852893 0.5830704 200 3 0.01726120 -0.5905109745
7 300 3 0.40465989 -0.70796588 300 3 -0.0008717189 -1.1969493 300 3 -0.18603781 0.3722390396
8 200 4 0.09852108 -1.76958443 400 3 0.9343534507 -1.3671447 100 4 -0.57308316 0.4749221706
9 300 4 -0.53951022 0.97306346 200 4 1.9176422485 0.9879788 200 4 0.40222133 0.3278821640
10 400 4 0.24271562 -1.37269617 300 4 1.4298971045 1.6095265 300 4 0.85799186 0.0006593401
我的最终输出是这样的:
id t x1 x2
200 1 0.37223136 -0.04918183
300 1 -0.22062416 0.05150952
200 2 0.32912208 1.03922999
300 2 -0.18172302 -1.41669927
200 3 0.45175343 0.64197622
300 3 0.40465989 -0.70796588
200 4 0.09852108 -1.76958443
300 4 -0.53951022 0.97306346
200 1 0.6489171399 -0.1324335
300 1 -0.3669090613 3.0826144
200 2 0.4814364147 -0.1087465
300 2 -0.5676866296 0.2371230
200 3 0.0006852893 0.5830704
300 3 -0.0008717189 -1.1969493
200 4 1.9176422485 0.9879788
300 4 1.4298971045 1.6095265
200 1 -0.41387676 -0.4566678425
300 1 0.48237987 -0.0325861333
200 2 -0.31308242 -0.3021501845
300 2 -1.52273626 0.6357750776
200 3 0.01726120 -0.5905109745
300 3 -0.18603781 0.3722390396
200 4 0.40222133 0.3278821640
300 4 0.85799186 0.0006593401
一种策略是统计每个 `id` 与 `t` 的组合出现的次数:如果某个 ID 的组合数等于可能的最大值,就保留该 ID。(我用 `max` 来得到最大可能的组合数,但这只有在至少有一个 `id` 出现在每个 `t` 中时才有效。)
这里我用 `plyr` 包中的 `adply` 来代替您的 `rbind` 步骤,因为 `adply` 会保留每一行来自哪个数据框的信息(在 `X1` 列中)。
library(plyr)
# Stack the three data frames. adply() is used instead of rbind() because the
# result carries an X1 column recording which list element each row came from.
rb <- adply(.data = list(df, df2, df3), .margins = 1)

# Reduce to one row per distinct (data frame, id, time period) combination.
unique_combo <- rb[, c("X1", "id", "t")]
unique_combo <- unique_combo[!duplicated(unique_combo), , drop = FALSE]
## X1 id t
## 1 1 200 1
## 2 1 300 1
## 3 1 200 2
## 4 1 300 2
## 5 1 100 3
## 6 1 200 3
## 7 1 300 3
## 8 1 200 4
## 9 1 300 4
## 10 1 400 4
## 11 2 200 1
## 12 2 300 1 etc.

# Count, for every id, how many distinct combinations it appears in.
combos_per_id <- aggregate(t ~ id, data = unique_combo, FUN = length)
## id t
## 1 100 2
## 2 200 12
## 3 300 12
## 4 400 3
## 5 600 1

# Keep the ids whose count equals the largest count observed.
ids_you_want <- combos_per_id[
  which(combos_per_id$t == max(combos_per_id$t)),
]
## id t
## 2 200 12
## 3 300 12

# Print every row of the stacked data that belongs to one of those ids.
rb[is.element(rb$id, ids_you_want$id), ]
## X1 id t x1 x2
## 1 1 200 1 0.41800060 -0.729280896
## 2 1 300 1 -1.26310444 0.649438361
## 3 1 200 2 1.75130801 0.340464369
## 4 1 300 2 -0.47751518 -1.396611139
## 6 1 200 3 -0.11537438 -1.483654622
## 7 1 300 3 -1.33689508 -1.219725112 etc.
编辑:处理额外的一列
library(plyr)
# Rebuild the three example data frames with an additional character column r.
# Only the first row of the first data frame has r == "b"; every other row
# has r == "a".

# First data frame.
t <- c(1, 1, 2, 2, 3, 3, 3, 4, 4, 4)
id <- c(200, 300, 200, 300, 100, 200, 300, 200, 300, 400)
x1 <- rnorm(10)
x2 <- rnorm(10)
r <- c("b", rep("a", 9))
df <- data.frame(id, t, x1, x2, r)

# Second data frame.
t <- c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4)
id <- c(200, 300, 400, 200, 300, 200, 300, 400, 200, 300)
x1 <- rnorm(10)
x2 <- rnorm(10)
r <- rep("a", 10)
df2 <- data.frame(id, t, x1, x2, r)

# Third data frame.
id <- c(200, 300, 200, 300, 600, 200, 300, 100, 200, 300)
t <- c(1, 1, 2, 2, 2, 3, 3, 4, 4, 4)
x1 <- rnorm(10)
x2 <- rnorm(10)
r <- rep("a", 10)
df3 <- data.frame(id, t, x1, x2, r)
# Stack the three data frames; the X1 column of the result identifies the
# list element each row came from.
rb <- adply(.data = list(df, df2, df3), .margins = 1)

# One row per distinct (data frame, id, time period, r) combination.
unique_combo <- rb[, c("X1", "id", "t", "r")]
unique_combo <- unique_combo[!duplicated(unique_combo), , drop = FALSE]

# Count the combinations for every (id, r) pair and print the counts.
combos_per_id <- aggregate(t ~ id + r, data = unique_combo, FUN = length)
print(combos_per_id)

# Keep the (id, r) pairs whose count equals the largest count observed.
ids_you_want <- combos_per_id[
  which(combos_per_id$t == max(combos_per_id$t)),
]

# Print every row of the stacked data whose id is among those kept.
rb[is.element(rb$id, ids_you_want$id), ]
这有点蛮力,但应该可行:
# Find the ids that are present in every time period of every data frame.
# Each data frame in `frames` must have an `id` column and a `t` column.
frames <- list(df, df2, df3)

# Number of distinct values in a vector.
lu <- function(x) {
  length(unique(x))
}

# Number of distinct time periods in each data frame. The count is taken on
# the `t` column; calling lu() on a whole data frame would count its columns.
timePeriodsPerDataframe <- vapply(frames, function(x) lu(x$t), integer(1))

# For each data frame, the ids (as character names) that were observed in as
# many distinct periods as that data frame contains.
idsPerFrame <- lapply(seq_along(frames), function(i) {
  # Group the periods by id, so the result is named by id.
  periodsPerId <- tapply(frames[[i]]$t, frames[[i]]$id, lu)
  # which() drops NA counts (e.g. unused factor levels of id).
  names(periodsPerId)[which(periodsPerId == timePeriodsPerDataframe[i])]
})

# Ids that qualify in every data frame, converted back to numbers.
IDsInEveryTimePeriod <- as.numeric(Reduce(intersect, idsPerFrame))