将第二次出现保留在 R 的列中
Keep the second occurrence in a column in R
我有一个非常简单的数据集:
ID Value Time
1 censored 1
1 censored 2
1 uncensored 3
1 uncensored 4
1 censored 5
1 censored 6
2 censored 1
2 uncensored 2
2 uncensored 3
2 uncensored 4
2 censored 5
对于每个 ID,我想保留第一次出现的 uncensored 行,以及在这个 uncensored 之后第一次出现的 censored 行。例如:
ID Value Time
1 uncensored 3
1 censored 5
2 uncensored 2
2 censored 5
并不是每个人的第一个删失日期都是在时间 5,这只是一个例子。
Value 是一个二元变量:1 表示删失(censored),0 表示未删失(uncensored),不过我已经给它们加上了文字标签。
尝试
library(data.table)
# Convert df1 to a data.table by reference, then collect, per ID, the global
# row numbers (.I) of
#   * the first "uncensored" row, and
#   * the first "censored" row that does not belong to the leading run of
#     identical values (run id > 1). Because Value only takes two values,
#     this is the first "censored" row after the first "uncensored" one.
# The run id is computed inside j, so no helper column is left behind in df1.
setDT(df1)
indx <- df1[, {
  run_id <- rleid(Value)
  c(.I[Value == "uncensored"][1L],
    .I[Value == "censored" & run_id > 1L][1L])
}, by = ID]$V1
df1[indx]
# ID Value Time
#1: 1 uncensored 3
#2: 1 censored 5
#3: 2 uncensored 2
#4: 2 censored 5
或者采用与 @Thomas 的回答类似的思路:
# For every ID, find the global row number of the first "uncensored" row and
# of the first "censored" row whose row number is larger, then subset df1
# with the collected indices.
setDT(df1)
indx <- df1[, {
  first_unc <- .I[Value == "uncensored"][1L]
  cens_rows <- .I[Value == "censored"]
  later_cens <- cens_rows[cens_rows > first_unc]
  list(c(first_unc, later_cens[1L]))
}, by = ID]$V1
df1[indx]
# ID Value Time
#1: 1 uncensored 3
#2: 1 censored 5
#3: 2 uncensored 2
#4: 2 censored 5
或使用dplyr
library(dplyr)
# Within each ID, first drop everything before the first "uncensored" row,
# then keep the first "uncensored" and the first "censored" row of what is
# left.
df1 %>%
  group_by(ID) %>%
  slice(match("uncensored", Value):n()) %>%
  slice(match(c("uncensored", "censored"), Value))
# ID Value Time
#1 1 uncensored 3
#2 1 censored 5
#3 2 uncensored 2
#4 2 censored 5
尝试
# For every ID keep the first uncensored row (Value == 0) followed by the
# first censored row (Value == 1) that comes after it.
# Value is expected in its numeric 0/1 coding here. If an ID lacks one of the
# two rows, the corresponding output row is filled with NA.
result <- do.call(rbind, lapply(unique(df$ID), function(i) {
  subdf <- df[which(df$ID == i), ]
  first_unc <- which(subdf$Value == 0)[1L]
  # Compare positions within the full subset, so the index found can be used
  # directly on subdf (no offset needed).
  next_cens <- which(subdf$Value == 1 & seq_len(nrow(subdf)) > first_unc)[1L]
  subdf[c(first_unc, next_cens), ]
}))
假设期望的观察总是存在。
您可以使用标准的拆分-应用-组合策略来执行此操作:
# Split the data by ID, pick the first "uncensored" row and the first
# "censored" row located after it in each piece, and stack the pieces again.
do.call(
  rbind,
  lapply(split(d, d$ID), function(grp) {
    is_unc <- grp$Value == "uncensored"
    is_cens <- grp$Value == "censored"
    first_unc <- which(is_unc)[1]
    next_cens <- which(is_cens & seq_len(nrow(grp)) > first_unc)[1]
    grp[c(first_unc, next_cens), ]
  })
)
结果:
ID Value Time
1.3 1 uncensored 3
1.5 1 censored 5
2.8 2 uncensored 2
2.11 2 censored 5
这是另一个可能的 data.table
解决方案
library(data.table)
# For each ID return two rows: the Time of the first "uncensored" observation
# and the Time of the first "censored" observation that comes after it.
# The censored position is searched forward from the first "uncensored" row,
# so it stays correct when uncensored runs are interrupted by censored rows
# or when the last row of an ID is uncensored (Time is then NA).
setDT(df1)[, {
  first_unc <- match("uncensored", Value)
  next_cens <- which(Value == "censored" & seq_len(.N) > first_unc)[1L]
  list(Value = c("uncensored", "censored"),
       Time = c(Time[first_unc], Time[next_cens]))
}, by = ID]
# ID Value Time
# 1: 1 uncensored 3
# 2: 1 censored 5
# 3: 2 uncensored 2
# 4: 2 censored 5
或者类似地,使用 which
而不是 match
# Same idea expressed with which(): per ID, report the Time of the first
# "uncensored" row and of the first "censored" row positioned after it.
# Searching forward from the first "uncensored" row (instead of stepping one
# past the last one) keeps the result correct for interrupted uncensored
# runs; a missing row yields NA for Time.
setDT(df1)[, {
  unc_pos <- which(Value == "uncensored")
  cens_pos <- which(Value == "censored")
  first_unc <- unc_pos[1L]
  next_cens <- cens_pos[cens_pos > first_unc][1L]
  list(Value = c("uncensored", "censored"),
       Time = c(Time[first_unc], Time[next_cens]))
}, by = ID]
下面的方法可用于找出在某一特定列上取值发生变化(即不再与上一行保持相同)的行,即使该列是具有多个水平的分类列或数值列也同样适用:
# Read the data from the clipboard.
df <- read.table("clipboard")
# a[i] is TRUE when the value in column 2 of row i repeats the value of the
# row directly above it. The first row has no predecessor and is flagged TRUE
# as well, so it is dropped together with the repeats.
# seq_len() keeps the iteration empty for data frames with fewer than two
# rows, and vapply() builds the flags without growing a vector in a loop.
a <- c(
  TRUE,
  vapply(
    seq_len(max(nrow(df) - 1L, 0L)),
    function(i) duplicated(df[i:(i + 1L), 2])[2],
    logical(1)
  )
)
# Keep only the rows where the value changed.
df[!a, ]
既然你提到 Value
是一个二进制变量,这里有另一个使用 dplyr
的想法:
library(dplyr)
df %>%
  group_by(ID) %>%
  ## convert the labels to binary:
  ## 1 for censored, and 0 for uncensored
  mutate(Value = ifelse(Value == "censored", 1, 0)) %>%
  ## keep the first uncensored row (Value == 0) of each ID and only the first
  ## censored row (Value == 1) that comes after it; later censored rows that
  ## follow another uncensored run are not selected
  filter(row_number() %in% c(
    match(0, Value),
    which(Value == 1 & row_number() > match(0, Value))[1L]
  ))
给出:
#Source: local data frame [4 x 3]
#Groups: ID
#
# ID Value Time
#1 1 0 3
#2 1 1 5
#3 2 0 2
#4 2 1 5
我有一个非常简单的数据集:
ID Value Time
1 censored 1
1 censored 2
1 uncensored 3
1 uncensored 4
1 censored 5
1 censored 6
2 censored 1
2 uncensored 2
2 uncensored 3
2 uncensored 4
2 censored 5
对于每个 ID,我想保留第一次出现的 uncensored 行,以及在这个 uncensored 之后第一次出现的 censored 行。例如:
ID Value Time
1 uncensored 3
1 censored 5
2 uncensored 2
2 censored 5
并不是每个人的第一个删失日期都是在时间 5,这只是一个例子。
Value 是一个二元变量:1 表示删失(censored),0 表示未删失(uncensored),不过我已经给它们加上了文字标签。
尝试
library(data.table)
# Convert df1 to a data.table by reference, then collect, per ID, the global
# row numbers (.I) of
#   * the first "uncensored" row, and
#   * the first "censored" row that does not belong to the leading run of
#     identical values (run id > 1). Because Value only takes two values,
#     this is the first "censored" row after the first "uncensored" one.
# The run id is computed inside j, so no helper column is left behind in df1.
setDT(df1)
indx <- df1[, {
  run_id <- rleid(Value)
  c(.I[Value == "uncensored"][1L],
    .I[Value == "censored" & run_id > 1L][1L])
}, by = ID]$V1
df1[indx]
# ID Value Time
#1: 1 uncensored 3
#2: 1 censored 5
#3: 2 uncensored 2
#4: 2 censored 5
或者采用与 @Thomas 的回答类似的思路:
# For every ID, find the global row number of the first "uncensored" row and
# of the first "censored" row whose row number is larger, then subset df1
# with the collected indices.
setDT(df1)
indx <- df1[, {
  first_unc <- .I[Value == "uncensored"][1L]
  cens_rows <- .I[Value == "censored"]
  later_cens <- cens_rows[cens_rows > first_unc]
  list(c(first_unc, later_cens[1L]))
}, by = ID]$V1
df1[indx]
# ID Value Time
#1: 1 uncensored 3
#2: 1 censored 5
#3: 2 uncensored 2
#4: 2 censored 5
或使用dplyr
library(dplyr)
# Within each ID, first drop everything before the first "uncensored" row,
# then keep the first "uncensored" and the first "censored" row of what is
# left.
df1 %>%
  group_by(ID) %>%
  slice(match("uncensored", Value):n()) %>%
  slice(match(c("uncensored", "censored"), Value))
# ID Value Time
#1 1 uncensored 3
#2 1 censored 5
#3 2 uncensored 2
#4 2 censored 5
尝试
# For every ID keep the first uncensored row (Value == 0) followed by the
# first censored row (Value == 1) that comes after it.
# Value is expected in its numeric 0/1 coding here. If an ID lacks one of the
# two rows, the corresponding output row is filled with NA.
result <- do.call(rbind, lapply(unique(df$ID), function(i) {
  subdf <- df[which(df$ID == i), ]
  first_unc <- which(subdf$Value == 0)[1L]
  # Compare positions within the full subset, so the index found can be used
  # directly on subdf (no offset needed).
  next_cens <- which(subdf$Value == 1 & seq_len(nrow(subdf)) > first_unc)[1L]
  subdf[c(first_unc, next_cens), ]
}))
假设期望的观察总是存在。
您可以使用标准的拆分-应用-组合策略来执行此操作:
# Split the data by ID, pick the first "uncensored" row and the first
# "censored" row located after it in each piece, and stack the pieces again.
do.call(
  rbind,
  lapply(split(d, d$ID), function(grp) {
    is_unc <- grp$Value == "uncensored"
    is_cens <- grp$Value == "censored"
    first_unc <- which(is_unc)[1]
    next_cens <- which(is_cens & seq_len(nrow(grp)) > first_unc)[1]
    grp[c(first_unc, next_cens), ]
  })
)
结果:
ID Value Time
1.3 1 uncensored 3
1.5 1 censored 5
2.8 2 uncensored 2
2.11 2 censored 5
这是另一个可能的 data.table
解决方案
library(data.table)
# For each ID return two rows: the Time of the first "uncensored" observation
# and the Time of the first "censored" observation that comes after it.
# The censored position is searched forward from the first "uncensored" row,
# so it stays correct when uncensored runs are interrupted by censored rows
# or when the last row of an ID is uncensored (Time is then NA).
setDT(df1)[, {
  first_unc <- match("uncensored", Value)
  next_cens <- which(Value == "censored" & seq_len(.N) > first_unc)[1L]
  list(Value = c("uncensored", "censored"),
       Time = c(Time[first_unc], Time[next_cens]))
}, by = ID]
# ID Value Time
# 1: 1 uncensored 3
# 2: 1 censored 5
# 3: 2 uncensored 2
# 4: 2 censored 5
或者类似地,使用 which
而不是 match
# Same idea expressed with which(): per ID, report the Time of the first
# "uncensored" row and of the first "censored" row positioned after it.
# Searching forward from the first "uncensored" row (instead of stepping one
# past the last one) keeps the result correct for interrupted uncensored
# runs; a missing row yields NA for Time.
setDT(df1)[, {
  unc_pos <- which(Value == "uncensored")
  cens_pos <- which(Value == "censored")
  first_unc <- unc_pos[1L]
  next_cens <- cens_pos[cens_pos > first_unc][1L]
  list(Value = c("uncensored", "censored"),
       Time = c(Time[first_unc], Time[next_cens]))
}, by = ID]
下面的方法可用于找出在某一特定列上取值发生变化(即不再与上一行保持相同)的行,即使该列是具有多个水平的分类列或数值列也同样适用:
# Read the data from the clipboard.
df <- read.table("clipboard")
# a[i] is TRUE when the value in column 2 of row i repeats the value of the
# row directly above it. The first row has no predecessor and is flagged TRUE
# as well, so it is dropped together with the repeats.
# seq_len() keeps the iteration empty for data frames with fewer than two
# rows, and vapply() builds the flags without growing a vector in a loop.
a <- c(
  TRUE,
  vapply(
    seq_len(max(nrow(df) - 1L, 0L)),
    function(i) duplicated(df[i:(i + 1L), 2])[2],
    logical(1)
  )
)
# Keep only the rows where the value changed.
df[!a, ]
既然你提到 Value
是一个二进制变量,这里有另一个使用 dplyr
的想法:
library(dplyr)
df %>%
  group_by(ID) %>%
  ## convert the labels to binary:
  ## 1 for censored, and 0 for uncensored
  mutate(Value = ifelse(Value == "censored", 1, 0)) %>%
  ## keep the first uncensored row (Value == 0) of each ID and only the first
  ## censored row (Value == 1) that comes after it; later censored rows that
  ## follow another uncensored run are not selected
  filter(row_number() %in% c(
    match(0, Value),
    which(Value == 1 & row_number() > match(0, Value))[1L]
  ))
给出:
#Source: local data frame [4 x 3]
#Groups: ID
#
# ID Value Time
#1 1 0 3
#2 1 1 5
#3 2 0 2
#4 2 1 5