按起始值和结束值标识行块
ID chunks of rows by start and end value
我需要根据起始行和结束行标准来标识 data.table 中的行块。在下面的 MWE 中,开始行由 colA=="d" 定义,并且该组继续直到 colA=="a"
library(data.table)
in.data <- data.table(colA=c("b", "f", "b", "k", "d", "b", "a", "s", "a", "n", "d", "f", "d", "a", "t"))
in.data$wanted.column <- c(NA, NA, NA, NA, 1, 1, 1, NA, NA, NA, 2, 2, 2, 2, NA)
in.data
# colA wanted.column
# 1: b NA
# 2: f NA
# 3: b NA
# 4: k NA
# 5: d 1
# 6: b 1
# 7: a 1
# 8: s NA
# 9: a NA
# 10: n NA
# 11: d 2
# 12: f 2
# 13: d 2
# 14: a 2
# 15: t NA
(组外值为NA、零或任何其他可识别结果无关紧要)
老实说有点讨厌,但对我有用:
library(data.table)
in.data <- data.table(colA=c("b", "f", "b", "k", "d", "b", "a", "s", "a", "n", "d", "f", "d", "a", "t"))
in.data$out <- rep(NA,nrow(in.data))
activator <- FALSE
counter <- 1
for (i in 1:nrow(in.data)) {
if (activator == TRUE & in.data$colA[i] !='a') {
in.data$out[i] <- counter
next
}
if( in.data$colA[i]=='d') {
activator <- TRUE
in.data$out[i] <- counter
} else if (in.data$colA[i]=='a' & activator==TRUE ) {
in.data$out[i] <- counter
counter <- counter + 1
activator <- FALSE
} else {next}
}
in.data
输出:
> in.data
colA out
1: b NA
2: f NA
3: b NA
4: k NA
5: d 1
6: b 1
7: a 1
8: s NA
9: a NA
10: n NA
11: d 2
12: f 2
13: d 2
14: a 2
15: t NA
如果你愿意,你可以做一个 sapply
,但是 if
语句太多了,for-loop
可能更容易阅读......
我相信有人会想出一个不错的 data.table
解决方案。在等待期间,还有另一种 base
可能性:
in.df <- as.data.frame(in.data)
# index of "d", start index
start <- which(in.df$colA == "d")
# index of "a"
idx_a <- which(in.df$colA == "a")
# end index: for each start index, select the first index of "a" which is larger
end <- a[sapply(start, function(x) which.max(x < idx_a))]
# check if runs overlap and create groups of runs
lag_end <- c(0, head(end, -1))
run <- cumsum(start >= lag_end)
df <- data.frame(start, end, run)
# within each run, expand the sequence of idx, from min(start) to max(end)
df2 <- do.call(rbind,
by(df, df$run, function(x){
data.frame(run = x$run, idx = min(x$start):max(x$end))
})
)
# add an empty 'run' variable to in.df
in.df$run <- NA
# assign df2$run at idx in in.data
in.df$run[df2$idx] <- df2$run
# idx colA wanted.column run
# 1 1 b NA NA
# 2 2 f NA NA
# 3 3 b NA NA
# 4 4 k NA NA
# 5 5 d 1 1
# 6 6 b 1 1
# 7 7 a 1 1
# 8 8 s NA NA
# 9 9 a NA NA
# 10 10 n NA NA
# 11 11 d 2 2
# 12 12 f 2 2
# 13 13 d 2 2
# 14 14 a 2 2
# 15 15 t NA NA
同样,在 base-R 中,有点令人讨厌,但迭代次数较少且没有 if else。
library(data.table)
in.data <- data.table(colA=c("b", "f", "b", "k", "d", "b", "a", "s", "a", "n", "d", "f", "d", "a", "t"))
in.data$out <- rep(NA,nrow(in.data))
d <- which(in.data$colA=="d")
a <- which(in.data$colA=="a")
end <- rep(NA, length(d))
for (i in seq_along(d)){
begin <- d[i]
if(begin>=max(a)) # this cdn accomodates a case where no "a" appears after some "d"
break
end[i] <- min(a[d[i]<a])
in.data$out[begin: end[i]] <- sum(!is.na(unique(end)))
}
in.data
# colA out
# 1: b NA
# 2: f NA
# 3: b NA
# 4: k NA
# 5: d 1
# 6: b 1
# 7: a 1
# 8: s NA
# 9: a NA
#10: n NA
#11: d 2
#12: f 2
#13: d 2
#14: a 2
#15: t NA
更新
答案的原始版本寻找最短的序列,这是不正确的,因为它们可以在中间包含起始符号,例如c('d','f','d','a')
。答案的编辑版本修复了这个问题
更新2
我被告知当两个序列相互跟随时(例如in.data <- data.table(colA=c("b", "f", "b", "k", "d", "b", "a", "d", "f", "d", "a", "t"))
),它们被列为一个解决方案,这是错误的。在这里,我通过跟踪 colA
.
中 symbol.stop
符号的出现来解决这个问题
设置
library(data.table)
in.data <- data.table(colA=c("b", "f", "b", "k", "d", "b", "a", "s", "a", "n", "d", "f", "d", "a", "t"))
symbol.start='d'
symbol.stop='a'
实际代码
in.data[,y := rev(cumsum(rev(colA)==symbol.stop))][,out:=(!match(symbol.start,colA,nomatch=.N+1)>1:.N),by=y]
in.data$out[in.data$out] <- as.factor(max(in.data$y)-in.data$y[in.data$out])
此处,[,y := rev(cumsum(rev(colA)==symbol.stop))]
创建了一个列 y
,可用于根据背面出现的 symbol.stop
对数据集进行分组。 [,out:=(!match(symbol.start,colA,nomatch=.N+1)>1:.N),by=y]
表达式 returns 一个布尔向量,指示一行是否属于 start.symbol...end.symbol
序列。需要下一行来枚举这样的序列。
清理输出
in.data$y <- NULL
in.data
# colA out
# 1: b 0
# 2: f 0
# 3: b 0
# 4: k 0
# 5: d 1
# 6: b 1
# 7: a 1
# 8: s 0
# 9: a 0
# 10: n 0
# 11: d 2
# 12: f 2
# 13: d 2
# 14: a 2
# 15: t 0
更新3
以防万一有人需要它,单行解决方案:
in.data[ , y := rev(cumsum(rev(colA)==symbol.stop))
][ , z:=(!match(symbol.start,colA,nomatch=.N+1)>1:.N), by=y
][ z==T, out:=as.numeric(factor(y,levels=unique(y)))
][ , c('z','y'):=list(NULL,NULL)]
事实证明这很复杂,但它没有循环或匹配(因此应该很快):
library(zoo)
in.data[, newcol := (colA=='d') - (colA=='a')
][newcol == 0 & 1:.N > 1, newcol := NA
][, newcol := na.locf(newcol, F)
][newcol < 0, newcol := 0
][, newcol := newcol * cumsum(diff(c(0, newcol)) != 0)
][newcol == 0 & c(NA, head(newcol, -1)), newcol := NA
][, newcol := na.locf(newcol, F)
][newcol != 0, newcol := .GRP, by = newcol][]
# colA wanted.column newcol
# 1: b NA 0
# 2: f NA 0
# 3: b NA 0
# 4: k NA 0
# 5: d 1 1
# 6: b 1 1
# 7: a 1 1
# 8: s NA 0
# 9: a NA 0
#10: n NA 0
#11: d 2 2
#12: f 2 2
#13: d 2 2
#14: a 2 2
#15: t NA 0
每一步都非常简单,如果您运行一步一步来,应该是不言自明的。
没有经过严格测试,但这里有另一个:
require(data.table)
cj_dt = CJ(which(in.data$colA == "d"), which(in.data$colA == "a"))[V1 <= V2]
idx1 = cj_dt[, if (.N > 1) list(V2 = V2[1L]), by=V1]
idx2 = cj_dt[!idx1][, list(V1 = V1[1L]), by=V2]
ans = rbind(idx1, idx2)
# V1 V2
# 1: 5 7
# 2: 11 14
现在我们要做的就是将 wanted.column
的 5:7, 11:14
替换为 1
。
有没有人看到这会中断的情况?
我需要根据起始行和结束行标准来标识 data.table 中的行块。在下面的 MWE 中,开始行由 colA=="d" 定义,并且该组继续直到 colA=="a"
library(data.table)
in.data <- data.table(colA=c("b", "f", "b", "k", "d", "b", "a", "s", "a", "n", "d", "f", "d", "a", "t"))
in.data$wanted.column <- c(NA, NA, NA, NA, 1, 1, 1, NA, NA, NA, 2, 2, 2, 2, NA)
in.data
# colA wanted.column
# 1: b NA
# 2: f NA
# 3: b NA
# 4: k NA
# 5: d 1
# 6: b 1
# 7: a 1
# 8: s NA
# 9: a NA
# 10: n NA
# 11: d 2
# 12: f 2
# 13: d 2
# 14: a 2
# 15: t NA
(组外值为NA、零或任何其他可识别结果无关紧要)
老实说有点讨厌,但对我有用:
library(data.table)
in.data <- data.table(colA=c("b", "f", "b", "k", "d", "b", "a", "s", "a", "n", "d", "f", "d", "a", "t"))
in.data$out <- rep(NA,nrow(in.data))
activator <- FALSE
counter <- 1
for (i in 1:nrow(in.data)) {
if (activator == TRUE & in.data$colA[i] !='a') {
in.data$out[i] <- counter
next
}
if( in.data$colA[i]=='d') {
activator <- TRUE
in.data$out[i] <- counter
} else if (in.data$colA[i]=='a' & activator==TRUE ) {
in.data$out[i] <- counter
counter <- counter + 1
activator <- FALSE
} else {next}
}
in.data
输出:
> in.data
colA out
1: b NA
2: f NA
3: b NA
4: k NA
5: d 1
6: b 1
7: a 1
8: s NA
9: a NA
10: n NA
11: d 2
12: f 2
13: d 2
14: a 2
15: t NA
如果你愿意,你可以做一个 sapply
,但是 if
语句太多了,for-loop
可能更容易阅读......
我相信有人会想出一个不错的 data.table
解决方案。在等待期间,还有另一种 base
可能性:
in.df <- as.data.frame(in.data)
# index of "d", start index
start <- which(in.df$colA == "d")
# index of "a"
idx_a <- which(in.df$colA == "a")
# end index: for each start index, select the first index of "a" which is larger
end <- a[sapply(start, function(x) which.max(x < idx_a))]
# check if runs overlap and create groups of runs
lag_end <- c(0, head(end, -1))
run <- cumsum(start >= lag_end)
df <- data.frame(start, end, run)
# within each run, expand the sequence of idx, from min(start) to max(end)
df2 <- do.call(rbind,
by(df, df$run, function(x){
data.frame(run = x$run, idx = min(x$start):max(x$end))
})
)
# add an empty 'run' variable to in.df
in.df$run <- NA
# assign df2$run at idx in in.data
in.df$run[df2$idx] <- df2$run
# idx colA wanted.column run
# 1 1 b NA NA
# 2 2 f NA NA
# 3 3 b NA NA
# 4 4 k NA NA
# 5 5 d 1 1
# 6 6 b 1 1
# 7 7 a 1 1
# 8 8 s NA NA
# 9 9 a NA NA
# 10 10 n NA NA
# 11 11 d 2 2
# 12 12 f 2 2
# 13 13 d 2 2
# 14 14 a 2 2
# 15 15 t NA NA
同样,在 base-R 中,有点令人讨厌,但迭代次数较少且没有 if else。
library(data.table)
in.data <- data.table(colA=c("b", "f", "b", "k", "d", "b", "a", "s", "a", "n", "d", "f", "d", "a", "t"))
in.data$out <- rep(NA,nrow(in.data))
d <- which(in.data$colA=="d")
a <- which(in.data$colA=="a")
end <- rep(NA, length(d))
for (i in seq_along(d)){
begin <- d[i]
if(begin>=max(a)) # this cdn accomodates a case where no "a" appears after some "d"
break
end[i] <- min(a[d[i]<a])
in.data$out[begin: end[i]] <- sum(!is.na(unique(end)))
}
in.data
# colA out
# 1: b NA
# 2: f NA
# 3: b NA
# 4: k NA
# 5: d 1
# 6: b 1
# 7: a 1
# 8: s NA
# 9: a NA
#10: n NA
#11: d 2
#12: f 2
#13: d 2
#14: a 2
#15: t NA
更新
答案的原始版本寻找最短的序列,这是不正确的,因为它们可以在中间包含起始符号,例如c('d','f','d','a')
。答案的编辑版本修复了这个问题
更新2
我被告知当两个序列相互跟随时(例如in.data <- data.table(colA=c("b", "f", "b", "k", "d", "b", "a", "d", "f", "d", "a", "t"))
),它们被列为一个解决方案,这是错误的。在这里,我通过跟踪 colA
.
symbol.stop
符号的出现来解决这个问题
设置
library(data.table)
in.data <- data.table(colA=c("b", "f", "b", "k", "d", "b", "a", "s", "a", "n", "d", "f", "d", "a", "t"))
symbol.start='d'
symbol.stop='a'
实际代码
in.data[,y := rev(cumsum(rev(colA)==symbol.stop))][,out:=(!match(symbol.start,colA,nomatch=.N+1)>1:.N),by=y]
in.data$out[in.data$out] <- as.factor(max(in.data$y)-in.data$y[in.data$out])
此处,[,y := rev(cumsum(rev(colA)==symbol.stop))]
创建了一个列 y
,可用于根据背面出现的 symbol.stop
对数据集进行分组。 [,out:=(!match(symbol.start,colA,nomatch=.N+1)>1:.N),by=y]
表达式 returns 一个布尔向量,指示一行是否属于 start.symbol...end.symbol
序列。需要下一行来枚举这样的序列。
清理输出
in.data$y <- NULL
in.data
# colA out
# 1: b 0
# 2: f 0
# 3: b 0
# 4: k 0
# 5: d 1
# 6: b 1
# 7: a 1
# 8: s 0
# 9: a 0
# 10: n 0
# 11: d 2
# 12: f 2
# 13: d 2
# 14: a 2
# 15: t 0
更新3
以防万一有人需要它,单行解决方案:
in.data[ , y := rev(cumsum(rev(colA)==symbol.stop))
][ , z:=(!match(symbol.start,colA,nomatch=.N+1)>1:.N), by=y
][ z==T, out:=as.numeric(factor(y,levels=unique(y)))
][ , c('z','y'):=list(NULL,NULL)]
事实证明这很复杂,但它没有循环或匹配(因此应该很快):
library(zoo)
in.data[, newcol := (colA=='d') - (colA=='a')
][newcol == 0 & 1:.N > 1, newcol := NA
][, newcol := na.locf(newcol, F)
][newcol < 0, newcol := 0
][, newcol := newcol * cumsum(diff(c(0, newcol)) != 0)
][newcol == 0 & c(NA, head(newcol, -1)), newcol := NA
][, newcol := na.locf(newcol, F)
][newcol != 0, newcol := .GRP, by = newcol][]
# colA wanted.column newcol
# 1: b NA 0
# 2: f NA 0
# 3: b NA 0
# 4: k NA 0
# 5: d 1 1
# 6: b 1 1
# 7: a 1 1
# 8: s NA 0
# 9: a NA 0
#10: n NA 0
#11: d 2 2
#12: f 2 2
#13: d 2 2
#14: a 2 2
#15: t NA 0
每一步都非常简单,如果您运行一步一步来,应该是不言自明的。
没有经过严格测试,但这里有另一个:
require(data.table)
cj_dt = CJ(which(in.data$colA == "d"), which(in.data$colA == "a"))[V1 <= V2]
idx1 = cj_dt[, if (.N > 1) list(V2 = V2[1L]), by=V1]
idx2 = cj_dt[!idx1][, list(V1 = V1[1L]), by=V2]
ans = rbind(idx1, idx2)
# V1 V2
# 1: 5 7
# 2: 11 14
现在我们要做的就是将 wanted.column
的 5:7, 11:14
替换为 1
。
有没有人看到这会中断的情况?