如何在 R 中查找和标记唯一序列的起点/终点
How to find and label start / end points of unique sequences in R
我有一个由 1 和 0 组成的序列以及一个时间向量。我想找到所有连续 1 序列的开始和结束时间点,并给每个序列分配一个唯一的 ID。下面是一些示例数据和我目前的尝试。
创建虚拟数据
# Example input: a 0/1 indicator vector containing three runs of 1s
x <- c(
  0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
  0, 0, 0, 1, 1, 1, 1, 0
)
# Matching time stamps, one per element of x
t <- seq(10L, 34L)
这是我的尝试:
# Get change points: diff(x)[i] is x[i + 1] - x[i], so 1 marks a 0 -> 1 step
# and -1 marks a 1 -> 0 step.
diff_result <- diff(x)
# Keep the time at each change point and NA elsewhere. diff_result is one
# element shorter than t, so only the first length(diff_result) values of t
# are used by ifelse().
on_t <- ifelse(diff_result == 1, t, NA)
off_t <- ifelse(diff_result == -1, t, NA)
# Drop the NAs and combine. A 0 -> 1 step at position i means the run starts
# at element i + 1; adding 1 to the time value matches that only because t
# advances in steps of 1.
# NOTE: a run that begins at the first element or ends at the last element
# produces no change point, so the two columns can differ in length and the
# data frame is then wrong or cannot be built.
results <- data.frame(
  on_t = on_t[!is.na(on_t)] + 1,
  off_t = off_t[!is.na(off_t)]
)
# Unique ID per run. seq_len() stays empty when no run was found, whereas
# 1:nrow(results) would give c(1, 0) and make the assignment fail.
results$ID <- factor(seq_len(nrow(results)))
print(results)
on_t off_t ID
1 14 17 1
2 21 26 2
3 30 33 3
我确定有更好的方法...
这是找到上述向量的起始和停止位置的一种方法:
# Positions of the 1s
onePos <- which(x == 1)
# Indices into onePos after which the next 1 is not adjacent, i.e. the
# boundaries between consecutive runs. Computed once and used for both ends.
gaps <- which(diff(onePos) != 1)
# A run ends right before each gap, and at the last 1
stopPos <- c(onePos[gaps], tail(onePos, 1))
# A run starts at the first 1, and right after each gap.
# head()/tail() return empty vectors when x contains no 1s, so startPos and
# stopPos are both empty in that case (onePos[1] would give NA instead).
startPos <- c(head(onePos, 1), onePos[gaps + 1])
t 中对应的值可以通过取子集得到:
t[startPos]
[1] 14 21 30
t[stopPos]
[1] 17 26 33
最后,添加一个id:
# One row per run: a sequential id plus the start and end times
df <- data.frame(
  id = seq_along(startPos),
  on_t = t[startPos],
  off_t = t[stopPos]
)
你也可以这样做
# Example input: a 0/1 indicator vector
x <- c(
  0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
  0, 0, 0, 1, 1, 1, 1, 0
)
# Create the time vector
t <- 10:34
xy <- data.frame(x, t)
# Lengths of the runs of consecutive equal values in x
mr <- rle(xy$x)$lengths
# Label every row with the letter of the run it belongs to. seq_along()
# avoids the c(1, 0) that 1:length(mr) produces when there are no runs.
# `letters` has 26 entries, so this labelling covers at most 26 runs.
xy$group <- rep(letters[seq_along(mr)], times = mr)
# Keep only the rows that lie inside runs of 1s
onesies <- xy[xy$x == 1, ]
# Summarise each run: time of its first row, time of its last row, its label.
# The argument is named `run` so it does not shadow the outer vector x.
out <- by(onesies, INDICES = onesies$group,
  FUN = function(run) {
    data.frame(
      on_t = run$t[1],
      off_t = run$t[nrow(run)],
      ID = unique(run$group)
    )
  })
do.call("rbind", out)
on_t off_t ID
b 14 17 b
d 21 26 d
f 30 33 f
另一种选择是将两个向量放入一个 data.table 中,然后执行典型的分组(group by)、过滤(filter)和变换(mutate)操作:
library(data.table)
dt <- data.table(seq = x, time = t)
# Group by run: rleid() gives each run of equal values its own ID. For every
# run keep the minimum and maximum time plus the run's value as a label.
runs <- dt[, .(on_t = min(time), off_t = max(time), lab = unique(seq)),
           by = .(ID = rleid(seq))]
# Filter on the label so that only the runs of 1s remain
ones <- runs[lab == 1]
# Remove the label and recreate the ID as 1, 2, ...; the trailing [] prints
# the result. (Written as separate statements because a line that begins
# with `[` cannot continue the previous, already complete, expression.)
ones[, `:=`(lab = NULL, ID = seq_along(ID))][]
#    ID on_t off_t
# 1:  1   14    17
# 2:  2   21    26
# 3:  3   30    33
按照 OP 的逻辑,这可能是更好的方法:
# Pad x with a 0 on both sides so that a run touching either end of x still
# produces a change point. d has length(x) + 1 elements and d[k] compares
# element k of x with element k - 1 (treating both out-of-range neighbours
# as 0).
d <- diff(c(0, x, 0))
# d[k] == 1  -> a run of 1s starts at element k      (k = 1 .. length(x))
# d[k] == -1 -> a run of 1s ended at element k - 1   (k = 2 .. length(x) + 1)
# Trim d to length(x) on the appropriate side so each logical index lines up
# with t exactly, instead of indexing t with an over-long logical vector.
starts <- head(d, -1) == 1
ends <- tail(d, -1) == -1
results <- data.frame(on_t = t[starts], off_t = t[ends])
# Create an ID; seq_len() gives an empty vector when there are no runs,
# whereas 1:nrow(results) would give c(1, 0) and fail.
results$ID <- seq_len(nrow(results))
results
#   on_t off_t ID
# 1   14    17  1
# 2   21    26  2
# 3   30    33  3
我有一个由 1 和 0 组成的序列以及一个时间向量。我想找到所有连续 1 序列的开始和结束时间点,并给每个序列分配一个唯一的 ID。下面是一些示例数据和我目前的尝试。
创建虚拟数据
# Example input: a 0/1 indicator vector containing three runs of 1s
x <- c(
  0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
  0, 0, 0, 1, 1, 1, 1, 0
)
# Matching time stamps, one per element of x
t <- seq(10L, 34L)
这是我的尝试:
# Get change points: diff(x)[i] is x[i + 1] - x[i], so 1 marks a 0 -> 1 step
# and -1 marks a 1 -> 0 step.
diff_result <- diff(x)
# Keep the time at each change point and NA elsewhere. diff_result is one
# element shorter than t, so only the first length(diff_result) values of t
# are used by ifelse().
on_t <- ifelse(diff_result == 1, t, NA)
off_t <- ifelse(diff_result == -1, t, NA)
# Drop the NAs and combine. A 0 -> 1 step at position i means the run starts
# at element i + 1; adding 1 to the time value matches that only because t
# advances in steps of 1.
# NOTE: a run that begins at the first element or ends at the last element
# produces no change point, so the two columns can differ in length and the
# data frame is then wrong or cannot be built.
results <- data.frame(
  on_t = on_t[!is.na(on_t)] + 1,
  off_t = off_t[!is.na(off_t)]
)
# Unique ID per run. seq_len() stays empty when no run was found, whereas
# 1:nrow(results) would give c(1, 0) and make the assignment fail.
results$ID <- factor(seq_len(nrow(results)))
print(results)
on_t off_t ID
1 14 17 1
2 21 26 2
3 30 33 3
我确定有更好的方法...
这是找到上述向量的起始和停止位置的一种方法:
# Positions of the 1s
onePos <- which(x == 1)
# Indices into onePos after which the next 1 is not adjacent, i.e. the
# boundaries between consecutive runs. Computed once and used for both ends.
gaps <- which(diff(onePos) != 1)
# A run ends right before each gap, and at the last 1
stopPos <- c(onePos[gaps], tail(onePos, 1))
# A run starts at the first 1, and right after each gap.
# head()/tail() return empty vectors when x contains no 1s, so startPos and
# stopPos are both empty in that case (onePos[1] would give NA instead).
startPos <- c(head(onePos, 1), onePos[gaps + 1])
t 中对应的值可以通过取子集得到:
t[startPos]
[1] 14 21 30
t[stopPos]
[1] 17 26 33
最后,添加一个id:
# One row per run: a sequential id plus the start and end times
df <- data.frame(
  id = seq_along(startPos),
  on_t = t[startPos],
  off_t = t[stopPos]
)
你也可以这样做
# Example input: a 0/1 indicator vector
x <- c(
  0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
  0, 0, 0, 1, 1, 1, 1, 0
)
# Create the time vector
t <- 10:34
xy <- data.frame(x, t)
# Lengths of the runs of consecutive equal values in x
mr <- rle(xy$x)$lengths
# Label every row with the letter of the run it belongs to. seq_along()
# avoids the c(1, 0) that 1:length(mr) produces when there are no runs.
# `letters` has 26 entries, so this labelling covers at most 26 runs.
xy$group <- rep(letters[seq_along(mr)], times = mr)
# Keep only the rows that lie inside runs of 1s
onesies <- xy[xy$x == 1, ]
# Summarise each run: time of its first row, time of its last row, its label.
# The argument is named `run` so it does not shadow the outer vector x.
out <- by(onesies, INDICES = onesies$group,
  FUN = function(run) {
    data.frame(
      on_t = run$t[1],
      off_t = run$t[nrow(run)],
      ID = unique(run$group)
    )
  })
do.call("rbind", out)
on_t off_t ID
b 14 17 b
d 21 26 d
f 30 33 f
另一种选择是将两个向量放入一个 data.table 中,然后执行典型的分组(group by)、过滤(filter)和变换(mutate)操作:
library(data.table)
dt <- data.table(seq = x, time = t)
# Group by run: rleid() gives each run of equal values its own ID. For every
# run keep the minimum and maximum time plus the run's value as a label.
runs <- dt[, .(on_t = min(time), off_t = max(time), lab = unique(seq)),
           by = .(ID = rleid(seq))]
# Filter on the label so that only the runs of 1s remain
ones <- runs[lab == 1]
# Remove the label and recreate the ID as 1, 2, ...; the trailing [] prints
# the result. (Written as separate statements because a line that begins
# with `[` cannot continue the previous, already complete, expression.)
ones[, `:=`(lab = NULL, ID = seq_along(ID))][]
#    ID on_t off_t
# 1:  1   14    17
# 2:  2   21    26
# 3:  3   30    33
按照 OP 的逻辑,这可能是更好的方法:
# Pad x with a 0 on both sides so that a run touching either end of x still
# produces a change point. d has length(x) + 1 elements and d[k] compares
# element k of x with element k - 1 (treating both out-of-range neighbours
# as 0).
d <- diff(c(0, x, 0))
# d[k] == 1  -> a run of 1s starts at element k      (k = 1 .. length(x))
# d[k] == -1 -> a run of 1s ended at element k - 1   (k = 2 .. length(x) + 1)
# Trim d to length(x) on the appropriate side so each logical index lines up
# with t exactly, instead of indexing t with an over-long logical vector.
starts <- head(d, -1) == 1
ends <- tail(d, -1) == -1
results <- data.frame(on_t = t[starts], off_t = t[ends])
# Create an ID; seq_len() gives an empty vector when there are no runs,
# whereas 1:nrow(results) would give c(1, 0) and fail.
results$ID <- seq_len(nrow(results))
results
#   on_t off_t ID
# 1   14    17  1
# 2   21    26  2
# 3   30    33  3