在 R 中按 ID 移动一行
Shift one row by ID in R
我有一个数据框,想创建一个新变量 "Begin1",规则是:在同一个 ID 内,如果某一行的 "Begin" 小于上一行的 "End"(即两个区间重叠),
就用上一行 "End" 的值替换该行的 "Begin";否则保持 "Begin" 不变。
# Example data: three IDs owning 3, 5 and 4 intervals respectively.
ID <- rep(c(1, 3, 4), times = c(3, 5, 4))
Begin <- c(
  0, 2.5, 5,           # ID 1
  7, 8, 7, 25, 25,     # ID 3
  10, 15, 17, 20       # ID 4
)
End <- c(
  1.5, 3.5, 6,         # ID 1
  7.5, 8, 11, 29, 35,  # ID 3
  12, 19, 21, 28       # ID 4
)
# One row per interval; columns are named after the vectors above.
df <- data.frame(ID = ID, Begin = Begin, End = End)
df
ID Begin End
1 1 0.0 1.5
2 1 2.5 3.5
3 1 5.0 6.0
4 3 7.0 7.5
5 3 8.0 8.0
6 3 7.0 11.0**
7 3 25.0 29.0
8 3 25.0 35.0**
9 4 10.0 12.0
10 4 15.0 19.0
11 4 17.0 21.0**
12 4 20.0 28.0**
请看带 ** 标记的行(第 6、8、11、12 行)。以 ID 为 3 的第 6 行为例:"Begin" = 7.0,小于上一行的 "End" = 8.0,因此设置 "Begin1" = 8.0。同样,ID 为 3 的第 8 行 "Begin" = 25,小于上一行的 "End" = 29,因此设置 "Begin1" = 29,以此类推。期望的输出如下:
ID Begin Begin1 End
1 1 0.0 0.0 1.5
2 1 2.5 2.5 3.5
3 1 5.0 5.0 6.0
4 3 7.0 7.0 7.5
5 3 8.0 8.0 8.0
6 3 7.0 8.0 11.0**
7 3 25.0 25.0 29.0
8 3 25.0 29.0 35.0**
9 4 10.0 10.0 12.0
10 4 15.0 15.0 19.0
11 4 17.0 19.0 21.0**
12 4 20.0 21.0 28.0**
感谢您的建议
这是更新
# Updated example data: same intervals as before plus a Group column.
ID <- rep(c(1, 3, 4), times = c(3, 5, 4))
Group <- c(
  1, 1, 2,             # ID 1
  1, 1, 1, 2, 2,       # ID 3
  1, 1, 1, 2           # ID 4
)
Begin <- c(
  0, 2.5, 5,           # ID 1
  7, 8, 7, 25, 25,     # ID 3
  10, 15, 17, 20       # ID 4
)
End <- c(
  1.5, 3.5, 6,         # ID 1
  7.5, 8, 11, 29, 35,  # ID 3
  12, 19, 21, 28       # ID 4
)
df <- data.frame(ID = ID, Group = Group, Begin = Begin, End = End)
这次我想同时按 ID 和 Group 分组,但 data.table 的写法出了问题。
这是输出
ID Group Begin End Begin1
1 1 1 0.0 1.5 0.0
2 1 1 2.5 3.5 2.5
3 1 2 5.0 6.0 5.0
4 3 1 7.0 7.5 7.0
5 3 1 8.0 8.0 8.0
6 3 1 7.0 11.0 8.0
7 3 2 25.0 29.0 25.0
8 3 2 25.0 35.0 29.0
9 4 1 10.0 12.0 35.0
10 4 1 15.0 19.0 15.0
11 4 1 17.0 21.0 19.0
12 4 2 20.0 28.0 20.0**** Right here is not change bc it's group 2
这是 dplyr 包的结果,它可以工作,但是 data.table 不工作
library(dplyr)

# Within each (ID, Group) block, raise Begin to the previous row's End when the
# two intervals overlap. The first row of a block has no predecessor (the lag
# is NA); na.rm = TRUE makes pmax() fall back to Begin there.
df %>%
  group_by(ID, Group) %>%
  mutate(
    Begin1 = pmax(Begin, dplyr::lag(End), na.rm = TRUE)
  )
Source: local data frame [12 x 5]
Groups: ID, Group [6]
ID Group Begin End Begin1
(dbl) (dbl) (dbl) (dbl) (dbl)
1 1 1 0.0 1.5 0.0
2 1 1 2.5 3.5 2.5
3 1 2 5.0 6.0 5.0
4 3 1 7.0 7.5 7.0
5 3 1 8.0 8.0 8.0
6 3 1 7.0 11.0 8.0
7 3 2 25.0 29.0 25.0
8 3 2 25.0 35.0 29.0
9 4 1 10.0 12.0 10.0
10 4 1 15.0 19.0 15.0
11 4 1 17.0 21.0 19.0
12 4 2 20.0 28.0 20.0**** It works
我们可以使用 data.table
library(data.table)

# Convert df to a data.table by reference and start with Begin1 equal to Begin.
setDT(df)[, Begin1 := Begin]

# Global row numbers (.I) of rows whose Begin lies before the previous row's
# End within the same ID. fill = Begin[1L] makes the first row of each ID
# compare Begin against itself, so a first row is never flagged.
i1 <- df[, .I[Begin < shift(End, fill = Begin[1L])], by = ID]$V1

# A flagged row is never the first row of its ID, so i1 - 1L is the preceding
# row of the same ID. Assign with := so the update happens by reference rather
# than copying the whole table through `$<-`.
prev_end <- df$End[i1 - 1L]
df[i1, Begin1 := prev_end]
df
# ID Begin End Begin1
# 1: 1 0.0 1.5 0.0
# 2: 1 2.5 3.5 2.5
# 3: 1 5.0 6.0 5.0
# 4: 3 7.0 7.5 7.0
# 5: 3 8.0 8.0 8.0
# 6: 3 7.0 11.0 8.0
# 7: 3 25.0 29.0 25.0
# 8: 3 25.0 35.0 29.0
# 9: 4 10.0 12.0 10.0
#10: 4 15.0 19.0 15.0
#11: 4 17.0 21.0 19.0
#12: 4 20.0 28.0 21.0
或者另一种选择是
# Step 1: store the previous row's End (per ID) in Begin1; the first row of
# each ID receives NA.
setDT(df)
df[, Begin1 := shift(End), by = ID]
# Step 2: keep the shifted End only where Begin < Begin1 (an overlap). Every
# other row, including the NA first rows, falls back to its own Begin.
df[!which(Begin < Begin1), Begin1 := Begin]
df
# ID Begin End Begin1
# 1: 1 0.0 1.5 0.0
# 2: 1 2.5 3.5 2.5
# 3: 1 5.0 6.0 5.0
# 4: 3 7.0 7.5 7.0
# 5: 3 8.0 8.0 8.0
# 6: 3 7.0 11.0 8.0
# 7: 3 25.0 29.0 25.0
# 8: 3 25.0 35.0 29.0
# 9: 4 10.0 12.0 10.0
#10: 4 15.0 19.0 15.0
#11: 4 17.0 21.0 19.0
#12: 4 20.0 28.0 21.0
或使用dplyr
library(dplyr)

# Per ID: element-wise maximum of Begin and the previous row's End. The first
# row of each ID has an NA lag, which na.rm = TRUE resolves to Begin.
df %>%
  group_by(ID) %>%
  mutate(
    Begin1 = pmax(Begin, dplyr::lag(End), na.rm = TRUE)
  )
# ID Begin End Begin1
# <dbl> <dbl> <dbl> <dbl>
#1 1 0.0 1.5 0.0
#2 1 2.5 3.5 2.5
#3 1 5.0 6.0 5.0
#4 3 7.0 7.5 7.0
#5 3 8.0 8.0 8.0
#6 3 7.0 11.0 8.0
#7 3 25.0 29.0 25.0
#8 3 25.0 35.0 29.0
#9 4 10.0 12.0 10.0
#10 4 15.0 19.0 15.0
#11 4 17.0 21.0 19.0
#12 4 20.0 28.0 21.0
更新
基于 OP 的新数据
# Step 1: store the previous row's End in Begin1, computed separately for each
# (ID, Group) combination; the first row of each combination receives NA.
setDT(df)
df[, Begin1 := shift(End), by = .(ID, Group)]
# Step 2: keep the shifted End only where Begin < Begin1 (an overlap). Every
# other row, including the NA first rows, falls back to its own Begin.
df[!which(Begin < Begin1), Begin1 := Begin]
df
# ID Group Begin End Begin1
#1: 1 1 0.0 1.5 0.0
#2: 1 1 2.5 3.5 2.5
#3: 1 2 5.0 6.0 5.0
#4: 3 1 7.0 7.5 7.0
#5: 3 1 8.0 8.0 8.0
#6: 3 1 7.0 11.0 8.0
#7: 3 2 25.0 29.0 25.0
#8: 3 2 25.0 35.0 29.0
#9: 4 1 10.0 12.0 10.0
#10: 4 1 15.0 19.0 15.0
#11: 4 1 17.0 21.0 19.0
#12: 4 2 20.0 28.0 20.0
下面是使用 data.table 的另一种方式,关键点如下:
- by 语句:按 ID 分组计算
- shift 函数:取 End 变量的滞后值,用来与 Begin 比较
- pmax 函数:执行逐元素的最大值(max)计算
代码如下:
library(data.table)

# Build a data.table from df.
dt <- as.data.table(df)
# Per ID: element-wise maximum of Begin and the previous row's End. The first
# row of each ID has an NA lag, which na.rm = TRUE resolves to Begin.
dt[
  ,
  Begin1 := pmax(Begin, shift(End, type = "lag"), na.rm = TRUE),
  by = ID
]
下面是一种基础 R 的方法:基于 End 列的滞后值(lag),使用 ifelse 创建新列。
# Previous row's End within each ID; the first row of every ID gets NA.
# The shift is done by hand per ID because stats::lag() does not move the
# values of a plain vector, and an ungrouped lag would leak the last End of one
# ID into the first row of the next.
prev_end <- ave(df$End, df$ID, FUN = function(x) c(NA, x)[seq_along(x)])
# Use the previous End when it overlaps the current Begin, otherwise keep Begin.
df$Begin1 <- ifelse(
  !is.na(prev_end) & df$Begin <= prev_end,
  prev_end,
  df$Begin
)
df
#    ID Begin  End Begin1
# 1   1   0.0  1.5    0.0
# 2   1   2.5  3.5    2.5
# 3   1   5.0  6.0    5.0
# 4   3   7.0  7.5    7.0
# 5   3   8.0  8.0    8.0
# 6   3   7.0 11.0    8.0
# 7   3  25.0 29.0   25.0
# 8   3  25.0 35.0   29.0
# 9   4  10.0 12.0   10.0
# 10  4  15.0 19.0   15.0
# 11  4  17.0 21.0   19.0
# 12  4  20.0 28.0   21.0
我有一个数据框,想创建一个新变量 "Begin1",规则是:在同一个 ID 内,如果某一行的 "Begin" 小于上一行的 "End"(即两个区间重叠),
就用上一行 "End" 的值替换该行的 "Begin";否则保持 "Begin" 不变。
ID <- c(rep(1,3), rep(3, 5), rep(4,4))
# Interval bounds, listed in the same row order as ID.
Begin <- c(
  0, 2.5, 5,           # ID 1
  7, 8, 7, 25, 25,     # ID 3
  10, 15, 17, 20       # ID 4
)
End <- c(
  1.5, 3.5, 6,         # ID 1
  7.5, 8, 11, 29, 35,  # ID 3
  12, 19, 21, 28       # ID 4
)
# One row per interval; columns are named after the vectors.
df <- data.frame(ID = ID, Begin = Begin, End = End)
df
ID Begin End
1 1 0.0 1.5
2 1 2.5 3.5
3 1 5.0 6.0
4 3 7.0 7.5
5 3 8.0 8.0
6 3 7.0 11.0**
7 3 25.0 29.0
8 3 25.0 35.0**
9 4 10.0 12.0
10 4 15.0 19.0
11 4 17.0 21.0**
12 4 20.0 28.0**
请看带 ** 标记的行(第 6、8、11、12 行)。以 ID 为 3 的第 6 行为例:"Begin" = 7.0,小于上一行的 "End" = 8.0,因此设置 "Begin1" = 8.0。同样,ID 为 3 的第 8 行 "Begin" = 25,小于上一行的 "End" = 29,因此设置 "Begin1" = 29,以此类推。期望的输出如下:
ID Begin Begin1 End
1 1 0.0 0.0 1.5
2 1 2.5 2.5 3.5
3 1 5.0 5.0 6.0
4 3 7.0 7.0 7.5
5 3 8.0 8.0 8.0
6 3 7.0 8.0 11.0**
7 3 25.0 25.0 29.0
8 3 25.0 29.0 35.0**
9 4 10.0 10.0 12.0
10 4 15.0 15.0 19.0
11 4 17.0 19.0 21.0**
12 4 20.0 21.0 28.0**
感谢您的建议
这是更新
# Updated example data: same intervals as before plus a Group column.
ID <- rep(c(1, 3, 4), times = c(3, 5, 4))
Group <- c(
  1, 1, 2,             # ID 1
  1, 1, 1, 2, 2,       # ID 3
  1, 1, 1, 2           # ID 4
)
Begin <- c(
  0, 2.5, 5,           # ID 1
  7, 8, 7, 25, 25,     # ID 3
  10, 15, 17, 20       # ID 4
)
End <- c(
  1.5, 3.5, 6,         # ID 1
  7.5, 8, 11, 29, 35,  # ID 3
  12, 19, 21, 28       # ID 4
)
df <- data.frame(ID = ID, Group = Group, Begin = Begin, End = End)
这次我想同时按 ID 和 Group 分组,但 data.table 的写法出了问题。
这是输出
ID Group Begin End Begin1
1 1 1 0.0 1.5 0.0
2 1 1 2.5 3.5 2.5
3 1 2 5.0 6.0 5.0
4 3 1 7.0 7.5 7.0
5 3 1 8.0 8.0 8.0
6 3 1 7.0 11.0 8.0
7 3 2 25.0 29.0 25.0
8 3 2 25.0 35.0 29.0
9 4 1 10.0 12.0 35.0
10 4 1 15.0 19.0 15.0
11 4 1 17.0 21.0 19.0
12 4 2 20.0 28.0 20.0**** Right here is not change bc it's group 2
这是 dplyr 包的结果,它可以工作,但是 data.table 不工作
library(dplyr)

# Within each (ID, Group) block, raise Begin to the previous row's End when the
# two intervals overlap. The first row of a block has no predecessor (the lag
# is NA); na.rm = TRUE makes pmax() fall back to Begin there.
df %>%
  group_by(ID, Group) %>%
  mutate(
    Begin1 = pmax(Begin, dplyr::lag(End), na.rm = TRUE)
  )
Source: local data frame [12 x 5]
Groups: ID, Group [6]
ID Group Begin End Begin1
(dbl) (dbl) (dbl) (dbl) (dbl)
1 1 1 0.0 1.5 0.0
2 1 1 2.5 3.5 2.5
3 1 2 5.0 6.0 5.0
4 3 1 7.0 7.5 7.0
5 3 1 8.0 8.0 8.0
6 3 1 7.0 11.0 8.0
7 3 2 25.0 29.0 25.0
8 3 2 25.0 35.0 29.0
9 4 1 10.0 12.0 10.0
10 4 1 15.0 19.0 15.0
11 4 1 17.0 21.0 19.0
12 4 2 20.0 28.0 20.0**** It works
我们可以使用 data.table
library(data.table)

# Convert df to a data.table by reference and start with Begin1 equal to Begin.
setDT(df)[, Begin1 := Begin]

# Global row numbers (.I) of rows whose Begin lies before the previous row's
# End within the same ID. fill = Begin[1L] makes the first row of each ID
# compare Begin against itself, so a first row is never flagged.
i1 <- df[, .I[Begin < shift(End, fill = Begin[1L])], by = ID]$V1

# A flagged row is never the first row of its ID, so i1 - 1L is the preceding
# row of the same ID. Assign with := so the update happens by reference rather
# than copying the whole table through `$<-`.
prev_end <- df$End[i1 - 1L]
df[i1, Begin1 := prev_end]
df
# ID Begin End Begin1
# 1: 1 0.0 1.5 0.0
# 2: 1 2.5 3.5 2.5
# 3: 1 5.0 6.0 5.0
# 4: 3 7.0 7.5 7.0
# 5: 3 8.0 8.0 8.0
# 6: 3 7.0 11.0 8.0
# 7: 3 25.0 29.0 25.0
# 8: 3 25.0 35.0 29.0
# 9: 4 10.0 12.0 10.0
#10: 4 15.0 19.0 15.0
#11: 4 17.0 21.0 19.0
#12: 4 20.0 28.0 21.0
或者另一种选择是
# Step 1: store the previous row's End (per ID) in Begin1; the first row of
# each ID receives NA.
setDT(df)
df[, Begin1 := shift(End), by = ID]
# Step 2: keep the shifted End only where Begin < Begin1 (an overlap). Every
# other row, including the NA first rows, falls back to its own Begin.
df[!which(Begin < Begin1), Begin1 := Begin]
df
# ID Begin End Begin1
# 1: 1 0.0 1.5 0.0
# 2: 1 2.5 3.5 2.5
# 3: 1 5.0 6.0 5.0
# 4: 3 7.0 7.5 7.0
# 5: 3 8.0 8.0 8.0
# 6: 3 7.0 11.0 8.0
# 7: 3 25.0 29.0 25.0
# 8: 3 25.0 35.0 29.0
# 9: 4 10.0 12.0 10.0
#10: 4 15.0 19.0 15.0
#11: 4 17.0 21.0 19.0
#12: 4 20.0 28.0 21.0
或使用dplyr
library(dplyr)

# Per ID: element-wise maximum of Begin and the previous row's End. The first
# row of each ID has an NA lag, which na.rm = TRUE resolves to Begin.
df %>%
  group_by(ID) %>%
  mutate(
    Begin1 = pmax(Begin, dplyr::lag(End), na.rm = TRUE)
  )
# ID Begin End Begin1
# <dbl> <dbl> <dbl> <dbl>
#1 1 0.0 1.5 0.0
#2 1 2.5 3.5 2.5
#3 1 5.0 6.0 5.0
#4 3 7.0 7.5 7.0
#5 3 8.0 8.0 8.0
#6 3 7.0 11.0 8.0
#7 3 25.0 29.0 25.0
#8 3 25.0 35.0 29.0
#9 4 10.0 12.0 10.0
#10 4 15.0 19.0 15.0
#11 4 17.0 21.0 19.0
#12 4 20.0 28.0 21.0
更新
基于 OP 的新数据
# Step 1: store the previous row's End in Begin1, computed separately for each
# (ID, Group) combination; the first row of each combination receives NA.
setDT(df)
df[, Begin1 := shift(End), by = .(ID, Group)]
# Step 2: keep the shifted End only where Begin < Begin1 (an overlap). Every
# other row, including the NA first rows, falls back to its own Begin.
df[!which(Begin < Begin1), Begin1 := Begin]
df
# ID Group Begin End Begin1
#1: 1 1 0.0 1.5 0.0
#2: 1 1 2.5 3.5 2.5
#3: 1 2 5.0 6.0 5.0
#4: 3 1 7.0 7.5 7.0
#5: 3 1 8.0 8.0 8.0
#6: 3 1 7.0 11.0 8.0
#7: 3 2 25.0 29.0 25.0
#8: 3 2 25.0 35.0 29.0
#9: 4 1 10.0 12.0 10.0
#10: 4 1 15.0 19.0 15.0
#11: 4 1 17.0 21.0 19.0
#12: 4 2 20.0 28.0 20.0
下面是使用 data.table 的另一种方式,关键点如下:
- by 语句:按 ID 分组计算
- shift 函数:取 End 变量的滞后值,用来与 Begin 比较
- pmax 函数:执行逐元素的最大值(max)计算
代码如下:
library(data.table)

# Build a data.table from df.
dt <- as.data.table(df)
# Per ID: element-wise maximum of Begin and the previous row's End. The first
# row of each ID has an NA lag, which na.rm = TRUE resolves to Begin.
dt[
  ,
  Begin1 := pmax(Begin, shift(End, type = "lag"), na.rm = TRUE),
  by = ID
]
下面是一种基础 R 的方法:基于 End 列的滞后值(lag),使用 ifelse 创建新列。
# Previous row's End within each ID; the first row of every ID gets NA.
# The shift is done by hand per ID because stats::lag() does not move the
# values of a plain vector, and an ungrouped lag would leak the last End of one
# ID into the first row of the next.
prev_end <- ave(df$End, df$ID, FUN = function(x) c(NA, x)[seq_along(x)])
# Use the previous End when it overlaps the current Begin, otherwise keep Begin.
df$Begin1 <- ifelse(
  !is.na(prev_end) & df$Begin <= prev_end,
  prev_end,
  df$Begin
)
df
#    ID Begin  End Begin1
# 1   1   0.0  1.5    0.0
# 2   1   2.5  3.5    2.5
# 3   1   5.0  6.0    5.0
# 4   3   7.0  7.5    7.0
# 5   3   8.0  8.0    8.0
# 6   3   7.0 11.0    8.0
# 7   3  25.0 29.0   25.0
# 8   3  25.0 35.0   29.0
# 9   4  10.0 12.0   10.0
# 10  4  15.0 19.0   15.0
# 11  4  17.0 21.0   19.0
# 12  4  20.0 28.0   21.0