如何创建滞后变量
How to create lag variables
我想为变量 pm10 创建滞后变量,并使用了以下代码。但是,我没有得到想要的结果。我该如何创建 pm10 的滞后变量?
# NOTE: the dput() output below shows l1pm10 holding exactly the same values as
# pm10 plus a shifted .Tsp attribute, i.e. lag() did not move the values
# between rows of the data frame.
df2$l1pm10 <- lag(df2$pm10, -1, na.pad = TRUE)
df2$l1pm102 <- lag(df2$pm10, 1)
dput(df2)
structure(list(var1 = 1:10, pm10 = c(26.956073733, NA, 32.838694951,
39.9560737332, NA, 40.9560737332, 33.956073733, 28.956073733,
32.348770798, NA), l1pm10 = structure(c(26.956073733, NA, 32.838694951,
39.9560737332, NA, 40.9560737332, 33.956073733, 28.956073733,
32.348770798, NA), .Tsp = c(2, 11, 1))), .Names = c("var1", "pm10",
"l1pm10"), row.names = c("1", "2", "3", "4", "5", "6", "7", "8",
"9", "10"), class = "data.frame")
在 base R 中,函数 lag()
对时间序列对象很有用。这里有一个数据框,情况有些不同。
你可以试试下面的方法,我承认这不是很优雅:
# Lead (next row's value) and lag (previous row's value) of pm10, built one
# row at a time.
# vapply() pins every result to a single double. The explicit guard for the
# first row matters: index 0 returns numeric(0), which would make sapply()
# fall back to a list and silently turn l1pm102 into a list column.
# Indexing one past the end already yields NA, so the lead needs no guard.
df2$l1pm10 <- vapply(
  seq_len(nrow(df2)),
  function(x) df2$pm10[x + 1],
  numeric(1)
)
df2$l1pm102 <- vapply(
  seq_len(nrow(df2)),
  function(x) if (x > 1) df2$pm10[x - 1] else NA_real_,
  numeric(1)
)
#> df2
# var1 pm10 l1pm10 l1pm102
#1 1 26.95607 NA NA
#2 2 NA 32.83869 26.95607
#3 3 32.83869 39.95607 NA
#4 4 39.95607 NA 32.83869
#5 5 NA 40.95607 39.95607
#6 6 40.95607 33.95607 NA
#7 7 33.95607 28.95607 40.95607
#8 8 28.95607 32.34877 33.95607
#9 9 32.34877 NA 28.95607
#10 10 NA NA 32.34877
另一种方法是使用 Hmisc
包中的 Lag()
函数(大写 "L"):
library(Hmisc)
# A shift of -1 pulls in the next row's value (lead); a shift of +1 pulls in
# the previous row's value (lag), as the printed result below shows.
df2$l1pm10 <- Lag(df2$pm10, -1)
df2$l1pm102 <- Lag(df2$pm10, +1)
#> df2
# var1 pm10 l1pm10 l1pm102
#1 1 26.95607 NA NA
#2 2 NA 32.83869 26.95607
#3 3 32.83869 39.95607 NA
#4 4 39.95607 NA 32.83869
#5 5 NA 40.95607 39.95607
#6 6 40.95607 33.95607 NA
#7 7 33.95607 28.95607 40.95607
#8 8 28.95607 32.34877 33.95607
#9 9 32.34877 NA 28.95607
#10 10 NA NA 32.34877
我知道这个问题已经有了被采纳的答案,但几个月前我遇到了同样的问题(见此问题),所以我想自己写一个 lag
函数。
这是代码:
# seq_along(...) - 1 yields the indices 0..(n - 1); R silently drops index 0,
# so the subset is the first n - 1 values and the leading NA restores length n.
df2$lagpm10 <- c(NA, df2$pm10[seq_along(df2$pm10) -1])
df2
var1 pm10 l1pm10 lagpm10
1 1 26.95607 26.95607 NA
2 2 NA NA 26.95607
3 3 32.83869 32.83869 NA
4 4 39.95607 39.95607 32.83869
5 5 NA NA 39.95607
6 6 40.95607 40.95607 NA
7 7 33.95607 33.95607 40.95607
8 8 28.95607 28.95607 33.95607
9 9 32.34877 32.34877 28.95607
10 10 NA NA 32.34877
基准
其中Rhertel1和Rhertel2是Rhertel的两行代码,Sabdem是我的。
Unit: microseconds
expr min lq mean median uq max neval
Rhertel1 250.523 257.740 272.07275 260.3355 264.0945 3540.187 10000
Rhertel2 246.641 253.887 271.77003 256.5380 260.4935 14637.791 10000
Sabdem 57.762 60.521 65.85315 61.3765 62.6050 12275.979 10000
另一种方法是使用 data.table 包中的 shift
函数:
library(data.table)
# Add both shifted columns in a single `:=` call: l1pm10 takes the previous
# row's pm10 (lag) and l1pm102 takes the next row's pm10 (lead).
setDT(df2)[, `:=`(
  l1pm10 = shift(pm10, n = 1L, fill = NA, type = "lag"),
  l1pm102 = shift(pm10, n = 1L, fill = NA, type = "lead")
)]
这给出:
> df2
var1 pm10 l1pm10 l1pm102
1: 1 26.95607 NA NA
2: 2 NA 26.95607 32.83869
3: 3 32.83869 NA 39.95607
4: 4 39.95607 32.83869 NA
5: 5 NA 39.95607 40.95607
6: 6 40.95607 NA 33.95607
7: 7 33.95607 40.95607 28.95607
8: 8 28.95607 33.95607 32.34877
9: 9 32.34877 28.95607 NA
10: 10 NA 32.34877 NA
已用数据:
# Example data: 10 rows; pm10 is missing (NA) in rows 2, 5 and 10.
df2 <- structure(list(var1 = 1:10, pm10 = c(26.956073733, NA, 32.838694951,
39.9560737332, NA, 40.9560737332, 33.956073733, 28.956073733,
32.348770798, NA)), .Names = c("var1", "pm10"), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
我认为最简单的解决方案就是创建向量或列的 "lagged"(滞后)版本(在第一个位置添加 NA),然后将各列绑定在一起:
# Example vector
x <- 1:10
# Prepend NA and drop the last element. head(x, -1) is used instead of
# x[1:(length(x) - 1)] because for a length-1 vector 1:0 counts down to
# c(1, 0) and would keep the only element instead of dropping it.
x_lagged <- c(NA, head(x, -1))
# Bind the original and the lagged vector side by side as matrix columns.
new_x <- cbind(x, x_lagged)
您可以使用head
和tail
如下
# Lead: drop the first value and pad the end with NA.
df2[["l1pm10"]] <- c(tail(df2[["pm10"]], n = -1L), NA)
# Lag: pad the start with NA and drop the last value.
df2[["l1pm102"]] <- c(NA, head(df2[["pm10"]], n = -1L))
df2
#R> var1 pm10 l1pm10 l1pm102
#R> 1 1 26.95607 NA NA
#R> 2 2 NA 32.83869 26.95607
#R> 3 3 32.83869 39.95607 NA
#R> 4 4 39.95607 NA 32.83869
#R> 5 5 NA 40.95607 39.95607
#R> 6 6 40.95607 33.95607 NA
#R> 7 7 33.95607 28.95607 40.95607
#R> 8 8 28.95607 32.34877 33.95607
#R> 9 9 32.34877 NA 28.95607
#R> 10 10 NA NA 32.34877
# or with transform(); the result is printed here rather than assigned back
# to df2
transform(df2, l1pm10 = c(tail(pm10, -1), NA), l1pm102 = c(NA, head(pm10, -1)))
#R> var1 pm10 l1pm10 l1pm102
#R> 1 1 26.95607 NA NA
#R> 2 2 NA 32.83869 26.95607
#R> 3 3 32.83869 39.95607 NA
#R> 4 4 39.95607 NA 32.83869
#R> 5 5 NA 40.95607 39.95607
#R> 6 6 40.95607 33.95607 NA
#R> 7 7 33.95607 28.95607 40.95607
#R> 8 8 28.95607 32.34877 33.95607
#R> 9 9 32.34877 NA 28.95607
#R> 10 10 NA NA 32.34877
你可以用这两个函数(head 和 tail)编写一个通用的滞后函数,如下所示:
# Shift a vector by k positions, keeping its length.
#
# x:   vector to shift.
# k:   number of positions; positive values lag (padding goes in front),
#      negative values lead (padding goes at the end), 0 returns x unchanged.
# pad: value used to fill the positions that the shift vacates.
#
# Returns a vector of the same length as x.
lag_func <- function(x, k = 1, pad = NA) {
  if (k == 0) {
    return(x)
  }
  # Never pad with more values than x has, so shifts larger than the vector
  # produce an all-pad result of the original length.
  n_fill <- min(length(x), abs(k))
  filler <- rep(pad, n_fill)
  if (k > 0) {
    # Lag: drop the last k values and put the padding in front.
    c(filler, head(x, -k))
  } else {
    # Lead: drop the first |k| values and put the padding at the end.
    c(tail(x, k), filler)
  }
}
# use the function to lag the variable: one result column per shift k from
# -11 to 11 (23 columns); shifts of 10 or more rows in either direction give
# all-NA columns because the data has only 10 rows
sapply((-11):11, lag_func, x = df2$pm10)
#R> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
#R> [1,] NA NA NA 32.34877 28.95607 33.95607 40.95607 NA 39.95607
#R> [2,] NA NA NA NA 32.34877 28.95607 33.95607 40.95607 NA
#R> [3,] NA NA NA NA NA 32.34877 28.95607 33.95607 40.95607
#R> [4,] NA NA NA NA NA NA 32.34877 28.95607 33.95607
#R> [5,] NA NA NA NA NA NA NA 32.34877 28.95607
#R> [6,] NA NA NA NA NA NA NA NA 32.34877
#R> [7,] NA NA NA NA NA NA NA NA NA
#R> [8,] NA NA NA NA NA NA NA NA NA
#R> [9,] NA NA NA NA NA NA NA NA NA
#R> [10,] NA NA NA NA NA NA NA NA NA
#R> [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
#R> [1,] 32.83869 NA 26.95607 NA NA NA NA NA
#R> [2,] 39.95607 32.83869 NA 26.95607 NA NA NA NA
#R> [3,] NA 39.95607 32.83869 NA 26.95607 NA NA NA
#R> [4,] 40.95607 NA 39.95607 32.83869 NA 26.95607 NA NA
#R> [5,] 33.95607 40.95607 NA 39.95607 32.83869 NA 26.95607 NA
#R> [6,] 28.95607 33.95607 40.95607 NA 39.95607 32.83869 NA 26.95607
#R> [7,] 32.34877 28.95607 33.95607 40.95607 NA 39.95607 32.83869 NA
#R> [8,] NA 32.34877 28.95607 33.95607 40.95607 NA 39.95607 32.83869
#R> [9,] NA NA 32.34877 28.95607 33.95607 40.95607 NA 39.95607
#R> [10,] NA NA NA 32.34877 28.95607 33.95607 40.95607 NA
#R> [,18] [,19] [,20] [,21] [,22] [,23]
#R> [1,] NA NA NA NA NA NA
#R> [2,] NA NA NA NA NA NA
#R> [3,] NA NA NA NA NA NA
#R> [4,] NA NA NA NA NA NA
#R> [5,] NA NA NA NA NA NA
#R> [6,] NA NA NA NA NA NA
#R> [7,] 26.95607 NA NA NA NA NA
#R> [8,] NA 26.95607 NA NA NA NA
#R> [9,] 32.83869 NA 26.95607 NA NA NA
#R> [10,] 39.95607 32.83869 NA 26.95607 NA NA
我想为变量 pm10 创建滞后变量,并使用了以下代码。但是,我没有得到想要的结果。我该如何创建 pm10 的滞后变量?
# NOTE: the dput() output below shows l1pm10 holding exactly the same values as
# pm10 plus a shifted .Tsp attribute, i.e. lag() did not move the values
# between rows of the data frame.
df2$l1pm10 <- lag(df2$pm10, -1, na.pad = TRUE)
df2$l1pm102 <- lag(df2$pm10, 1)
dput(df2)
structure(list(var1 = 1:10, pm10 = c(26.956073733, NA, 32.838694951,
39.9560737332, NA, 40.9560737332, 33.956073733, 28.956073733,
32.348770798, NA), l1pm10 = structure(c(26.956073733, NA, 32.838694951,
39.9560737332, NA, 40.9560737332, 33.956073733, 28.956073733,
32.348770798, NA), .Tsp = c(2, 11, 1))), .Names = c("var1", "pm10",
"l1pm10"), row.names = c("1", "2", "3", "4", "5", "6", "7", "8",
"9", "10"), class = "data.frame")
在 base R 中,函数 lag()
对时间序列对象很有用。这里有一个数据框,情况有些不同。
你可以试试下面的方法,我承认这不是很优雅:
# Lead (next row's value) and lag (previous row's value) of pm10, built one
# row at a time.
# vapply() pins every result to a single double. The explicit guard for the
# first row matters: index 0 returns numeric(0), which would make sapply()
# fall back to a list and silently turn l1pm102 into a list column.
# Indexing one past the end already yields NA, so the lead needs no guard.
df2$l1pm10 <- vapply(
  seq_len(nrow(df2)),
  function(x) df2$pm10[x + 1],
  numeric(1)
)
df2$l1pm102 <- vapply(
  seq_len(nrow(df2)),
  function(x) if (x > 1) df2$pm10[x - 1] else NA_real_,
  numeric(1)
)
#> df2
# var1 pm10 l1pm10 l1pm102
#1 1 26.95607 NA NA
#2 2 NA 32.83869 26.95607
#3 3 32.83869 39.95607 NA
#4 4 39.95607 NA 32.83869
#5 5 NA 40.95607 39.95607
#6 6 40.95607 33.95607 NA
#7 7 33.95607 28.95607 40.95607
#8 8 28.95607 32.34877 33.95607
#9 9 32.34877 NA 28.95607
#10 10 NA NA 32.34877
另一种方法是使用 Hmisc
包中的 Lag()
函数(大写 "L"):
library(Hmisc)
# A shift of -1 pulls in the next row's value (lead); a shift of +1 pulls in
# the previous row's value (lag), as the printed result below shows.
df2$l1pm10 <- Lag(df2$pm10, -1)
df2$l1pm102 <- Lag(df2$pm10, +1)
#> df2
# var1 pm10 l1pm10 l1pm102
#1 1 26.95607 NA NA
#2 2 NA 32.83869 26.95607
#3 3 32.83869 39.95607 NA
#4 4 39.95607 NA 32.83869
#5 5 NA 40.95607 39.95607
#6 6 40.95607 33.95607 NA
#7 7 33.95607 28.95607 40.95607
#8 8 28.95607 32.34877 33.95607
#9 9 32.34877 NA 28.95607
#10 10 NA NA 32.34877
我知道这个问题已经有了被采纳的答案,但几个月前我遇到了同样的问题(见此问题),所以我想自己写一个 lag
函数。
这是代码:
# seq_along(...) - 1 yields the indices 0..(n - 1); R silently drops index 0,
# so the subset is the first n - 1 values and the leading NA restores length n.
df2$lagpm10 <- c(NA, df2$pm10[seq_along(df2$pm10) -1])
df2
var1 pm10 l1pm10 lagpm10
1 1 26.95607 26.95607 NA
2 2 NA NA 26.95607
3 3 32.83869 32.83869 NA
4 4 39.95607 39.95607 32.83869
5 5 NA NA 39.95607
6 6 40.95607 40.95607 NA
7 7 33.95607 33.95607 40.95607
8 8 28.95607 28.95607 33.95607
9 9 32.34877 32.34877 28.95607
10 10 NA NA 32.34877
基准
其中Rhertel1和Rhertel2是Rhertel的两行代码,Sabdem是我的。
Unit: microseconds
expr min lq mean median uq max neval
Rhertel1 250.523 257.740 272.07275 260.3355 264.0945 3540.187 10000
Rhertel2 246.641 253.887 271.77003 256.5380 260.4935 14637.791 10000
Sabdem 57.762 60.521 65.85315 61.3765 62.6050 12275.979 10000
另一种方法是使用 data.table 包中的 shift
函数:
library(data.table)
# Add both shifted columns in a single `:=` call: l1pm10 takes the previous
# row's pm10 (lag) and l1pm102 takes the next row's pm10 (lead).
setDT(df2)[, `:=`(
  l1pm10 = shift(pm10, n = 1L, fill = NA, type = "lag"),
  l1pm102 = shift(pm10, n = 1L, fill = NA, type = "lead")
)]
这给出:
> df2
var1 pm10 l1pm10 l1pm102
1: 1 26.95607 NA NA
2: 2 NA 26.95607 32.83869
3: 3 32.83869 NA 39.95607
4: 4 39.95607 32.83869 NA
5: 5 NA 39.95607 40.95607
6: 6 40.95607 NA 33.95607
7: 7 33.95607 40.95607 28.95607
8: 8 28.95607 33.95607 32.34877
9: 9 32.34877 28.95607 NA
10: 10 NA 32.34877 NA
已用数据:
# Example data: 10 rows; pm10 is missing (NA) in rows 2, 5 and 10.
df2 <- structure(list(var1 = 1:10, pm10 = c(26.956073733, NA, 32.838694951,
39.9560737332, NA, 40.9560737332, 33.956073733, 28.956073733,
32.348770798, NA)), .Names = c("var1", "pm10"), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
我认为最简单的解决方案就是创建向量或列的 "lagged"(滞后)版本(在第一个位置添加 NA),然后将各列绑定在一起:
# Example vector
x <- 1:10
# Prepend NA and drop the last element. head(x, -1) is used instead of
# x[1:(length(x) - 1)] because for a length-1 vector 1:0 counts down to
# c(1, 0) and would keep the only element instead of dropping it.
x_lagged <- c(NA, head(x, -1))
# Bind the original and the lagged vector side by side as matrix columns.
new_x <- cbind(x, x_lagged)
您可以使用head
和tail
如下
# Lead: drop the first value and pad the end with NA.
df2[["l1pm10"]] <- c(tail(df2[["pm10"]], n = -1L), NA)
# Lag: pad the start with NA and drop the last value.
df2[["l1pm102"]] <- c(NA, head(df2[["pm10"]], n = -1L))
df2
#R> var1 pm10 l1pm10 l1pm102
#R> 1 1 26.95607 NA NA
#R> 2 2 NA 32.83869 26.95607
#R> 3 3 32.83869 39.95607 NA
#R> 4 4 39.95607 NA 32.83869
#R> 5 5 NA 40.95607 39.95607
#R> 6 6 40.95607 33.95607 NA
#R> 7 7 33.95607 28.95607 40.95607
#R> 8 8 28.95607 32.34877 33.95607
#R> 9 9 32.34877 NA 28.95607
#R> 10 10 NA NA 32.34877
# or with transform(); the result is printed here rather than assigned back
# to df2
transform(df2, l1pm10 = c(tail(pm10, -1), NA), l1pm102 = c(NA, head(pm10, -1)))
#R> var1 pm10 l1pm10 l1pm102
#R> 1 1 26.95607 NA NA
#R> 2 2 NA 32.83869 26.95607
#R> 3 3 32.83869 39.95607 NA
#R> 4 4 39.95607 NA 32.83869
#R> 5 5 NA 40.95607 39.95607
#R> 6 6 40.95607 33.95607 NA
#R> 7 7 33.95607 28.95607 40.95607
#R> 8 8 28.95607 32.34877 33.95607
#R> 9 9 32.34877 NA 28.95607
#R> 10 10 NA NA 32.34877
你可以用这两个函数(head 和 tail)编写一个通用的滞后函数,如下所示:
# Shift a vector by k positions, keeping its length.
#
# x:   vector to shift.
# k:   number of positions; positive values lag (padding goes in front),
#      negative values lead (padding goes at the end), 0 returns x unchanged.
# pad: value used to fill the positions that the shift vacates.
#
# Returns a vector of the same length as x.
lag_func <- function(x, k = 1, pad = NA) {
  if (k == 0) {
    return(x)
  }
  # Never pad with more values than x has, so shifts larger than the vector
  # produce an all-pad result of the original length.
  n_fill <- min(length(x), abs(k))
  filler <- rep(pad, n_fill)
  if (k > 0) {
    # Lag: drop the last k values and put the padding in front.
    c(filler, head(x, -k))
  } else {
    # Lead: drop the first |k| values and put the padding at the end.
    c(tail(x, k), filler)
  }
}
# use the function to lag the variable: one result column per shift k from
# -11 to 11 (23 columns); shifts of 10 or more rows in either direction give
# all-NA columns because the data has only 10 rows
sapply((-11):11, lag_func, x = df2$pm10)
#R> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
#R> [1,] NA NA NA 32.34877 28.95607 33.95607 40.95607 NA 39.95607
#R> [2,] NA NA NA NA 32.34877 28.95607 33.95607 40.95607 NA
#R> [3,] NA NA NA NA NA 32.34877 28.95607 33.95607 40.95607
#R> [4,] NA NA NA NA NA NA 32.34877 28.95607 33.95607
#R> [5,] NA NA NA NA NA NA NA 32.34877 28.95607
#R> [6,] NA NA NA NA NA NA NA NA 32.34877
#R> [7,] NA NA NA NA NA NA NA NA NA
#R> [8,] NA NA NA NA NA NA NA NA NA
#R> [9,] NA NA NA NA NA NA NA NA NA
#R> [10,] NA NA NA NA NA NA NA NA NA
#R> [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
#R> [1,] 32.83869 NA 26.95607 NA NA NA NA NA
#R> [2,] 39.95607 32.83869 NA 26.95607 NA NA NA NA
#R> [3,] NA 39.95607 32.83869 NA 26.95607 NA NA NA
#R> [4,] 40.95607 NA 39.95607 32.83869 NA 26.95607 NA NA
#R> [5,] 33.95607 40.95607 NA 39.95607 32.83869 NA 26.95607 NA
#R> [6,] 28.95607 33.95607 40.95607 NA 39.95607 32.83869 NA 26.95607
#R> [7,] 32.34877 28.95607 33.95607 40.95607 NA 39.95607 32.83869 NA
#R> [8,] NA 32.34877 28.95607 33.95607 40.95607 NA 39.95607 32.83869
#R> [9,] NA NA 32.34877 28.95607 33.95607 40.95607 NA 39.95607
#R> [10,] NA NA NA 32.34877 28.95607 33.95607 40.95607 NA
#R> [,18] [,19] [,20] [,21] [,22] [,23]
#R> [1,] NA NA NA NA NA NA
#R> [2,] NA NA NA NA NA NA
#R> [3,] NA NA NA NA NA NA
#R> [4,] NA NA NA NA NA NA
#R> [5,] NA NA NA NA NA NA
#R> [6,] NA NA NA NA NA NA
#R> [7,] 26.95607 NA NA NA NA NA
#R> [8,] NA 26.95607 NA NA NA NA
#R> [9,] 32.83869 NA 26.95607 NA NA NA
#R> [10,] 39.95607 32.83869 NA 26.95607 NA NA