R - 按组应用 adf.test

Question

我有一个 data.frame bbm，其中包含变量 ticker、variable 和 value。我想通过按代码和变量分组的 adf.test 函数应用 Augmented Dickey Fuller 测试。 R 应使用相应的 p 值向初始 data.frame 添加一个新列。

我试过了

x <- with(bbm, tapply(value, list(ticker, variable), adf.test$p.value))
cbind(bbm, x)

这会产生 Error in adf.test$p.value : object of type 'closure' is not subsettable。

然后我试了

x <- with(bbm, tapply(value, list(ticker, variable), as.list(adf.test)$p.value))
cbind(bbm, x)

这产生了一个结果，但在新列中并不是我想要的。即使我将代码上的 p.value 更改为方法，它仍然会产生一些奇数。

然后我尝试使用 ddply:

bbm<-ddply(bbm, .(ticker, variable), mutate, df=adf.test(value)$p.value)

产生 Error: wrong embedding Dimension.

我该如何解决这个问题？有什么建议吗？

这是 df 的示例：

            ticker                    variable   value
1  1002Z AV Equity        BS_CUSTOMER_DEPOSITS 29898.0
2  1002Z AV Equity        BS_CUSTOMER_DEPOSITS 31302.0
3  1002Z AV Equity        BS_CUSTOMER_DEPOSITS 29127.0
4  1002Z AV Equity        BS_CUSTOMER_DEPOSITS 24056.0
5  1002Z AV Equity        BS_CUSTOMER_DEPOSITS 22080.0
6  1002Z AV Equity        BS_CUSTOMER_DEPOSITS 22585.0
7  1002Z AV Equity        BS_CUSTOMER_DEPOSITS 22674.0
8  1002Z AV Equity        BS_CUSTOMER_DEPOSITS 21733.0
9  1002Z AV Equity        BS_CUSTOMER_DEPOSITS 22016.0
10 1002Z AV Equity        BS_CUSTOMER_DEPOSITS 21999.0
11 1002Z AV Equity        BS_CUSTOMER_DEPOSITS 22013.0
12 1002Z AV Equity        BS_CUSTOMER_DEPOSITS 21135.0
13 1002Z AV Equity                 BS_TOT_LOAN 28476.0
14 1002Z AV Equity                 BS_TOT_LOAN 29446.0
15 1002Z AV Equity                 BS_TOT_LOAN 29273.0
16 1002Z AV Equity                 BS_TOT_LOAN 27579.0
17 1002Z AV Equity                 BS_TOT_LOAN 20769.0
18 1002Z AV Equity                 BS_TOT_LOAN 21370.0
19 1002Z AV Equity                 BS_TOT_LOAN 22306.0
20 1002Z AV Equity                 BS_TOT_LOAN 21013.0
21 1002Z AV Equity                 BS_TOT_LOAN 21810.0
22 1002Z AV Equity          BS_TIER1_CAP_RATIO     6.5
23 1002Z AV Equity          BS_TIER1_CAP_RATIO     6.2
24 1002Z AV Equity          BS_TIER1_CAP_RATIO     7.9
25 1002Z AV Equity          BS_TIER1_CAP_RATIO     9.2
26 1002Z AV Equity          BS_TIER1_CAP_RATIO     8.5
27 1002Z AV Equity          BS_TIER1_CAP_RATIO     6.6
28 1002Z AV Equity          BS_TIER1_CAP_RATIO     9.6
29 1002Z AV Equity BS_TOT_CAP_TO_RISK_BASE_CAP    11.5
30 1002Z AV Equity BS_TOT_CAP_TO_RISK_BASE_CAP    10.9



 > dput(head(select(bbm, ticker, variable, value), 30))
structure(list(ticker = c("1002Z AV Equity", "1002Z AV Equity", 
"1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", 
"1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", 
"1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", 
"1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", 
"1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", 
"1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", 
"1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity", "1002Z AV Equity"
), variable = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 8L, 8L, 8L, 8L, 
8L, 8L, 8L, 9L, 9L), .Label = c("PX_LAST", "PE_RATIO", "VOL_MEAN", 
"BS_CUSTOMER_DEPOSITS", "BS_TOT_LOAN", "*", "RN366", "BS_TIER1_CAP_RATIO", 
"BS_TOT_CAP_TO_RISK_BASE_CAP", "RETURN_COM_EQY", "BS_LEV_RATIO_TO_TANG_CAP",
"NPLS_TO_TOTAL_LOANS"), class = "factor"), value = c(29898, 31302, 
29127, 24056, 22080, 22585, 22674, 21733, 22016, 21999, 22013, 
21135, 28476, 29446, 29273, 27579, 20769, 21370, 22306, 21013, 
21810, 6.5, 6.2, 7.9, 9.2, 8.5, 6.6, 9.6, 11.5, 10.9)), .Names = c("ticker", 
"variable", "value"), row.names = c(NA, 30L), class = "data.frame")

哦，使用模拟 dplyr 函数也会产生与 ddply 相同的错误。

Answer 1

看来问题可能出在小组太小而无法处理。处理此问题的一个选项是创建一个自定义函数来捕获错误（使用 tryCatch 并通过 lapply() 调用传递此函数，如下所示：

testx <- function (x) {
  return(tryCatch(adf.test(x), error=function(e) NULL))
}

g<- lapply(split(bbm, bbm$variable), function(x) testx(x$value))
str(g)
#List of 12
# $ PX_LAST                    : NULL
# $ PE_RATIO                   : NULL
# $ VOL_MEAN                   : NULL
# $ BS_CUSTOMER_DEPOSITS       :List of 6
# ..$ statistic  : Named num -4.86
#  .. ..- attr(*, "names")= chr "Dickey-Fuller"
#  ..$ parameter  : Named num 2
#  .. ..- attr(*, "names")= chr "Lag order"
#  ..$ alternative: chr "stationary"
#  ..$ p.value    : num 0.01
#  ..$ method     : chr "Augmented Dickey-Fuller Test"
#  ..$ data.name  : chr "x"
#  ..- attr(*, "class")= chr "htest"
# $ BS_TOT_LOAN                :List of 6
#  ..$ statistic  : Named num -0.784
#  .. ..- attr(*, "names")= chr "Dickey-Fuller"
#  ..$ parameter  : Named num 2
#  .. ..- attr(*, "names")= chr "Lag order"
#  ..$ alternative: chr "stationary"
#  ..$ p.value    : num 0.951
#  ..$ method     : chr "Augmented Dickey-Fuller Test"
#  ..$ data.name  : chr "x"
#  ..- attr(*, "class")= chr "htest"
# $ *                          : NULL
# $ RN366                      : NULL
# $ BS_TIER1_CAP_RATIO         :List of 6
#  ..$ statistic  : Named num -4.33
#  .. ..- attr(*, "names")= chr "Dickey-Fuller"
#  ..$ parameter  : Named num 1
#  .. ..- attr(*, "names")= chr "Lag order"
#  ..$ alternative: chr "stationary"
#  ..$ p.value    : num 0.0118
#  ..$ method     : chr "Augmented Dickey-Fuller Test"
#  ..$ data.name  : chr "x"
#  ..- attr(*, "class")= chr "htest"
# $ BS_TOT_CAP_TO_RISK_BASE_CAP: NULL
# $ RETURN_COM_EQY             : NULL
# $ BS_LEV_RATIO_TO_TANG_CAP   : NULL
# $ NPLS_TO_TOTAL_LOANS        : NULL

这将创建一个长度为 12（每个因子一个）的列表对象 g，其中，对于有效的 adf.test 调用，元素由相关特征填充，其余部分NULL通过了

如果感兴趣的参数只是每组的p.value，那么前面的lapply可以环绕一个sapply()得到下面的对象：

h<- sapply(lapply(split(bbm, bbm$variable), function(x) testx(x$value)), function(x) print(x$p.value))
str(h)
#List of 12
# $ PX_LAST                    : NULL
# $ PE_RATIO                   : NULL
# $ VOL_MEAN                   : NULL
# $ BS_CUSTOMER_DEPOSITS       : num 0.01
# $ BS_TOT_LOAN                : num 0.951
# $ *                          : NULL
# $ RN366                      : NULL
# $ BS_TIER1_CAP_RATIO         : num 0.0118
# $ BS_TOT_CAP_TO_RISK_BASE_CAP: NULL
# $ RETURN_COM_EQY             : NULL
# $ BS_LEV_RATIO_TO_TANG_CAP   : NULL
# $ NPLS_TO_TOTAL_LOANS        : NULL

根据评论，如果需要按 ticker 和 variable 进行分组，这将产生所需的结果：

g<- lapply(split(bbm, list(bbm$variable, bbm$ticker)), function(x) testx(x$value))
#to remove the NULL which are not needed:
g[g != "NULL"]

Answer 2

这是一个 tidyverse 解决方案：

bbm %>% 
    group_by(ticker,variable) %>% 
    summarise(pval = ifelse(n() <= 3,NA, adf.test(value)$p.value))

# A tibble: 4 x 3
# Groups:   ticker [?]
  ticker          variable                       pval
  <chr>           <fct>                         <dbl>
1 1002Z AV Equity BS_CUSTOMER_DEPOSITS         0.01  
2 1002Z AV Equity BS_TOT_LOAN                  0.951 
3 1002Z AV Equity BS_TIER1_CAP_RATIO           0.0118
4 1002Z AV Equity BS_TOT_CAP_TO_RISK_BASE_CAP NA     
Warning message:
In adf.test(value) : p-value smaller than printed p-value

您可以只使用基础 R ifelse 函数来检查每个组中是否存在少于 3 个点（这会将 pval 设置为 NA）否则您可以运行 adf.test

我试了一下，@erasmortg 似乎是正确的。错误 "embedding" 来自没有足够的数据点来实际运行 adf.test 函数。

这需要至少四个数据点：

> adf.test(rnorm(1))
Error in embed(y, k) : wrong embedding dimension
> adf.test(rnorm(2))
Error in embed(y, k) : wrong embedding dimension
> adf.test(rnorm(3))
Error in res.sum$coefficients[2, 1] : subscript out of bounds
> adf.test(rnorm(4))

    Augmented Dickey-Fuller Test

data:  rnorm(4)
Dickey-Fuller = NaN, Lag order = 1, p-value = NA
alternative hypothesis: stationary

R - 按组应用 adf.test

R - apply adf.test by group

grouping

r

plyr

tapply