识别时间序列(字符)值的变化并标记相应数据集中新值的位置

Identify changes in time series (character) values and flag location of new value in corresponding dataset

我正在处理带有字符输入(特别是期货合约细节)的时间序列数据集。我想确定字符值与前一个可用日期不同的日期,并且对于这些特定日期,确定后续各列中哪一列具有该值。我在下面提供了一个示例数据集。我考虑过在此 xts 对象上使用 lag(),但出现错误:

Error in `[.xts`(x, seq_len(xlen - n)) : subscript out of bounds

此外,我目前的方法有点蛮力,我想避免(特别是因为对应列的数量因不同的数据集而异)。

目的:我有一个对应的 returns 时间序列,格式与字符时间序列相同。通过确定这个新字符值(合同细节)所在的列和日期位置 [新位置],我想用新位置的 return 替换该日期第一列中现有的 return。

分享字符时间序列的样本dput输出tempContracts:

structure(c("SPU19-USA", "SPU19-USA", "SPU19-USA", "SPU19-USA", 
"SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", 
"SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", 
"SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", 
"SPZ19-USA", "SPZ19-USA", "SPU19-USA", "SPU19-USA", "SPU19-USA", 
"SPU19-USA", "SPU19-USA", "SPU19-USA", "SPU19-USA", "SPU19-USA", 
"SPU19-USA", "SPU19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", 
"SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", 
"SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", 
"SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPZ19-USA", 
"SPZ19-USA", "SPZ19-USA", "SPZ19-USA", "SPH20-USA", "SPH20-USA", 
"SPH20-USA", "SPH20-USA", "SPH20-USA", "SPH20-USA", "SPH20-USA", 
"SPH20-USA", "SPH20-USA", "SPH20-USA", "SPH20-USA"), class = c("xts", 
"zoo"), .indexCLASS = "Date", .indexTZ = "UTC", tclass = "Date", tzone = "UTC", index = structure(c(1567728000, 
1567987200, 1568073600, 1568160000, 1568246400, 1568332800, 1568592000, 
1568678400, 1568764800, 1568851200, 1568937600, 1569196800, 1569283200, 
1569369600, 1569456000, 1569542400, 1569801600, 1569888000, 1569974400, 
1570060800, 1570147200), tzone = "UTC", tclass = "Date"), .Dim = c(21L, 
3L), .Dimnames = list(NULL, c("SP00.USA", "SP.1.USA", "SP.2.USA"
)))

return 时间序列的样本 dput 输出 tempRI:

structure(c(0.00295659400967452, -0.000872629691220261, 0.000100726912638294, 
0.00785891512466552, 0.00388982653805137, -0.00169370546773528, 
-0.00236269057182703, 0.00212999714436535, 0.000232693427461683, 
-0.000232693427461683, -0.00613601151530396, 0.00253900513908256, 
-0.00901586386319586, 0.00540587231028766, -0.001944091247152, 
-0.00561884290390235, 0.00494758931266404, -0.0137588161714284, 
-0.0196623961323645, 0.0107728408742762, 0.0133726493037134, 
0.00295659400967452, -0.000872629691220261, 0.000100726912638294, 
0.00785891512466552, 0.00325917345931082, -0.00179455699351827, 
-0.00243110601795671, 0.00209842695038276, 0.00029941614076634, 
-9.97954194730255e-05, -0.00550414199196148, 0.00253900513908256, 
-0.00901586386319586, 0.00540587231028766, -0.001944091247152, 
-0.00561884290390235, 0.00494758931266404, -0.0137588161714284, 
-0.0196623961323645, 0.0107728408742762, 0.0133726493037134, 
0.00298883607603528, -0.000805099003568621, 0.000134228188120922, 
0.00785444971143257, 0.0033236976786668, -0.00169370546773528, 
-0.00236269057182703, 0.00212999714436535, 0.000232693427461683, 
-0.000232693427461683, -0.00526667884051868, 0.00240344609471244, 
-0.00907650876598698, 0.00550263098282411, -0.00197611974611434, 
-0.00568212002020996, 0.00497781197372316, -0.0140212104959856, 
-0.019827124891334, 0.0105981832589173, 0.013540683361386), class = c("xts", 
"zoo"), .indexCLASS = "Date", .indexTZ = "UTC", tclass = "Date", tzone = "UTC", index = structure(c(1567728000, 
1567987200, 1568073600, 1568160000, 1568246400, 1568332800, 1568592000, 
1568678400, 1568764800, 1568851200, 1568937600, 1569196800, 1569283200, 
1569369600, 1569456000, 1569542400, 1569801600, 1569888000, 1569974400, 
1570060800, 1570147200), tzone = "UTC", tclass = "Date"), .Dim = c(21L, 
3L), .Dimnames = list(NULL, c("SP00.USA", "SP.1.USA", "SP.2.USA"
)))

预期输出 - 丢弃其余列 adjRI:

structure(c(0.00295659400967452, -0.000872629691220261, 0.000100726912638294, 
0.00785891512466552, 0.0033236976786668, -0.00169370546773528, 
-0.00236269057182703, 0.00212999714436535, 0.000232693427461683, 
-0.000232693427461683, -0.00613601151530396, 0.00253900513908256, 
-0.00901586386319586, 0.00540587231028766, -0.001944091247152, 
-0.00561884290390235, 0.00494758931266404, -0.0137588161714284, 
-0.0196623961323645, 0.0107728408742762, 0.0133726493037134), class = c("xts", 
"zoo"), .indexCLASS = "Date", .indexTZ = "UTC", tclass = "Date", tzone = "UTC", index = structure(c(1567728000, 
1567987200, 1568073600, 1568160000, 1568246400, 1568332800, 1568592000, 
1568678400, 1568764800, 1568851200, 1568937600, 1569196800, 1569283200, 
1569369600, 1569456000, 1569542400, 1569801600, 1569888000, 1569974400, 
1570060800, 1570147200), tzone = "UTC", tclass = "Date"), .Dim = c(21L, 
1L), .Dimnames = list(NULL, "SP00.USA"))

注:2019-09-12 的数值变化

更新: 对所需输出的快速评论:关注 SP00.USA

非常感谢对此问题的任何帮助!

是的,您可以使用 lag()。像这样把该列与其滞后值进行比较:tempContracts[,"SP00.USA"]!=lag(tempContracts[,"SP00.USA"]),即可识别发生切换的行。然后使用这个布尔索引,您可以替换 adjRI 中的值。见下面的代码,我把结果存为 test,并与您提供的 adjRI 进行对比。

library(zoo)
library(xts)

# Start from the "SP00.USA" return column, kept as a one-column xts object.
test <- tempRI[, "SP00.USA", drop = FALSE]
# Rows where the "SP00.USA" contract differs from the one on the previous row.
toChange <- tempContracts[, "SP00.USA"] !=
  lag(tempContracts[, "SP00.USA"], k = 1)
# On those rows, take the return from the "SP.2.USA" column instead.
test[toChange, 1] <- tempRI[toChange, "SP.2.USA"]
# Compare against the expected output supplied in the question.
identical(test, adjRI)

@StupidWolf - 感谢您的输入。很有帮助!

备注:

  1. 如更新中所述,对于第 1 列中合同发生变化的日期(在下面的代码中,这些日期标记在 flagRoll 中),我想确定该合同在前一个日期出现在后面的哪一列(样本中只有 SP.2.USA,但实际数据集需要检查的列数更多,且各数据集的列数不同)。

  2. 代码会逐列检查以确定出现这种情况的位置,并在变量 flagCol 中进行标记。对于这些日期,将第 1 列中的 returns 替换为新计算的 returns(计算使用 tempPI,这是与 tempRI 格式相同的价格 xts 对象;计算不重要,但我还是提供了)

# Flag the dates on which the value in "SP00.USA" (column 1) differs from the
# value on the previous date.
flagRoll <- tempContracts[, 1] != lag(tempContracts[, 1], k = 1)
# The first date has no predecessor, so the comparison yields NA there;
# treat it as "no change".
flagRoll[1] <- FALSE

# For the dates flagged in flagRoll, identify the column in which the contract
# appears and rebuild the column-1 return for those dates.
# Method:
# 1. Look at one column at a time. k starts at 3 for this specific case; it can
#    start at 2 for a more generic sample.
# 2. For each column, flagCol marks the flagRoll dates on which the contract in
#    column 1 equals the contract that column k held on the previous date.
# 3. For those dates, replace the return with the difference in logs of the
#    matching prices from tempPI (tempPI has the same layout as tempRI but
#    contains prices instead of log returns).
# 4. After all incorrect returns are replaced, save column 1 of tempRI as
#    adjRI.

# seq(3, n) counts DOWN (3, 2, ...) when n < 3, which would index a missing
# column or column 2. Dropping the first two entries of seq_len() instead
# yields an empty sequence for narrow inputs, so the loop body is skipped.
for (k in seq_len(ncol(tempContracts))[-(1:2)]) {
  # Identify the reference column for the roll. The comparison is against
  # contracts second to expire or later, which start from column 3 onwards.
  flagCol <- tempContracts[flagRoll, 1] == lag(tempContracts[, k])[flagRoll]

  # Replace the column-1 return on the matched dates: log of today's column-1
  # price minus log of the previous date's price in column k.
  tempRI[flagRoll, 1][flagCol] <-
    log(tempPI[flagRoll, 1][flagCol]) -
    log(lag(tempPI[, k])[flagRoll][flagCol])

  rm(flagCol) # to allow it to be reset for next k
}

# Keep only the adjusted column 1 of tempRI as the final series.
adjRI <- tempRI[, 1]

# Remove the working objects so the script can be re-run on another sample.
rm(k, tempPI, tempRI, tempContracts, flagRoll)