dplyr 并检查前 24-60 个月的观察结果
dplyr and checking previous 24-60 months for observations
我有一批唯一的 cusip 代码(唯一 ID),需要检查以确保每个代码都有此前 24-60 个月的观察结果,但我不确定如何使用 dplyr 进行检查。
可重现的例子:
# Example data: a single security (cusip 2) with 60 dated rows spread over
# fiscal years 1971-1975, grouped by cusip and fiscal year.
#
# The original dput() output restored the grouping through internal grouped_df
# attributes (vars / indices / labels). `vars = list(cusip, fyear)` evaluates
# bare symbols and fails with "object 'cusip' not found", so the grouping is
# rebuilt with group_by() instead of being restored from attributes.
tdata <- dplyr::group_by(
  data.frame(
    cusip = rep(2, 60L),
    # Rows per fiscal year: 12, 13, 12, 12, 11.
    fyear = rep(
      c("1971", "1972", "1973", "1974", "1975"),
      times = c(12L, 13L, 12L, 12L, 11L)
    ),
    # Dates are stored as yyyymmdd integers, one line group per fiscal year,
    # in the same row order as the original dput() output.
    datadate = c(
      19711231L, 19710129L, 19710226L, 19710331L, 19710430L, 19710528L,
      19710630L, 19710730L, 19710831L, 19710930L, 19711029L, 19711130L,
      19721231L, 19720131L, 19720229L, 19720330L, 19720428L, 19720531L,
      19720630L, 19720731L, 19720831L, 19720929L, 19721031L, 19721130L,
      19721229L,
      19731231L, 19730131L, 19730228L, 19730330L, 19730430L, 19730531L,
      19730629L, 19730731L, 19730831L, 19730928L, 19731031L, 19731130L,
      19741231L, 19740131L, 19740228L, 19740329L, 19740430L, 19740531L,
      19740628L, 19740731L, 19740830L, 19740930L, 19741031L, 19741129L,
      19751231L, 19750131L, 19750228L, 19750331L, 19750430L, 19750530L,
      19750630L, 19750731L, 19750829L, 19750930L, 19751031L
    ),
    stringsAsFactors = FALSE
  ),
  cusip, fyear
)
逻辑
我正在考虑检查每年的总月数,但不知道如何提取此前的月份来检查 24/60 是否 >= 0.4。我该如何修改这段代码,以检查此前 60 个月并确保其中至少有 24 个月的数据,包括……
# Count the distinct calendar months observed for each cusip / fiscal year.
# (Both mutate() calls were missing their closing parenthesis.)
tdata %>%
  group_by(cusip, fyear) %>%
  # Characters 5-6 of the yyyymmdd date hold the two-digit month.
  mutate(month = substr(datadate, 5, 6)) %>%
  mutate(pre_countmonths = length(unique(month)))
编辑 2015 年 4 月 7 日:
这是我使用 for 循环实现的逻辑。我在使用 R 时遇到的挑战之一,就是跳出 for 循环的思维方式。有没有办法把它改写成使用 dplyr 而不是 for 循环?对于我当前的数据,运行这段代码会花费太长时间。
# Flag every (cusip, fyear) whose four preceding fiscal years contain enough
# rows. Adds a numeric `check` column to tdata: 1 for flagged rows, else 0.
#
# Fixes relative to the first draft:
#   * the inner `for` header was missing its closing parenthesis;
#   * `tdata$check <- 1` overwrote the whole column instead of only the rows
#     belonging to cusip i / fiscal year j;
#   * rows are counted directly rather than through a `month` column that
#     tdata does not have;
#   * only the cusips / years actually present are visited, not every integer
#     between the minimum and the maximum.
#
# NOTE(review): the window is 4 prior fiscal years and the cut-off is
# rows / 40 >= 0.4 (i.e. 16 rows), while the stated goal is 24 of the previous
# 60 months - confirm the intended window and denominator.

# fyear is stored as character; convert once for the year arithmetic.
fyear_num <- as.numeric(tdata$fyear)
tdata$check <- 0

for (i in unique(tdata$cusip)) {
  same_cusip <- tdata$cusip == i
  for (j in unique(fyear_num[same_cusip])) {
    prior_rows <- same_cusip & fyear_num %in% (j - 1:4)
    target_rows <- same_cusip & fyear_num == j
    if (sum(prior_rows) / 40 >= 0.4) {
      tdata$check[target_rows] <- 1
    }
  }
}
编辑:04/08/2015 - 添加了包含主要变量的完整样本数据集
小子集:https://www.dropbox.com/s/mf0o0tbgbame6k8/testdata.csv?dl=0
这是我在限定时间内得到的。我希望这能给你一些想法,也希望其他用户提供更好的解决方案。
# Rebuild the question's sample as a plain data.frame. as_data_frame() is
# deprecated, and its result was converted to a data.frame straight away, so
# the data.frame is constructed directly.
mydf <- data.frame(
  cusip = rep(2, 60L),
  # Rows per fiscal year: 12, 13, 12, 12, 11.
  fyear = rep(
    c("1971", "1972", "1973", "1974", "1975"),
    times = c(12L, 13L, 12L, 12L, 11L)
  ),
  # Dates are stored as yyyymmdd integers, one line group per fiscal year.
  datadate = c(
    19711231L, 19710129L, 19710226L, 19710331L, 19710430L, 19710528L,
    19710630L, 19710730L, 19710831L, 19710930L, 19711029L, 19711130L,
    19721231L, 19720131L, 19720229L, 19720330L, 19720428L, 19720531L,
    19720630L, 19720731L, 19720831L, 19720929L, 19721031L, 19721130L,
    19721229L,
    19731231L, 19730131L, 19730228L, 19730330L, 19730430L, 19730531L,
    19730629L, 19730731L, 19730831L, 19730928L, 19731031L, 19731130L,
    19741231L, 19740131L, 19740228L, 19740329L, 19740430L, 19740531L,
    19740628L, 19740731L, 19740830L, 19740930L, 19741031L, 19741129L,
    19751231L, 19750131L, 19750228L, 19750331L, 19750430L, 19750530L,
    19750630L, 19750731L, 19750829L, 19750930L, 19751031L
  ),
  stringsAsFactors = FALSE
)

# Create another data frame with a new cusip
mydf2 <- mutate(mydf, cusip = 3)

# Create a new data frame which is missing one data point:
# row 4 of the cusip 3 copy (datadate 19710331) is dropped.
foo <- bind_rows(mydf, mydf2[-4, ])
在这份示例数据中,cusip 3 缺少一个月的数据。也就是说,cusip 3 没有连续 24-60 个月的数据。首先,我创建了一个月份列和一个日期对象列。然后,我按 cusip 和 datadate 对数据排序。我想选出落在这 24-60 个月区间内的数据点,这就是第一个 filter 的作用。接着我按 cusip 对数据分组,并利用月份检查数据点是否连续:lead(month) - month 的绝对值应当为 1、11 或 0。如果同一个月有两个数据点,差值就是 0——你的数据中确实存在这种情况。最后一个 filter 可以按需修改:这里我想删除所有 check 中含有 FALSE 的 cusip。在这个草稿中,这个过滤器看起来工作正常;最终结果里看不到 cusip 3 的任何数据。希望对你有帮助。
# Keep only the cusips whose rows inside the look-back window are contiguous
# from one month to the next.
mutate(foo, month = as.numeric(substr(datadate, 5, 6))) %>%
  # Rewrite yyyymmdd as the first day of its month. Backslashes must be
  # doubled inside R string literals: a bare "\d" is a parse error and a bare
  # "\1" is an octal character escape, not a back-reference.
  mutate(datadate = as.POSIXct(
    gsub("^(\\d{4})(\\d{2}).*$", "\\1-\\2-01", datadate),
    format = "%Y-%m-%d", tz = "GMT"
  )) %>%
  arrange(cusip, datadate) %>%
  # Window: from 60 to 24 thirty-day "months" (expressed in seconds) before
  # the last June row. This filter runs before group_by(), so the reference
  # row is taken from the whole sorted table, not per cusip.
  filter(between(
    datadate,
    datadate[tail(which(month == 6), n = 1)] - (60 * 60 * 24 * 30 * 60),
    datadate[tail(which(month == 6), n = 1)] - (60 * 60 * 24 * 30 * 24)
  )) %>%
  group_by(cusip) %>%
  # Consecutive rows are contiguous when the month gap is 1 (next month),
  # 11 (December -> January) or 0 (two rows in the same month). The last row
  # of each cusip has no successor, so its check is NA.
  mutate(
    check = abs(lead(month) - month) == 11 |
      abs(lead(month) - month) == 1 |
      abs(lead(month) - month) == 0
  ) %>%
  # Drop a cusip entirely as soon as one of its gaps fails; NA is tolerated.
  filter(all(check | is.na(check)))
我有一批唯一的 cusip 代码(唯一 ID),需要检查以确保每个代码都有此前 24-60 个月的观察结果,但我不确定如何使用 dplyr 进行检查。
可重现的例子:
# Example data: a single security (cusip 2) with 60 dated rows spread over
# fiscal years 1971-1975, grouped by cusip and fiscal year.
#
# The original dput() output restored the grouping through internal grouped_df
# attributes (vars / indices / labels). `vars = list(cusip, fyear)` evaluates
# bare symbols and fails with "object 'cusip' not found", so the grouping is
# rebuilt with group_by() instead of being restored from attributes.
tdata <- dplyr::group_by(
  data.frame(
    cusip = rep(2, 60L),
    # Rows per fiscal year: 12, 13, 12, 12, 11.
    fyear = rep(
      c("1971", "1972", "1973", "1974", "1975"),
      times = c(12L, 13L, 12L, 12L, 11L)
    ),
    # Dates are stored as yyyymmdd integers, one line group per fiscal year,
    # in the same row order as the original dput() output.
    datadate = c(
      19711231L, 19710129L, 19710226L, 19710331L, 19710430L, 19710528L,
      19710630L, 19710730L, 19710831L, 19710930L, 19711029L, 19711130L,
      19721231L, 19720131L, 19720229L, 19720330L, 19720428L, 19720531L,
      19720630L, 19720731L, 19720831L, 19720929L, 19721031L, 19721130L,
      19721229L,
      19731231L, 19730131L, 19730228L, 19730330L, 19730430L, 19730531L,
      19730629L, 19730731L, 19730831L, 19730928L, 19731031L, 19731130L,
      19741231L, 19740131L, 19740228L, 19740329L, 19740430L, 19740531L,
      19740628L, 19740731L, 19740830L, 19740930L, 19741031L, 19741129L,
      19751231L, 19750131L, 19750228L, 19750331L, 19750430L, 19750530L,
      19750630L, 19750731L, 19750829L, 19750930L, 19751031L
    ),
    stringsAsFactors = FALSE
  ),
  cusip, fyear
)
逻辑
我正在考虑检查每年的总月数,但不知道如何提取此前的月份来检查 24/60 是否 >= 0.4。我该如何修改这段代码,以检查此前 60 个月并确保其中至少有 24 个月的数据,包括……
# Count the distinct calendar months observed for each cusip / fiscal year.
# (Both mutate() calls were missing their closing parenthesis.)
tdata %>%
  group_by(cusip, fyear) %>%
  # Characters 5-6 of the yyyymmdd date hold the two-digit month.
  mutate(month = substr(datadate, 5, 6)) %>%
  mutate(pre_countmonths = length(unique(month)))
编辑 2015 年 4 月 7 日:
这是我使用 for 循环实现的逻辑。我在使用 R 时遇到的挑战之一,就是跳出 for 循环的思维方式。有没有办法把它改写成使用 dplyr 而不是 for 循环?对于我当前的数据,运行这段代码会花费太长时间。
# Flag every (cusip, fyear) whose four preceding fiscal years contain enough
# rows. Adds a numeric `check` column to tdata: 1 for flagged rows, else 0.
#
# Fixes relative to the first draft:
#   * the inner `for` header was missing its closing parenthesis;
#   * `tdata$check <- 1` overwrote the whole column instead of only the rows
#     belonging to cusip i / fiscal year j;
#   * rows are counted directly rather than through a `month` column that
#     tdata does not have;
#   * only the cusips / years actually present are visited, not every integer
#     between the minimum and the maximum.
#
# NOTE(review): the window is 4 prior fiscal years and the cut-off is
# rows / 40 >= 0.4 (i.e. 16 rows), while the stated goal is 24 of the previous
# 60 months - confirm the intended window and denominator.

# fyear is stored as character; convert once for the year arithmetic.
fyear_num <- as.numeric(tdata$fyear)
tdata$check <- 0

for (i in unique(tdata$cusip)) {
  same_cusip <- tdata$cusip == i
  for (j in unique(fyear_num[same_cusip])) {
    prior_rows <- same_cusip & fyear_num %in% (j - 1:4)
    target_rows <- same_cusip & fyear_num == j
    if (sum(prior_rows) / 40 >= 0.4) {
      tdata$check[target_rows] <- 1
    }
  }
}
编辑:04/08/2015 - 添加了包含主要变量的完整样本数据集
小子集:https://www.dropbox.com/s/mf0o0tbgbame6k8/testdata.csv?dl=0
这是我在限定时间内得到的。我希望这能给你一些想法,也希望其他用户提供更好的解决方案。
# Rebuild the question's sample as a plain data.frame. as_data_frame() is
# deprecated, and its result was converted to a data.frame straight away, so
# the data.frame is constructed directly.
mydf <- data.frame(
  cusip = rep(2, 60L),
  # Rows per fiscal year: 12, 13, 12, 12, 11.
  fyear = rep(
    c("1971", "1972", "1973", "1974", "1975"),
    times = c(12L, 13L, 12L, 12L, 11L)
  ),
  # Dates are stored as yyyymmdd integers, one line group per fiscal year.
  datadate = c(
    19711231L, 19710129L, 19710226L, 19710331L, 19710430L, 19710528L,
    19710630L, 19710730L, 19710831L, 19710930L, 19711029L, 19711130L,
    19721231L, 19720131L, 19720229L, 19720330L, 19720428L, 19720531L,
    19720630L, 19720731L, 19720831L, 19720929L, 19721031L, 19721130L,
    19721229L,
    19731231L, 19730131L, 19730228L, 19730330L, 19730430L, 19730531L,
    19730629L, 19730731L, 19730831L, 19730928L, 19731031L, 19731130L,
    19741231L, 19740131L, 19740228L, 19740329L, 19740430L, 19740531L,
    19740628L, 19740731L, 19740830L, 19740930L, 19741031L, 19741129L,
    19751231L, 19750131L, 19750228L, 19750331L, 19750430L, 19750530L,
    19750630L, 19750731L, 19750829L, 19750930L, 19751031L
  ),
  stringsAsFactors = FALSE
)

# Create another data frame with a new cusip
mydf2 <- mutate(mydf, cusip = 3)

# Create a new data frame which is missing one data point:
# row 4 of the cusip 3 copy (datadate 19710331) is dropped.
foo <- bind_rows(mydf, mydf2[-4, ])
在这份示例数据中,cusip 3 缺少一个月的数据。也就是说,cusip 3 没有连续 24-60 个月的数据。首先,我创建了一个月份列和一个日期对象列。然后,我按 cusip 和 datadate 对数据排序。我想选出落在这 24-60 个月区间内的数据点,这就是第一个 filter 的作用。接着我按 cusip 对数据分组,并利用月份检查数据点是否连续:lead(month) - month 的绝对值应当为 1、11 或 0。如果同一个月有两个数据点,差值就是 0——你的数据中确实存在这种情况。最后一个 filter 可以按需修改:这里我想删除所有 check 中含有 FALSE 的 cusip。在这个草稿中,这个过滤器看起来工作正常;最终结果里看不到 cusip 3 的任何数据。希望对你有帮助。
# Keep only the cusips whose rows inside the look-back window are contiguous
# from one month to the next.
mutate(foo, month = as.numeric(substr(datadate, 5, 6))) %>%
  # Rewrite yyyymmdd as the first day of its month. Backslashes must be
  # doubled inside R string literals: a bare "\d" is a parse error and a bare
  # "\1" is an octal character escape, not a back-reference.
  mutate(datadate = as.POSIXct(
    gsub("^(\\d{4})(\\d{2}).*$", "\\1-\\2-01", datadate),
    format = "%Y-%m-%d", tz = "GMT"
  )) %>%
  arrange(cusip, datadate) %>%
  # Window: from 60 to 24 thirty-day "months" (expressed in seconds) before
  # the last June row. This filter runs before group_by(), so the reference
  # row is taken from the whole sorted table, not per cusip.
  filter(between(
    datadate,
    datadate[tail(which(month == 6), n = 1)] - (60 * 60 * 24 * 30 * 60),
    datadate[tail(which(month == 6), n = 1)] - (60 * 60 * 24 * 30 * 24)
  )) %>%
  group_by(cusip) %>%
  # Consecutive rows are contiguous when the month gap is 1 (next month),
  # 11 (December -> January) or 0 (two rows in the same month). The last row
  # of each cusip has no successor, so its check is NA.
  mutate(
    check = abs(lead(month) - month) == 11 |
      abs(lead(month) - month) == 1 |
      abs(lead(month) - month) == 0
  ) %>%
  # Drop a cusip entirely as soon as one of its gaps fails; NA is tolerated.
  filter(all(check | is.na(check)))