重写一个需要永远的 for 循环
Rewrite a for-loop that takes forever
我用R写了一段代码,计算一些数据的累加和。有用。问题是,我有 25,000 个数字 X 12 个月需要 "melt",所以我最终得到 300,000 行(每个月大约会有 2000x12 行)。前六行是重新创建我的 table 样本(一个巨大的 excel 文件)。然后有一些魔法将东西转换成正确的格式,最后我有这个双 for 循环,它根据它是否是双 "PDRcount" 来计算每个月的累计和。当我在我的真实数据上尝试时,循环需要 6 个小时......我怎样才能更快地做到这一点?
library(reshape2)
PDR <- (c( 1,2,3,4,5,2))
START <- as.Date(c("2008-01-01","2007-01-01","2010-01-01","2011-01-01","2017-02-01","2017-03-01"))
SWITCHOUT <- as.Date(c(NA, "2017-02-28", NA, NA, "2017-03-31",NA))
JAN17 <- (c(100,124,165,178,0,0))
FEB17 <- (c(101,125,133,178,170,0))
MAR17 <- (c(99,0,165,180,166,99))
APR17 <- (c(100,0,156,178,0,78))
alldata <- data.frame(PDR=PDR,
START=START,
SWITCHOUT=SWITCHOUT,
JAN17=JAN17,
FEB17=FEB17,
MAR17=MAR17,
APR17=APR17)
## count PDR occurrences
alldata$PDRcount <- ave(alldata$PDR,alldata$PDR,FUN=length)
alldata$PDRcount <- as.numeric(alldata$PDRcount)
crossdata<-melt(alldata,id=(c("PDR", "START","SWITCHOUT","PDRcount" )))
colnames(crossdata) <- c("PDR","START","SWITCHOUT","PDRcount","MONTH","SMC")
## transform levels to date format
levels(crossdata$MONTH)[1] <- "2017-01-01"
levels(crossdata$MONTH)[2] <- "2017-02-01"
levels(crossdata$MONTH)[3] <- "2017-03-01"
levels(crossdata$MONTH)[4] <- "2017-04-01"
crossdata$MONTH <- as.Date(crossdata$MONTH,format = "%Y-%m-%d" )
for (pdr in crossdata[,"PDR"]){
maxPDR <- max(crossdata$PDRcount[crossdata$PDR == pdr])
dates <- unique(crossdata$START[crossdata$PDR == pdr])
for (i in 1:maxPDR) {
CumSum <- cumsum( crossdata$SMC[crossdata$PDR == pdr & crossdata$START == dates[i]] )
crossdata$SMCcum[crossdata$PDR == pdr & crossdata$START == dates[i] & crossdata$MONTH == "2017-01-01"] <- CumSum[1]
crossdata$SMCcum[crossdata$PDR == pdr & crossdata$START == dates[i] & crossdata$MONTH == "2017-02-01"] <- CumSum[2]
crossdata$SMCcum[crossdata$PDR == pdr & crossdata$START == dates[i] & crossdata$MONTH == "2017-03-01"] <- CumSum[3]
crossdata$SMCcum[crossdata$PDR == pdr & crossdata$START == dates[i] & crossdata$MONTH == "2017-04-01"] <- CumSum[4]
}
}
已编辑:抱歉出现错误...
这是部分答案。我不明白“...基于它是否是双 "PDRcount"” 部分。
这里是 PDR !=2 使用 dplyr
库的部分答案。我还通过在任何计算之前对您的 crossdata 变量使用 dput 来简化数据输入。
crossdata1<-structure(list(PDR = c(1, 2, 3, 4, 5, 2, 1, 2, 3, 4, 5, 2, 1,
2, 3, 4, 5, 2, 1, 2, 3, 4, 5, 2),
START = structure(c(13879, 13514, 14610, 14975, 17198, 17226, 13879, 13514, 14610, 14975,
17198, 17226, 13879, 13514, 14610, 14975, 17198, 17226, 13879,
13514, 14610, 14975, 17198, 17226), class = "Date"),
SWITCHOUT = structure(c(NA, 17225, NA, NA, 17256, NA, NA, 17225, NA, NA, 17256, NA, NA, 17225,
NA, NA, 17256, NA, NA, 17225, NA, NA, 17256, NA), class = "Date"),
PDRcount = c(1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2),
MONTH = structure(c(17167, 17167,
17167, 17167, 17167, 17167, 17198, 17198, 17198, 17198, 17198,
17198, 17226, 17226, 17226, 17226, 17226, 17226, 17257, 17257,
17257, 17257, 17257, 17257), class = "Date"),
SMC = c(100, 124, 165, 178, 0, 0, 101, 125, 133, 178, 170, 0, 99, 0, 165,
180, 166, 99, 100, 0, 156, 178, 0, 78)),
row.names = c(NA, -24L), .Names = c("PDR", "START", "SWITCHOUT", "PDRcount", "MONTH", "SMC"),
class = "data.frame")
#test to see if starting data is the same
identical(crossdata, crossdata1)
library(dplyr)
#group by and add the cumsum column to answer dataframe
ans<-group_by(crossdata1, PDR) %>%
mutate(SMCcum = cumsum(SMC))
#rows where the 2 final dataframes do not match
crossdata[-which(crossdata$SMCcum== ans$SMCcum),]
如果应用附加过滤器来删除“...double "PDRcount" or not”的情况,上述行很可能会起作用。适用。
我发现这个 post 有用:
祝你好运。
您不断地覆盖您的结果。
一个明显的改进是循环 unique(crossdata[,"PDR"])
而不是为每一行调用循环。
我不确定你的内部循环是否为 maxPDR > 1
提供了预期的结果你不断地覆盖 START
匹配第 maxPDR
个 dates
条目的值- 请注意,您没有对 dates
进行排序,因此无法保证 dates[maxPDR]
是最大的(最新的)条目。
我在 dplyr
中编写了一个替代解决方案,其中包含两个步骤以简化转换为所需格式的过程。
alldata <- data.frame(PDR=PDR, START=START, SWITCHOUT=SWITCHOUT, JAN17=JAN17,
FEB17=FEB17, MAR17=MAR17, APR17=APR17)
library(dplyr)
library(tidyr) # to reshape the data
crossdata_2 <- alldata %>%
gather(MONTH,SMC,ends_with("17")) %>%
mutate(MONTH = as.character(strptime(paste0(MONTH,"-01"), format = "%b%y-%d"))) %>%
# the following line adds your PDRcount but is unnecessary for further computation
group_by(PDR) %>% mutate(PDRcount = n_distinct(START)) %>%
group_by(PDR,START) %>% mutate(SMCcum = cumsum(SMC))
请注意,我为每个 PDR
和 START
计算了 cumsum()
。如果每个 PDR
只需要一个结果,则只需添加一个合适的过滤器即可。
我想指出 strptime
中的缩写月份转换 %b
是特定于语言环境的。要正常工作,您可能必须更改 LC_TIME
.
我用R写了一段代码,计算一些数据的累加和。有用。问题是,我有 25,000 个数字 X 12 个月需要 "melt",所以我最终得到 300,000 行(每个月大约会有 2000x12 行)。前六行是重新创建我的 table 样本(一个巨大的 excel 文件)。然后有一些魔法将东西转换成正确的格式,最后我有这个双 for 循环,它根据它是否是双 "PDRcount" 来计算每个月的累计和。当我在我的真实数据上尝试时,循环需要 6 个小时......我怎样才能更快地做到这一点?
library(reshape2)
PDR <- (c( 1,2,3,4,5,2))
START <- as.Date(c("2008-01-01","2007-01-01","2010-01-01","2011-01-01","2017-02-01","2017-03-01"))
SWITCHOUT <- as.Date(c(NA, "2017-02-28", NA, NA, "2017-03-31",NA))
JAN17 <- (c(100,124,165,178,0,0))
FEB17 <- (c(101,125,133,178,170,0))
MAR17 <- (c(99,0,165,180,166,99))
APR17 <- (c(100,0,156,178,0,78))
alldata <- data.frame(PDR=PDR,
START=START,
SWITCHOUT=SWITCHOUT,
JAN17=JAN17,
FEB17=FEB17,
MAR17=MAR17,
APR17=APR17)
## count PDR occurrences
alldata$PDRcount <- ave(alldata$PDR,alldata$PDR,FUN=length)
alldata$PDRcount <- as.numeric(alldata$PDRcount)
crossdata<-melt(alldata,id=(c("PDR", "START","SWITCHOUT","PDRcount" )))
colnames(crossdata) <- c("PDR","START","SWITCHOUT","PDRcount","MONTH","SMC")
## transform levels to date format
levels(crossdata$MONTH)[1] <- "2017-01-01"
levels(crossdata$MONTH)[2] <- "2017-02-01"
levels(crossdata$MONTH)[3] <- "2017-03-01"
levels(crossdata$MONTH)[4] <- "2017-04-01"
crossdata$MONTH <- as.Date(crossdata$MONTH,format = "%Y-%m-%d" )
for (pdr in crossdata[,"PDR"]){
maxPDR <- max(crossdata$PDRcount[crossdata$PDR == pdr])
dates <- unique(crossdata$START[crossdata$PDR == pdr])
for (i in 1:maxPDR) {
CumSum <- cumsum( crossdata$SMC[crossdata$PDR == pdr & crossdata$START == dates[i]] )
crossdata$SMCcum[crossdata$PDR == pdr & crossdata$START == dates[i] & crossdata$MONTH == "2017-01-01"] <- CumSum[1]
crossdata$SMCcum[crossdata$PDR == pdr & crossdata$START == dates[i] & crossdata$MONTH == "2017-02-01"] <- CumSum[2]
crossdata$SMCcum[crossdata$PDR == pdr & crossdata$START == dates[i] & crossdata$MONTH == "2017-03-01"] <- CumSum[3]
crossdata$SMCcum[crossdata$PDR == pdr & crossdata$START == dates[i] & crossdata$MONTH == "2017-04-01"] <- CumSum[4]
}
}
已编辑:抱歉出现错误...
这是部分答案。我不明白“...基于它是否是双 "PDRcount"” 部分。
这里是 PDR !=2 使用 dplyr
库的部分答案。我还通过在任何计算之前对您的 crossdata 变量使用 dput 来简化数据输入。
crossdata1<-structure(list(PDR = c(1, 2, 3, 4, 5, 2, 1, 2, 3, 4, 5, 2, 1,
2, 3, 4, 5, 2, 1, 2, 3, 4, 5, 2),
START = structure(c(13879, 13514, 14610, 14975, 17198, 17226, 13879, 13514, 14610, 14975,
17198, 17226, 13879, 13514, 14610, 14975, 17198, 17226, 13879,
13514, 14610, 14975, 17198, 17226), class = "Date"),
SWITCHOUT = structure(c(NA, 17225, NA, NA, 17256, NA, NA, 17225, NA, NA, 17256, NA, NA, 17225,
NA, NA, 17256, NA, NA, 17225, NA, NA, 17256, NA), class = "Date"),
PDRcount = c(1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2),
MONTH = structure(c(17167, 17167,
17167, 17167, 17167, 17167, 17198, 17198, 17198, 17198, 17198,
17198, 17226, 17226, 17226, 17226, 17226, 17226, 17257, 17257,
17257, 17257, 17257, 17257), class = "Date"),
SMC = c(100, 124, 165, 178, 0, 0, 101, 125, 133, 178, 170, 0, 99, 0, 165,
180, 166, 99, 100, 0, 156, 178, 0, 78)),
row.names = c(NA, -24L), .Names = c("PDR", "START", "SWITCHOUT", "PDRcount", "MONTH", "SMC"),
class = "data.frame")
#test to see if starting data is the same
identical(crossdata, crossdata1)
library(dplyr)
#group by and add the cumsum column to answer dataframe
ans<-group_by(crossdata1, PDR) %>%
mutate(SMCcum = cumsum(SMC))
#rows where the 2 final dataframes do not match
crossdata[-which(crossdata$SMCcum== ans$SMCcum),]
如果应用附加过滤器来删除“...double "PDRcount" or not”的情况,上述行很可能会起作用。适用。
我发现这个 post 有用:
祝你好运。
您不断地覆盖您的结果。
一个明显的改进是循环 unique(crossdata[,"PDR"])
而不是为每一行调用循环。
我不确定你的内部循环是否为 maxPDR > 1
提供了预期的结果你不断地覆盖 START
匹配第 maxPDR
个 dates
条目的值- 请注意,您没有对 dates
进行排序,因此无法保证 dates[maxPDR]
是最大的(最新的)条目。
我在 dplyr
中编写了一个替代解决方案,其中包含两个步骤以简化转换为所需格式的过程。
alldata <- data.frame(PDR=PDR, START=START, SWITCHOUT=SWITCHOUT, JAN17=JAN17,
FEB17=FEB17, MAR17=MAR17, APR17=APR17)
library(dplyr)
library(tidyr) # to reshape the data
crossdata_2 <- alldata %>%
gather(MONTH,SMC,ends_with("17")) %>%
mutate(MONTH = as.character(strptime(paste0(MONTH,"-01"), format = "%b%y-%d"))) %>%
# the following line adds your PDRcount but is unnecessary for further computation
group_by(PDR) %>% mutate(PDRcount = n_distinct(START)) %>%
group_by(PDR,START) %>% mutate(SMCcum = cumsum(SMC))
请注意,我为每个 PDR
和 START
计算了 cumsum()
。如果每个 PDR
只需要一个结果,则只需添加一个合适的过滤器即可。
我想指出 strptime
中的缩写月份转换 %b
是特定于语言环境的。要正常工作,您可能必须更改 LC_TIME
.