根据 R 中的条件操作向量
Manipulating a vector based on a condition in R
我正在尝试寻找一种高效的方法来执行下面的代码片段,但需要在一个相当大的数据框(超过 200 万条记录)上执行。我试过使用 `sapply` 和 `for` 循环,但效果都不理想。
'ACCOUNT_CLOSE_DATE' 恰好是空白的,但它是给定 'U_ID' 的最后一个 'AS_OF_DATE'。我想将此特定值插入到 'U_ID' 的 'ACCOUNT_CLOSE_DATE' 列中。任何帮助将不胜感激!
library(reshape2)

# Wide example table: a "Date" column followed by one blank column per U_ID.
cnames <- c("Date", "999:999", "888:888", "777:777")
dates <- c("31JAN2005", "28FEB2005", "31MAR2005")
val <- ""
blank_cols <- matrix(val, nrow = 3, ncol = 3)
mydf <- data.frame(cbind(dates, blank_cols))
names(mydf) <- cnames

# Reshape to long format: one row per (date, account) pair.
nmydf <- melt(mydf, id.vars = "Date")
nmydf$Date <- as.character(nmydf$Date)
names(nmydf) <- c("AS_OF_DATE", "U_ID", "ACCOUNT_CLOSE_DATE")

uvals <- c("999:999", "888:888", "777:777")

# For the first account only: copy its last AS_OF_DATE (in row order) into
# every ACCOUNT_CLOSE_DATE cell that belongs to that account.
first_id_rows <- nmydf$U_ID == uvals[1]
first_id_dates <- nmydf[first_id_rows, "AS_OF_DATE"]
nmydf[first_id_rows, "ACCOUNT_CLOSE_DATE"] <- tail(first_id_dates, 1)
您可以使用 data.table 做一次连接(join):先求出每个 "U_ID" 分组的截止日期,再把它合并回原表,如下所示:
library(data.table)

# Wide example table: a "Date" column followed by one blank column per U_ID.
cnames <- c("Date", "999:999", "888:888", "777:777")
dates <- c("31JAN2005", "28FEB2005", "31MAR2005")
val <- ""
mydt <- data.table(cbind(dates, matrix(val, nrow = 3, ncol = 3)))
setnames(mydt, cnames)

# Long format, keeping only the date and the account identifier; the blank
# value column produced by melt() is dropped.
long_dt <- melt(mydt, id.vars = "Date")
nmydt <- long_dt[, .(AS_OF_DATE = Date, U_ID = variable)]

# One row per U_ID holding the last AS_OF_DATE of that group (by row order).
closing_dates <- nmydt[, .(ACCOUNT_CLOSE_DATE = AS_OF_DATE[.N]), by = U_ID]

# Left join: attach each account's closing date to all of its rows.
merge(nmydt, closing_dates, by = "U_ID", all.x = TRUE)
#       U_ID AS_OF_DATE ACCOUNT_CLOSE_DATE
# 1: 999:999  31JAN2005          31MAR2005
# 2: 999:999  28FEB2005          31MAR2005
# 3: 999:999  31MAR2005          31MAR2005
# 4: 888:888  31JAN2005          31MAR2005
# 5: 888:888  28FEB2005          31MAR2005
# 6: 888:888  31MAR2005          31MAR2005
# 7: 777:777  31JAN2005          31MAR2005
# 8: 777:777  28FEB2005          31MAR2005
# 9: 777:777  31MAR2005          31MAR2005
此解决方案还应提供效率提升。
我正在尝试寻找一种高效的方法来执行下面的代码片段,但需要在一个相当大的数据框(超过 200 万条记录)上执行。我试过使用 `sapply` 和 `for` 循环,但效果都不理想。
'ACCOUNT_CLOSE_DATE' 恰好是空白的,但它是给定 'U_ID' 的最后一个 'AS_OF_DATE'。我想将此特定值插入到 'U_ID' 的 'ACCOUNT_CLOSE_DATE' 列中。任何帮助将不胜感激!
library(reshape2)

# Wide example table: a "Date" column followed by one blank column per U_ID.
cnames <- c("Date", "999:999", "888:888", "777:777")
dates <- c("31JAN2005", "28FEB2005", "31MAR2005")
val <- ""
blank_cols <- matrix(val, nrow = 3, ncol = 3)
mydf <- data.frame(cbind(dates, blank_cols))
names(mydf) <- cnames

# Reshape to long format: one row per (date, account) pair.
nmydf <- melt(mydf, id.vars = "Date")
nmydf$Date <- as.character(nmydf$Date)
names(nmydf) <- c("AS_OF_DATE", "U_ID", "ACCOUNT_CLOSE_DATE")

uvals <- c("999:999", "888:888", "777:777")

# For the first account only: copy its last AS_OF_DATE (in row order) into
# every ACCOUNT_CLOSE_DATE cell that belongs to that account.
first_id_rows <- nmydf$U_ID == uvals[1]
first_id_dates <- nmydf[first_id_rows, "AS_OF_DATE"]
nmydf[first_id_rows, "ACCOUNT_CLOSE_DATE"] <- tail(first_id_dates, 1)
您可以使用 data.table 做一次连接(join):先求出每个 "U_ID" 分组的截止日期,再把它合并回原表,如下所示:
library(data.table)

# Wide example table: a "Date" column followed by one blank column per U_ID.
cnames <- c("Date", "999:999", "888:888", "777:777")
dates <- c("31JAN2005", "28FEB2005", "31MAR2005")
val <- ""
mydt <- data.table(cbind(dates, matrix(val, nrow = 3, ncol = 3)))
setnames(mydt, cnames)

# Long format, keeping only the date and the account identifier; the blank
# value column produced by melt() is dropped.
long_dt <- melt(mydt, id.vars = "Date")
nmydt <- long_dt[, .(AS_OF_DATE = Date, U_ID = variable)]

# One row per U_ID holding the last AS_OF_DATE of that group (by row order).
closing_dates <- nmydt[, .(ACCOUNT_CLOSE_DATE = AS_OF_DATE[.N]), by = U_ID]

# Left join: attach each account's closing date to all of its rows.
merge(nmydt, closing_dates, by = "U_ID", all.x = TRUE)
#       U_ID AS_OF_DATE ACCOUNT_CLOSE_DATE
# 1: 999:999  31JAN2005          31MAR2005
# 2: 999:999  28FEB2005          31MAR2005
# 3: 999:999  31MAR2005          31MAR2005
# 4: 888:888  31JAN2005          31MAR2005
# 5: 888:888  28FEB2005          31MAR2005
# 6: 888:888  31MAR2005          31MAR2005
# 7: 777:777  31JAN2005          31MAR2005
# 8: 777:777  28FEB2005          31MAR2005
# 9: 777:777  31MAR2005          31MAR2005
此解决方案还应提供效率提升。