如何在 R 中对维度不同且含有缺失数据的数据框做除法
How to divide unequal dataframes in R with missing data
我有两个维度不同的数据框,想用其中一个除以另一个。我的原始数据框列数很多(超过 4000 列),因此逐列命名会很麻烦。此外,如下所示,一个数据框中的某一列在另一个数据框中可能没有对应的列。在列名 A.S 中,A 是公司名称,S 表示公司 A 的价格数据;A.V 则是公司 A 的成交量数据。还有一个重要细节:我的样本期间是 2000-2014 年。因此,如果公司 A 从 2002 年才开始交易,它在 2000 年和 2001 年的数据就是 NA。我应该如何处理这种情况?
df1<- S
Date A.S B.S C.S
01/01/2000 1 10 19
02/01/2000 2 11 20
03/01/2000 3 12 21
04/01/2000 NA 13 22
05/01/2000 NA 14 23
06/01/2000 NA NA 24
07/01/2000 7 NA 25
08/01/2000 8 NA 26
09/01/2000 9 18 27
The other dataframe
df2<-V
Date A.V B.V
01/01/2000 12 NA
02/01/2000 12 NA
03/01/2000 12 3
04/01/2000 12 4
05/01/2000 12 5
06/01/2000 NA 6
07/01/2000 NA 7
08/01/2000 NA 8
09/01/2000 NA 9
期望得到的结果如下。
df3<-df1/df2
Date A B C
01/01/2000 0.08 NA NA
02/01/2000 0.17 NA NA
03/01/2000 0.25 4 NA
04/01/2000 NA 3 NA
05/01/2000 NA 3 NA
06/01/2000 NA NA NA
07/01/2000 NA NA NA
08/01/2000 NA NA NA
09/01/2000 NA 2 NA
非常感谢您的帮助
有几种方法可以解决这个问题。一种方法是使用正则表达式统一两个数据框的列名(我为此创建了 'edit' 数据框,您当然也可以直接在原始数据上执行此操作)。
# Strip the ".S" / ".V" suffix so both data frames use the bare company name.
# The dot is escaped ("\\.") so the pattern is a valid R string that matches a
# literal ".", and it is anchored with "$" so only a trailing suffix is
# removed (a company whose name merely contains ".S" or ".V" is left intact).
df1_edit <- df1
colnames(df1_edit) <- sub("\\.S$", "", colnames(df1_edit))
df2_edit <- df2
colnames(df2_edit) <- sub("\\.V$", "", colnames(df2_edit))

# All company columns that must exist in the result, excluding the first
# column of each data frame ("Date" in the sample data).
all_cols <- unique(c(colnames(df1_edit)[-1], colnames(df2_edit)[-1]))

# Add the companies missing from either side as all-NA numeric columns, so
# that any division involving them yields NA.
df1_edit[, setdiff(all_cols, colnames(df1_edit))] <- NA_real_
df2_edit[, setdiff(all_cols, colnames(df2_edit))] <- NA_real_

# Divide column by column. Indexing both sides with all_cols guarantees the
# same column order; drop = FALSE keeps a data frame even when there is only
# one company. Rows are paired by position, so both data frames must hold the
# same dates in the same order.
res <- cbind(
  Date = df1_edit$Date,
  df1_edit[, all_cols, drop = FALSE] / df2_edit[, all_cols, drop = FALSE]
)
> res
Date A B C
1 01/01/2000 0.08333333 NA NA
2 02/01/2000 0.16666667 NA NA
3 03/01/2000 0.25000000 4.00 NA
4 04/01/2000 NA 3.25 NA
5 05/01/2000 NA 2.80 NA
6 06/01/2000 NA NA NA
7 07/01/2000 NA NA NA
8 08/01/2000 NA NA NA
9 09/01/2000 NA 2.00 NA
另一种方法是对数据进行重塑(reshape)。首先,把两个数据框都转换成长格式,并处理 'variable' 变量;然后进行合并(all = TRUE 会生成所需的 NA)、相除,最后再重塑回宽格式。
library(data.table)
# Reshape both data frames to long format (one row per Date/company).
# as.data.table() works on a copy, so df1/df2 themselves are not converted to
# data.tables by reference the way setDT() would. The melted value column
# keeps its default name "value" on both sides.
df1_l <- melt(as.data.table(df1), id.vars = "Date")
df2_l <- melt(as.data.table(df2), id.vars = "Date")

# Derive the bare company name from the melted column name. The dot is
# escaped ("\\.") so the pattern is a valid R string, and anchored with "$" so
# only a trailing ".S" / ".V" suffix is removed.
df1_l$var <- sub("\\.S$", "", df1_l$variable)
df2_l$var <- sub("\\.V$", "", df2_l$variable)

# Full outer join: a Date/company pair present on only one side gets NA on
# the other. merge() suffixes the clashing "value" columns, so value.x comes
# from df1 and value.y from df2.
df_merge <- merge(df1_l, df2_l, by = c("Date", "var"), all = TRUE)
df_merge$res <- df_merge$value.x / df_merge$value.y

# Back to wide format: one column per company.
res <- dcast(df_merge, Date ~ var, value.var = "res")
> res
Date A B C
1: 01/01/2000 0.08333333 NA NA
2: 02/01/2000 0.16666667 NA NA
3: 03/01/2000 0.25000000 4.00 NA
4: 04/01/2000 NA 3.25 NA
5: 05/01/2000 NA 2.80 NA
6: 06/01/2000 NA NA NA
7: 07/01/2000 NA NA NA
8: 08/01/2000 NA NA NA
9: 09/01/2000 NA 2.00 NA
也可以考虑 mapply 方案:先找出两个数据框共有的列和不同的列,再对共有的列逐列相除:
# OBTAIN SAME/DIFFERENT COLUMNS USING REGEX FOR SUFFIX
# Bare company names of each data frame: the first column (the date) is
# skipped and the series suffix is stripped. "\\." escapes the dot so the
# pattern is a valid R string and matches literal dots. Each data frame is
# indexed by its own columns ([-1]) rather than by the other one's ncol().
# NOTE(review): the volume suffix "TURNOVER.BY.VOLUME" does not appear in the
# sample data of the question (which uses ".V") -- confirm against the real
# column names.
price_companies <- gsub("\\.*S$", "", names(df1)[-1])
volume_companies <- gsub("\\.*TURNOVER.BY.VOLUME$", "", names(df2)[-1])

# Companies present in both data frames (these can be divided) ...
samecols <- intersect(price_companies, volume_companies)
# ... and companies found in df1 but not in df2 (no ratio can be computed).
diffcols <- setdiff(price_companies, volume_companies)
rm(price_companies, volume_companies)
# ELEMENT-WISE DIVISION HELPER
# Returns var1 divided by var2 using R's vectorised "/" operator, so an NA in
# either input gives NA at that position of the result.
divfct <- function(var1, var2) {
  var1 / var2
}
# MAPPLY USING DIV FUNCTION
# Column names of the two series for every company found in both data frames.
# NOTE(review): "...TURNOVER.BY.VOLUME" is not the suffix used by the sample
# data in the question (".V") -- confirm against the real column names.
price_cols <- paste0(samecols, ".S")
volume_cols <- paste0(samecols, "...TURNOVER.BY.VOLUME")

# Divide each df1 column by its matching df2 column. drop = FALSE keeps a
# data frame even for a single company (otherwise mapply would iterate over
# the elements of one vector), and SIMPLIFY = FALSE returns one list element
# per company instead of a shape that depends on the input.
fctresults <- as.data.frame(
  mapply(divfct,
         var1 = df1[, price_cols, drop = FALSE],
         var2 = df2[, volume_cols, drop = FALSE],
         SIMPLIFY = FALSE)
)
# Name the ratios by bare company name, consistent with the all-NA columns
# added for diffcols below.
names(fctresults) <- samecols

# MONTHLY DATES: 2000-2014
# First day of every month, written with zero-padded month and day
# (e.g. "01/01/2000") so the keys are spelled like the dates in the sample
# data; unpadded keys such as "1/1/2000" would never match in the merge.
# NOTE(review): assumes the data's Date strings are month/day/year -- confirm.
datedf <- data.frame(
  Date = format(
    seq(as.Date("2000-01-01"), as.Date("2014-12-01"), by = "month"),
    "%m/%d/%Y"
  )
)

# MERGE DATE AND DIV FUNCTION RESULTS
finaldf <- data.frame(Date = df1[["Date"]], fctresults, check.names = FALSE)
finaldf <- merge(datedf, finaldf, by = "Date", all = TRUE)

# Parse the key into a Date so rows sort chronologically rather than
# alphabetically. A Date column is used instead of the POSIXlt returned by
# strptime(), which is a list-based class ill-suited to data frame columns.
finaldf$Date <- as.Date(finaldf$Date, format = "%m/%d/%Y")
finaldf <- finaldf[order(finaldf$Date), ]
row.names(finaldf) <- NULL # RESET ROW NAMES

# Companies in diffcols get an all-NA numeric column.
for (company in diffcols) {
  finaldf[[company]] <- NA_real_
}

# REMOVE TEMP OBJECTS
rm(company, diffcols, samecols, fctresults, divfct, datedf,
   price_cols, volume_cols)
我有两个维度不同的数据框,想用其中一个除以另一个。我的原始数据框列数很多(超过 4000 列),因此逐列命名会很麻烦。此外,如下所示,一个数据框中的某一列在另一个数据框中可能没有对应的列。在列名 A.S 中,A 是公司名称,S 表示公司 A 的价格数据;A.V 则是公司 A 的成交量数据。还有一个重要细节:我的样本期间是 2000-2014 年。因此,如果公司 A 从 2002 年才开始交易,它在 2000 年和 2001 年的数据就是 NA。我应该如何处理这种情况?
df1<- S
Date A.S B.S C.S
01/01/2000 1 10 19
02/01/2000 2 11 20
03/01/2000 3 12 21
04/01/2000 NA 13 22
05/01/2000 NA 14 23
06/01/2000 NA NA 24
07/01/2000 7 NA 25
08/01/2000 8 NA 26
09/01/2000 9 18 27
The other dataframe
df2<-V
Date A.V B.V
01/01/2000 12 NA
02/01/2000 12 NA
03/01/2000 12 3
04/01/2000 12 4
05/01/2000 12 5
06/01/2000 NA 6
07/01/2000 NA 7
08/01/2000 NA 8
09/01/2000 NA 9
期望得到的结果如下。
df3<-df1/df2
Date A B C
01/01/2000 0.08 NA NA
02/01/2000 0.17 NA NA
03/01/2000 0.25 4 NA
04/01/2000 NA 3 NA
05/01/2000 NA 3 NA
06/01/2000 NA NA NA
07/01/2000 NA NA NA
08/01/2000 NA NA NA
09/01/2000 NA 2 NA
非常感谢您的帮助
有几种方法可以解决这个问题。一种方法是使用正则表达式统一两个数据框的列名(我为此创建了 'edit' 数据框,您当然也可以直接在原始数据上执行此操作)。
# Strip the ".S" / ".V" suffix so both data frames use the bare company name.
# The dot is escaped ("\\.") so the pattern is a valid R string that matches a
# literal ".", and it is anchored with "$" so only a trailing suffix is
# removed (a company whose name merely contains ".S" or ".V" is left intact).
df1_edit <- df1
colnames(df1_edit) <- sub("\\.S$", "", colnames(df1_edit))
df2_edit <- df2
colnames(df2_edit) <- sub("\\.V$", "", colnames(df2_edit))

# All company columns that must exist in the result, excluding the first
# column of each data frame ("Date" in the sample data).
all_cols <- unique(c(colnames(df1_edit)[-1], colnames(df2_edit)[-1]))

# Add the companies missing from either side as all-NA numeric columns, so
# that any division involving them yields NA.
df1_edit[, setdiff(all_cols, colnames(df1_edit))] <- NA_real_
df2_edit[, setdiff(all_cols, colnames(df2_edit))] <- NA_real_

# Divide column by column. Indexing both sides with all_cols guarantees the
# same column order; drop = FALSE keeps a data frame even when there is only
# one company. Rows are paired by position, so both data frames must hold the
# same dates in the same order.
res <- cbind(
  Date = df1_edit$Date,
  df1_edit[, all_cols, drop = FALSE] / df2_edit[, all_cols, drop = FALSE]
)
> res
Date A B C
1 01/01/2000 0.08333333 NA NA
2 02/01/2000 0.16666667 NA NA
3 03/01/2000 0.25000000 4.00 NA
4 04/01/2000 NA 3.25 NA
5 05/01/2000 NA 2.80 NA
6 06/01/2000 NA NA NA
7 07/01/2000 NA NA NA
8 08/01/2000 NA NA NA
9 09/01/2000 NA 2.00 NA
另一种方法是对数据进行重塑(reshape)。首先,把两个数据框都转换成长格式,并处理 'variable' 变量;然后进行合并(all = TRUE 会生成所需的 NA)、相除,最后再重塑回宽格式。
library(data.table)
# Reshape both data frames to long format (one row per Date/company).
# as.data.table() works on a copy, so df1/df2 themselves are not converted to
# data.tables by reference the way setDT() would. The melted value column
# keeps its default name "value" on both sides.
df1_l <- melt(as.data.table(df1), id.vars = "Date")
df2_l <- melt(as.data.table(df2), id.vars = "Date")

# Derive the bare company name from the melted column name. The dot is
# escaped ("\\.") so the pattern is a valid R string, and anchored with "$" so
# only a trailing ".S" / ".V" suffix is removed.
df1_l$var <- sub("\\.S$", "", df1_l$variable)
df2_l$var <- sub("\\.V$", "", df2_l$variable)

# Full outer join: a Date/company pair present on only one side gets NA on
# the other. merge() suffixes the clashing "value" columns, so value.x comes
# from df1 and value.y from df2.
df_merge <- merge(df1_l, df2_l, by = c("Date", "var"), all = TRUE)
df_merge$res <- df_merge$value.x / df_merge$value.y

# Back to wide format: one column per company.
res <- dcast(df_merge, Date ~ var, value.var = "res")
> res
Date A B C
1: 01/01/2000 0.08333333 NA NA
2: 02/01/2000 0.16666667 NA NA
3: 03/01/2000 0.25000000 4.00 NA
4: 04/01/2000 NA 3.25 NA
5: 05/01/2000 NA 2.80 NA
6: 06/01/2000 NA NA NA
7: 07/01/2000 NA NA NA
8: 08/01/2000 NA NA NA
9: 09/01/2000 NA 2.00 NA
也可以考虑 mapply 方案:先找出两个数据框共有的列和不同的列,再对共有的列逐列相除:
# OBTAIN SAME/DIFFERENT COLUMNS USING REGEX FOR SUFFIX
# Bare company names of each data frame: the first column (the date) is
# skipped and the series suffix is stripped. "\\." escapes the dot so the
# pattern is a valid R string and matches literal dots. Each data frame is
# indexed by its own columns ([-1]) rather than by the other one's ncol().
# NOTE(review): the volume suffix "TURNOVER.BY.VOLUME" does not appear in the
# sample data of the question (which uses ".V") -- confirm against the real
# column names.
price_companies <- gsub("\\.*S$", "", names(df1)[-1])
volume_companies <- gsub("\\.*TURNOVER.BY.VOLUME$", "", names(df2)[-1])

# Companies present in both data frames (these can be divided) ...
samecols <- intersect(price_companies, volume_companies)
# ... and companies found in df1 but not in df2 (no ratio can be computed).
diffcols <- setdiff(price_companies, volume_companies)
rm(price_companies, volume_companies)
# ELEMENT-WISE DIVISION HELPER
# Returns var1 divided by var2 using R's vectorised "/" operator, so an NA in
# either input gives NA at that position of the result.
divfct <- function(var1, var2) {
  var1 / var2
}
# MAPPLY USING DIV FUNCTION
# Column names of the two series for every company found in both data frames.
# NOTE(review): "...TURNOVER.BY.VOLUME" is not the suffix used by the sample
# data in the question (".V") -- confirm against the real column names.
price_cols <- paste0(samecols, ".S")
volume_cols <- paste0(samecols, "...TURNOVER.BY.VOLUME")

# Divide each df1 column by its matching df2 column. drop = FALSE keeps a
# data frame even for a single company (otherwise mapply would iterate over
# the elements of one vector), and SIMPLIFY = FALSE returns one list element
# per company instead of a shape that depends on the input.
fctresults <- as.data.frame(
  mapply(divfct,
         var1 = df1[, price_cols, drop = FALSE],
         var2 = df2[, volume_cols, drop = FALSE],
         SIMPLIFY = FALSE)
)
# Name the ratios by bare company name, consistent with the all-NA columns
# added for diffcols below.
names(fctresults) <- samecols

# MONTHLY DATES: 2000-2014
# First day of every month, written with zero-padded month and day
# (e.g. "01/01/2000") so the keys are spelled like the dates in the sample
# data; unpadded keys such as "1/1/2000" would never match in the merge.
# NOTE(review): assumes the data's Date strings are month/day/year -- confirm.
datedf <- data.frame(
  Date = format(
    seq(as.Date("2000-01-01"), as.Date("2014-12-01"), by = "month"),
    "%m/%d/%Y"
  )
)

# MERGE DATE AND DIV FUNCTION RESULTS
finaldf <- data.frame(Date = df1[["Date"]], fctresults, check.names = FALSE)
finaldf <- merge(datedf, finaldf, by = "Date", all = TRUE)

# Parse the key into a Date so rows sort chronologically rather than
# alphabetically. A Date column is used instead of the POSIXlt returned by
# strptime(), which is a list-based class ill-suited to data frame columns.
finaldf$Date <- as.Date(finaldf$Date, format = "%m/%d/%Y")
finaldf <- finaldf[order(finaldf$Date), ]
row.names(finaldf) <- NULL # RESET ROW NAMES

# Companies in diffcols get an all-NA numeric column.
for (company in diffcols) {
  finaldf[[company]] <- NA_real_
}

# REMOVE TEMP OBJECTS
rm(company, diffcols, samecols, fctresults, divfct, datedf,
   price_cols, volume_cols)