基于其他两个数据框创建面板数据框
Creating a panel data frame based on two other dataframes
我有两个数据框:
ANNUALSALARY <- structure(list(FIRM = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), SLY_ADMIN = c(0.1, 0.2, 0.3), SLY_MKT = c(0.5, 0.003,0.3), SLY_FIN = c(0.11, 0.12, 0.03)), .Names = c("FIRM", "SLY_ADMIN", "SLY_MKT", "SLY_FIN"), row.names = c(NA, -3L), class = "data.frame")
和:
WEEKLYPRODUCTIVITY <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR_ADMIN = c(1, 5, 4, 3, 2, 1, 4, 2, 4, 2, 3, 1, 4, 5, 5), Z_ADMIN = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6), PR_MKT = c(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2), Z_MKT = c(9, 8, 7, 6, 5, 4, 3, 2, 1, 9, 8, 7, 6, 5, 4), PR_FIN = c(5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1), Z_FIN = c(1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR_ADMIN", "Z_ADMIN", "PR_MKT", "Z_MKT", "PR_FIN", "Z_FIN"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"))
我有兴趣创建一个数据框,每个 FIRM
从 SLY_ADMIN
、SLY_MKT
和 SLY_FIN
中取最小值。然后从PR_ADMIN
、PR_MKT
、PR_FIN
和Z_ADMIN
、Z_MKT
、Z_FIN
中取出相应的值。例如如果 SLY_MKT
是公司 A 的最小值,那么它 returns PR_MKT
和 Z_MKT
5 周。面板数据框看起来像这样(我手动创建的):
REQUIRED <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR = c(1, 5, 4, 3, 2, 5, 0, 1, 2, 3, 5, 4, 3, 2, 1), MIN_SLY = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.003, 0.003, 0.003, 0.003, 0.003, 0.03, 0.03, 0.03, 0.03, 0.03), SLY_DEPT = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("SLY_ADMIN", "SLY_FIN", "SLY_MKT"), class = "factor"), Z = c(1, 2, 3, 4, 5, 4, 3, 2, 1, 9, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR", "MIN_SLY", "SLY_DEPT", "Z"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"))
请帮忙。谢谢
我们可以使用data.table
。使用 max.col
获取 "ANNUALSALARY" 中数字列的最小值的索引。然后,我们将 'data.frame' 转换为 'data.table' 和 melt
它从 'wide' 转换为 'long' 格式,得到 "MIN_SLY" 和 "S
library(data.table)
i1 <- max.col(-1*ANNUALSALARY[-1])
dN <- melt(setDT(ANNUALSALARY), id.var = "FIRM", value.name = "MIN_SLY",
variable.name = "SLY_DEPT")[ , .SD[which.min(MIN_SLY)], by = FIRM]
setDT(WEEKLYPRODUCTIVITY)
或者我们可以使用 'i1'
创建 'data.table' 而不是 melt
ing
dN <- data.table(FIRM= ANNUALSALARY$FIRM,
MIN_SLY=as.data.frame(ANNUALSALARY)[-1][cbind(1:nrow(ANNUALSALARY), i1)],
SLY_DEPT = names(ANNUALSALARY)[-1][i1])
然后,我们根据列名中的 patterns
将 join
由 'WEEKLYPRODUCTIVITY' 和 melt
转换为 'long' 格式。我们order
by 'FIRM'、'variable'、'WEEKS',根据"WEEKS"值创建分组变量('gr1'),按[=分组44=].
dN2 <- melt(dN[WEEKLYPRODUCTIVITY, on = "FIRM"], measure = patterns("^PR", "^Z"),
value.name = c("PR", "Z"))[order(FIRM, variable, WEEKS)
][, gr1 := cumsum(WEEKS==1), FIRM][]
最后,我们加入使用 'i1'、on
"FIRM" 创建的 data.table',对 'gr1' 等于 [= 的行进行子集化40=],以及 select 感兴趣的列。
res <- data.table(FIRM= ANNUALSALARY$FIRM, i1)[dN2, on = "FIRM"
][gr1==i1][,names(REQUIRED), with = FALSE]
all.equal(as.data.frame(res), REQUIRED, check.attributes=FALSE)
#[1] TRUE
res
# FIRM WEEKS PR MIN_SLY SLY_DEPT Z
# 1: A 1 1 0.100 SLY_ADMIN 1
# 2: A 2 5 0.100 SLY_ADMIN 2
# 3: A 3 4 0.100 SLY_ADMIN 3
# 4: A 4 3 0.100 SLY_ADMIN 4
# 5: A 5 2 0.100 SLY_ADMIN 5
# 6: B 1 5 0.003 SLY_MKT 4
# 7: B 2 0 0.003 SLY_MKT 3
# 8: B 3 1 0.003 SLY_MKT 2
# 9: B 4 2 0.003 SLY_MKT 1
#10: B 5 3 0.003 SLY_MKT 9
#11: C 1 5 0.030 SLY_FIN 1
#12: C 2 4 0.030 SLY_FIN 2
#13: C 3 3 0.030 SLY_FIN 3
#14: C 4 2 0.030 SLY_FIN 4
#15: C 5 1 0.030 SLY_FIN 5
一种不同的方法,但也使用了 data.table
包:
library(data.table)
# convert the dataframes to datatables (which is an enhanced form of dataframe)
setDT(ANNUALSALARY)
setDT(WEEKLYPRODUCTIVITY)
# join them on 'FIRM'
res <- WEEKLYPRODUCTIVITY[ANNUALSALARY, on = 'FIRM']
# create a convenience vector with the columnnames starting with 'SLY_
sly.cols <- grep('^SLY_', names(res), value = TRUE)
# create the 'MIN_SLY' & 'SLY_DEPT' columns
res[, `:=` (MIN_SLY = min(.SD),
SLY_DEPT = sly.cols[which.min(.SD)]),
by = 1:nrow(res), .SDcols = sly.cols][]
# melt it in log format and create the 'PR' & 'Z' column
res2 <- melt(res, id = c('FIRM','WEEKS','MIN_SLY','SLY_DEPT'),
measure.vars = patterns('^PR_','^Z_'),
value.name = c('PR','Z'))[, variable := c('ADMIN','MKT','FIN')[variable]
][, `:=` (PR = PR[sub('^SLY_','',SLY_DEPT) == variable],
Z = Z[sub('^SLY_','',SLY_DEPT) == variable]),
by = .(FIRM,WEEKS)
][, variable := NULL]
# removing the duplicates
res2 <- res2[!duplicated(res2)]
这导致:
> res2
FIRM WEEKS MIN_SLY SLY_DEPT PR Z
1: A 1 0.100 SLY_ADMIN 1 1
2: A 2 0.100 SLY_ADMIN 5 2
3: A 3 0.100 SLY_ADMIN 4 3
4: A 4 0.100 SLY_ADMIN 3 4
5: A 5 0.100 SLY_ADMIN 2 5
6: B 1 0.003 SLY_MKT 5 4
7: B 2 0.003 SLY_MKT 0 3
8: B 3 0.003 SLY_MKT 1 2
9: B 4 0.003 SLY_MKT 2 1
10: B 5 0.003 SLY_MKT 3 9
11: C 1 0.030 SLY_FIN 5 1
12: C 2 0.030 SLY_FIN 4 2
13: C 3 0.030 SLY_FIN 3 3
14: C 4 0.030 SLY_FIN 2 4
15: C 5 0.030 SLY_FIN 1 5
这是一个棘手的问题!我提出了一个围绕 max.col()
、merge()
和索引矩阵构建的基础 R 解决方案。
请注意,为了简洁起见,我使用了变量名称 sal
和 prod
。
sufs <- c('ADMIN','MKT','FIN');
slys <- paste0('SLY_',sufs);
mins <- max.col(-sal[slys]);
res <- merge(prod[,c('FIRM','WEEKS')],cbind(sal[,'FIRM',drop=F],SLY_DEPT=slys[mins],MIN_SLY=sal[slys][cbind(seq_len(nrow(sal)),mins)]));
res.sufs <- sub('.*_','',res$SLY_DEPT);
for (pre in c('PR','Z')) { pre.cns <- paste0(pre,'_',sufs); res[[pre]] <- prod[pre.cns][cbind(seq_len(nrow(prod)),match(paste0(pre,'_',res.sufs),pre.cns))]; };
res;
## FIRM WEEKS SLY_DEPT MIN_SLY PR Z
## 1 A 1 SLY_ADMIN 0.100 1 1
## 2 A 2 SLY_ADMIN 0.100 5 2
## 3 A 3 SLY_ADMIN 0.100 4 3
## 4 A 4 SLY_ADMIN 0.100 3 4
## 5 A 5 SLY_ADMIN 0.100 2 5
## 6 B 1 SLY_MKT 0.003 5 4
## 7 B 2 SLY_MKT 0.003 0 3
## 8 B 3 SLY_MKT 0.003 1 2
## 9 B 4 SLY_MKT 0.003 2 1
## 10 B 5 SLY_MKT 0.003 3 9
## 11 C 1 SLY_FIN 0.030 5 1
## 12 C 2 SLY_FIN 0.030 4 2
## 13 C 3 SLY_FIN 0.030 3 3
## 14 C 4 SLY_FIN 0.030 2 4
## 15 C 5 SLY_FIN 0.030 1 5
基准测试
## libraries
library(data.table);
library(microbenchmark);
## define inputs, including data.table instances for akrun and maximus solutions
sal <- structure(list(FIRM = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), SLY_ADMIN = c(0.1, 0.2, 0.3), SLY_MKT = c(0.5, 0.003,0.3), SLY_FIN = c(0.11, 0.12, 0.03)), .Names = c("FIRM", "SLY_ADMIN", "SLY_MKT", "SLY_FIN"), row.names = c(NA, -3L), class = "data.frame");
prod <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR_ADMIN = c(1, 5, 4, 3, 2, 1, 4, 2, 4, 2, 3, 1, 4, 5, 5), Z_ADMIN = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6), PR_MKT = c(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2), Z_MKT = c(9, 8, 7, 6, 5, 4, 3, 2, 1, 9, 8, 7, 6, 5, 4), PR_FIN = c(5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1), Z_FIN = c(1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR_ADMIN", "Z_ADMIN", "PR_MKT", "Z_MKT", "PR_FIN", "Z_FIN"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"));
sal.dt <- as.data.table(sal);
prod.dt <- as.data.table(prod);
## solutions
bgoldst <- function(sal,prod) { sufs <- c('ADMIN','MKT','FIN'); slys <- paste0('SLY_',sufs); mins <- max.col(-sal[slys]); res <- merge(prod[,c('FIRM','WEEKS')],cbind(sal[,'FIRM',drop=F],SLY_DEPT=slys[mins],MIN_SLY=sal[slys][cbind(seq_len(nrow(sal)),mins)])); res.sufs <- sub('.*_','',res$SLY_DEPT); for (pre in c('PR','Z')) { pre.cns <- paste0(pre,'_',sufs); res[[pre]] <- prod[pre.cns][cbind(seq_len(nrow(prod)),match(paste0(pre,'_',res.sufs),pre.cns))]; }; res; };
akrun <- function(ANNUALSALARY,WEEKLYPRODUCTIVITY) { i1 <- max.col(-1*ANNUALSALARY[,-1,with=F]); dN <- data.table(FIRM= ANNUALSALARY$FIRM, MIN_SLY=as.data.frame(ANNUALSALARY)[-1][cbind(1:nrow(ANNUALSALARY), i1)], SLY_DEPT = names(ANNUALSALARY)[-1][i1]); dN2 <- melt(dN[WEEKLYPRODUCTIVITY, on = "FIRM"], measure = patterns("^PR", "^Z"), value.name = c("PR", "Z"))[order(FIRM, variable, WEEKS)][, gr1 := cumsum(WEEKS==1), FIRM][]; res <- data.table(FIRM= ANNUALSALARY$FIRM, i1)[dN2, on = "FIRM"][gr1==i1]; res[,!names(res)%in%c('i1','variable','gr1'),with=F]; };
maximus <- function(ANNUALSALARY,WEEKLYPRODUCTIVITY) { res <- WEEKLYPRODUCTIVITY[ANNUALSALARY, on = 'FIRM']; sly.cols <- grep('^SLY_', names(res), value = TRUE); res[, `:=` (MIN_SLY = min(.SD), SLY_DEPT = sly.cols[which.min(.SD)]), by = 1:nrow(res), .SDcols = sly.cols][]; res2 <- melt(res, id = c('FIRM','WEEKS','MIN_SLY','SLY_DEPT'), measure.vars = patterns('^PR_','^Z_'), value.name = c('PR','Z'))[, variable := c('ADMIN','MKT','FIN')[variable]][, `:=` (PR = PR[sub('^SLY_','',SLY_DEPT) == variable], Z = Z[sub('^SLY_','',SLY_DEPT) == variable]), by = .(FIRM,WEEKS)][, variable := NULL]; res2 <- res2[!duplicated(res2)]; };
## proofs of equivalence
ex <- bgoldst(sal,prod); co <- names(ex);
identical(ex,transform(as.data.frame(akrun(sal.dt,prod.dt))[co],SLY_DEPT=factor(SLY_DEPT)));
## [1] TRUE
identical(ex,transform(as.data.frame(maximus(sal.dt,prod.dt))[co],SLY_DEPT=factor(SLY_DEPT)));
## [1] TRUE
## benchmark
microbenchmark(bgoldst(sal,prod),akrun(sal.dt,prod.dt),maximus(sal.dt,prod.dt));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(sal, prod) 1.639193 1.730070 1.883285 1.807047 1.881031 3.230917 100
## akrun(sal.dt, prod.dt) 6.392125 6.666251 7.744077 6.901033 7.230752 53.621663 100
## maximus(sal.dt, prod.dt) 5.002254 5.229979 5.853681 5.423492 6.034609 12.182544 100
我有两个数据框:
ANNUALSALARY <- structure(list(FIRM = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), SLY_ADMIN = c(0.1, 0.2, 0.3), SLY_MKT = c(0.5, 0.003,0.3), SLY_FIN = c(0.11, 0.12, 0.03)), .Names = c("FIRM", "SLY_ADMIN", "SLY_MKT", "SLY_FIN"), row.names = c(NA, -3L), class = "data.frame")
和:
WEEKLYPRODUCTIVITY <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR_ADMIN = c(1, 5, 4, 3, 2, 1, 4, 2, 4, 2, 3, 1, 4, 5, 5), Z_ADMIN = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6), PR_MKT = c(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2), Z_MKT = c(9, 8, 7, 6, 5, 4, 3, 2, 1, 9, 8, 7, 6, 5, 4), PR_FIN = c(5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1), Z_FIN = c(1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR_ADMIN", "Z_ADMIN", "PR_MKT", "Z_MKT", "PR_FIN", "Z_FIN"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"))
我有兴趣创建一个数据框,每个 FIRM
从 SLY_ADMIN
、SLY_MKT
和 SLY_FIN
中取最小值。然后从PR_ADMIN
、PR_MKT
、PR_FIN
和Z_ADMIN
、Z_MKT
、Z_FIN
中取出相应的值。例如如果 SLY_MKT
是公司 A 的最小值,那么它 returns PR_MKT
和 Z_MKT
5 周。面板数据框看起来像这样(我手动创建的):
REQUIRED <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR = c(1, 5, 4, 3, 2, 5, 0, 1, 2, 3, 5, 4, 3, 2, 1), MIN_SLY = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.003, 0.003, 0.003, 0.003, 0.003, 0.03, 0.03, 0.03, 0.03, 0.03), SLY_DEPT = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("SLY_ADMIN", "SLY_FIN", "SLY_MKT"), class = "factor"), Z = c(1, 2, 3, 4, 5, 4, 3, 2, 1, 9, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR", "MIN_SLY", "SLY_DEPT", "Z"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"))
请帮忙。谢谢
我们可以使用data.table
。使用 max.col
获取 "ANNUALSALARY" 中数字列的最小值的索引。然后,我们将 'data.frame' 转换为 'data.table' 和 melt
它从 'wide' 转换为 'long' 格式,得到 "MIN_SLY" 和 "S
library(data.table)
i1 <- max.col(-1*ANNUALSALARY[-1])
dN <- melt(setDT(ANNUALSALARY), id.var = "FIRM", value.name = "MIN_SLY",
variable.name = "SLY_DEPT")[ , .SD[which.min(MIN_SLY)], by = FIRM]
setDT(WEEKLYPRODUCTIVITY)
或者我们可以使用 'i1'
创建 'data.table' 而不是melt
ing
dN <- data.table(FIRM= ANNUALSALARY$FIRM,
MIN_SLY=as.data.frame(ANNUALSALARY)[-1][cbind(1:nrow(ANNUALSALARY), i1)],
SLY_DEPT = names(ANNUALSALARY)[-1][i1])
然后,我们根据列名中的 patterns
将 join
由 'WEEKLYPRODUCTIVITY' 和 melt
转换为 'long' 格式。我们order
by 'FIRM'、'variable'、'WEEKS',根据"WEEKS"值创建分组变量('gr1'),按[=分组44=].
dN2 <- melt(dN[WEEKLYPRODUCTIVITY, on = "FIRM"], measure = patterns("^PR", "^Z"),
value.name = c("PR", "Z"))[order(FIRM, variable, WEEKS)
][, gr1 := cumsum(WEEKS==1), FIRM][]
最后,我们加入使用 'i1'、on
"FIRM" 创建的 data.table',对 'gr1' 等于 [= 的行进行子集化40=],以及 select 感兴趣的列。
res <- data.table(FIRM= ANNUALSALARY$FIRM, i1)[dN2, on = "FIRM"
][gr1==i1][,names(REQUIRED), with = FALSE]
all.equal(as.data.frame(res), REQUIRED, check.attributes=FALSE)
#[1] TRUE
res
# FIRM WEEKS PR MIN_SLY SLY_DEPT Z
# 1: A 1 1 0.100 SLY_ADMIN 1
# 2: A 2 5 0.100 SLY_ADMIN 2
# 3: A 3 4 0.100 SLY_ADMIN 3
# 4: A 4 3 0.100 SLY_ADMIN 4
# 5: A 5 2 0.100 SLY_ADMIN 5
# 6: B 1 5 0.003 SLY_MKT 4
# 7: B 2 0 0.003 SLY_MKT 3
# 8: B 3 1 0.003 SLY_MKT 2
# 9: B 4 2 0.003 SLY_MKT 1
#10: B 5 3 0.003 SLY_MKT 9
#11: C 1 5 0.030 SLY_FIN 1
#12: C 2 4 0.030 SLY_FIN 2
#13: C 3 3 0.030 SLY_FIN 3
#14: C 4 2 0.030 SLY_FIN 4
#15: C 5 1 0.030 SLY_FIN 5
一种不同的方法,但也使用了 data.table
包:
library(data.table)
# convert the dataframes to datatables (which is an enhanced form of dataframe)
setDT(ANNUALSALARY)
setDT(WEEKLYPRODUCTIVITY)
# join them on 'FIRM'
res <- WEEKLYPRODUCTIVITY[ANNUALSALARY, on = 'FIRM']
# create a convenience vector with the columnnames starting with 'SLY_
sly.cols <- grep('^SLY_', names(res), value = TRUE)
# create the 'MIN_SLY' & 'SLY_DEPT' columns
res[, `:=` (MIN_SLY = min(.SD),
SLY_DEPT = sly.cols[which.min(.SD)]),
by = 1:nrow(res), .SDcols = sly.cols][]
# melt it in log format and create the 'PR' & 'Z' column
res2 <- melt(res, id = c('FIRM','WEEKS','MIN_SLY','SLY_DEPT'),
measure.vars = patterns('^PR_','^Z_'),
value.name = c('PR','Z'))[, variable := c('ADMIN','MKT','FIN')[variable]
][, `:=` (PR = PR[sub('^SLY_','',SLY_DEPT) == variable],
Z = Z[sub('^SLY_','',SLY_DEPT) == variable]),
by = .(FIRM,WEEKS)
][, variable := NULL]
# removing the duplicates
res2 <- res2[!duplicated(res2)]
这导致:
> res2
FIRM WEEKS MIN_SLY SLY_DEPT PR Z
1: A 1 0.100 SLY_ADMIN 1 1
2: A 2 0.100 SLY_ADMIN 5 2
3: A 3 0.100 SLY_ADMIN 4 3
4: A 4 0.100 SLY_ADMIN 3 4
5: A 5 0.100 SLY_ADMIN 2 5
6: B 1 0.003 SLY_MKT 5 4
7: B 2 0.003 SLY_MKT 0 3
8: B 3 0.003 SLY_MKT 1 2
9: B 4 0.003 SLY_MKT 2 1
10: B 5 0.003 SLY_MKT 3 9
11: C 1 0.030 SLY_FIN 5 1
12: C 2 0.030 SLY_FIN 4 2
13: C 3 0.030 SLY_FIN 3 3
14: C 4 0.030 SLY_FIN 2 4
15: C 5 0.030 SLY_FIN 1 5
这是一个棘手的问题!我提出了一个围绕 max.col()
、merge()
和索引矩阵构建的基础 R 解决方案。
请注意,为了简洁起见,我使用了变量名称 sal
和 prod
。
sufs <- c('ADMIN','MKT','FIN');
slys <- paste0('SLY_',sufs);
mins <- max.col(-sal[slys]);
res <- merge(prod[,c('FIRM','WEEKS')],cbind(sal[,'FIRM',drop=F],SLY_DEPT=slys[mins],MIN_SLY=sal[slys][cbind(seq_len(nrow(sal)),mins)]));
res.sufs <- sub('.*_','',res$SLY_DEPT);
for (pre in c('PR','Z')) { pre.cns <- paste0(pre,'_',sufs); res[[pre]] <- prod[pre.cns][cbind(seq_len(nrow(prod)),match(paste0(pre,'_',res.sufs),pre.cns))]; };
res;
## FIRM WEEKS SLY_DEPT MIN_SLY PR Z
## 1 A 1 SLY_ADMIN 0.100 1 1
## 2 A 2 SLY_ADMIN 0.100 5 2
## 3 A 3 SLY_ADMIN 0.100 4 3
## 4 A 4 SLY_ADMIN 0.100 3 4
## 5 A 5 SLY_ADMIN 0.100 2 5
## 6 B 1 SLY_MKT 0.003 5 4
## 7 B 2 SLY_MKT 0.003 0 3
## 8 B 3 SLY_MKT 0.003 1 2
## 9 B 4 SLY_MKT 0.003 2 1
## 10 B 5 SLY_MKT 0.003 3 9
## 11 C 1 SLY_FIN 0.030 5 1
## 12 C 2 SLY_FIN 0.030 4 2
## 13 C 3 SLY_FIN 0.030 3 3
## 14 C 4 SLY_FIN 0.030 2 4
## 15 C 5 SLY_FIN 0.030 1 5
基准测试
## libraries
library(data.table);
library(microbenchmark);
## define inputs, including data.table instances for akrun and maximus solutions
sal <- structure(list(FIRM = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), SLY_ADMIN = c(0.1, 0.2, 0.3), SLY_MKT = c(0.5, 0.003,0.3), SLY_FIN = c(0.11, 0.12, 0.03)), .Names = c("FIRM", "SLY_ADMIN", "SLY_MKT", "SLY_FIN"), row.names = c(NA, -3L), class = "data.frame");
prod <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR_ADMIN = c(1, 5, 4, 3, 2, 1, 4, 2, 4, 2, 3, 1, 4, 5, 5), Z_ADMIN = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6), PR_MKT = c(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2), Z_MKT = c(9, 8, 7, 6, 5, 4, 3, 2, 1, 9, 8, 7, 6, 5, 4), PR_FIN = c(5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1), Z_FIN = c(1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR_ADMIN", "Z_ADMIN", "PR_MKT", "Z_MKT", "PR_FIN", "Z_FIN"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"));
sal.dt <- as.data.table(sal);
prod.dt <- as.data.table(prod);
## solutions
bgoldst <- function(sal,prod) { sufs <- c('ADMIN','MKT','FIN'); slys <- paste0('SLY_',sufs); mins <- max.col(-sal[slys]); res <- merge(prod[,c('FIRM','WEEKS')],cbind(sal[,'FIRM',drop=F],SLY_DEPT=slys[mins],MIN_SLY=sal[slys][cbind(seq_len(nrow(sal)),mins)])); res.sufs <- sub('.*_','',res$SLY_DEPT); for (pre in c('PR','Z')) { pre.cns <- paste0(pre,'_',sufs); res[[pre]] <- prod[pre.cns][cbind(seq_len(nrow(prod)),match(paste0(pre,'_',res.sufs),pre.cns))]; }; res; };
akrun <- function(ANNUALSALARY,WEEKLYPRODUCTIVITY) { i1 <- max.col(-1*ANNUALSALARY[,-1,with=F]); dN <- data.table(FIRM= ANNUALSALARY$FIRM, MIN_SLY=as.data.frame(ANNUALSALARY)[-1][cbind(1:nrow(ANNUALSALARY), i1)], SLY_DEPT = names(ANNUALSALARY)[-1][i1]); dN2 <- melt(dN[WEEKLYPRODUCTIVITY, on = "FIRM"], measure = patterns("^PR", "^Z"), value.name = c("PR", "Z"))[order(FIRM, variable, WEEKS)][, gr1 := cumsum(WEEKS==1), FIRM][]; res <- data.table(FIRM= ANNUALSALARY$FIRM, i1)[dN2, on = "FIRM"][gr1==i1]; res[,!names(res)%in%c('i1','variable','gr1'),with=F]; };
maximus <- function(ANNUALSALARY,WEEKLYPRODUCTIVITY) { res <- WEEKLYPRODUCTIVITY[ANNUALSALARY, on = 'FIRM']; sly.cols <- grep('^SLY_', names(res), value = TRUE); res[, `:=` (MIN_SLY = min(.SD), SLY_DEPT = sly.cols[which.min(.SD)]), by = 1:nrow(res), .SDcols = sly.cols][]; res2 <- melt(res, id = c('FIRM','WEEKS','MIN_SLY','SLY_DEPT'), measure.vars = patterns('^PR_','^Z_'), value.name = c('PR','Z'))[, variable := c('ADMIN','MKT','FIN')[variable]][, `:=` (PR = PR[sub('^SLY_','',SLY_DEPT) == variable], Z = Z[sub('^SLY_','',SLY_DEPT) == variable]), by = .(FIRM,WEEKS)][, variable := NULL]; res2 <- res2[!duplicated(res2)]; };
## proofs of equivalence
ex <- bgoldst(sal,prod); co <- names(ex);
identical(ex,transform(as.data.frame(akrun(sal.dt,prod.dt))[co],SLY_DEPT=factor(SLY_DEPT)));
## [1] TRUE
identical(ex,transform(as.data.frame(maximus(sal.dt,prod.dt))[co],SLY_DEPT=factor(SLY_DEPT)));
## [1] TRUE
## benchmark
microbenchmark(bgoldst(sal,prod),akrun(sal.dt,prod.dt),maximus(sal.dt,prod.dt));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(sal, prod) 1.639193 1.730070 1.883285 1.807047 1.881031 3.230917 100
## akrun(sal.dt, prod.dt) 6.392125 6.666251 7.744077 6.901033 7.230752 53.621663 100
## maximus(sal.dt, prod.dt) 5.002254 5.229979 5.853681 5.423492 6.034609 12.182544 100