矢量化多条件数据帧合并

Vectorising a mutlitple condition dataframe merge

我正在尝试合并两个数据框。原始数据框比要合并的数据框大得多,但是每一行只有 1 个可能的匹配项。通过匹配类型(一个因素)和级别找到该行。级别是一个整数,将被放入几个桶之一(示例只有两个)

我当前的方法有效,但使用 sapply 并且对于大量行来说速度很慢。如何矢量化此操作?

set.seed(123)
sample <- 100
data <- data.frame(type= sample(LETTERS[1:4], sample, replace=TRUE), level =round(runif(sample, 1,sample)), value = round(runif(sample, 200,1000)))

data2 <- data.frame(type= rep(LETTERS[1:4],2), lower= c(rep(1,4), rep(51,4)), upper = c(rep(50,4), rep(sample,4)), cost1 = runif(8, 0,1), cost2 = runif(8, 0,1),cost3 = runif(8, 0,1))
data2[,4:6] <- data2[,4:6]/rowSums(data2[,4:6]) #turns the variables in to percentages, not necessary on real data

x <- unlist(sapply(1:sample, function(n) which(ll <-data$type[n]==data2$type & data$level[n] >= data2$lower & data$level[n] <= data2$upper)))

data3 <- cbind(data, percentage= data2[x, -c(1:3)])

如果我理解你设置的匹配问题,下面的代码似乎通过按类型划分数据然后使用 cut 找到合适的桶来加快速度。我认为它将容纳更多的上下值对,但没有仔细检查。

library(plyr) 
percents <- function(value, cost) {
   cost <- cost[cost[,1]== value[1,1],]
   cost <- cost[order(cost[,2]),]
   ints <- cut(value[,2], breaks=c(t(cost[,2:3])), labels=FALSE, include.lowest=TRUE )
   cbind(value,percentage=cost[ceiling(ints/2),-(1:3)])
}
data4 <- rbind.fill(mapply(percents, value=split(data, data$type), cost=list(data2), SIMPLIFY=FALSE)  )

设置

sample <- 10000

给出以下执行时间比较

microbenchmark({x <- unlist(sapply(1:sample, function(n) which(ll <-data$type[n]==data2$type & data$level[n] >= data2$lower & data$level[n] <= data2$upper)));
             data3 <- cbind(data, percentage= data2[x, -c(1:3)])} ,
             data4 <- rbind.fill(mapply(percents, value=split(data, data$type), cost=list(data2), SIMPLIFY=FALSE)  ),
            times=10)

Unit: milliseconds
                                                                                                                                                                                                                            expr
{     x <- unlist(sapply(1:sample, function(n) which(ll <- data$type[n] ==          data2$type & data$level[n] >= data2$lower & data$level[n] <=          data2$upper)))     data3 <- cbind(data, percentage = data2[x, -c(1:3)]) }
                                                                                                                data4 <- rbind.fill(mapply(percents, value = split(data, data$type),      cost = list(data2), SIMPLIFY = FALSE))

       min         lq       mean     median        uq        max neval
1198.18269 1214.10560 1225.85117 1226.79838 1234.2671 1258.63122    10
  20.81022   20.93255   21.50001   21.24237   22.1305   22.65291    10

第一个数字代表您问题中显示的代码,第二个数字代表我 post 中的代码。对于这种情况,新代码似乎快了将近 60 倍。

编辑

要使用 rbind_all 并避免 mapply,请使用以下内容:

microbenchmark({x <- unlist(sapply(1:sample, function(n) which(ll <-data$type[n]==data2$type & data$level[n] >= data2$lower & data$level[n] <= data2$upper)));
            data3 <- cbind(data, percentage= data2[x, -c(1:3)])} ,
           data4 <- rbind_all(lapply(split(data, data$type), percents, cost=data2  )),
           times=10)

执行时间略有缩短

       min         lq       mean     median         uq        max neval
1271.57023 1289.17614 1297.68572 1301.84540 1308.31476 1313.56822    10
  18.33819   18.57373   23.28578   19.53742   19.95132   58.96143    10

编辑 2

修改为仅将 data2$lower 值用于设置间隔

percents <- function(value, cost) {
  cost <- cost[cost[,"type"] == value[1,"type"],]
  cost <- cost[order(cost[,"lower"]),]
  ints <- cut(value[,"value"], breaks= c(cost[,"lower"], max(cost[,"upper"])), labels=FALSE, right=FALSE, include.highest=TRUE )
  cbind(value,percentage=cost[ints,-(1:3)])
}

一起使用
data4 <- rbind_all(lapply(split(data, data$type), percents, cost=data2  ))