R data.table 按行快速求和随机样本
R data.table fast sum of random samples by row
我有:
# Load data.table. library() stops with an error when the package is missing,
# whereas require() only returns FALSE and lets the script fail later on.
library(data.table)
# Main data: one row per ID; Num_Times is later used as the per-row sample size.
dataDT <- data.table(ID = 1:4, Num_Times = c(7, 9, 10, 13))
dataDT # the main data
ID Num_Times
1: 1 7
2: 2 9
3: 3 10
4: 4 13
# Candidate values (val) paired with the probability of drawing each (prob).
probabilityDT <- data.table(
  val = 1:3,
  prob = c(0.5, 0.3, 0.2)
)
probabilityDT # the probability matrix
val prob
1: 1 0.5
2: 2 0.3
3: 3 0.2
我想做以下事情:
对于 dataDT 的每一行,按 probabilityDT 中的概率(prob)有放回地抽取 Num_Times 个 val 值,并计算这些样本的总和。
# For every row of dataDT, draw Num_Times values (with replacement) from
# probabilityDT[["val"]] using the weights in probabilityDT[["prob"]], and
# store the per-row total in a new column sample_sum.
set.seed(999) # fixed seed so the sampled sums are reproducible
candidate_vals <- probabilityDT[["val"]]
candidate_probs <- probabilityDT[["prob"]]
# vapply() allocates the result once instead of re-copying a vector grown with
# c() on every iteration, and iterating over the column itself avoids
# 1:nrow(), which would yield c(1, 0) for an empty table. Draws are made in
# row order, so the random stream is consumed exactly as in a row-by-row loop.
# The result is declared as numeric(1), so integer sums are stored as doubles.
valTemp <- vapply(
  dataDT[["Num_Times"]],
  function(num_times) {
    draws <- sample(
      x = candidate_vals,
      size = num_times,
      replace = TRUE,
      prob = candidate_probs
    )
    sum(draws)
  },
  numeric(1)
)
# Add the sums to dataDT by reference.
dataDT[, sample_sum := valTemp]
dataDT
ID Num_Times sample_sum
1: 1 7 12
2: 2 9 14
3: 3 10 20
4: 4 13 25
怎样做更有效率?我有 ~500k 行。
这个操作完全可以向量化吗?
看看这是否更快:
# Vectorized variant: draw every sample for every row in a single call, label
# each draw with the ID of the row it belongs to, then sum per ID.
set.seed(999)
sample_all <- sample(
  x = probabilityDT[["val"]],
  size = sum(dataDT[["Num_Times"]]), # total number of draws across all rows
  replace = TRUE,
  prob = probabilityDT[["prob"]]
)
# rep() repeats each ID Num_Times times, so consecutive draws map to rows in
# order. An ID with Num_Times of 0 gets no entries here and therefore no row in
# res; rows that share an ID are summed together as one group.
res <- data.table(
  sample_all,
  ID = rep(dataDT[["ID"]], dataDT[["Num_Times"]])
)[, .(sample_sum = sum(sample_all)), by = "ID"]
# Update join: copy each ID's total into dataDT by reference.
dataDT[res, sample_sum := i.sample_sum, on = "ID"]
# ID Num_Times sample_sum
#1: 1 7 12
#2: 2 9 14
#3: 3 10 20
#4: 4 13 25
我有:
# Load data.table. library() stops with an error when the package is missing,
# whereas require() only returns FALSE and lets the script fail later on.
library(data.table)
# Main data: one row per ID; Num_Times is later used as the per-row sample size.
dataDT <- data.table(ID = 1:4, Num_Times = c(7, 9, 10, 13))
dataDT # the main data
ID Num_Times
1: 1 7
2: 2 9
3: 3 10
4: 4 13
# Candidate values (val) paired with the probability of drawing each (prob).
probabilityDT <- data.table(
  val = 1:3,
  prob = c(0.5, 0.3, 0.2)
)
probabilityDT # the probability matrix
val prob
1: 1 0.5
2: 2 0.3
3: 3 0.2
我想做以下事情:
对于 dataDT 的每一行,按 probabilityDT 中的概率(prob)有放回地抽取 Num_Times 个 val 值,并计算这些样本的总和。
# For every row of dataDT, draw Num_Times values (with replacement) from
# probabilityDT[["val"]] using the weights in probabilityDT[["prob"]], and
# store the per-row total in a new column sample_sum.
set.seed(999) # fixed seed so the sampled sums are reproducible
candidate_vals <- probabilityDT[["val"]]
candidate_probs <- probabilityDT[["prob"]]
# vapply() allocates the result once instead of re-copying a vector grown with
# c() on every iteration, and iterating over the column itself avoids
# 1:nrow(), which would yield c(1, 0) for an empty table. Draws are made in
# row order, so the random stream is consumed exactly as in a row-by-row loop.
# The result is declared as numeric(1), so integer sums are stored as doubles.
valTemp <- vapply(
  dataDT[["Num_Times"]],
  function(num_times) {
    draws <- sample(
      x = candidate_vals,
      size = num_times,
      replace = TRUE,
      prob = candidate_probs
    )
    sum(draws)
  },
  numeric(1)
)
# Add the sums to dataDT by reference.
dataDT[, sample_sum := valTemp]
dataDT
ID Num_Times sample_sum
1: 1 7 12
2: 2 9 14
3: 3 10 20
4: 4 13 25
怎样做更有效率?我有 ~500k 行。 这个操作完全可以向量化吗?
看看这是否更快:
# Vectorized variant: draw every sample for every row in a single call, label
# each draw with the ID of the row it belongs to, then sum per ID.
set.seed(999)
sample_all <- sample(
  x = probabilityDT[["val"]],
  size = sum(dataDT[["Num_Times"]]), # total number of draws across all rows
  replace = TRUE,
  prob = probabilityDT[["prob"]]
)
# rep() repeats each ID Num_Times times, so consecutive draws map to rows in
# order. An ID with Num_Times of 0 gets no entries here and therefore no row in
# res; rows that share an ID are summed together as one group.
res <- data.table(
  sample_all,
  ID = rep(dataDT[["ID"]], dataDT[["Num_Times"]])
)[, .(sample_sum = sum(sample_all)), by = "ID"]
# Update join: copy each ID's total into dataDT by reference.
dataDT[res, sample_sum := i.sample_sum, on = "ID"]
# ID Num_Times sample_sum
#1: 1 7 12
#2: 2 9 14
#3: 3 10 20
#4: 4 13 25