Dataframe 对于超级计算机来说太大了
Dataframe is too big for supercomputer
我正在尝试创建一个捐赠者与接受者(候选人)的矩阵,其中每个单元格填入对应(候选人,捐赠者)组合的捐赠总额,没有捐赠记录的组合则保留为 NA。
它适用于小型数据集(见下面的玩具示例),但是当我切换到全国数据集(300 万条记录)时出现了几个问题:除了非常缓慢之外,创建 fill 数据框时还耗尽了(超级)计算机的内存,我收到错误 "Error: cannot allocate vector of size 1529.0 Gb"
我该如何解决这个问题?
非常感谢!
library(dplyr)
library(tidyr)
# Fixed: this was misspelled `libray`, which aborts the script with
# "could not find function" before big.matrix() is ever reached.
library(bigmemory)

# Toy data: one row per individual donation.
candidate_id <- c("cand_1", "cand_1", "cand_1", "cand_2", "cand_3")
donor_id <- c("don_1", "don_1", "don_2", "don_2", "don_3")
donation <- c(1, 2, 3.5, 4, 10)
df <- data.frame(candidate_id, donor_id, donation)
colnames(df) <- c("candidate_id", "donor_id", "donation")

# Sum the donations of every (candidate, donor) pair, then add a row for each
# pair that never occurs.
# NOTE(review): grouping and completing on `df$...` expressions instead of the
# bare column names is kept as posted; it yields columns named after those
# expressions, which is why the result is de-duplicated and renamed below.
fill <- df %>%
  group_by(df$candidate_id, df$donor_id) %>%
  summarise(tot_donation = sum(as.numeric(donation))) %>%
  complete(df$candidate_id, df$donor_id)

# Keep the first three columns, drop repeated rows, and restore plain names.
fill <- unique(fill[, 1:3])
colnames(fill) <- c("candidate_id", "donor_id", "tot_donation")

# Matrix dimensions and labels: one row per candidate, one column per donor.
nrow <- length(unique(df$candidate_id))
ncol <- length(unique(df$donor_id))
row_names <- unique(fill$candidate_id)
col_names <- unique(fill$donor_id)

# Dense nrow x ncol matrix, every cell initialised to NA.
x <- big.matrix(nrow, ncol, init = NA, dimnames = list(row_names, col_names))

# Fill the matrix cell by cell. Each iteration compares every row of `fill`
# against the current (candidate, donor) labels, so the total work grows with
# nrow * ncol * nrow(fill).
for (i in seq_len(nrow)) {
  for (j in seq_len(ncol)) {
    x[i, j] <- fill[which(fill$candidate_id == row_names[i] &
                            fill$donor_id == col_names[j]), 3]
  }
}
你可以试试
library(reshape2)
# Cast the long `fill` table to wide form: one row per candidate_id, one
# column per donor_id, each cell aggregated from tot_donation with sum().
dcast(
  data = fill,
  formula = candidate_id ~ donor_id,
  fun.aggregate = sum,
  value.var = "tot_donation"
)
我不知道它是否会避免内存问题,但它可能比双 for
循环快得多。
我得赶去开会了,不过我有点好奇是否有办法用 outer
来做到这一点。
我看到你使用 unique
是因为你的输出中有重复的值。
根据一个相关的回答(原链接已丢失),
您应该尝试以下写法以避免产生重复:
# Build `fill` from the inside out:
#   1. group the donations by (candidate_id, donor_id),
#   2. sum the donations within each pair,
#   3. drop the grouping so complete() works on the whole table,
#   4. add a row for every candidate/donor combination that is absent.
fill <- complete(
  ungroup(
    summarise(
      group_by(df, candidate_id, donor_id),
      tot_donation = sum(donation)
    )
  ),
  candidate_id,
  donor_id
)
然后您可以尝试创建您想要的输出吗?
我认为 unique
可能会占用大量资源,
所以尽量避免调用它。
本杰明建议的 tidyr
版本应该是:
# Widen `fill`: donor_id values become the column names, tot_donation the cells.
spread(data = fill, key = donor_id, value = tot_donation)
编辑:顺便说一句,因为你用 sparse-matrix
标记了问题,
您确实可以利用稀疏性来发挥自己的优势:
library(Matrix)
library(dplyr)

# Toy data: one row per individual donation.
df <- data.frame(
  candidate_id = c("cand_1", "cand_1", "cand_1", "cand_2", "cand_3"),
  donor_id = c("don_1", "don_1", "don_2", "don_2", "don_3"),
  donation = c(1, 2, 3.5, 4, 10)
)

# Convert the ids to factors explicitly. Everything below uses the factor
# levels as matrix labels and the integer codes as positions; since R 4.0
# data.frame() keeps strings as character, so without this step nlevels()
# is 0 and levels() is NULL.
df$candidate_id <- factor(df$candidate_id)
df$donor_id <- factor(df$donor_id)

# Total donation for every (candidate, donor) pair that actually occurs.
summ <- df %>%
  group_by(candidate_id, donor_id) %>%
  summarise(tot_donation = sum(donation)) %>%
  ungroup()

num_candidates <- nlevels(df$candidate_id)
num_donors <- nlevels(df$donor_id)

# (row, column) position of each observed pair, taken from the factor codes.
indices <- cbind(
  candidate_id = as.integer(summ$candidate_id),
  donor_id = as.integer(summ$donor_id)
)

# Build the sparse matrix in a single call: only the observed pairs are
# stored, the absent ones stay implicit zeros (printed as ".").
smat <- sparseMatrix(
  i = indices[, "candidate_id"],
  j = indices[, "donor_id"],
  x = summ$tot_donation,
  dims = c(num_candidates, num_donors),
  dimnames = list(levels(df$candidate_id), levels(df$donor_id))
)
smat
3 x 3 sparse Matrix of class "dgCMatrix"
don_1 don_2 don_3
cand_1 3 3.5 .
cand_2 . 4.0 .
cand_3 . . 10
我正在尝试创建一个捐赠者与接受者(候选人)的矩阵,其中每个单元格填入对应(候选人,捐赠者)组合的捐赠总额,没有捐赠记录的组合则保留为 NA。
它适用于小型数据集(见下面的玩具示例),但是当我切换到全国数据集(300 万条记录)时出现了几个问题:除了非常缓慢之外,创建 fill 数据框时还耗尽了(超级)计算机的内存,我收到错误 "Error: cannot allocate vector of size 1529.0 Gb"
我该如何解决这个问题? 非常感谢!
library(dplyr)
library(tidyr)
# Fixed: this was misspelled `libray`, which aborts the script with
# "could not find function" before big.matrix() is ever reached.
library(bigmemory)

# Toy data: one row per individual donation.
candidate_id <- c("cand_1", "cand_1", "cand_1", "cand_2", "cand_3")
donor_id <- c("don_1", "don_1", "don_2", "don_2", "don_3")
donation <- c(1, 2, 3.5, 4, 10)
df <- data.frame(candidate_id, donor_id, donation)
colnames(df) <- c("candidate_id", "donor_id", "donation")

# Sum the donations of every (candidate, donor) pair, then add a row for each
# pair that never occurs.
# NOTE(review): grouping and completing on `df$...` expressions instead of the
# bare column names is kept as posted; it yields columns named after those
# expressions, which is why the result is de-duplicated and renamed below.
fill <- df %>%
  group_by(df$candidate_id, df$donor_id) %>%
  summarise(tot_donation = sum(as.numeric(donation))) %>%
  complete(df$candidate_id, df$donor_id)

# Keep the first three columns, drop repeated rows, and restore plain names.
fill <- unique(fill[, 1:3])
colnames(fill) <- c("candidate_id", "donor_id", "tot_donation")

# Matrix dimensions and labels: one row per candidate, one column per donor.
nrow <- length(unique(df$candidate_id))
ncol <- length(unique(df$donor_id))
row_names <- unique(fill$candidate_id)
col_names <- unique(fill$donor_id)

# Dense nrow x ncol matrix, every cell initialised to NA.
x <- big.matrix(nrow, ncol, init = NA, dimnames = list(row_names, col_names))

# Fill the matrix cell by cell. Each iteration compares every row of `fill`
# against the current (candidate, donor) labels, so the total work grows with
# nrow * ncol * nrow(fill).
for (i in seq_len(nrow)) {
  for (j in seq_len(ncol)) {
    x[i, j] <- fill[which(fill$candidate_id == row_names[i] &
                            fill$donor_id == col_names[j]), 3]
  }
}
你可以试试
library(reshape2)
# Cast the long `fill` table to wide form: one row per candidate_id, one
# column per donor_id, each cell aggregated from tot_donation with sum().
dcast(
  data = fill,
  formula = candidate_id ~ donor_id,
  fun.aggregate = sum,
  value.var = "tot_donation"
)
我不知道它是否会避免内存问题,但它可能比双 for
循环快得多。
我得赶去开会了,不过我有点好奇是否有办法用 outer
来做到这一点。
我看到你使用 unique
是因为你的输出中有重复的值。
根据一个相关的回答(原链接已丢失),您应该尝试以下写法以避免产生重复:
# Build `fill` from the inside out:
#   1. group the donations by (candidate_id, donor_id),
#   2. sum the donations within each pair,
#   3. drop the grouping so complete() works on the whole table,
#   4. add a row for every candidate/donor combination that is absent.
fill <- complete(
  ungroup(
    summarise(
      group_by(df, candidate_id, donor_id),
      tot_donation = sum(donation)
    )
  ),
  candidate_id,
  donor_id
)
然后您可以尝试创建您想要的输出吗?
我认为 unique
可能会占用大量资源,
所以尽量避免调用它。
本杰明建议的 tidyr
版本应该是:
# Widen `fill`: donor_id values become the column names, tot_donation the cells.
spread(data = fill, key = donor_id, value = tot_donation)
编辑:顺便说一句,因为你用 sparse-matrix
标记了问题,
您确实可以利用稀疏性来发挥自己的优势:
library(Matrix)
library(dplyr)

# Toy data: one row per individual donation.
df <- data.frame(
  candidate_id = c("cand_1", "cand_1", "cand_1", "cand_2", "cand_3"),
  donor_id = c("don_1", "don_1", "don_2", "don_2", "don_3"),
  donation = c(1, 2, 3.5, 4, 10)
)

# Convert the ids to factors explicitly. Everything below uses the factor
# levels as matrix labels and the integer codes as positions; since R 4.0
# data.frame() keeps strings as character, so without this step nlevels()
# is 0 and levels() is NULL.
df$candidate_id <- factor(df$candidate_id)
df$donor_id <- factor(df$donor_id)

# Total donation for every (candidate, donor) pair that actually occurs.
summ <- df %>%
  group_by(candidate_id, donor_id) %>%
  summarise(tot_donation = sum(donation)) %>%
  ungroup()

num_candidates <- nlevels(df$candidate_id)
num_donors <- nlevels(df$donor_id)

# (row, column) position of each observed pair, taken from the factor codes.
indices <- cbind(
  candidate_id = as.integer(summ$candidate_id),
  donor_id = as.integer(summ$donor_id)
)

# Build the sparse matrix in a single call: only the observed pairs are
# stored, the absent ones stay implicit zeros (printed as ".").
smat <- sparseMatrix(
  i = indices[, "candidate_id"],
  j = indices[, "donor_id"],
  x = summ$tot_donation,
  dims = c(num_candidates, num_donors),
  dimnames = list(levels(df$candidate_id), levels(df$donor_id))
)
smat
3 x 3 sparse Matrix of class "dgCMatrix"
don_1 don_2 don_3
cand_1 3 3.5 .
cand_2 . 4.0 .
cand_3 . . 10