加快R中大数据for循环的处理时间
Speed up the processing time of for loop for big data in R
我有非常大的数据集 bdd_cases
有 150,000 行和 bdd_control
包含 1500 万行。在这里,为了简单起见,我减少了这些数据集的大小,并作为驱动器 link 给出。除其他事项外,我正在尝试根据 cluster_case
和 subset
变量将匹配行从 bdd_control
添加到 bdd_cases
。
我为此编写了以下 for loop
,它非常适合此处给出的小型数据集示例。即使是这个小数据集也需要大约 13 秒。
#import data
id1 <- "199TNlYFwqzzWpi1iY5qX1-M11UoC51Cp"
id2 <- "1TeFCkqLDtEBz0JMBHh8goNWEjYol4O2z"
bdd_cases <- as.data.frame(read.csv(sprintf("https://docs.google.com/uc?id=%s&export=download", id1)))
bdd_control <- as.data.frame(read.csv(sprintf("https://docs.google.com/uc?id=%s&export=download", id2)))
#declare empty dataframe
bdd_temp <- NULL
list_p <- unique(bdd_cases$cluster_case)
#for loop
for (i in 1:length(list_p)) {
temp <- bdd_cases %>%
filter(cluster_case==list_p[i]) #select the first case from bdd_cases
temp0 <- bdd_control %>% filter(subset==temp$subset) #select the rows from bdd_control that match the first case above on the subset variable
temp <- rbind(temp, temp0) #bind the two
temp$cluster_case <- list_p[i] #add the ith cluster_case to all the rows
temp <- temp %>%
group_by(cluster_case) %>% #group by cluster case
mutate(age_diff = abs(age - age[case_control=="case"]), #calculate difference in age between case and controls
fup_diff = foll_up - foll_up[case_control=="case"], #calculate difference in foll_up between case and controls
age_fup = ifelse(age_diff<=2 & fup_diff==0,"accept","delete")) %>% #keep the matching controls and remove the other controls for the ith cluster_case
filter(age_fup=="accept") %>%
select(-age_fup)
bdd_temp <- bdd_temp %>% # finally add this matched case and control to the empty dataframe
bind_rows(temp)
}
当我对具有数百万行的原始数据集尝试相同的 for loop
时,我的问题出现了。我的程序已经 运行 2 天了。我 运行 它在 R studio server
上,它有 64 个内核和 270 GB RAM。
我已经参考了以前的帖子,例如这篇文章 (Speed up the loop operation in R),其中讨论了矢量化和列表而不是数据帧的使用。但是,我无法将这些应用到我的具体情况。
我可以对 for loop
中的命令进行任何具体改进以加快执行速度吗?
速度上的任何小改进都意义重大。谢谢。
这应该会大大加快速度。
在我的系统上,速度增益大约是 5 倍。
#import data
id1 <- "199TNlYFwqzzWpi1iY5qX1-M11UoC51Cp"
id2 <- "1TeFCkqLDtEBz0JMBHh8goNWEjYol4O2z"
library(data.table)
# use fread for reading, fast and get a nice progress bar as bonus
bdd_cases <- fread(sprintf("https://docs.google.com/uc?id=%s&export=download", id1))
bdd_control <- fread(sprintf("https://docs.google.com/uc?id=%s&export=download", id2))
#Put everything in a list
L <- lapply(unique(bdd_cases$cluster_case), function(x){
temp <- rbind(bdd_cases[cluster_case == x, ],
bdd_control[subset == bdd_cases[cluster_case == x, ]$subset])
temp[, cluster_case := x]
temp[, `:=`(age_diff = abs(age - age[case_control=="case"]),
fup_diff = foll_up - foll_up[case_control=="case"])]
temp[age_diff <= 2 & fup_diff == 0, ]
})
#Rowbind the list
final <- rbindlist(L, use.names = TRUE, fill = TRUE)
我有非常大的数据集 bdd_cases
有 150,000 行和 bdd_control
包含 1500 万行。在这里,为了简单起见,我减少了这些数据集的大小,并作为驱动器 link 给出。除其他事项外,我正在尝试根据 cluster_case
和 subset
变量将匹配行从 bdd_control
添加到 bdd_cases
。
我为此编写了以下 for loop
,它非常适合此处给出的小型数据集示例。即使是这个小数据集也需要大约 13 秒。
#import data
id1 <- "199TNlYFwqzzWpi1iY5qX1-M11UoC51Cp"
id2 <- "1TeFCkqLDtEBz0JMBHh8goNWEjYol4O2z"
bdd_cases <- as.data.frame(read.csv(sprintf("https://docs.google.com/uc?id=%s&export=download", id1)))
bdd_control <- as.data.frame(read.csv(sprintf("https://docs.google.com/uc?id=%s&export=download", id2)))
#declare empty dataframe
bdd_temp <- NULL
list_p <- unique(bdd_cases$cluster_case)
#for loop
for (i in 1:length(list_p)) {
temp <- bdd_cases %>%
filter(cluster_case==list_p[i]) #select the first case from bdd_cases
temp0 <- bdd_control %>% filter(subset==temp$subset) #select the rows from bdd_control that match the first case above on the subset variable
temp <- rbind(temp, temp0) #bind the two
temp$cluster_case <- list_p[i] #add the ith cluster_case to all the rows
temp <- temp %>%
group_by(cluster_case) %>% #group by cluster case
mutate(age_diff = abs(age - age[case_control=="case"]), #calculate difference in age between case and controls
fup_diff = foll_up - foll_up[case_control=="case"], #calculate difference in foll_up between case and controls
age_fup = ifelse(age_diff<=2 & fup_diff==0,"accept","delete")) %>% #keep the matching controls and remove the other controls for the ith cluster_case
filter(age_fup=="accept") %>%
select(-age_fup)
bdd_temp <- bdd_temp %>% # finally add this matched case and control to the empty dataframe
bind_rows(temp)
}
当我对具有数百万行的原始数据集尝试相同的 for loop
时,我的问题出现了。我的程序已经 运行 2 天了。我 运行 它在 R studio server
上,它有 64 个内核和 270 GB RAM。
我已经参考了以前的帖子,例如这篇文章 (Speed up the loop operation in R),其中讨论了矢量化和列表而不是数据帧的使用。但是,我无法将这些应用到我的具体情况。
我可以对 for loop
中的命令进行任何具体改进以加快执行速度吗?
速度上的任何小改进都意义重大。谢谢。
这应该会大大加快速度。
在我的系统上,速度增益大约是 5 倍。
#import data
id1 <- "199TNlYFwqzzWpi1iY5qX1-M11UoC51Cp"
id2 <- "1TeFCkqLDtEBz0JMBHh8goNWEjYol4O2z"
library(data.table)
# use fread for reading, fast and get a nice progress bar as bonus
bdd_cases <- fread(sprintf("https://docs.google.com/uc?id=%s&export=download", id1))
bdd_control <- fread(sprintf("https://docs.google.com/uc?id=%s&export=download", id2))
#Put everything in a list
L <- lapply(unique(bdd_cases$cluster_case), function(x){
temp <- rbind(bdd_cases[cluster_case == x, ],
bdd_control[subset == bdd_cases[cluster_case == x, ]$subset])
temp[, cluster_case := x]
temp[, `:=`(age_diff = abs(age - age[case_control=="case"]),
fup_diff = foll_up - foll_up[case_control=="case"])]
temp[age_diff <= 2 & fup_diff == 0, ]
})
#Rowbind the list
final <- rbindlist(L, use.names = TRUE, fill = TRUE)