如何将嵌套 for 循环转换为 lapply 以在 R 中进行代码优化
How to convert nested for-loops to lapply for code optimization in R
我正在尝试将 for 循环转换为任何应用系列以进行代码优化
这是示例数据
my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
"BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
"CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
"Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
-8L))
实际函数(使用 for 循环)这个函数给了我们预期的输出
expand_collapse_compliance <- function(right_table){
sector_list <- unique(right_table$Sector)
df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
for(s in sector_list){
df1 = right_table[right_table$Sector==s,]
sector1 = df1$Sector[1]
Sector = df1$Sector[1]
Sub_Sector = ""
actual = as.character(nrow(df1))
mainrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,mainrow)
Sub_Sector_list <- unique(df1$Sub_Sector)
for(i in Sub_Sector_list){
df2 = right_table[right_table$Sub_Sector==i,]
sector1 = df1$Sector[1]
Sector = ""
Sub_Sector = df2$Sub_Sector[1]
actual = nrow(df2)
subrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,subrow)
}
}
df = df[2:nrow(df),]
df$Actual = as.numeric(df$Actual)
df_total = nrow(right_table)
df = rbind(df,c("","Total","",df_total))
return(df)
}
DT::datatable(expand_collapse_compliance(mydata1),
rownames = F,escape = FALSE,
selection=list(mode="single",target="row"),
options = list(pageLength = 50,scrollX = TRUE,
dom = 'tp',ordering=F,
columnDefs = list(list(visible=FALSE, targets=0),
list(className = 'dt-left', targets = '_all'))),class='hover cell-border stripe')
我尝试先将内部循环转换为 lapply,同时这样做 sub_sector 值未显示在输出 table 中,请告诉我如何修复任何想法受到赞赏
expand_collapse_compliance <- function(right_table){
sector_list <- unique(right_table$Sector)
df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
for(s in sector_list){
df1 = right_table[right_table$Sector==s,]
sector1 = df1$Sector[1]
Sector = df1$Sector[1]
Sub_Sector = ""
actual = as.character(nrow(df1))
mainrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,mainrow)
Sub_Sector_list <- unique(df1$Sub_Sector)
#for(i in Sub_Sector_list){
lapply(Sub_Sector_list, function(x){
df2 = right_table[right_table$Sub_Sector==Sub_Sector_list,]
sector1 = df1$Sector[1]
Sector = ""
Sub_Sector = df2$Sub_Sector[1]
actual = nrow(df2)
subrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,subrow)
})
}
df = df[2:nrow(df),]
df$Actual = as.numeric(df$Actual)
df_total = nrow(right_table)
df = rbind(df,c("","Total","",df_total))
return(df)
}
使用 dplyr
和 tidyr
你可以:
注意:我删除了 DT
部分。
library(dplyr)
library(tidyr)
expand_collapse_compliance1 <- function(x) {
x <- x %>%
count(Sector, Sub_Sector, name = "Actual") %>%
group_split(Sector) %>%
lapply(function(x) {
main <- group_by(x, Sector) %>% summarise(Actual = sum(Actual))
bind_rows(main, x)
}) %>%
bind_rows() %>%
mutate(Sector1 = Sector) %>%
select(Sector1, Sector, Sub_Sector, Actual)
total <- x %>%
filter(is.na(Sub_Sector)) %>%
group_by(Sector = "Total") %>%
summarise(Actual = sum(Actual))
bind_rows(x, total) %>%
mutate(Sector = ifelse(!is.na(Sub_Sector), "", Sector)) %>%
replace_na(list(Sub_Sector = "", Sector1 = ""))
}
expand_collapse_compliance1(my_data)
#> # A tibble: 9 × 4
#> Sector1 Sector Sub_Sector Actual
#> <chr> <chr> <chr> <int>
#> 1 "AAA" "AAA" "" 4
#> 2 "AAA" "" "AAA1" 4
#> 3 "BBB" "BBB" "" 2
#> 4 "BBB" "" "BBB1" 1
#> 5 "BBB" "" "BBB2" 1
#> 6 "CCC" "CCC" "" 2
#> 7 "CCC" "" "CCC1" 1
#> 8 "CCC" "" "CCC2" 1
#> 9 "" "Total" "" 8
expand_collapse_compliance(my_data)
#> Sector1 Sector Sub_Sector Actual
#> 2 AAA AAA 4
#> 3 AAA AAA1 4
#> 4 BBB BBB 2
#> 5 BBB BBB1 1
#> 6 BBB BBB2 1
#> 7 CCC CCC 2
#> 8 CCC CCC1 1
#> 9 CCC CCC2 1
#> 91 Total 8
*apply
系列的适当功能可以是 tapply
使用拆分-应用-组合方法。因为我们只在有多个Sub_Sector
的时候才需要tapply
,所以为了速度我们实现了一个案例处理。
expand_collapse_complianceA <- \(data) {
r <- do.call(rbind, c(by(data, data$Sector, \(x) {
if (length(unique(x$Sub_Sector)) != 1L) {
tt <- t(unname(with(x, tapply(count, list(Sector, Sub_Sector), sum))))
tt <- cbind(x[!duplicated(x$Sub_Sector), 1:2], foo='', Actual=tt)
} else {
tt <- as.data.frame(t(c(unlist(x[!duplicated(x$Sub_Sector), 1:2]), foo='',
Actual=sum(x$count))))
}
rbind(c(tt[1, 1], '', tt[1, 1], sum(as.numeric(tt[, 4]))), tt)[c(1, 3, 2, 4)]
}), make.row.names=FALSE))
rbind(r, c('', 'Total', '', sum(as.numeric(r$Actual[!r$foo %in% ''])))) |>
setNames(c('Sector1', 'Sector', 'Sub_Sector', 'Actual'))
}
注: R version 4.1.2 (2021-11-01)
.
给予
expand_collapse_compliance(my_data)
# Sector1 Sector Sub_Sector Actual
# 1 AAA AAA 4
# 2 AAA AAA1 4
# 3 BBB BBB 2
# 4 BBB BBB1 1
# 5 BBB BBB2 1
# 6 CCC CCC 2
# 7 CCC CCC1 1
# 8 CCC CCC2 1
# 9 Total 8
expand_collapse_complianceA(my_data) |>
(\(x) DT::datatable(
x, rownames=F, escape=FALSE, selection=list(mode="single", target="row"),
options=list(pageLength=50, scrollX=TRUE, dom='tp', ordering=F,
columnDefs=list(list(visible=FALSE, targets=0),
list(className='dt-left', targets='_all'))),
class='hover cell-border stripe'))()
expand_collapse_complianceA
现在只需要原来 for
循环的 1/10 的时间。这里有一个 基准测试 (在 1080 行上测试)。
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# ecc_for 304.723781 305.426934 346.878188 308.208294 335.944407 598.94351 10 c
# ecc_tapply 29.768177 29.851975 31.083977 30.611982 32.058980 34.50901 10 a
# ecc_tidy 135.326594 135.952068 143.967550 138.475437 149.352409 164.94652 10 b
# ecc_DT 3.267969 3.611711 4.610916 3.664493 3.707528 13.48797 10 a
当然data.table更快。但是,我希望在数据即将达到 exhaust the RAM.
时查看性能
基准代码:
microbenchmark::microbenchmark(
ecc_for=expand_collapse_compliance(dat),
ecc_tapply=expand_collapse_complianceA(dat),
ecc_tidy={library(dplyr);library(tidyr);expand_collapse_compliance1(dat)},
ecc_DT={library(data.table);expand_collapse_complianceDT(as.data.table(dat))},
times=10L)
请注意,“整洁”版本到目前为止存在一些缺陷(至少对于新数据而言)。
res_for <- expand_collapse_compliance(dat)
res_tapply <- expand_collapse_complianceA(dat)
res_tidy <- {library(dplyr);library(tidyr);expand_collapse_compliance1(dat)}
all.equal(res_for, res_tapply, check.attributes=FALSE)
# [1] TRUE
all.equal(res_for, res_tidy, check.attributes=FALSE)
# [1] "Component “Sub_Sector”: 1053 string mismatches"
# [2] "Component “Actual”: target is character, current is numeric"
数据
dat <- expand.grid(Sector=c("AA", "AB", "AC", "AD", "AE", "AF", "AG", "AH", "AI", "AJ",
"AK", "AL", "AM", "AN", "AO", "AP", "AQ", "AR", "AS", "AT", "AU",
"AV", "AW", "AX", "AY", "AZ", "BA"),
Sub_Sector=1:40, stringsAsFactors=F)
dat <- transform(dat, Sub_Sector=Reduce(paste0, dat[1:2]), count=1, type='Actual')
dat <- dat[order(dat$Sector), ]
不需要任何循环或应用,我们在这里需要的是三个不同的组计数和一些格式。假设 - 如示例数据所示 - 不需要拆分。
my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
"BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
"CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
"Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
-8L))
library(data.table)
setDT(my_data)
expand_collapse_compliance <- function(x) {
x <- rbindlist(list(
x[, .(Sector1 = Sector, Actual = .N), by = Sector],
setnames(x[, .(Actual = .N), by = .(Sector, Sub_Sector)], "Sector", "Sector1"),
x[, .(Sector = "Total", Actual = .N)]
), fill = T)
setcolorder(x, c("Sector1", "Sector", "Sub_Sector", "Actual"))
setorder(x, Sector1, Sector, na.last = T)
x
}
expand_collapse_compliance(my_data)
# Sector1 Sector Sub_Sector Actual
# 1: AAA AAA <NA> 4
# 2: AAA <NA> AAA1 4
# 3: BBB BBB <NA> 2
# 4: BBB <NA> BBB1 1
# 5: BBB <NA> BBB2 1
# 6: CCC CCC <NA> 2
# 7: CCC <NA> CCC1 1
# 8: CCC <NA> CCC2 1
# 9: <NA> Total <NA> 8
旁注
无需将 NA
转换为 ""
,因为在 shiny DT 中会显示 blancs。
速度测试
正如我提到的,for
在非常小的数据集上通常更快,因为任何库解决方案都使用一些需要一些时间才能加载一次的函数...
my_data_small = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
"BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
"CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
"Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
-8L))
library(data.table)
setDT(my_data)
test replications elapsed relative
2 eccB 150 0.32 1.00
1 eccDT 150 0.72 2.25
# well just make it a milion times bigger :D
my_data_large <- rbindlist(rep(list(my_data_small), 1000000L))
test replications elapsed relative
2 eccB 50 79.30 5.146
1 eccDT 50 15.41 1.000
我正在尝试将 for 循环转换为任何应用系列以进行代码优化
这是示例数据
my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
"BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
"CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
"Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
-8L))
实际函数(使用 for 循环)这个函数给了我们预期的输出
expand_collapse_compliance <- function(right_table){
sector_list <- unique(right_table$Sector)
df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
for(s in sector_list){
df1 = right_table[right_table$Sector==s,]
sector1 = df1$Sector[1]
Sector = df1$Sector[1]
Sub_Sector = ""
actual = as.character(nrow(df1))
mainrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,mainrow)
Sub_Sector_list <- unique(df1$Sub_Sector)
for(i in Sub_Sector_list){
df2 = right_table[right_table$Sub_Sector==i,]
sector1 = df1$Sector[1]
Sector = ""
Sub_Sector = df2$Sub_Sector[1]
actual = nrow(df2)
subrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,subrow)
}
}
df = df[2:nrow(df),]
df$Actual = as.numeric(df$Actual)
df_total = nrow(right_table)
df = rbind(df,c("","Total","",df_total))
return(df)
}
DT::datatable(expand_collapse_compliance(mydata1),
rownames = F,escape = FALSE,
selection=list(mode="single",target="row"),
options = list(pageLength = 50,scrollX = TRUE,
dom = 'tp',ordering=F,
columnDefs = list(list(visible=FALSE, targets=0),
list(className = 'dt-left', targets = '_all'))),class='hover cell-border stripe')
我尝试先将内部循环转换为 lapply,同时这样做 sub_sector 值未显示在输出 table 中,请告诉我如何修复任何想法受到赞赏
expand_collapse_compliance <- function(right_table){
sector_list <- unique(right_table$Sector)
df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
for(s in sector_list){
df1 = right_table[right_table$Sector==s,]
sector1 = df1$Sector[1]
Sector = df1$Sector[1]
Sub_Sector = ""
actual = as.character(nrow(df1))
mainrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,mainrow)
Sub_Sector_list <- unique(df1$Sub_Sector)
#for(i in Sub_Sector_list){
lapply(Sub_Sector_list, function(x){
df2 = right_table[right_table$Sub_Sector==Sub_Sector_list,]
sector1 = df1$Sector[1]
Sector = ""
Sub_Sector = df2$Sub_Sector[1]
actual = nrow(df2)
subrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,subrow)
})
}
df = df[2:nrow(df),]
df$Actual = as.numeric(df$Actual)
df_total = nrow(right_table)
df = rbind(df,c("","Total","",df_total))
return(df)
}
使用 dplyr
和 tidyr
你可以:
注意:我删除了 DT
部分。
library(dplyr)
library(tidyr)
expand_collapse_compliance1 <- function(x) {
x <- x %>%
count(Sector, Sub_Sector, name = "Actual") %>%
group_split(Sector) %>%
lapply(function(x) {
main <- group_by(x, Sector) %>% summarise(Actual = sum(Actual))
bind_rows(main, x)
}) %>%
bind_rows() %>%
mutate(Sector1 = Sector) %>%
select(Sector1, Sector, Sub_Sector, Actual)
total <- x %>%
filter(is.na(Sub_Sector)) %>%
group_by(Sector = "Total") %>%
summarise(Actual = sum(Actual))
bind_rows(x, total) %>%
mutate(Sector = ifelse(!is.na(Sub_Sector), "", Sector)) %>%
replace_na(list(Sub_Sector = "", Sector1 = ""))
}
expand_collapse_compliance1(my_data)
#> # A tibble: 9 × 4
#> Sector1 Sector Sub_Sector Actual
#> <chr> <chr> <chr> <int>
#> 1 "AAA" "AAA" "" 4
#> 2 "AAA" "" "AAA1" 4
#> 3 "BBB" "BBB" "" 2
#> 4 "BBB" "" "BBB1" 1
#> 5 "BBB" "" "BBB2" 1
#> 6 "CCC" "CCC" "" 2
#> 7 "CCC" "" "CCC1" 1
#> 8 "CCC" "" "CCC2" 1
#> 9 "" "Total" "" 8
expand_collapse_compliance(my_data)
#> Sector1 Sector Sub_Sector Actual
#> 2 AAA AAA 4
#> 3 AAA AAA1 4
#> 4 BBB BBB 2
#> 5 BBB BBB1 1
#> 6 BBB BBB2 1
#> 7 CCC CCC 2
#> 8 CCC CCC1 1
#> 9 CCC CCC2 1
#> 91 Total 8
*apply
系列的适当功能可以是 tapply
使用拆分-应用-组合方法。因为我们只在有多个Sub_Sector
的时候才需要tapply
,所以为了速度我们实现了一个案例处理。
expand_collapse_complianceA <- \(data) {
r <- do.call(rbind, c(by(data, data$Sector, \(x) {
if (length(unique(x$Sub_Sector)) != 1L) {
tt <- t(unname(with(x, tapply(count, list(Sector, Sub_Sector), sum))))
tt <- cbind(x[!duplicated(x$Sub_Sector), 1:2], foo='', Actual=tt)
} else {
tt <- as.data.frame(t(c(unlist(x[!duplicated(x$Sub_Sector), 1:2]), foo='',
Actual=sum(x$count))))
}
rbind(c(tt[1, 1], '', tt[1, 1], sum(as.numeric(tt[, 4]))), tt)[c(1, 3, 2, 4)]
}), make.row.names=FALSE))
rbind(r, c('', 'Total', '', sum(as.numeric(r$Actual[!r$foo %in% ''])))) |>
setNames(c('Sector1', 'Sector', 'Sub_Sector', 'Actual'))
}
注: R version 4.1.2 (2021-11-01)
.
给予
expand_collapse_compliance(my_data)
# Sector1 Sector Sub_Sector Actual
# 1 AAA AAA 4
# 2 AAA AAA1 4
# 3 BBB BBB 2
# 4 BBB BBB1 1
# 5 BBB BBB2 1
# 6 CCC CCC 2
# 7 CCC CCC1 1
# 8 CCC CCC2 1
# 9 Total 8
expand_collapse_complianceA(my_data) |>
(\(x) DT::datatable(
x, rownames=F, escape=FALSE, selection=list(mode="single", target="row"),
options=list(pageLength=50, scrollX=TRUE, dom='tp', ordering=F,
columnDefs=list(list(visible=FALSE, targets=0),
list(className='dt-left', targets='_all'))),
class='hover cell-border stripe'))()
expand_collapse_complianceA
现在只需要原来 for
循环的 1/10 的时间。这里有一个 基准测试 (在 1080 行上测试)。
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# ecc_for 304.723781 305.426934 346.878188 308.208294 335.944407 598.94351 10 c
# ecc_tapply 29.768177 29.851975 31.083977 30.611982 32.058980 34.50901 10 a
# ecc_tidy 135.326594 135.952068 143.967550 138.475437 149.352409 164.94652 10 b
# ecc_DT 3.267969 3.611711 4.610916 3.664493 3.707528 13.48797 10 a
当然data.table更快。但是,我希望在数据即将达到 exhaust the RAM.
时查看性能基准代码:
microbenchmark::microbenchmark(
ecc_for=expand_collapse_compliance(dat),
ecc_tapply=expand_collapse_complianceA(dat),
ecc_tidy={library(dplyr);library(tidyr);expand_collapse_compliance1(dat)},
ecc_DT={library(data.table);expand_collapse_complianceDT(as.data.table(dat))},
times=10L)
请注意,“整洁”版本到目前为止存在一些缺陷(至少对于新数据而言)。
res_for <- expand_collapse_compliance(dat)
res_tapply <- expand_collapse_complianceA(dat)
res_tidy <- {library(dplyr);library(tidyr);expand_collapse_compliance1(dat)}
all.equal(res_for, res_tapply, check.attributes=FALSE)
# [1] TRUE
all.equal(res_for, res_tidy, check.attributes=FALSE)
# [1] "Component “Sub_Sector”: 1053 string mismatches"
# [2] "Component “Actual”: target is character, current is numeric"
数据
dat <- expand.grid(Sector=c("AA", "AB", "AC", "AD", "AE", "AF", "AG", "AH", "AI", "AJ",
"AK", "AL", "AM", "AN", "AO", "AP", "AQ", "AR", "AS", "AT", "AU",
"AV", "AW", "AX", "AY", "AZ", "BA"),
Sub_Sector=1:40, stringsAsFactors=F)
dat <- transform(dat, Sub_Sector=Reduce(paste0, dat[1:2]), count=1, type='Actual')
dat <- dat[order(dat$Sector), ]
不需要任何循环或应用,我们在这里需要的是三个不同的组计数和一些格式。假设 - 如示例数据所示 - 不需要拆分。
my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
"BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
"CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
"Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
-8L))
library(data.table)
setDT(my_data)
expand_collapse_compliance <- function(x) {
x <- rbindlist(list(
x[, .(Sector1 = Sector, Actual = .N), by = Sector],
setnames(x[, .(Actual = .N), by = .(Sector, Sub_Sector)], "Sector", "Sector1"),
x[, .(Sector = "Total", Actual = .N)]
), fill = T)
setcolorder(x, c("Sector1", "Sector", "Sub_Sector", "Actual"))
setorder(x, Sector1, Sector, na.last = T)
x
}
expand_collapse_compliance(my_data)
# Sector1 Sector Sub_Sector Actual
# 1: AAA AAA <NA> 4
# 2: AAA <NA> AAA1 4
# 3: BBB BBB <NA> 2
# 4: BBB <NA> BBB1 1
# 5: BBB <NA> BBB2 1
# 6: CCC CCC <NA> 2
# 7: CCC <NA> CCC1 1
# 8: CCC <NA> CCC2 1
# 9: <NA> Total <NA> 8
旁注
无需将 NA
转换为 ""
,因为在 shiny DT 中会显示 blancs。
速度测试
正如我提到的,for
在非常小的数据集上通常更快,因为任何库解决方案都使用一些需要一些时间才能加载一次的函数...
my_data_small = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
"BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
"CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
"Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
-8L))
library(data.table)
setDT(my_data)
test replications elapsed relative
2 eccB 150 0.32 1.00
1 eccDT 150 0.72 2.25
# well just make it a milion times bigger :D
my_data_large <- rbindlist(rep(list(my_data_small), 1000000L))
test replications elapsed relative
2 eccB 50 79.30 5.146
1 eccDT 50 15.41 1.000