如何将嵌套 for 循环转换为 lapply 以在 R 中进行代码优化

How to convert nested for-loops to lapply for code optimization in R

我正在尝试将 for 循环转换为任何应用系列以进行代码优化

这是示例数据

my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

实际函数(使用 for 循环)这个函数给了我们预期的输出

expand_collapse_compliance <- function(right_table){
  
  sector_list <- unique(right_table$Sector)
  df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
  
  for(s in sector_list){
    df1 = right_table[right_table$Sector==s,]
    sector1 = df1$Sector[1]
    Sector = df1$Sector[1]
    Sub_Sector = ""
    actual = as.character(nrow(df1))
    mainrow = c(sector1,Sector,Sub_Sector,actual)
    df = rbind(df,mainrow)
    Sub_Sector_list <- unique(df1$Sub_Sector)
    
    for(i in Sub_Sector_list){
      df2 = right_table[right_table$Sub_Sector==i,]
      sector1 = df1$Sector[1]
      Sector = ""
      Sub_Sector = df2$Sub_Sector[1]
      actual = nrow(df2)
      subrow = c(sector1,Sector,Sub_Sector,actual)
      df = rbind(df,subrow)
    }
  }
  df = df[2:nrow(df),]
  df$Actual = as.numeric(df$Actual)
  df_total = nrow(right_table)
  df = rbind(df,c("","Total","",df_total))
  return(df)
  
}

DT::datatable(expand_collapse_compliance(mydata1), 
              rownames = F,escape = FALSE,
              selection=list(mode="single",target="row"),
              options = list(pageLength = 50,scrollX = TRUE,
                             dom = 'tp',ordering=F,
                             columnDefs = list(list(visible=FALSE, targets=0),
              list(className = 'dt-left', targets = '_all'))),class='hover cell-border stripe')

我尝试先将内部循环转换为 lapply,同时这样做 sub_sector 值未显示在输出 table 中,请告诉我如何修复任何想法受到赞赏

expand_collapse_compliance <- function(right_table){
  sector_list <- unique(right_table$Sector)
  df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
  
  for(s in sector_list){
    df1 = right_table[right_table$Sector==s,]
    sector1 = df1$Sector[1]
    Sector = df1$Sector[1]
    Sub_Sector = ""
    actual = as.character(nrow(df1))
    mainrow = c(sector1,Sector,Sub_Sector,actual)
    df = rbind(df,mainrow)
    Sub_Sector_list <- unique(df1$Sub_Sector)
    
    #for(i in Sub_Sector_list){
      lapply(Sub_Sector_list, function(x){
      df2 = right_table[right_table$Sub_Sector==Sub_Sector_list,]
      sector1 = df1$Sector[1]
      Sector = ""
      Sub_Sector = df2$Sub_Sector[1]
      actual = nrow(df2)
      subrow = c(sector1,Sector,Sub_Sector,actual)
      df = rbind(df,subrow)
      })
  }
  df = df[2:nrow(df),]
  df$Actual = as.numeric(df$Actual)
  df_total = nrow(right_table)
  df = rbind(df,c("","Total","",df_total))
  return(df)
  
}

使用 dplyrtidyr 你可以:

注意:我删除了 DT 部分。

library(dplyr)
library(tidyr)

expand_collapse_compliance1 <- function(x) {
  x <- x %>% 
    count(Sector, Sub_Sector, name = "Actual") %>% 
    group_split(Sector) %>% 
    lapply(function(x) {
      main <- group_by(x, Sector) %>% summarise(Actual = sum(Actual)) 
      bind_rows(main, x)
    }) %>%
    bind_rows() %>% 
    mutate(Sector1 = Sector) %>%
    select(Sector1, Sector, Sub_Sector, Actual)
  
  total <- x %>%
    filter(is.na(Sub_Sector)) %>% 
    group_by(Sector = "Total") %>% 
    summarise(Actual = sum(Actual))
  
  bind_rows(x, total) %>% 
    mutate(Sector = ifelse(!is.na(Sub_Sector), "", Sector)) %>% 
    replace_na(list(Sub_Sector = "", Sector1 = ""))
}

expand_collapse_compliance1(my_data)
#> # A tibble: 9 × 4
#>   Sector1 Sector  Sub_Sector Actual
#>   <chr>   <chr>   <chr>       <int>
#> 1 "AAA"   "AAA"   ""              4
#> 2 "AAA"   ""      "AAA1"          4
#> 3 "BBB"   "BBB"   ""              2
#> 4 "BBB"   ""      "BBB1"          1
#> 5 "BBB"   ""      "BBB2"          1
#> 6 "CCC"   "CCC"   ""              2
#> 7 "CCC"   ""      "CCC1"          1
#> 8 "CCC"   ""      "CCC2"          1
#> 9 ""      "Total" ""              8

expand_collapse_compliance(my_data)
#>    Sector1 Sector Sub_Sector Actual
#> 2      AAA    AAA                 4
#> 3      AAA              AAA1      4
#> 4      BBB    BBB                 2
#> 5      BBB              BBB1      1
#> 6      BBB              BBB2      1
#> 7      CCC    CCC                 2
#> 8      CCC              CCC1      1
#> 9      CCC              CCC2      1
#> 91          Total                 8

*apply 系列的适当功能可以是 tapply 使用拆分-应用-组合方法。因为我们只在有多个Sub_Sector的时候才需要tapply,所以为了速度我们实现了一个案例处理。

expand_collapse_complianceA <- \(data) {
  r <- do.call(rbind, c(by(data, data$Sector, \(x) {
    if (length(unique(x$Sub_Sector)) != 1L) {
      tt <- t(unname(with(x, tapply(count, list(Sector, Sub_Sector), sum))))
      tt <- cbind(x[!duplicated(x$Sub_Sector), 1:2], foo='', Actual=tt)
    } else {
      tt <- as.data.frame(t(c(unlist(x[!duplicated(x$Sub_Sector), 1:2]), foo='',
                              Actual=sum(x$count))))
    }
    rbind(c(tt[1, 1], '', tt[1, 1], sum(as.numeric(tt[, 4]))), tt)[c(1, 3, 2, 4)]
  }), make.row.names=FALSE))
  rbind(r, c('', 'Total', '', sum(as.numeric(r$Actual[!r$foo %in% ''])))) |>
    setNames(c('Sector1', 'Sector', 'Sub_Sector', 'Actual'))
}

注: R version 4.1.2 (2021-11-01).

给予

expand_collapse_compliance(my_data)
#   Sector1 Sector Sub_Sector Actual
# 1     AAA    AAA                 4
# 2     AAA              AAA1      4
# 3     BBB    BBB                 2
# 4     BBB              BBB1      1
# 5     BBB              BBB2      1
# 6     CCC    CCC                 2
# 7     CCC              CCC1      1
# 8     CCC              CCC2      1
# 9          Total                 8


expand_collapse_complianceA(my_data) |> 
  (\(x) DT::datatable(
    x, rownames=F, escape=FALSE, selection=list(mode="single", target="row"), 
    options=list(pageLength=50, scrollX=TRUE, dom='tp', ordering=F, 
                 columnDefs=list(list(visible=FALSE, targets=0),
                                 list(className='dt-left', targets='_all'))), 
    class='hover cell-border stripe'))()

expand_collapse_complianceA 现在只需要原来 for 循环的 1/10 的时间。这里有一个 基准测试 (在 1080 行上测试)。

# Unit: milliseconds
#       expr        min         lq       mean     median         uq       max neval cld
#    ecc_for 304.723781 305.426934 346.878188 308.208294 335.944407 598.94351    10   c
# ecc_tapply  29.768177  29.851975  31.083977  30.611982  32.058980  34.50901    10 a  
#   ecc_tidy 135.326594 135.952068 143.967550 138.475437 149.352409 164.94652    10  b 
#     ecc_DT   3.267969   3.611711   4.610916   3.664493   3.707528  13.48797    10 a  

当然data.table更快。但是,我希望在数据即将达到 exhaust the RAM.

时查看性能

基准代码:

microbenchmark::microbenchmark(
  ecc_for=expand_collapse_compliance(dat),
  ecc_tapply=expand_collapse_complianceA(dat),
  ecc_tidy={library(dplyr);library(tidyr);expand_collapse_compliance1(dat)},
  ecc_DT={library(data.table);expand_collapse_complianceDT(as.data.table(dat))},
  times=10L)

请注意,“整洁”版本到目前为止存在一些缺陷(至少对于新数据而言)。

res_for <- expand_collapse_compliance(dat)
res_tapply <- expand_collapse_complianceA(dat)
res_tidy <- {library(dplyr);library(tidyr);expand_collapse_compliance1(dat)}


all.equal(res_for, res_tapply, check.attributes=FALSE)
# [1] TRUE
all.equal(res_for, res_tidy, check.attributes=FALSE)
# [1] "Component “Sub_Sector”: 1053 string mismatches"             
# [2] "Component “Actual”: target is character, current is numeric"

数据

dat <- expand.grid(Sector=c("AA", "AB", "AC", "AD", "AE", "AF", "AG", "AH", "AI", "AJ", 
                     "AK", "AL", "AM", "AN", "AO", "AP", "AQ", "AR", "AS", "AT", "AU", 
                     "AV", "AW", "AX", "AY", "AZ", "BA"),
            Sub_Sector=1:40, stringsAsFactors=F)
dat <- transform(dat, Sub_Sector=Reduce(paste0, dat[1:2]), count=1, type='Actual')
dat <- dat[order(dat$Sector), ]

不需要任何循环或应用,我们在这里需要的是三个不同的组计数和一些格式。假设 - 如示例数据所示 - 不需要拆分。

my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

library(data.table)
setDT(my_data)

expand_collapse_compliance <- function(x) {
  x <- rbindlist(list(
    x[, .(Sector1 = Sector, Actual = .N), by = Sector], 
    setnames(x[, .(Actual = .N), by = .(Sector, Sub_Sector)], "Sector", "Sector1"),
    x[, .(Sector = "Total", Actual = .N)]
  ), fill = T)
  setcolorder(x, c("Sector1", "Sector", "Sub_Sector", "Actual"))
  setorder(x, Sector1, Sector, na.last = T)
  x
}

expand_collapse_compliance(my_data)

#    Sector1 Sector Sub_Sector Actual
# 1:     AAA    AAA       <NA>      4
# 2:     AAA   <NA>       AAA1      4
# 3:     BBB    BBB       <NA>      2
# 4:     BBB   <NA>       BBB1      1
# 5:     BBB   <NA>       BBB2      1
# 6:     CCC    CCC       <NA>      2
# 7:     CCC   <NA>       CCC1      1
# 8:     CCC   <NA>       CCC2      1
# 9:    <NA>  Total       <NA>      8

旁注 无需将 NA 转换为 "",因为在 shiny DT 中会显示 blancs。

速度测试

正如我提到的,for 在非常小的数据集上通常更快,因为任何库解决方案都使用一些需要一些时间才能加载一次的函数...

my_data_small = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

library(data.table)
setDT(my_data)

   test replications elapsed relative
2  eccB          150    0.32     1.00
1 eccDT          150    0.72     2.25

# well just make it a milion times bigger :D
my_data_large <- rbindlist(rep(list(my_data_small), 1000000L))

   test replications elapsed relative
2  eccB           50   79.30    5.146
1 eccDT           50   15.41    1.000