改进读取文件夹中文件的功能

improving a function to read files in a folder

我构建了这个函数:

alterations <- function() {
  
  if (!require(readr)) {install.packages("readr")}
  if (!require(stringr)) {install.packages("stringr")}
  if (!require(data.table)) {install.packages("data.table")}
  
  temp <- list.files(pattern = "*_automat.lif")
  dados <- NULL
  
  for (i in 1:length(temp)) {
    
    df1 <- fread(file = temp[[i]],
                 select = c(1, 18),
                 col.names = c("name", "classificador"))[order(name)]
    
    df2 <-fread(str_remove(temp[[i]], "_automat"),
                select = c(1, 18),
                col.names = c("name", "validador"))[order(name)]
    
    tb1 <- cbind(df1[, 2], df2[, 2]) %>%
      table()
    
    df3 <- cbind(ciclo = temp[[i]],
                 validation_date = str_sub(file.info(temp[[i]])$mtime, 1, 10), 
                 as.data.table(tb1))
    
    dados <- rbind(dados, df3)
  }
  
  write_csv(dados, file = "dados_brutos.csv")
  return(dados)
  
}

我需要帮助来改进这个功能:

  1. 我在一个文件夹中有数百个文件,名称为:aaa.lif、aaa_automat.lif、bbb.lif、bbb_automat.lif、ccc.lif, ccc.automat.lif, ... 具有相同的列和数千行,但某些列中的值不同(如代码中指定的第 18 列)。我需要将这些文件的第 18 列 (classificador) 绑定到相同的第 1 列 (name)。但是,有些文件有问题,文件 *_automat.lif 中没有某些行。我尝试使用 merge 代替 cbind 来按列 name 合并 data.tables df1 和 df2,但是执行函数的时间更糟。

  2. 不知道我的for(){}是否高效,有没有更好的方法?

数据示例:

dput(df1)
setDT(structure(list(name = c("2020-12-01_00_34_54.029_1009_1943.png", 
"2020-12-01_00_34_54.029_1025_394.png", "2020-12-01_00_34_54.029_1077_1739.png", 
"2020-12-01_00_34_54.029_1345_631.png", "2020-12-01_00_34_54.029_1360_1538.png", 
"2020-12-01_00_34_54.029_1435_1340.png", "2020-12-01_00_34_54.029_1508_352.png"
), classificador = c("organism", "shadow", "coscinodiscus", "shadow", 
"shadow", "shadow", "shadow")), row.names = c(NA, -7L), class = c("data.table", 
"data.frame")))

dput(df2)
setDT(structure(list(name = c("2020-12-01_00_34_54.029_1009_1943.png", 
    "2020-12-01_00_34_54.029_1025_394.png", "2020-12-01_00_34_54.029_1077_1739.png", 
    "2020-12-01_00_34_54.029_1345_631.png", "2020-12-01_00_34_54.029_1360_1538.png", 
    "2020-12-01_00_34_54.029_1435_1340.png", "2020-12-01_00_34_54.029_1508_352.png"
    ), validador = c("shadow", "shadow", "coscinodiscus", "shadow", 
    "shadow", "shadow", "shadow")), row.names = c(NA, -7L), class = c("data.table", 
    "data.frame")))

输出:

                                                  ciclo validation_date classificador     validador N
1: Basler_2020-12-01 00_34_52.441983_frames_automat.lif      2021-07-09 coscinodiscus coscinodiscus 1
2: Basler_2020-12-01 00_34_52.441983_frames_automat.lif      2021-07-09      organism coscinodiscus 0
3: Basler_2020-12-01 00_34_52.441983_frames_automat.lif      2021-07-09        shadow coscinodiscus 0
4: Basler_2020-12-01 00_34_52.441983_frames_automat.lif      2021-07-09 coscinodiscus        shadow 0
5: Basler_2020-12-01 00_34_52.441983_frames_automat.lif      2021-07-09      organism        shadow 1
6: Basler_2020-12-01 00_34_52.441983_frames_automat.lif      2021-07-09        shadow        shadow 5
7: Basler_2020-12-01 01_35_01.902191_frames_automat.lif      2021-07-10        shadow        shadow 7

谢谢

我认为这与您尝试实现的目标非常接近。

files <- list.files(pattern = "*_automat.lif")

dados <- lapply(files, function(file) {
  
  df1 <- fread(file = file, select = c(1, 18), col.names = c("name", "classificador"))
  df2 <- fread(str_remove(file, "_automat"), select = c(1, 18), col.names = c("name", "validador"))
  
  tbl <- merge(df1, df2)
  tbl[, ciclo := file]
  tbl[, validation_date := str_sub(file.info(file)$mtime, 1, 10)]
  tbl
  
})

dados <- rbindlist(dados)

dados[, name := NULL]
setcolorder(dados, c("ciclo", "validation_date", "classificador", "validador"))

# from the output it seems you want to group and show counts
dados <- dados[, .(N = .N), by = .(ciclo, validation_date, classificador, validador)]

write_csv(dados, file = "dados_brutos.csv")