改进读取文件夹中文件的功能
improving a function to read files in a folder
我构建了这个函数:
alterations <- function() {
if (!require(readr)) {install.packages("readr")}
if (!require(stringr)) {install.packages("stringr")}
if (!require(data.table)) {install.packages("data.table")}
temp <- list.files(pattern = "*_automat.lif")
dados <- NULL
for (i in 1:length(temp)) {
df1 <- fread(file = temp[[i]],
select = c(1, 18),
col.names = c("name", "classificador"))[order(name)]
df2 <-fread(str_remove(temp[[i]], "_automat"),
select = c(1, 18),
col.names = c("name", "validador"))[order(name)]
tb1 <- cbind(df1[, 2], df2[, 2]) %>%
table()
df3 <- cbind(ciclo = temp[[i]],
validation_date = str_sub(file.info(temp[[i]])$mtime, 1, 10),
as.data.table(tb1))
dados <- rbind(dados, df3)
}
write_csv(dados, file = "dados_brutos.csv")
return(dados)
}
我需要帮助来改进这个功能:
我在一个文件夹中有数百个文件,名称为:aaa.lif、aaa_automat.lif、bbb.lif、bbb_automat.lif、ccc.lif, ccc.automat.lif, ... 具有相同的列和数千行,但某些列中的值不同(如代码中指定的第 18 列)。我需要将这些文件的第 18 列 (classificador) 绑定到相同的第 1 列 (name)。但是,有些文件有问题,文件 *_automat.lif 中没有某些行。我尝试使用 merge
代替 cbind
来按列 name 合并 data.tables df1 和 df2,但是执行函数的时间更糟。
不知道我的for(){}是否高效,有没有更好的方法?
数据示例:
dput(df1)
setDT(structure(list(name = c("2020-12-01_00_34_54.029_1009_1943.png",
"2020-12-01_00_34_54.029_1025_394.png", "2020-12-01_00_34_54.029_1077_1739.png",
"2020-12-01_00_34_54.029_1345_631.png", "2020-12-01_00_34_54.029_1360_1538.png",
"2020-12-01_00_34_54.029_1435_1340.png", "2020-12-01_00_34_54.029_1508_352.png"
), classificador = c("organism", "shadow", "coscinodiscus", "shadow",
"shadow", "shadow", "shadow")), row.names = c(NA, -7L), class = c("data.table",
"data.frame")))
dput(df2)
setDT(structure(list(name = c("2020-12-01_00_34_54.029_1009_1943.png",
"2020-12-01_00_34_54.029_1025_394.png", "2020-12-01_00_34_54.029_1077_1739.png",
"2020-12-01_00_34_54.029_1345_631.png", "2020-12-01_00_34_54.029_1360_1538.png",
"2020-12-01_00_34_54.029_1435_1340.png", "2020-12-01_00_34_54.029_1508_352.png"
), validador = c("shadow", "shadow", "coscinodiscus", "shadow",
"shadow", "shadow", "shadow")), row.names = c(NA, -7L), class = c("data.table",
"data.frame")))
输出:
ciclo validation_date classificador validador N
1: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 coscinodiscus coscinodiscus 1
2: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 organism coscinodiscus 0
3: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 shadow coscinodiscus 0
4: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 coscinodiscus shadow 0
5: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 organism shadow 1
6: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 shadow shadow 5
7: Basler_2020-12-01 01_35_01.902191_frames_automat.lif 2021-07-10 shadow shadow 7
谢谢
我认为这与您尝试实现的目标非常接近。
files <- list.files(pattern = "*_automat.lif")
dados <- lapply(files, function(file) {
df1 <- fread(file = file, select = c(1, 18), col.names = c("name", "classificador"))
df2 <- fread(str_remove(file, "_automat"), select = c(1, 18), col.names = c("name", "validador"))
tbl <- merge(df1, df2)
tbl[, ciclo := file]
tbl[, validation_date := str_sub(file.info(file)$mtime, 1, 10)]
tbl
})
dados <- rbindlist(dados)
dados[, name := NULL]
setcolorder(dados, c("ciclo", "validation_date", "classificador", "validador"))
# from the output it seems you want to group and show counts
dados <- dados[, .(N = .N), by = .(ciclo, validation_date, classificador, validador)]
write_csv(dados, file = "dados_brutos.csv")
我构建了这个函数:
alterations <- function() {
if (!require(readr)) {install.packages("readr")}
if (!require(stringr)) {install.packages("stringr")}
if (!require(data.table)) {install.packages("data.table")}
temp <- list.files(pattern = "*_automat.lif")
dados <- NULL
for (i in 1:length(temp)) {
df1 <- fread(file = temp[[i]],
select = c(1, 18),
col.names = c("name", "classificador"))[order(name)]
df2 <-fread(str_remove(temp[[i]], "_automat"),
select = c(1, 18),
col.names = c("name", "validador"))[order(name)]
tb1 <- cbind(df1[, 2], df2[, 2]) %>%
table()
df3 <- cbind(ciclo = temp[[i]],
validation_date = str_sub(file.info(temp[[i]])$mtime, 1, 10),
as.data.table(tb1))
dados <- rbind(dados, df3)
}
write_csv(dados, file = "dados_brutos.csv")
return(dados)
}
我需要帮助来改进这个功能:
我在一个文件夹中有数百个文件,名称为:aaa.lif、aaa_automat.lif、bbb.lif、bbb_automat.lif、ccc.lif, ccc.automat.lif, ... 具有相同的列和数千行,但某些列中的值不同(如代码中指定的第 18 列)。我需要将这些文件的第 18 列 (classificador) 绑定到相同的第 1 列 (name)。但是,有些文件有问题,文件 *_automat.lif 中没有某些行。我尝试使用
merge
代替cbind
来按列 name 合并 data.tables df1 和 df2,但是执行函数的时间更糟。不知道我的for(){}是否高效,有没有更好的方法?
数据示例:
dput(df1)
setDT(structure(list(name = c("2020-12-01_00_34_54.029_1009_1943.png",
"2020-12-01_00_34_54.029_1025_394.png", "2020-12-01_00_34_54.029_1077_1739.png",
"2020-12-01_00_34_54.029_1345_631.png", "2020-12-01_00_34_54.029_1360_1538.png",
"2020-12-01_00_34_54.029_1435_1340.png", "2020-12-01_00_34_54.029_1508_352.png"
), classificador = c("organism", "shadow", "coscinodiscus", "shadow",
"shadow", "shadow", "shadow")), row.names = c(NA, -7L), class = c("data.table",
"data.frame")))
dput(df2)
setDT(structure(list(name = c("2020-12-01_00_34_54.029_1009_1943.png",
"2020-12-01_00_34_54.029_1025_394.png", "2020-12-01_00_34_54.029_1077_1739.png",
"2020-12-01_00_34_54.029_1345_631.png", "2020-12-01_00_34_54.029_1360_1538.png",
"2020-12-01_00_34_54.029_1435_1340.png", "2020-12-01_00_34_54.029_1508_352.png"
), validador = c("shadow", "shadow", "coscinodiscus", "shadow",
"shadow", "shadow", "shadow")), row.names = c(NA, -7L), class = c("data.table",
"data.frame")))
输出:
ciclo validation_date classificador validador N
1: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 coscinodiscus coscinodiscus 1
2: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 organism coscinodiscus 0
3: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 shadow coscinodiscus 0
4: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 coscinodiscus shadow 0
5: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 organism shadow 1
6: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 shadow shadow 5
7: Basler_2020-12-01 01_35_01.902191_frames_automat.lif 2021-07-10 shadow shadow 7
谢谢
我认为这与您尝试实现的目标非常接近。
files <- list.files(pattern = "*_automat.lif")
dados <- lapply(files, function(file) {
df1 <- fread(file = file, select = c(1, 18), col.names = c("name", "classificador"))
df2 <- fread(str_remove(file, "_automat"), select = c(1, 18), col.names = c("name", "validador"))
tbl <- merge(df1, df2)
tbl[, ciclo := file]
tbl[, validation_date := str_sub(file.info(file)$mtime, 1, 10)]
tbl
})
dados <- rbindlist(dados)
dados[, name := NULL]
setcolorder(dados, c("ciclo", "validation_date", "classificador", "validador"))
# from the output it seems you want to group and show counts
dados <- dados[, .(N = .N), by = .(ciclo, validation_date, classificador, validador)]
write_csv(dados, file = "dados_brutos.csv")