在 R 中,如何根据文件名将 X 个固定宽度的平面文件读入 Y 个数据帧?
In R, how can you read X number of fixed-width flat files into Y data frames based on the file name?
所以我有 80 个文件,文件名格式为:
P.A3588.ACO.CCLF0.D00001.TO30000
P.A3588.ACO.CCLF0.D30001.TO60000
...
P.A3588.ACO.CCLF1.D30001.TO60000
P.A3588.ACO.CCLF1.D30001.TO60000
...
P.A3588.ACO.CCLF9.D30001.TO60000
P.A3588.ACO.CCLF9.D30001.TO60000
有 80 个固定宽度的文本文件:10 个 CCLF 编号(CCLF0、CCLF1、...、CCLF9)各有 8 个部分。我希望能够按 CCLF 编号分组,应用列宽向量,添加列名,并绑定 CCLF 部件的行。
以下是我迄今为止尝试过的内容。它不起作用,但可以让您了解我想要实现什么。
# Asker's attempt (stated in the question not to work). Reviewer notes below
# mark the lines that keep it from doing what is described.
# List the files found in dataPath (dataPath is not defined in this snippet).
filenames <- list.files(dataPath)
# Characters 13-17 of each file name, e.g. "CCLF0" for the names shown above.
# NOTE(review): `names` shares its name with base R's names() function.
names <- substr(filenames,13,17)
# Column widths for each CCLF layout, one numeric vector per CCLF number.
CCLF1_width <- c(13,6,11,2,10,10,1,1,7,7,2,17,1,2,2,4,1,10,10,10,10,10,2,10,10,10,11,2,2,1,1,1)
CCLF2_width <- c(13,10,11,2,10,10,4,10,5,11,6,10,10,24,17,2,2,2,2,2)
CCLF3_width <- c(13,11,2,2,7,10,11,6,10,10,1)
CCLF4_width <- c(13,11,2,1,2,7,11,6,10,10,7,1)
CCLF5_width <- c(13,10,11,2,10,10,3,2,2,1,2,10,10,5,15,1,7,10,10,2,2,2,10,10,40,11,17,24,2,2,2,2,2,2,7,7,7,7,7,7,7,7,1)
CCLF6_width <- c(13,10,11,2,10,10,1,2,10,10,5,15,1,10,10,2,2,2,10,10,40,11,17,2)
CCLF7_width <- c(13,11,11,2,10,2,20,1,1,24,9,2,20,13,2,10,10,12,9)
CCLF8_width <- c(11,2,3,5,10,1,1,3,2,2,10,10,10,30,15,40,1,1)
CCLF9_width <- c(11,11,10,10,12)
CCLF0_width <- c(11,11)
# NOTE(review): length(filenames) is a single number, so the loop body runs
# once (with i equal to the file count) rather than once per file;
# seq_along(filenames) would visit every file.
for (i in length(filenames)){
# NOTE(review): the name argument is the whole substr() vector, not a single
# name selected by i.
assign(paste0(substr(filenames,13,17)),
# NOTE(review): grepl() returns a logical vector, not a file path.
read_fwf(grepl("CCLF1",filenames),
# NOTE(review): this builds a character string from i (e.g. "80_width"); it
# does not look up any of the CCLF*_width vectors defined above.
paste0(i,"_width")))
}
这个问题可以用一个 for 循环和一个函数来解决。我编写了函数 combine_fwf,
用于将具有给定 CCLF 编号的所有固定宽度文件合并到一个数据帧中。list.files
这一行查找当前工作目录中文件名包含 CCLF# 的所有文件,其中 # 是您的编号之一 (0-9)。然后使用 read.fwf
将每个文件读入数据帧,用 cbind 加上 CCLF 编号列,再用 rbind 合并到整体结果中。
#' Combine all fixed-width files for one CCLF number into one data frame
#'
#' Searches the current working directory for files whose names contain
#' "CCLF<CCLF_num>", reads each with read.fwf(), prepends a CCLF_num column,
#' and row-binds the results.
#'
#' @param CCLF_num CCLF number used to build the file-name pattern.
#' @param colwidths Column widths passed to read.fwf().
#' @return A data frame with a leading CCLF_num column, or NULL when no file
#'   matches the pattern.
combine_fwf <- function(CCLF_num, colwidths) {
  filenames <- list.files(pattern = paste0("CCLF", CCLF_num))
  # Iterating over the names themselves (rather than 1:length(filenames))
  # means nothing is read when no file matches, instead of indexing
  # element 1 of an empty vector.
  df_list <- lapply(filenames, function(filename) {
    cbind.data.frame(
      CCLF_num = CCLF_num,
      read.fwf(filename, widths = colwidths)
    )
  })
  do.call(rbind, df_list)
}
combine_fwf(2, c(12,6))
>> CCLF_num V1 V2
1 2 adsfasdfadsf 123123
2 2 lkjhlkjhlkjh 98098
3 2 adsfasdfadsf 123123
4 2 lkjhlkjhlkjh 98098
5 2 adsfasdfadsf 123123
6 2 lkjhlkjhlkjh 98098
> combine_fwf(1, c(12,6))
CCLF_num V1 V2
1 1 adsfasdfadsf 123123
2 1 lkjhlkjhlkjh 98098
3 1 adsfasdfadsf 123123
4 1 lkjhlkjhlkjh 98098
5 1 adsfasdfadsf 123123
6 1 lkjhlkjhlkjh 98098
考虑对两个等长的列表进行映射,使用 Map(mapply 的包装函数)
按元素逐对迭代:
# File-name prefixes for the ten CCLF numbers: "CCLF0" ... "CCLF9".
# (seq(0:9) would yield 1:10, i.e. "CCLF1" ... "CCLF10".)
cclf_files <- paste0("CCLF", 0:9)

# Column widths per CCLF number, listed in the same order as cclf_files so
# the two can be iterated over pairwise.
cclf_widths <- list(
  CCLF0_width = c(11,11),
  CCLF1_width = c(13,6,11,2,10,10,1,1,7,7,2,17,1,2,2,4,1,10,10,10,10,10,2,10,10,10,11,2,2,1,1,1),
  CCLF2_width = c(13,10,11,2,10,10,4,10,5,11,6,10,10,24,17,2,2,2,2,2),
  CCLF3_width = c(13,11,2,2,7,10,11,6,10,10,1),
  CCLF4_width = c(13,11,2,1,2,7,11,6,10,10,7,1),
  CCLF5_width = c(13,10,11,2,10,10,3,2,2,1,2,10,10,5,15,1,7,10,10,2,2,2,10,10,40,11,17,24,2,2,2,2,2,2,7,7,7,7,7,7,7,7,1),
  CCLF6_width = c(13,10,11,2,10,10,1,2,10,10,5,15,1,10,10,2,2,2,10,10,40,11,17,2),
  CCLF7_width = c(13,11,11,2,10,2,20,1,1,24,9,2,20,13,2,10,10,12,9),
  CCLF8_width = c(11,2,3,5,10,1,1,3,2,2,10,10,10,30,15,40,1,1),
  CCLF9_width = c(11,11,10,10,12)
)
#' Read and combine every fixed-width file for one CCLF prefix
#'
#' @param f File-name pattern (regular expression) identifying the CCLF
#'   number, e.g. "CCLF1".
#' @param w Numeric vector of column widths passed to read.fwf().
#' @param path Directory to search; defaults to the previously hard-coded
#'   location.
#' @return A data frame of all readable matching files row-bound together,
#'   or NULL when no file matches or none can be read.
proc_files <- function(f, w, path = "/path/to/cclf/files") {
  # RETRIEVE FILES WITH CURRENT CCLF# IN NAME
  # full.names = TRUE so read.fwf() gets paths that resolve regardless of
  # the current working directory.
  files <- list.files(path = path, pattern = f, full.names = TRUE)
  print(basename(files))

  # BUILD A LIST OF DFs FROM ALL FILES WITH CURRENT CCLF#_width
  df_list <- lapply(files, function(x) {
    tryCatch(
      read.fwf(x, widths = w),
      error = function(e) {
        # Report the failure instead of silently injecting an NA row into
        # the combined result; rbind() skips NULL entries.
        warning("Failed to read ", x, ": ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )
  })

  # ROW BIND ALL DFs TO FINAL FOR RETURN
  do.call(rbind, df_list)
}
# Pair each CCLF prefix with its width vector, run proc_files() on every
# pair, and label the results by prefix.
df_list <- Map(proc_files, cclf_files, cclf_widths)
names(df_list) <- cclf_files

# Look up the combined data frame for individual CCLF numbers.
df_list[["CCLF0"]]
df_list[["CCLF1"]]
df_list[["CCLF2"]]
...
所以我有 80 个文件,文件名格式为:
P.A3588.ACO.CCLF0.D00001.TO30000
P.A3588.ACO.CCLF0.D30001.TO60000
...
P.A3588.ACO.CCLF1.D30001.TO60000
P.A3588.ACO.CCLF1.D30001.TO60000
...
P.A3588.ACO.CCLF9.D30001.TO60000
P.A3588.ACO.CCLF9.D30001.TO60000
有 80 个固定宽度的文本文件:10 个 CCLF 编号(CCLF0、CCLF1、...、CCLF9)各有 8 个部分。我希望能够按 CCLF 编号分组,应用列宽向量,添加列名,并绑定 CCLF 部件的行。
以下是我迄今为止尝试过的内容。它不起作用,但可以让您了解我想要实现什么。
# Asker's attempt (stated in the question not to work). Reviewer notes below
# mark the lines that keep it from doing what is described.
# List the files found in dataPath (dataPath is not defined in this snippet).
filenames <- list.files(dataPath)
# Characters 13-17 of each file name, e.g. "CCLF0" for the names shown above.
# NOTE(review): `names` shares its name with base R's names() function.
names <- substr(filenames,13,17)
# Column widths for each CCLF layout, one numeric vector per CCLF number.
CCLF1_width <- c(13,6,11,2,10,10,1,1,7,7,2,17,1,2,2,4,1,10,10,10,10,10,2,10,10,10,11,2,2,1,1,1)
CCLF2_width <- c(13,10,11,2,10,10,4,10,5,11,6,10,10,24,17,2,2,2,2,2)
CCLF3_width <- c(13,11,2,2,7,10,11,6,10,10,1)
CCLF4_width <- c(13,11,2,1,2,7,11,6,10,10,7,1)
CCLF5_width <- c(13,10,11,2,10,10,3,2,2,1,2,10,10,5,15,1,7,10,10,2,2,2,10,10,40,11,17,24,2,2,2,2,2,2,7,7,7,7,7,7,7,7,1)
CCLF6_width <- c(13,10,11,2,10,10,1,2,10,10,5,15,1,10,10,2,2,2,10,10,40,11,17,2)
CCLF7_width <- c(13,11,11,2,10,2,20,1,1,24,9,2,20,13,2,10,10,12,9)
CCLF8_width <- c(11,2,3,5,10,1,1,3,2,2,10,10,10,30,15,40,1,1)
CCLF9_width <- c(11,11,10,10,12)
CCLF0_width <- c(11,11)
# NOTE(review): length(filenames) is a single number, so the loop body runs
# once (with i equal to the file count) rather than once per file;
# seq_along(filenames) would visit every file.
for (i in length(filenames)){
# NOTE(review): the name argument is the whole substr() vector, not a single
# name selected by i.
assign(paste0(substr(filenames,13,17)),
# NOTE(review): grepl() returns a logical vector, not a file path.
read_fwf(grepl("CCLF1",filenames),
# NOTE(review): this builds a character string from i (e.g. "80_width"); it
# does not look up any of the CCLF*_width vectors defined above.
paste0(i,"_width")))
}
这个问题可以用一个 for 循环和一个函数来解决。我编写了函数 combine_fwf,
用于将具有给定 CCLF 编号的所有固定宽度文件合并到一个数据帧中。list.files
这一行查找当前工作目录中文件名包含 CCLF# 的所有文件,其中 # 是您的编号之一 (0-9)。然后使用 read.fwf
将每个文件读入数据帧,用 cbind 加上 CCLF 编号列,再用 rbind 合并到整体结果中。
#' Combine all fixed-width files for one CCLF number into one data frame
#'
#' Searches the current working directory for files whose names contain
#' "CCLF<CCLF_num>", reads each with read.fwf(), prepends a CCLF_num column,
#' and row-binds the results.
#'
#' @param CCLF_num CCLF number used to build the file-name pattern.
#' @param colwidths Column widths passed to read.fwf().
#' @return A data frame with a leading CCLF_num column, or NULL when no file
#'   matches the pattern.
combine_fwf <- function(CCLF_num, colwidths) {
  filenames <- list.files(pattern = paste0("CCLF", CCLF_num))
  # Iterating over the names themselves (rather than 1:length(filenames))
  # means nothing is read when no file matches, instead of indexing
  # element 1 of an empty vector.
  df_list <- lapply(filenames, function(filename) {
    cbind.data.frame(
      CCLF_num = CCLF_num,
      read.fwf(filename, widths = colwidths)
    )
  })
  do.call(rbind, df_list)
}
combine_fwf(2, c(12,6))
>> CCLF_num V1 V2
1 2 adsfasdfadsf 123123
2 2 lkjhlkjhlkjh 98098
3 2 adsfasdfadsf 123123
4 2 lkjhlkjhlkjh 98098
5 2 adsfasdfadsf 123123
6 2 lkjhlkjhlkjh 98098
> combine_fwf(1, c(12,6))
CCLF_num V1 V2
1 1 adsfasdfadsf 123123
2 1 lkjhlkjhlkjh 98098
3 1 adsfasdfadsf 123123
4 1 lkjhlkjhlkjh 98098
5 1 adsfasdfadsf 123123
6 1 lkjhlkjhlkjh 98098
考虑对两个等长的列表进行映射,使用 Map(mapply 的包装函数)
按元素逐对迭代:
# File-name prefixes for the ten CCLF numbers: "CCLF0" ... "CCLF9".
# (seq(0:9) would yield 1:10, i.e. "CCLF1" ... "CCLF10".)
cclf_files <- paste0("CCLF", 0:9)

# Column widths per CCLF number, listed in the same order as cclf_files so
# the two can be iterated over pairwise.
cclf_widths <- list(
  CCLF0_width = c(11,11),
  CCLF1_width = c(13,6,11,2,10,10,1,1,7,7,2,17,1,2,2,4,1,10,10,10,10,10,2,10,10,10,11,2,2,1,1,1),
  CCLF2_width = c(13,10,11,2,10,10,4,10,5,11,6,10,10,24,17,2,2,2,2,2),
  CCLF3_width = c(13,11,2,2,7,10,11,6,10,10,1),
  CCLF4_width = c(13,11,2,1,2,7,11,6,10,10,7,1),
  CCLF5_width = c(13,10,11,2,10,10,3,2,2,1,2,10,10,5,15,1,7,10,10,2,2,2,10,10,40,11,17,24,2,2,2,2,2,2,7,7,7,7,7,7,7,7,1),
  CCLF6_width = c(13,10,11,2,10,10,1,2,10,10,5,15,1,10,10,2,2,2,10,10,40,11,17,2),
  CCLF7_width = c(13,11,11,2,10,2,20,1,1,24,9,2,20,13,2,10,10,12,9),
  CCLF8_width = c(11,2,3,5,10,1,1,3,2,2,10,10,10,30,15,40,1,1),
  CCLF9_width = c(11,11,10,10,12)
)
#' Read and combine every fixed-width file for one CCLF prefix
#'
#' @param f File-name pattern (regular expression) identifying the CCLF
#'   number, e.g. "CCLF1".
#' @param w Numeric vector of column widths passed to read.fwf().
#' @param path Directory to search; defaults to the previously hard-coded
#'   location.
#' @return A data frame of all readable matching files row-bound together,
#'   or NULL when no file matches or none can be read.
proc_files <- function(f, w, path = "/path/to/cclf/files") {
  # RETRIEVE FILES WITH CURRENT CCLF# IN NAME
  # full.names = TRUE so read.fwf() gets paths that resolve regardless of
  # the current working directory.
  files <- list.files(path = path, pattern = f, full.names = TRUE)
  print(basename(files))

  # BUILD A LIST OF DFs FROM ALL FILES WITH CURRENT CCLF#_width
  df_list <- lapply(files, function(x) {
    tryCatch(
      read.fwf(x, widths = w),
      error = function(e) {
        # Report the failure instead of silently injecting an NA row into
        # the combined result; rbind() skips NULL entries.
        warning("Failed to read ", x, ": ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )
  })

  # ROW BIND ALL DFs TO FINAL FOR RETURN
  do.call(rbind, df_list)
}
# Pair each CCLF prefix with its width vector, run proc_files() on every
# pair, and label the results by prefix.
df_list <- Map(proc_files, cclf_files, cclf_widths)
names(df_list) <- cclf_files

# Look up the combined data frame for individual CCLF numbers.
df_list[["CCLF0"]]
df_list[["CCLF1"]]
df_list[["CCLF2"]]
...