从列数不等的数据框列表中提取列名,并在 R 中生成一个列名数据框
Extract colname from a list of unequal length of dataframes and make a colname dataframe in R
我正在尝试导入一批列数不等的 csv 文件。其中一些共享相同的列名,而另一些则有各自独有的列名。我知道如何把它们全部导入并绑定成一个数据框,同时使用 fill = TRUE 来处理列数不一致的问题。但我想把每个 csv 的列名绑定到一个数据框中,这样就可以看出它们之间有何不同。最高效的方法是什么?
# Create three example CSV files whose column counts differ.
write.csv(mtcars[, 1:5], "mtcars5.csv")
write.csv(mtcars[, 1:6], "mtcars6.csv")
write.csv(mtcars[, 1:4], "mtcars4.csv")

# Collect every CSV below the directory returned by here(). The dot is escaped
# so that only a literal ".csv" extension matches; an unescaped ".*csv$" would
# also pick up any file whose name merely ends in the letters "csv".
files_to_read <- list.files(
  path = here(),
  pattern = "\\.csv$",
  recursive = TRUE,
  full.names = TRUE
)

# Bind all the CSVs into one table, although only the headers are wanted.
cars <- rbindlist(lapply(files_to_read, fread), fill = TRUE)

# Alternatively, read them all into a list first -- but how can the column
# names be extracted from each element and bound into a data frame?
dflist <- lapply(files_to_read, fread)
预期输出如下:
df1 mpg cyl disp hp
df2 mpg cyl disp hp drat
df3 mpg cyl disp hp drat wt
试试这个:
# Read only the first line of every file as data (header = FALSE), so each
# file contributes a single row that holds its column names.
cars <- rbindlist(
  lapply(
    files_to_read,
    function(path) fread(path, nrows = 1, header = FALSE)
  ),
  fill = TRUE
)
# Overwrite the first column with the row names of `cars`.
cars[, 1] <- rownames(cars)
第一列是文件索引。
如果您希望第一列是文件名,请使用
# Replace the first column with the paths stored in files_to_read.
cars[, 1] <- files_to_read
这样会包含完整路径,可能不是您想要的。如果只想要不带路径的文件名,请在调用 list.files 时使用 full.names = FALSE,并将 cars[, 1] 设置为其结果:
# List the CSV files again with full.names = FALSE so that the directory
# given in `path` is not prepended to the names. The dot is escaped so that
# only a literal ".csv" extension matches.
cars[, 1] <- list.files(
  path = here(),
  pattern = "\\.csv$",
  recursive = TRUE,
  full.names = FALSE
)
或者您也可以用正则表达式把文件名从完整路径中提取(grep)出来,但如果您和我一样还没有掌握正则表达式,这似乎要费不少功夫。
下面是使用 purrr 的一种做法:
#' List the column names of one CSV file in long format.
#'
#' Only the header and the first data row are parsed, because the column
#' names do not depend on the remaining rows.
#'
#' @param file_name Path of the CSV file, passed to `read.csv()`.
#' @return A tibble with one row per column of the file: `data` (the
#'   `file_name` that was given), `col_index` (position of the column) and
#'   `col_names` (the name as reported by `read.csv()`).
get_col_df <- function(file_name) {
  header_only <- read.csv(file_name, nrows = 1L)
  col_names <- names(header_only)
  # The tibble is the last expression (not an assignment) so that the result
  # is returned visibly.
  tibble::tibble(
    data = file_name,
    col_index = seq_along(col_names),
    col_names = col_names
  )
}
library(purrr)
# Apply get_col_df to every file and row-bind the per-file results.
col_names_df <- map_dfr(files_to_read, get_col_df)
这个输出
# A tibble: 18 x 3
data col_index col_names
<chr> <int> <chr>
1 mtcars4.csv 1 X
2 mtcars4.csv 2 mpg
3 mtcars4.csv 3 cyl
4 mtcars4.csv 4 disp
5 mtcars4.csv 5 hp
6 mtcars5.csv 1 X
7 mtcars5.csv 2 mpg
8 mtcars5.csv 3 cyl
9 mtcars5.csv 4 disp
10 mtcars5.csv 5 hp
11 mtcars5.csv 6 drat
12 mtcars6.csv 1 X
13 mtcars6.csv 2 mpg
14 mtcars6.csv 3 cyl
15 mtcars6.csv 4 disp
16 mtcars6.csv 5 hp
17 mtcars6.csv 6 drat
18 mtcars6.csv 7 wt
如果您想将它们排成一行以便于阅读
library(dplyr)
library(tidyr)
# Spread the long table: the values of col_index become the new column
# headers and the cells are filled from col_names.
pivot_wider(
  col_names_df,
  names_from = col_index,
  values_from = col_names
)
输出
# A tibble: 3 x 8
data `1` `2` `3` `4` `5` `6` `7`
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 mtcars4.csv X mpg cyl disp hp NA NA
2 mtcars5.csv X mpg cyl disp hp drat NA
3 mtcars6.csv X mpg cyl disp hp drat wt
我正在尝试导入一批列数不等的 csv 文件。其中一些共享相同的列名,而另一些则有各自独有的列名。我知道如何把它们全部导入并绑定成一个数据框,同时使用 fill = TRUE 来处理列数不一致的问题。但我想把每个 csv 的列名绑定到一个数据框中,这样就可以看出它们之间有何不同。最高效的方法是什么?
# Create three example CSV files whose column counts differ.
write.csv(mtcars[, 1:5], "mtcars5.csv")
write.csv(mtcars[, 1:6], "mtcars6.csv")
write.csv(mtcars[, 1:4], "mtcars4.csv")

# Collect every CSV below the directory returned by here(). The dot is escaped
# so that only a literal ".csv" extension matches; an unescaped ".*csv$" would
# also pick up any file whose name merely ends in the letters "csv".
files_to_read <- list.files(
  path = here(),
  pattern = "\\.csv$",
  recursive = TRUE,
  full.names = TRUE
)

# Bind all the CSVs into one table, although only the headers are wanted.
cars <- rbindlist(lapply(files_to_read, fread), fill = TRUE)

# Alternatively, read them all into a list first -- but how can the column
# names be extracted from each element and bound into a data frame?
dflist <- lapply(files_to_read, fread)
预期输出如下:
df1 mpg cyl disp hp
df2 mpg cyl disp hp drat
df3 mpg cyl disp hp drat wt
试试这个:
# Read only the first line of every file as data (header = FALSE), so each
# file contributes a single row that holds its column names.
cars <- rbindlist(
  lapply(
    files_to_read,
    function(path) fread(path, nrows = 1, header = FALSE)
  ),
  fill = TRUE
)
# Overwrite the first column with the row names of `cars`.
cars[, 1] <- rownames(cars)
第一列是文件索引。
如果您希望第一列是文件名,请使用
# Replace the first column with the paths stored in files_to_read.
cars[, 1] <- files_to_read
这样会包含完整路径,可能不是您想要的。如果只想要不带路径的文件名,请在调用 list.files 时使用 full.names = FALSE,并将 cars[, 1] 设置为其结果:
# List the CSV files again with full.names = FALSE so that the directory
# given in `path` is not prepended to the names. The dot is escaped so that
# only a literal ".csv" extension matches.
cars[, 1] <- list.files(
  path = here(),
  pattern = "\\.csv$",
  recursive = TRUE,
  full.names = FALSE
)
或者您也可以用正则表达式把文件名从完整路径中提取(grep)出来,但如果您和我一样还没有掌握正则表达式,这似乎要费不少功夫。
使用purrr
#' List the column names of one CSV file in long format.
#'
#' Only the header and the first data row are parsed, because the column
#' names do not depend on the remaining rows.
#'
#' @param file_name Path of the CSV file, passed to `read.csv()`.
#' @return A tibble with one row per column of the file: `data` (the
#'   `file_name` that was given), `col_index` (position of the column) and
#'   `col_names` (the name as reported by `read.csv()`).
get_col_df <- function(file_name) {
  header_only <- read.csv(file_name, nrows = 1L)
  col_names <- names(header_only)
  # The tibble is the last expression (not an assignment) so that the result
  # is returned visibly.
  tibble::tibble(
    data = file_name,
    col_index = seq_along(col_names),
    col_names = col_names
  )
}
library(purrr)
# Apply get_col_df to every file and row-bind the per-file results.
col_names_df <- map_dfr(files_to_read, get_col_df)
这个输出
# A tibble: 18 x 3
data col_index col_names
<chr> <int> <chr>
1 mtcars4.csv 1 X
2 mtcars4.csv 2 mpg
3 mtcars4.csv 3 cyl
4 mtcars4.csv 4 disp
5 mtcars4.csv 5 hp
6 mtcars5.csv 1 X
7 mtcars5.csv 2 mpg
8 mtcars5.csv 3 cyl
9 mtcars5.csv 4 disp
10 mtcars5.csv 5 hp
11 mtcars5.csv 6 drat
12 mtcars6.csv 1 X
13 mtcars6.csv 2 mpg
14 mtcars6.csv 3 cyl
15 mtcars6.csv 4 disp
16 mtcars6.csv 5 hp
17 mtcars6.csv 6 drat
18 mtcars6.csv 7 wt
如果您想将它们排成一行以便于阅读
library(dplyr)
library(tidyr)
# Spread the long table: the values of col_index become the new column
# headers and the cells are filled from col_names.
pivot_wider(
  col_names_df,
  names_from = col_index,
  values_from = col_names
)
输出
# A tibble: 3 x 8
data `1` `2` `3` `4` `5` `6` `7`
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 mtcars4.csv X mpg cyl disp hp NA NA
2 mtcars5.csv X mpg cyl disp hp drat NA
3 mtcars6.csv X mpg cyl disp hp drat wt