迭代读取、操作多个 excel 文件并使用 R 将它们附加到一个数据帧中
Iteratively read, manipulate multiple excel files and append them into one dataframe using R
在一个目录下,我有多个格式相似的 Excel 文件(您可以从此处下载示例文件):
我需要:
- 循环遍历这些文件并用 read_excel() 读取;
- 用第二列的列名创建一个新列 name;
- 将第一列和第二列分别重命名为 date 和 value,并删除最后一列(其原始列名为 1);
- 使用 do.call(rbind, df.list) 将所有数据框合并为一个数据框。
我做了什么:
循环获取文件路径:
library(fs)
# Directory that holds the Excel workbooks to combine.
folder_path <- "./test/"
# Escape the dot and anchor at the end so only paths ending in ".xlsx" match;
# the bare pattern ".xlsx" treats "." as "any character" and matches anywhere
# in the path.
file_paths <- dir_ls(folder_path, regexp = "\\.xlsx$")
读取 Excel 文件的函数:
# Read one Excel workbook, taking the column names from its first row.
#
# path: path to the workbook file.
# Returns the table produced by read_excel() for that file.
read_excel_file <- function(path) {
  # read_excel() has no `header` argument; `col_names = TRUE` is the readxl
  # spelling for "first row is the header". Returning the call directly also
  # makes the result visible instead of an invisible assignment value.
  read_excel(path = path, col_names = TRUE)
}
用 lapply 将 read_excel() 函数应用到每个 Excel 文件:
# Read every workbook without a header row, skipping its first two rows;
# the extra arguments are forwarded by lapply() to each read_excel() call.
df.list <- lapply(file_paths, read_excel, skip = 2, col_names = FALSE)
# Stack the per-file tables row-wise into a single data frame.
df <- do.call(what = rbind, args = df.list)
预期结果将是这样的数据框:
date value name
2 2021-01-07 -76.5 J05-J01
3 2021-01-08 -93.5 J05-J01
4 2021-01-15 -305 J05-J01
5 2021-01-22 289 J05-J01
6 2021-01-29 242.5 J05-J01
7 2021-02-05 266 J05-J01
8 2021-02-10 239.5 J05-J01
9 2021-02-19 305.5 J05-J01
10 2021-01-07 323 J01-J09
11 2021-01-08 317.5 J01-J09
12 2021-01-15 527.5 J01-J09
13 2021-01-22 -51 J01-J09
14 2021-01-29 -58.5 J01-J09
15 2021-02-05 -76 J01-J09
16 2021-01-07 76.5 J01-J05
17 2021-01-08 93.5 J01-J05
18 2021-01-15 305 J01-J05
19 2021-01-22 -289 J01-J05
20 2021-01-29 -242.5 J01-J05
21 2021-02-05 -266 J01-J05
22 2021-02-10 -239.5 J01-J05
我如何使用 R 实现这一点?提前致谢。
你可以试试:
library(fs)
library(readxl)
# List workbook names (basenames only) in ./test/. `pattern` is a regular
# expression, not a glob, so escape the dot and anchor at the end.
file_paths <- list.files("./test/", pattern = "\\.xlsx$")

# Read each workbook into its own table, then bind once at the end instead of
# growing `df` with rbind() on every iteration.
df_list <- lapply(file_paths, function(i) {
  df_temp <- read_xlsx(path = paste0("./test/", i))
  # Assumes the third column is named "1" (as described in the question);
  # overwrite it with the header of the second column so every row records
  # which series it came from.
  df_temp$`1` <- names(df_temp)[2]
  names(df_temp) <- c("date", "value", "name")
  df_temp
})

# With no matching files, keep the original result: an empty data frame.
df <- if (length(df_list) > 0) do.call(rbind, df_list) else data.frame()
输出:
> df
# A tibble: 21 x 3
date value name
<dttm> <dbl> <chr>
1 2021-01-07 00:00:00 76.5 J01-J05
2 2021-01-08 00:00:00 93.5 J01-J05
3 2021-01-15 00:00:00 305 J01-J05
4 2021-01-22 00:00:00 -289 J01-J05
5 2021-01-29 00:00:00 -242. J01-J05
6 2021-02-05 00:00:00 -266 J01-J05
7 2021-02-10 00:00:00 -240. J01-J05
8 2021-01-07 00:00:00 323 J01-J09
9 2021-01-08 00:00:00 318. J01-J09
10 2021-01-15 00:00:00 528. J01-J09
# ... with 11 more rows
更新,函数:
# Read one workbook from ./test/ and relabel it as date / value / name.
#
# name: file name of the workbook inside ./test/.
# Returns the table with its columns renamed to date, value and name, where
# the column originally called `1` now holds the second column's header.
# NOTE(review): this name shadows readxl::read_excel() while readxl is
# attached; the name is kept so existing calls keep working.
read_excel <- function(name) {
  sheet <- read_xlsx(path = paste0("./test/", name))
  series_label <- names(sheet)[2]
  sheet$`1` <- series_label
  names(sheet) <- c("date", "value", "name")
  sheet
}
# Apply the reader to every file name and stack the results row-wise.
df <- do.call(what = rbind, args = lapply(X = file_paths, FUN = read_excel))
library(dplyr)
library(readxl)
# Only pick up Excel workbooks; a bare list.files() also returns scripts,
# directories and any other entries in the working directory, which
# read_xlsx() cannot open.
files <- list.files(pattern = "\\.xlsx$")
# Read each workbook, keep its first two columns as date / value, tag every
# row with the second column's original header, and stack all tables.
combined <- bind_rows(
  lapply(
    files,
    function(f) {
      df <- read_xlsx(f)
      df %>%
        select(date = 1, value = 2) %>%
        mutate(name = colnames(df)[2])
    }
  )
)
@ah bon 的替代方案:
# Read one workbook and reshape it to the date / value / name layout.
#
# file: path to an .xlsx workbook.
# Returns a table with the first column as `date`, the second as `value`,
# and a `name` column holding the second column's original header.
read_file <- function(file) {
  df <- read_xlsx(file)
  df %>%
    # Name the second column `value` (not `price`) so the result matches the
    # requested date / value / name layout.
    select(date = 1, value = 2) %>%
    mutate(name = colnames(df)[2])
}
# Run read_file() over every entry in `files` and stack the resulting tables.
df <- files %>%
  lapply(read_file) %>%
  bind_rows()
# Base R equivalent: `df <- do.call(rbind, lapply(files, read_file))`
在一个目录下,我有多个格式相似的 Excel 文件(您可以从此处下载示例文件):
我需要:
- 循环遍历这些文件并用 read_excel() 读取;
- 用第二列的列名创建一个新列 name;
- 将第一列和第二列分别重命名为 date 和 value,并删除最后一列(其原始列名为 1);
- 使用 do.call(rbind, df.list) 将所有数据框合并为一个数据框。
我做了什么:
循环获取文件路径:
library(fs)
# Directory that holds the Excel workbooks to combine.
folder_path <- "./test/"
# Escape the dot and anchor at the end so only paths ending in ".xlsx" match;
# the bare pattern ".xlsx" treats "." as "any character" and matches anywhere
# in the path.
file_paths <- dir_ls(folder_path, regexp = "\\.xlsx$")
读取 Excel 文件的函数:
# Read one Excel workbook, taking the column names from its first row.
#
# path: path to the workbook file.
# Returns the table produced by read_excel() for that file.
read_excel_file <- function(path) {
  # read_excel() has no `header` argument; `col_names = TRUE` is the readxl
  # spelling for "first row is the header". Returning the call directly also
  # makes the result visible instead of an invisible assignment value.
  read_excel(path = path, col_names = TRUE)
}
用 lapply 将 read_excel() 函数应用到每个 Excel 文件:
# Read every workbook without a header row, skipping its first two rows;
# the extra arguments are forwarded by lapply() to each read_excel() call.
df.list <- lapply(file_paths, read_excel, skip = 2, col_names = FALSE)
# Stack the per-file tables row-wise into a single data frame.
df <- do.call(what = rbind, args = df.list)
预期结果将是这样的数据框:
date value name
2 2021-01-07 -76.5 J05-J01
3 2021-01-08 -93.5 J05-J01
4 2021-01-15 -305 J05-J01
5 2021-01-22 289 J05-J01
6 2021-01-29 242.5 J05-J01
7 2021-02-05 266 J05-J01
8 2021-02-10 239.5 J05-J01
9 2021-02-19 305.5 J05-J01
10 2021-01-07 323 J01-J09
11 2021-01-08 317.5 J01-J09
12 2021-01-15 527.5 J01-J09
13 2021-01-22 -51 J01-J09
14 2021-01-29 -58.5 J01-J09
15 2021-02-05 -76 J01-J09
16 2021-01-07 76.5 J01-J05
17 2021-01-08 93.5 J01-J05
18 2021-01-15 305 J01-J05
19 2021-01-22 -289 J01-J05
20 2021-01-29 -242.5 J01-J05
21 2021-02-05 -266 J01-J05
22 2021-02-10 -239.5 J01-J05
我如何使用 R 实现这一点?提前致谢。
你可以试试:
library(fs)
library(readxl)
# List workbook names (basenames only) in ./test/. `pattern` is a regular
# expression, not a glob, so escape the dot and anchor at the end.
file_paths <- list.files("./test/", pattern = "\\.xlsx$")

# Read each workbook into its own table, then bind once at the end instead of
# growing `df` with rbind() on every iteration.
df_list <- lapply(file_paths, function(i) {
  df_temp <- read_xlsx(path = paste0("./test/", i))
  # Assumes the third column is named "1" (as described in the question);
  # overwrite it with the header of the second column so every row records
  # which series it came from.
  df_temp$`1` <- names(df_temp)[2]
  names(df_temp) <- c("date", "value", "name")
  df_temp
})

# With no matching files, keep the original result: an empty data frame.
df <- if (length(df_list) > 0) do.call(rbind, df_list) else data.frame()
输出:
> df
# A tibble: 21 x 3
date value name
<dttm> <dbl> <chr>
1 2021-01-07 00:00:00 76.5 J01-J05
2 2021-01-08 00:00:00 93.5 J01-J05
3 2021-01-15 00:00:00 305 J01-J05
4 2021-01-22 00:00:00 -289 J01-J05
5 2021-01-29 00:00:00 -242. J01-J05
6 2021-02-05 00:00:00 -266 J01-J05
7 2021-02-10 00:00:00 -240. J01-J05
8 2021-01-07 00:00:00 323 J01-J09
9 2021-01-08 00:00:00 318. J01-J09
10 2021-01-15 00:00:00 528. J01-J09
# ... with 11 more rows
更新,函数:
# Read one workbook from ./test/ and relabel it as date / value / name.
#
# name: file name of the workbook inside ./test/.
# Returns the table with its columns renamed to date, value and name, where
# the column originally called `1` now holds the second column's header.
# NOTE(review): this name shadows readxl::read_excel() while readxl is
# attached; the name is kept so existing calls keep working.
read_excel <- function(name) {
  sheet <- read_xlsx(path = paste0("./test/", name))
  series_label <- names(sheet)[2]
  sheet$`1` <- series_label
  names(sheet) <- c("date", "value", "name")
  sheet
}
# Apply the reader to every file name and stack the results row-wise.
df <- do.call(what = rbind, args = lapply(X = file_paths, FUN = read_excel))
library(dplyr)
library(readxl)
# Only pick up Excel workbooks; a bare list.files() also returns scripts,
# directories and any other entries in the working directory, which
# read_xlsx() cannot open.
files <- list.files(pattern = "\\.xlsx$")
# Read each workbook, keep its first two columns as date / value, tag every
# row with the second column's original header, and stack all tables.
combined <- bind_rows(
  lapply(
    files,
    function(f) {
      df <- read_xlsx(f)
      df %>%
        select(date = 1, value = 2) %>%
        mutate(name = colnames(df)[2])
    }
  )
)
@ah bon 的替代方案:
# Read one workbook and reshape it to the date / value / name layout.
#
# file: path to an .xlsx workbook.
# Returns a table with the first column as `date`, the second as `value`,
# and a `name` column holding the second column's original header.
read_file <- function(file) {
  df <- read_xlsx(file)
  df %>%
    # Name the second column `value` (not `price`) so the result matches the
    # requested date / value / name layout.
    select(date = 1, value = 2) %>%
    mutate(name = colnames(df)[2])
}
# Run read_file() over every entry in `files` and stack the resulting tables.
df <- files %>%
  lapply(read_file) %>%
  bind_rows()
# Base R equivalent: `df <- do.call(rbind, lapply(files, read_file))`