在 R 中是否有一种好的、通用的方法将 semi-structured 数据转换为 tibble/dataframe?
Is there a good, general approach to convert semi-structured data to tibble/dataframe in R?
我是 R 编程的新手,到目前为止,我的大部分经验都是使用来自 .csv 或 .xlsx 的高度结构化矩形数据。但现在我收到了大约 30 个预算数据电子表格,如下所示:
并且为了便于处理,我想把它们转换成更规整的格式(这还不算完全 tidy,因为 Q1 到 Q4 本可以/本应该合并为同一个变量——不过这一点我之后可以用 pivot_longer 修复),像这样:
搜索 SO,我找到的最接近的问题/解决方案是这样的:,但该示例包含的是一系列结构化表格,不需要我所要做的那些修改;另外,它是把文本文件转换为字符向量,而我手上的是包含多个工作表的 Excel 工作簿(我只需要其中一张工作表)。
这是我到目前为止尝试过的方法:
# First attempt: read one worksheet as-is, then tag every class-code row with
# the budget type ("Baseline" / "Scope Changes") of the section it belongs to.
library(tidyverse)
# read_xlsx() is exported by readxl, which is installed with the tidyverse but
# is not attached by library(tidyverse); the xlsx package does not provide it.
library(readxl)

# Straight read in the worksheet as-is
df <- read_xlsx(path = "filename.xlsx", sheet = "worksheet", col_names = FALSE)

# Get the location name into its own column, then delete row 1 since it's not
# needed
df <- df %>%
  mutate(location = df[[1, 1]])
df <- df[-1, ]

# Add a column and initialize it to "empty"
df <- df %>%
  add_column(budget_type = "empty")

# Now loop through the dataframe in Column 1, search for the keyword(s) and
# place them in the last "budget_type" column.
# Start from the same "empty" placeholder as the column so a class-code row
# that appears before any section header does not hit an undefined variable.
budget_type <- "empty"
# seq_len() yields an empty sequence for a zero-row sheet, unlike 1:nrow(df).
for (row in seq_len(nrow(df))) {
  cell <- df[[row, 1]]
  print(cell)  # debugging output: echo every cell of column 1
  if (is.na(cell)) {
    next
  }
  if (cell %in% c("Baseline", "Scope Changes")) {
    # Section header: remember it for the class-code rows that follow.
    budget_type <- cell
  } else if (str_detect(cell, "[0-9]{4}")) {
    # Class-code row (contains four consecutive digits): label it.
    df[[row, "budget_type"]] <- budget_type
  }
}
# ...and from here I could write another loop going from bottom to top seeking
# the categories and placing them in another created column, and finally delete the rows
# that are empty, total rows, or unnecessary header rows.
我的问题是:在 R 中是否有明显更好的方法来代替循环和我描述的以整洁格式获取数据的一般方法?
提前致谢。
编辑 2021 年 6 月 7 日:
看来我无法附加 Excel 文件,但如果我正确地遵循了“最小可重现示例”指南,下面是读入数据后由 dput() 输出的未处理数据:
structure(list(...1 = c("Mehoopany", NA, "CLASS CODE", "Baseline",
NA, "0201", "0300", "0301", NA, NA, "5500", "8245", "8260", NA,
NA, "5710", "8224", "8235", NA, NA, NA, NA, "CLASS CODE", "Scope Changes",
NA, "0201", "0300", "0301", NA, NA, "5500", "8245", "8260", NA,
NA, "5710", "8224", "8235", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), ...2 = c(NA, NA, "Classification", NA, NA, "Cleaning, Spills, and Trash / Recycle Bin Pickup",
"专业清洁", "Window 清洁", "清洁", NA, "Gen Office Exp",
"健康/健身中心", "会议室", "员工便利设施",
NA, "接待处", "摄影服务", "收发室服务",
"收发室和接待处", NA, "Total", NA, "Classification", NA,
NA, "Cleaning, Spills, and Trash / Recycle Bin Pickup", "专业清洁",
"Window 清洁", "清洁", NA, "Gen Office Exp", "健康/健身中心",
"会议室", "员工便利设施", NA, "接待处", "摄影服务",
"收发室服务", "收发室和接待处", NA, "Total", NA,
NA, NA, NA, NA, NA, NA, NA, NA), ...3 = c(NA, NA, "2021 财年阶段化",
NA, "Q1", "1205", "0", "0", "1205", NA, "0", "0", "174", "174",
NA, "0", "0", "1453.625", "1453.625", NA, "2832.625", NA, "2021 财年阶段化",
NA, "Q1", "25", "0", "0", "25", NA, "0", "0", "37", "37", NA,
"0", "17", "0", "17", NA, "79", NA, NA, NA, NA, NA, NA, NA, NA,
NA), ...4 = c(NA, NA, NA, NA, "Q2", "1205", "0", "0", "1205",
NA, "0", "0", "174", "174", NA, "0", "0", "1453.625", "1453.625",
NA, "2832.625", NA, NA, NA, "Q2", "25", "0", "0", "25", NA, "0",
"0", "37", "37", NA, "0", "17", "0", "17", NA, "79", NA, NA,
NA, NA, NA, NA, NA, NA, NA), ...5 = c(NA, NA, NA, NA, "Q3", "1205",
"0", "0", "1205", NA, "0", "0", "174", "174", NA, "0", "0", "1453.625",
"1453.625", NA, "2832.625", NA, NA, NA, "Q3", "25", "0", "0",
"25", NA, "0", "0", "37", "37", NA, "0", "17", "0", "17", NA,
"79", NA, NA, NA, NA, NA, NA, NA, NA, NA), ...6 = c(NA, NA, NA,
NA, "Q4", "1205", "0", "0", "1205", NA, "0", "0", "174", "174",
NA, "0", "0", "1453.625", "1453.625", NA, "2832.625", NA, NA,
NA, "Q4", "25", "0", "0", "25", NA, "0", "0", "37", "37", NA,
"0", "17", "0", "17", NA, "79", NA, NA, NA, NA, NA, NA, NA, NA,
NA), ...7 = c(NA, NA, NA, NA, "Total", "4820", "0", "0", "4820",
NA, "0", "0", "696", "696", NA, "0", "0", "5814.5", "5814.5",
NA, "11330.5", NA, NA, NA, "Total", "100", "0", "0", "100", NA,
"0", "0", "148", "148", NA, "0", "68", "0", "68", NA, "316",
NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
这是我使用的脚本 - 它有效 - 带有解释性注释:
# Reshape one semi-structured budget worksheet into a rectangular tibble with
# one row per class code, labelled with its location, budget type and roll-up
# category.
library(tidyverse)
# read_xlsx() is exported by readxl, which is installed with the tidyverse but
# is not attached by library(tidyverse); the xlsx package does not provide it.
library(readxl)

file <- "C:/Path/To/Book1.xlsx"
names <- c("class_code", "classification", "Q1", "Q2", "Q3", "Q4",
           "Total", "Location", "Budget_Type", "Category")

# Read in the file, setting range to restrict columns ingested as some
# scrap work exists in some files beyond column G; row 50 is well beyond
# expected data range.
dframe <- read_xlsx(path = file, range = "'Original Data'!A1:G50",
                    col_names = FALSE)

# Output above data so it can be included in Stack Overflow as a 'minimal
# reproducible example'.
dput(dframe)

# Move the location name ('Mehoopany') to its own column, then delete the row.
dframe <- dframe %>%
  mutate(location = dframe[[1, 1]])
dframe <- dframe[-1, ]

# Add/define two additional columns which will be used in loops below.
dframe <- dframe %>%
  add_column(budget_type = "empty", category = "empty")

# Loop 1: Move *DOWN* the data set, labeling every line with 'CLASS CODE' as
# being either "Baseline" or "Scope Changes" in 'budget_type' column.
# The tracker starts at the same "empty" placeholder as the column so a code
# row that precedes any section header does not hit an undefined variable.
budget_type <- "empty"
# seq_len() yields an empty sequence for a zero-row sheet, unlike 1:nrow().
for (row in seq_len(nrow(dframe))) {
  code <- dframe[[row, 1]]
  if (is.na(code)) {
    next
  }
  if (code %in% c("Baseline", "Scope Changes")) {
    # Section header: remember it for the class-code rows that follow.
    budget_type <- code
  } else if (str_detect(code, "[0-9]{4}")) {
    # Class-code row (contains four consecutive digits): label it.
    dframe[[row, "budget_type"]] <- budget_type
  }
}

# Loop 2: Move *UP* the data set, labeling every line with 'CLASS CODE' with
# its respective roll-up category, and otherwise delete the line.
# Iterating bottom-up means deleting a row never shifts the index of a row
# that has not been visited yet.
category <- "empty"
for (row in rev(seq_len(nrow(dframe)))) {
  code <- dframe[[row, 1]]
  label <- dframe[[row, 2]]
  if (is.na(label) || label %in% c("Total", "Classification")) {
    # Delete rows where the 2nd column is <blank>, 'Classification', or
    # 'Total'. is.na() is tested first so no comparison is made against NA.
    dframe <- dframe[-row, ]
  } else if (is.na(code)) {
    # Row has no 'CLASS CODE' but has a value in the 2nd column: store that
    # value as the current category, then delete the row entirely.
    category <- label
    dframe <- dframe[-row, ]
  } else if (str_detect(code, "[:digit:]{4}")) {
    # Row has a 'CLASS CODE': label the category column with the stored value.
    dframe[[row, "category"]] <- category
  }
}

# Assign the names from the character vector set at the beginning.
names(dframe) <- names

# Print out the resulting dataframe.
dframe
我是 R 编程的新手,到目前为止,我的大部分经验都是使用来自 .csv 或 .xlsx 的高度结构化矩形数据。但现在我收到了大约 30 个预算数据电子表格,如下所示:
并且为了便于处理,我想把它们转换成更规整的格式(这还不算完全 tidy,因为 Q1 到 Q4 本可以/本应该合并为同一个变量——不过这一点我之后可以用 pivot_longer 修复),像这样:
搜索 SO,我找到的最接近的 problem/solution 是这样的:
这是我到目前为止尝试过的方法:
# First attempt: read one worksheet as-is, then tag every class-code row with
# the budget type ("Baseline" / "Scope Changes") of the section it belongs to.
library(tidyverse)
# read_xlsx() is exported by readxl, which is installed with the tidyverse but
# is not attached by library(tidyverse); the xlsx package does not provide it.
library(readxl)

# Straight read in the worksheet as-is
df <- read_xlsx(path = "filename.xlsx", sheet = "worksheet", col_names = FALSE)

# Get the location name into its own column, then delete row 1 since it's not
# needed
df <- df %>%
  mutate(location = df[[1, 1]])
df <- df[-1, ]

# Add a column and initialize it to "empty"
df <- df %>%
  add_column(budget_type = "empty")

# Now loop through the dataframe in Column 1, search for the keyword(s) and
# place them in the last "budget_type" column.
# Start from the same "empty" placeholder as the column so a class-code row
# that appears before any section header does not hit an undefined variable.
budget_type <- "empty"
# seq_len() yields an empty sequence for a zero-row sheet, unlike 1:nrow(df).
for (row in seq_len(nrow(df))) {
  cell <- df[[row, 1]]
  print(cell)  # debugging output: echo every cell of column 1
  if (is.na(cell)) {
    next
  }
  if (cell %in% c("Baseline", "Scope Changes")) {
    # Section header: remember it for the class-code rows that follow.
    budget_type <- cell
  } else if (str_detect(cell, "[0-9]{4}")) {
    # Class-code row (contains four consecutive digits): label it.
    df[[row, "budget_type"]] <- budget_type
  }
}
# ...and from here I could write another loop going from bottom to top seeking
# the categories and placing them in another created column, and finally delete the rows
# that are empty, total rows, or unnecessary header rows.
我的问题是:在 R 中是否有明显更好的方法来代替循环和我描述的以整洁格式获取数据的一般方法?
提前致谢。
编辑 2021 年 6 月 7 日:
看来我无法附加 Excel 文件,但如果我正确地遵循了“最小可重现示例”指南,下面是读入数据后由 dput() 输出的未处理数据:
structure(list(...1 = c("Mehoopany", NA, "CLASS CODE", "Baseline",
NA, "0201", "0300", "0301", NA, NA, "5500", "8245", "8260", NA,
NA, "5710", "8224", "8235", NA, NA, NA, NA, "CLASS CODE", "Scope Changes",
NA, "0201", "0300", "0301", NA, NA, "5500", "8245", "8260", NA,
NA, "5710", "8224", "8235", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), ...2 = c(NA, NA, "Classification", NA, NA, "Cleaning, Spills, and Trash / Recycle Bin Pickup",
"专业清洁", "Window 清洁", "清洁", NA, "Gen Office Exp",
"健康/健身中心", "会议室", "员工便利设施",
NA, "接待处", "摄影服务", "收发室服务",
"收发室和接待处", NA, "Total", NA, "Classification", NA,
NA, "Cleaning, Spills, and Trash / Recycle Bin Pickup", "专业清洁",
"Window 清洁", "清洁", NA, "Gen Office Exp", "健康/健身中心",
"会议室", "员工便利设施", NA, "接待处", "摄影服务",
"收发室服务", "收发室和接待处", NA, "Total", NA,
NA, NA, NA, NA, NA, NA, NA, NA), ...3 = c(NA, NA, "2021 财年阶段化",
NA, "Q1", "1205", "0", "0", "1205", NA, "0", "0", "174", "174",
NA, "0", "0", "1453.625", "1453.625", NA, "2832.625", NA, "2021 财年阶段化",
NA, "Q1", "25", "0", "0", "25", NA, "0", "0", "37", "37", NA,
"0", "17", "0", "17", NA, "79", NA, NA, NA, NA, NA, NA, NA, NA,
NA), ...4 = c(NA, NA, NA, NA, "Q2", "1205", "0", "0", "1205",
NA, "0", "0", "174", "174", NA, "0", "0", "1453.625", "1453.625",
NA, "2832.625", NA, NA, NA, "Q2", "25", "0", "0", "25", NA, "0",
"0", "37", "37", NA, "0", "17", "0", "17", NA, "79", NA, NA,
NA, NA, NA, NA, NA, NA, NA), ...5 = c(NA, NA, NA, NA, "Q3", "1205",
"0", "0", "1205", NA, "0", "0", "174", "174", NA, "0", "0", "1453.625",
"1453.625", NA, "2832.625", NA, NA, NA, "Q3", "25", "0", "0",
"25", NA, "0", "0", "37", "37", NA, "0", "17", "0", "17", NA,
"79", NA, NA, NA, NA, NA, NA, NA, NA, NA), ...6 = c(NA, NA, NA,
NA, "Q4", "1205", "0", "0", "1205", NA, "0", "0", "174", "174",
NA, "0", "0", "1453.625", "1453.625", NA, "2832.625", NA, NA,
NA, "Q4", "25", "0", "0", "25", NA, "0", "0", "37", "37", NA,
"0", "17", "0", "17", NA, "79", NA, NA, NA, NA, NA, NA, NA, NA,
NA), ...7 = c(NA, NA, NA, NA, "Total", "4820", "0", "0", "4820",
NA, "0", "0", "696", "696", NA, "0", "0", "5814.5", "5814.5",
NA, "11330.5", NA, NA, NA, "Total", "100", "0", "0", "100", NA,
"0", "0", "148", "148", NA, "0", "68", "0", "68", NA, "316",
NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
这是我使用的脚本 - 它有效 - 带有解释性注释:
# Reshape one semi-structured budget worksheet into a rectangular tibble with
# one row per class code, labelled with its location, budget type and roll-up
# category.
library(tidyverse)
# read_xlsx() is exported by readxl, which is installed with the tidyverse but
# is not attached by library(tidyverse); the xlsx package does not provide it.
library(readxl)

file <- "C:/Path/To/Book1.xlsx"
names <- c("class_code", "classification", "Q1", "Q2", "Q3", "Q4",
           "Total", "Location", "Budget_Type", "Category")

# Read in the file, setting range to restrict columns ingested as some
# scrap work exists in some files beyond column G; row 50 is well beyond
# expected data range.
dframe <- read_xlsx(path = file, range = "'Original Data'!A1:G50",
                    col_names = FALSE)

# Output above data so it can be included in Stack Overflow as a 'minimal
# reproducible example'.
dput(dframe)

# Move the location name ('Mehoopany') to its own column, then delete the row.
dframe <- dframe %>%
  mutate(location = dframe[[1, 1]])
dframe <- dframe[-1, ]

# Add/define two additional columns which will be used in loops below.
dframe <- dframe %>%
  add_column(budget_type = "empty", category = "empty")

# Loop 1: Move *DOWN* the data set, labeling every line with 'CLASS CODE' as
# being either "Baseline" or "Scope Changes" in 'budget_type' column.
# The tracker starts at the same "empty" placeholder as the column so a code
# row that precedes any section header does not hit an undefined variable.
budget_type <- "empty"
# seq_len() yields an empty sequence for a zero-row sheet, unlike 1:nrow().
for (row in seq_len(nrow(dframe))) {
  code <- dframe[[row, 1]]
  if (is.na(code)) {
    next
  }
  if (code %in% c("Baseline", "Scope Changes")) {
    # Section header: remember it for the class-code rows that follow.
    budget_type <- code
  } else if (str_detect(code, "[0-9]{4}")) {
    # Class-code row (contains four consecutive digits): label it.
    dframe[[row, "budget_type"]] <- budget_type
  }
}

# Loop 2: Move *UP* the data set, labeling every line with 'CLASS CODE' with
# its respective roll-up category, and otherwise delete the line.
# Iterating bottom-up means deleting a row never shifts the index of a row
# that has not been visited yet.
category <- "empty"
for (row in rev(seq_len(nrow(dframe)))) {
  code <- dframe[[row, 1]]
  label <- dframe[[row, 2]]
  if (is.na(label) || label %in% c("Total", "Classification")) {
    # Delete rows where the 2nd column is <blank>, 'Classification', or
    # 'Total'. is.na() is tested first so no comparison is made against NA.
    dframe <- dframe[-row, ]
  } else if (is.na(code)) {
    # Row has no 'CLASS CODE' but has a value in the 2nd column: store that
    # value as the current category, then delete the row entirely.
    category <- label
    dframe <- dframe[-row, ]
  } else if (str_detect(code, "[:digit:]{4}")) {
    # Row has a 'CLASS CODE': label the category column with the stored value.
    dframe[[row, "category"]] <- category
  }
}

# Assign the names from the character vector set at the beginning.
names(dframe) <- names

# Print out the resulting dataframe.
dframe