如何动态更改数据框中列的数据类型
How to dynamically change data type of columns in data frame
我从中将数据导入 R 的平台不支持指定数据类型,因此我的所有列都是 character
。我有一个 Excel 文件,指定哪些列是 factor
,包括相关的 labels
和 levels
。现在,我正在尝试编写一个函数来动态更改 data.frame
中各个列的数据类型
感谢这个问题的出色回答(dplyr - mutate: use dynamic variable names),我设法编写了以下函数,其中我将列名动态设置为 mutate
函数。
# Read a factor specification from an Excel sheet and try to convert the
# matching columns of `d` to factors.
#
# filepath: path handed to read.xlsx(). The code below reads an "Item" column
#   (the target column name) and treats columns 3..ncol(t) as that item's levels.
# Note: `d` is not a parameter; it is looked up outside this function.
# NOTE(review): sapply() receives the single value nrow(t), not a sequence of
#   row indices, so the anonymous function is called once, with i = nrow(t).
# NOTE(review): the mutate() result is returned but never assigned back to `d`,
#   so conversions cannot accumulate into one data frame.
readFactorData <- function(filepath) {
t <- read.xlsx(filepath)
sapply(nrow(t), function(i) {
# Name of the column in `d` that row i of the specification describes.
colname <- as.character(t[i, "Item"])
# Keep only the non-NA cells from the third column onwards.
factorLevels <- t[i, 3:ncol(t)][which(!is.na(t[i, 3:ncol(t)]))]
totalLevels <- length(factorLevels)
listOfLabels <- as.character(unlist(factorLevels))
# The specification values are the factor levels; the labels are 1..totalLevels.
mutate(d, !!colname := factor(d[[colname]], labels=(1:totalLevels), levels=listOfLabels))
# requires dplyr v.0.7+
# the syntax `!!variablename:=` forces evaluation of the variablename before evaluating the rest of the function
})
}
这个函数可以运行:每次迭代都会返回整个数据框,其中相关列 (colname
) 已更改为因子。但是,每次迭代的结果都会覆盖前一次,因此该函数最终只返回最后一个 i
的结果。我如何确保最终得到一个数据框,其中所有相关列都已完成转换?
示例数据(确保注释掉上面函数的第一行,因为我们在这里定义 t
):
# Example data frame whose factor-like columns are still plain character.
d <- data.frame(
  id = sample(100:999, 10),
  age = sample(18:80, 10),
  factor1 = rep(c("a", "b", "c"), times = c(3, 3, 4)),
  factor2 = c("x", rep("y", 4), rep("x", 4), "y"),
  stringsAsFactors = FALSE
)
# Factor specification: one row per column to convert, levels from column 3 on.
t <- data.frame(
  Item = c("factor1", "factor2"),
  Label = c("This is factor 1", "This is factor 2"),
  level1 = c("a", "x"),
  level2 = c("b", "y"),
  level3 = c("c", "NA")
)
如果我理解正确的话,你有一个数据框,其中包含另一个数据框的因子列值。你想从第一个 df 中提取这些并在第二个 df 中改变这些列并将它们变成因子。
保留列名的向量,然后将它们全部改变怎么样?
# Names of the columns to convert, taken from the "Item" column of `t`.
colnames <- as.character(pull(t, Item))
# Convert each of those columns of `d` to a factor in one step.
d_with_factors <- mutate_at(d, colnames, as.factor)
然后
# Report the class of every column to confirm the conversion.
sapply(d_with_factors, class)
Returns
id age factor1 factor2
"integer" "integer" "factor" "factor"
如果您想将 data.frame
中的所有 character
列转换为 factor
,您可以使用 dplyr
的 mutate_if
。否则,如果您想使用列名向量,@Eden Z 的答案可以满足您的需求。
library(tidyverse)
# Turn every character column of `d` into a factor, then show the result.
d_out <- mutate_if(d, is.character, as.factor)
d_out
# id age factor1 factor2
#1 933 61 a x
#2 208 52 a y
#3 193 25 a y
#4 231 47 b y
#5 595 78 b y
#6 675 28 b x
#7 387 71 c x
#8 386 80 c x
#9 893 20 c x
#10 272 23 c y
然后您可以检查各个变量的 class:
# Report the class of every column of the converted data frame.
sapply(d_out, class)
# id age factor1 factor2
#"integer" "integer" "factor" "factor"
下面的函数会对您指定的每个列调用相应的 readr::parse_*
函数,并允许您为每个列单独指定参数(例如,使用 parse_factor
时的 levels
)。
library(tidyverse)
# Re-parse selected columns of a data frame with parser functions given by name.
#
# df:        data frame whose columns are re-parsed
# f:         character vector of parser function names, one per column
# col_names: character vector naming the columns to re-parse
# levels:    list with one entry per column, offered to each parser as `levels`
# ...:       further per-column argument lists offered to the parsers
# Returns `df` with the named columns replaced by their parsed versions.
parse_cols <- function(df, f, col_names, levels, ...){
  # One row per target column: the column's values plus its parser arguments.
  arg_table <- tibble(x = map(col_names, function(nm) df[[nm]]), levels = levels, ...)
  # Flip the table into one argument list per column.
  arg_sets <- transpose(arg_table)
  parsers <- structure(f, names = col_names)
  # .ignoreUnusedArgs = TRUE asks doCall() to ignore arguments a parser does not use.
  parsed <- map2_df(
    .x = parsers,
    .y = arg_sets,
    function(parser, parser_args) {
      R.utils::doCall(parser, args = parser_args, .ignoreUnusedArgs = TRUE)
    }
  )
  df[names(parsed)] <- parsed
  df
}
# function inputs -- perhaps just requiring a tibble input would be safer
# Example inputs: target column, parser name and factor levels line up by position.
cols_vec <- c("manufacturer", "cty", "class")
parsings_vec <- c("parse_factor", "parse_double", "parse_factor")
# `cty` is parsed with parse_double, so its levels entry is NULL.
factors_list <- list(
  unique(mpg[["manufacturer"]]),
  NULL,
  unique(mpg[["class"]])
)
parse_cols(mpg, f = parsings_vec, col_names = cols_vec, levels = factors_list)
#> # A tibble: 234 x 11
#> manufacturer model displ year cyl trans drv cty hwy fl cla~
#> <fct> <chr> <dbl> <int> <int> <chr> <chr> <dbl> <int> <chr> <fc>
#> 1 audi a4 1.8 1999 4 auto~ f 18 29 p com~
#> 2 audi a4 1.8 1999 4 manu~ f 21 29 p com~
#> 3 audi a4 2 2008 4 manu~ f 20 31 p com~
#> 4 audi a4 2 2008 4 auto~ f 21 30 p com~
#> 5 audi a4 2.8 1999 6 auto~ f 16 26 p com~
#> 6 audi a4 2.8 1999 6 manu~ f 18 26 p com~
#> 7 audi a4 3.1 2008 6 auto~ f 18 27 p com~
#> 8 audi a4 q~ 1.8 1999 4 manu~ 4 18 26 p com~
#> 9 audi a4 q~ 1.8 1999 4 auto~ 4 16 25 p com~
#> 10 audi a4 q~ 2 2008 4 manu~ 4 20 28 p com~
#> # ... with 224 more rows
我从中将数据导入 R 的平台不支持指定数据类型,因此我的所有列都是 character
。我有一个 Excel 文件,指定哪些列是 factor
,包括相关的 labels
和 levels
。现在,我正在尝试编写一个函数来动态更改 data.frame
中各个列的数据类型。
感谢这个问题的出色回答(dplyr - mutate: use dynamic variable names),我设法编写了以下函数,其中我将列名动态设置为 mutate
函数。
# Read a factor specification from an Excel sheet and try to convert the
# matching columns of `d` to factors.
#
# filepath: path handed to read.xlsx(). The code below reads an "Item" column
#   (the target column name) and treats columns 3..ncol(t) as that item's levels.
# Note: `d` is not a parameter; it is looked up outside this function.
# NOTE(review): sapply() receives the single value nrow(t), not a sequence of
#   row indices, so the anonymous function is called once, with i = nrow(t).
# NOTE(review): the mutate() result is returned but never assigned back to `d`,
#   so conversions cannot accumulate into one data frame.
readFactorData <- function(filepath) {
t <- read.xlsx(filepath)
sapply(nrow(t), function(i) {
# Name of the column in `d` that row i of the specification describes.
colname <- as.character(t[i, "Item"])
# Keep only the non-NA cells from the third column onwards.
factorLevels <- t[i, 3:ncol(t)][which(!is.na(t[i, 3:ncol(t)]))]
totalLevels <- length(factorLevels)
listOfLabels <- as.character(unlist(factorLevels))
# The specification values are the factor levels; the labels are 1..totalLevels.
mutate(d, !!colname := factor(d[[colname]], labels=(1:totalLevels), levels=listOfLabels))
# requires dplyr v.0.7+
# the syntax `!!variablename:=` forces evaluation of the variablename before evaluating the rest of the function
})
}
这个函数可以运行:每次迭代都会返回整个数据框,其中相关列 (colname
) 已更改为因子。但是,每次迭代的结果都会覆盖前一次,因此该函数最终只返回最后一个 i
的结果。我如何确保最终得到一个数据框,其中所有相关列都已完成转换?
示例数据(确保注释掉上面函数的第一行,因为我们在这里定义 t
):
# Example data frame whose factor-like columns are still plain character.
d <- data.frame(
  id = sample(100:999, 10),
  age = sample(18:80, 10),
  factor1 = rep(c("a", "b", "c"), times = c(3, 3, 4)),
  factor2 = c("x", rep("y", 4), rep("x", 4), "y"),
  stringsAsFactors = FALSE
)
# Factor specification: one row per column to convert, levels from column 3 on.
t <- data.frame(
  Item = c("factor1", "factor2"),
  Label = c("This is factor 1", "This is factor 2"),
  level1 = c("a", "x"),
  level2 = c("b", "y"),
  level3 = c("c", "NA")
)
如果我理解正确的话,你有一个数据框,其中包含另一个数据框的因子列值。你想从第一个 df 中提取这些并在第二个 df 中改变这些列并将它们变成因子。
保留列名的向量,然后将它们全部改变怎么样?
# Names of the columns to convert, taken from the "Item" column of `t`.
colnames <- as.character(pull(t, Item))
# Convert each of those columns of `d` to a factor in one step.
d_with_factors <- mutate_at(d, colnames, as.factor)
然后
# Report the class of every column to confirm the conversion.
sapply(d_with_factors, class)
Returns
id age factor1 factor2
"integer" "integer" "factor" "factor"
如果您想将 data.frame
中的所有 character
列转换为 factor
,您可以使用 dplyr
的 mutate_if
。否则,如果您想使用列名向量,@Eden Z 的答案可以满足您的需求。
library(tidyverse)
# Turn every character column of `d` into a factor, then show the result.
d_out <- mutate_if(d, is.character, as.factor)
d_out
# id age factor1 factor2
#1 933 61 a x
#2 208 52 a y
#3 193 25 a y
#4 231 47 b y
#5 595 78 b y
#6 675 28 b x
#7 387 71 c x
#8 386 80 c x
#9 893 20 c x
#10 272 23 c y
然后您可以检查各个变量的 class:
# Report the class of every column of the converted data frame.
sapply(d_out, class)
# id age factor1 factor2
#"integer" "integer" "factor" "factor"
下面的函数会对您指定的每个列调用相应的 readr::parse_*
函数,并允许您为每个列单独指定参数(例如,使用 parse_factor
时的 levels
)。
library(tidyverse)
# Re-parse selected columns of a data frame with parser functions given by name.
#
# df:        data frame whose columns are re-parsed
# f:         character vector of parser function names, one per column
# col_names: character vector naming the columns to re-parse
# levels:    list with one entry per column, offered to each parser as `levels`
# ...:       further per-column argument lists offered to the parsers
# Returns `df` with the named columns replaced by their parsed versions.
parse_cols <- function(df, f, col_names, levels, ...){
  # One row per target column: the column's values plus its parser arguments.
  arg_table <- tibble(x = map(col_names, function(nm) df[[nm]]), levels = levels, ...)
  # Flip the table into one argument list per column.
  arg_sets <- transpose(arg_table)
  parsers <- structure(f, names = col_names)
  # .ignoreUnusedArgs = TRUE asks doCall() to ignore arguments a parser does not use.
  parsed <- map2_df(
    .x = parsers,
    .y = arg_sets,
    function(parser, parser_args) {
      R.utils::doCall(parser, args = parser_args, .ignoreUnusedArgs = TRUE)
    }
  )
  df[names(parsed)] <- parsed
  df
}
# function inputs -- perhaps just requiring a tibble input would be safer
# Example inputs: target column, parser name and factor levels line up by position.
cols_vec <- c("manufacturer", "cty", "class")
parsings_vec <- c("parse_factor", "parse_double", "parse_factor")
# `cty` is parsed with parse_double, so its levels entry is NULL.
factors_list <- list(
  unique(mpg[["manufacturer"]]),
  NULL,
  unique(mpg[["class"]])
)
parse_cols(mpg, f = parsings_vec, col_names = cols_vec, levels = factors_list)
#> # A tibble: 234 x 11
#> manufacturer model displ year cyl trans drv cty hwy fl cla~
#> <fct> <chr> <dbl> <int> <int> <chr> <chr> <dbl> <int> <chr> <fc>
#> 1 audi a4 1.8 1999 4 auto~ f 18 29 p com~
#> 2 audi a4 1.8 1999 4 manu~ f 21 29 p com~
#> 3 audi a4 2 2008 4 manu~ f 20 31 p com~
#> 4 audi a4 2 2008 4 auto~ f 21 30 p com~
#> 5 audi a4 2.8 1999 6 auto~ f 16 26 p com~
#> 6 audi a4 2.8 1999 6 manu~ f 18 26 p com~
#> 7 audi a4 3.1 2008 6 auto~ f 18 27 p com~
#> 8 audi a4 q~ 1.8 1999 4 manu~ 4 18 26 p com~
#> 9 audi a4 q~ 1.8 1999 4 auto~ 4 16 25 p com~
#> 10 audi a4 q~ 2 2008 4 manu~ 4 20 28 p com~
#> # ... with 224 more rows