将 readr col_spec 应用到 data.frame,独立于从文件读取
Apply readr col_spec to data.frame, independently of reading from file
我有一个 tibble(`data.frame`),需要对其中的一些列进行类型更新。我有一个描述所需类型的 `readr::col_spec` 对象,但由于数据并非来自 csv 文件,我无法使用 `read_csv(..., col_types=cspec)` 将这些更改应用于指定的列。
由于 `col_spec` 是一种专门用于指定所需数据类型的数据结构,我仍然希望直接把它作为输入传给执行类型转换的函数,而不是编写冗长的自定义脚本逐列处理。请参阅以下示例:
library(tidyverse)

# Build `sw`: a four-row slice of starwars standing in for the real input data
sw <- starwars %>%
  select(name, height, ends_with("_color")) %>%
  slice(c(1, 4, 5, 19))
sw
#> # A tibble: 4 × 5
#> name height hair_color skin_color eye_color
#> <chr> <int> <chr> <chr> <chr>
#> 1 Luke Skywalker 172 blond fair blue
#> 2 Darth Vader 202 none white yellow
#> 3 Leia Organa 150 brown light brown
#> 4 Yoda 66 white green brown

# The column specification that should be applied to `sw`
cspec <- cols(
  hair_color = col_factor(c("brown", "blond", "white", "none")),
  skin_color = col_factor(c("green", "light", "fair", "white")),
  eye_color = col_factor(c("blue", "brown", "yellow"))
)

# Goal: apply `cspec` to `sw` directly.
# Clumsy workaround: round-trip the data through a temporary CSV file
tf <- tempfile()
write_csv(sw, tf)
sw_fct <- read_csv(tf, col_types = cspec)

# Roughly the desired result, except that type information for the other
# columns is lost in the round trip (height comes back as <dbl>, not <int>)
sw_fct
#> # A tibble: 4 × 5
#> name height hair_color skin_color eye_color
#> <chr> <dbl> <fct> <fct> <fct>
#> 1 Luke Skywalker 172 blond fair blue
#> 2 Darth Vader 202 none white yellow
#> 3 Leia Organa 150 brown light brown
#> 4 Yoda 66 white green brown
我们可以通过遍历 `cols` 并从对象中提取相应的元素来做到这一点:
library(readr)
library(purrr)

# For every collector in the spec, re-parse the column of `sw` with the same
# name as a factor, using the settings stored in that collector
sw[names(cspec$cols)] <- imap(cspec$cols, function(collector, col_name) {
  parse_factor(
    sw[[col_name]],
    levels = collector$levels,
    ordered = collector$ordered,
    include_na = collector$include_na
  )
})
检查输出:
> sw
# A tibble: 4 × 5
name height hair_color skin_color eye_color
<chr> <int> <fct> <fct> <fct>
1 Luke Skywalker 172 blond fair blue
2 Darth Vader 202 none white yellow
3 Leia Organa 150 brown light brown
4 Yoda 66 white green brown
> str(sw)
tibble [4 × 5] (S3: tbl_df/tbl/data.frame)
$ name : chr [1:4] "Luke Skywalker" "Darth Vader" "Leia Organa" "Yoda"
$ height : int [1:4] 172 202 150 66
$ hair_color: Factor w/ 4 levels "brown","blond",..: 2 4 1 3
$ skin_color: Factor w/ 4 levels "green","light",..: 3 4 2 1
$ eye_color : Factor w/ 3 levels "blue","brown",..: 1 3 2 2
如果还需要设置 `spec` 属性(attribute),请执行如下赋值:
# Store the column specification in the "spec" attribute of `sw`
attr(sw, "spec") <- cspec
再次检查 `str()` 的输出:
> str(sw)
tibble [4 × 5] (S3: tbl_df/tbl/data.frame)
$ name : chr [1:4] "Luke Skywalker" "Darth Vader" "Leia Organa" "Yoda"
$ height : int [1:4] 172 202 150 66
$ hair_color: Factor w/ 4 levels "brown","blond",..: 2 4 1 3
$ skin_color: Factor w/ 4 levels "green","light",..: 3 4 2 1
$ eye_color : Factor w/ 3 levels "blue","brown",..: 1 3 2 2
- attr(*, "spec")=
.. cols(
.. hair_color = col_factor(levels = c("brown", "blond", "white", "none"), ordered = FALSE, include_na = FALSE),
.. skin_color = col_factor(levels = c("green", "light", "fair", "white"), ordered = FALSE, include_na = FALSE),
.. eye_color = col_factor(levels = c("blue", "brown", "yellow"), ordered = FALSE, include_na = FALSE)
.. )
这个答案将来自@akrun 的解决方案包装成一个函数,供那些不太熟悉 purrr 的人使用。
#' Apply a readr column specification to an in-memory data frame
#'
#' Re-parses the columns named in `cspec` as factors with
#' `readr::parse_factor()`, using the levels / ordered / include_na settings
#' stored in each collector. Columns not named in `cspec` are left untouched.
#'
#' @param d A data.frame (or tibble) containing every column named in `cspec`.
#' @param cspec A readr `col_spec`; all of its column collectors must be
#'   factor collectors (class "collector_factor").
#' @param set_spec_attribute A single `TRUE`/`FALSE`. If `TRUE`, `cspec` is
#'   stored in the "spec" attribute of the result, for consistency with readr.
#' @return `d` with the columns named in `cspec` replaced by factors.
apply_col_spec <- function(d, cspec, set_spec_attribute = FALSE) {
  # A bit of input checking; the flag must be a usable scalar so that the
  # `if` further down cannot fail on NA or on a vector of length > 1
  flag_ok <- is.logical(set_spec_attribute) &&
    length(set_spec_attribute) == 1L &&
    !is.na(set_spec_attribute)
  if (!(inherits(d, "data.frame") && inherits(cspec, "col_spec") && flag_ok)) {
    stop("apply_col_spec(): wrong input types")
  }

  collectors <- cspec$cols
  # vapply() always yields a logical vector, even for an empty spec
  is_factor_spec <- vapply(
    collectors, inherits, logical(1),
    what = "collector_factor"
  )
  if (!all(is_factor_spec)) {
    stop("apply_col_spec(): only implemented for factor columns")
  }

  # Fail early, naming the offending columns, instead of letting the parser
  # choke on a column that does not exist in the data
  target_cols <- names(collectors)
  missing_cols <- setdiff(target_cols, names(d))
  if (length(missing_cols) > 0) {
    stop(
      "apply_col_spec(): columns not found in data: ",
      paste(missing_cols, collapse = ", ")
    )
  }

  # Do the actual application of the col_spec (nothing to do if it is empty)
  if (length(collectors) > 0) {
    d[target_cols] <- purrr::imap(collectors, function(collector, col_name) {
      readr::parse_factor(
        d[[col_name]],
        levels = collector$levels,
        ordered = collector$ordered,
        include_na = collector$include_na
      )
    })
  }

  # If requested, set col_spec as an attribute, for consistency with readr
  if (set_spec_attribute) {
    attr(d, "spec") <- cspec
  }
  d
}
使用问题中定义的变量运行该函数,可得到预期结果:
> apply_col_spec(sw, cspec)
# A tibble: 4 × 5
name height hair_color skin_color eye_color
<chr> <int> <fct> <fct> <fct>
1 Luke Skywalker 172 blond fair blue
2 Darth Vader 202 none white yellow
3 Leia Organa 150 brown light brown
4 Yoda 66 white green brown
我有一个 tibble(`data.frame`),需要对其中的一些列进行类型更新。我有一个描述所需类型的 `readr::col_spec` 对象,但由于数据并非来自 csv 文件,我无法使用 `read_csv(..., col_types=cspec)` 将这些更改应用于指定的列。
由于 `col_spec` 是一种专门用于指定所需数据类型的数据结构,我仍然希望直接把它作为输入传给执行类型转换的函数,而不是编写冗长的自定义脚本逐列处理。请参阅以下示例:
library(tidyverse)

# Build `sw`: a four-row slice of starwars standing in for the real input data
sw <- starwars %>%
  select(name, height, ends_with("_color")) %>%
  slice(c(1, 4, 5, 19))
sw
#> # A tibble: 4 × 5
#> name height hair_color skin_color eye_color
#> <chr> <int> <chr> <chr> <chr>
#> 1 Luke Skywalker 172 blond fair blue
#> 2 Darth Vader 202 none white yellow
#> 3 Leia Organa 150 brown light brown
#> 4 Yoda 66 white green brown

# The column specification that should be applied to `sw`
cspec <- cols(
  hair_color = col_factor(c("brown", "blond", "white", "none")),
  skin_color = col_factor(c("green", "light", "fair", "white")),
  eye_color = col_factor(c("blue", "brown", "yellow"))
)

# Goal: apply `cspec` to `sw` directly.
# Clumsy workaround: round-trip the data through a temporary CSV file
tf <- tempfile()
write_csv(sw, tf)
sw_fct <- read_csv(tf, col_types = cspec)

# Roughly the desired result, except that type information for the other
# columns is lost in the round trip (height comes back as <dbl>, not <int>)
sw_fct
#> # A tibble: 4 × 5
#> name height hair_color skin_color eye_color
#> <chr> <dbl> <fct> <fct> <fct>
#> 1 Luke Skywalker 172 blond fair blue
#> 2 Darth Vader 202 none white yellow
#> 3 Leia Organa 150 brown light brown
#> 4 Yoda 66 white green brown
我们可以通过遍历 `cols` 并从对象中提取相应的元素来做到这一点:
library(readr)
library(purrr)

# For every collector in the spec, re-parse the column of `sw` with the same
# name as a factor, using the settings stored in that collector
sw[names(cspec$cols)] <- imap(cspec$cols, function(collector, col_name) {
  parse_factor(
    sw[[col_name]],
    levels = collector$levels,
    ordered = collector$ordered,
    include_na = collector$include_na
  )
})
检查输出:
> sw
# A tibble: 4 × 5
name height hair_color skin_color eye_color
<chr> <int> <fct> <fct> <fct>
1 Luke Skywalker 172 blond fair blue
2 Darth Vader 202 none white yellow
3 Leia Organa 150 brown light brown
4 Yoda 66 white green brown
> str(sw)
tibble [4 × 5] (S3: tbl_df/tbl/data.frame)
$ name : chr [1:4] "Luke Skywalker" "Darth Vader" "Leia Organa" "Yoda"
$ height : int [1:4] 172 202 150 66
$ hair_color: Factor w/ 4 levels "brown","blond",..: 2 4 1 3
$ skin_color: Factor w/ 4 levels "green","light",..: 3 4 2 1
$ eye_color : Factor w/ 3 levels "blue","brown",..: 1 3 2 2
如果还需要设置 `spec` 属性(attribute),请执行如下赋值:
# Store the column specification in the "spec" attribute of `sw`
attr(sw, "spec") <- cspec
再次检查 `str()` 的输出:
> str(sw)
tibble [4 × 5] (S3: tbl_df/tbl/data.frame)
$ name : chr [1:4] "Luke Skywalker" "Darth Vader" "Leia Organa" "Yoda"
$ height : int [1:4] 172 202 150 66
$ hair_color: Factor w/ 4 levels "brown","blond",..: 2 4 1 3
$ skin_color: Factor w/ 4 levels "green","light",..: 3 4 2 1
$ eye_color : Factor w/ 3 levels "blue","brown",..: 1 3 2 2
- attr(*, "spec")=
.. cols(
.. hair_color = col_factor(levels = c("brown", "blond", "white", "none"), ordered = FALSE, include_na = FALSE),
.. skin_color = col_factor(levels = c("green", "light", "fair", "white"), ordered = FALSE, include_na = FALSE),
.. eye_color = col_factor(levels = c("blue", "brown", "yellow"), ordered = FALSE, include_na = FALSE)
.. )
这个答案将来自@akrun 的解决方案包装成一个函数,供那些不太熟悉 purrr 的人使用。
#' Apply a readr column specification to an in-memory data frame
#'
#' Re-parses the columns named in `cspec` as factors with
#' `readr::parse_factor()`, using the levels / ordered / include_na settings
#' stored in each collector. Columns not named in `cspec` are left untouched.
#'
#' @param d A data.frame (or tibble) containing every column named in `cspec`.
#' @param cspec A readr `col_spec`; all of its column collectors must be
#'   factor collectors (class "collector_factor").
#' @param set_spec_attribute A single `TRUE`/`FALSE`. If `TRUE`, `cspec` is
#'   stored in the "spec" attribute of the result, for consistency with readr.
#' @return `d` with the columns named in `cspec` replaced by factors.
apply_col_spec <- function(d, cspec, set_spec_attribute = FALSE) {
  # A bit of input checking; the flag must be a usable scalar so that the
  # `if` further down cannot fail on NA or on a vector of length > 1
  flag_ok <- is.logical(set_spec_attribute) &&
    length(set_spec_attribute) == 1L &&
    !is.na(set_spec_attribute)
  if (!(inherits(d, "data.frame") && inherits(cspec, "col_spec") && flag_ok)) {
    stop("apply_col_spec(): wrong input types")
  }

  collectors <- cspec$cols
  # vapply() always yields a logical vector, even for an empty spec
  is_factor_spec <- vapply(
    collectors, inherits, logical(1),
    what = "collector_factor"
  )
  if (!all(is_factor_spec)) {
    stop("apply_col_spec(): only implemented for factor columns")
  }

  # Fail early, naming the offending columns, instead of letting the parser
  # choke on a column that does not exist in the data
  target_cols <- names(collectors)
  missing_cols <- setdiff(target_cols, names(d))
  if (length(missing_cols) > 0) {
    stop(
      "apply_col_spec(): columns not found in data: ",
      paste(missing_cols, collapse = ", ")
    )
  }

  # Do the actual application of the col_spec (nothing to do if it is empty)
  if (length(collectors) > 0) {
    d[target_cols] <- purrr::imap(collectors, function(collector, col_name) {
      readr::parse_factor(
        d[[col_name]],
        levels = collector$levels,
        ordered = collector$ordered,
        include_na = collector$include_na
      )
    })
  }

  # If requested, set col_spec as an attribute, for consistency with readr
  if (set_spec_attribute) {
    attr(d, "spec") <- cspec
  }
  d
}
使用问题中定义的变量运行该函数,可得到预期结果:
> apply_col_spec(sw, cspec)
# A tibble: 4 × 5
name height hair_color skin_color eye_color
<chr> <int> <fct> <fct> <fct>
1 Luke Skywalker 172 blond fair blue
2 Darth Vader 202 none white yellow
3 Leia Organa 150 brown light brown
4 Yoda 66 white green brown