readr - 如何从 spec() 更新 col_spec 对象
readr - how to update col_spec object from spec()
我喜欢这篇 RStudio 博客文章中描述的有关列规范(column specification)的工作流程。基本上,可以在 read_csv
导入后获取列规范,然后将其保存下来以备后用。例如,摘自该文章:
mtcars2 <- read_csv(readr_example("mtcars.csv"))
#> Parsed with column specification:
#> cols(
#> mpg = col_double(),
#> cyl = col_integer(),
#> disp = col_double(),
#> hp = col_integer(),
#> drat = col_double(),
#> wt = col_double(),
#> qsec = col_double(),
#> vs = col_integer(),
#> am = col_integer(),
#> gear = col_integer(),
#> carb = col_integer()
#> )
# Once you've figured out the correct types
mtcars_spec <- write_rds(spec(mtcars2), "mtcars2-spec.rds")
# Every subsequent load
mtcars2 <- read_csv(
readr_example("mtcars.csv"),
col_types = read_rds("mtcars2-spec.rds")
)
不幸的是,规范对象本身是带有属性的列表,但其内容与通过 col_types 参数提供给 read_csv 函数的各种列规范并不对应:
> mtcars_spec$cols$cyl
<collector_integer>
> str(mtcars_spec$cols$cyl)
list()
- attr(*, "class")= chr [1:2] "collector_integer" "collector"
> class(mtcars_spec)
[1] "col_spec"
此外,在 Windows 中编辑 .rds 文件非常麻烦(至少对我而言)。
我希望能够编辑大型 col_spec
对象(例如,跳过某些列,或者以其他方式编辑 class)。我可以继续猜测编辑列表所需的字符串,如下所示:
attr(mtcars_spec$cols$cyl,"class")[1] = "collector_skip" # this worked!
> mtcars_spec
cols(
mpg = col_double(),
cyl = col_skip(),
disp = col_double(),
hp = col_integer(),
drat = col_double(),
wt = col_double(),
qsec = col_double(),
vs = col_integer(),
am = col_integer(),
gear = col_integer(),
carb = col_integer()
)
但这种做法显得很笨拙。是否有更优雅的方法来更新列的类别(classification),例如像我的示例那样跳过 mtcars$cyl
列?或者,即使不够优雅,是否有一种能涵盖所有可能类型的方法?我不想去大量猜测如何用各种日期格式实现 <collector_date>。
一个最小示例:
library(readr)
test_spec <- spec_csv('x,y,theDate,skipCol
1,a,"21/01/2018", "skip1
2,z,"31/01/2018", "skip2')
test_spec
#> cols(
#> x = col_integer(),
#> y = col_character(),
#> theDate = col_character(),
#> skipCol = col_character()
#> )
test_spec$cols[["theDate"]] <- col_date("%d/%m/%Y")
test_spec$cols[["skipCol"]] <- col_skip()
test_spec
#> cols(
#> x = col_integer(),
#> y = col_character(),
#> theDate = col_date(format = "%d/%m/%Y"),
#> skipCol = col_skip()
#> )
备注
- 您需要了解数据的日期格式。
- 您可以对文件使用 readr::spec_csv()
我喜欢这篇 RStudio 博客文章中描述的有关列规范(column specification)的工作流程。基本上,可以在 read_csv
导入后获取列规范,然后将其保存下来以备后用。例如,摘自该文章:
mtcars2 <- read_csv(readr_example("mtcars.csv"))
#> Parsed with column specification:
#> cols(
#> mpg = col_double(),
#> cyl = col_integer(),
#> disp = col_double(),
#> hp = col_integer(),
#> drat = col_double(),
#> wt = col_double(),
#> qsec = col_double(),
#> vs = col_integer(),
#> am = col_integer(),
#> gear = col_integer(),
#> carb = col_integer()
#> )
# Once you've figured out the correct types
mtcars_spec <- write_rds(spec(mtcars2), "mtcars2-spec.rds")
# Every subsequent load
mtcars2 <- read_csv(
readr_example("mtcars.csv"),
col_types = read_rds("mtcars2-spec.rds")
)
不幸的是,规范对象本身是带有属性的列表,但其内容与通过 col_types 参数提供给 read_csv 函数的各种列规范并不对应:
> mtcars_spec$cols$cyl
<collector_integer>
> str(mtcars_spec$cols$cyl)
list()
- attr(*, "class")= chr [1:2] "collector_integer" "collector"
> class(mtcars_spec)
[1] "col_spec"
此外,在 Windows 中编辑 .rds 文件非常麻烦(至少对我而言)。
我希望能够编辑大型 col_spec
对象(例如,跳过某些列,或者以其他方式编辑 class)。我可以继续猜测编辑列表所需的字符串,如下所示:
attr(mtcars_spec$cols$cyl,"class")[1] = "collector_skip" # this worked!
> mtcars_spec
cols(
mpg = col_double(),
cyl = col_skip(),
disp = col_double(),
hp = col_integer(),
drat = col_double(),
wt = col_double(),
qsec = col_double(),
vs = col_integer(),
am = col_integer(),
gear = col_integer(),
carb = col_integer()
)
但这种做法显得很笨拙。是否有更优雅的方法来更新列的类别(classification),例如像我的示例那样跳过 mtcars$cyl
列?或者,即使不够优雅,是否有一种能涵盖所有可能类型的方法?我不想去大量猜测如何用各种日期格式实现 <collector_date>。
library(readr)
test_spec <- spec_csv('x,y,theDate,skipCol
1,a,"21/01/2018", "skip1
2,z,"31/01/2018", "skip2')
test_spec
#> cols(
#> x = col_integer(),
#> y = col_character(),
#> theDate = col_character(),
#> skipCol = col_character()
#> )
test_spec$cols[["theDate"]] <- col_date("%d/%m/%Y")
test_spec$cols[["skipCol"]] <- col_skip()
test_spec
#> cols(
#> x = col_integer(),
#> y = col_character(),
#> theDate = col_date(format = "%d/%m/%Y"),
#> skipCol = col_skip()
#> )
备注
- 您需要了解数据的日期格式。
- 您可以对文件使用 readr::spec_csv()