为什么这个替换函数适用于 R 中的数据框(data.frame),却不适用于 tibble?
Why a replace function works on data frames but not on tibbles in r?
我查看了相关讨论,想了解为什么 `replace_na` 函数(如下所示)适用于 data.frame 而不适用于 tibble。你能帮我理解为什么它对 tibble 不起作用吗?如何修改该函数,使其同时适用于 `data.frame` 和 `tibble`?
数据
library(dplyr)
# df1 (output of dput(df1)): a 4-row tibble (class "tbl_df") whose gender,
# grade and age columns contain NA values that should be filled in.
df1 <- structure(list(id = c(1, 2, 3, 4), gender = c("M", "F", NA, "F"
), grade = c("A", NA, NA, NA), age = c(2, NA, 2, NA)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
# df2 (output of dput(df2)): a 4-row tibble with the same columns as df1 and
# no NA values; it is used below as the source of replacement values.
df2 <- structure(list(id = c(1, 2, 3, 4), gender = c("M", "F", "M",
"F"), grade = c("A", "A", "B", "NG"), age = c(22, 23, 21, 19)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
替换函数
# Fill the NA cells of `df_to` with the values found at the same positions
# in `df_from`.
#
# df_to:   object whose missing entries should be filled.
# df_from: object of the same shape supplying the replacement values.
#
# Both arguments are indexed with the logical mask returned by is.na(df_to).
# For a data frame that mask is a logical matrix, so both inputs must accept
# matrix subscripts in `[`.
replace_na <- function(df_to, df_from) {
  missing_mask <- is.na(df_to)
  df_to[missing_mask] <- df_from[missing_mask]
  df_to
}
用法
replace_na(df1,df2)
Error: Must use a vector in `[`, not an object of class matrix.
Call `rlang::last_error()` to see a backtrace
Called from: abort(error_dim_column_index(j))
但是,将两个参数都强制转换为 data.frame 后,就会得到所需的输出,如下所示。
replace_na(as.data.frame(df1), as.data.frame(df2))
# id gender grade age
# 1 1 M A 2
# 2 2 F A 23
# 3 3 M B 2
# 4 4 F NG 19
谢谢。
对数据框调用 `is.na()` 会返回一个逻辑矩阵:
is.na(df1)
#> id gender grade age
#> [1,] FALSE FALSE FALSE FALSE
#> [2,] FALSE FALSE TRUE TRUE
#> [3,] FALSE TRUE TRUE FALSE
#> [4,] FALSE FALSE TRUE TRUE
基础的 `data.frame` 类支持用矩阵进行子集化;`tbl_df` 更严格,不支持这种用法。
as.data.frame(df2)[is.na(df1)]
#> [1] "M" "A" "B" "NG" "23" "19"
df2[is.na(df1)]
#> Must use a vector in `[`, not an object of class matrix.
要使您的 `replace_na()` 函数也能用于 `tbl_df`,您需要对每一列单独执行替换操作。例如,使用递归:
# Replace the NA entries of `x` with the entries of `y` at the same positions.
#
# x: a data frame (including tibbles) or a plain vector with missing values.
# y: an object of matching shape that supplies the replacement values.
#
# Data frames are handled one column at a time: the function recurses over
# the columns of `x` and `y` pairwise (by position), so `[` only ever sees a
# logical vector, never a logical matrix. `x[] <-` keeps the class and
# attributes of `x`, so the result has the same class as the input.
replace_na <- function(x, y) {
  if (!is.data.frame(x)) {
    # Base case: a single column / vector.
    na_idx <- is.na(x)
    x[na_idx] <- y[na_idx]
    return(x)
  }
  x[] <- Map(replace_na, x, y)
  x
}
replace_na(df1, df2)
#> # A tibble: 4 x 4
#> id gender grade age
#> <dbl> <chr> <chr> <dbl>
#> 1 1 M A 2
#> 2 2 F A 23
#> 3 3 M B 2
#> 4 4 F NG 19
这种方法通常也更快:
# Non-recursive variant used as the baseline in the benchmark: fills the NA
# positions of `x` with the values of `y` at those positions in one step.
#
# x: object with missing values (vector or base data frame).
# y: object of matching shape supplying the replacements.
#
# is.na(x) is used directly as the subscript for both `x` and `y`.
replace_na_vec <- function(x, y) {
  fill_values <- y[is.na(x)]
  x[is.na(x)] <- fill_values
  x
}
# Stack 10,000 copies of each 4-row table to get larger inputs for timing.
df1_10k <- do.call("rbind", replicate(10000, df1, simplify = FALSE))
df2_10k <- do.call("rbind", replicate(10000, df2, simplify = FALSE))
# Time the column-wise replace_na() ("new") against replace_na_vec() applied
# to inputs converted with as.data.frame() ("old"), at both input sizes.
bench::mark(
# check = FALSE: do not require the benchmarked expressions to return equal
# results.
check = FALSE,
new = replace_na(df1, df2),
old = replace_na_vec(as.data.frame(df1), as.data.frame(df2)),
new_10k = replace_na(df1_10k, df2_10k),
old_10k = replace_na_vec(as.data.frame(df1_10k), as.data.frame(df2_10k))
)
#> # A tibble: 4 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 new 74.01us 97.79us 7295. 0B 12.6
#> 2 old 269.97us 529.93us 1845. 81.02KB 8.23
#> 3 new_10k 1.82ms 2.75ms 338. 4.27MB 32.3
#> 4 old_10k 94.29ms 104.05ms 9.68 10.24MB 2.42
由 reprex package (v0.3.0)
于 2019-09-12 创建
我查看了相关讨论,想了解为什么 `replace_na` 函数(如下所示)适用于 data.frame 而不适用于 tibble。你能帮我理解为什么它对 tibble 不起作用吗?如何修改该函数,使其同时适用于 `data.frame` 和 `tibble`?
数据
library(dplyr)
# df1 (output of dput(df1)): a 4-row tibble (class "tbl_df") whose gender,
# grade and age columns contain NA values that should be filled in.
df1 <- structure(list(id = c(1, 2, 3, 4), gender = c("M", "F", NA, "F"
), grade = c("A", NA, NA, NA), age = c(2, NA, 2, NA)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
# df2 (output of dput(df2)): a 4-row tibble with the same columns as df1 and
# no NA values; it is used below as the source of replacement values.
df2 <- structure(list(id = c(1, 2, 3, 4), gender = c("M", "F", "M",
"F"), grade = c("A", "A", "B", "NG"), age = c(22, 23, 21, 19)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
替换函数
# Fill the NA cells of `df_to` with the values found at the same positions
# in `df_from`.
#
# df_to:   object whose missing entries should be filled.
# df_from: object of the same shape supplying the replacement values.
#
# Both arguments are indexed with the logical mask returned by is.na(df_to).
# For a data frame that mask is a logical matrix, so both inputs must accept
# matrix subscripts in `[`.
replace_na <- function(df_to, df_from) {
  missing_mask <- is.na(df_to)
  df_to[missing_mask] <- df_from[missing_mask]
  df_to
}
用法
replace_na(df1,df2)
Error: Must use a vector in `[`, not an object of class matrix.
Call `rlang::last_error()` to see a backtrace
Called from: abort(error_dim_column_index(j))
但是,将两个参数都强制转换为 data.frame 后,就会得到所需的输出,如下所示。
replace_na(as.data.frame(df1), as.data.frame(df2))
# id gender grade age
# 1 1 M A 2
# 2 2 F A 23
# 3 3 M B 2
# 4 4 F NG 19
谢谢。
对数据框调用 `is.na()` 会返回一个逻辑矩阵:
is.na(df1)
#> id gender grade age
#> [1,] FALSE FALSE FALSE FALSE
#> [2,] FALSE FALSE TRUE TRUE
#> [3,] FALSE TRUE TRUE FALSE
#> [4,] FALSE FALSE TRUE TRUE
基础的 `data.frame` 类支持用矩阵进行子集化;`tbl_df` 更严格,不支持这种用法。
as.data.frame(df2)[is.na(df1)]
#> [1] "M" "A" "B" "NG" "23" "19"
df2[is.na(df1)]
#> Must use a vector in `[`, not an object of class matrix.
要使您的 `replace_na()` 函数也能用于 `tbl_df`,您需要对每一列单独执行替换操作。例如,使用递归:
# Replace the NA entries of `x` with the entries of `y` at the same positions.
#
# x: a data frame (including tibbles) or a plain vector with missing values.
# y: an object of matching shape that supplies the replacement values.
#
# Data frames are handled one column at a time: the function recurses over
# the columns of `x` and `y` pairwise (by position), so `[` only ever sees a
# logical vector, never a logical matrix. `x[] <-` keeps the class and
# attributes of `x`, so the result has the same class as the input.
replace_na <- function(x, y) {
  if (!is.data.frame(x)) {
    # Base case: a single column / vector.
    na_idx <- is.na(x)
    x[na_idx] <- y[na_idx]
    return(x)
  }
  x[] <- Map(replace_na, x, y)
  x
}
replace_na(df1, df2)
#> # A tibble: 4 x 4
#> id gender grade age
#> <dbl> <chr> <chr> <dbl>
#> 1 1 M A 2
#> 2 2 F A 23
#> 3 3 M B 2
#> 4 4 F NG 19
这种方法通常也更快:
# Non-recursive variant used as the baseline in the benchmark: fills the NA
# positions of `x` with the values of `y` at those positions in one step.
#
# x: object with missing values (vector or base data frame).
# y: object of matching shape supplying the replacements.
#
# is.na(x) is used directly as the subscript for both `x` and `y`.
replace_na_vec <- function(x, y) {
  fill_values <- y[is.na(x)]
  x[is.na(x)] <- fill_values
  x
}
# Stack 10,000 copies of each 4-row table to get larger inputs for timing.
df1_10k <- do.call("rbind", replicate(10000, df1, simplify = FALSE))
df2_10k <- do.call("rbind", replicate(10000, df2, simplify = FALSE))
# Time the column-wise replace_na() ("new") against replace_na_vec() applied
# to inputs converted with as.data.frame() ("old"), at both input sizes.
bench::mark(
# check = FALSE: do not require the benchmarked expressions to return equal
# results.
check = FALSE,
new = replace_na(df1, df2),
old = replace_na_vec(as.data.frame(df1), as.data.frame(df2)),
new_10k = replace_na(df1_10k, df2_10k),
old_10k = replace_na_vec(as.data.frame(df1_10k), as.data.frame(df2_10k))
)
#> # A tibble: 4 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 new 74.01us 97.79us 7295. 0B 12.6
#> 2 old 269.97us 529.93us 1845. 81.02KB 8.23
#> 3 new_10k 1.82ms 2.75ms 338. 4.27MB 32.3
#> 4 old_10k 94.29ms 104.05ms 9.68 10.24MB 2.42
由 reprex package (v0.3.0)
于 2019-09-12 创建