用于替换数据框中值的 R 函数组合
R fuction composition for the substitution of values in dataframe
给定以下可重现的示例
my objective 是将数据框相邻列的原始值逐行替换为NA;我知道这是一个问题(有这么多变体)已经发布但我还没有找到我试图完成的方法的解决方案:即通过应用函数组合
在可重现的示例中,驱动用 NA 替换原始值的列是列 a
这是我目前所做的
最后一个代码片段是我实际搜索内容的失败尝试...
#-----------------------------------------------------------
# ifelse approach, it works but...
# it's error prone: i.e. copy and paste for all columns can introduce a lot of troubles
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
df$b<-ifelse(is.na(df$a), NA, df$b)
df$c<-ifelse(is.na(df$a), NA, df$c)
df
#--------------------------------------------------------
# extraction and subsitution approach
# same as above
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
df$b[is.na(df$a)]<-NA
df$c[is.na(df$a)]<-NA
df
#----------------------------------------------------------
# definition of a function
# it's a bit better, but still error prone because of the copy and paste
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
fix<-function(x,y){
ifelse(is.na(x), NA, y)
}
df$b<-fix(df$a, df$b)
df$c<-fix(df$a, df$c)
df
#------------------------------------------------------------
# this approach is not working as expected!
# the idea behind is of function composition;
# lapply does the fix to some columns of data frame
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
fix2<-function(x){
x[is.na(x[1])]<-NA
x
}
df[]<-lapply(df, fix2)
df
对这种特殊方法有什么帮助吗?
我坚持如何正确构思传递给 lapply
的替代函数
谢谢
试试这个函数,在输入中你有原始数据集,在输出中有清理过的数据集:
输入
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
> df
a b c
1 1 3 NA
2 2 NA 5
3 NA 4 6
函数
fix<-function(df,var_x,list_y)
{
df[is.na(df[,var_x]),list_y]<-NA
return(df)
}
输出
fix(df,"a",c("b","c"))
a b c
1 1 3 NA
2 2 NA 5
3 NA NA NA
使用词法闭包
如果您使用词法闭包 - 您定义一个函数,该函数首先生成您需要的函数。
然后你就可以随意使用这个功能了
# given a column all other columns' values at that row should become NA
# if the driver column's value at that row is NA
# using lexical scoping of R function definitions, one can reach that.
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
# whatever vector given, this vector's value should be changed
# according to first column's value
na_accustomizer <- function(df, driver_col) {
## Returns a function which will accustomize any vector/column
## to driver column's NAs
function(vec) {
vec[is.na(df[, driver_col])] <- NA
vec
}
}
df[] <- lapply(df, na_accustomizer(df, "a"))
df
## a b c
## 1 1 3 NA
## 2 2 NA 5
## 3 NA NA NA
#
# na_accustomizer(df, "a") returns
#
# function(vec) {
# vec[is.na(df[, "a"])] <- NA
# vec
# }
#
# which then can be used like you want:
# df[] <- lapply(df, na_accustomize(df, "a"))
使用正常功能
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
# define it for one column
overtake_NA <- function(df, driver_col, target_col) {
df[, target_col] <- ifelse(is.na(df[, driver_col]), NA, df[, target_col])
df
}
# define it for all columns of df
overtake_driver_col_NAs <- function(df, driver_col) {
for (i in 1:ncol(df)) {
df <- overtake_NA(df, driver_col, i)
}
df
}
overtake_driver_col_NAs(df, "a")
# a b c
# 1 1 3 NA
# 2 2 NA 5
# 3 NA NA NA
泛化任何谓词函数
driver_col_to_other_cols <- function(df, driver_col, pred) {
## overtake any value of the driver column to the other columns of df,
## whenever predicate function (pred) is fulfilled.
# define it for one column
overtake_ <- function(df, driver_col, target_col, pred) {
selectors <- do.call(pred, list(df[, driver_col]))
if (deparse(substitute(pred)) != "is.na") {
# this is to 'recorrect' NA's which intrude into the selector vector
# then driver_col has NAs. For sure "is.na" is not the only possible
# way to check for NA - so this edge case is not covered fully
selectors[is.na(selectors)] <- FALSE
}
df[, target_col] <- ifelse(selectors, df[, driver_col], df[, target_col])
df
}
for (i in 1:ncol(df)) {
df <- overtake_(df, driver_col, i, pred)
}
df
}
driver_col_to_other_cols(df, "a", function(x) x == 1)
# a b c
# 1 1 1 1
# 2 2 NA 5
# 3 NA 4 6
## if the "is.na" check is not done, then this would give
## (because of NA in selectorvector):
# a b c
# 1 1 1 1
# 2 2 NA 5
# 3 NA NA NA
## hence in the case that pred doesn't check for NA in 'a',
## these NA vlaues have to be reverted to the original columns' value.
driver_col_to_other_cols(df, "a", is.na)
# a b c
# 1 1 3 NA
# 2 2 NA 5
# 3 NA NA NA
给定以下可重现的示例
my objective 是将数据框相邻列的原始值逐行替换为NA;我知道这是一个问题(有这么多变体)已经发布但我还没有找到我试图完成的方法的解决方案:即通过应用函数组合
在可重现的示例中,驱动用 NA 替换原始值的列是列 a
这是我目前所做的
最后一个代码片段是我实际搜索内容的失败尝试...
#-----------------------------------------------------------
# ifelse approach, it works but...
# it's error prone: i.e. copy and paste for all columns can introduce a lot of troubles
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
df$b<-ifelse(is.na(df$a), NA, df$b)
df$c<-ifelse(is.na(df$a), NA, df$c)
df
#--------------------------------------------------------
# extraction and subsitution approach
# same as above
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
df$b[is.na(df$a)]<-NA
df$c[is.na(df$a)]<-NA
df
#----------------------------------------------------------
# definition of a function
# it's a bit better, but still error prone because of the copy and paste
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
fix<-function(x,y){
ifelse(is.na(x), NA, y)
}
df$b<-fix(df$a, df$b)
df$c<-fix(df$a, df$c)
df
#------------------------------------------------------------
# this approach is not working as expected!
# the idea behind is of function composition;
# lapply does the fix to some columns of data frame
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
fix2<-function(x){
x[is.na(x[1])]<-NA
x
}
df[]<-lapply(df, fix2)
df
对这种特殊方法有什么帮助吗? 我坚持如何正确构思传递给 lapply
的替代函数谢谢
试试这个函数,在输入中你有原始数据集,在输出中有清理过的数据集:
输入
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
> df
a b c
1 1 3 NA
2 2 NA 5
3 NA 4 6
函数
fix<-function(df,var_x,list_y)
{
df[is.na(df[,var_x]),list_y]<-NA
return(df)
}
输出
fix(df,"a",c("b","c"))
a b c
1 1 3 NA
2 2 NA 5
3 NA NA NA
使用词法闭包
如果您使用词法闭包 - 您定义一个函数,该函数首先生成您需要的函数。 然后你就可以随意使用这个功能了
# given a column all other columns' values at that row should become NA
# if the driver column's value at that row is NA
# using lexical scoping of R function definitions, one can reach that.
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
# whatever vector given, this vector's value should be changed
# according to first column's value
na_accustomizer <- function(df, driver_col) {
## Returns a function which will accustomize any vector/column
## to driver column's NAs
function(vec) {
vec[is.na(df[, driver_col])] <- NA
vec
}
}
df[] <- lapply(df, na_accustomizer(df, "a"))
df
## a b c
## 1 1 3 NA
## 2 2 NA 5
## 3 NA NA NA
#
# na_accustomizer(df, "a") returns
#
# function(vec) {
# vec[is.na(df[, "a"])] <- NA
# vec
# }
#
# which then can be used like you want:
# df[] <- lapply(df, na_accustomize(df, "a"))
使用正常功能
df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df
# define it for one column
overtake_NA <- function(df, driver_col, target_col) {
df[, target_col] <- ifelse(is.na(df[, driver_col]), NA, df[, target_col])
df
}
# define it for all columns of df
overtake_driver_col_NAs <- function(df, driver_col) {
for (i in 1:ncol(df)) {
df <- overtake_NA(df, driver_col, i)
}
df
}
overtake_driver_col_NAs(df, "a")
# a b c
# 1 1 3 NA
# 2 2 NA 5
# 3 NA NA NA
泛化任何谓词函数
driver_col_to_other_cols <- function(df, driver_col, pred) {
## overtake any value of the driver column to the other columns of df,
## whenever predicate function (pred) is fulfilled.
# define it for one column
overtake_ <- function(df, driver_col, target_col, pred) {
selectors <- do.call(pred, list(df[, driver_col]))
if (deparse(substitute(pred)) != "is.na") {
# this is to 'recorrect' NA's which intrude into the selector vector
# then driver_col has NAs. For sure "is.na" is not the only possible
# way to check for NA - so this edge case is not covered fully
selectors[is.na(selectors)] <- FALSE
}
df[, target_col] <- ifelse(selectors, df[, driver_col], df[, target_col])
df
}
for (i in 1:ncol(df)) {
df <- overtake_(df, driver_col, i, pred)
}
df
}
driver_col_to_other_cols(df, "a", function(x) x == 1)
# a b c
# 1 1 1 1
# 2 2 NA 5
# 3 NA 4 6
## if the "is.na" check is not done, then this would give
## (because of NA in selectorvector):
# a b c
# 1 1 1 1
# 2 2 NA 5
# 3 NA NA NA
## hence in the case that pred doesn't check for NA in 'a',
## these NA vlaues have to be reverted to the original columns' value.
driver_col_to_other_cols(df, "a", is.na)
# a b c
# 1 1 3 NA
# 2 2 NA 5
# 3 NA NA NA