R-Hmisc:按聚类结果插补缺失值
R-Hmisc impute by cluster result
我想用聚类均值来插补变量 x3 的缺失值:先根据另外两个变量 x1 和 x2 进行聚类,再用每个聚类内 x3 的均值填补该聚类中的缺失值。
我知道可以向 Hmisc 包的 impute 函数传递一个函数(例如 "mean")来完成插补。因此,我想传递一个能执行以下全部操作的函数。
我之前是用下面的代码来实现的:
# Example data: x1 and x2 are fully observed uniform draws on [0, 100];
# x3 has 900 observed draws followed by 100 missing values.
df1 <- data.frame(
  x1 = runif(1000, min = 0, max = 100),
  x2 = runif(1000, min = 0, max = 100),
  x3 = c(runif(900, min = 0, max = 100), rep(NA, times = 100))
)
我想传递一个函数来完成所有这些:
library(plyr)

# k-means with 3 centers on every column of df1 except x3
clust <- kmeans(df1[, -grep('x3', colnames(df1))], 3)
df1[['clust']] <- clust[['cluster']]
# mean of the observed x3 values within each cluster
cc <- ddply(df1, 'clust', summarise, mean = mean(x3, na.rm = TRUE))
# attach the cluster mean to every row (merge() sorts the rows by 'clust')
df2 <- merge(df1, cc, by = 'clust')
# start from x3 and fill its gaps with the matching cluster mean
df2$x3imputed2 <- df2$x3
df2$x3imputed2[is.na(df2$x3)] <- df2$mean[is.na(df2$x3)]
有没有办法把上面这些代码封装成一个函数,并传给 Hmisc 使用?(我在把 x3 作为变量传入 ddply 时遇到了问题。)
类似于以下内容:
# Impute the missing values of one column of the global data frame `df1`
# with the mean of that column inside each k-means cluster (3 centers).
# The clusters are computed from every other column of `df1`.
#
# Args:
#   i: the column to impute, given as a single column name (e.g. "x3") or a
#      single column index.
# Returns:
#   A vector with one value per row of `df1`, in the original row order:
#   the observed value where present, otherwise the mean of the observed
#   values in the row's cluster.
ff <- function(i) {
  # resolve `i` to exactly one existing column name (errors otherwise)
  target_col <- names(df1[i])
  stopifnot(length(target_col) == 1L)
  # Select the clustering columns by exact name. Negative indexing with
  # grep() selects no columns at all when the pattern matches nothing.
  predictors <- df1[, setdiff(colnames(df1), target_col), drop = FALSE]
  cluster_id <- kmeans(predictors, 3)$cluster
  target <- df1[[target_col]]
  # ave() returns the per-cluster mean aligned with the rows of `df1`;
  # merge() would sort the rows by cluster and break that alignment.
  cluster_mean <- ave(target, cluster_id,
                      FUN = function(v) mean(v, na.rm = TRUE))
  ifelse(is.na(target), cluster_mean, target)
}
# NOTE(review): the assignment target is `f1`, while the data frame built
# above is `df1` -- this looks like a typo; confirm the intended object.
# NOTE(review): `ff` is written to receive a column name, but here it is
# handed to impute() as the fill function; presumably impute() calls it with
# data values rather than a name -- verify against the Hmisc documentation.
f1$imputedx3<-with(df1, impute(x3,ff))
但是我得到一个错误:
empty cluster: try a better set of initial centers
当我用 x3 替换它时,我没有得到同样的错误。
尝试
library(lazyeval)
library(dplyr)
# Impute the missing values of one column with its per-cluster mean.
#
# The rows of `dat` are clustered with kmeans() (3 centers) on every column
# except `cname`; within each cluster the NA values of `cname` are replaced
# by the cluster mean of the observed values.
#
# Args:
#   dat:   data frame holding the clustering columns and the target column.
#   cname: name of the column to impute, as a single string (e.g. 'x3').
# Returns:
#   `dat` grouped by a `clust` column (the cluster labels), with an extra
#   column named "<cname>imputed" holding the imputed values.
f1 <- function(dat, cname) {
  stopifnot(is.character(cname), length(cname) == 1L,
            cname %in% colnames(dat))
  # build the new column name from the value of `cname`; match.call()
  # would return the caller's expression (e.g. a variable name) instead
  new_name <- paste0(cname, 'imputed')
  # cluster on all other columns, selected by exact name; negative indexing
  # with grep() selects no columns at all when the pattern matches nothing
  predictors <- dat[, setdiff(colnames(dat), cname), drop = FALSE]
  # store the labels as a real column so that group_by() cannot pick up a
  # pre-existing `clust` column of `dat` in their place
  dat[['clust']] <- kmeans(predictors, 3)$cluster
  # mutate_()/interp() are the legacy standard-evaluation interface; they
  # are kept here to stay compatible with the API this code targets
  impute_expr <- interp(~ifelse(is.na(v), mean(v, na.rm=TRUE), v),
                        v = as.name(cname))
  dat %>%
    group_by(clust) %>%
    # name the new column directly instead of renaming the last one
    mutate_(.dots = setNames(list(impute_expr), new_name))
}
f1(df1, 'x3')
或者,您可以改用 v = lazy(cname),
这样就可以不带引号地传递列名:
# Impute the missing values of one column with its per-cluster mean,
# taking the column as an unquoted name.
#
# The rows of `dat` are clustered with kmeans() (3 centers) on every column
# except `cname`; within each cluster the NA values of `cname` are replaced
# by the cluster mean of the observed values.
#
# Args:
#   dat:   data frame holding the clustering columns and the target column.
#   cname: the column to impute, given as a bare (unquoted) name, e.g. x3.
# Returns:
#   `dat` grouped by a `clust` column (the cluster labels), with an extra
#   column named "<cname>imputed" holding the imputed values.
f2 <- function(dat, cname) {
  # read the column name from the unevaluated argument by name rather than
  # by its position in match.call()
  col_name <- deparse(substitute(cname))
  stopifnot(length(col_name) == 1L, col_name %in% colnames(dat))
  new_name <- paste0(col_name, 'imputed')
  # cluster on all other columns, selected by exact name; negative indexing
  # with grep() selects no columns at all when the pattern matches nothing
  predictors <- dat[, setdiff(colnames(dat), col_name), drop = FALSE]
  # store the labels as a real column so that group_by() cannot pick up a
  # pre-existing `clust` column of `dat` in their place
  dat[['clust']] <- kmeans(predictors, 3)$cluster
  # lazy() captures the unevaluated `cname` expression for interp();
  # mutate_()/interp() are the legacy standard-evaluation interface, kept
  # here to stay compatible with the API this code targets
  impute_expr <- interp(~ifelse(is.na(v), mean(v, na.rm=TRUE), v),
                        v = lazy(cname))
  dat %>%
    group_by(clust) %>%
    # name the new column directly instead of renaming the last one
    mutate_(.dots = setNames(list(impute_expr), new_name))
}
f2(df1, x3)
我想用聚类均值来插补变量 x3 的缺失值:先根据另外两个变量 x1 和 x2 进行聚类,再用每个聚类内 x3 的均值填补该聚类中的缺失值。 我知道可以向 Hmisc 包的 impute 函数传递一个函数(例如 "mean")来完成插补。因此,我想传递一个能执行以下全部操作的函数。
我之前是用下面的代码来实现的:
# Example data: x1 and x2 are fully observed uniform draws on [0, 100];
# x3 has 900 observed draws followed by 100 missing values.
df1 <- data.frame(
  x1 = runif(1000, min = 0, max = 100),
  x2 = runif(1000, min = 0, max = 100),
  x3 = c(runif(900, min = 0, max = 100), rep(NA, times = 100))
)
我想传递一个函数来完成所有这些:
library(plyr)

# k-means with 3 centers on every column of df1 except x3
clust <- kmeans(df1[, -grep('x3', colnames(df1))], 3)
df1[['clust']] <- clust[['cluster']]
# mean of the observed x3 values within each cluster
cc <- ddply(df1, 'clust', summarise, mean = mean(x3, na.rm = TRUE))
# attach the cluster mean to every row (merge() sorts the rows by 'clust')
df2 <- merge(df1, cc, by = 'clust')
# start from x3 and fill its gaps with the matching cluster mean
df2$x3imputed2 <- df2$x3
df2$x3imputed2[is.na(df2$x3)] <- df2$mean[is.na(df2$x3)]
有没有办法把上面这些代码封装成一个函数,并传给 Hmisc 使用?(我在把 x3 作为变量传入 ddply 时遇到了问题。)
类似于以下内容:
# Impute the missing values of one column of the global data frame `df1`
# with the mean of that column inside each k-means cluster (3 centers).
# The clusters are computed from every other column of `df1`.
#
# Args:
#   i: the column to impute, given as a single column name (e.g. "x3") or a
#      single column index.
# Returns:
#   A vector with one value per row of `df1`, in the original row order:
#   the observed value where present, otherwise the mean of the observed
#   values in the row's cluster.
ff <- function(i) {
  # resolve `i` to exactly one existing column name (errors otherwise)
  target_col <- names(df1[i])
  stopifnot(length(target_col) == 1L)
  # Select the clustering columns by exact name. Negative indexing with
  # grep() selects no columns at all when the pattern matches nothing.
  predictors <- df1[, setdiff(colnames(df1), target_col), drop = FALSE]
  cluster_id <- kmeans(predictors, 3)$cluster
  target <- df1[[target_col]]
  # ave() returns the per-cluster mean aligned with the rows of `df1`;
  # merge() would sort the rows by cluster and break that alignment.
  cluster_mean <- ave(target, cluster_id,
                      FUN = function(v) mean(v, na.rm = TRUE))
  ifelse(is.na(target), cluster_mean, target)
}
# NOTE(review): the assignment target is `f1`, while the data frame built
# above is `df1` -- this looks like a typo; confirm the intended object.
# NOTE(review): `ff` is written to receive a column name, but here it is
# handed to impute() as the fill function; presumably impute() calls it with
# data values rather than a name -- verify against the Hmisc documentation.
f1$imputedx3<-with(df1, impute(x3,ff))
但是我得到一个错误:
empty cluster: try a better set of initial centers
当我用 x3 替换它时,我没有得到同样的错误。
尝试
library(lazyeval)
library(dplyr)
# Impute the missing values of one column with its per-cluster mean.
#
# The rows of `dat` are clustered with kmeans() (3 centers) on every column
# except `cname`; within each cluster the NA values of `cname` are replaced
# by the cluster mean of the observed values.
#
# Args:
#   dat:   data frame holding the clustering columns and the target column.
#   cname: name of the column to impute, as a single string (e.g. 'x3').
# Returns:
#   `dat` grouped by a `clust` column (the cluster labels), with an extra
#   column named "<cname>imputed" holding the imputed values.
f1 <- function(dat, cname) {
  stopifnot(is.character(cname), length(cname) == 1L,
            cname %in% colnames(dat))
  # build the new column name from the value of `cname`; match.call()
  # would return the caller's expression (e.g. a variable name) instead
  new_name <- paste0(cname, 'imputed')
  # cluster on all other columns, selected by exact name; negative indexing
  # with grep() selects no columns at all when the pattern matches nothing
  predictors <- dat[, setdiff(colnames(dat), cname), drop = FALSE]
  # store the labels as a real column so that group_by() cannot pick up a
  # pre-existing `clust` column of `dat` in their place
  dat[['clust']] <- kmeans(predictors, 3)$cluster
  # mutate_()/interp() are the legacy standard-evaluation interface; they
  # are kept here to stay compatible with the API this code targets
  impute_expr <- interp(~ifelse(is.na(v), mean(v, na.rm=TRUE), v),
                        v = as.name(cname))
  dat %>%
    group_by(clust) %>%
    # name the new column directly instead of renaming the last one
    mutate_(.dots = setNames(list(impute_expr), new_name))
}
f1(df1, 'x3')
或者,您可以改用 v = lazy(cname),这样就可以不带引号地传递列名:
# Impute the missing values of one column with its per-cluster mean,
# taking the column as an unquoted name.
#
# The rows of `dat` are clustered with kmeans() (3 centers) on every column
# except `cname`; within each cluster the NA values of `cname` are replaced
# by the cluster mean of the observed values.
#
# Args:
#   dat:   data frame holding the clustering columns and the target column.
#   cname: the column to impute, given as a bare (unquoted) name, e.g. x3.
# Returns:
#   `dat` grouped by a `clust` column (the cluster labels), with an extra
#   column named "<cname>imputed" holding the imputed values.
f2 <- function(dat, cname) {
  # read the column name from the unevaluated argument by name rather than
  # by its position in match.call()
  col_name <- deparse(substitute(cname))
  stopifnot(length(col_name) == 1L, col_name %in% colnames(dat))
  new_name <- paste0(col_name, 'imputed')
  # cluster on all other columns, selected by exact name; negative indexing
  # with grep() selects no columns at all when the pattern matches nothing
  predictors <- dat[, setdiff(colnames(dat), col_name), drop = FALSE]
  # store the labels as a real column so that group_by() cannot pick up a
  # pre-existing `clust` column of `dat` in their place
  dat[['clust']] <- kmeans(predictors, 3)$cluster
  # lazy() captures the unevaluated `cname` expression for interp();
  # mutate_()/interp() are the legacy standard-evaluation interface, kept
  # here to stay compatible with the API this code targets
  impute_expr <- interp(~ifelse(is.na(v), mean(v, na.rm=TRUE), v),
                        v = lazy(cname))
  dat %>%
    group_by(clust) %>%
    # name the new column directly instead of renaming the last one
    mutate_(.dots = setNames(list(impute_expr), new_name))
}
f2(df1, x3)