根据行向函数过滤行(dplyr)
Filter rows according to rowwise function (dplyr)
你能帮我把下面最后一条命令中的 apply 换成 dplyr 来进行过滤吗?
我正在尝试解决发布在这里的问题:
library(gtools)
# Build every length-6 row of values drawn (with repetition) from 0..n.
n <- 8
dt <- permutations(n = n + 1, r = 6, v = 0:n, repeats.allowed = TRUE)
# Return the unique mode of a sample of positive integers.
#
# x: vector of positive integers; occurrences are counted with tabulate().
# Returns the single most frequent value, or 0 when several values tie for
# the highest count or when x contains nothing that can be counted.
SmplMode <- function(x) {
  counts <- tabulate(x)
  top <- max(counts)
  # tabulate() of an empty input yields a single zero count; without this
  # guard, bin 1 would be reported as the mode of an empty sample.
  if (top == 0) {
    return(0)
  }
  modes <- which(counts == top)
  if (length(modes) > 1) {
    return(0)
  }
  modes
}
# Keep the rows of dt whose expanded sample has mean 3, range 4, median 3.5
# and unique mode 4. Each row x holds the repetition counts of the values
# 1..6; y is the sample obtained by repeating each value that many times.
res <- dt[
  apply(dt, 1, function(x) {
    y <- rep(1:6, times = c(x[1], x[2], x[3], x[4], x[5], x[6]))
    # An all-zero row gives an empty sample: reject it up front so range()
    # does not warn about having no arguments.
    length(y) > 0 && mean(y) == 3 && diff(range(y)) == 4 &&
      median(y) == 3.5 && SmplMode(y) == 4
  }),
  ,
  drop = FALSE # stay a matrix even when exactly one row matches
]
您可以使用 rowwise 进行按行操作,然后用 mutate 判断每一行是否满足条件,再用 filter 按该条件过滤。
# Row-wise dplyr version of the apply() filter: keep the rows of dt whose
# expanded sample has mean 3, range 4, median 3.5 and unique mode 4.
library(dplyr) # provides %>%, rowwise(), mutate(), filter(), ungroup()
res <- dt %>%
  data.frame() %>% # convert to data.frame, so you can use dplyr
  rowwise() %>% # evaluate the condition once per row
  mutate(cond = {
    # X1..X6 are used as the repetition counts of the values 1..6
    y <- rep(1:6, c(X1, X2, X3, X4, X5, X6))
    mean(y) == 3 & diff(range(y)) == 4 & median(y) == 3.5 & SmplMode(y) == 4
  }) %>%
  ungroup() %>% # drop the row-wise grouping before the remaining verbs
  filter(cond) %>% # filter by condition
  select(-cond) %>% # remove the helper column
  as.matrix() # convert back to matrix
使用 rowwise 的操作很慢,因此可以借助 matrixStats 包中的按行函数,先以向量化的方式筛掉不满足 SmplMode(y)、mean(y)、diff(range(y)) 条件的行,只对剩下的少量行用 rowwise 计算中位数,这样可以显著加快速度。以下代码在我的笔记本电脑上运行大约 0.4 秒,而您的原始解决方案和 @shadow 的解决方案都需要大约 70 秒。
library(dplyr)
library(matrixStats)
# Fast version: evaluate the mode, range and mean conditions for all rows at
# once with vectorised row operations, and use rowwise() only for the median
# of the rows that pass those filters.
df <- data.frame(dt)
df$m <- rowMaxs(dt) # largest count in each row, for SmplMode(y)
# Every row of S is 1..6, i.e. the value that each column of dt counts.
S <- matrix(1:6, ncol = ncol(dt), nrow = nrow(dt), byrow = TRUE)
# Z keeps the value where its count is non-zero and NA elsewhere, so the row
# max/min of Z are the largest/smallest values present in the sample.
Z <- S * (dt != 0)
Z[Z == 0] <- NA
# for diff(range(y))
df$Range <- rowMaxs(Z, na.rm = TRUE) - rowMins(Z, na.rm = TRUE)
df$Mean <- rowSums(S * dt) / rowSums(dt) # for mean(y)
res <- df %>%
  filter(
    # unique mode 4: X4 equals the row maximum and no other column ties
    X4 == m,
    (X1 == m) + (X2 == m) + (X3 == m) + (X4 == m) + (X5 == m) + (X6 == m) == 1,
    Range == 4, # range condition here
    Mean == 3 # mean condition here
  ) %>%
  rowwise() %>%
  mutate(Med = median(rep(1:6, c(X1, X2, X3, X4, X5, X6)))) %>%
  ungroup() %>% # drop the row-wise grouping before the remaining verbs
  filter(Med == 3.5) %>% # median condition here
  select(-m, -Range, -Mean, -Med) %>% # get rid of the helper columns
  as.matrix()
你能帮我把下面最后一条命令中的 apply 换成 dplyr 来进行过滤吗?
我正在尝试解决发布在这里的问题:
library(gtools)
# Build every length-6 row of values drawn (with repetition) from 0..n.
n <- 8
dt <- permutations(n = n + 1, r = 6, v = 0:n, repeats.allowed = TRUE)
# Return the unique mode of a sample of positive integers.
#
# x: vector of positive integers; occurrences are counted with tabulate().
# Returns the single most frequent value, or 0 when several values tie for
# the highest count or when x contains nothing that can be counted.
SmplMode <- function(x) {
  counts <- tabulate(x)
  top <- max(counts)
  # tabulate() of an empty input yields a single zero count; without this
  # guard, bin 1 would be reported as the mode of an empty sample.
  if (top == 0) {
    return(0)
  }
  modes <- which(counts == top)
  if (length(modes) > 1) {
    return(0)
  }
  modes
}
# Keep the rows of dt whose expanded sample has mean 3, range 4, median 3.5
# and unique mode 4. Each row x holds the repetition counts of the values
# 1..6; y is the sample obtained by repeating each value that many times.
res <- dt[
  apply(dt, 1, function(x) {
    y <- rep(1:6, times = c(x[1], x[2], x[3], x[4], x[5], x[6]))
    # An all-zero row gives an empty sample: reject it up front so range()
    # does not warn about having no arguments.
    length(y) > 0 && mean(y) == 3 && diff(range(y)) == 4 &&
      median(y) == 3.5 && SmplMode(y) == 4
  }),
  ,
  drop = FALSE # stay a matrix even when exactly one row matches
]
您可以使用 rowwise 进行按行操作,然后用 mutate 判断每一行是否满足条件,再用 filter 按该条件过滤。
# Row-wise dplyr version of the apply() filter: keep the rows of dt whose
# expanded sample has mean 3, range 4, median 3.5 and unique mode 4.
library(dplyr) # provides %>%, rowwise(), mutate(), filter(), ungroup()
res <- dt %>%
  data.frame() %>% # convert to data.frame, so you can use dplyr
  rowwise() %>% # evaluate the condition once per row
  mutate(cond = {
    # X1..X6 are used as the repetition counts of the values 1..6
    y <- rep(1:6, c(X1, X2, X3, X4, X5, X6))
    mean(y) == 3 & diff(range(y)) == 4 & median(y) == 3.5 & SmplMode(y) == 4
  }) %>%
  ungroup() %>% # drop the row-wise grouping before the remaining verbs
  filter(cond) %>% # filter by condition
  select(-cond) %>% # remove the helper column
  as.matrix() # convert back to matrix
使用 rowwise 的操作很慢,因此可以借助 matrixStats 包中的按行函数,先以向量化的方式筛掉不满足 SmplMode(y)、mean(y)、diff(range(y)) 条件的行,只对剩下的少量行用 rowwise 计算中位数,这样可以显著加快速度。以下代码在我的笔记本电脑上运行大约 0.4 秒,而您的原始解决方案和 @shadow 的解决方案都需要大约 70 秒。
library(dplyr)
library(matrixStats)
# Fast version: evaluate the mode, range and mean conditions for all rows at
# once with vectorised row operations, and use rowwise() only for the median
# of the rows that pass those filters.
df <- data.frame(dt)
df$m <- rowMaxs(dt) # largest count in each row, for SmplMode(y)
# Every row of S is 1..6, i.e. the value that each column of dt counts.
S <- matrix(1:6, ncol = ncol(dt), nrow = nrow(dt), byrow = TRUE)
# Z keeps the value where its count is non-zero and NA elsewhere, so the row
# max/min of Z are the largest/smallest values present in the sample.
Z <- S * (dt != 0)
Z[Z == 0] <- NA
# for diff(range(y))
df$Range <- rowMaxs(Z, na.rm = TRUE) - rowMins(Z, na.rm = TRUE)
df$Mean <- rowSums(S * dt) / rowSums(dt) # for mean(y)
res <- df %>%
  filter(
    # unique mode 4: X4 equals the row maximum and no other column ties
    X4 == m,
    (X1 == m) + (X2 == m) + (X3 == m) + (X4 == m) + (X5 == m) + (X6 == m) == 1,
    Range == 4, # range condition here
    Mean == 3 # mean condition here
  ) %>%
  rowwise() %>%
  mutate(Med = median(rep(1:6, c(X1, X2, X3, X4, X5, X6)))) %>%
  ungroup() %>% # drop the row-wise grouping before the remaining verbs
  filter(Med == 3.5) %>% # median condition here
  select(-m, -Range, -Mean, -Med) %>% # get rid of the helper columns
  as.matrix()