带 dplyr 的条件线性拟合
Conditional linear fitting with dplyr
我正在尝试对分组数据进行线性拟合。
不过,我还想在拟合时加入一些条件,但当我按这些条件对数据取子集时,拟合并没有按预期进行。
set.seed(183)
library(dplyr)
# Example data: three groups (a, b, c) with eight observations each.
V <- rep(1:8, times = 3)
# Every group starts with five sorted uniform draws. Group a then ends in three
# zeros; groups b and c end in two zeros followed by one more uniform draw.
value <- c(
  sort(runif(5, min = 0.001, max = 1)), rep(0, times = 3),
  sort(runif(5, min = 0.001, max = 1)), rep(0, times = 2),
  runif(1, min = 0.001, max = 1),
  sort(runif(5, min = 0.001, max = 1)), rep(0, times = 2),
  runif(1, min = 0.001, max = 1)
)
group <- rep(c("a", "b", "c"), each = 8)
df <- data.frame(group, V, value)
# > df
# group V value
#1 a 1 0.15087459
#2 a 2 0.35408406
#3 a 3 0.47339320
#4 a 4 0.67614665
#5 a 5 0.98273932
#6 a 6 0.00000000
#7 a 7 0.00000000
#8 a 8 0.00000000
#9 b 1 0.32821476
#10 b 2 0.35737009
#11 b 3 0.58821689
#12 b 4 0.81088053
#13 b 5 0.99122633
#14 b 6 0.00000000
#15 b 7 0.00000000
#16 b 8 0.03697432
#17 c 1 0.12940226
#18 c 2 0.41918905
#19 c 3 0.66020739
#20 c 4 0.84124155
#21 c 5 0.95052213
#22 c 6 0.00000000
#23 c 7 0.00000000
#24 c 8 0.15071444
我在各组内的条件是
1) if all last 3 value==0
fit only when V>=4&V<=5
2)if any last 2 value>0
fit only when V>=7
.
这是我为执行此操作而编写的函数
# Return the slope of value ~ V for `df`, fitted on a V window that is chosen
# from the last entries of df$value.
get_slope <- function(df){
# NOTE(review): this comparison yields three logicals, but `if` expects a
# single one; this is what produces the "condition has length > 1" warning.
if (tail(df$value,3)==0)
slp = coef(lm(value~V, data=subset(df,V>=4&V<=5)))[2]
else
# NOTE(review): `>= 0` holds for every non-negative value, so this check does
# not distinguish anything; the stated rule is "any of the last 2 values > 0".
if (any(tail(df$value,3)>=0))
slp = coef(lm(value ~ V, data=subset(df,V>=7)))[2]
return(slp)
}
# Call get_slope() once per group and store each result under `slope`.
# NOTE(review): the call passes the full `df` rather than the current group
# (`.`), so every group is fitted on the same data.
df_slope <- df%>%
group_by(group)%>%
do(.,slope=get_slope(df))
Warning messages:
1: In if (tail(df$value, 3) == 0) slp = coef(lm(value ~ V, data = subset(df, :
the condition has length > 1 and only the first element will be used
2: In if (tail(df$value, 3) == 0) slp = coef(lm(value ~ V, data = subset(df, :
the condition has length > 1 and only the first element will be used
3: In if (tail(df$value, 3) == 0) slp = coef(lm(value ~ V, data = subset(df, :
the condition has length > 1 and only the first element will be used
最后我想得到每组的斜率值。
有没有简单的方法来做到这一点?
非常感谢!
条件 tail(df$value,3)==0
会返回 3 个 TRUE/FALSE 值,而 if 只会使用其中的第一个,因此出现了上面的警告。我在下面的函数中把判断顺序调换了过来,并使用 split 和 plyr::ldply 代替 dplyr。
# Split `df` into a list holding one data frame per value of `group`.
df1 <- split(df, f = df$group)
# Slope of value ~ V for one group's data frame.
#
# df: data frame with numeric columns `V` and `value`.
# Returns the `V` coefficient of the linear fit (a length-1 named vector).
get_slope <- function(df) {
  last_three <- tail(df$value, 3)
  # All-zero tail: fit on the V in [4, 5] window.
  if (all(last_three == 0)) {
    window <- subset(df, V >= 4 & V <= 5)
    return(coef(lm(value ~ V, data = window))[2])
  }
  # Otherwise at least one of the last three values is non-zero: fit on V >= 7.
  window <- subset(df, V >= 7)
  coef(lm(value ~ V, data = window))[2]
}
# plyr supplies ldply(): apply get_slope() to each element of the list `df1`
# and combine the results into one data frame.
library(plyr)
ldply(.data = df1, .fun = get_slope)
.id V
1 a 0.06940913
2 b 0.20794964
3 c 0.84607397
请求的 dplyr 方法:
# Grouped dplyr version: run get_slope() on each group (`.` is the current
# group's data) and keep the result in a `slope` column.
df_slope <- df %>%
  group_by(group) %>%
  do(., slope = get_slope(.))
# do() stores `slope` as a list; flatten it to a plain numeric column.
df_slope$slope <- as.numeric(unlist(df_slope$slope))
df_slope
Source: local data frame [3 x 2]
Groups: <by row>
group slope
(fctr) (dbl)
1 a 0.06940913
2 b 0.20794964
3 c 0.84607397
我想到的是循环所有级别并应用函数。
1.set 接收所有结果的数组
# Start with an empty result; one slope per group is appended to it later.
slp <- c()
2. 遍历所有分组,对满足条件的分组执行相应的拟合。
# Append one slope per group to `slp`, in order of first appearance, naming
# each entry after its group.
# The loop variable is `grp` so the top-level `group` vector is not overwritten.
for (grp in unique(df$group)) {
  # Fit on this group's rows only: fitting on the full `df` gives every group
  # that takes the same branch an identical slope.
  grp_rows <- df[which(df$group == grp), ]
  if (all(tail(grp_rows$value, 3) == 0)) {
    # Rule 1: the last three values are all zero -> fit on 4 <= V <= 5.
    fit_rows <- subset(grp_rows, V >= 4 & V <= 5)
  } else if (any(tail(grp_rows$value, 2) > 0)) {
    # Rule 2: any of the last two values is strictly positive -> fit on V >= 7.
    # (`>= 0` would be true for every non-negative value.)
    fit_rows <- subset(grp_rows, V >= 7)
  } else {
    # Neither rule applies.
    fit_rows <- NULL
  }
  grp_slope <- if (is.null(fit_rows)) {
    # Keep a placeholder so `slp` stays aligned with the groups.
    NA_real_
  } else {
    unname(coef(lm(value ~ V, data = fit_rows))[2])
  }
  slp <- c(slp, setNames(grp_slope, grp))
}
3.Print 结果
slp
a b c
0.06448301 0.55057826 0.55057826
我的解决方案可能不像预期的那样简单,但应该很容易理解,希望对您有所帮助。
我正在尝试对分组数据进行线性拟合。
不过,我还想在拟合时加入一些条件,但当我按这些条件对数据取子集时,拟合并没有按预期进行。
set.seed(183)
library(dplyr)
# Example data: three groups (a, b, c) with eight observations each.
V <- rep(1:8, times = 3)
# Every group starts with five sorted uniform draws. Group a then ends in three
# zeros; groups b and c end in two zeros followed by one more uniform draw.
value <- c(
  sort(runif(5, min = 0.001, max = 1)), rep(0, times = 3),
  sort(runif(5, min = 0.001, max = 1)), rep(0, times = 2),
  runif(1, min = 0.001, max = 1),
  sort(runif(5, min = 0.001, max = 1)), rep(0, times = 2),
  runif(1, min = 0.001, max = 1)
)
group <- rep(c("a", "b", "c"), each = 8)
df <- data.frame(group, V, value)
# > df
# group V value
#1 a 1 0.15087459
#2 a 2 0.35408406
#3 a 3 0.47339320
#4 a 4 0.67614665
#5 a 5 0.98273932
#6 a 6 0.00000000
#7 a 7 0.00000000
#8 a 8 0.00000000
#9 b 1 0.32821476
#10 b 2 0.35737009
#11 b 3 0.58821689
#12 b 4 0.81088053
#13 b 5 0.99122633
#14 b 6 0.00000000
#15 b 7 0.00000000
#16 b 8 0.03697432
#17 c 1 0.12940226
#18 c 2 0.41918905
#19 c 3 0.66020739
#20 c 4 0.84124155
#21 c 5 0.95052213
#22 c 6 0.00000000
#23 c 7 0.00000000
#24 c 8 0.15071444
我在各组内的条件是
1) if all last 3 value==0
fit only when V>=4&V<=5
2)if any last 2 value>0
fit only when V>=7
.
这是我为执行此操作而编写的函数
# Return the slope of value ~ V for `df`, fitted on a V window that is chosen
# from the last entries of df$value.
get_slope <- function(df){
# NOTE(review): this comparison yields three logicals, but `if` expects a
# single one; this is what produces the "condition has length > 1" warning.
if (tail(df$value,3)==0)
slp = coef(lm(value~V, data=subset(df,V>=4&V<=5)))[2]
else
# NOTE(review): `>= 0` holds for every non-negative value, so this check does
# not distinguish anything; the stated rule is "any of the last 2 values > 0".
if (any(tail(df$value,3)>=0))
slp = coef(lm(value ~ V, data=subset(df,V>=7)))[2]
return(slp)
}
# Call get_slope() once per group and store each result under `slope`.
# NOTE(review): the call passes the full `df` rather than the current group
# (`.`), so every group is fitted on the same data.
df_slope <- df%>%
group_by(group)%>%
do(.,slope=get_slope(df))
Warning messages:
1: In if (tail(df$value, 3) == 0) slp = coef(lm(value ~ V, data = subset(df, :
the condition has length > 1 and only the first element will be used
2: In if (tail(df$value, 3) == 0) slp = coef(lm(value ~ V, data = subset(df, :
the condition has length > 1 and only the first element will be used
3: In if (tail(df$value, 3) == 0) slp = coef(lm(value ~ V, data = subset(df, :
the condition has length > 1 and only the first element will be used
最后我想得到每组的斜率值。
有没有简单的方法来做到这一点?
非常感谢!
条件 tail(df$value,3)==0
会返回 3 个 TRUE/FALSE 值,而 if 只会使用其中的第一个,因此出现了上面的警告。我在下面的函数中把判断顺序调换了过来,并使用 split 和 plyr::ldply 代替 dplyr。
# Split `df` into a list holding one data frame per value of `group`.
df1 <- split(df, f = df$group)
# Slope of value ~ V for one group's data frame.
#
# df: data frame with numeric columns `V` and `value`.
# Returns the `V` coefficient of the linear fit (a length-1 named vector).
get_slope <- function(df) {
  last_three <- tail(df$value, 3)
  # All-zero tail: fit on the V in [4, 5] window.
  if (all(last_three == 0)) {
    window <- subset(df, V >= 4 & V <= 5)
    return(coef(lm(value ~ V, data = window))[2])
  }
  # Otherwise at least one of the last three values is non-zero: fit on V >= 7.
  window <- subset(df, V >= 7)
  coef(lm(value ~ V, data = window))[2]
}
# plyr supplies ldply(): apply get_slope() to each element of the list `df1`
# and combine the results into one data frame.
library(plyr)
ldply(.data = df1, .fun = get_slope)
.id V
1 a 0.06940913
2 b 0.20794964
3 c 0.84607397
请求的 dplyr 方法:
# Grouped dplyr version: run get_slope() on each group (`.` is the current
# group's data) and keep the result in a `slope` column.
df_slope <- df %>%
  group_by(group) %>%
  do(., slope = get_slope(.))
# do() stores `slope` as a list; flatten it to a plain numeric column.
df_slope$slope <- as.numeric(unlist(df_slope$slope))
df_slope
Source: local data frame [3 x 2]
Groups: <by row>
group slope
(fctr) (dbl)
1 a 0.06940913
2 b 0.20794964
3 c 0.84607397
我想到的是循环所有级别并应用函数。
1.set 接收所有结果的数组
# Start with an empty result; one slope per group is appended to it later.
slp <- c()
2. 遍历所有分组,对满足条件的分组执行相应的拟合。
# Append one slope per group to `slp`, in order of first appearance, naming
# each entry after its group.
# The loop variable is `grp` so the top-level `group` vector is not overwritten.
for (grp in unique(df$group)) {
  # Fit on this group's rows only: fitting on the full `df` gives every group
  # that takes the same branch an identical slope.
  grp_rows <- df[which(df$group == grp), ]
  if (all(tail(grp_rows$value, 3) == 0)) {
    # Rule 1: the last three values are all zero -> fit on 4 <= V <= 5.
    fit_rows <- subset(grp_rows, V >= 4 & V <= 5)
  } else if (any(tail(grp_rows$value, 2) > 0)) {
    # Rule 2: any of the last two values is strictly positive -> fit on V >= 7.
    # (`>= 0` would be true for every non-negative value.)
    fit_rows <- subset(grp_rows, V >= 7)
  } else {
    # Neither rule applies.
    fit_rows <- NULL
  }
  grp_slope <- if (is.null(fit_rows)) {
    # Keep a placeholder so `slp` stays aligned with the groups.
    NA_real_
  } else {
    unname(coef(lm(value ~ V, data = fit_rows))[2])
  }
  slp <- c(slp, setNames(grp_slope, grp))
}
3.Print 结果
slp
a b c
0.06448301 0.55057826 0.55057826
我的解决方案可能不像预期的那样简单,但应该很容易理解,希望对您有所帮助。