如何检测和删除 R 中循环内的异常值?
How to detect and remove outliers within loop in R?
希望有人能为绝望的学生提供帮助:-)
我有一组手术代码(procedure code),每个代码对应数量不等的手术(代码中称为 procedure)及其各自的持续时间。我想得到一些关于持续时间的描述性统计数据。为此,我希望在循环中先用 IQR 方法检测并删除异常值。下面是尚未加入异常值检测和删除的代码:
# Descriptive statistics of procedure durations (in minutes), one row per
# procedure code.
#
# Objects read from the workspace (they are defined outside this section):
#   a_g_procedures1                     data with the columns ProcedureCode,
#                                       TimeIn_1, TimeIn_2 and TimeOut
#   list_of_procedurecodes              the procedure codes to summarise
#   number_of_different_procedurecodes  how many of those codes to process
# Result: ag_output, plus the per-code vectors it is assembled from.

# Timestamp layout shared by TimeIn_1, TimeIn_2 and TimeOut.
time_format <- "%d/%m/%Y %H:%M"

# variables for output - run before each loop
# Procedure_codes is grown with c() because the element type of
# list_of_procedurecodes is not known here; the numeric results are
# preallocated.
Procedure_codes <- NULL
Number <- integer(number_of_different_procedurecodes)
Min_Times <- numeric(number_of_different_procedurecodes)
Max_Times <- numeric(number_of_different_procedurecodes)
Average_Times <- numeric(number_of_different_procedurecodes)
Median_Times <- numeric(number_of_different_procedurecodes)
SD_Times <- numeric(number_of_different_procedurecodes)

# loop over all procedure codes
for (Counter0 in seq_len(number_of_different_procedurecodes)) {
  # Procedure_Name keeps the container type of list_of_procedurecodes;
  # Procedure_name is the bare value used for comparing and collecting.
  Procedure_Name <- list_of_procedurecodes[Counter0]
  Procedure_name <- unlist(Procedure_Name)
  print(Procedure_Name)

  # Subset data for the specific procedure
  a_g_procedures2 <- subset(a_g_procedures1, ProcedureCode == Procedure_name)
  number_of_procedures <- nrow(a_g_procedures2)

  # Parse whole columns at once instead of one row at a time.
  # Row-level debug printing is intentionally omitted.
  TimeIn_1 <- as.POSIXct(a_g_procedures2$TimeIn_1, format = time_format)
  TimeIn_2 <- as.POSIXct(a_g_procedures2$TimeIn_2, format = time_format)
  TimeOut <- as.POSIXct(a_g_procedures2$TimeOut, format = time_format)

  # A procedure starts at the earlier of its two recorded entry times.
  # pmin() yields NA when either entry time is missing, so such a row gets
  # an NA duration and is skipped by the na.rm = TRUE statistics below.
  Start_Time <- pmin(TimeIn_1, TimeIn_2)
  Durations <- as.numeric(difftime(TimeOut, Start_Time, units = "mins"))

  Procedure_codes <- c(Procedure_codes, Procedure_name)
  Number[Counter0] <- number_of_procedures

  # Every statistic drops NA durations, so they all describe the same rows.
  # mean() has no `digits` argument; apply round() to ag_output if rounded
  # values are wanted.
  Average_Times[Counter0] <- mean(Durations, na.rm = TRUE)
  SD_Times[Counter0] <- sd(Durations, na.rm = TRUE)
  Min_Times[Counter0] <- min(Durations, na.rm = TRUE)
  Max_Times[Counter0] <- max(Durations, na.rm = TRUE)
  Median_Times[Counter0] <- median(Durations, na.rm = TRUE)
}

ag_output <- data.frame(
  Procedure_codes, Number, Min_Times, Max_Times,
  Average_Times, Median_Times, SD_Times
)
这是我想添加到特定过程循环中的内容:
# Drop outliers from Durations using the 1.5 * IQR rule.
# Run this once per procedure code, after all of that code's durations have
# been collected in `Durations` and before the summary statistics are taken;
# a single duration has no quartiles to compare against.
Q <- quantile(Durations, probs = c(.25, .75), na.rm = TRUE)
iqr <- IQR(Durations, na.rm = TRUE)
up <- Q[2] + 1.5 * iqr
low <- Q[1] - 1.5 * iqr
# TRUE for values outside [low, up]; NA durations are never flagged.
remove <- !is.na(Durations) & (Durations < low | Durations > up)
# Index with the logical mask: setdiff() would compare the values against
# TRUE/FALSE and would also collapse duplicate durations.
Durations <- Durations[!remove]
有人知道我该怎么做吗?
非常感谢您!
让它成为一个函数?
# Remove outliers from a vector using the 1.5 * IQR rule.
#
# Duration: numeric vector of values (e.g. all durations of one procedure
#           code, in minutes).
# Returns `Duration` without the values that lie below Q1 - 1.5 * IQR or
# above Q3 + 1.5 * IQR. Order and duplicate values are preserved. NA entries
# are ignored when the quartiles are computed and are kept in the result.
f.remove_outliers_IQR <- function(Duration) {
  Q <- quantile(Duration, probs = c(.25, .75), na.rm = TRUE)
  iqr <- IQR(Duration, na.rm = TRUE)
  # [[ ]] drops the "25%"/"75%" names so the fences are plain numbers.
  up <- Q[[2]] + 1.5 * iqr
  low <- Q[[1]] - 1.5 * iqr
  # Uses only the argument, never a variable from the calling workspace.
  is_outlier <- !is.na(Duration) & (Duration < low | Duration > up)
  Duration[!is_outlier]
}
并在主循环中调用它,也许就在 Counter1<-Counter1+1 之前?
希望有人能为绝望的学生提供帮助:-) 我有一组手术代码(procedure code),每个代码对应数量不等的手术(代码中称为 procedure)及其各自的持续时间。我想得到一些关于持续时间的描述性统计数据。为此,我希望在循环中先用 IQR 方法检测并删除异常值。下面是尚未加入异常值检测和删除的代码:
# Descriptive statistics of procedure durations (in minutes), one row per
# procedure code.
#
# Objects read from the workspace (they are defined outside this section):
#   a_g_procedures1                     data with the columns ProcedureCode,
#                                       TimeIn_1, TimeIn_2 and TimeOut
#   list_of_procedurecodes              the procedure codes to summarise
#   number_of_different_procedurecodes  how many of those codes to process
# Result: ag_output, plus the per-code vectors it is assembled from.

# Timestamp layout shared by TimeIn_1, TimeIn_2 and TimeOut.
time_format <- "%d/%m/%Y %H:%M"

# variables for output - run before each loop
# Procedure_codes is grown with c() because the element type of
# list_of_procedurecodes is not known here; the numeric results are
# preallocated.
Procedure_codes <- NULL
Number <- integer(number_of_different_procedurecodes)
Min_Times <- numeric(number_of_different_procedurecodes)
Max_Times <- numeric(number_of_different_procedurecodes)
Average_Times <- numeric(number_of_different_procedurecodes)
Median_Times <- numeric(number_of_different_procedurecodes)
SD_Times <- numeric(number_of_different_procedurecodes)

# loop over all procedure codes
for (Counter0 in seq_len(number_of_different_procedurecodes)) {
  # Procedure_Name keeps the container type of list_of_procedurecodes;
  # Procedure_name is the bare value used for comparing and collecting.
  Procedure_Name <- list_of_procedurecodes[Counter0]
  Procedure_name <- unlist(Procedure_Name)
  print(Procedure_Name)

  # Subset data for the specific procedure
  a_g_procedures2 <- subset(a_g_procedures1, ProcedureCode == Procedure_name)
  number_of_procedures <- nrow(a_g_procedures2)

  # Parse whole columns at once instead of one row at a time.
  # Row-level debug printing is intentionally omitted.
  TimeIn_1 <- as.POSIXct(a_g_procedures2$TimeIn_1, format = time_format)
  TimeIn_2 <- as.POSIXct(a_g_procedures2$TimeIn_2, format = time_format)
  TimeOut <- as.POSIXct(a_g_procedures2$TimeOut, format = time_format)

  # A procedure starts at the earlier of its two recorded entry times.
  # pmin() yields NA when either entry time is missing, so such a row gets
  # an NA duration and is skipped by the na.rm = TRUE statistics below.
  Start_Time <- pmin(TimeIn_1, TimeIn_2)
  Durations <- as.numeric(difftime(TimeOut, Start_Time, units = "mins"))

  Procedure_codes <- c(Procedure_codes, Procedure_name)
  Number[Counter0] <- number_of_procedures

  # Every statistic drops NA durations, so they all describe the same rows.
  # mean() has no `digits` argument; apply round() to ag_output if rounded
  # values are wanted.
  Average_Times[Counter0] <- mean(Durations, na.rm = TRUE)
  SD_Times[Counter0] <- sd(Durations, na.rm = TRUE)
  Min_Times[Counter0] <- min(Durations, na.rm = TRUE)
  Max_Times[Counter0] <- max(Durations, na.rm = TRUE)
  Median_Times[Counter0] <- median(Durations, na.rm = TRUE)
}

ag_output <- data.frame(
  Procedure_codes, Number, Min_Times, Max_Times,
  Average_Times, Median_Times, SD_Times
)
这是我想添加到特定过程循环中的内容:
# Drop outliers from Durations using the 1.5 * IQR rule.
# Run this once per procedure code, after all of that code's durations have
# been collected in `Durations` and before the summary statistics are taken;
# a single duration has no quartiles to compare against.
Q <- quantile(Durations, probs = c(.25, .75), na.rm = TRUE)
iqr <- IQR(Durations, na.rm = TRUE)
up <- Q[2] + 1.5 * iqr
low <- Q[1] - 1.5 * iqr
# TRUE for values outside [low, up]; NA durations are never flagged.
remove <- !is.na(Durations) & (Durations < low | Durations > up)
# Index with the logical mask: setdiff() would compare the values against
# TRUE/FALSE and would also collapse duplicate durations.
Durations <- Durations[!remove]
有人知道我该怎么做吗?
非常感谢您!
让它成为一个函数?
# Remove outliers from a vector using the 1.5 * IQR rule.
#
# Duration: numeric vector of values (e.g. all durations of one procedure
#           code, in minutes).
# Returns `Duration` without the values that lie below Q1 - 1.5 * IQR or
# above Q3 + 1.5 * IQR. Order and duplicate values are preserved. NA entries
# are ignored when the quartiles are computed and are kept in the result.
f.remove_outliers_IQR <- function(Duration) {
  Q <- quantile(Duration, probs = c(.25, .75), na.rm = TRUE)
  iqr <- IQR(Duration, na.rm = TRUE)
  # [[ ]] drops the "25%"/"75%" names so the fences are plain numbers.
  up <- Q[[2]] + 1.5 * iqr
  low <- Q[[1]] - 1.5 * iqr
  # Uses only the argument, never a variable from the calling workspace.
  is_outlier <- !is.na(Duration) & (Duration < low | Duration > up)
  Duration[!is_outlier]
}
并在主循环中调用它,也许就在 Counter1<-Counter1+1 之前?