R for循环在应用于函数时出错
R for loop going wrong when applied to function
我正在尝试使用 for 循环来使 运行ning 我开发的函数更有效率。
但是,当我将它放在 for 循环中时,它会覆盖不应该覆盖的列并返回不正确的结果。
编辑:错误是在结果数据帧 MiSeq_Bord_Outliers_table0 中,包含标签 Outlier_type 的结果列返回不正确的输出。
根据 Outlier_Hunter 函数,当 Avg_Trim_Cov 和 S2_Total_Read_Pairs_Processed 低于它们的值时
各自的 Q1 阈值它们各自的 Outlier_type 列应显示为“Lower_Outlier”,如果在 Q1 和 Q3 阈值之间,则为“正常”,如果高于 Q3 阈值则为“Upper_outlier”。但是当执行for循环时,Outlier_type列中只显示“Upper_outlier”。
编辑:输入已经过简化,并在不同的计算机上使用干净的控制台进行了测试。如果说那里之前有什么神器,现在应该已经消除了,现在这里应该不会有什么差错。首先 运行 outlier_results_1var 部分很重要。如果您测试 运行 这段代码并遇到错误,请告诉我哪一部分失败了。
编辑:MiSeq_Bord_Outliers_table0_error 是正在重现的错误。这是错误结果,不是输入。
有人可以告诉我为什么返回这些不正确的结果以及我可以做些什么来解决这个问题吗?我将在下面上传相关代码。还是有另一种不用 for 循环的方法?
#libraries used
library(tidyverse)
library(datapasta)
library(data.table)
library(janitor)
library(ggpubr)
library(labeling)
#2.) Outlier_Hunter Function
#Function to Generate the Outlier table
#Outlier Hunter function takes 4 arguments: the dataset, column/variable of interest,
#Q1 and Q3. Q1 and Q3 are stored in the results of Quartile_Hunter.
#Input ex: MiSeq_Bord_final_report0, Avg_Trim_Cov, MiSeq_Bord_Quartiles_ATC$First_Quartile[1], MiSeq_Bord_Quartiles_ATC$Third_Quartile[1]
#Usage ex: Outlier_Hunter(MiSeq_Bord_final_report0, Avg_Trim_Cov,
#MiSeq_Bord_Quartiles_ATC$First_Quartile[1], MiSeq_Bord_Quartiles_ATC$Third_Quartile[1])
#Here is the Function to get the Outlier Table
Outlier_Hunter <- function(Platform_Genus_final_report0, my_col, Q1, Q3) {
#set up and generalize the variable name you want to work with
varname <- enquo(my_col)
#print(varname) #just to see what variable the function is working with
#get the outliers
Platform_Genus_Variable_Outliers <- Platform_Genus_final_report0 %>%
select(ReadID, Platform, Genus, !!varname) %>%
#Tell if it is an outlier, and if so, what kind of outlier
mutate(
Q1_Threshold = Q1,
Q3_Threshold = Q3,
Outlier_type =
case_when(
!!varname < Q1_Threshold ~ "Lower_Outlier",
!!varname >= Q1_Threshold & !!varname <= Q3_Threshold ~ "Normal",
!!varname > Q3_Threshold ~ "Upper_Outlier"
)
)
}
#MiSeq_Bord_Quartiles entries
MiSeq_Bord_Quartiles <- data.frame(
stringsAsFactors = FALSE,
row.names = c("Avg_Trim_Cov", "S2_Total_Read_Pairs_Processed"),
Platform = c("MiSeq", "MiSeq"),
Genus = c("Bord", "Bord"),
Min = c(0.03, 295),
First_Quartile = c(80.08, 687613.25),
Median = c(97.085, 818806.5),
Third_Quartile = c(121.5625, 988173.75),
Max = c(327.76, 2836438)
)
#Remove the hashtag below to test if what you have is correct
#datapasta::df_paste(head(MiSeq_Bord_Quartiles, 5))
#dataset entry
MiSeq_Bord_final_report0 <- data.frame(
stringsAsFactors = FALSE,
ReadID = c("A005_20160223_S11_L001","A050_20210122_S6_L001",
"A073_20210122_S7_L001",
"A076_20210426_S11_L001",
"A080_20210426_S12_L001"),
Platform = c("MiSeq","MiSeq",
"MiSeq","MiSeq","MiSeq"),
Genus = c("Bordetella",
"Bordetella","Bordetella",
"Bordetella","Bordetella"),
Avg_Raw_Read_bp = c(232.85,241.09,
248.54,246.99,248.35),
Avg_Trimmed_Read_bp = c(204.32,232.6,
238.56,242.54,244.91),
Avg_Trim_Cov = c(72.04,101.05,
92.81,41.77,54.83),
Genome_Size_Mb = c(4.1, 4.1, 4.1, 4.1, 4.1),
S1_Input_reads = c(1450010L,
1786206L,1601542L,710792L,925462L),
S1_Contaminant_reads = c(12220L,6974L,
7606L,1076L,1782L),
S1_Total_reads_removed = c(12220L,6974L,
7606L,1076L,1782L),
S1_Result_reads = c(1437790L,
1779232L,1593936L,709716L,923680L),
S2_Read_Pairs_Written = c(712776L,882301L,
790675L,352508L,459215L),
S2_Total_Read_Pairs_Processed = c(718895L,889616L,
796968L,354858L,461840L)
)
MiSeq_Bord_final_report0
#Execution for 1 variable
outlier_results_1var <- Outlier_Hunter(MiSeq_Bord_final_report0, Avg_Trim_Cov,
MiSeq_Bord_Quartiles$First_Quartile[1], MiSeq_Bord_Quartiles$Third_Quartile[1])
#Now do it with a for loop
col_var_outliers <- row.names(MiSeq_Bord_Quartiles)
#col_var_outliers <- c("Avg_Trim_Cov", "S2_Total_Read_Pairs_Processed")
#change line above to change input of variables few into Outlier Hunter Function
outlier_list_MiSeq_Bord <- list()
for (y in col_var_outliers) {
outlier_results0 <- Outlier_Hunter(MiSeq_Bord_final_report0, y, MiSeq_Bord_Quartiles[y, "First_Quartile"], MiSeq_Bord_Quartiles[y, "Third_Quartile"])
outlier_results1 <- outlier_results0
colnames(outlier_results1)[5:7] <- paste0(y, "_", colnames(outlier_results1[, c(5:7)]), sep = "")
outlier_list_MiSeq_Bord[[y]] <- outlier_results1
}
MiSeq_Bord_Outliers_table0 <- reduce(outlier_list_MiSeq_Bord, left_join, by = c("ReadID", "Platform", "Genus"))
#the columns containing label Outlier_type is where the code goes wrong.
#When Avg_Trim_Cov and S2_Total_Read_Pairs_Processed are below their
#respective Q1 Thresholds their respective Outlier_type columns should read
#"Lower_Outlier", if between Q1 & Q3 Threshold, "Normal" and if above Q3
#Threshold then "Upper_outlier". But when the for loop is executed, only
"Upper_outlier" is shown in the Outlier_type columns.
datapasta::df_paste(head(MiSeq_Bord_Outliers_table0, 5))
MiSeq_Bord_Outliers_table0_error <- data.frame(
stringsAsFactors = FALSE,
ReadID = c("A005_20160223_S11_L001",
"A050_20210122_S6_L001",
"A073_20210122_S7_L001","A076_20210426_S11_L001",
"A080_20210426_S12_L001"),
Platform = c("MiSeq",
"MiSeq","MiSeq","MiSeq",
"MiSeq"),
Genus = c("Bordetella","Bordetella","Bordetella",
"Bordetella","Bordetella"),
Avg_Trim_Cov = c(72.04,
101.05,92.81,41.77,54.83),
Avg_Trim_Cov_Q1_Threshold = c(80.08,
80.08,80.08,80.08,80.08),
Avg_Trim_Cov_Q3_Threshold = c(121.5625,
121.5625,121.5625,121.5625,
121.5625),
Avg_Trim_Cov_Outlier_type = c("Upper_Outlier","Upper_Outlier",
"Upper_Outlier","Upper_Outlier",
"Upper_Outlier"),
S2_Total_Read_Pairs_Processed = c(718895L,
889616L,796968L,354858L,
461840L),
S2_Total_Read_Pairs_Processed_Q1_Threshold = c(687613.25,
687613.25,687613.25,
687613.25,687613.25),
S2_Total_Read_Pairs_Processed_Q3_Threshold = c(988173.75,
988173.75,988173.75,
988173.75,988173.75),
S2_Total_Read_Pairs_Processed_Outlier_type = c("Upper_Outlier","Upper_Outlier",
"Upper_Outlier","Upper_Outlier",
"Upper_Outlier")
)
为了像您一样在循环中使用,编写 Outlier_Hunter()
函数以将目标列作为字符串而不是表达式会更有用。
为此,请尝试将函数中 !!varname
的所有实例替换为 .data[[my_col]]
,并完全删除 enquo()
行。
请注意,进行这些更改后,您还需要更改在变量中没有列名时调用函数的方式。例如,您的单次执行将变为:
Outlier_Hunter(
MiSeq_Bord_final_report0,
"Avg_Trim_Cov",
MiSeq_Bord_Quartiles$First_Quartile[1],
MiSeq_Bord_Quartiles$Third_Quartile[1]
)
有关使用 tidy 求值函数进行编程的更多信息,您可能会发现 this rlang vignette 有用。
我正在尝试使用 for 循环来使 运行ning 我开发的函数更有效率。 但是,当我将它放在 for 循环中时,它会覆盖不应该覆盖的列并返回不正确的结果。
编辑:错误是在结果数据帧 MiSeq_Bord_Outliers_table0 中,包含标签 Outlier_type 的结果列返回不正确的输出。
根据 Outlier_Hunter 函数,当 Avg_Trim_Cov 和 S2_Total_Read_Pairs_Processed 低于它们的值时
各自的 Q1 阈值它们各自的 Outlier_type 列应显示为“Lower_Outlier”,如果在 Q1 和 Q3 阈值之间,则为“正常”,如果高于 Q3 阈值则为“Upper_outlier”。但是当执行for循环时,Outlier_type列中只显示“Upper_outlier”。
编辑:输入已经过简化,并在不同的计算机上使用干净的控制台进行了测试。如果说那里之前有什么神器,现在应该已经消除了,现在这里应该不会有什么差错。首先 运行 outlier_results_1var 部分很重要。如果您测试 运行 这段代码并遇到错误,请告诉我哪一部分失败了。
编辑:MiSeq_Bord_Outliers_table0_error 是正在重现的错误。这是错误结果,不是输入。
有人可以告诉我为什么返回这些不正确的结果以及我可以做些什么来解决这个问题吗?我将在下面上传相关代码。还是有另一种不用 for 循环的方法?
#libraries used
library(tidyverse)
library(datapasta)
library(data.table)
library(janitor)
library(ggpubr)
library(labeling)
#2.) Outlier_Hunter Function
#Function to Generate the Outlier table
#Outlier Hunter function takes 4 arguments: the dataset, column/variable of interest,
#Q1 and Q3. Q1 and Q3 are stored in the results of Quartile_Hunter.
#Input ex: MiSeq_Bord_final_report0, Avg_Trim_Cov, MiSeq_Bord_Quartiles_ATC$First_Quartile[1], MiSeq_Bord_Quartiles_ATC$Third_Quartile[1]
#Usage ex: Outlier_Hunter(MiSeq_Bord_final_report0, Avg_Trim_Cov,
#MiSeq_Bord_Quartiles_ATC$First_Quartile[1], MiSeq_Bord_Quartiles_ATC$Third_Quartile[1])
#Here is the Function to get the Outlier Table
Outlier_Hunter <- function(Platform_Genus_final_report0, my_col, Q1, Q3) {
#set up and generalize the variable name you want to work with
varname <- enquo(my_col)
#print(varname) #just to see what variable the function is working with
#get the outliers
Platform_Genus_Variable_Outliers <- Platform_Genus_final_report0 %>%
select(ReadID, Platform, Genus, !!varname) %>%
#Tell if it is an outlier, and if so, what kind of outlier
mutate(
Q1_Threshold = Q1,
Q3_Threshold = Q3,
Outlier_type =
case_when(
!!varname < Q1_Threshold ~ "Lower_Outlier",
!!varname >= Q1_Threshold & !!varname <= Q3_Threshold ~ "Normal",
!!varname > Q3_Threshold ~ "Upper_Outlier"
)
)
}
#MiSeq_Bord_Quartiles entries
MiSeq_Bord_Quartiles <- data.frame(
stringsAsFactors = FALSE,
row.names = c("Avg_Trim_Cov", "S2_Total_Read_Pairs_Processed"),
Platform = c("MiSeq", "MiSeq"),
Genus = c("Bord", "Bord"),
Min = c(0.03, 295),
First_Quartile = c(80.08, 687613.25),
Median = c(97.085, 818806.5),
Third_Quartile = c(121.5625, 988173.75),
Max = c(327.76, 2836438)
)
#Remove the hashtag below to test if what you have is correct
#datapasta::df_paste(head(MiSeq_Bord_Quartiles, 5))
#dataset entry
MiSeq_Bord_final_report0 <- data.frame(
stringsAsFactors = FALSE,
ReadID = c("A005_20160223_S11_L001","A050_20210122_S6_L001",
"A073_20210122_S7_L001",
"A076_20210426_S11_L001",
"A080_20210426_S12_L001"),
Platform = c("MiSeq","MiSeq",
"MiSeq","MiSeq","MiSeq"),
Genus = c("Bordetella",
"Bordetella","Bordetella",
"Bordetella","Bordetella"),
Avg_Raw_Read_bp = c(232.85,241.09,
248.54,246.99,248.35),
Avg_Trimmed_Read_bp = c(204.32,232.6,
238.56,242.54,244.91),
Avg_Trim_Cov = c(72.04,101.05,
92.81,41.77,54.83),
Genome_Size_Mb = c(4.1, 4.1, 4.1, 4.1, 4.1),
S1_Input_reads = c(1450010L,
1786206L,1601542L,710792L,925462L),
S1_Contaminant_reads = c(12220L,6974L,
7606L,1076L,1782L),
S1_Total_reads_removed = c(12220L,6974L,
7606L,1076L,1782L),
S1_Result_reads = c(1437790L,
1779232L,1593936L,709716L,923680L),
S2_Read_Pairs_Written = c(712776L,882301L,
790675L,352508L,459215L),
S2_Total_Read_Pairs_Processed = c(718895L,889616L,
796968L,354858L,461840L)
)
MiSeq_Bord_final_report0
#Execution for 1 variable
outlier_results_1var <- Outlier_Hunter(MiSeq_Bord_final_report0, Avg_Trim_Cov,
MiSeq_Bord_Quartiles$First_Quartile[1], MiSeq_Bord_Quartiles$Third_Quartile[1])
#Now do it with a for loop
col_var_outliers <- row.names(MiSeq_Bord_Quartiles)
#col_var_outliers <- c("Avg_Trim_Cov", "S2_Total_Read_Pairs_Processed")
#change line above to change input of variables few into Outlier Hunter Function
outlier_list_MiSeq_Bord <- list()
for (y in col_var_outliers) {
outlier_results0 <- Outlier_Hunter(MiSeq_Bord_final_report0, y, MiSeq_Bord_Quartiles[y, "First_Quartile"], MiSeq_Bord_Quartiles[y, "Third_Quartile"])
outlier_results1 <- outlier_results0
colnames(outlier_results1)[5:7] <- paste0(y, "_", colnames(outlier_results1[, c(5:7)]), sep = "")
outlier_list_MiSeq_Bord[[y]] <- outlier_results1
}
MiSeq_Bord_Outliers_table0 <- reduce(outlier_list_MiSeq_Bord, left_join, by = c("ReadID", "Platform", "Genus"))
#the columns containing label Outlier_type is where the code goes wrong.
#When Avg_Trim_Cov and S2_Total_Read_Pairs_Processed are below their
#respective Q1 Thresholds their respective Outlier_type columns should read
#"Lower_Outlier", if between Q1 & Q3 Threshold, "Normal" and if above Q3
#Threshold then "Upper_outlier". But when the for loop is executed, only
"Upper_outlier" is shown in the Outlier_type columns.
datapasta::df_paste(head(MiSeq_Bord_Outliers_table0, 5))
MiSeq_Bord_Outliers_table0_error <- data.frame(
stringsAsFactors = FALSE,
ReadID = c("A005_20160223_S11_L001",
"A050_20210122_S6_L001",
"A073_20210122_S7_L001","A076_20210426_S11_L001",
"A080_20210426_S12_L001"),
Platform = c("MiSeq",
"MiSeq","MiSeq","MiSeq",
"MiSeq"),
Genus = c("Bordetella","Bordetella","Bordetella",
"Bordetella","Bordetella"),
Avg_Trim_Cov = c(72.04,
101.05,92.81,41.77,54.83),
Avg_Trim_Cov_Q1_Threshold = c(80.08,
80.08,80.08,80.08,80.08),
Avg_Trim_Cov_Q3_Threshold = c(121.5625,
121.5625,121.5625,121.5625,
121.5625),
Avg_Trim_Cov_Outlier_type = c("Upper_Outlier","Upper_Outlier",
"Upper_Outlier","Upper_Outlier",
"Upper_Outlier"),
S2_Total_Read_Pairs_Processed = c(718895L,
889616L,796968L,354858L,
461840L),
S2_Total_Read_Pairs_Processed_Q1_Threshold = c(687613.25,
687613.25,687613.25,
687613.25,687613.25),
S2_Total_Read_Pairs_Processed_Q3_Threshold = c(988173.75,
988173.75,988173.75,
988173.75,988173.75),
S2_Total_Read_Pairs_Processed_Outlier_type = c("Upper_Outlier","Upper_Outlier",
"Upper_Outlier","Upper_Outlier",
"Upper_Outlier")
)
为了像您一样在循环中使用,编写 Outlier_Hunter()
函数以将目标列作为字符串而不是表达式会更有用。
为此,请尝试将函数中 !!varname
的所有实例替换为 .data[[my_col]]
,并完全删除 enquo()
行。
请注意,进行这些更改后,您还需要更改在变量中没有列名时调用函数的方式。例如,您的单次执行将变为:
Outlier_Hunter(
MiSeq_Bord_final_report0,
"Avg_Trim_Cov",
MiSeq_Bord_Quartiles$First_Quartile[1],
MiSeq_Bord_Quartiles$Third_Quartile[1]
)
有关使用 tidy 求值函数进行编程的更多信息,您可能会发现 this rlang vignette 有用。