如何在每个块重复名称更改的代码? (与R)
How do I repeat codes with names changing at every block? (with R)
我正在处理从 QIIME 获得的几个输出,我想对其进行操作以获取箱线图的文本。每个输入都以相同的方式格式化,因此操作始终相同,但会更改源名称。对于每个输入,我想提取最后 5 行,为每个 column/sample 取一个平均值,将这些值与从 mapfile 中获取的样本实验标签(组)相关联,并将它们按照我用于制作箱线图的顺序排列在获得的所有 6 个数据中。
在 bash 中,我做了类似 `for i in GG97 GG100 SILVA97 SILVA100 NCBI RDP; do cp ${i}/alpha/collated_alpha/chao1.txt alpha_tot/${i}_chao1.txt; done` 的操作,通过 `${i}` 自动替换代码中的名称,从而多次执行同一条命令。
我正在努力寻找一种方法在 R 中做同样的事情。我想创建一个包含这些名称的向量,然后使用 for 循环,通过 `[1]`、`[2]` 等下标依次取出 i,但它不起作用:它停在 read.delim 那一行,提示在工作目录 (wd) 中找不到文件。
这是我写的处理代码。`#GG97` 注释之后的部分,会针对我正在使用的 6 个数据库(GG97 GG100 SILVA97 SILVA100 NCBI RDP)各重复一次,共 6 次。
另外,我重复这个过程 4 次,因为我有 4 个指标要使用(这里我展示的是 shannon,但我也有一份 chao1、observed_species 和 PD_whole_tree).
library(tidyverse)
library(labelled)
# Load the sample mapping file and keep only columns 1 and 4, renamed to
# SampleID and Pathology_group.
mapfile <- read.delim(file="mapfile_HC+BV.txt", check.names=FALSE);
mapfile <- mapfile[,c(1,4)]
colnames(mapfile) <- c("SampleID","Pathology_group")
#GG97
# Read the collated Shannon table for this database.
collated <- read.delim(file="alpha_diversity/GG97_shannon.txt", check.names=FALSE);
# Keep the last 5 rows, then drop the first three columns
# (presumably non-sample bookkeeping columns -- confirm against the file).
collated <- tail(collated,5); collated <- collated[,-c(1:3)]
# Reorder the remaining columns to follow the order of mapfile column 1.
collated_reorder <- collated[,match(mapfile[,1], colnames(collated))]
# Transposed mapfile: row 2 holds the Pathology_group labels, which replace
# the sample IDs as column names.
labels <- t(mapfile)
colnames(collated_reorder) <- labels[2,]
# Column-wise mean over the retained rows, reshaped into a one-row matrix.
# NOTE(review): the variable is named `mean`, the same name as base R's mean().
mean <- colMeans(collated_reorder, na.rm = FALSE, dims = 1)
mean = as.matrix(mean); mean <- t(mean)
# Stack the labels on top of the means, then transpose to one row per sample.
GG97_shannon <- as.data.frame(rbind(labels[2,],mean))
GG97_shannon <- t(GG97_shannon);
# Repeat the database label once per row.
# NOTE(review): 41 is hard-coded; presumably the number of samples in the
# mapfile -- confirm.
DB_type <- list(DB = "GG97"); DB_type <- rep(DB_type, 41)
GG97_shannon <- as.data.frame(cbind(DB_type,GG97_shannon))
colnames(GG97_shannon) <- c("DB","Group","value")
# Remove the intermediate objects from the workspace.
rm(collated,collated_reorder,DB_type,labels,mean)
这里我将所有输出粘贴在一起,冻结顺序并制作箱线图。
# Stack the six per-database tables into one long table for plotting.
alpha_shannon <- as.data.frame(rbind(
  GG97_shannon, GG100_shannon, SILVA97_shannon,
  SILVA100_shannon, NCBI_shannon, RDP_shannon
))
rownames(alpha_shannon) <- NULL
rm(GG97_shannon, GG100_shannon, SILVA97_shannon, SILVA100_shannon, NCBI_shannon, RDP_shannon)
# Freeze the factor levels in order of first appearance so that the plot
# keeps the order in which the tables were stacked.
alpha_shannon$Group <- factor(alpha_shannon$Group, unique(alpha_shannon$Group))
alpha_shannon$DB <- factor(alpha_shannon$DB, unique(alpha_shannon$DB))
library(ggplot2)
ggplot(data = alpha_shannon) +
  aes(x = DB, y = value, colour = Group) +
  geom_boxplot() +
  labs(title = 'Shannon',
       x = 'Database',
       y = 'Diversity') +
  # The complete theme must come first: theme_grey() resets every theme
  # element, so a theme() tweak placed before it would be discarded.
  theme_grey(base_size = 16) +
  theme(legend.position = 'bottom')
如何保留此代码 "DRY" 并且不需要 146 行代码一遍又一遍地重复相同的事情?谢谢!!
您没有提供Minimal reproducible example,所以这个答案不能保证正确性。
需要注意的重要一点是您使用了 rm(...),这意味着某些变量仅在特定范围内相关。因此,将这个作用域封装成一个函数。这使您的代码可重用并避免手动删除变量:
#' Summarise one collated alpha-diversity table into a long-format data frame
#'
#' @param file Path to a tab-delimited collated table, read with read.delim().
#'   The first three columns are dropped; the remaining columns must be
#'   numeric and named after the sample IDs.
#' @param DB Label of the reference database; repeated in the `DB` column.
#' @param sample_map Two-column table: sample ID first, group label second.
#'   Defaults to the `mapfile` object of the calling script, so existing
#'   calls `process(file, DB)` keep working.
#' @return A data.frame with one row per sample in `sample_map` and the
#'   columns `DB` (character), `Group` (character) and `value` (numeric mean
#'   of the last five rows of `file`).
process <- function(file, DB, sample_map = mapfile) {
  # -> Use the function parameter `file` instead of a hardcoded filename
  collated <- read.delim(file = file, check.names = FALSE)
  # Keep the last five rows and drop the three leading non-sample columns.
  collated <- tail(collated, 5)
  collated <- collated[, -c(1:3), drop = FALSE]

  # Put the sample columns in the order given by the mapping table and fail
  # with a readable message when a sample is absent from the file.
  sample_ids <- as.character(sample_map[[1]])
  idx <- match(sample_ids, colnames(collated))
  if (anyNA(idx)) {
    stop(
      "Samples missing from ", file, ": ",
      paste(sample_ids[is.na(idx)], collapse = ", "),
      call. = FALSE
    )
  }
  sample_means <- colMeans(collated[, idx, drop = FALSE], na.rm = FALSE)

  # -> Use the function parameter `DB` instead of a hardcoded string, and
  #    size it from the data instead of a hardcoded 41.
  # Building the data frame directly keeps `value` numeric; stacking labels
  # and means in one matrix would coerce the means to character.
  # -> The local variables vanish automatically when the function ends;
  #    only the value of the last expression is returned.
  data.frame(
    DB = rep(DB, length(sample_means)),
    Group = as.character(sample_map[[2]]),
    value = unname(sample_means),
    row.names = NULL,
    stringsAsFactors = FALSE
  )
}
现在您可以重复使用该块了:
# One call per database. The file names follow the pattern of the GG97 file;
# adjust them if your files are named differently.
GG97_shannon <- process(file = "alpha_diversity/GG97_shannon.txt", DB = "GG97")
GG100_shannon <- process(file = "alpha_diversity/GG100_shannon.txt", DB = "GG100")
SILVA97_shannon <- process(file = "alpha_diversity/SILVA97_shannon.txt", DB = "SILVA97")
SILVA100_shannon <- process(file = "alpha_diversity/SILVA100_shannon.txt", DB = "SILVA100")
NCBI_shannon <- process(file = "alpha_diversity/NCBI_shannon.txt", DB = "NCBI")
RDP_shannon <- process(file = "alpha_diversity/RDP_shannon.txt", DB = "RDP")
或者,您可以使用循环结构:
通用的 for 循环:
# Database labels; the dataset names and file paths are derived from them so
# the three vectors cannot get out of step. The paths follow the pattern
# "alpha_diversity/<DB>_shannon.txt".
DBs <- c("GG97", "GG100", "SILVA97", "SILVA100", "NCBI", "RDP")
datasets <- paste0(DBs, "_shannon")
files <- file.path("alpha_diversity", paste0(datasets, ".txt"))
# Preallocate a named list, then fill one element per database.
result <- vector("list", length(datasets))
names(result) <- datasets
for (i in seq_along(datasets)) {
  result[[datasets[i]]] <- process(files[i], DBs[i])
}
mapply:一种“专门化的 for”,用于并行遍历多个向量:
# The first argument is the function from above; the remaining vectors are
# walked in parallel and passed as arguments to process().
# SIMPLIFY = FALSE keeps the result a list of data frames; the default would
# try to collapse the equally shaped data frames into a matrix.
results <- mapply(process, files, DBs, SIMPLIFY = FALSE)
我正在处理从 QIIME 获得的几个输出,我想对其进行操作以获取箱线图的文本。每个输入都以相同的方式格式化,因此操作始终相同,但会更改源名称。对于每个输入,我想提取最后 5 行,为每个 column/sample 取一个平均值,将这些值与从 mapfile 中获取的样本实验标签(组)相关联,并将它们按照我用于制作箱线图的顺序排列在获得的所有 6 个数据中。
在 bash 中,我做了类似 `for i in GG97 GG100 SILVA97 SILVA100 NCBI RDP; do cp ${i}/alpha/collated_alpha/chao1.txt alpha_tot/${i}_chao1.txt; done` 的操作,通过 `${i}` 自动替换代码中的名称,从而多次执行同一条命令。
我正在努力寻找一种方法在 R 中做同样的事情。我想创建一个包含这些名称的向量,然后使用 for 循环,通过 `[1]`、`[2]` 等下标依次取出 i,但它不起作用:它停在 read.delim 那一行,提示在工作目录 (wd) 中找不到文件。
这是我写的处理代码。`#GG97` 注释之后的部分,会针对我正在使用的 6 个数据库(GG97 GG100 SILVA97 SILVA100 NCBI RDP)各重复一次,共 6 次。
另外,我重复这个过程 4 次,因为我有 4 个指标要使用(这里我展示的是 shannon,但我也有一份 chao1、observed_species 和 PD_whole_tree).
library(tidyverse)
library(labelled)
# Load the sample mapping file and keep only columns 1 and 4, renamed to
# SampleID and Pathology_group.
mapfile <- read.delim(file="mapfile_HC+BV.txt", check.names=FALSE);
mapfile <- mapfile[,c(1,4)]
colnames(mapfile) <- c("SampleID","Pathology_group")
#GG97
# Read the collated Shannon table for this database.
collated <- read.delim(file="alpha_diversity/GG97_shannon.txt", check.names=FALSE);
# Keep the last 5 rows, then drop the first three columns
# (presumably non-sample bookkeeping columns -- confirm against the file).
collated <- tail(collated,5); collated <- collated[,-c(1:3)]
# Reorder the remaining columns to follow the order of mapfile column 1.
collated_reorder <- collated[,match(mapfile[,1], colnames(collated))]
# Transposed mapfile: row 2 holds the Pathology_group labels, which replace
# the sample IDs as column names.
labels <- t(mapfile)
colnames(collated_reorder) <- labels[2,]
# Column-wise mean over the retained rows, reshaped into a one-row matrix.
# NOTE(review): the variable is named `mean`, the same name as base R's mean().
mean <- colMeans(collated_reorder, na.rm = FALSE, dims = 1)
mean = as.matrix(mean); mean <- t(mean)
# Stack the labels on top of the means, then transpose to one row per sample.
GG97_shannon <- as.data.frame(rbind(labels[2,],mean))
GG97_shannon <- t(GG97_shannon);
# Repeat the database label once per row.
# NOTE(review): 41 is hard-coded; presumably the number of samples in the
# mapfile -- confirm.
DB_type <- list(DB = "GG97"); DB_type <- rep(DB_type, 41)
GG97_shannon <- as.data.frame(cbind(DB_type,GG97_shannon))
colnames(GG97_shannon) <- c("DB","Group","value")
# Remove the intermediate objects from the workspace.
rm(collated,collated_reorder,DB_type,labels,mean)
这里我将所有输出粘贴在一起,冻结顺序并制作箱线图。
# Stack the six per-database tables into one long table for plotting.
alpha_shannon <- as.data.frame(rbind(
  GG97_shannon, GG100_shannon, SILVA97_shannon,
  SILVA100_shannon, NCBI_shannon, RDP_shannon
))
rownames(alpha_shannon) <- NULL
rm(GG97_shannon, GG100_shannon, SILVA97_shannon, SILVA100_shannon, NCBI_shannon, RDP_shannon)
# Freeze the factor levels in order of first appearance so that the plot
# keeps the order in which the tables were stacked.
alpha_shannon$Group <- factor(alpha_shannon$Group, unique(alpha_shannon$Group))
alpha_shannon$DB <- factor(alpha_shannon$DB, unique(alpha_shannon$DB))
library(ggplot2)
ggplot(data = alpha_shannon) +
  aes(x = DB, y = value, colour = Group) +
  geom_boxplot() +
  labs(title = 'Shannon',
       x = 'Database',
       y = 'Diversity') +
  # The complete theme must come first: theme_grey() resets every theme
  # element, so a theme() tweak placed before it would be discarded.
  theme_grey(base_size = 16) +
  theme(legend.position = 'bottom')
如何保留此代码 "DRY" 并且不需要 146 行代码一遍又一遍地重复相同的事情?谢谢!!
您没有提供Minimal reproducible example,所以这个答案不能保证正确性。
需要注意的重要一点是您使用了 rm(...),这意味着某些变量仅在特定范围内相关。因此,将这个作用域封装成一个函数。这使您的代码可重用并避免手动删除变量:
#' Summarise one collated alpha-diversity table into a long-format data frame
#'
#' @param file Path to a tab-delimited collated table, read with read.delim().
#'   The first three columns are dropped; the remaining columns must be
#'   numeric and named after the sample IDs.
#' @param DB Label of the reference database; repeated in the `DB` column.
#' @param sample_map Two-column table: sample ID first, group label second.
#'   Defaults to the `mapfile` object of the calling script, so existing
#'   calls `process(file, DB)` keep working.
#' @return A data.frame with one row per sample in `sample_map` and the
#'   columns `DB` (character), `Group` (character) and `value` (numeric mean
#'   of the last five rows of `file`).
process <- function(file, DB, sample_map = mapfile) {
  # -> Use the function parameter `file` instead of a hardcoded filename
  collated <- read.delim(file = file, check.names = FALSE)
  # Keep the last five rows and drop the three leading non-sample columns.
  collated <- tail(collated, 5)
  collated <- collated[, -c(1:3), drop = FALSE]

  # Put the sample columns in the order given by the mapping table and fail
  # with a readable message when a sample is absent from the file.
  sample_ids <- as.character(sample_map[[1]])
  idx <- match(sample_ids, colnames(collated))
  if (anyNA(idx)) {
    stop(
      "Samples missing from ", file, ": ",
      paste(sample_ids[is.na(idx)], collapse = ", "),
      call. = FALSE
    )
  }
  sample_means <- colMeans(collated[, idx, drop = FALSE], na.rm = FALSE)

  # -> Use the function parameter `DB` instead of a hardcoded string, and
  #    size it from the data instead of a hardcoded 41.
  # Building the data frame directly keeps `value` numeric; stacking labels
  # and means in one matrix would coerce the means to character.
  # -> The local variables vanish automatically when the function ends;
  #    only the value of the last expression is returned.
  data.frame(
    DB = rep(DB, length(sample_means)),
    Group = as.character(sample_map[[2]]),
    value = unname(sample_means),
    row.names = NULL,
    stringsAsFactors = FALSE
  )
}
现在您可以重复使用该块了:
# One call per database. The file names follow the pattern of the GG97 file;
# adjust them if your files are named differently.
GG97_shannon <- process(file = "alpha_diversity/GG97_shannon.txt", DB = "GG97")
GG100_shannon <- process(file = "alpha_diversity/GG100_shannon.txt", DB = "GG100")
SILVA97_shannon <- process(file = "alpha_diversity/SILVA97_shannon.txt", DB = "SILVA97")
SILVA100_shannon <- process(file = "alpha_diversity/SILVA100_shannon.txt", DB = "SILVA100")
NCBI_shannon <- process(file = "alpha_diversity/NCBI_shannon.txt", DB = "NCBI")
RDP_shannon <- process(file = "alpha_diversity/RDP_shannon.txt", DB = "RDP")
或者,您可以使用循环结构:
通用的 for 循环:
datasets <- c("GG97_shannon", "GG100_shannon", "SILVA97_shannon",
"SILVA100_shannon", "NCBI_shannon", "RDP_shannon")
files <- c("alpha_diversity/GG97_shannon.txt", .....)
DBs <- c("GG97", ....)
result <- list()
for(i in seq_along(datasets)){
result[[datasets[i]]] <- process(files[i], DBs[i])
}
mapply:一种“专门化的 for”,用于并行遍历多个向量:
# the first argument is the function from above, the other ones are given as arguments
# to our process(.) function
results <- mapply(process, files, DBs)