select 数据框中样本组非零的列
select columns from dataframe where groups of samples are nonzero
我有一个样本(行)× 物种(列)的数据框。另一个数据框中的一列把样本编码为若干组。我想 select 出这样的列:至少存在一个组,该组内所有样本在该列上的值都非零。
物种框架:
structure(list(Otu000132 = c(0L, 56L, 30L, 52L, 1L, 4L, 31L, 4L, 17L, 9L, 4L),
Otu000144 = c(191L, 14L, 58L, 137L, 127L, 222L, 26L, 175L, 133L, 107L, 43L),
Otu000146 = c(0L, 0L, 0L, 0L, 16L, 62L, 41L, 16L, 60L, 32L, 0L),
Otu000147 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Otu000151 = c(2L, 9L, 4L, 1L, 0L, 4L, 4L, 2L, 3L, 0L, 0L),
Otu000162 = c(2L, 1L, 0L, 0L, 1L, 1L, 0L, 2L, 1L, 0L, 0L),
Otu000164 = c(2L, 0L, 1L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Otu000174 = c(0L, 0L, 3L, 1L, 0L, 2L, 0L, 1L, 2L, 1L, 0L),
Otu000176 = c(1L, 9L, 0L, 1L, 2L, 5L, 3L, 3L, 8L, 2L, 2L),
Otu000186 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
Otu000190 = c(1L, 1L, 1L, 0L, 0L, 5L, 1L, 2L, 7L, 0L, 0L)),
.Names = c("Otu000132", "Otu000144", "Otu000146", "Otu000147",
"Otu000151", "Otu000162", "Otu000164", "Otu000174",
"Otu000176", "Otu000186", "Otu000190"),
row.names = 30:40, class = "data.frame")
分组框:
structure(c(30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
.Dim = c(11L, 2L))
期望的输出:
structure(list(Otu000132 = c(0L, 56L, 30L, 52L, 1L, 4L, 31L, 4L, 17L, 9L, 4L),
Otu000144 = c(191L, 14L, 58L, 137L, 127L, 222L, 26L, 175L, 133L, 107L, 43L),
Otu000151 = c(2L, 9L, 4L, 1L, 0L, 4L, 4L, 2L, 3L, 0L, 0L),
Otu000176 = c(1L, 9L, 0L, 1L, 2L, 5L, 3L, 3L, 8L, 2L, 2L),
Otu000190 = c(1L, 1L, 1L, 0L, 0L, 5L, 1L, 2L, 7L, 0L, 0L)),
.Names = c("Otu000132", "Otu000144", "Otu000151",
"Otu000176", "Otu000190"),
row.names = 30:40, class = "data.frame")
我觉得这应该可以用 dplyr 的 select 来实现,但我想不出该怎么做。有人能给我一些入手的建议吗?
这确实可以用 dplyr 来完成,而且是一种相当直接的方式。正如其他人指出的那样,"Otu000146" 不符合您描述的标准,因此不会包含在最终的列选择中。
library(dplyr)
library(tidyr)
# Build the list of OTU columns for which at least one sample group has no
# zero values. `species` is the samples-by-OTU data frame and `grouping` holds
# each sample's group code in its second column (rows in the same order).
df.species <- cbind(species, group = grouping[, 2]) %>% # attach the group code to every sample
  # reshape to 'long' format: one row per (sample, OTU) pair;
  # pivot_longer() replaces the superseded gather()
  pivot_longer(-group, names_to = "variable", values_to = "value") %>%
  group_by(variable, group) %>% # one cell per OTU column and sample group
  summarize(keep = all(value != 0)) %>% # TRUE where every sample of the group is non-zero
  ungroup() %>%
  group_by(variable) %>% # regroup by OTU column only
  summarize(keep = any(keep)) %>% # TRUE where at least one group met the criterion
  dplyr::filter(keep) # final list: one row per retained OTU column
variable keep
<chr> <lgl>
1 Otu000132 TRUE
2 Otu000144 TRUE
3 Otu000151 TRUE
4 Otu000176 TRUE
5 Otu000190 TRUE
# Subset the original data frame to the columns that passed the filter
df.desired <- species[, df.species[["variable"]], drop = FALSE]
Otu000132 Otu000144 Otu000151 Otu000176 Otu000190
30 0 191 2 1 1
31 56 14 9 9 1
32 30 58 4 0 1
33 52 137 1 1 0
34 1 127 0 2 0
35 4 222 4 5 5
36 31 26 4 3 1
37 4 175 2 3 2
38 17 133 3 8 7
39 9 107 0 2 0
40 4 43 0 2 0
我们用 split 把分组数据集('gp1')的第一列按第二列(gp1[,2])拆分成一个 list,然后遍历这个 list:把每个 list 元素与物种数据集的行名匹配,取出对应的行,对逻辑矩阵(x1==0)求列总和并检查其是否大于 0;接着在 Reduce 中使用 & 逐元素比较各个 list 元素的结果,最后对索引取反(!),把 TRUE 变为 FALSE(反之亦然),用来对物种数据集的列取子集。
# Keep the columns of sp1 for which at least one sample group contains no zeros.
local({
  # Sample ids (column 1 of gp1) bucketed by group code (column 2 of gp1)
  ids_by_group <- split(gp1[, 1], gp1[, 2])
  # Per group: TRUE for each column holding at least one zero in that group's rows
  zero_flags <- lapply(ids_by_group, function(ids) {
    group_rows <- sp1[match(ids, row.names(sp1)), ]
    colSums(group_rows == 0) > 0
  })
  # A column is dropped only when every group has a zero in it
  sp1[!Reduce(`&`, zero_flags)]
})
# Otu000132 Otu000144 Otu000151 Otu000176 Otu000190
#30 0 191 2 1 1
#31 56 14 9 9 1
#32 30 58 4 0 1
#33 52 137 1 1 0
#34 1 127 0 2 0
#35 4 222 4 5 5
#36 31 26 4 3 1
#37 4 175 2 3 2
#38 17 133 3 8 7
#39 9 107 0 2 0
#40 4 43 0 2 0
您可以使用 dplyr,或者只使用基础函数:
# Select the OTU columns for which at least one sample group has no zeros.
# `group` is expected to be a data frame with the sample id in column V1 and
# the group code in column V2.
# Remember the OTU column names before the merge adds key/group columns
otu_cols <- names(species)
# Attach each sample's group code by matching row names against group$V1.
# NOTE: this overwrites `species` with the merged data frame (row names reset).
species <- merge(species, group, by.x = "row.names", by.y = "V1")
# Find the lowest value in each grouping
check <- aggregate(species[otu_cols], by = list(species$V2), min)
# Sum the per-group minima, over the OTU columns only (the grouping column of
# `check` is excluded up front instead of being dropped by position later)
vars <- colSums(check[otu_cols])
# Retain variables where the sum is non-zero, indicating at least one grouping
# has no zero observations (assumes counts are non-negative)
vars <- vars[vars != 0]
# Extract the variable names
vars <- names(vars)
# Subset dataset to select variables identified above
out <- species[vars]
out
# Otu000132 Otu000144 Otu000151 Otu000176 Otu000190
#1 0 191 2 1 1
#2 56 14 9 9 1
#3 30 58 4 0 1
#4 52 137 1 1 0
#5 1 127 0 2 0
#6 4 222 4 5 5
#7 31 26 4 3 1
#8 4 175 2 3 2
#9 17 133 3 8 7
#10 9 107 0 2 0
#11 4 43 0 2 0
我有一个样本(行)× 物种(列)的数据框。另一个数据框中的一列把样本编码为若干组。我想 select 出这样的列:至少存在一个组,该组内所有样本在该列上的值都非零。
物种框架:
structure(list(Otu000132 = c(0L, 56L, 30L, 52L, 1L, 4L, 31L, 4L, 17L, 9L, 4L),
Otu000144 = c(191L, 14L, 58L, 137L, 127L, 222L, 26L, 175L, 133L, 107L, 43L),
Otu000146 = c(0L, 0L, 0L, 0L, 16L, 62L, 41L, 16L, 60L, 32L, 0L),
Otu000147 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Otu000151 = c(2L, 9L, 4L, 1L, 0L, 4L, 4L, 2L, 3L, 0L, 0L),
Otu000162 = c(2L, 1L, 0L, 0L, 1L, 1L, 0L, 2L, 1L, 0L, 0L),
Otu000164 = c(2L, 0L, 1L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Otu000174 = c(0L, 0L, 3L, 1L, 0L, 2L, 0L, 1L, 2L, 1L, 0L),
Otu000176 = c(1L, 9L, 0L, 1L, 2L, 5L, 3L, 3L, 8L, 2L, 2L),
Otu000186 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
Otu000190 = c(1L, 1L, 1L, 0L, 0L, 5L, 1L, 2L, 7L, 0L, 0L)),
.Names = c("Otu000132", "Otu000144", "Otu000146", "Otu000147",
"Otu000151", "Otu000162", "Otu000164", "Otu000174",
"Otu000176", "Otu000186", "Otu000190"),
row.names = 30:40, class = "data.frame")
分组框:
structure(c(30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
.Dim = c(11L, 2L))
期望的输出:
structure(list(Otu000132 = c(0L, 56L, 30L, 52L, 1L, 4L, 31L, 4L, 17L, 9L, 4L),
Otu000144 = c(191L, 14L, 58L, 137L, 127L, 222L, 26L, 175L, 133L, 107L, 43L),
Otu000151 = c(2L, 9L, 4L, 1L, 0L, 4L, 4L, 2L, 3L, 0L, 0L),
Otu000176 = c(1L, 9L, 0L, 1L, 2L, 5L, 3L, 3L, 8L, 2L, 2L),
Otu000190 = c(1L, 1L, 1L, 0L, 0L, 5L, 1L, 2L, 7L, 0L, 0L)),
.Names = c("Otu000132", "Otu000144", "Otu000151",
"Otu000176", "Otu000190"),
row.names = 30:40, class = "data.frame")
我觉得这应该可以用 dplyr 的 select 来实现,但我想不出该怎么做。有人能给我一些入手的建议吗?
这确实可以用 dplyr 来完成,而且是一种相当直接的方式。正如其他人指出的那样,"Otu000146" 不符合您描述的标准,因此不会包含在最终的列选择中。
library(dplyr)
library(tidyr)
# Build the list of OTU columns for which at least one sample group has no
# zero values. `species` is the samples-by-OTU data frame and `grouping` holds
# each sample's group code in its second column (rows in the same order).
df.species <- cbind(species, group = grouping[, 2]) %>% # attach the group code to every sample
  # reshape to 'long' format: one row per (sample, OTU) pair;
  # pivot_longer() replaces the superseded gather()
  pivot_longer(-group, names_to = "variable", values_to = "value") %>%
  group_by(variable, group) %>% # one cell per OTU column and sample group
  summarize(keep = all(value != 0)) %>% # TRUE where every sample of the group is non-zero
  ungroup() %>%
  group_by(variable) %>% # regroup by OTU column only
  summarize(keep = any(keep)) %>% # TRUE where at least one group met the criterion
  dplyr::filter(keep) # final list: one row per retained OTU column
variable keep
<chr> <lgl>
1 Otu000132 TRUE
2 Otu000144 TRUE
3 Otu000151 TRUE
4 Otu000176 TRUE
5 Otu000190 TRUE
# Subset the original data frame to the columns that passed the filter
df.desired <- species[, df.species[["variable"]], drop = FALSE]
Otu000132 Otu000144 Otu000151 Otu000176 Otu000190
30 0 191 2 1 1
31 56 14 9 9 1
32 30 58 4 0 1
33 52 137 1 1 0
34 1 127 0 2 0
35 4 222 4 5 5
36 31 26 4 3 1
37 4 175 2 3 2
38 17 133 3 8 7
39 9 107 0 2 0
40 4 43 0 2 0
我们用 split 把分组数据集('gp1')的第一列按第二列(gp1[,2])拆分成一个 list,然后遍历这个 list:把每个 list 元素与物种数据集的行名匹配,取出对应的行,对逻辑矩阵(x1==0)求列总和并检查其是否大于 0;接着在 Reduce 中使用 & 逐元素比较各个 list 元素的结果,最后对索引取反(!),把 TRUE 变为 FALSE(反之亦然),用来对物种数据集的列取子集。
# Keep the columns of sp1 for which at least one sample group contains no zeros.
local({
  # Sample ids (column 1 of gp1) bucketed by group code (column 2 of gp1)
  ids_by_group <- split(gp1[, 1], gp1[, 2])
  # Per group: TRUE for each column holding at least one zero in that group's rows
  zero_flags <- lapply(ids_by_group, function(ids) {
    group_rows <- sp1[match(ids, row.names(sp1)), ]
    colSums(group_rows == 0) > 0
  })
  # A column is dropped only when every group has a zero in it
  sp1[!Reduce(`&`, zero_flags)]
})
# Otu000132 Otu000144 Otu000151 Otu000176 Otu000190
#30 0 191 2 1 1
#31 56 14 9 9 1
#32 30 58 4 0 1
#33 52 137 1 1 0
#34 1 127 0 2 0
#35 4 222 4 5 5
#36 31 26 4 3 1
#37 4 175 2 3 2
#38 17 133 3 8 7
#39 9 107 0 2 0
#40 4 43 0 2 0
您可以使用 dplyr,或者只使用基础函数:
# Select the OTU columns for which at least one sample group has no zeros.
# `group` is expected to be a data frame with the sample id in column V1 and
# the group code in column V2.
# Remember the OTU column names before the merge adds key/group columns
otu_cols <- names(species)
# Attach each sample's group code by matching row names against group$V1.
# NOTE: this overwrites `species` with the merged data frame (row names reset).
species <- merge(species, group, by.x = "row.names", by.y = "V1")
# Find the lowest value in each grouping
check <- aggregate(species[otu_cols], by = list(species$V2), min)
# Sum the per-group minima, over the OTU columns only (the grouping column of
# `check` is excluded up front instead of being dropped by position later)
vars <- colSums(check[otu_cols])
# Retain variables where the sum is non-zero, indicating at least one grouping
# has no zero observations (assumes counts are non-negative)
vars <- vars[vars != 0]
# Extract the variable names
vars <- names(vars)
# Subset dataset to select variables identified above
out <- species[vars]
out
# Otu000132 Otu000144 Otu000151 Otu000176 Otu000190
#1 0 191 2 1 1
#2 56 14 9 9 1
#3 30 58 4 0 1
#4 52 137 1 1 0
#5 1 127 0 2 0
#6 4 222 4 5 5
#7 31 26 4 3 1
#8 4 175 2 3 2
#9 17 133 3 8 7
#10 9 107 0 2 0
#11 4 43 0 2 0