提取包含另一列中文本的列名(样本)
Extracting column names (samples) containing text from another column
对于 SNP1 到 SNP9,我只想提取那些基因型等于等位基因 2(alleles2 列)的样本,并希望将它们并排放在表格中,以便区分哪个 SNP 对应哪些样本。
我的数据是这样的:
# Reproducible example data. Presumably `dput(soil)` is the command that was
# run and the `structure(...)` expression below is its printed output; to
# recreate the object, assign it: `soil <- structure(...)`.
dput(soil)
# A data.frame with 9 rows (SNP1..SNP9) and 17 character columns:
#   SNPs              - SNP identifier
#   alleles           - the two alleles written as "X/Y"
#   alleles1          - first allele of `alleles` doubled (e.g. "C/T" -> "CC")
#   alleles2          - second allele of `alleles` doubled (e.g. "C/T" -> "TT")
#   Sample1..Sample13 - two-letter genotype of each sample at that SNP
structure(list(SNPs = c("SNP1", "SNP2", "SNP3", "SNP4", "SNP5",
"SNP6", "SNP7", "SNP8", "SNP9"), alleles = c("C/T", "G/C", "A/C",
"G/T", "A/C", "C/A", "T/C", "T/G", "A/G"), alleles1 = c("CC",
"GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), alleles2 = c("TT",
"CC", "CC", "TT", "CC", "AA", "CC", "GG", "GG"), Sample1 = c("CC",
"GG", "CA", "TT", "CC", "AA", "CC", "GG", "GG"), Sample2 = c("CC",
"GG", "AA", "TG", "CA", "AC", "CT", "GT", "GA"), Sample3 = c("CC",
"CC", "AA", "TG", "CA", "AC", "CT", "GT", "GA"), Sample4 = c("CC",
"GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), Sample5 = c("CC",
"GG", "CC", "GG", "AA", "CC", "TT", "TT", "AA"), Sample6 = c("CC",
"CG", "AA", "TG", "CA", "AA", "CT", "GT", "GA"), Sample7 = c("CC",
"CC", "AA", "GG", "AA", "CC", "TT", "GG", "AA"), Sample8 = c("CC",
"GG", "AA", "TT", "CC", "AC", "CT", "GT", "GA"), Sample9 = c("CC",
"GG", "AA", "GG", "AA", "AC", "CC", "TT", "AA"), Sample10 = c("TT",
"GG", "CA", "TG", "CA", "AC", "TT", "TT", "AA"), Sample11 = c("TT",
"GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), Sample12 = c("TT",
"GG", "CC", "TT", "CC", "AA", "CC", "TT", "GG"), Sample13 = c("TT",
"GG", "CA", "TG", "CA", "AC", "TT", "TT", "GG")), class = "data.frame", row.names = c(NA,
-9L))
谢谢,
您可以使用data.table将数据融化成长格式,并确定每个SNP的哪些样本具有等位基因2:
library(data.table)
# Reshape `soil` to long format (one row per SNP/sample pair), keep only the
# rows where the sample's genotype equals that SNP's `alleles2`, and collect
# the matching sample names into a single list cell per SNP.
# Note: setDT() converts `soil` to a data.table by reference.
melt(
  setDT(soil),
  id.vars = c("SNPs", "alleles2"),
  # Regex: every column whose name contains "Sample" plus one more character.
  measure.vars = patterns("Sample."),
  variable.name = "Sample",
  value.name = "value"
)[value == alleles2, .(Samples = list(Sample)), by = "SNPs"]
输出:
SNPs Samples
<char> <list>
1: SNP4 Sample1,Sample8,Sample12
2: SNP5 Sample1,Sample8,Sample12
3: SNP6 Sample1,Sample6,Sample12
4: SNP7 Sample1,Sample9,Sample12
5: SNP8 Sample1,Sample7
6: SNP9 Sample1,Sample12,Sample13
7: SNP2 Sample3,Sample7
8: SNP3 Sample5,Sample12
9: SNP1 Sample10,Sample11,Sample12,Sample13
更新:
假设表示“样本”的各列没有统一的命名规则,而是各种各样的名称,例如以下原始数据:
SNPs alleles alleles1 alleles2 yngi mdaj osiw nvzg gfbu avnb rjhe eaug hfgx xrhp zajg zrdj vule
1: SNP1 C/T CC TT CC CC CC CC CC CC CC CC CC TT TT TT TT
2: SNP2 G/C GG CC GG GG CC GG GG CG CC GG GG GG GG GG GG
3: SNP3 A/C AA CC CA AA AA AA CC AA AA AA AA CA AA CC CA
4: SNP4 G/T GG TT TT TG TG GG GG TG GG TT GG TG GG TT TG
5: SNP5 A/C AA CC CC CA CA AA AA CA AA CC AA CA AA CC CA
6: SNP6 C/A CC AA AA AC AC CC CC AA CC AC AC AC CC AA AC
7: SNP7 T/C TT CC CC CT CT TT TT CT TT CT CC TT TT CC TT
8: SNP8 T/G TT GG GG GT GT TT TT GT GG GT TT TT TT TT TT
9: SNP9 A/G AA GG GG GA GA AA AA GA AA GA AA AA AA GG GG
然后,如果感兴趣的列的整数位置范围已知,则可以在 measure.vars
参数中使用该范围,如下所示(在本例中为 5:17
):
library(data.table)
# Same reshape-and-filter as before, but the sample columns are selected by
# position (columns 5 through 17) because their names share no common pattern.
# Note: setDT() converts `soil` to a data.table by reference.
melt(
  setDT(soil),
  id.vars = c("SNPs", "alleles2"),
  measure.vars = 5:17,
  variable.name = "Sample",
  value.name = "value"
)[value == alleles2, .(Samples = list(Sample)), by = "SNPs"]
输出:
SNPs Samples
1: SNP4 yngi,eaug,zrdj
2: SNP5 yngi,eaug,zrdj
3: SNP6 yngi,avnb,zrdj
4: SNP7 yngi,hfgx,zrdj
5: SNP8 yngi,rjhe
6: SNP9 yngi,zrdj,vule
7: SNP2 osiw,rjhe
8: SNP3 gfbu,zrdj
9: SNP1 xrhp,zajg,zrdj,vule
最后,如果删除不需要的列,在本例中为 alleles
和 alleles1
,我们实际上可以完全省略 measure.vars
参数,此时 melt
会假设 measure.vars
是所有其他变量:
# Drop the columns that are neither identifiers nor samples first; with
# `measure.vars` omitted, melt() treats every non-id column as a measure
# column. Matching samples are then collected per SNP as before.
# Note: setDT() converts `soil` to a data.table by reference.
melt(
  setDT(soil)[, !c("alleles", "alleles1")],
  id.vars = c("SNPs", "alleles2"),
  variable.name = "Sample",
  value.name = "value"
)[value == alleles2, .(Samples = list(Sample)), by = "SNPs"]
对于 SNP1 到 SNP9,我只想提取那些基因型等于等位基因 2(alleles2 列)的样本,并希望将它们并排放在表格中,以便区分哪个 SNP 对应哪些样本。
我的数据是这样的:
# Reproducible example data. Presumably `dput(soil)` is the command that was
# run and the `structure(...)` expression below is its printed output; to
# recreate the object, assign it: `soil <- structure(...)`.
dput(soil)
# A data.frame with 9 rows (SNP1..SNP9) and 17 character columns:
#   SNPs              - SNP identifier
#   alleles           - the two alleles written as "X/Y"
#   alleles1          - first allele of `alleles` doubled (e.g. "C/T" -> "CC")
#   alleles2          - second allele of `alleles` doubled (e.g. "C/T" -> "TT")
#   Sample1..Sample13 - two-letter genotype of each sample at that SNP
structure(list(SNPs = c("SNP1", "SNP2", "SNP3", "SNP4", "SNP5",
"SNP6", "SNP7", "SNP8", "SNP9"), alleles = c("C/T", "G/C", "A/C",
"G/T", "A/C", "C/A", "T/C", "T/G", "A/G"), alleles1 = c("CC",
"GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), alleles2 = c("TT",
"CC", "CC", "TT", "CC", "AA", "CC", "GG", "GG"), Sample1 = c("CC",
"GG", "CA", "TT", "CC", "AA", "CC", "GG", "GG"), Sample2 = c("CC",
"GG", "AA", "TG", "CA", "AC", "CT", "GT", "GA"), Sample3 = c("CC",
"CC", "AA", "TG", "CA", "AC", "CT", "GT", "GA"), Sample4 = c("CC",
"GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), Sample5 = c("CC",
"GG", "CC", "GG", "AA", "CC", "TT", "TT", "AA"), Sample6 = c("CC",
"CG", "AA", "TG", "CA", "AA", "CT", "GT", "GA"), Sample7 = c("CC",
"CC", "AA", "GG", "AA", "CC", "TT", "GG", "AA"), Sample8 = c("CC",
"GG", "AA", "TT", "CC", "AC", "CT", "GT", "GA"), Sample9 = c("CC",
"GG", "AA", "GG", "AA", "AC", "CC", "TT", "AA"), Sample10 = c("TT",
"GG", "CA", "TG", "CA", "AC", "TT", "TT", "AA"), Sample11 = c("TT",
"GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), Sample12 = c("TT",
"GG", "CC", "TT", "CC", "AA", "CC", "TT", "GG"), Sample13 = c("TT",
"GG", "CA", "TG", "CA", "AC", "TT", "TT", "GG")), class = "data.frame", row.names = c(NA,
-9L))
谢谢,
您可以使用data.table将数据融化成长格式,并确定每个SNP的哪些样本具有等位基因2:
library(data.table)
# Reshape `soil` to long format (one row per SNP/sample pair), keep only the
# rows where the sample's genotype equals that SNP's `alleles2`, and collect
# the matching sample names into a single list cell per SNP.
# Note: setDT() converts `soil` to a data.table by reference.
melt(
  setDT(soil),
  id.vars = c("SNPs", "alleles2"),
  # Regex: every column whose name contains "Sample" plus one more character.
  measure.vars = patterns("Sample."),
  variable.name = "Sample",
  value.name = "value"
)[value == alleles2, .(Samples = list(Sample)), by = "SNPs"]
输出:
SNPs Samples
<char> <list>
1: SNP4 Sample1,Sample8,Sample12
2: SNP5 Sample1,Sample8,Sample12
3: SNP6 Sample1,Sample6,Sample12
4: SNP7 Sample1,Sample9,Sample12
5: SNP8 Sample1,Sample7
6: SNP9 Sample1,Sample12,Sample13
7: SNP2 Sample3,Sample7
8: SNP3 Sample5,Sample12
9: SNP1 Sample10,Sample11,Sample12,Sample13
更新:
假设表示“样本”的各列没有统一的命名规则,而是各种各样的名称,例如以下原始数据:
SNPs alleles alleles1 alleles2 yngi mdaj osiw nvzg gfbu avnb rjhe eaug hfgx xrhp zajg zrdj vule
1: SNP1 C/T CC TT CC CC CC CC CC CC CC CC CC TT TT TT TT
2: SNP2 G/C GG CC GG GG CC GG GG CG CC GG GG GG GG GG GG
3: SNP3 A/C AA CC CA AA AA AA CC AA AA AA AA CA AA CC CA
4: SNP4 G/T GG TT TT TG TG GG GG TG GG TT GG TG GG TT TG
5: SNP5 A/C AA CC CC CA CA AA AA CA AA CC AA CA AA CC CA
6: SNP6 C/A CC AA AA AC AC CC CC AA CC AC AC AC CC AA AC
7: SNP7 T/C TT CC CC CT CT TT TT CT TT CT CC TT TT CC TT
8: SNP8 T/G TT GG GG GT GT TT TT GT GG GT TT TT TT TT TT
9: SNP9 A/G AA GG GG GA GA AA AA GA AA GA AA AA AA GG GG
然后,如果感兴趣的列的整数位置范围已知,则可以在 measure.vars
参数中使用该范围,如下所示(在本例中为 5:17
):
library(data.table)
# Same reshape-and-filter as before, but the sample columns are selected by
# position (columns 5 through 17) because their names share no common pattern.
# Note: setDT() converts `soil` to a data.table by reference.
melt(
  setDT(soil),
  id.vars = c("SNPs", "alleles2"),
  measure.vars = 5:17,
  variable.name = "Sample",
  value.name = "value"
)[value == alleles2, .(Samples = list(Sample)), by = "SNPs"]
输出:
SNPs Samples
1: SNP4 yngi,eaug,zrdj
2: SNP5 yngi,eaug,zrdj
3: SNP6 yngi,avnb,zrdj
4: SNP7 yngi,hfgx,zrdj
5: SNP8 yngi,rjhe
6: SNP9 yngi,zrdj,vule
7: SNP2 osiw,rjhe
8: SNP3 gfbu,zrdj
9: SNP1 xrhp,zajg,zrdj,vule
最后,如果删除不需要的列,在本例中为 alleles
和 alleles1
,我们实际上可以完全省略 measure.vars
参数,此时 melt
会假设 measure.vars
是所有其他变量:
# Drop the columns that are neither identifiers nor samples first; with
# `measure.vars` omitted, melt() treats every non-id column as a measure
# column. Matching samples are then collected per SNP as before.
# Note: setDT() converts `soil` to a data.table by reference.
melt(
  setDT(soil)[, !c("alleles", "alleles1")],
  id.vars = c("SNPs", "alleles2"),
  variable.name = "Sample",
  value.name = "value"
)[value == alleles2, .(Samples = list(Sample)), by = "SNPs"]