提取包含另一列中文本的列名(样本)

Extracting column names (samples) containing text from another column

我只想提取那些包含 SNP1 到 9 的等位基因 2 的样本,并希望将它们并排放置在 table 中,以便我可以区分哪个 SNP 对应于哪个样本。

我的数据是这样的:

# Output of dput(soil): a 9-row data.frame with one row per SNP (SNP1..SNP9).
# Columns: SNPs, alleles, alleles1, alleles2, followed by the per-sample
# genotype columns Sample1..Sample13.
# To reproduce the data, assign the structure(...) expression below to `soil`.
dput(soil)    
structure(list(SNPs = c("SNP1", "SNP2", "SNP3", "SNP4", "SNP5", 
    "SNP6", "SNP7", "SNP8", "SNP9"), alleles = c("C/T", "G/C", "A/C", 
    "G/T", "A/C", "C/A", "T/C", "T/G", "A/G"), alleles1 = c("CC", 
    "GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), alleles2 = c("TT", 
    "CC", "CC", "TT", "CC", "AA", "CC", "GG", "GG"), Sample1 = c("CC", 
    "GG", "CA", "TT", "CC", "AA", "CC", "GG", "GG"), Sample2 = c("CC", 
    "GG", "AA", "TG", "CA", "AC", "CT", "GT", "GA"), Sample3 = c("CC", 
    "CC", "AA", "TG", "CA", "AC", "CT", "GT", "GA"), Sample4 = c("CC", 
    "GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), Sample5 = c("CC", 
    "GG", "CC", "GG", "AA", "CC", "TT", "TT", "AA"), Sample6 = c("CC", 
    "CG", "AA", "TG", "CA", "AA", "CT", "GT", "GA"), Sample7 = c("CC", 
    "CC", "AA", "GG", "AA", "CC", "TT", "GG", "AA"), Sample8 = c("CC", 
    "GG", "AA", "TT", "CC", "AC", "CT", "GT", "GA"), Sample9 = c("CC", 
    "GG", "AA", "GG", "AA", "AC", "CC", "TT", "AA"), Sample10 = c("TT", 
    "GG", "CA", "TG", "CA", "AC", "TT", "TT", "AA"), Sample11 = c("TT", 
    "GG", "AA", "GG", "AA", "CC", "TT", "TT", "AA"), Sample12 = c("TT", 
    "GG", "CC", "TT", "CC", "AA", "CC", "TT", "GG"), Sample13 = c("TT", 
    "GG", "CA", "TG", "CA", "AC", "TT", "TT", "GG")), class = "data.frame", row.names = c(NA, 
    -9L))

谢谢,

您可以使用data.table将数据融化成长格式,并确定每个SNP的哪些样本具有等位基因2:

library(data.table)
# Reshape `soil` to long format: one row per (SNP, sample column) pair, with
# the column name stored in `Sample` and the genotype in the default `value`
# column. Sample columns are selected by the regular expression "Sample.".
# Note: setDT() converts `soil` to a data.table by reference.
melt(
  setDT(soil),
  id.vars       = c("SNPs", "alleles2"),
  measure.vars  = patterns("Sample."),
  variable.name = "Sample"
)[
  # Keep only rows whose genotype equals that SNP's `alleles2` entry ...
  value == alleles2,
  # ... and gather the matching sample names into one list per SNP.
  .(Samples = list(Sample)),
  by = SNPs
]

输出:

     SNPs                             Samples
   <char>                              <list>
1:   SNP4            Sample1,Sample8,Sample12
2:   SNP5            Sample1,Sample8,Sample12
3:   SNP6            Sample1,Sample6,Sample12
4:   SNP7            Sample1,Sample9,Sample12
5:   SNP8                     Sample1,Sample7
6:   SNP9           Sample1,Sample12,Sample13
7:   SNP2                     Sample3,Sample7
8:   SNP3                    Sample5,Sample12
9:   SNP1 Sample10,Sample11,Sample12,Sample13

更新:

假设表示“样本”的各列没有统一的命名模式,而是使用各种不同的名称,例如下面的原始数据:

   SNPs alleles alleles1 alleles2 yngi mdaj osiw nvzg gfbu avnb rjhe eaug hfgx xrhp zajg zrdj vule
1: SNP1     C/T       CC       TT   CC   CC   CC   CC   CC   CC   CC   CC   CC   TT   TT   TT   TT
2: SNP2     G/C       GG       CC   GG   GG   CC   GG   GG   CG   CC   GG   GG   GG   GG   GG   GG
3: SNP3     A/C       AA       CC   CA   AA   AA   AA   CC   AA   AA   AA   AA   CA   AA   CC   CA
4: SNP4     G/T       GG       TT   TT   TG   TG   GG   GG   TG   GG   TT   GG   TG   GG   TT   TG
5: SNP5     A/C       AA       CC   CC   CA   CA   AA   AA   CA   AA   CC   AA   CA   AA   CC   CA
6: SNP6     C/A       CC       AA   AA   AC   AC   CC   CC   AA   CC   AC   AC   AC   CC   AA   AC
7: SNP7     T/C       TT       CC   CC   CT   CT   TT   TT   CT   TT   CT   CC   TT   TT   CC   TT
8: SNP8     T/G       TT       GG   GG   GT   GT   TT   TT   GT   GG   GT   TT   TT   TT   TT   TT
9: SNP9     A/G       AA       GG   GG   GA   GA   AA   AA   GA   AA   GA   AA   AA   AA   GG   GG

然后,如果感兴趣的列的整数位置范围已知,则可以在 measure.vars 参数中直接使用该范围,如下所示(在本例中为 5:17):

library(data.table)
# Same reshape-and-filter as before, but the measured columns are chosen by
# position (columns 5 through 17) instead of by a name pattern, so it works
# when the sample columns have arbitrary names.
# Note: setDT() converts `soil` to a data.table by reference.
melt(
  setDT(soil),
  id.vars       = c("SNPs", "alleles2"),
  measure.vars  = 5:17,
  variable.name = "Sample"
)[
  # Keep only rows whose genotype equals that SNP's `alleles2` entry ...
  value == alleles2,
  # ... and gather the matching column names into one list per SNP.
  .(Samples = list(Sample)),
  by = SNPs
]

输出:

   SNPs             Samples
1: SNP4      yngi,eaug,zrdj
2: SNP5      yngi,eaug,zrdj
3: SNP6      yngi,avnb,zrdj
4: SNP7      yngi,hfgx,zrdj
5: SNP8           yngi,rjhe
6: SNP9      yngi,zrdj,vule
7: SNP2           osiw,rjhe
8: SNP3           gfbu,zrdj
9: SNP1 xrhp,zajg,zrdj,vule

最后,如果先删除不需要的列(在本例中为 alleles 和 alleles1),我们实际上可以完全省略 measure.vars 参数,此时 melt 会假设 measure.vars 是 id.vars 之外的所有其他列:

# Drop the `alleles` and `alleles1` columns first; with `measure.vars`
# omitted, melt() then treats every column not listed in `id.vars` as a
# measured (sample) column.
# Note: setDT() converts `soil` to a data.table by reference.
melt(
  setDT(soil)[, !c("alleles", "alleles1")],
  id.vars       = c("SNPs", "alleles2"),
  variable.name = "Sample"
)[
  # Keep only rows whose genotype equals that SNP's `alleles2` entry ...
  value == alleles2,
  # ... and gather the matching column names into one list per SNP.
  .(Samples = list(Sample)),
  by = SNPs
]