在 R 中重新排列数据,拆分列名称

Rearrange Data in R, Breaking Apart Column Names

我得到了一个 table,我需要将其转换成另外两种形式。谁能帮我找到一个系统的 R-way 来转换它而不需要一堆嵌套的 for 循环?

这是 table 的简化版本:

# Build the example wide-format table: one row per sample (FID/IID) and
# Status, with one measurement column per Value/Gel combination.
Status <- rep(c(paste0("B", 1:4), "Total"), times = 3)
FID <- rep(c("N123", "K541"), times = c(10, 5))
IID <- rep(c(123, 456, 789), each = 5)

# Measurement columns, named <Value>.<Gel>; each holds 15 numbers.
Value1.G1 <- c(rep(c(888, 345, 765, 875, 875, 323), times = 2), 8039, 830, 849)
Value2.G1 <- c(rep(c(443, 325, 761), times = 4), 649, 975, 323)
Value1.G2 <- rep(c(446, 345, 765, 875, 323), times = 3)
Value2.G2 <- c(rep(c(540, 345, 765), times = 4), 169, 875, 431)

# Column names are taken from the variable names, so the argument order
# here fixes the column positions (measurements end up in columns 4 to 7).
dat <- data.frame(
  FID, IID, Status,
  Value1.G1, Value2.G1, Value1.G2, Value2.G2
)
print(dat)
    FID IID Status Value1.G1 Value2.G1 Value1.G2 Value2.G2
1  N123 123     B1       888       443       446       540
2  N123 123     B2       345       325       345       345
3  N123 123     B3       765       761       765       765
4  N123 123     B4       875       443       875       540
5  N123 123  Total       875       325       323       345
6  N123 456     B1       323       761       446       765
7  N123 456     B2       888       443       345       540
8  N123 456     B3       345       325       765       345
9  N123 456     B4       765       761       875       765
10 N123 456  Total       875       443       323       540
11 K541 789     B1       875       325       446       345
12 K541 789     B2       323       761       345       765
13 K541 789     B3      8039       649       765       169
14 K541 789     B4       830       975       875       875
15 K541 789  Total       849       323       323       431

简而言之,除了前三列之外,每个单元格中的数字都是特定样本(123、456 和 789)在特定凝胶(1 或 2)上、特定状态(B1、B2、B3、B4 或 Total)下的某个特定值(在本例中为 Value1 或 Value2)。前三列(FID、IID 和 Status)列出了有关样本(FID 和 IID)和状态(Status)的信息。

形式 1(Form 1)

我需要创建的第一个表格要把列标题拆分成它们的组成部分。(澄清一点:"Gel" 列指的是原列名称中紧跟在 "G" 之后的数值。)

    FID IID Status Value1 Value2 Gel
1  N123 123     B1    888    443   1
2  N123 456     B1    323    761   1
3  K541 789     B1    875    325   1
4  N123 123     B1    446    540   2
5  N123 456     B1    446    765   2
6  K541 789     B1    446    345   2
7  N123 123     B2    345    325   1
8  N123 456     B2    888    443   1
9  K541 789     B2    323    761   1
10 N123 123     B2    345    345   2
11 N123 456     B2    345    540   2
12 K541 789     B2    345    765   2
13 N123 123     B3    765    761   1
14 N123 456     B3    345    325   1
15 K541 789     B3   8039    649   1
16 N123 123     B3    765    765   2
17 N123 456     B3    765    345   2
18 K541 789     B3    765    169   2
19 N123 123     B4    875    443   1
20 N123 456     B4    765    761   1
21 K541 789     B4    830    975   1
22 N123 123     B4    875    540   2
23 N123 456     B4    875    765   2
24 K541 789     B4    875    875   2
25 N123 123  Total    875    325   1
26 N123 456  Total    875    443   1
27 K541 789  Total    849    323   1
28 N123 123  Total    323    345   2
29 N123 456  Total    323    540   2
30 K541 789  Total    323    431   2

形式 2(Form 2)

我需要创建的第二种形式要求每个唯一的个人 ID (IID) 有一行包含它的所有值信息。在这种情况下,列名称指示有关值及其特定条件(即哪个值、哪个凝胶和哪个状态)的所有信息。对于此示例数据,这意味着有 3 行和 22 列。

   FID IID Value1.G1.B1 Value2.G1.B1 Value1.G2.B1 Value2.G2.B1 Value1.G1.B2 Value2.G1.B2 Value1.G2.B2
1 N123 123          888          443          446          540          345          325          345
2 N123 456          323          761          446          765          888          443          345
3 K541 789          875          325          446          345          323          761          345
  Value2.G2.B2 Value1.G1.B3 Value2.G1.B3 Value1.G2.B3 Value2.G2.B3 Value1.G1.B4 Value2.G1.B4 Value1.G2.B4
1          345          765          761          765          765          875          443          875
2          540          345          325          765          345          765          761          875
3          765         8039          649          765          169          830          975          875
  Value2.G2.B4 Value1.G1.Total Value2.G1.Total Value1.G2.Total Value2.G2.Total
1          540             875             325             323             345
2          765             875             443             323             540
3          875             849             323             323             431

您可以尝试 data.table 开发版本(即 v1.9.5)中的 melt 和 dcast,它们可以接受多个值列。该开发版本可以从此处(here)安装。

要从 'wide' 转换为 'long',请在 'data.table' 上使用 melt(setDT(dat) 会将 'data.frame' 转换为 'data.table'),并在 measure.vars 中以 'list' 的形式指定 'Value1' 和 'Value2' 各列的索引。
library(data.table) # needs v1.9.5+ for multiple measure.vars groups

# Melt two groups of columns at once: columns 4 and 6 (Value1.G1, Value1.G2)
# form the first value column, columns 5 and 7 (Value2.G1, Value2.G2) the
# second. The index within each group is stored in the 'Gel' column.
# setDT() converts 'dat' to a data.table.
dM <- melt(
  setDT(dat),
  measure.vars = list(c(4, 6), c(5, 7)),
  variable.name = "Gel"
)

# Show the long table sorted by Status.
dM[order(Status)]
#     FID IID Status Gel value1 value2
# 1: N123 123     B1   1    888    443
# 2: N123 456     B1   1    323    761
# 3: K541 789     B1   1    875    325
# 4: N123 123     B1   2    446    540
# 5: N123 456     B1   2    446    765
# 6: K541 789     B1   2    446    345
# 7: N123 123     B2   1    345    325
# 8: N123 456     B2   1    888    443
# 9: K541 789     B2   1    323    761
#10: N123 123     B2   2    345    345
#11: N123 456     B2   2    345    540
#12: K541 789     B2   2    345    765
#13: N123 123     B3   1    765    761
#14: N123 456     B3   1    345    325
#15: K541 789     B3   1   8039    649
#16: N123 123     B3   2    765    765
#17: N123 456     B3   2    765    345
#18: K541 789     B3   2    765    169
#19: N123 123     B4   1    875    443
#20: N123 456     B4   1    765    761
#21: K541 789     B4   1    830    975
#22: N123 123     B4   2    875    540
#23: N123 456     B4   2    875    765
#24: K541 789     B4   2    875    875
#25: N123 123  Total   1    875    325
#26: N123 456  Total   1    875    443
#27: K541 789  Total   1    849    323
#28: N123 123  Total   2    323    345
#29: N123 456  Total   2    323    540
#30: K541 789  Total   2    323    431

我们可以用 dcast 将 'long' 格式转换为 'wide' 格式。在这里,我们在 value.var 中指定多个值列。
# Cast the long table back to wide: one row per FID/IID pair and one column
# per Gel/Status combination for each of the two value columns.
dC <- dcast(
  dM,
  FID + IID ~ Gel + Status,
  value.var = c("value1", "value2")
)
dC
#   FID IID 1_B1_value1 1_B2_value1 1_B3_value1 1_B4_value1 1_Total_value1
#1: K541 789         875         323        8039         830            849
#2: N123 123         888         345         765         875            875
#3: N123 456         323         888         345         765            875
#   2_B1_value1 2_B2_value1 2_B3_value1 2_B4_value1 2_Total_value1 1_B1_value2
#1:         446         345         765         875            323         325
#2:         446         345         765         875            323         443
#3:         446         345         765         875            323         761
#   1_B2_value2 1_B3_value2 1_B4_value2 1_Total_value2 2_B1_value2 2_B2_value2
#1:         761         649         975            323         345         765
#2:         325         761         443            325         540         345
#3:         443         325         761            443         765         540
#   2_B3_value2 2_B4_value2 2_Total_value2
#1:         169         875            431
#2:         765         540            345
#3:         345         765            540

我们也可以从原始数据集中得到宽格式

# Cast the original table directly: spread Status across columns for every
# measurement column (columns 4 to 7 of 'dat').
dcast(
  setDT(dat),
  FID + IID ~ Status,
  value.var = names(dat)[4:7]
)
#   FID IID B1_Value1.G1 B2_Value1.G1 B3_Value1.G1 B4_Value1.G1 Total_Value1.G1
#1: K541 789          875          323         8039        830          849
#2: N123 123          888          345          765          875          875
#3: N123 456          323          888          345          765          875
#   B1_Value2.G1 B2_Value2.G1 B3_Value2.G1 B4_Value2.G1 Total_Value2.G1
#1:          325          761          649          975             323
#2:          443          325          761          443             325
#3:          761          443          325          761             443
#    B1_Value1.G2 B2_Value1.G2 B3_Value1.G2 B4_Value1.G2 Total_Value1.G2
#1:          446          345          765          875             323
#2:          446          345          765          875             323
#3:          446          345          765          875             323
#   B1_Value2.G2 B2_Value2.G2 B3_Value2.G2 B4_Value2.G2 Total_Value2.G2
#1:          345          765          169          875             431
#2:          540          345          765          540             345
#3:          765          540          345          765             540

更新

再与 OP 的预期输出('form1')核对一下:

   # Join the melted table to 'form1' on the identifying columns so the two
   # sets of value columns can be compared side by side.
   # NOTE(review): 'form1' is not defined in the visible text; presumably it
   # holds the expected first table from the question - confirm.
   merge(
     dM, form1,
     by = c("FID", "IID", "Gel", "Status"),
     suffixes = c("akrun", "real")
   )
#      FID IID Status Gel value1 value2 Value1 Value2
#  1: K541 789     B1   1    875    325    875    325
#  2: K541 789     B2   1    323    761    323    761
#  3: K541 789     B3   1   8039    649   8039    649
#  4: K541 789     B4   1    830    975    830    975   
#  5: K541 789  Total   1    849    323    849    323
#  6: K541 789     B1   2    446    345    446    345
#  7: K541 789     B2   2    345    765    345    765
#  8: K541 789     B3   2    765    169    765    169
#  9: K541 789     B4   2    875    875    875    875
# 10: K541 789  Total   2    323    431    323    431
# 11: N123 123     B1   1    888    443    888    443
# 12: N123 123     B2   1    345    325    345    325
# 13: N123 123     B3   1    765    761    765    761
# 14: N123 123     B4   1    875    443    875    443
# 15: N123 123  Total   1    875    325    875    325
# 16: N123 123     B1   2    446    540    446    540
# 17: N123 123     B2   2    345    345    345    345
# 18: N123 123     B3   2    765    765    765    765
# 19: N123 123     B4   2    875    540    875    540
# 20: N123 123  Total   2    323    345    323    345
# 21: N123 456     B1   1    323    761    323    761
# 22: N123 456     B2   1    888    443    888    443
# 23: N123 456     B3   1    345    325    345    325
# 24: N123 456     B4   1    765    761    765    761
# 25: N123 456  Total   1    875    443    875    443
# 26: N123 456     B1   2    446    765    446    765
# 27: N123 456     B2   2    345    540    345    540
# 28: N123 456     B3   2    765    345    765    345
# 29: N123 456     B4   2    875    765    875    765
# 30: N123 456  Total   2    323    540    323    540

使用基础 R 的 reshape(参见 ?reshape)

# Rebuild the example wide-format table used by the reshape() calls below.
# Identifier columns: 15 rows = 3 samples x 5 statuses.
Status <- rep(c(paste0("B", 1:4), "Total"), times = 3)
FID <- rep(c("N123", "K541"), times = c(10, 5))
IID <- rep(c(123, 456, 789), each = 5)

# Measurement columns, named <Value>.<Gel>.
Value1.G1 <- c(rep(c(888, 345, 765, 875, 875, 323), times = 2), 8039, 830, 849)
Value2.G1 <- c(rep(c(443, 325, 761), times = 4), 649, 975, 323)
Value1.G2 <- rep(c(446, 345, 765, 875, 323), times = 3)
Value2.G2 <- c(rep(c(540, 345, 765), times = 4), 169, 875, 431)

# Column names come from the variable names; measurements are columns 4 to 7.
dat <- data.frame(
  FID, IID, Status,
  Value1.G1, Value2.G1, Value1.G2, Value2.G2
)
dat

对于长格式

# Long format with base reshape(): columns 4 and 6 (Value1.G1, Value1.G2)
# are stacked into 'Value1', columns 5 and 7 (Value2.G1, Value2.G2) into
# 'Value2'. The 'Gel' column records which column of each group a row
# came from.
l <- reshape(
  dat,
  varying = list(c(4, 6), c(5, 7)),
  v.names = c("Value1", "Value2"),
  timevar = "Gel",
  direction = "long"
)

# Show the long table sorted by Status.
l[order(l$Status), ]

#       FID IID Status Gel Value1 Value2 id
# 1.1  N123 123     B1   1    888    443  1
# 6.1  N123 456     B1   1    323    761  6
# 11.1 K541 789     B1   1    875    325 11
# 1.2  N123 123     B1   2    446    540  1
# 6.2  N123 456     B1   2    446    765  6
# 11.2 K541 789     B1   2    446    345 11
# 2.1  N123 123     B2   1    345    325  2
# 7.1  N123 456     B2   1    888    443  7
# 12.1 K541 789     B2   1    323    761 12
# 2.2  N123 123     B2   2    345    345  2
# 7.2  N123 456     B2   2    345    540  7
# 12.2 K541 789     B2   2    345    765 12
# 3.1  N123 123     B3   1    765    761  3
# 8.1  N123 456     B3   1    345    325  8
# 13.1 K541 789     B3   1   8039    649 13
# 3.2  N123 123     B3   2    765    765  3
# 8.2  N123 456     B3   2    765    345  8
# 13.2 K541 789     B3   2    765    169 13
# 4.1  N123 123     B4   1    875    443  4
# 9.1  N123 456     B4   1    765    761  9
# 14.1 K541 789     B4   1    830    975 14
# 4.2  N123 123     B4   2    875    540  4
# 9.2  N123 456     B4   2    875    765  9
# 14.2 K541 789     B4   2    875    875 14
# 5.1  N123 123  Total   1    875    325  5
# 10.1 N123 456  Total   1    875    443 10
# 15.1 K541 789  Total   1    849    323 15
# 5.2  N123 123  Total   2    323    345  5
# 10.2 N123 456  Total   2    323    540 10
# 15.2 K541 789  Total   2    323    431 15

# Wide format with base reshape(): one row per FID/IID pair (the first two
# columns of 'dat'), with each measurement column repeated once per Status.
reshape(
  dat,
  idvar = names(dat)[1:2],
  timevar = "Status",
  direction = "wide"
)

#     FID IID Value1.G1.B1 Value2.G1.B1 Value1.G2.B1 Value2.G2.B1 Value1.G1.B2
# 1  N123 123          888          443          446          540          345
# 6  N123 456          323          761          446          765          888
# 11 K541 789          875          325          446          345          323
#    Value2.G1.B2 Value1.G2.B2 Value2.G2.B2 Value1.G1.B3 Value2.G1.B3 Value1.G2.B3
# 1           325          345          345          765          761          765
# 6           443          345          540          345          325          765
# 11          761          345          765         8039          649          765
#    Value2.G2.B3 Value1.G1.B4 Value2.G1.B4 Value1.G2.B4 Value2.G2.B4 Value1.G1.Total
# 1           765          875          443          875          540             875
# 6           345          765          761          875          765             875
# 11          169          830          975          875          875             849
#    Value2.G1.Total Value1.G2.Total Value2.G2.Total
# 1              325             323             345
# 6              443             323             540
# 11             323             323             431