如何将数据透视为宽格式(pivot wider),并统计两列中成对值出现的次数?

How do I pivot wider and count the occurrences of a pair across two columns?

在这里查看数据框

# Example data: 20 rows of (ID, V1, V2); V1 and V2 hold the two codes whose
# pairings are to be counted. The object carries the "data.table" class
# attribute in addition to "data.frame".
dt <- structure(
    list(
        ID = rep(c(1, 2, 3, 4, 5, 6, 7), times = c(3, 6, 1, 1, 3, 3, 3)),
        V1 = c(
            "ABC", "ABC", "DEF", "GHI", "GHI", "GHI", "JKL", "JKL", "DEF", "ABC",
            "MNO", "GHI", "GHI", "ABC", "DEF", "DEF", "GHI", "MNO", "MNO", "ABC"
        ),
        V2 = c(
            "DEF", "MNO", "MNO", "JKL", "DEF", "ABC", "DEF", "ABC", "ABC", "JKL",
            "JKL", "ABC", "DEF", "DEF", "GHI", "MNO", "MNO", "ABC", "JKL", "JKL"
        )
    ),
    row.names = c(NA, -20L),
    class = c("data.table", "data.frame")
)

例如,ABC 在 V1 列中出现了 5 次,DEF 在 V2 列中也出现了 5 次,但二者在同一行中配对出现(不区分先后)只有 3 次。我想新增一个计数列,统计每一对值共同出现的次数,而不管它们各自位于哪一列(V1 或 V2)。

更新

# Count every unordered {V1, V2} pair by way of a weighted graph:
#   1. put V1/V2 first so they are read as the edge endpoints (ID is carried
#      along as an edge attribute),
#   2. build an undirected multigraph, so A-B and B-A are the same edge,
#   3. collapse parallel edges into counts held in the upper triangle of the
#      adjacency matrix,
#   4. read those counts back as edge weights and list the edges.
# as_adjacency_matrix() / as_data_frame() are the current igraph names for the
# deprecated get.adjacency() / get.data.frame().
library(igraph) # also supplies the %>% pipe used below

dt[, c(2, 3, 1)] %>%
    graph_from_data_frame(directed = FALSE) %>%
    as_adjacency_matrix(type = "upper") %>%
    graph_from_adjacency_matrix(weighted = TRUE) %>%
    igraph::as_data_frame()

输出为:

   from  to weight
1   ABC DEF      3
2   ABC GHI      2
3   DEF GHI      3
4   ABC JKL      3
5   DEF JKL      1
6   GHI JKL      1
7   ABC MNO      2
8   DEF MNO      2
9   GHI MNO      1
10  JKL MNO      2

可以试试下面基于 igraph 的方法:

library(igraph)

# Symmetric matrix of pair counts: each row of `dt` is one undirected edge
# between V1 and V2, and the dense adjacency matrix holds the number of edges
# between every two codes.
# Columns are selected by name so the call works whether or not data.table is
# attached (`dt[, -"ID"]` is data.table-only syntax).
# as_adjacency_matrix() is the current name for the deprecated get.adjacency().
as_adjacency_matrix(
    graph_from_data_frame(dt[, c("V1", "V2")],
        directed = FALSE
    ),
    sparse = FALSE
)

这给出了

    ABC DEF GHI JKL MNO
ABC   0   3   2   3   2
DEF   3   0   3   1   2
GHI   2   3   0   1   1
JKL   3   1   1   0   2
MNO   2   2   1   2   0

如果你想添加一个列来表示计数,你可以试试

# Append `cnts`: the number of rows that share the same unordered {V1, V2}
# pair. pmin()/pmax() order the two codes within each row, so "ABC"/"DEF" and
# "DEF"/"ABC" fall into the same group; ID is only the vector that ave()
# fills with the group sizes.
transform(
    dt,
    cnts = ave(
        ID,
        pmin(V1, V2), # first code of the pair in sort order
        pmax(V1, V2), # second code of the pair in sort order
        FUN = function(grp) length(grp)
    )
)

这给出了

   ID  V1  V2 cnts
 1:  1 ABC DEF    3
 2:  1 ABC MNO    2
 3:  1 DEF MNO    2
 4:  2 GHI JKL    1
 5:  2 GHI DEF    3
 6:  2 GHI ABC    2
 7:  2 JKL DEF    1
 8:  2 JKL ABC    3
 9:  2 DEF ABC    3
10:  3 ABC JKL    3
11:  4 MNO JKL    2
12:  5 GHI ABC    2
13:  5 GHI DEF    3
14:  5 ABC DEF    3
15:  6 DEF GHI    3
16:  6 DEF MNO    2
17:  6 GHI MNO    1
18:  7 MNO ABC    2
19:  7 MNO JKL    2
20:  7 ABC JKL    3

在 base R 中你可以这样做:

# Count how often each unordered pair of codes shares a row of `dt`.
# Result: one row per pair (code1, code2), with code2 at or before code1 in
# `codes` (self-pairs included), and the count in `occurances` (column name
# kept as originally spelled so existing output still matches).
codes <- unique(c(dt$V1, dt$V2))

# Index vectors enumerating the pairs as (1,1), (2,1), (2,2), (3,1), ...
# Both are empty when `codes` is empty, so an empty `dt` yields zero rows
# instead of iterating over 1:0.
idx1 <- rep(seq_along(codes), times = seq_along(codes))
idx2 <- sequence(seq_along(codes))

# A pair matches a row when its two codes sit in V1/V2 in either order.
occurances <- vapply(
    seq_along(idx1),
    function(k) {
        code1 <- codes[idx1[k]]
        code2 <- codes[idx2[k]]
        fwd <- code1 == dt$V1 & code2 == dt$V2
        bwd <- code1 == dt$V2 & code2 == dt$V1
        sum(fwd | bwd)
    },
    integer(1)
)

# Build the result in one step rather than growing it with rbind() per pair.
output <- data.frame(
    code1 = codes[idx1],
    code2 = codes[idx2],
    occurances = occurances
)
output

输出:

   code1 code2 occurances
1    ABC   ABC          0
2    DEF   ABC          3
3    DEF   DEF          0
4    GHI   ABC          2
5    GHI   DEF          3
6    GHI   GHI          0
7    JKL   ABC          3
8    JKL   DEF          1
9    JKL   GHI          1
10   JKL   JKL          0
11   MNO   ABC          2
12   MNO   DEF          2
13   MNO   GHI          1
14   MNO   JKL          2
15   MNO   MNO          0

您也可以使用 table() 并将结果及其转置相加:

# Cross-tabulate V1 against V2 on one shared, sorted set of levels so the
# table is always square with matching row/column order, even if some code
# appears in only one of the two columns.
# Columns are named explicitly: `dt[-1]` drops the first *row* (not the ID
# column) when `dt` is handled as a data.table.
lvls <- sort(unique(c(dt$V1, dt$V2)))
tbl <- table(
    V1 = factor(dt$V1, levels = lvls),
    V2 = factor(dt$V2, levels = lvls)
)

# Add the transpose so A-B and B-A are pooled; its diagonal is zeroed first so
# rows where V1 == V2 are not counted twice.
(cnts <- tbl + `diag<-`(t(tbl), 0))

     V2
V1    ABC DEF GHI JKL MNO
  ABC   0   3   2   3   2
  DEF   3   0   3   1   2
  GHI   2   3   0   1   1
  JKL   3   1   1   0   2
  MNO   2   2   1   2   0

去重并转换为数据框:

# Mark the upper triangle as missing so each unordered pair is kept once
# (lower triangle plus the diagonal). Note that this overwrites `cnts`.
is.na(cnts) <- upper.tri(cnts)

# Flatten the table to long form (V1, V2, Freq) and drop the blanked cells.
subset(as.data.frame(cnts), subset = !is.na(Freq))

    V1  V2 Freq
1  ABC ABC    0
2  DEF ABC    3
3  GHI ABC    2
4  JKL ABC    3
5  MNO ABC    2
7  DEF DEF    0
8  GHI DEF    3
9  JKL DEF    1
10 MNO DEF    2
13 GHI GHI    0
14 JKL GHI    1
15 MNO GHI    1
19 JKL JKL    0
20 MNO JKL    2
25 MNO MNO    0