如何从R中的同一数据框中按行成对查找公共元素

How to find common element pairwise by rows from the same data frame in R

我有一个数据框。可以设想它的一个小样本如下:

    V1        V2        V3        V4..........
0  a2er56    b34er12   aer234     NA
1  2fr34t    a2er56    aer234        NA
2  b34er12   2fr34t     NA        NA
.
.
.

共有1193行173列。

我想对每一对行找出它们的公共元素,并且需要对所有行对都执行这一操作。例如,对于第 0 行,需要把它与其余所有行(共 1193 行)逐一比较,找出两行在各列中共有的条目。所有可能的行对都应如此处理。

成对是指 (0,1), (0,2), (0,3), ..., (1,2), (1,3), ...,即所有可能的行对。至于公共元素,我的意思是:

如果我们在小样本数据集中看到 (0,2) 对,我们可以看到 b34er12 存在于两者中,因此我想要一个包含三列的数据框,其中包含:0、2、b34er12。

同样对于 (1,2) 对,公共元素是 2fr34t。

我想要这样的数据框:

name1    name2   common_element1  common_element2......
 0        1       a2er56               aer234
 0        2       b34er12
 1        2       2fr34t

下面是我的数据框中两行(第 2 行和第 19 行)的 dput 输出,这两行有一个公共元素 (20a8ed50-e592-70e4-aecd-ec17f1723495):

 structure(list(V1 = list(`2` = "20a8ed50-e592-70e4-aecd-ec17f1723495", 
`19` = "5e2406fe-ddb4-a797-508e-2aff5baae0fc"), V2 = list(
`2` = NA_character_, `19` = "f8bd691e-ab7f-3718-c1d2-1361dab00bcd"), 
V3 = list(`2` = NA_character_, `19` = "b76c2ff6-87f0-2cfb-ac8c-e4adcb580c9e"), 
V4 = list(`2` = NA_character_, `19` = "ce09b782-92f6-245c-70dd-886bdd1d5590"), 
V5 = list(`2` = NA_character_, `19` = "8f305f42-6ad3-2a5d-8910-e2c9415a8a7f"), 
V6 = list(`2` = NA_character_, `19` = "409d5592-8bac-5814-6e41-6b239fac65b8"), 
V7 = list(`2` = NA_character_, `19` = "3a12ffe6-69a8-9bf6-0ed6-bd9587f4ae2f"), 
V8 = list(`2` = NA_character_, `19` = "a1b39b52-e345-0fbe-3beb-156a7ffb9894"), 
V9 = list(`2` = NA_character_, `19` = "a0d0c78b-2a73-aff5-582e-7e8dcaa96058"), 
V10 = list(`2` = NA_character_, `19` = "74b70d92-29cc-7b6e-58e9-515998b434f4"), 
V11 = list(`2` = NA_character_, `19` = "72cd80a5-ae3d-496b-f1fd-e3722c2b504e"), 
V12 = list(`2` = NA_character_, `19` = "eac9b054-e49d-109c-7c70-37473b2c998b"), 
V13 = list(`2` = NA_character_, `19` = "331c1a13-1723-d521-42b9-dc4b4f3ece45"), 
V14 = list(`2` = NA_character_, `19` = "45f73feb-6f8c-2ecb-2fab-cd38362e4b7a"), 
V15 = list(`2` = NA_character_, `19` = "f10dc385-9f8e-a85e-1962-5b63976f40fa"), 
V16 = list(`2` = NA_character_, `19` = "54dfb458-669b-49e9-1f09-4543bd2f49b7"), 
V17 = list(`2` = NA_character_, `19` = "a04a9177-e309-b0b0-cc60-109c61a4caf1"), 
V18 = list(`2` = NA_character_, `19` = "a5551ec1-1c06-8727-2046-f3c51854decf"), 
V19 = list(`2` = NA_character_, `19` = "acffb1a5-ddda-460d-a6fb-99d3cdb67aca"), 
V20 = list(`2` = NA_character_, `19` = "16063f02-0f0c-e2f7-a433-1fb062229801"), 
V21 = list(`2` = NA_character_, `19` = "1ec467d7-320a-09dd-6fe2-6cb42023d4e9"), 
V22 = list(`2` = NA_character_, `19` = "2882a616-e376-e72b-e10f-6adc937bfb17"), 
V23 = list(`2` = NA_character_, `19` = "53380cb1-297a-eaa3-f319-9b5184d5cad2"), 
V24 = list(`2` = NA_character_, `19` = "20a8ed50-e592-70e4-aecd-ec17f1723495"), 
V25 = list(`2` = NA_character_, `19` = "23437f56-d286-bba3-a7ed-e5173acb785c"), 
V26 = list(`2` = NA_character_, `19` = "24221c40-7a63-e932-3a82-6daa305ff7a4"), 
V27 = list(`2` = NA_character_, `19` = "7bc9216e-a9a5-3c43-bd3d-3bf9680a0799"), 
V28 = list(`2` = NA_character_, `19` = "40880c67-e4b4-95eb-bf7b-58f07dcd93ca"), 
V29 = list(`2` = NA_character_, `19` = "03c5eef8-5442-2f5d-7c33-44c2d59aeb4e"), 
V30 = list(`2` = NA_character_, `19` = "24d95b8a-42ed-fa15-13b1-53892d0339fe"), 
V31 = list(`2` = NA_character_, `19` = "a08a6b2b-577f-9cdd-3fec-992b54f4a0bb"), 
V32 = list(`2` = NA_character_, `19` = "6d65b9cc-fe48-fc94-404b-6cbc300f044e"), 
V33 = list(`2` = NA_character_, `19` = "fd5a9a7a-e07b-6b0a-a01c-c71b35940330"), 
V34 = list(`2` = NA_character_, `19` = "d2e09e88-5ff4-753c-7554-28fad0fcd63f"), 
V35 = list(`2` = NA_character_, `19` = "c0645d9e-eb24-eeff-e7ff-28f839b90c37"), 
V36 = list(`2` = NA_character_, `19` = "a093a047-9d04-0a32-5c42-bb2b5b58b1c5"), 
V37 = list(`2` = NA_character_, `19` = "09199dbf-1f96-5cc0-6cdb-96f2802e7487"), 
V38 = list(`2` = NA_character_, `19` = "338ed72b-5b6a-ecbe-7bf4-8449d452cefc"), 
V39 = list(`2` = NA_character_, `19` = "deb26b9c-b8a2-9a02-d8d3-1805e522e4d5"), 
V40 = list(`2` = NA_character_, `19` = "9d99226f-e484-dddd-7d1c-a9f1803fc21d"), 
V41 = list(`2` = NA_character_, `19` = "a340124f-a029-ee62-683a-152bef28db6b"), 
V42 = list(`2` = NA_character_, `19` = "2c147f88-a297-4932-4b1e-9caffb1982d4"), 
V43 = list(`2` = NA_character_, `19` = "03f230ba-ed9d-db27-71ba-b0ebe57a7827"), 
V44 = list(`2` = NA_character_, `19` = "a8c7b55d-431f-9876-e839-623912250da6"), 
V45 = list(`2` = NA_character_, `19` = "7c7e6f90-ad68-8fca-cbad-91cd2ba6110d"), 
V46 = list(`2` = NA_character_, `19` = "577b02ae-7b6f-676b-5e03-aabb11e8bf04"), 
V47 = list(`2` = NA_character_, `19` = "bf2f74db-223a-380b-531e-2810df300b15"), 
V48 = list(`2` = NA_character_, `19` = "a92967ec-2219-3a0e-df0a-419ce07e5bd8"), 
V49 = list(`2` = NA_character_, `19` = "9f1eb31e-efc0-6b92-e2f1-8df231329752"), 
V50 = list(`2` = NA_character_, `19` = "7a909135-ca3f-c392-8aa3-693382647029"), 
V51 = list(`2` = NA_character_, `19` = "869fd4d7-1670-62b5-dbf3-2f8ea99a52dc"), 
V52 = list(`2` = NA_character_, `19` = "718a995a-5281-9d08-a916-fa37b541cbd1"), 
V53 = list(`2` = NA_character_, `19` = "be9dec6a-d2fb-60dc-a7d3-9013b4fe92b7"), 
V54 = list(`2` = NA_character_, `19` = "1304f539-4a82-0e17-0623-aa84a5fea370"), 
V55 = list(`2` = NA_character_, `19` = "f8bfb612-9b95-df4e-7f89-022528e43f5f"), 
V56 = list(`2` = NA_character_, `19` = "11a3e1e9-de5e-ac6e-d198-a7cceb5cac3c"), 
V57 = list(`2` = NA_character_, `19` = "dccd545b-5e83-bd67-158e-1939bae072f5"), 
V58 = list(`2` = NA_character_, `19` = "fd6e2a94-5a3b-2969-744d-efa09d5257a8"), 
V59 = list(`2` = NA_character_, `19` = "ff437f85-db94-12d0-7621-e0a1fe217eb6"), 
V60 = list(`2` = NA_character_, `19` = "2107b5c3-194b-c3ec-8ae2-93c7214e90da"), 
V61 = list(`2` = NA_character_, `19` = "acf5a19e-7292-dc1e-8b68-d4d629843935"), 
V62 = list(`2` = NA_character_, `19` = "a14c6399-a7bb-00e8-043a-a529104dd866"), 
V63 = list(`2` = NA_character_, `19` = "fbce294b-4dbc-fbd3-5caf-7e66a25d2a2c"), 
V64 = list(`2` = NA_character_, `19` = "29e32b77-49be-a503-96d3-ce32b04f5d25"), 
V65 = list(`2` = NA_character_, `19` = "1218d219-7778-e30d-1e05-95406ac7520f"), 
V66 = list(`2` = NA_character_, `19` = "8f4819b8-1ac0-4d47-62d2-a10f9eeac5af"), 
V67 = list(`2` = NA_character_, `19` = "0c37cbe5-6922-bb53-1611-2926ceccb776"), 
V68 = list(`2` = NA_character_, `19` = "d26fa4f0-1918-097c-6176-e4fe4ee2e7cb"), 
V69 = list(`2` = NA_character_, `19` = "4f2d0023-2a04-9fa5-49ec-27da08a4567a"), 
V70 = list(`2` = NA_character_, `19` = "f7cb205c-1885-3bb6-5963-0d4eb90cf763"), 
V71 = list(`2` = NA_character_, `19` = "825843e2-8536-41c3-5df2-0363c8c87dfa"), 
V72 = list(`2` = NA_character_, `19` = "c596f761-a75b-10e1-1bfc-fda0b247fc05"), 
V73 = list(`2` = NA_character_, `19` = "73f09340-4b13-5716-8496-d412796ec7fd"), 
V74 = list(`2` = NA_character_, `19` = "7028ac54-d368-28d1-26bd-7abe3ac0c656"), 
V75 = list(`2` = NA_character_, `19` = "5cddb8c8-c5b5-af9d-f503-e0a8f0cf9c3f"), 
V76 = list(`2` = NA_character_, `19` = "71df6470-5429-6f55-ba12-cfff63849cce"), 
V77 = list(`2` = NA_character_, `19` = "870aa18e-f4c5-9a90-afe1-99b3c2fe8534"), 
V78 = list(`2` = NA_character_, `19` = "52c163c5-ac34-4c93-4aba-368fd00808b0"), 
V79 = list(`2` = NA_character_, `19` = "313f157b-60d6-b82c-c6fa-028d4a226e94"), 
V80 = list(`2` = NA_character_, `19` = "ce5c86af-8e37-c2cb-9d44-db1e3687b6b4"), 
V81 = list(`2` = NA_character_, `19` = "516494df-f44b-a4bd-ce35-cae9f2102ca6"), 
V82 = list(`2` = NA_character_, `19` = "fb423fb0-bc21-3556-386f-9ff05722166c"), 
V83 = list(`2` = NA_character_, `19` = "0e36a888-38a0-54be-6a27-366a5e17fc9a"), 
V84 = list(`2` = NA_character_, `19` = "8d2f177e-a485-d865-a220-09d42aa4afc4"), 
V85 = list(`2` = NA_character_, `19` = "605924f4-aa86-54af-ae01-782c1fd9eb45"), 
V86 = list(`2` = NA_character_, `19` = "ddd7cf47-66cd-4706-9df1-b0aa05ec594d"), 
V87 = list(`2` = NA_character_, `19` = "cfd1d1dc-2fa4-c7a3-f6fa-e93083d33386"), 
V88 = list(`2` = NA_character_, `19` = "4a1626e6-5137-87f4-8c23-4ec92c29252d"), 
V89 = list(`2` = NA_character_, `19` = "c46e93c7-f9fc-220b-705b-e9adcc95c3ac"), 
V90 = list(`2` = NA_character_, `19` = "b4cdd55e-b56a-5b38-8539-3f09857d242b"), 
V91 = list(`2` = NA_character_, `19` = "c9953591-82e2-461d-275b-6bf5e84c902c"), 
V92 = list(`2` = NA_character_, `19` = "03392f1f-93dd-7684-67a4-821d875b209e"), 
V93 = list(`2` = NA_character_, `19` = "b11f4d5e-1af0-b0b7-c2de-e71afe1ba4f7"), 
V94 = list(`2` = NA_character_, `19` = "7c06ac0b-8662-595d-b05c-e8a6d14ae347"), 
V95 = list(`2` = NA_character_, `19` = "0bb57f8b-6d9b-9c8a-c8ad-bf1ee2e64843"), 
V96 = list(`2` = NA_character_, `19` = "907b69bc-fd0b-4ec5-e912-b24219bb45a2"), 
V97 = list(`2` = NA_character_, `19` = "2373dc5d-c24d-be1a-dba4-04de497bef48"), 
V98 = list(`2` = NA_character_, `19` = "e3829233-bf93-691e-910e-5251876b63ff"), 
V99 = list(`2` = NA_character_, `19` = "81bf5b0f-d69f-3e3f-189a-4b6b5fa508c8"), 
V100 = list(`2` = NA_character_, `19` = "1843bd54-78d5-6768-7bc4-c83dc51a3c33"), 
V101 = list(`2` = NA_character_, `19` = "b2e019ff-6364-9de8-a9d5-2fb683393f66"), 
V102 = list(`2` = NA_character_, `19` = "f84bc0b5-dba4-3208-2eaf-2f2c0e3c5207"), 
V103 = list(`2` = NA_character_, `19` = "bd8245a7-28d3-b2f1-ad24-433c224147a6"), 
V104 = list(`2` = NA_character_, `19` = "d06e992f-4212-b12e-58a7-d5cc8cbf1433"), 
V105 = list(`2` = NA_character_, `19` = NA_character_), V106 = list(
    `2` = NA_character_, `19` = NA_character_), V107 = list(
    `2` = NA_character_, `19` = NA_character_), V108 = list(
    `2` = NA_character_, `19` = NA_character_), V109 = list(
    `2` = NA_character_, `19` = NA_character_), V110 = list(
    `2` = NA_character_, `19` = NA_character_), V111 = list(
    `2` = NA_character_, `19` = NA_character_), V112 = list(
    `2` = NA_character_, `19` = NA_character_), V113 = list(
    `2` = NA_character_, `19` = NA_character_), V114 = list(
    `2` = NA_character_, `19` = NA_character_), V115 = list(
    `2` = NA_character_, `19` = NA_character_), V116 = list(
    `2` = NA_character_, `19` = NA_character_), V117 = list(
    `2` = NA_character_, `19` = NA_character_), V118 = list(
    `2` = NA_character_, `19` = NA_character_), V119 = list(
    `2` = NA_character_, `19` = NA_character_), V120 = list(
    `2` = NA_character_, `19` = NA_character_), V121 = list(
    `2` = NA_character_, `19` = NA_character_), V122 = list(
    `2` = NA_character_, `19` = NA_character_), V123 = list(
    `2` = NA_character_, `19` = NA_character_), V124 = list(
    `2` = NA_character_, `19` = NA_character_), V125 = list(
    `2` = NA_character_, `19` = NA_character_), V126 = list(
    `2` = NA_character_, `19` = NA_character_), V127 = list(
    `2` = NA_character_, `19` = NA_character_), V128 = list(
    `2` = NA_character_, `19` = NA_character_), V129 = list(
    `2` = NA_character_, `19` = NA_character_), V130 = list(
    `2` = NA_character_, `19` = NA_character_), V131 = list(
    `2` = NA_character_, `19` = NA_character_), V132 = list(
    `2` = NA_character_, `19` = NA_character_), V133 = list(
    `2` = NA_character_, `19` = NA_character_), V134 = list(
    `2` = NA_character_, `19` = NA_character_), V135 = list(
    `2` = NA_character_, `19` = NA_character_), V136 = list(
    `2` = NA_character_, `19` = NA_character_), V137 = list(
    `2` = NA_character_, `19` = NA_character_), V138 = list(
    `2` = NA_character_, `19` = NA_character_), V139 = list(
    `2` = NA_character_, `19` = NA_character_), V140 = list(
    `2` = NA_character_, `19` = NA_character_), V141 = list(
    `2` = NA_character_, `19` = NA_character_), V142 = list(
    `2` = NA_character_, `19` = NA_character_), V143 = list(
    `2` = NA_character_, `19` = NA_character_), V144 = list(
    `2` = NA_character_, `19` = NA_character_), V145 = list(
    `2` = NA_character_, `19` = NA_character_), V146 = list(
    `2` = NA_character_, `19` = NA_character_), V147 = list(
    `2` = NA_character_, `19` = NA_character_), V148 = list(
    `2` = NA_character_, `19` = NA_character_), V149 = list(
    `2` = NA_character_, `19` = NA_character_), V150 = list(
    `2` = NA_character_, `19` = NA_character_), V151 = list(
    `2` = NA_character_, `19` = NA_character_), V152 = list(
    `2` = NA_character_, `19` = NA_character_), V153 = list(
    `2` = NA_character_, `19` = NA_character_), V154 = list(
    `2` = NA_character_, `19` = NA_character_), V155 = list(
    `2` = NA_character_, `19` = NA_character_), V156 = list(
    `2` = NA_character_, `19` = NA_character_), V157 = list(
    `2` = NA_character_, `19` = NA_character_), V158 = list(
    `2` = NA_character_, `19` = NA_character_), V159 = list(
    `2` = NA_character_, `19` = NA_character_), V160 = list(
    `2` = NA_character_, `19` = NA_character_), V161 = list(
    `2` = NA_character_, `19` = NA_character_), V162 = list(
    `2` = NA_character_, `19` = NA_character_), V163 = list(
    `2` = NA_character_, `19` = NA_character_), V164 = list(
    `2` = NA_character_, `19` = NA_character_), V165 = list(
    `2` = NA_character_, `19` = NA_character_), V166 = list(
    `2` = NA_character_, `19` = NA_character_), V167 = list(
    `2` = NA_character_, `19` = NA_character_), V168 = list(
    `2` = NA_character_, `19` = NA_character_), V169 = list(
    `2` = NA_character_, `19` = NA_character_), V170 = list(
    `2` = NA_character_, `19` = NA_character_), V171 = list(
    `2` = NA_character_, `19` = NA_character_), V172 = list(
    `2` = NA_character_, `19` = NA_character_), V173 = list(
    `2` = NA_character_, `19` = NA_character_)), row.names = c("2", 
    "19"), class = "data.frame")

诀窍是把数据转换为包含所有 (name, element) 对的长表:

先转换为包含 name、variable、element 三列的长表:

library(dplyr)
library(tidyr)

# Reshape the wide table into one row per (name, variable, element) triple.
# `name` is the 1-based row position assigned by row_number(), not the
# original row names of the data frame.
# The result is stored as `longdata` because the next step reads `longdata`.
# Note that you can also use the native pipe |> on R >= 4.1
# NOTE(review): the dput sample shows list-columns; drop_na() may not remove
# NA values wrapped inside list elements - confirm the columns are plain
# character vectors before running this.
longdata <- data %>%
  # Add index to data
  mutate(name = row_number()) %>%
  # Convert from wide to long
  pivot_longer(!name, names_to = 'variable', values_to = 'element') %>%
  # Remove any NAs
  drop_na()

这给出了如下所示的输出:

  name variable element
1    1     col1      a1
2    2     col1      c2
3    3     col1      a3
4    4     col1      d4
5    1     col2      c2
6    2     col2      v5
7    3     col2      d4
8    4     col2      x6

然后我们可以通过按 element 分组并过滤重复项来找到所有重复项:

# Keep only the (name, element) rows whose element occurs in more than one
# row of the original table.
dups <- longdata %>%
  select(-variable) %>%
  # An element repeated within a single row must count once; otherwise it is
  # reported as a duplicate of itself and later yields repeated pairs.
  distinct() %>%
  group_by(element) %>%
  filter(n() > 1) %>%
  # Drop the grouping so later steps operate on a plain, ungrouped table.
  ungroup()

然后可以按 element 对 dups 做自身内连接来找到所有对。连接会匹配所有条目,因此必须过滤掉名称相同的连接(例如 (name1, name2) = (1,1)),以及顺序颠倒的重复对(例如 (2,1) 与 (1,2)):

# Self-join `dups` on the shared element to enumerate every pair of rows that
# have that element in common.
# Keeping only name.x < name.y drops self-matches such as (1, 1) and the
# mirrored copy of each pair such as (2, 1).
# NOTE(review): dplyr >= 1.1 may warn about a many-to-many relationship here;
# passing relationship = "many-to-many" silences it - confirm dplyr version.
inner_join(dups, dups, by = 'element') %>%
  filter(name.y > name.x) %>%
  select(name1 = name.x, name2 = name.y, element)