数据转换为邻接矩阵
Data transformation into adjacency matrix
我在将数据集转换为邻接矩阵时遇到了一些问题。数据以事件为单位,包含五个变量:事件的 ID (id)、出席组织的 ID(org1、org2、org3,每个事件最多编码三个组织)、一个二分类(0/1)的位置信息变量 (BP),以及参与者的数量 (nr_participants)。数据看起来像这样:
# A tibble: 6 × 6
id org1 org2 org3 BP nr_participants
<int> <fct> <fct> <fct> <dbl> <dbl>
1 1 15 23 0 1 0
2 2 11 11 13 1 127000
3 3 23 13 28 1 500
4 4 11 11 13 1 500
5 5 29 28 23 1 50000
6 6 31 49 0 1 0
我想从这个数据中得到两个邻接矩阵。在这两个矩阵中,行和列都是唯一的组织(共 109 个唯一值)。所以根据上面的 head 表,矩阵的结构应该是这样的:
15 23 0 11 13 28 29 31 49
15
23
0
11
13
28
29
31
49
第一种情况:单元格应该是任意一对组织共同出席的所有事件中 nr_participants 的总和。
以上面简短的 head 表为例,它应该是这样的:
第二种情况:单元格应该是任意一对组织共同出席的所有事件中 BP 为 1 的事件所占的比例。
这是一个包含 50 个观测值的数据样本:
# 50-row sample of the event data, in dput() form (a tibble: class "tbl_df").
# Columns: id (integer), org1/org2/org3 (factors), BP (numeric, 0/1 here),
# nr_participants (numeric).
# The org columns are stored as integer codes that index into each column's
# own .Label vector; the three level sets differ, and code 1L in org2/org3 is
# the level "0".
# The format.stata attribute on BP suggests a Stata import - TODO confirm.
# NOTE(review): the object is called `sample` (same name as base::sample()),
# while the outer() code further down reads a data frame called `df`.
sample <- structure(list(id = c(4262L, 5140L, 3171L, 2167L, 4618L, 1668L,
771L, 4975L, 3563L, 4014L, 5695L, 1412L, 3752L, 4165L, 5282L,
5538L, 3339L, 3555L, 945L, 1620L, 3187L, 4955L, 4436L, 4609L,
4205L, 4402L, 2156L, 3745L, 895L, 5774L, 4969L, 114L, 4600L,
4188L, 5315L, 1092L, 3726L, 1488L, 1619L, 2853L, 5298L, 3095L,
502L, 2711L, 789L, 185L, 293L, 3456L, 5605L, 2783L), org1 = structure(c(23L,
16L, 4L, 48L, 9L, 4L, 48L, 1L, 4L, 48L, 25L, 4L, 4L, 48L, 10L,
26L, 3L, 4L, 49L, 4L, 4L, 26L, 48L, 49L, 48L, 22L, 11L, 4L, 1L,
1L, 49L, 31L, 51L, 48L, 26L, 21L, 4L, 4L, 4L, 11L, 1L, 4L, 82L,
4L, 54L, 48L, 83L, 1L, 48L, 49L), .Label = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37",
"38", "39", "42", "43", "44", "46", "47", "48", "49", "50", "51",
"52", "53", "54", "55", "56", "57", "58", "59", "60", "62", "64",
"65", "67", "70", "72", "73", "75", "76", "77", "78", "79", "80",
"81", "82", "83", "84", "87", "89", "90", "91", "92", "93", "98",
"100", "102", "103", "104", "105", "107", "109", "111", "114",
"117", "120", "122"), class = "factor"), org2 = structure(c(1L,
1L, 1L, 1L, 1L, 5L, 1L, 1L, 5L, 1L, 1L, 5L, 5L, 1L, 1L, 1L, 1L,
5L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 2L, 1L, 1L, 28L,
1L, 1L, 1L, 1L, 5L, 5L, 5L, 12L, 2L, 1L, 1L, 30L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "1", "2", "3", "4", "5", "6", "7",
"8", "9", "10", "11", "12", "13", "15", "16", "17", "19", "20",
"21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31",
"32", "33", "34", "35", "36", "37", "39", "42", "44", "46", "47",
"48", "49", "50", "51", "52", "54", "56", "57", "58", "59", "61",
"64", "67", "68", "73", "81", "83", "84", "88", "89", "90", "94",
"99", "100", "104", "106", "107", "109", "117", "118", "120",
"122", "124"), class = "factor"), org3 = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 5L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 24L, 1L,
1L, 1L, 1L, 5L, 1L, 1L, 11L, 2L, 1L, 1L, 46L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("0", "1", "2", "3", "4", "6", "7", "8", "9",
"10", "11", "13", "15", "16", "17", "19", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "31", "32", "34", "35", "36", "39",
"42", "43", "46", "47", "48", "49", "50", "52", "54", "56", "57",
"58", "59", "63", "66", "68", "73", "77", "85", "107", "117",
"120", "122", "123"), class = "factor"), BP = structure(c(0,
1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
0, 1, 0, 1, 0, 0, 0), format.stata = "%9.0g"), nr_participants = c(50,
0, 0, 0, 80, 0, 40, 0, 0, 100, 100, 0, 0, 80, 50, 36, 50, 0,
587, 0, 0, 0, 200, 20, 200, 160, 5, 0, 341, 20, 1600, 250, 20,
0, 200, 0, 0, 0, 0, 500, 0, 0, 15, 0, 0, 0, 50, 382, 300, 100
)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))
如何使用上面定义的总和值和份额值填充矩阵?
下面是使用 outer 的方法:
# Build organisation-by-organisation matrices by evaluating a pairwise
# function for every pair of organisations with outer().

# `df` is the event data. If no data frame called `df` exists yet (in a fresh
# session the name resolves to the stats::df() function), fall back to the
# six-row head() example from the question so the snippet runs as-is.
# To use the 50-row sample instead, run `df <- sample` first.
if (!is.data.frame(df)) {
  df <- data.frame(
    id = 1:6,
    org1 = factor(c(15, 11, 23, 11, 29, 31)),
    org2 = factor(c(23, 11, 13, 11, 28, 49)),
    org3 = factor(c(0, 13, 28, 13, 23, 0)),
    BP = c(1, 1, 1, 1, 1, 1),
    nr_participants = c(0, 127000, 500, 500, 50000, 0)
  )
}

# Get the column numbers with 'org' in their name.
cols <- grep("org", names(df))
stopifnot(length(cols) > 0)

# Organisation IDs as a character matrix, one row per event. The org columns
# are factors with different level sets, and `==` between such factors is an
# error in R, so every comparison below is done on character values.
org_ids <- do.call(cbind, lapply(df[cols], as.character))

# All unique organisation IDs (column-major order: org1, then org2, org3).
unique_orgs <- unique(as.vector(org_ids))

# Logical mask over events: TRUE where BOTH x and y appear in the same row.
# (`&`, not `|`: an event only counts for a pair if both organisations
# attended it.)
attended_both <- function(x, y) {
  rowSums(org_ids == x) > 0 & rowSums(org_ids == y) > 0
}

# Case 1: sum of nr_participants over the events attended by both x and y.
# Returns 0 for pairs that never attended the same event.
calculate_sum <- function(x, y) {
  sum(df$nr_participants[attended_both(x, y)])
}

# Case 2: share of events with BP == 1 among the events attended by both
# x and y. Returns NaN for pairs that never attended the same event.
calculate_share <- function(x, y) {
  mean(df$BP[attended_both(x, y)] == 1)
}

# Use outer to apply the functions to every pair of unique values.
# Vectorize() is needed because outer() passes whole vectors to FUN.
mat <- outer(unique_orgs, unique_orgs, Vectorize(calculate_sum))
share_mat <- outer(unique_orgs, unique_orgs, Vectorize(calculate_share))

# Assign row and column names.
dimnames(mat) <- list(unique_orgs, unique_orgs)
dimnames(share_mat) <- list(unique_orgs, unique_orgs)

mat
# Result for the six-row example (the diagonal holds each organisation's own
# total; "0" looks like a "no organisation" code - confirm, and drop it from
# unique_orgs if so):
#   15     11    23    29 31     13    28 49 0
#15  0      0     0     0  0      0     0  0 0
#11  0 127500     0     0  0 127500     0  0 0
#23  0      0 50500 50000  0    500 50500  0 0
#29  0      0 50000 50000  0      0 50000  0 0
#31  0      0     0     0  0      0     0  0 0
#13  0 127500   500     0  0 128000   500  0 0
#28  0      0 50500 50000  0    500 50500  0 0
#49  0      0     0     0  0      0     0  0 0
#0   0      0     0     0  0      0     0  0 0
我在将数据集转换为邻接矩阵时遇到了一些问题。数据以事件为单位,包含五个变量:事件的 ID (id)、出席组织的 ID(org1、org2、org3,每个事件最多编码三个组织)、一个二分类(0/1)的位置信息变量 (BP),以及参与者的数量 (nr_participants)。数据看起来像这样:
# A tibble: 6 × 6
id org1 org2 org3 BP nr_participants
<int> <fct> <fct> <fct> <dbl> <dbl>
1 1 15 23 0 1 0
2 2 11 11 13 1 127000
3 3 23 13 28 1 500
4 4 11 11 13 1 500
5 5 29 28 23 1 50000
6 6 31 49 0 1 0
我想从这个数据中得到两个邻接矩阵。在这两个矩阵中,行和列都是唯一的组织(共 109 个唯一值)。所以根据上面的 head 表,矩阵的结构应该是这样的:
15 23 0 11 13 28 29 31 49
15
23
0
11
13
28
29
31
49
第一种情况:单元格应该是任意一对组织共同出席的所有事件中 nr_participants 的总和。
以上面简短的 head 表为例,它应该是这样的:
第二种情况:单元格应该是任意一对组织共同出席的所有事件中 BP 为 1 的事件所占的比例。
这是一个包含 50 个观测值的数据样本:
# 50-row sample of the event data, in dput() form (a tibble: class "tbl_df").
# Columns: id (integer), org1/org2/org3 (factors), BP (numeric, 0/1 here),
# nr_participants (numeric).
# The org columns are stored as integer codes that index into each column's
# own .Label vector; the three level sets differ, and code 1L in org2/org3 is
# the level "0".
# The format.stata attribute on BP suggests a Stata import - TODO confirm.
# NOTE(review): the object is called `sample` (same name as base::sample()),
# while the outer() code further down reads a data frame called `df`.
sample <- structure(list(id = c(4262L, 5140L, 3171L, 2167L, 4618L, 1668L,
771L, 4975L, 3563L, 4014L, 5695L, 1412L, 3752L, 4165L, 5282L,
5538L, 3339L, 3555L, 945L, 1620L, 3187L, 4955L, 4436L, 4609L,
4205L, 4402L, 2156L, 3745L, 895L, 5774L, 4969L, 114L, 4600L,
4188L, 5315L, 1092L, 3726L, 1488L, 1619L, 2853L, 5298L, 3095L,
502L, 2711L, 789L, 185L, 293L, 3456L, 5605L, 2783L), org1 = structure(c(23L,
16L, 4L, 48L, 9L, 4L, 48L, 1L, 4L, 48L, 25L, 4L, 4L, 48L, 10L,
26L, 3L, 4L, 49L, 4L, 4L, 26L, 48L, 49L, 48L, 22L, 11L, 4L, 1L,
1L, 49L, 31L, 51L, 48L, 26L, 21L, 4L, 4L, 4L, 11L, 1L, 4L, 82L,
4L, 54L, 48L, 83L, 1L, 48L, 49L), .Label = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37",
"38", "39", "42", "43", "44", "46", "47", "48", "49", "50", "51",
"52", "53", "54", "55", "56", "57", "58", "59", "60", "62", "64",
"65", "67", "70", "72", "73", "75", "76", "77", "78", "79", "80",
"81", "82", "83", "84", "87", "89", "90", "91", "92", "93", "98",
"100", "102", "103", "104", "105", "107", "109", "111", "114",
"117", "120", "122"), class = "factor"), org2 = structure(c(1L,
1L, 1L, 1L, 1L, 5L, 1L, 1L, 5L, 1L, 1L, 5L, 5L, 1L, 1L, 1L, 1L,
5L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 2L, 1L, 1L, 28L,
1L, 1L, 1L, 1L, 5L, 5L, 5L, 12L, 2L, 1L, 1L, 30L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "1", "2", "3", "4", "5", "6", "7",
"8", "9", "10", "11", "12", "13", "15", "16", "17", "19", "20",
"21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31",
"32", "33", "34", "35", "36", "37", "39", "42", "44", "46", "47",
"48", "49", "50", "51", "52", "54", "56", "57", "58", "59", "61",
"64", "67", "68", "73", "81", "83", "84", "88", "89", "90", "94",
"99", "100", "104", "106", "107", "109", "117", "118", "120",
"122", "124"), class = "factor"), org3 = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 5L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 24L, 1L,
1L, 1L, 1L, 5L, 1L, 1L, 11L, 2L, 1L, 1L, 46L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("0", "1", "2", "3", "4", "6", "7", "8", "9",
"10", "11", "13", "15", "16", "17", "19", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "31", "32", "34", "35", "36", "39",
"42", "43", "46", "47", "48", "49", "50", "52", "54", "56", "57",
"58", "59", "63", "66", "68", "73", "77", "85", "107", "117",
"120", "122", "123"), class = "factor"), BP = structure(c(0,
1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
0, 1, 0, 1, 0, 0, 0), format.stata = "%9.0g"), nr_participants = c(50,
0, 0, 0, 80, 0, 40, 0, 0, 100, 100, 0, 0, 80, 50, 36, 50, 0,
587, 0, 0, 0, 200, 20, 200, 160, 5, 0, 341, 20, 1600, 250, 20,
0, 200, 0, 0, 0, 0, 500, 0, 0, 15, 0, 0, 0, 50, 382, 300, 100
)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))
如何使用上面定义的总和值和份额值填充矩阵?
下面是使用 outer 的方法:
# Build organisation-by-organisation matrices by evaluating a pairwise
# function for every pair of organisations with outer().

# `df` is the event data. If no data frame called `df` exists yet (in a fresh
# session the name resolves to the stats::df() function), fall back to the
# six-row head() example from the question so the snippet runs as-is.
# To use the 50-row sample instead, run `df <- sample` first.
if (!is.data.frame(df)) {
  df <- data.frame(
    id = 1:6,
    org1 = factor(c(15, 11, 23, 11, 29, 31)),
    org2 = factor(c(23, 11, 13, 11, 28, 49)),
    org3 = factor(c(0, 13, 28, 13, 23, 0)),
    BP = c(1, 1, 1, 1, 1, 1),
    nr_participants = c(0, 127000, 500, 500, 50000, 0)
  )
}

# Get the column numbers with 'org' in their name.
cols <- grep("org", names(df))
stopifnot(length(cols) > 0)

# Organisation IDs as a character matrix, one row per event. The org columns
# are factors with different level sets, and `==` between such factors is an
# error in R, so every comparison below is done on character values.
org_ids <- do.call(cbind, lapply(df[cols], as.character))

# All unique organisation IDs (column-major order: org1, then org2, org3).
unique_orgs <- unique(as.vector(org_ids))

# Logical mask over events: TRUE where BOTH x and y appear in the same row.
# (`&`, not `|`: an event only counts for a pair if both organisations
# attended it.)
attended_both <- function(x, y) {
  rowSums(org_ids == x) > 0 & rowSums(org_ids == y) > 0
}

# Case 1: sum of nr_participants over the events attended by both x and y.
# Returns 0 for pairs that never attended the same event.
calculate_sum <- function(x, y) {
  sum(df$nr_participants[attended_both(x, y)])
}

# Case 2: share of events with BP == 1 among the events attended by both
# x and y. Returns NaN for pairs that never attended the same event.
calculate_share <- function(x, y) {
  mean(df$BP[attended_both(x, y)] == 1)
}

# Use outer to apply the functions to every pair of unique values.
# Vectorize() is needed because outer() passes whole vectors to FUN.
mat <- outer(unique_orgs, unique_orgs, Vectorize(calculate_sum))
share_mat <- outer(unique_orgs, unique_orgs, Vectorize(calculate_share))

# Assign row and column names.
dimnames(mat) <- list(unique_orgs, unique_orgs)
dimnames(share_mat) <- list(unique_orgs, unique_orgs)

mat
# Result for the six-row example (the diagonal holds each organisation's own
# total; "0" looks like a "no organisation" code - confirm, and drop it from
# unique_orgs if so):
#   15     11    23    29 31     13    28 49 0
#15  0      0     0     0  0      0     0  0 0
#11  0 127500     0     0  0 127500     0  0 0
#23  0      0 50500 50000  0    500 50500  0 0
#29  0      0 50000 50000  0      0 50000  0 0
#31  0      0     0     0  0      0     0  0 0
#13  0 127500   500     0  0 128000   500  0 0
#28  0      0 50500 50000  0    500 50500  0 0
#49  0      0     0     0  0      0     0  0 0
#0   0      0     0     0  0      0     0  0 0