在 R 中将 DataFrame 转换为 Adjacency/Weights 矩阵
Convert a DataFrame into Adjacency/Weights Matrix in R
我有一个 DataFrame `df`。其中 `n` 列表示 `x` 列中每一行所包含的组数,`x` 列则包含以逗号分隔的各个组。
# Example input: `n` holds the number of groups listed in `x`, and `x` holds
# the groups themselves as a ", "-separated string.
df <- data.frame(
  n = c(2, 3, 2, 2),
  x = c("a, b", "a, c, d", "c, d", "d, b")
)
> df
n x
2 a, b
3 a, c, d
2 c, d
2 d, b
我想将这个 DataFrame 转换成一个权重矩阵,其中行名和列名是 `df$x` 中各组的唯一值,矩阵元素表示对应的两个组在 `df$x` 的同一行中同时出现的次数。
输出应如下所示:
# Expected result: a symmetric 4 x 4 co-occurrence matrix with a zero diagonal.
# Rows and columns are both labelled with the groups "a".."d".
groups <- letters[1:4]
m <- matrix(
  c(0, 1, 1, 1,
    1, 0, 0, 1,
    1, 0, 0, 2,
    1, 1, 2, 0),
  nrow = 4, ncol = 4,
  dimnames = list(groups, groups)
)
> m
a b c d
a 0 1 1 1
b 1 0 0 1
c 1 0 0 2
d 1 1 2 0
这是一个非常粗略且可能非常低效的解决方案:使用 tidyverse 进行数据整理,并使用 combinat 生成排列。
library(tidyverse)
library(combinat)
df <- data.frame(n = c(2, 3, 2, 2),
                 x = c("a, b", "a, c, d", "c, d", "d, b"))
df %>%
  ## Parse entries in x into distinct elements (one character vector per row)
  mutate(flat = str_split(x, ", ")) %>%
  ## Construct 2-element subsets of each set of elements
  mutate(pair = map(flat, combn, 2, simplify = FALSE)) %>%
  unnest(pair) %>%
  ## Construct permutations of the 2-element subsets, so that every pair is
  ## counted in both (row, col) and (col, row) and the matrix is symmetric
  mutate(perm = map(pair, permn)) %>%
  unnest(perm) %>%
  ## Parse the permutations into row and column indices
  mutate(row = map_chr(perm, 1),
         col = map_chr(perm, 2)) %>%
  ## Name the count column explicitly: the automatic name ("n" or "nn")
  ## depends on the dplyr version because `df` already has a column `n`
  count(row, col, name = "weight") %>%
  ## Long to wide representation; pairs that never co-occur (including the
  ## diagonal) are filled with 0, and columns are sorted to match the rows
  pivot_wider(names_from = col, values_from = weight,
              values_fill = 0L, names_sort = TRUE) %>%
  ## Coerce to matrix
  column_to_rownames(var = "row") %>%
  as.matrix()
使用 Base R,可以这样做:
# Split each row of df$x into its groups and sort them, so that a pair is
# always spelled the same way ("b, d") whatever its order in the input
# ("d, b"). Without the sort, such pairs match no factor level below and are
# silently dropped.
a <- lapply(strsplit(as.character(df$x), ", ", fixed = TRUE), sort)
# Sorted universe of groups, so the labels built from `b` agree with `d`.
b <- sort(unique(unlist(a)))
# Every within-row pair as an "x, y" label. Rows with fewer than two groups
# contain no pair and are skipped (combn() would fail on them).
d <- unlist(lapply(a[lengths(a) >= 2L], combn, 2, toString))
# All labels that must appear in the result: the diagonal ("a,a", ...) and
# every unordered pair of distinct groups ("a, b", ...).
pair_levels <- c(paste(b, b, sep = ","), combn(b, 2, toString))
# Count each label; labels that never occur get a count of 0.
e <- data.frame(table(factor(d, pair_levels)))
# Turn "x, y" + count into three columns (V1, V2, V3). The column classes are
# fixed so that group names are never re-interpreted (e.g. "T" as TRUE).
f <- read.table(text = do.call(paste, c(sep = ",", e)), sep = ",",
                strip.white = TRUE,
                colClasses = c("character", "character", "integer"))
# Cross-tabulate into the upper triangle, then mirror it into the lower one.
g <- xtabs(V3 ~ V1 + V2, f)
g[lower.tri(g)] <- t(g)[lower.tri(g)]
g
#>    V2
#> V1  a b c d
#>   a 0 1 1 1
#>   b 1 0 0 1
#>   c 1 0 0 2
#>   d 1 1 2 0
这是使用 data.table 的另一种可能方法:
library(data.table)

# The `[`-with-`by` call and the join below need a data.table rather than a
# plain data.frame; convert a copy so that `df` itself is left untouched.
dt <- as.data.table(df)

#generate the combis: one row per unordered pair of groups found in a row of
#`x`. Each row is sorted first, so every pair is stored in sorted order.
#`id` is the row number of `dt` the pair came from.
combis <- dt[,
  transpose(combn(sort(strsplit(as.character(x), ", ", fixed = TRUE)[[1L]]),
                  2L, simplify = FALSE)),
  by = .(id = seq_len(nrow(dt)))]
#create new rows for identical letters within a pair or any other missing combi
#(these rows get id = NA because nothing in `combis` matches them)
withDiag <- combis[CJ(c(V1, V2), c(V1, V2), unique = TRUE), on = .(V1, V2)]
#duplicate the above for lower triangular part of the matrix; the swapped
#columns are renamed so both tables line up by name as well as by position
withLowerTri <- rbindlist(list(withDiag, withDiag[, .(id, V1 = V2, V2 = V1)]))
#pivot to get weights matrix: count the non-missing ids in each cell
outDT <- dcast(withLowerTri, V1 ~ V2,
               fun.aggregate = function(x) sum(!is.na(x)), value.var = "id")
outDT
输出:
V1 a b c d
1: a 0 1 1 1
2: b 1 0 0 1
3: c 1 0 0 2
4: d 1 1 2 0
如果需要矩阵输出,那么
# Drop the label column to get the numeric part, then reuse the labels as row
# names. `[[` extracts a plain vector; unlist() on a one-column table returns
# a *named* vector ("V11", "V12", ...) whose names can leak into the dimnames.
mat <- as.matrix(outDT[, -1L])
rownames(mat) <- outDT[[1L]]
输出:
a b c d
a 0 1 1 1
b 1 0 0 1
c 1 0 0 2
d 1 1 2 0
我有一个 DataFrame `df`。其中 `n` 列表示 `x` 列中每一行所包含的组数,`x` 列则包含以逗号分隔的各个组。
# Example input: `n` holds the number of groups listed in `x`, and `x` holds
# the groups themselves as a ", "-separated string.
df <- data.frame(
  n = c(2, 3, 2, 2),
  x = c("a, b", "a, c, d", "c, d", "d, b")
)
> df
n x
2 a, b
3 a, c, d
2 c, d
2 d, b
我想将这个 DataFrame 转换成一个权重矩阵,其中行名和列名是 `df$x` 中各组的唯一值,矩阵元素表示对应的两个组在 `df$x` 的同一行中同时出现的次数。
输出应如下所示:
# Expected result: a symmetric 4 x 4 co-occurrence matrix with a zero diagonal.
# Rows and columns are both labelled with the groups "a".."d".
groups <- letters[1:4]
m <- matrix(
  c(0, 1, 1, 1,
    1, 0, 0, 1,
    1, 0, 0, 2,
    1, 1, 2, 0),
  nrow = 4, ncol = 4,
  dimnames = list(groups, groups)
)
> m
a b c d
a 0 1 1 1
b 1 0 0 1
c 1 0 0 2
d 1 1 2 0
这是一个非常粗略且可能非常低效的解决方案:使用 tidyverse 进行数据整理,并使用 combinat 生成排列。
library(tidyverse)
library(combinat)
df <- data.frame(n = c(2, 3, 2, 2),
                 x = c("a, b", "a, c, d", "c, d", "d, b"))
df %>%
  ## Parse entries in x into distinct elements (one character vector per row)
  mutate(flat = str_split(x, ", ")) %>%
  ## Construct 2-element subsets of each set of elements
  mutate(pair = map(flat, combn, 2, simplify = FALSE)) %>%
  unnest(pair) %>%
  ## Construct permutations of the 2-element subsets, so that every pair is
  ## counted in both (row, col) and (col, row) and the matrix is symmetric
  mutate(perm = map(pair, permn)) %>%
  unnest(perm) %>%
  ## Parse the permutations into row and column indices
  mutate(row = map_chr(perm, 1),
         col = map_chr(perm, 2)) %>%
  ## Name the count column explicitly: the automatic name ("n" or "nn")
  ## depends on the dplyr version because `df` already has a column `n`
  count(row, col, name = "weight") %>%
  ## Long to wide representation; pairs that never co-occur (including the
  ## diagonal) are filled with 0, and columns are sorted to match the rows
  pivot_wider(names_from = col, values_from = weight,
              values_fill = 0L, names_sort = TRUE) %>%
  ## Coerce to matrix
  column_to_rownames(var = "row") %>%
  as.matrix()
使用 Base R,可以这样做:
# Split each row of df$x into its groups and sort them, so that a pair is
# always spelled the same way ("b, d") whatever its order in the input
# ("d, b"). Without the sort, such pairs match no factor level below and are
# silently dropped.
a <- lapply(strsplit(as.character(df$x), ", ", fixed = TRUE), sort)
# Sorted universe of groups, so the labels built from `b` agree with `d`.
b <- sort(unique(unlist(a)))
# Every within-row pair as an "x, y" label. Rows with fewer than two groups
# contain no pair and are skipped (combn() would fail on them).
d <- unlist(lapply(a[lengths(a) >= 2L], combn, 2, toString))
# All labels that must appear in the result: the diagonal ("a,a", ...) and
# every unordered pair of distinct groups ("a, b", ...).
pair_levels <- c(paste(b, b, sep = ","), combn(b, 2, toString))
# Count each label; labels that never occur get a count of 0.
e <- data.frame(table(factor(d, pair_levels)))
# Turn "x, y" + count into three columns (V1, V2, V3). The column classes are
# fixed so that group names are never re-interpreted (e.g. "T" as TRUE).
f <- read.table(text = do.call(paste, c(sep = ",", e)), sep = ",",
                strip.white = TRUE,
                colClasses = c("character", "character", "integer"))
# Cross-tabulate into the upper triangle, then mirror it into the lower one.
g <- xtabs(V3 ~ V1 + V2, f)
g[lower.tri(g)] <- t(g)[lower.tri(g)]
g
#>    V2
#> V1  a b c d
#>   a 0 1 1 1
#>   b 1 0 0 1
#>   c 1 0 0 2
#>   d 1 1 2 0
这是使用 data.table 的另一种可能方法:
library(data.table)

# The `[`-with-`by` call and the join below need a data.table rather than a
# plain data.frame; convert a copy so that `df` itself is left untouched.
dt <- as.data.table(df)

#generate the combis: one row per unordered pair of groups found in a row of
#`x`. Each row is sorted first, so every pair is stored in sorted order.
#`id` is the row number of `dt` the pair came from.
combis <- dt[,
  transpose(combn(sort(strsplit(as.character(x), ", ", fixed = TRUE)[[1L]]),
                  2L, simplify = FALSE)),
  by = .(id = seq_len(nrow(dt)))]
#create new rows for identical letters within a pair or any other missing combi
#(these rows get id = NA because nothing in `combis` matches them)
withDiag <- combis[CJ(c(V1, V2), c(V1, V2), unique = TRUE), on = .(V1, V2)]
#duplicate the above for lower triangular part of the matrix; the swapped
#columns are renamed so both tables line up by name as well as by position
withLowerTri <- rbindlist(list(withDiag, withDiag[, .(id, V1 = V2, V2 = V1)]))
#pivot to get weights matrix: count the non-missing ids in each cell
outDT <- dcast(withLowerTri, V1 ~ V2,
               fun.aggregate = function(x) sum(!is.na(x)), value.var = "id")
outDT
输出:
V1 a b c d
1: a 0 1 1 1
2: b 1 0 0 1
3: c 1 0 0 2
4: d 1 1 2 0
如果需要矩阵输出,那么
# Drop the label column to get the numeric part, then reuse the labels as row
# names. `[[` extracts a plain vector; unlist() on a one-column table returns
# a *named* vector ("V11", "V12", ...) whose names can leak into the dimnames.
mat <- as.matrix(outDT[, -1L])
rownames(mat) <- outDT[[1L]]
输出:
a b c d
a 0 1 1 1
b 1 0 0 1
c 1 0 0 2
d 1 1 2 0