从 data.frame 或 data.table 构建方形邻接矩阵
Build a square adjacency matrix from data.frame or data.table
我正在尝试从 `data.table` 构建方形邻接 `matrix`。
这是我已有的可重现示例:
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later at the first use.
library(data.table)
library(plyr)
library(reshape2)
# Build a mock data.table
# Each row is one directed edge Source -> Target; every edge appears twice.
dt <- data.table(
  Source = rep(letters[1:3], 2),
  Target = rep(letters[4:2], 2)
)
dt
# Source Target
#1: a d
#2: b c
#3: c b
#4: a d
#5: b c
#6: c b
# Count how many times each (Source, Target) pair occurs in dt
sry <- ddply(dt, .(Source,Target), summarize, Frequency=length(Source))
sry
# Source Target Frequency
#1 a d 2
#2 b c 2
#3 c b 2
# Pivot the pair counts wide: one row per Source, one column per Target,
# with 0 for pairs that never occur.
# as.matrix() is applied while the character Source column is still present,
# so every cell is coerced to character (hence the quoted values printed below).
mtx <- as.matrix(dcast(sry, Source ~ Target, value.var="Frequency", fill=0))
# Use the Source column as row names, then drop it from the matrix
rownames(mtx) <- mtx[,1]
mtx <- mtx[,2:ncol(mtx)]
mtx
# b c d
#a "0" "0" "2"
#b "0" "2" "0"
#c "2" "0" "0"
现在,这与我想要得到的非常接近,除了我希望所有节点都在两个维度上表示,例如:
a b c d
a 0 0 0 2
b 0 0 2 0
c 0 2 0 0
d 0 0 0 0
请注意,我正在处理相当大的数据,因此我想为此找到一个有效的解决方案。
感谢您的帮助。
解决方案(编辑):
鉴于所提供解决方案的质量和我的数据集的大小,我对所有解决方案进行了基准测试。
#The bench was made with a 1-million-row sample from my original dataset
library(data.table)
# Read the "^"-separated sample file and keep only columns 8 and 9 as the edges
aa <- fread("small2.csv", sep = "^")
# Spell out FALSE: T/F are ordinary variables that can be reassigned
dt <- aa[, c(8, 9), with = FALSE]
colnames(dt) <- c("Source", "Target")
dim(dt)
#[1] 1000001 2
# Every distinct node name that appears in either column
levs <- unique(unlist(dt, use.names = FALSE))
length(levs)
#[1] 2222
鉴于此数据,所需的输出是一个 2222*2222 矩阵(2222*2223 的解决方案,其中第一列包含行 names 显然也是可以接受的)。
# Ananda Mahto's first solution
# Convert both columns of `dt` to factors sharing the levels in `levs`, then
# cross-tabulate them so every node gets a row and a column.
am1 <- function() {
  edges <- dt[, lapply(.SD, factor, levels = levs)]
  table(edges)
}
dim(am1())
#[1] 2222 2222
# Ananda Mahto's second solution
# Convert both columns of `dt` to factors sharing the levels in `levs`, then
# cast to wide form, counting the rows for each (Source, Target) pair.
# drop = FALSE keeps levels that never occur. The result carries the Source
# column in front of the counts.
am2 <- function() {
  edges <- dt[, lapply(.SD, factor, levs)]
  as.matrix(dcast(
    edges, Source ~ Target,
    drop = FALSE, value.var = "Target", fun.aggregate = length
  ))
}
dim(am2())
#[1] 2222 2223
library(dplyr)
library(tidyr)
# Akrun's solution
# Same steps as the piped form, written with named intermediates:
# shared factor levels -> count per (Source, Target) -> spread Target wide.
akr <- function() {
  shared_levels <- mutate_each(dt, funs(factor(., levs)))
  pair_counts <- tally(group_by(shared_levels, Source, Target))
  spread(pair_counts, Target, n, drop = FALSE, fill = 0)
}
dim(akr())
#[1] 2222 2223
library(igraph)
# Carlos Cinelli's solution
# Build a graph from the rows of `dt` and return its adjacency matrix.
cc <- function() {
  as_adjacency_matrix(graph_from_data_frame(dt))
}
dim(cc())
#[1] 2222 2222
基准测试的结果是……
library(rbenchmark)
# Time the four candidate solutions, running each one 75 times
benchmark(am1(), am2(), akr(), cc(), replications=75)
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 am1() 75 15.939 1.000 15.636 0.280 0 0
# 2 am2() 75 111.558 6.999 109.345 1.616 0 0
# 3 akr() 75 43.786 2.747 42.463 1.134 0 0
# 4 cc() 75 46.193 2.898 45.532 0.563 0 0
听起来您只是在寻找 `table`,但您应该确保两列具有相同的因子水平:
# Pool the node names from both columns so Source and Target share one set
# of factor levels; table() then yields a row and a column for every node.
levs <- unique(unlist(dt, use.names = FALSE))
table(lapply(dt, factor, levels = levs))
# Target
# Source a b c d
# a 0 0 0 2
# b 0 0 2 0
# c 0 2 0 0
# d 0 0 0 0
我不知道它是否会提供任何速度改进,但您也可以使用 "data.table" 中的 `dcast`:
# data.table's dcast() expects a data.table rather than the bare list that
# lapply() returns, so the columns are converted to factors inside dt[...].
# drop = FALSE keeps factor levels that never occur, so every node gets a
# row and a column.
dcast(dt[, lapply(.SD, factor, levs)], Source ~ Target, drop = FALSE,
      value.var = "Target", fun.aggregate = length)
我们可以使用dplyr/tidyr
library(dplyr)
library(tidyr)
# Give both columns the same factor levels (hard-coded here to the example's
# four nodes, a-d), count each (Source, Target) pair, then spread Target wide.
# drop=FALSE keeps factor levels that never occur; fill=0 fills missing pairs.
dt %>%
mutate_each(funs(factor(., letters[1:4]))) %>%
group_by(Source, Target) %>%
tally() %>%
spread(Target, n, drop=FALSE, fill=0)
# Source a b c d
# (fctr) (dbl) (dbl) (dbl) (dbl)
#1 a 0 0 0 2
#2 b 0 0 2 0
#3 c 0 2 0 0
#4 d 0 0 0 0
您也可以使用 `igraph`。既然你说你是在处理大数据,那么 `igraph` 的优势在于它使用了稀疏矩阵:
library(igraph)
# Build a graph whose edges are the rows of dt
g <- graph_from_data_frame(dt)
# Adjacency matrix of that graph; it prints below as a sparse "dgCMatrix"
as_adjacency_matrix(g)
4 x 4 sparse Matrix of class "dgCMatrix"
a b c d
a . . . 2
b . . 2 .
c . 2 . .
d . . . .
我正在尝试从 `data.table` 构建方形邻接 `matrix`。
这是我已有的可重现示例:
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later at the first use.
library(data.table)
library(plyr)
library(reshape2)
# Build a mock data.table
# Each row is one directed edge Source -> Target; every edge appears twice.
dt <- data.table(
  Source = rep(letters[1:3], 2),
  Target = rep(letters[4:2], 2)
)
dt
# Source Target
#1: a d
#2: b c
#3: c b
#4: a d
#5: b c
#6: c b
# Count how many times each (Source, Target) pair occurs in dt
sry <- ddply(dt, .(Source,Target), summarize, Frequency=length(Source))
sry
# Source Target Frequency
#1 a d 2
#2 b c 2
#3 c b 2
# Pivot the pair counts wide: one row per Source, one column per Target,
# with 0 for pairs that never occur.
# as.matrix() is applied while the character Source column is still present,
# so every cell is coerced to character (hence the quoted values printed below).
mtx <- as.matrix(dcast(sry, Source ~ Target, value.var="Frequency", fill=0))
# Use the Source column as row names, then drop it from the matrix
rownames(mtx) <- mtx[,1]
mtx <- mtx[,2:ncol(mtx)]
mtx
# b c d
#a "0" "0" "2"
#b "0" "2" "0"
#c "2" "0" "0"
现在,这与我想要得到的非常接近,除了我希望所有节点都在两个维度上表示,例如:
a b c d
a 0 0 0 2
b 0 0 2 0
c 0 2 0 0
d 0 0 0 0
请注意,我正在处理相当大的数据,因此我想为此找到一个有效的解决方案。
感谢您的帮助。
解决方案(编辑):
鉴于所提供解决方案的质量和我的数据集的大小,我对所有解决方案进行了基准测试。
#The bench was made with a 1-million-row sample from my original dataset
library(data.table)
# Read the "^"-separated sample file and keep only columns 8 and 9 as the edges
aa <- fread("small2.csv", sep = "^")
# Spell out FALSE: T/F are ordinary variables that can be reassigned
dt <- aa[, c(8, 9), with = FALSE]
colnames(dt) <- c("Source", "Target")
dim(dt)
#[1] 1000001 2
# Every distinct node name that appears in either column
levs <- unique(unlist(dt, use.names = FALSE))
length(levs)
#[1] 2222
鉴于此数据,所需的输出是一个 2222*2222 矩阵(2222*2223 的解决方案,其中第一列包含行 names 显然也是可以接受的)。
# Ananda Mahto's first solution
# Convert both columns of `dt` to factors sharing the levels in `levs`, then
# cross-tabulate them so every node gets a row and a column.
am1 <- function() {
  edges <- dt[, lapply(.SD, factor, levels = levs)]
  table(edges)
}
dim(am1())
#[1] 2222 2222
# Ananda Mahto's second solution
# Convert both columns of `dt` to factors sharing the levels in `levs`, then
# cast to wide form, counting the rows for each (Source, Target) pair.
# drop = FALSE keeps levels that never occur. The result carries the Source
# column in front of the counts.
am2 <- function() {
  edges <- dt[, lapply(.SD, factor, levs)]
  as.matrix(dcast(
    edges, Source ~ Target,
    drop = FALSE, value.var = "Target", fun.aggregate = length
  ))
}
dim(am2())
#[1] 2222 2223
library(dplyr)
library(tidyr)
# Akrun's solution
# Same steps as the piped form, written with named intermediates:
# shared factor levels -> count per (Source, Target) -> spread Target wide.
akr <- function() {
  shared_levels <- mutate_each(dt, funs(factor(., levs)))
  pair_counts <- tally(group_by(shared_levels, Source, Target))
  spread(pair_counts, Target, n, drop = FALSE, fill = 0)
}
dim(akr())
#[1] 2222 2223
library(igraph)
# Carlos Cinelli's solution
# Build a graph from the rows of `dt` and return its adjacency matrix.
cc <- function() {
  as_adjacency_matrix(graph_from_data_frame(dt))
}
dim(cc())
#[1] 2222 2222
基准测试的结果是……
library(rbenchmark)
# Time the four candidate solutions, running each one 75 times
benchmark(am1(), am2(), akr(), cc(), replications=75)
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 am1() 75 15.939 1.000 15.636 0.280 0 0
# 2 am2() 75 111.558 6.999 109.345 1.616 0 0
# 3 akr() 75 43.786 2.747 42.463 1.134 0 0
# 4 cc() 75 46.193 2.898 45.532 0.563 0 0
听起来您只是在寻找 `table`,但您应该确保两列具有相同的因子水平:
# Pool the node names from both columns so Source and Target share one set
# of factor levels; table() then yields a row and a column for every node.
levs <- unique(unlist(dt, use.names = FALSE))
table(lapply(dt, factor, levels = levs))
# Target
# Source a b c d
# a 0 0 0 2
# b 0 0 2 0
# c 0 2 0 0
# d 0 0 0 0
我不知道它是否会提供任何速度改进,但您也可以使用 "data.table" 中的 `dcast`:
# data.table's dcast() expects a data.table rather than the bare list that
# lapply() returns, so the columns are converted to factors inside dt[...].
# drop = FALSE keeps factor levels that never occur, so every node gets a
# row and a column.
dcast(dt[, lapply(.SD, factor, levs)], Source ~ Target, drop = FALSE,
      value.var = "Target", fun.aggregate = length)
我们可以使用dplyr/tidyr
library(dplyr)
library(tidyr)
# Give both columns the same factor levels (hard-coded here to the example's
# four nodes, a-d), count each (Source, Target) pair, then spread Target wide.
# drop=FALSE keeps factor levels that never occur; fill=0 fills missing pairs.
dt %>%
mutate_each(funs(factor(., letters[1:4]))) %>%
group_by(Source, Target) %>%
tally() %>%
spread(Target, n, drop=FALSE, fill=0)
# Source a b c d
# (fctr) (dbl) (dbl) (dbl) (dbl)
#1 a 0 0 0 2
#2 b 0 0 2 0
#3 c 0 2 0 0
#4 d 0 0 0 0
您也可以使用 `igraph`。既然你说你是在处理大数据,那么 `igraph` 的优势在于它使用了稀疏矩阵:
library(igraph)
# Build a graph whose edges are the rows of dt
g <- graph_from_data_frame(dt)
# Adjacency matrix of that graph; it prints below as a sparse "dgCMatrix"
as_adjacency_matrix(g)
4 x 4 sparse Matrix of class "dgCMatrix"
a b c d
a . . . 2
b . . 2 .
c . 2 . .
d . . . .