使用 R 创建关系矩阵
Creating relational matrices with R
我的数据框记录了各个项目、参与项目的个人,以及项目执行的年份。
我该如何为每一年创建一个 n x n 的关系矩阵(n 是个人的数量),用来统计个人之间的合作次数?
考虑以下重现所需结构的示例:
# Example data frame: 50 participation records (project, year, id).
# Each of the 10 projects is tied to exactly one year; people are then
# attached to randomly drawn projects, so repeated rows are possible.
set.seed(1)
project_year <- cbind(
  paste0("project", 1:10),
  sample(2005:2010, 10, replace = TRUE)
)
# Draw 50 project rows with replacement; the year travels with its project.
project_year <- project_year[sample(1:10, 50, replace = TRUE), ]
person <- sample(paste0("id", 1:10), 50, replace = TRUE)
df <- as.data.frame(cbind(project_year, person))
rm(project_year, person)
names(df) <- c("project", "year", "id")
df <- df[order(df$project, df$id), ]
df[1:10, ]
# project year id
# project1 2006 id1
# project1 2006 id3
# project1 2006 id5
# project1 2006 id5
# project4 2006 id3
# project4 2006 id4
# project5 2006 id3
# project5 2006 id4
# project6 2008 id2
# project6 2008 id3
例如,2006 年的关系矩阵如下所示
id1 id2 id3 id4 id5
id1 0 0 1 0 1
id2 0 0 0 0 0
id3 1 0 0 2 1
id4 0 0 2 0 0
id5 1 0 1 0 0
# link between 1 and 3, 1 and 5, 3 and 5 on project 1
# links between 3 and 4 on project 4 and project 5
# the matrix is symmetric
# the diagonal is 0 because an individual cannot collaborate with himself
我稍微修改了您的采样代码,使项目的数量与 id 的数量不同,这样可以通过矩阵的维度来确认得到的确实是 n x n 矩阵。
下面是可以运行的代码:
# Reproducible example data: 5 projects (each tied to one year) and people
# drawn from id1..id10, giving 20 participation records.
set.seed(1)
tp <- cbind(paste0("project", 1:5), sample(2008:2010, 5, replace = TRUE))
tp <- tp[sample(1:5, 20, replace = TRUE), ]
id <- sample(paste0("id", 1:10), 20, replace = TRUE)
df <- as.data.frame(cbind(tp, id))
rm(tp, id)
names(df) <- c("project", "year", "id")
df <- df[order(df$project, df$id), ]

# One fixed id set for every year, so each yearly matrix is n x n over the
# same individuals even when a person has no project in that year.
all_ids <- sort(unique(as.character(df$id)))

spl <- split(df, df$year)
net <- lapply(spl, function(x) {
  # Person-by-project incidence table for this year.
  m <- table(factor(x$id, levels = all_ids), as.character(x$project))
  # A repeated (id, project) row is still one participation; cap at 1 so
  # duplicates do not inflate the collaboration counts.
  m[m > 1L] <- 1L
  # Entry [i, j] = number of projects persons i and j share in this year.
  res <- tcrossprod(m)  # equivalently: m %*% t(m)
  diag(res) <- 0  # nobody collaborates with themselves
  res
})
net
拆分数据:
$`2008`
project year id
5 project1 2008 id4
7 project1 2008 id6
19 project1 2008 id6
2 project5 2008 id1
13 project5 2008 id2
1 project5 2008 id4
16 project5 2008 id9
$`2009`
project year id
9 project2 2009 id2
6 project2 2009 id5
20 project2 2009 id6
17 project2 2009 id7
14 project2 2009 id8
11 project3 2009 id7
$`2010`
project year id
3 project4 2010 id4
8 project4 2010 id5
15 project4 2010 id5
12 project4 2010 id8
18 project4 2010 id8
4 project4 2010 id9
10 project4 2010 id9
每年项目的邻接矩阵:
$`2008`
id1 id2 id4 id5 id6 id7 id8 id9
id1 0 1 1 0 0 0 0 1
id2 1 0 1 0 0 0 0 1
id4 1 1 0 0 1 0 0 1
id5 0 0 0 0 0 0 0 0
id6 0 0 1 0 0 0 0 0
id7 0 0 0 0 0 0 0 0
id8 0 0 0 0 0 0 0 0
id9 1 1 1 0 0 0 0 0
$`2009`
id1 id2 id4 id5 id6 id7 id8 id9
id1 0 0 0 0 0 0 0 0
id2 0 0 0 1 1 1 1 0
id4 0 0 0 0 0 0 0 0
id5 0 1 0 0 1 1 1 0
id6 0 1 0 1 0 1 1 0
id7 0 1 0 1 1 0 1 0
id8 0 1 0 1 1 1 0 0
id9 0 0 0 0 0 0 0 0
$`2010`
id1 id2 id4 id5 id6 id7 id8 id9
id1 0 0 0 0 0 0 0 0
id2 0 0 0 0 0 0 0 0
id4 0 0 0 1 0 0 1 1
id5 0 0 1 0 0 0 1 1
id6 0 0 0 0 0 0 0 0
id7 0 0 0 0 0 0 0 0
id8 0 0 1 1 0 0 0 1
id9 0 0 1 1 0 0 1 0
您也可以将 dplyr 与 tidyr 一起使用:
library(dplyr)
library(tidyr)

# Builds, for every year, the person-by-person count of shared projects and
# returns all years stacked in one table: columns `year`, `id`, then one
# count column per person.
df %>%
  distinct() %>%                 # repeated (project, year, id) rows count once
  mutate(val = 1) %>%
  spread(id, val, fill = 0) %>%  # one row per project, one 0/1 column per person
  select(-project) %>%
  group_by(year) %>%
  do({
    mat <- as.matrix(select(., -year))
    # t(mat) %*% mat: number of projects each pair of people has in common.
    cp <- crossprod(mat)
    diag(cp) <- 0  # ignore self-pairs
    # Base-R replacement for the deprecated dplyr::add_rownames(): move the
    # row names into an explicit `id` column placed first.
    data.frame(
      id = rownames(cp), cp,
      check.names = FALSE, row.names = NULL, stringsAsFactors = FALSE
    )
  }) %>%
  ungroup()
我的数据框记录了各个项目、参与项目的个人,以及项目执行的年份。
我该如何为每一年创建一个 n x n 的关系矩阵(n 是个人的数量),用来统计个人之间的合作次数?
考虑以下重现所需结构的示例:
# Example data frame: 50 participation records (project, year, id).
# Each of the 10 projects is tied to exactly one year; people are then
# attached to randomly drawn projects, so repeated rows are possible.
set.seed(1)
project_year <- cbind(
  paste0("project", 1:10),
  sample(2005:2010, 10, replace = TRUE)
)
# Draw 50 project rows with replacement; the year travels with its project.
project_year <- project_year[sample(1:10, 50, replace = TRUE), ]
person <- sample(paste0("id", 1:10), 50, replace = TRUE)
df <- as.data.frame(cbind(project_year, person))
rm(project_year, person)
names(df) <- c("project", "year", "id")
df <- df[order(df$project, df$id), ]
df[1:10, ]
# project year id
# project1 2006 id1
# project1 2006 id3
# project1 2006 id5
# project1 2006 id5
# project4 2006 id3
# project4 2006 id4
# project5 2006 id3
# project5 2006 id4
# project6 2008 id2
# project6 2008 id3
例如,2006 年的关系矩阵如下所示
id1 id2 id3 id4 id5
id1 0 0 1 0 1
id2 0 0 0 0 0
id3 1 0 0 2 1
id4 0 0 2 0 0
id5 1 0 1 0 0
# link between 1 and 3, 1 and 5, 3 and 5 on project 1
# links between 3 and 4 on project 4 and project 5
# the matrix is symmetric
# the diagonal is 0 because an individual cannot collaborate with himself
我稍微修改了您的采样代码,使项目的数量与 id 的数量不同,这样可以通过矩阵的维度来确认得到的确实是 n x n 矩阵。
下面是可以运行的代码:
# Reproducible example data: 5 projects (each tied to one year) and people
# drawn from id1..id10, giving 20 participation records.
set.seed(1)
tp <- cbind(paste0("project", 1:5), sample(2008:2010, 5, replace = TRUE))
tp <- tp[sample(1:5, 20, replace = TRUE), ]
id <- sample(paste0("id", 1:10), 20, replace = TRUE)
df <- as.data.frame(cbind(tp, id))
rm(tp, id)
names(df) <- c("project", "year", "id")
df <- df[order(df$project, df$id), ]

# One fixed id set for every year, so each yearly matrix is n x n over the
# same individuals even when a person has no project in that year.
all_ids <- sort(unique(as.character(df$id)))

spl <- split(df, df$year)
net <- lapply(spl, function(x) {
  # Person-by-project incidence table for this year.
  m <- table(factor(x$id, levels = all_ids), as.character(x$project))
  # A repeated (id, project) row is still one participation; cap at 1 so
  # duplicates do not inflate the collaboration counts.
  m[m > 1L] <- 1L
  # Entry [i, j] = number of projects persons i and j share in this year.
  res <- tcrossprod(m)  # equivalently: m %*% t(m)
  diag(res) <- 0  # nobody collaborates with themselves
  res
})
net
拆分数据:
$`2008`
project year id
5 project1 2008 id4
7 project1 2008 id6
19 project1 2008 id6
2 project5 2008 id1
13 project5 2008 id2
1 project5 2008 id4
16 project5 2008 id9
$`2009`
project year id
9 project2 2009 id2
6 project2 2009 id5
20 project2 2009 id6
17 project2 2009 id7
14 project2 2009 id8
11 project3 2009 id7
$`2010`
project year id
3 project4 2010 id4
8 project4 2010 id5
15 project4 2010 id5
12 project4 2010 id8
18 project4 2010 id8
4 project4 2010 id9
10 project4 2010 id9
每年项目的邻接矩阵:
$`2008`
id1 id2 id4 id5 id6 id7 id8 id9
id1 0 1 1 0 0 0 0 1
id2 1 0 1 0 0 0 0 1
id4 1 1 0 0 1 0 0 1
id5 0 0 0 0 0 0 0 0
id6 0 0 1 0 0 0 0 0
id7 0 0 0 0 0 0 0 0
id8 0 0 0 0 0 0 0 0
id9 1 1 1 0 0 0 0 0
$`2009`
id1 id2 id4 id5 id6 id7 id8 id9
id1 0 0 0 0 0 0 0 0
id2 0 0 0 1 1 1 1 0
id4 0 0 0 0 0 0 0 0
id5 0 1 0 0 1 1 1 0
id6 0 1 0 1 0 1 1 0
id7 0 1 0 1 1 0 1 0
id8 0 1 0 1 1 1 0 0
id9 0 0 0 0 0 0 0 0
$`2010`
id1 id2 id4 id5 id6 id7 id8 id9
id1 0 0 0 0 0 0 0 0
id2 0 0 0 0 0 0 0 0
id4 0 0 0 1 0 0 1 1
id5 0 0 1 0 0 0 1 1
id6 0 0 0 0 0 0 0 0
id7 0 0 0 0 0 0 0 0
id8 0 0 1 1 0 0 0 1
id9 0 0 1 1 0 0 1 0
您也可以将 dplyr 与 tidyr 一起使用:
library(dplyr)
library(tidyr)

# Builds, for every year, the person-by-person count of shared projects and
# returns all years stacked in one table: columns `year`, `id`, then one
# count column per person.
df %>%
  distinct() %>%                 # repeated (project, year, id) rows count once
  mutate(val = 1) %>%
  spread(id, val, fill = 0) %>%  # one row per project, one 0/1 column per person
  select(-project) %>%
  group_by(year) %>%
  do({
    mat <- as.matrix(select(., -year))
    # t(mat) %*% mat: number of projects each pair of people has in common.
    cp <- crossprod(mat)
    diag(cp) <- 0  # ignore self-pairs
    # Base-R replacement for the deprecated dplyr::add_rownames(): move the
    # row names into an explicit `id` column placed first.
    data.frame(
      id = rownames(cp), cp,
      check.names = FALSE, row.names = NULL, stringsAsFactors = FALSE
    )
  }) %>%
  ungroup()