为网络创建邻接矩阵的优化函数
Optimising function for creating adjacency matrices for networks
我正在尝试通过查看特定受试者的氨基酸库来在 R 中进行基于网络的分析。
一个氨基酸和另一个氨基酸之间的联系基于任何具有编辑距离 1 的对。
问题是,我创建的用于输出我需要用于网络图的邻接矩阵的函数非常慢,我想要一些关于如何利用 Rs 矢量化功能来执行这样的操作的建议操作,或以其他方式。
我在论坛上阅读了很多关于 R 中 for 循环有多慢的帖子,但是,出于分析的目的,我只是找不到其他方法来做到这一点。
这是一个公开可用的数据集的片段,类似于我正在分析的数据集:
structure(list(Gene = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("TRA", "TRB"), class = "factor"),
aminos = c("CASSSSMESGNTIYF", "CASSGPGGGAFF", "CASSDSLVRGYQETQYF",
"CASSLVENTEAFF", "CASSLQEWDPNYGYTF", "CASSLVENTEAFF", "CASSQEGGTEQFF",
"CASSYLGDIQFNQPQHF", "CASSPRTSGGYQEPQYF", "CASSPRTSGGYQETQYF",
"CASQHGPGIGTGELFF", "CASSLPDRAGEKLFF", "CASSSGQGNIQYF", "CASSYSVKGLNTEAFF",
"CASSWRQGATNYGYTF", "CASSDDVGRLAYEQYF", "CASSEIGRSTGELFF",
"CASSFGRQAYEQYF", "CASSAGQGGEHQPQHF", "CASSRSDREMFNYGYTF",
"CASSLFSQGWTEAFF", "CASSLYIQGGEQYF", "CASSFGRQAYEQYF", "CASSLENGQYEQYF",
"CASSLDKPPPDTGELFF", "CASNQGTATEAFF", "CASSLLLAGGYQETQYF",
"CASSYSVKGLNTEAFF", "CASSFEIAGGNEQFF", "CASSHSAGVFMNTEAFF",
"CASSLARQEETQYF", "CSATGGRHTGELFF", "CSATRSSGEPEQFF", "CASSQEVAAGGGDTQYF",
"CASSLPDRAGEKLFF", "CASSQEMSTGLGEQYF", "CASSQEGSGAPYEQYF",
"CASSQEPGAPNTGELFF", "CASSLTVSLSPDLNEQFF", "CASSQDPLAGYTGELFF",
"CASSQEPSGGTNTGELFF", "CASSLETGKWGEQYF", "CASSQEGQGAPYEQYF",
"CSAGESTPEAFF", "CASSQEASGGPYEQYF", "CASRETGGVWETQYF", "CASSLEGNGHREQYF",
"CASSLEGTSGSPDLNEQFF", "CASSLTVSLSPDLNEQFF", "CASSQDPLAGYTGELFF",
"CASSQGGDTEAFF", "CASSDLGQGRMNTEAFF", "CASSQEVGTSGEGEQFF",
"CASSQEVGQRLLNTGELFF", "CASSQEQGGWGEQYF", "CAVEDTGGFKTIF",
"CAASARGQAGTALIF", "CAMREHTSGTYKYIF", "CAENGGNTPLVF", "CAFMITGAGSYQLTF",
"CALSVVNQAGTALIF", "CAETGGFKTIF", "CAFMKLESYMDSNYQLIW", "CALSESANSGGYQKVTF",
"CALSESANSGGYQKVTF", "CASFPTTSGTYKYIF", "CAVDLTGAGSYQLTF",
"CAVEPNSGYALNF", "CAVEPPDGQKLLF", "CAVEPPSGSRLTF", "CAVERSDGQKLLF",
"CAVGAGPSGTYKYIF", "CAVQANTNAGKSTF", "CAVSNFMNSGYSTLTF",
"CAYRGFYGGATNKLIF", "CAYRSLALIQGAQKLVF", "CAYRSLDLSGNTPLVF",
"CAYRSLDVSRDDKIIF", "CAYRTLEGTYKYIF", "CAYRTTLSGGGADGLTF",
"CGRTGFQKLVF", "CILSATTSGTYKYIF", "CIVRVPFLYNQGGKLIF", "CLVANGNNRLAF",
"CLVARGGSYIPTF", "CLVASPSGGYNKLIF", "CLVEPPPGNGGFKTIF", "CLVGAPLVF",
"CLVGDGRGGSQGNLIF", "CLVGDGYGNNRLAF", "CLVGDLTNYQLIW", "CLVGDSGDRGSTLGRLYF",
"CLVGDTSSGSARQLTF", "CLVGEAGGFKTIF", "CLVGEAGGFKTIF", "CLVGEGDNYQLIW",
"CLVGEGRGGMDSNYQLIW", "CLVGENNNARLMF", "CLVGETNAGKSTF", "CLVGGNNNDMRF",
"CLVGGTGTASKLTF", "CLVGPGGFGNEKLTF", "CLVGVPAGNMLTF", "CLVGVPGSARQLTF",
"CLVGVPGSARQLTF", "CLVGVPLGGGGNKLTF", "CLVGVPNDYKLSF", "CLVGVYNQGGKLIF",
"CLVNTNAGKSTF", "CLVTGSARQLTF")), class = "data.frame", row.names = c(NA,
-110L))
这是我创建的函数:
getAdjMat4AAs <- function(x){
SR1 <- x #assignment to input bcause i started this operation on SR1
net_SR1 <- stringdistmatrix(SR1$aminos, SR1$aminos)
colnames(net_SR1) <- SR1$aminos
rownames(net_SR1) <- SR1$aminos
#Must find indexes of those w lev dist == 1 out of this huge matrix. Proceed like this.
##down there changed from nrow(SR1) -> nrow(net_SR1)
idx_loc <- matrix(nrow = 2*nrow(net_SR1), ncol = 2) #dont know exact NROW dim of mat, so chose (2x)
ii <- 1
for(i in 1:nrow(net_SR1)){
for(j in 1:ncol(net_SR1)){
idx <- which(net_SR1[i,j] == 1)
if(length(idx) == 0){
next
}else{
#idx_loc[[i]] <- paste(i,j, sep = ",")
idx_loc[ii,c(1,2)] <- c(i,j)
ii <- ii+1
}
}
}
idx_loc <- idx_loc[complete.cases(idx_loc),] #remove NAs from surplus nrow assignment matrix
#Also, use unique(AAs) for this calculation, will use rowsums() or colsums() for making centres?
AAs_col <- colnames(net_SR1)[idx_loc[,2]]
AAs_row <- rownames(net_SR1)[idx_loc[,1]]
AAs_colUnq <- AAs_col %>% unique()
AAs_rowUnq <- AAs_row %>% unique()
adjMat_SR1 <- matrix(nrow = length(AAs_colUnq), ncol = length(AAs_colUnq))
#should have the same order of AAs in rows and col for adjacency matrix.. proceed as such
colnames(adjMat_SR1) <- AAs_colUnq
rownames(adjMat_SR1) <- AAs_colUnq
for(i in 1:nrow(adjMat_SR1)){
for(j in 1:ncol(adjMat_SR1)){
if(stringdist(rownames(adjMat_SR1)[i], colnames(adjMat_SR1)[j]) == 1){
adjMat_SR1[i,j] = 1
}else{
adjMat_SR1[i,j] = 0
}
}
}
return(adjMat_SR1)
}
你是否应该 运行 数据集上的函数,前提是它不会很慢,但是,一旦我们进入数千个,它就会变得非常慢。
任何关于优化此过程的建议,甚至是关于我用于网络分析的实际方法的任何建议,我们将不胜感激。
您可以通过以下简单操作来获得预期的邻接矩阵(您可以轻松地将其包装在一个函数中)。 SR1
是您提供的数据。
# define a Levenshtein distance matrix with all the aminos
levenshtein.dist.mat <- stringdist::stringdistmatrix(unique(SR1$aminos),
unique(SR1$aminos),
useNames = "strings") # I think you should add method = "lv", right ?
# in row are the aminos with a Levenshtein distance of 1 to at least one another amino
levenshtein.dist.mat <- levenshtein.dist.mat[rowSums(sapply(as.data.frame(levenshtein.dist.mat), '==', 1)) > 0, ]
# we can filter the relevant columns
levenshtein.dist.mat <- levenshtein.dist.mat[, colnames(levenshtein.dist.mat) %in% rownames(levenshtein.dist.mat)]
# values not equal to 1 do not represent a connection. Let's set them to zero
levenshtein.dist.mat[levenshtein.dist.mat != 1] <- 0
# output
levenshtein.dist.mat
CASSPRTSGGYQEPQYF CASSPRTSGGYQETQYF CASSQEGSGAPYEQYF CASSQEGQGAPYEQYF
CASSPRTSGGYQEPQYF 0 1 0 0
CASSPRTSGGYQETQYF 1 0 0 0
CASSQEGSGAPYEQYF 0 0 0 1
CASSQEGQGAPYEQYF 0 0 1 0
我正在尝试通过查看特定受试者的氨基酸库来在 R 中进行基于网络的分析。
一个氨基酸和另一个氨基酸之间的联系基于任何具有编辑距离 1 的对。
问题是,我创建的用于输出我需要用于网络图的邻接矩阵的函数非常慢,我想要一些关于如何利用 Rs 矢量化功能来执行这样的操作的建议操作,或以其他方式。
我在论坛上阅读了很多关于 R 中 for 循环有多慢的帖子,但是,出于分析的目的,我只是找不到其他方法来做到这一点。
这是一个公开可用的数据集的片段,类似于我正在分析的数据集:
structure(list(Gene = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("TRA", "TRB"), class = "factor"),
aminos = c("CASSSSMESGNTIYF", "CASSGPGGGAFF", "CASSDSLVRGYQETQYF",
"CASSLVENTEAFF", "CASSLQEWDPNYGYTF", "CASSLVENTEAFF", "CASSQEGGTEQFF",
"CASSYLGDIQFNQPQHF", "CASSPRTSGGYQEPQYF", "CASSPRTSGGYQETQYF",
"CASQHGPGIGTGELFF", "CASSLPDRAGEKLFF", "CASSSGQGNIQYF", "CASSYSVKGLNTEAFF",
"CASSWRQGATNYGYTF", "CASSDDVGRLAYEQYF", "CASSEIGRSTGELFF",
"CASSFGRQAYEQYF", "CASSAGQGGEHQPQHF", "CASSRSDREMFNYGYTF",
"CASSLFSQGWTEAFF", "CASSLYIQGGEQYF", "CASSFGRQAYEQYF", "CASSLENGQYEQYF",
"CASSLDKPPPDTGELFF", "CASNQGTATEAFF", "CASSLLLAGGYQETQYF",
"CASSYSVKGLNTEAFF", "CASSFEIAGGNEQFF", "CASSHSAGVFMNTEAFF",
"CASSLARQEETQYF", "CSATGGRHTGELFF", "CSATRSSGEPEQFF", "CASSQEVAAGGGDTQYF",
"CASSLPDRAGEKLFF", "CASSQEMSTGLGEQYF", "CASSQEGSGAPYEQYF",
"CASSQEPGAPNTGELFF", "CASSLTVSLSPDLNEQFF", "CASSQDPLAGYTGELFF",
"CASSQEPSGGTNTGELFF", "CASSLETGKWGEQYF", "CASSQEGQGAPYEQYF",
"CSAGESTPEAFF", "CASSQEASGGPYEQYF", "CASRETGGVWETQYF", "CASSLEGNGHREQYF",
"CASSLEGTSGSPDLNEQFF", "CASSLTVSLSPDLNEQFF", "CASSQDPLAGYTGELFF",
"CASSQGGDTEAFF", "CASSDLGQGRMNTEAFF", "CASSQEVGTSGEGEQFF",
"CASSQEVGQRLLNTGELFF", "CASSQEQGGWGEQYF", "CAVEDTGGFKTIF",
"CAASARGQAGTALIF", "CAMREHTSGTYKYIF", "CAENGGNTPLVF", "CAFMITGAGSYQLTF",
"CALSVVNQAGTALIF", "CAETGGFKTIF", "CAFMKLESYMDSNYQLIW", "CALSESANSGGYQKVTF",
"CALSESANSGGYQKVTF", "CASFPTTSGTYKYIF", "CAVDLTGAGSYQLTF",
"CAVEPNSGYALNF", "CAVEPPDGQKLLF", "CAVEPPSGSRLTF", "CAVERSDGQKLLF",
"CAVGAGPSGTYKYIF", "CAVQANTNAGKSTF", "CAVSNFMNSGYSTLTF",
"CAYRGFYGGATNKLIF", "CAYRSLALIQGAQKLVF", "CAYRSLDLSGNTPLVF",
"CAYRSLDVSRDDKIIF", "CAYRTLEGTYKYIF", "CAYRTTLSGGGADGLTF",
"CGRTGFQKLVF", "CILSATTSGTYKYIF", "CIVRVPFLYNQGGKLIF", "CLVANGNNRLAF",
"CLVARGGSYIPTF", "CLVASPSGGYNKLIF", "CLVEPPPGNGGFKTIF", "CLVGAPLVF",
"CLVGDGRGGSQGNLIF", "CLVGDGYGNNRLAF", "CLVGDLTNYQLIW", "CLVGDSGDRGSTLGRLYF",
"CLVGDTSSGSARQLTF", "CLVGEAGGFKTIF", "CLVGEAGGFKTIF", "CLVGEGDNYQLIW",
"CLVGEGRGGMDSNYQLIW", "CLVGENNNARLMF", "CLVGETNAGKSTF", "CLVGGNNNDMRF",
"CLVGGTGTASKLTF", "CLVGPGGFGNEKLTF", "CLVGVPAGNMLTF", "CLVGVPGSARQLTF",
"CLVGVPGSARQLTF", "CLVGVPLGGGGNKLTF", "CLVGVPNDYKLSF", "CLVGVYNQGGKLIF",
"CLVNTNAGKSTF", "CLVTGSARQLTF")), class = "data.frame", row.names = c(NA,
-110L))
这是我创建的函数:
getAdjMat4AAs <- function(x){
SR1 <- x #assignment to input bcause i started this operation on SR1
net_SR1 <- stringdistmatrix(SR1$aminos, SR1$aminos)
colnames(net_SR1) <- SR1$aminos
rownames(net_SR1) <- SR1$aminos
#Must find indexes of those w lev dist == 1 out of this huge matrix. Proceed like this.
##down there changed from nrow(SR1) -> nrow(net_SR1)
idx_loc <- matrix(nrow = 2*nrow(net_SR1), ncol = 2) #dont know exact NROW dim of mat, so chose (2x)
ii <- 1
for(i in 1:nrow(net_SR1)){
for(j in 1:ncol(net_SR1)){
idx <- which(net_SR1[i,j] == 1)
if(length(idx) == 0){
next
}else{
#idx_loc[[i]] <- paste(i,j, sep = ",")
idx_loc[ii,c(1,2)] <- c(i,j)
ii <- ii+1
}
}
}
idx_loc <- idx_loc[complete.cases(idx_loc),] #remove NAs from surplus nrow assignment matrix
#Also, use unique(AAs) for this calculation, will use rowsums() or colsums() for making centres?
AAs_col <- colnames(net_SR1)[idx_loc[,2]]
AAs_row <- rownames(net_SR1)[idx_loc[,1]]
AAs_colUnq <- AAs_col %>% unique()
AAs_rowUnq <- AAs_row %>% unique()
adjMat_SR1 <- matrix(nrow = length(AAs_colUnq), ncol = length(AAs_colUnq))
#should have the same order of AAs in rows and col for adjacency matrix.. proceed as such
colnames(adjMat_SR1) <- AAs_colUnq
rownames(adjMat_SR1) <- AAs_colUnq
for(i in 1:nrow(adjMat_SR1)){
for(j in 1:ncol(adjMat_SR1)){
if(stringdist(rownames(adjMat_SR1)[i], colnames(adjMat_SR1)[j]) == 1){
adjMat_SR1[i,j] = 1
}else{
adjMat_SR1[i,j] = 0
}
}
}
return(adjMat_SR1)
}
你是否应该 运行 数据集上的函数,前提是它不会很慢,但是,一旦我们进入数千个,它就会变得非常慢。
任何关于优化此过程的建议,甚至是关于我用于网络分析的实际方法的任何建议,我们将不胜感激。
您可以通过以下简单操作来获得预期的邻接矩阵(您可以轻松地将其包装在一个函数中)。 SR1
是您提供的数据。
# define a Levenshtein distance matrix with all the aminos
levenshtein.dist.mat <- stringdist::stringdistmatrix(unique(SR1$aminos),
unique(SR1$aminos),
useNames = "strings") # I think you should add method = "lv", right ?
# in row are the aminos with a Levenshtein distance of 1 to at least one another amino
levenshtein.dist.mat <- levenshtein.dist.mat[rowSums(sapply(as.data.frame(levenshtein.dist.mat), '==', 1)) > 0, ]
# we can filter the relevant columns
levenshtein.dist.mat <- levenshtein.dist.mat[, colnames(levenshtein.dist.mat) %in% rownames(levenshtein.dist.mat)]
# values not equal to 1 do not represent a connection. Let's set them to zero
levenshtein.dist.mat[levenshtein.dist.mat != 1] <- 0
# output
levenshtein.dist.mat
CASSPRTSGGYQEPQYF CASSPRTSGGYQETQYF CASSQEGSGAPYEQYF CASSQEGQGAPYEQYF
CASSPRTSGGYQEPQYF 0 1 0 0
CASSPRTSGGYQETQYF 1 0 0 0
CASSQEGSGAPYEQYF 0 0 0 1
CASSQEGQGAPYEQYF 0 0 1 0