成对计算所有行之间给定字符的位置匹配数的快速方法
Quick way to count number of position match of a given character between all rows pairwise
我有一个矩阵,我想确定每个字符在所有成对之间出现在相同位置的次数。
下面是我正在做的方法的一个例子,但是我的矩阵有 10,000 行,而且它花费的时间太长了。
# This code will generate a dataframe with one row for each pair and columns that
# count the number of position match each letter have
my_letters <- c("A", "B", "C", "D")
size_vector <- 175
n_vectors <- 10
indexes_vectors <- seq_len(n_vectors)
mtx <- sapply(indexes_vectors,
function(i) sample(my_letters, n_vectors, replace = TRUE))
rownames(mtx) <- indexes_vectors
df <- as.data.frame(t(combn(indexes_vectors, m = 2)))
colnames(df) <- c("index_1", "index_2")
for(l in my_letters){
cat(l, "\n")
df[,l] <- apply(df[,1:2], 1,
function(ids) {
sum(mtx[ids[1],] == mtx[ids[2],] &
mtx[ids[1],] == l, na.rm = TRUE)
})
}
我不知道这是否会表现良好,但这是一个选择:
library(data.table)
matchDT = setDT(melt(mtx))[,
CJ(row1 = Var1, row2 = Var1)[row1 < row2], by=.(value, col = Var2)]
]
dcast(matchDT, row1 + row2 ~ value)
这排除了没有匹配项的行组合。为了让他们回来,也许...
levs = seq_len(nrow(mtx))
dcast(matchDT, factor(row1, levels=levs) + factor(row2, levels = levs) ~ value, drop = FALSE)[as.integer(row1) < as.integer(row2)]
Aggregate function missing, defaulting to 'length'
row1 row2 A B C D
1: 1 2 1 0 2 0
2: 1 3 1 0 1 1
3: 1 4 1 1 0 1
4: 1 5 0 1 1 0
5: 1 6 1 0 1 1
6: 1 7 0 0 1 0
7: 1 8 0 2 1 0
8: 1 9 1 2 2 1
9: 1 10 0 1 1 0
10: 2 3 2 0 0 0
11: 2 4 1 0 1 0
12: 2 5 0 1 1 0
13: 2 6 1 0 1 1
14: 2 7 0 0 1 0
15: 2 8 2 0 1 0
16: 2 9 1 0 1 0
17: 2 10 1 0 1 0
18: 3 4 0 0 0 2
19: 3 5 0 0 0 0
20: 3 6 1 0 0 2
21: 3 7 1 1 1 0
22: 3 8 1 0 0 1
23: 3 9 1 1 0 0
24: 3 10 1 0 1 0
25: 4 5 0 2 1 0
26: 4 6 0 1 0 2
27: 4 7 0 0 0 0
28: 4 8 1 1 0 2
29: 4 9 0 2 0 0
30: 4 10 0 2 1 0
31: 5 6 0 1 1 0
32: 5 7 0 2 1 0
33: 5 8 0 1 0 1
34: 5 9 0 1 1 0
35: 5 10 0 2 1 1
36: 6 7 0 1 2 1
37: 6 8 0 0 0 1
38: 6 9 1 1 1 0
39: 6 10 0 1 0 0
40: 7 8 0 0 1 0
41: 7 9 0 0 1 0
42: 7 10 0 1 2 0
43: 8 9 1 2 1 0
44: 8 10 1 1 1 1
45: 9 10 0 2 1 0
row1 row2 A B C D
基于 R 的可能解决方案:
l1 <- lapply(split(df, 1:nrow(df)), as.integer)
l2 <- lapply(l1, function(x) {
m <- mtx[x[1],] == mtx[x[2],]
l <- lapply(my_letters, '==', mtx[x[1],])
sapply(l, function(i) sum(i & m))
})
cbind(df, setNames(do.call(rbind.data.frame, l2), my_letters))
给出:
index_1 index_2 A B C D
1 1 2 0 0 0 0
2 1 3 0 0 2 1
3 1 4 0 0 0 1
4 1 5 0 1 2 0
5 1 6 0 0 3 1
6 1 7 0 1 1 3
7 1 8 0 1 2 2
8 1 9 0 0 2 1
9 1 10 0 0 2 0
10 2 3 0 1 0 1
11 2 4 0 1 0 2
12 2 5 0 1 0 0
13 2 6 0 0 0 2
14 2 7 0 1 0 1
15 2 8 1 0 0 0
16 2 9 0 1 0 2
17 2 10 2 1 0 3
18 3 4 0 0 1 0
19 3 5 0 0 1 1
20 3 6 0 0 1 1
21 3 7 0 1 1 2
22 3 8 0 0 0 1
23 3 9 1 0 0 0
24 3 10 0 0 0 1
25 4 5 0 2 1 0
26 4 6 0 0 1 1
27 4 7 1 1 0 1
28 4 8 1 1 1 1
29 4 9 0 1 1 2
30 4 10 0 1 0 2
31 5 6 0 1 2 0
32 5 7 0 1 1 0
33 5 8 0 2 1 0
34 5 9 0 1 2 0
35 5 10 0 2 1 0
36 6 7 1 0 1 1
37 6 8 0 0 3 1
38 6 9 0 1 2 0
39 6 10 0 0 1 1
40 7 8 0 1 0 2
41 7 9 0 1 0 1
42 7 10 0 0 0 1
43 8 9 0 0 2 1
44 8 10 1 1 1 0
45 9 10 0 0 2 1
m1 <- t(sapply(1:nrow(df), function(i)
table(factor(mtx[df[i,1],][mtx[df[i,1],] == mtx[df[i,2],]],
levels = my_letters))))
cbind(df, m1)
> V1 V2 A B C D
1 1 2 0 0 1 1
2 1 3 1 0 1 1
3 1 4 1 0 2 1
4 1 5 0 0 1 0
5 1 6 2 0 2 0
6 1 7 0 0 1 0
7 1 8 1 0 1 1
8 1 9 0 0 1 0
9 1 10 1 0 1 1
10 2 3 0 0 1 1
11 2 4 1 1 1 2
12 2 5 0 0 0 1
13 2 6 1 0 2 1
14 2 7 1 0 0 1
15 2 8 1 0 0 0
16 2 9 2 0 0 0
17 2 10 1 0 1 0
18 3 4 0 0 0 0
19 3 5 0 2 1 0
20 3 6 1 1 2 1
21 3 7 0 1 0 0
22 3 8 1 1 0 0
23 3 9 0 1 2 0
24 3 10 0 0 1 0
25 4 5 1 1 0 1
26 4 6 2 1 1 0
27 4 7 1 0 1 1
28 4 8 0 1 0 0
29 4 9 1 0 0 0
30 4 10 2 0 0 0
31 5 6 0 2 0 0
32 5 7 0 1 3 1
33 5 8 0 1 2 0
34 5 9 1 0 2 0
35 5 10 0 0 2 0
36 6 7 0 0 0 0
37 6 8 1 1 0 0
38 6 9 0 0 1 0
39 6 10 3 0 1 0
40 7 8 0 1 1 0
41 7 9 1 0 1 0
42 7 10 0 0 1 0
43 8 9 1 1 1 1
44 8 10 0 0 1 0
45 9 10 0 0 0 0
我有一个矩阵,我想确定每个字符在所有成对之间出现在相同位置的次数。
下面是我正在做的方法的一个例子,但是我的矩阵有 10,000 行,而且它花费的时间太长了。
# This code will generate a dataframe with one row for each pair and columns that
# count the number of position match each letter have
my_letters <- c("A", "B", "C", "D")
size_vector <- 175
n_vectors <- 10
indexes_vectors <- seq_len(n_vectors)
mtx <- sapply(indexes_vectors,
function(i) sample(my_letters, n_vectors, replace = TRUE))
rownames(mtx) <- indexes_vectors
df <- as.data.frame(t(combn(indexes_vectors, m = 2)))
colnames(df) <- c("index_1", "index_2")
for(l in my_letters){
cat(l, "\n")
df[,l] <- apply(df[,1:2], 1,
function(ids) {
sum(mtx[ids[1],] == mtx[ids[2],] &
mtx[ids[1],] == l, na.rm = TRUE)
})
}
我不知道这是否会表现良好,但这是一个选择:
library(data.table)
matchDT = setDT(melt(mtx))[,
CJ(row1 = Var1, row2 = Var1)[row1 < row2], by=.(value, col = Var2)]
]
dcast(matchDT, row1 + row2 ~ value)
这排除了没有匹配项的行组合。为了让他们回来,也许...
levs = seq_len(nrow(mtx))
dcast(matchDT, factor(row1, levels=levs) + factor(row2, levels = levs) ~ value, drop = FALSE)[as.integer(row1) < as.integer(row2)]
Aggregate function missing, defaulting to 'length'
row1 row2 A B C D
1: 1 2 1 0 2 0
2: 1 3 1 0 1 1
3: 1 4 1 1 0 1
4: 1 5 0 1 1 0
5: 1 6 1 0 1 1
6: 1 7 0 0 1 0
7: 1 8 0 2 1 0
8: 1 9 1 2 2 1
9: 1 10 0 1 1 0
10: 2 3 2 0 0 0
11: 2 4 1 0 1 0
12: 2 5 0 1 1 0
13: 2 6 1 0 1 1
14: 2 7 0 0 1 0
15: 2 8 2 0 1 0
16: 2 9 1 0 1 0
17: 2 10 1 0 1 0
18: 3 4 0 0 0 2
19: 3 5 0 0 0 0
20: 3 6 1 0 0 2
21: 3 7 1 1 1 0
22: 3 8 1 0 0 1
23: 3 9 1 1 0 0
24: 3 10 1 0 1 0
25: 4 5 0 2 1 0
26: 4 6 0 1 0 2
27: 4 7 0 0 0 0
28: 4 8 1 1 0 2
29: 4 9 0 2 0 0
30: 4 10 0 2 1 0
31: 5 6 0 1 1 0
32: 5 7 0 2 1 0
33: 5 8 0 1 0 1
34: 5 9 0 1 1 0
35: 5 10 0 2 1 1
36: 6 7 0 1 2 1
37: 6 8 0 0 0 1
38: 6 9 1 1 1 0
39: 6 10 0 1 0 0
40: 7 8 0 0 1 0
41: 7 9 0 0 1 0
42: 7 10 0 1 2 0
43: 8 9 1 2 1 0
44: 8 10 1 1 1 1
45: 9 10 0 2 1 0
row1 row2 A B C D
基于 R 的可能解决方案:
l1 <- lapply(split(df, 1:nrow(df)), as.integer)
l2 <- lapply(l1, function(x) {
m <- mtx[x[1],] == mtx[x[2],]
l <- lapply(my_letters, '==', mtx[x[1],])
sapply(l, function(i) sum(i & m))
})
cbind(df, setNames(do.call(rbind.data.frame, l2), my_letters))
给出:
index_1 index_2 A B C D 1 1 2 0 0 0 0 2 1 3 0 0 2 1 3 1 4 0 0 0 1 4 1 5 0 1 2 0 5 1 6 0 0 3 1 6 1 7 0 1 1 3 7 1 8 0 1 2 2 8 1 9 0 0 2 1 9 1 10 0 0 2 0 10 2 3 0 1 0 1 11 2 4 0 1 0 2 12 2 5 0 1 0 0 13 2 6 0 0 0 2 14 2 7 0 1 0 1 15 2 8 1 0 0 0 16 2 9 0 1 0 2 17 2 10 2 1 0 3 18 3 4 0 0 1 0 19 3 5 0 0 1 1 20 3 6 0 0 1 1 21 3 7 0 1 1 2 22 3 8 0 0 0 1 23 3 9 1 0 0 0 24 3 10 0 0 0 1 25 4 5 0 2 1 0 26 4 6 0 0 1 1 27 4 7 1 1 0 1 28 4 8 1 1 1 1 29 4 9 0 1 1 2 30 4 10 0 1 0 2 31 5 6 0 1 2 0 32 5 7 0 1 1 0 33 5 8 0 2 1 0 34 5 9 0 1 2 0 35 5 10 0 2 1 0 36 6 7 1 0 1 1 37 6 8 0 0 3 1 38 6 9 0 1 2 0 39 6 10 0 0 1 1 40 7 8 0 1 0 2 41 7 9 0 1 0 1 42 7 10 0 0 0 1 43 8 9 0 0 2 1 44 8 10 1 1 1 0 45 9 10 0 0 2 1
m1 <- t(sapply(1:nrow(df), function(i)
table(factor(mtx[df[i,1],][mtx[df[i,1],] == mtx[df[i,2],]],
levels = my_letters))))
cbind(df, m1)
> V1 V2 A B C D 1 1 2 0 0 1 1 2 1 3 1 0 1 1 3 1 4 1 0 2 1 4 1 5 0 0 1 0 5 1 6 2 0 2 0 6 1 7 0 0 1 0 7 1 8 1 0 1 1 8 1 9 0 0 1 0 9 1 10 1 0 1 1 10 2 3 0 0 1 1 11 2 4 1 1 1 2 12 2 5 0 0 0 1 13 2 6 1 0 2 1 14 2 7 1 0 0 1 15 2 8 1 0 0 0 16 2 9 2 0 0 0 17 2 10 1 0 1 0 18 3 4 0 0 0 0 19 3 5 0 2 1 0 20 3 6 1 1 2 1 21 3 7 0 1 0 0 22 3 8 1 1 0 0 23 3 9 0 1 2 0 24 3 10 0 0 1 0 25 4 5 1 1 0 1 26 4 6 2 1 1 0 27 4 7 1 0 1 1 28 4 8 0 1 0 0 29 4 9 1 0 0 0 30 4 10 2 0 0 0 31 5 6 0 2 0 0 32 5 7 0 1 3 1 33 5 8 0 1 2 0 34 5 9 1 0 2 0 35 5 10 0 0 2 0 36 6 7 0 0 0 0 37 6 8 1 1 0 0 38 6 9 0 0 1 0 39 6 10 3 0 1 0 40 7 8 0 1 1 0 41 7 9 1 0 1 0 42 7 10 0 0 1 0 43 8 9 1 1 1 1 44 8 10 0 0 1 0 45 9 10 0 0 0 0