R:所有列组合的频率
R: Frequency of all column combinations
问题描述
我有一个大小相等的字符串列表,如下所示:
example.list <- c('BBCD','ABBC','ADDB','ACBB')
然后我想获取特定字母在特定位置出现的频率。
首先,我将其转换为矩阵:
A1 B1 C1 D1 A2 B2 C2 D2 A3 B3 C3 D3 A4 B4 C4 D4
[1,] 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1
[2,] 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
[3,] 1 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0
[4,] 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0
[5,] 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1
现在我想获得每个列组合的频率。一些例子:
A1 : B2 = 2
A1 : B3 = 3
B1 : B2 = 1
.. etc
这应该会为您提供每个 column_A x column_B 组合的表格列表(A 和 B 为字符位置编号,取值从 1 到 nchar(your_Strings[1]),即字符串的长度)
# Build one cross-tabulation per ordered pair of string positions.
# `results[["i_j"]]` counts how often each letter at position i co-occurs
# with each letter at position j across `example.list`.
# Note: no `rm(list = ls())` here -- wiping the caller's workspace is a
# destructive side effect that this computation does not need.
example.list <- c("ABCD", "ABBC", "ADDB", "ACBB", "BCBB", "BASD")

# All strings must have the same length; otherwise the characters would be
# misaligned across the rows of the matrix built below.
stopifnot(length(unique(nchar(example.list))) == 1L)

# One row per string, one column per character position.
example.matrix <- matrix(
  unlist(strsplit(example.list, "")),
  nrow = length(example.list),
  ncol = nchar(example.list[1]),
  byrow = TRUE
)

# Quick look at a single pair: position 1 vs. position 2.
table(example.matrix[, 1], example.matrix[, 2])

results <- list()
# seq_len() (not 1:n) so a zero-column matrix yields no iterations.
for (i in seq_len(NCOL(example.matrix))) {
  for (j in seq_len(NCOL(example.matrix))) {
    temp <- as.matrix(table(example.matrix[, i], example.matrix[, j]))
    # Prefix each letter with its position so every table is self-describing.
    rownames(temp) <- paste0("pos_", i, "_", rownames(temp))
    colnames(temp) <- paste0("pos_", j, "_", colnames(temp))
    print(temp)
    results[[paste0(i, "_", j)]] <- temp
  }
}
results
是这样的吗?
编辑:
最好使用 Ryan 的解决方案。优雅多了。
假设你的矩阵被命名为 mat
# Count, for every unordered pair of columns of the 0/1 matrix `mat`,
# the number of rows in which both columns equal 1.
# Requires `mat` to have column names; uses the gtools package.

# Names of the columns that are set (== 1) in each row.
# seq_len() instead of seq(nrow(mat)) so a zero-row matrix yields no
# iterations rather than looping over c(1, 0).
present <- lapply(seq_len(nrow(mat)), function(i) names(which(mat[i, ] == 1)))

# All unordered pairs of column names, one pair per row.
all.pairs <- gtools::combinations(n = ncol(mat), r = 2, colnames(mat))

# For each pair, count the rows in which both names are present.
count <- apply(all.pairs, 1, function(x) {
  # One logical vector (over the rows of `mat`) per member of the pair;
  # vapply() pins the result type, unlike sapply().
  there <- lapply(x, function(y) {
    vapply(present, function(vars) y %in% vars, logical(1))
  })
  sum(Reduce(`&`, there))
})

# Keep only pairs that occur at least once. drop = FALSE keeps the result a
# matrix even when exactly one pair qualifies.
cbind(all.pairs, count)[count > 0, , drop = FALSE]
# count
# [1,] "A1" "B2" "2"
# [2,] "A1" "B3" "3"
# [3,] "A1" "B4" "2"
# [4,] "A1" "C2" "1"
# [5,] "A1" "C4" "1"
# [6,] "A1" "D2" "1"
# [7,] "A1" "D3" "1"
# [8,] "A1" "D4" "1"
# [9,] "B1" "B2" "1"
# [10,] "B1" "C3" "1"
# [11,] "B1" "D4" "1"
# [12,] "B2" "B3" "2"
# [13,] "B2" "C3" "1"
# [14,] "B2" "C4" "1"
# [15,] "B2" "D4" "2"
# [16,] "B3" "B4" "1"
# [17,] "B3" "C2" "1"
# [18,] "B3" "C4" "1"
# [19,] "B3" "D4" "1"
# [20,] "B4" "C2" "1"
# [21,] "B4" "D2" "1"
# [22,] "B4" "D3" "1"
# [23,] "C3" "D4" "1"
# [24,] "D2" "D3" "1"
编辑:若要同时包含反向对(例如 A1:B2 和 B2:A1),请按如下方式定义 all.pairs:
# Every ordered pair of column names (the set crossed with itself), so both
# A1:B2 and B2:A1 are listed; pairs of a name with itself are included too.
all.pairs <- expand.grid(rep(list(colnames(mat)), 2L))
将字符串拆分为单个字符向量的列表 s。将 n 设置为它们的共同长度,并从 s 创建一个矩阵 v,其列包含 B1 等元素(字母加上其位置)。然后使用 xtabs 创建计数表 m1,再使用 combn 对每一对列求共现次数,得到 m2。
# Count how often each pair of position-tagged letters (e.g. "A1", "B2")
# occurs together in the same string of `example.list`.

# Split every string into its individual characters.
s <- strsplit(example.list, "")

# Common string length. Guard against unequal lengths: paste0() below would
# otherwise recycle the shorter operand and produce wrong position tags.
n <- lengths(s)[1]
stopifnot(all(lengths(s) == n))

# Character matrix with one column per string; each element is a letter
# followed by its position. vapply() pins the per-string result to length n.
v <- vapply(s, paste0, character(n), 1:n)

# Contingency table: rows are strings, columns are tagged letters.
m1 <- xtabs(~., data.frame(colv = c(col(v)), v = c(v)))

# For each pair of tagged letters, sum over strings the product of the two
# columns, i.e. the number of strings in which both occur.
m2 <- combn(seq_len(ncol(m1)), 2, function(ix) sum(m1[, ix[1]] * m1[, ix[2]]))
names(m2) <- combn(colnames(m1), 2, paste, collapse = "")
结果如下:
> m1
v
colv A1 B1 B2 B3 B4 C2 C3 C4 D2 D3 D4
1 0 1 1 0 0 0 1 0 0 0 1
2 1 0 1 1 0 0 0 1 0 0 0
3 1 0 0 0 1 0 0 0 1 1 0
4 1 0 0 1 1 1 0 0 0 0 0
> m2
A1B1 A1B2 A1B3 A1B4 A1C2 A1C3 A1C4 A1D2 A1D3 A1D4 B1B2 B1B3 B1B4 B1C2 B1C3 B1C4
0 1 2 2 1 0 1 1 1 0 1 0 0 0 1 0
B1D2 B1D3 B1D4 B2B3 B2B4 B2C2 B2C3 B2C4 B2D2 B2D3 B2D4 B3B4 B3C2 B3C3 B3C4 B3D2
0 0 1 1 0 0 1 1 0 0 1 1 1 0 1 0
B3D3 B3D4 B4C2 B4C3 B4C4 B4D2 B4D3 B4D4 C2C3 C2C4 C2D2 C2D3 C2D4 C3C4 C3D2 C3D3
0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0
C3D4 C4D2 C4D3 C4D4 D2D3 D2D4 D3D4
1 0 0 0 1 0 0
问题描述
我有一个大小相等的字符串列表,如下所示:
example.list <- c('BBCD','ABBC','ADDB','ACBB')
然后我想获取特定字母在特定位置出现的频率。 首先,我将其转换为矩阵:
A1 B1 C1 D1 A2 B2 C2 D2 A3 B3 C3 D3 A4 B4 C4 D4
[1,] 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1
[2,] 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
[3,] 1 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0
[4,] 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0
[5,] 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1
现在我想获得每个列组合的频率。一些例子:
A1 : B2 = 2
A1 : B3 = 3
B1 : B2 = 1
.. etc
这应该会为您提供每个 column_A x column_B 组合的表格列表(A 和 B 为字符位置编号,取值从 1 到 nchar(your_Strings[1]),即字符串的长度)
# Build one cross-tabulation per ordered pair of string positions.
# `results[["i_j"]]` counts how often each letter at position i co-occurs
# with each letter at position j across `example.list`.
# Note: no `rm(list = ls())` here -- wiping the caller's workspace is a
# destructive side effect that this computation does not need.
example.list <- c("ABCD", "ABBC", "ADDB", "ACBB", "BCBB", "BASD")

# All strings must have the same length; otherwise the characters would be
# misaligned across the rows of the matrix built below.
stopifnot(length(unique(nchar(example.list))) == 1L)

# One row per string, one column per character position.
example.matrix <- matrix(
  unlist(strsplit(example.list, "")),
  nrow = length(example.list),
  ncol = nchar(example.list[1]),
  byrow = TRUE
)

# Quick look at a single pair: position 1 vs. position 2.
table(example.matrix[, 1], example.matrix[, 2])

results <- list()
# seq_len() (not 1:n) so a zero-column matrix yields no iterations.
for (i in seq_len(NCOL(example.matrix))) {
  for (j in seq_len(NCOL(example.matrix))) {
    temp <- as.matrix(table(example.matrix[, i], example.matrix[, j]))
    # Prefix each letter with its position so every table is self-describing.
    rownames(temp) <- paste0("pos_", i, "_", rownames(temp))
    colnames(temp) <- paste0("pos_", j, "_", colnames(temp))
    print(temp)
    results[[paste0(i, "_", j)]] <- temp
  }
}
results
是这样的吗?
编辑: 最好使用 Ryan 的解决方案。优雅多了。
假设你的矩阵被命名为 mat
# Count, for every unordered pair of columns of the 0/1 matrix `mat`,
# the number of rows in which both columns equal 1.
# Requires `mat` to have column names; uses the gtools package.

# Names of the columns that are set (== 1) in each row.
# seq_len() instead of seq(nrow(mat)) so a zero-row matrix yields no
# iterations rather than looping over c(1, 0).
present <- lapply(seq_len(nrow(mat)), function(i) names(which(mat[i, ] == 1)))

# All unordered pairs of column names, one pair per row.
all.pairs <- gtools::combinations(n = ncol(mat), r = 2, colnames(mat))

# For each pair, count the rows in which both names are present.
count <- apply(all.pairs, 1, function(x) {
  # One logical vector (over the rows of `mat`) per member of the pair;
  # vapply() pins the result type, unlike sapply().
  there <- lapply(x, function(y) {
    vapply(present, function(vars) y %in% vars, logical(1))
  })
  sum(Reduce(`&`, there))
})

# Keep only pairs that occur at least once. drop = FALSE keeps the result a
# matrix even when exactly one pair qualifies.
cbind(all.pairs, count)[count > 0, , drop = FALSE]
# count
# [1,] "A1" "B2" "2"
# [2,] "A1" "B3" "3"
# [3,] "A1" "B4" "2"
# [4,] "A1" "C2" "1"
# [5,] "A1" "C4" "1"
# [6,] "A1" "D2" "1"
# [7,] "A1" "D3" "1"
# [8,] "A1" "D4" "1"
# [9,] "B1" "B2" "1"
# [10,] "B1" "C3" "1"
# [11,] "B1" "D4" "1"
# [12,] "B2" "B3" "2"
# [13,] "B2" "C3" "1"
# [14,] "B2" "C4" "1"
# [15,] "B2" "D4" "2"
# [16,] "B3" "B4" "1"
# [17,] "B3" "C2" "1"
# [18,] "B3" "C4" "1"
# [19,] "B3" "D4" "1"
# [20,] "B4" "C2" "1"
# [21,] "B4" "D2" "1"
# [22,] "B4" "D3" "1"
# [23,] "C3" "D4" "1"
# [24,] "D2" "D3" "1"
编辑:若要同时包含反向对(例如 A1:B2 和 B2:A1),请按如下方式定义 all.pairs:
# Every ordered pair of column names (the set crossed with itself), so both
# A1:B2 and B2:A1 are listed; pairs of a name with itself are included too.
all.pairs <- expand.grid(rep(list(colnames(mat)), 2L))
将字符串拆分为单个字符向量的列表 s。将 n 设置为它们的共同长度,并从 s 创建一个矩阵 v,其列包含 B1 等元素(字母加上其位置)。然后使用 xtabs 创建计数表 m1,再使用 combn 对每一对列求共现次数,得到 m2。
# Count how often each pair of position-tagged letters (e.g. "A1", "B2")
# occurs together in the same string of `example.list`.

# Split every string into its individual characters.
s <- strsplit(example.list, "")

# Common string length. Guard against unequal lengths: paste0() below would
# otherwise recycle the shorter operand and produce wrong position tags.
n <- lengths(s)[1]
stopifnot(all(lengths(s) == n))

# Character matrix with one column per string; each element is a letter
# followed by its position. vapply() pins the per-string result to length n.
v <- vapply(s, paste0, character(n), 1:n)

# Contingency table: rows are strings, columns are tagged letters.
m1 <- xtabs(~., data.frame(colv = c(col(v)), v = c(v)))

# For each pair of tagged letters, sum over strings the product of the two
# columns, i.e. the number of strings in which both occur.
m2 <- combn(seq_len(ncol(m1)), 2, function(ix) sum(m1[, ix[1]] * m1[, ix[2]]))
names(m2) <- combn(colnames(m1), 2, paste, collapse = "")
结果如下:
> m1
v
colv A1 B1 B2 B3 B4 C2 C3 C4 D2 D3 D4
1 0 1 1 0 0 0 1 0 0 0 1
2 1 0 1 1 0 0 0 1 0 0 0
3 1 0 0 0 1 0 0 0 1 1 0
4 1 0 0 1 1 1 0 0 0 0 0
> m2
A1B1 A1B2 A1B3 A1B4 A1C2 A1C3 A1C4 A1D2 A1D3 A1D4 B1B2 B1B3 B1B4 B1C2 B1C3 B1C4
0 1 2 2 1 0 1 1 1 0 1 0 0 0 1 0
B1D2 B1D3 B1D4 B2B3 B2B4 B2C2 B2C3 B2C4 B2D2 B2D3 B2D4 B3B4 B3C2 B3C3 B3C4 B3D2
0 0 1 1 0 0 1 1 0 0 1 1 1 0 1 0
B3D3 B3D4 B4C2 B4C3 B4C4 B4D2 B4D3 B4D4 C2C3 C2C4 C2D2 C2D3 C2D4 C3C4 C3D2 C3D3
0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0
C3D4 C4D2 C4D3 C4D4 D2D3 D2D4 D3D4
1 0 0 0 1 0 0