reshape/melt 根据rowKey的非对称矩阵
reshape/melt an asymmetric matrix according to a rowKey
我想 reshape/melt 一个非对称矩阵,这样当两个列单元格根据 rowKey 都不为零时,任意两行跨列求和。我尝试了各种选项,但 none 都有效。我正在寻找适用于大型非对称矩阵的通用解决方案。
#Dummy data
set.seed(123)
mat <- matrix(rbinom(20,100,0.01),4,5,dimnames=list(LETTERS[1:4],letters[1:5]))
mat
a b c d e
A 0 3 1 1 0
B 2 0 1 1 0
C 1 1 3 0 0
D 2 2 1 2 3
rowKey <- c("A"="N1","B"="N1","C"="N2","D"="N2")
#Desired output
N1 N2 N3 value
1 A C a 0
2 A C b 4
3 A C c 4
4 A C d 0
5 A C e 0
6 B C a 3
7 B C b 0
8 B C c 4
9 B C d 0
10 B C e 0
11 A D a 0
12 A D b 5
13 A D c 2
14 A D d 3
15 A D e 0
16 B D a 4
17 B D b 0
18 B D c 2
19 B D d 3
20 B D e 0
非常感谢任何指点!
temp = expand.grid(c(split(names(rowKey), rowKey), list(N3 = colnames(mat))))
temp2 = sapply(1:nrow(temp), function(i)
mat[row.names(mat) == temp$N1[i] | row.names(mat) == temp$N2[i],
colnames(mat) == temp$N3[i]])
temp$value = colSums(temp2) * (colSums(temp2 > 0) == nrow(temp2))
temp
# N1 N2 N3 value
#1 A C a 0
#2 B C a 3
#3 A D a 0
#4 B D a 4
#5 A C b 4
#6 B C b 0
#7 A D b 5
#8 B D b 0
#9 A C c 4
#10 B C c 4
#11 A D c 2
#12 B D c 2
#13 A C d 0
#14 B C d 0
#15 A D d 3
#16 B D d 3
#17 A C e 0
#18 B C e 0
#19 A D e 0
#20 B D e 0
这是一个更长的数据处理方法:
set.seed(123)
mat <- matrix(rbinom(20,100,0.01),4,5,dimnames=list(LETTERS[1:4],letters[1:5]))
rowKey <- c("A"="N1","B"="N1","C"="N2","D"="N2")
按行键分割矩阵:
> n1 <- names(which(rowKey=="N1"))
> mat[n1,]
a b c d e
A 0 3 1 1 0
B 2 0 1 1 0
> n2 <- names(which(rowKey=="N2"))
> mat[n2,]
a b c d e
C 1 1 3 0 0
D 2 2 1 2 3
然后将数据转换为熔融数据帧。
> library(reshape2)
> mmat1 <- melt(mat[n1,])
> mmat1
Var1 Var2 value
1 A a 0
2 B a 2
3 A b 3
4 B b 0
5 A c 1
6 B c 1
7 A d 1
8 B d 1
9 A e 0
10 B e 0
> mmat2 <- melt(mat[n2,])
> mmat2
Var1 Var2 value
1 C a 1
2 D a 2
3 C b 1
4 D b 2
5 C c 3
6 D c 1
7 C d 0
8 D d 2
9 C e 0
10 D e 3
然后按列名合并数据框,合并时注意列名
> colnames(mmat1) <- c("N1","N3","Val1")
> colnames(mmat2) <- c("N2","N3","Val2")
> mmat12 <- merge(mmat1,mmat2)
现在我们可以用条件
组成最终矩阵
> res <- cbind(mmat12[c('N1','N2','N3')],mmat12['Val1']+mmat12['Val2'])
> res[(mmat12['Val1'] == 0)|(mmat12['Val2'] == 0),4] <- 0
> res[with(res, order(N1,N2,N3)),]
N1 N2 N3 Val1
1 A C a 0
5 A C b 4
9 A C c 4
13 A C d 0
17 A C e 0
2 A D a 0
6 A D b 5
10 A D c 2
14 A D d 3
18 A D e 0
3 B C a 3
7 B C b 0
11 B C c 4
15 B C d 0
19 B C e 0
4 B D a 4
8 B D b 0
12 B D c 2
16 B D d 3
20 B D e 0
这里有一些以 data.frame 为中心的选项,使用 tidyverse 进行操作:
library(tidyverse)
set.seed(123)
mat <- matrix(rbinom(20, 100, 0.01), 4, 5,
dimnames = list(LETTERS[1:4], letters[1:5]))
rowKey <- c("A" = "N1", "B" = "N1", "C" = "N2", "D" = "N2")
output1 <- mat %>%
as.data.frame() %>%
rownames_to_column('N1') %>%
gather(N3, value, -N1) %>% # reshape to long form
crossing(N2 = .$N1) %>% # add combinations of rowname values
filter(N1 != N2, rowKey[N1] != rowKey[N2]) %>% # drop unwanted combinations
mutate(value = na_if(value, 0), # change 0 values to NA so sum will be 0
# sort rowname values to make group ID column for aggregation
id = map2_chr(N1, N2, ~toString(sort(c(.x, .y))))) %>%
group_by(id, N3) %>%
summarise(N1 = min(N1), # get alpabetically first rowname for N1
N2 = max(N2), # and last for N2
value = coalesce(sum(value), 0L)) %>% # sum and replace NAs with 0s again
# clean up
ungroup() %>%
select(N1, N2, N3, value) %>%
arrange(N2)
output1
#> # A tibble: 20 x 4
#> N1 N2 N3 value
#> <chr> <chr> <chr> <int>
#> 1 A C a 0
#> 2 A C b 4
#> 3 A C c 4
#> 4 A C d 0
#> 5 A C e 0
#> 6 B C a 3
#> 7 B C b 0
#> 8 B C c 4
#> 9 B C d 0
#> 10 B C e 0
#> 11 A D a 0
#> 12 A D b 5
#> 13 A D c 2
#> 14 A D d 3
#> 15 A D e 0
#> 16 B D a 4
#> 17 B D b 0
#> 18 B D c 2
#> 19 B D d 3
#> 20 B D e 0
与 expand.grid
一样,tidyr::crossing
扩展得比必要的多(例如 A
/A
组合),这可能会在规模上减慢速度。如果写起来更烦人,基于 combn
的方法可能会更快。
拆分和使用自联接是通过添加列而不是行来创建组合的更直接的方法。它需要一些光表体操,或者 split
:
output2 <- mat %>%
as.data.frame() %>%
rownames_to_column('N') %>%
gather(N3, value, -N) %>%
mutate(key = rowKey[N], # add column with key
value = na_if(value, 0)) %>%
split(.$key) %>% # split list by key
# join list elements to add N1/N2 and value combinations
reduce(full_join, by = 'N3', suffix = sub('N', '', names(.))) %>%
transmute(N1, N2, N3,
value = coalesce(value1 + value2, 0L)) %>%
arrange(N2, N1)
all_equal(output1, output2)
#> [1] TRUE
...或tidyr::nest
:
output3 <- mat %>%
as.data.frame() %>%
rownames_to_column('N') %>%
gather(N3, value, -N) %>%
mutate(key = rowKey[N],
value = na_if(value, 0)) %>%
nest(-key) %>% # store all but key column as nested data frame
# join nested data frames by N3 to get N1/N2 and value combinations
{ reduce(.$data, full_join, by = 'N3', suffix = sub('N', '', .$key)) } %>%
transmute(N1, N2, N3,
value = coalesce(value1 + value2, 0L)) %>%
arrange(N2, N1)
all_equal(output1, output3)
#> [1] TRUE
reduce
调用可以替换为 purrr::invoke
/do.call
,因为 reduce
仅调用 full_join
一次,但减少连接是常见的习语,并可能使该方法更易于扩展。
我想 reshape/melt 一个非对称矩阵,这样当两个列单元格根据 rowKey 都不为零时,任意两行跨列求和。我尝试了各种选项,但 none 都有效。我正在寻找适用于大型非对称矩阵的通用解决方案。
#Dummy data
set.seed(123)
mat <- matrix(rbinom(20,100,0.01),4,5,dimnames=list(LETTERS[1:4],letters[1:5]))
mat
a b c d e
A 0 3 1 1 0
B 2 0 1 1 0
C 1 1 3 0 0
D 2 2 1 2 3
rowKey <- c("A"="N1","B"="N1","C"="N2","D"="N2")
#Desired output
N1 N2 N3 value
1 A C a 0
2 A C b 4
3 A C c 4
4 A C d 0
5 A C e 0
6 B C a 3
7 B C b 0
8 B C c 4
9 B C d 0
10 B C e 0
11 A D a 0
12 A D b 5
13 A D c 2
14 A D d 3
15 A D e 0
16 B D a 4
17 B D b 0
18 B D c 2
19 B D d 3
20 B D e 0
非常感谢任何指点!
temp = expand.grid(c(split(names(rowKey), rowKey), list(N3 = colnames(mat))))
temp2 = sapply(1:nrow(temp), function(i)
mat[row.names(mat) == temp$N1[i] | row.names(mat) == temp$N2[i],
colnames(mat) == temp$N3[i]])
temp$value = colSums(temp2) * (colSums(temp2 > 0) == nrow(temp2))
temp
# N1 N2 N3 value
#1 A C a 0
#2 B C a 3
#3 A D a 0
#4 B D a 4
#5 A C b 4
#6 B C b 0
#7 A D b 5
#8 B D b 0
#9 A C c 4
#10 B C c 4
#11 A D c 2
#12 B D c 2
#13 A C d 0
#14 B C d 0
#15 A D d 3
#16 B D d 3
#17 A C e 0
#18 B C e 0
#19 A D e 0
#20 B D e 0
这是一个更长的数据处理方法:
set.seed(123)
mat <- matrix(rbinom(20,100,0.01),4,5,dimnames=list(LETTERS[1:4],letters[1:5]))
rowKey <- c("A"="N1","B"="N1","C"="N2","D"="N2")
按行键分割矩阵:
> n1 <- names(which(rowKey=="N1"))
> mat[n1,]
a b c d e
A 0 3 1 1 0
B 2 0 1 1 0
> n2 <- names(which(rowKey=="N2"))
> mat[n2,]
a b c d e
C 1 1 3 0 0
D 2 2 1 2 3
然后将数据转换为熔融数据帧。
> library(reshape2)
> mmat1 <- melt(mat[n1,])
> mmat1
Var1 Var2 value
1 A a 0
2 B a 2
3 A b 3
4 B b 0
5 A c 1
6 B c 1
7 A d 1
8 B d 1
9 A e 0
10 B e 0
> mmat2 <- melt(mat[n2,])
> mmat2
Var1 Var2 value
1 C a 1
2 D a 2
3 C b 1
4 D b 2
5 C c 3
6 D c 1
7 C d 0
8 D d 2
9 C e 0
10 D e 3
然后按列名合并数据框,合并时注意列名
> colnames(mmat1) <- c("N1","N3","Val1")
> colnames(mmat2) <- c("N2","N3","Val2")
> mmat12 <- merge(mmat1,mmat2)
现在我们可以用条件
组成最终矩阵> res <- cbind(mmat12[c('N1','N2','N3')],mmat12['Val1']+mmat12['Val2'])
> res[(mmat12['Val1'] == 0)|(mmat12['Val2'] == 0),4] <- 0
> res[with(res, order(N1,N2,N3)),]
N1 N2 N3 Val1
1 A C a 0
5 A C b 4
9 A C c 4
13 A C d 0
17 A C e 0
2 A D a 0
6 A D b 5
10 A D c 2
14 A D d 3
18 A D e 0
3 B C a 3
7 B C b 0
11 B C c 4
15 B C d 0
19 B C e 0
4 B D a 4
8 B D b 0
12 B D c 2
16 B D d 3
20 B D e 0
这里有一些以 data.frame 为中心的选项,使用 tidyverse 进行操作:
library(tidyverse)
set.seed(123)
mat <- matrix(rbinom(20, 100, 0.01), 4, 5,
dimnames = list(LETTERS[1:4], letters[1:5]))
rowKey <- c("A" = "N1", "B" = "N1", "C" = "N2", "D" = "N2")
output1 <- mat %>%
as.data.frame() %>%
rownames_to_column('N1') %>%
gather(N3, value, -N1) %>% # reshape to long form
crossing(N2 = .$N1) %>% # add combinations of rowname values
filter(N1 != N2, rowKey[N1] != rowKey[N2]) %>% # drop unwanted combinations
mutate(value = na_if(value, 0), # change 0 values to NA so sum will be 0
# sort rowname values to make group ID column for aggregation
id = map2_chr(N1, N2, ~toString(sort(c(.x, .y))))) %>%
group_by(id, N3) %>%
summarise(N1 = min(N1), # get alpabetically first rowname for N1
N2 = max(N2), # and last for N2
value = coalesce(sum(value), 0L)) %>% # sum and replace NAs with 0s again
# clean up
ungroup() %>%
select(N1, N2, N3, value) %>%
arrange(N2)
output1
#> # A tibble: 20 x 4
#> N1 N2 N3 value
#> <chr> <chr> <chr> <int>
#> 1 A C a 0
#> 2 A C b 4
#> 3 A C c 4
#> 4 A C d 0
#> 5 A C e 0
#> 6 B C a 3
#> 7 B C b 0
#> 8 B C c 4
#> 9 B C d 0
#> 10 B C e 0
#> 11 A D a 0
#> 12 A D b 5
#> 13 A D c 2
#> 14 A D d 3
#> 15 A D e 0
#> 16 B D a 4
#> 17 B D b 0
#> 18 B D c 2
#> 19 B D d 3
#> 20 B D e 0
与 expand.grid
一样,tidyr::crossing
扩展得比必要的多(例如 A
/A
组合),这可能会在规模上减慢速度。如果写起来更烦人,基于 combn
的方法可能会更快。
拆分和使用自联接是通过添加列而不是行来创建组合的更直接的方法。它需要一些光表体操,或者 split
:
output2 <- mat %>%
as.data.frame() %>%
rownames_to_column('N') %>%
gather(N3, value, -N) %>%
mutate(key = rowKey[N], # add column with key
value = na_if(value, 0)) %>%
split(.$key) %>% # split list by key
# join list elements to add N1/N2 and value combinations
reduce(full_join, by = 'N3', suffix = sub('N', '', names(.))) %>%
transmute(N1, N2, N3,
value = coalesce(value1 + value2, 0L)) %>%
arrange(N2, N1)
all_equal(output1, output2)
#> [1] TRUE
...或tidyr::nest
:
output3 <- mat %>%
as.data.frame() %>%
rownames_to_column('N') %>%
gather(N3, value, -N) %>%
mutate(key = rowKey[N],
value = na_if(value, 0)) %>%
nest(-key) %>% # store all but key column as nested data frame
# join nested data frames by N3 to get N1/N2 and value combinations
{ reduce(.$data, full_join, by = 'N3', suffix = sub('N', '', .$key)) } %>%
transmute(N1, N2, N3,
value = coalesce(value1 + value2, 0L)) %>%
arrange(N2, N1)
all_equal(output1, output3)
#> [1] TRUE
reduce
调用可以替换为 purrr::invoke
/do.call
,因为 reduce
仅调用 full_join
一次,但减少连接是常见的习语,并可能使该方法更易于扩展。