通过加权变量重塑矩阵
Reshape matrix by weighting variable
我的数据:
datas=structure(list(Id = 1:4, Product1 = structure(c(3L, 2L, 2L, 1L
), .Label = c("1,2,5", "1,3", "5"), class = "factor"), Product2 = structure(c(4L,
3L, 1L, 2L), .Label = c("A", "A,B,E", "B,D", "D"), class = "factor")), .Names = c("Id",
"Product1", "Product2"), class = "data.frame", row.names = c(NA,
-4L))
这在视觉上给出了(人们会购买两种产品,并且对于每种产品,一个或多个原因。对于产品 1(原因=1、2、3、4、5)和产品 2(原因=A、B、C ,D,E)和原因可以组合
Id Product1 Product2
1 5 D
2 1,3 B,D
3 1,3 A
4 1,2,5 A,B,E
我想重塑如下
对于每个 ID 计算:
Id= 1
A B C D E
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
5 0 0 0 1 0
Id= 2 : here 0.25 because we have B1,B3,D1,D3 so 1/4 for each one
A B C D E
1 0 0,25 0 0,25 0
2 0 0 0 0 0
3 0 0,25 0 0,25 0
4 0 0 0 0 0
5 0 0 0 1 0
on so on Id = 4, we have : a1,a2,a5,b1,b2,b5,e1,e2,e5, so 1/9 for each one.
A B C D E
1 0,11+0,5 0,11+0,25 0 0,25 0,11
2 0,11+0 0,11+0 0 0 0,11
3 0,5 0,25 0 0,25 0
4 0 0 0 0 0
5 0,11+0 0,11+0 0 1 0,11
如何轻松做到这一点,或者我应该计算每个矩阵并在每次迭代中求和?
非常感谢!
试试这个:
library(dplyr)
library(reshape2)
# split strings
L <- lapply(datas[,2:3],function(v) strsplit(as.character(v),','))
# generate all combinations of products
d <- mapply(expand.grid,L$Product1, L$Product2,SIMPLIFY = F,stringsAsFactors=F)
df <- melt(d,id.vars=c('Var1','Var2')) %>% # convert to long format
group_by(L1) %>%
mutate(weight=1/n()) %>% # calculate weights
group_by(Var1,Var2) %>% #
summarize(sm=sum(weight)) # calculate sums
dcast(df,Var1~Var2)
# Note that it ignores column C and row 4 because no data were available for them
# Var1 A B D E
# 1 1 0.6111111 0.3611111 0.25 0.1111111
# 2 2 0.1111111 0.1111111 NA 0.1111111
# 3 3 0.5000000 0.2500000 0.25 NA
# 4 5 0.1111111 0.1111111 1.00 0.1111111
datas=structure(list(Id = 1:4, Product1 = structure(c(3L, 2L, 2L, 1L), .Label = c("1,2,5", "1,3", "5"), class = "factor"),
Product2 = structure(c(4L, 3L, 1L, 2L), .Label = c("A", "A,B,E", "B,D", "D"), class = "factor")),
.Names = c("Id", "Product1", "Product2"), class = "data.frame", row.names = c(NA, -4L))
f <- function(p1, p2, lvls = 1:5) {
# p1 <- datas$Product1[2]; p2 <- datas$Product2[2]
p1 <- strsplit(as.character(p1), ',')[[1]]
p2 <- strsplit(as.character(p2), ',')[[1]]
t2 <- factor(rep(p2, each = length(p1)), levels = LETTERS[lvls])
t1 <- factor(rep(p1, length(p2)), levels = lvls)
tbl <- table(t1, t2)
tbl / sum(tbl)
}
对于单个 ID
Map(f, datas$Product1, datas$Product2)
# [[1]]
# t2
# t1 A B C D E
# 1 0 0 0 0 0
# 2 0 0 0 0 0
# 3 0 0 0 0 0
# 4 0 0 0 0 0
# 5 0 0 0 1 0
#
# [[2]]
# t2
# t1 A B C D E
# 1 0.00 0.25 0.00 0.25 0.00
# 2 0.00 0.00 0.00 0.00 0.00
# 3 0.00 0.25 0.00 0.25 0.00
# 4 0.00 0.00 0.00 0.00 0.00
# 5 0.00 0.00 0.00 0.00 0.00
#
# [[3]]
# t2
# t1 A B C D E
# 1 0.5 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0 0.0
# 3 0.5 0.0 0.0 0.0 0.0
# 4 0.0 0.0 0.0 0.0 0.0
# 5 0.0 0.0 0.0 0.0 0.0
#
# [[4]]
# t2
# t1 A B C D E
# 1 0.1111111 0.1111111 0.0000000 0.0000000 0.1111111
# 2 0.1111111 0.1111111 0.0000000 0.0000000 0.1111111
# 3 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
# 4 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
# 5 0.1111111 0.1111111 0.0000000 0.0000000 0.1111111
全部累积起来
Reduce(`+`, Map(f, datas$Product1, datas$Product2), accumulate = TRUE)
# [[1]]
# t2
# t1 A B C D E
# 1 0 0 0 0 0
# 2 0 0 0 0 0
# 3 0 0 0 0 0
# 4 0 0 0 0 0
# 5 0 0 0 1 0
#
# [[2]]
# t2
# t1 A B C D E
# 1 0.00 0.25 0.00 0.25 0.00
# 2 0.00 0.00 0.00 0.00 0.00
# 3 0.00 0.25 0.00 0.25 0.00
# 4 0.00 0.00 0.00 0.00 0.00
# 5 0.00 0.00 0.00 1.00 0.00
#
# [[3]]
# t2
# t1 A B C D E
# 1 0.50 0.25 0.00 0.25 0.00
# 2 0.00 0.00 0.00 0.00 0.00
# 3 0.50 0.25 0.00 0.25 0.00
# 4 0.00 0.00 0.00 0.00 0.00
# 5 0.00 0.00 0.00 1.00 0.00
#
# [[4]]
# t2
# t1 A B C D E
# 1 0.6111111 0.3611111 0.0000000 0.2500000 0.1111111
# 2 0.1111111 0.1111111 0.0000000 0.0000000 0.1111111
# 3 0.5000000 0.2500000 0.0000000 0.2500000 0.0000000
# 4 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
# 5 0.1111111 0.1111111 0.0000000 1.0000000 0.1111111
我的数据:
datas=structure(list(Id = 1:4, Product1 = structure(c(3L, 2L, 2L, 1L
), .Label = c("1,2,5", "1,3", "5"), class = "factor"), Product2 = structure(c(4L,
3L, 1L, 2L), .Label = c("A", "A,B,E", "B,D", "D"), class = "factor")), .Names = c("Id",
"Product1", "Product2"), class = "data.frame", row.names = c(NA,
-4L))
这在视觉上给出了(人们会购买两种产品,并且对于每种产品,一个或多个原因。对于产品 1(原因=1、2、3、4、5)和产品 2(原因=A、B、C ,D,E)和原因可以组合
Id Product1 Product2
1 5 D
2 1,3 B,D
3 1,3 A
4 1,2,5 A,B,E
我想重塑如下
对于每个 ID 计算:
Id= 1
A B C D E
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
5 0 0 0 1 0
Id= 2 : here 0.25 because we have B1,B3,D1,D3 so 1/4 for each one
A B C D E
1 0 0,25 0 0,25 0
2 0 0 0 0 0
3 0 0,25 0 0,25 0
4 0 0 0 0 0
5 0 0 0 1 0
on so on Id = 4, we have : a1,a2,a5,b1,b2,b5,e1,e2,e5, so 1/9 for each one.
A B C D E
1 0,11+0,5 0,11+0,25 0 0,25 0,11
2 0,11+0 0,11+0 0 0 0,11
3 0,5 0,25 0 0,25 0
4 0 0 0 0 0
5 0,11+0 0,11+0 0 1 0,11
如何轻松做到这一点,或者我应该计算每个矩阵并在每次迭代中求和?
非常感谢!
试试这个:
library(dplyr)
library(reshape2)
# split strings
L <- lapply(datas[,2:3],function(v) strsplit(as.character(v),','))
# generate all combinations of products
d <- mapply(expand.grid,L$Product1, L$Product2,SIMPLIFY = F,stringsAsFactors=F)
df <- melt(d,id.vars=c('Var1','Var2')) %>% # convert to long format
group_by(L1) %>%
mutate(weight=1/n()) %>% # calculate weights
group_by(Var1,Var2) %>% #
summarize(sm=sum(weight)) # calculate sums
dcast(df,Var1~Var2)
# Note that it ignores column C and row 4 because no data were available for them
# Var1 A B D E
# 1 1 0.6111111 0.3611111 0.25 0.1111111
# 2 2 0.1111111 0.1111111 NA 0.1111111
# 3 3 0.5000000 0.2500000 0.25 NA
# 4 5 0.1111111 0.1111111 1.00 0.1111111
datas=structure(list(Id = 1:4, Product1 = structure(c(3L, 2L, 2L, 1L), .Label = c("1,2,5", "1,3", "5"), class = "factor"),
Product2 = structure(c(4L, 3L, 1L, 2L), .Label = c("A", "A,B,E", "B,D", "D"), class = "factor")),
.Names = c("Id", "Product1", "Product2"), class = "data.frame", row.names = c(NA, -4L))
f <- function(p1, p2, lvls = 1:5) {
# p1 <- datas$Product1[2]; p2 <- datas$Product2[2]
p1 <- strsplit(as.character(p1), ',')[[1]]
p2 <- strsplit(as.character(p2), ',')[[1]]
t2 <- factor(rep(p2, each = length(p1)), levels = LETTERS[lvls])
t1 <- factor(rep(p1, length(p2)), levels = lvls)
tbl <- table(t1, t2)
tbl / sum(tbl)
}
对于单个 ID
Map(f, datas$Product1, datas$Product2)
# [[1]]
# t2
# t1 A B C D E
# 1 0 0 0 0 0
# 2 0 0 0 0 0
# 3 0 0 0 0 0
# 4 0 0 0 0 0
# 5 0 0 0 1 0
#
# [[2]]
# t2
# t1 A B C D E
# 1 0.00 0.25 0.00 0.25 0.00
# 2 0.00 0.00 0.00 0.00 0.00
# 3 0.00 0.25 0.00 0.25 0.00
# 4 0.00 0.00 0.00 0.00 0.00
# 5 0.00 0.00 0.00 0.00 0.00
#
# [[3]]
# t2
# t1 A B C D E
# 1 0.5 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0 0.0
# 3 0.5 0.0 0.0 0.0 0.0
# 4 0.0 0.0 0.0 0.0 0.0
# 5 0.0 0.0 0.0 0.0 0.0
#
# [[4]]
# t2
# t1 A B C D E
# 1 0.1111111 0.1111111 0.0000000 0.0000000 0.1111111
# 2 0.1111111 0.1111111 0.0000000 0.0000000 0.1111111
# 3 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
# 4 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
# 5 0.1111111 0.1111111 0.0000000 0.0000000 0.1111111
全部累积起来
Reduce(`+`, Map(f, datas$Product1, datas$Product2), accumulate = TRUE)
# [[1]]
# t2
# t1 A B C D E
# 1 0 0 0 0 0
# 2 0 0 0 0 0
# 3 0 0 0 0 0
# 4 0 0 0 0 0
# 5 0 0 0 1 0
#
# [[2]]
# t2
# t1 A B C D E
# 1 0.00 0.25 0.00 0.25 0.00
# 2 0.00 0.00 0.00 0.00 0.00
# 3 0.00 0.25 0.00 0.25 0.00
# 4 0.00 0.00 0.00 0.00 0.00
# 5 0.00 0.00 0.00 1.00 0.00
#
# [[3]]
# t2
# t1 A B C D E
# 1 0.50 0.25 0.00 0.25 0.00
# 2 0.00 0.00 0.00 0.00 0.00
# 3 0.50 0.25 0.00 0.25 0.00
# 4 0.00 0.00 0.00 0.00 0.00
# 5 0.00 0.00 0.00 1.00 0.00
#
# [[4]]
# t2
# t1 A B C D E
# 1 0.6111111 0.3611111 0.0000000 0.2500000 0.1111111
# 2 0.1111111 0.1111111 0.0000000 0.0000000 0.1111111
# 3 0.5000000 0.2500000 0.0000000 0.2500000 0.0000000
# 4 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
# 5 0.1111111 0.1111111 0.0000000 1.0000000 0.1111111