从长到宽重塑数据框
Reshaping dataframe from long to wide
我有如下这个 data.frame:
# Example data in long format: one row per (transaction, user) pair.
t1 <- data.frame(
  t_id  = c("61", "61", "61", "62", "62", "63"),
  u_id  = c("84", "85", "86", "84", "87", "88"),
  type  = c("d", "d", "d", "s", "s", "d"),
  v1    = c(0.25, 0.25, 0.25, 0.35, 0.35, 0.45),
  v2    = c(0.30, 0.30, 0.40, 0.50, 0.50, 1.00),
  tdate = as.Date(c(
    "2015-11-01", "2015-11-02", "2015-11-03",
    "2015-10-01", "2015-10-02", "2015-09-01"
  ))
)
这是它的样子:
t_id u_id type v1 v2 tdate
1 61 84 d 0.25 0.3 2015-11-01
2 61 85 d 0.25 0.3 2015-11-02
3 61 86 d 0.25 0.4 2015-11-03
4 62 84 s 0.35 0.5 2015-10-01
5 62 87 s 0.35 0.5 2015-10-02
6 63 88 d 0.45 1.0 2015-09-01
其中 t_id 是 transaction_id(交易编号),u_id 是 user_id(用户编号)。我希望输出按 t_id 和 u_id 分组,并给各列的值加上用户前缀:第一个用户对应的列使用前缀 u1_,第二个用户对应的列使用前缀 u2_,依此类推。假设每笔交易不会超过 3 个用户。
输出应如下所示:
t_id u1_id u1_type u1_v1 u1_v2 u1_tdate u2_id u2_type u2_v1 u2_v2 u2_tdate u3_id u3_type u3_v1 u3_v2 u3_tdate
61 84 d 0.25 0.3 2015-11-01 85 d 0.25 0.3 2015-11-02 86 d 0.25 0.4 2015-11-03
62 84 s 0.35 0.5 2015-10-01 87 s 0.35 0.5 2015-10-02
63 88 d 0.45 1.0 2015-09-01
我试过 reshape,但没有成功。关于应该如何实现,有什么思路吗?
您无法创建这样的 data.frame(即各行的列数不同),但是您可以创建一个列表。
这非常接近:
# Build one single-row, wide data.frame per transaction (t_id). The columns of
# the k-th row of a transaction get the prefix "u<k>_"; the first column
# (t_id) is left unprefixed and serves as the merge key. Transactions have
# different numbers of users, so the result is a list named by t_id rather
# than one rectangular data.frame.
lapply(split(t1, t1$t_id), function(x) {
  # Prefix every column except the first one with "u<k>_".
  add_prefix <- function(row, k) {
    colnames(row)[-1] <- paste0("u", k, "_", colnames(row)[-1])
    row
  }

  # Zero or one row: nothing to merge, only the u1_ prefix applies.
  if (nrow(x) < 2L) {
    return(add_prefix(x, 1L))
  }

  # seq_len() replaces 1:nrow(x). The user index is handed to add_prefix()
  # explicitly, so no counter has to be mutated with `<<-` inside Reduce().
  rows <- split(x, seq_len(nrow(x)))
  prefixed <- Map(add_prefix, rows, seq_along(rows))

  # Join the one-row frames left to right on the transaction id.
  Reduce(function(left, right) merge(left, right, by = "t_id"), prefixed)
})
哦,是的,顺便说一句:我强烈支持@SabDeM 的评论。 ;)
我认为你能做的最好的办法,就是进行一系列 cast(dcast)操作:
library(reshape2)
# Example data in long format: one row per (transaction, user) pair.
# stringsAsFactors = FALSE keeps the text columns as character vectors.
t1 <- data.frame(
  t_id  = c("61", "61", "61", "62", "62", "63"),
  u_id  = c("84", "85", "86", "84", "87", "88"),
  type  = c("d", "d", "d", "s", "s", "d"),
  v1    = c(0.25, 0.25, 0.25, 0.35, 0.35, 0.45),
  v2    = c(0.30, 0.30, 0.40, 0.50, 0.50, 1.00),
  tdate = as.Date(c(
    "2015-11-01", "2015-11-02", "2015-11-03",
    "2015-10-01", "2015-10-02", "2015-09-01"
  )),
  stringsAsFactors = FALSE
)

# Every column except the key t_id is spread into one column per user.
vars <- names(t1)[-1]

# Per-transaction user label ("u1_", "u2_", ...) in row order within each
# t_id. seq_along() replaces seq(along = x), which relied on partial matching
# of the `along.with` argument.
t1$seq <- ave(
  t1$t_id, t1$t_id,
  FUN = function(x) paste0("u", seq_along(x), "_")
)

# Start from one row per transaction, then merge in one wide table per
# variable.
out <- data.frame(t_id = unique(t1$t_id))
for (i in vars) {
  temp <- dcast(t1, t_id ~ seq, value.var = i)
  # Column names become "<user label><variable>", e.g. "u1_v1".
  names(temp)[-1] <- paste0(names(temp)[-1], i)
  if (i == "tdate") {
    # NOTE(review): dcast() appears to hand the dates back as plain numbers
    # (days since 1970-01-01) -- confirm. as.Date() on numeric input needs an
    # explicit origin on R versions before 4.3, so it is supplied here.
    # List-style indexing (temp[-1]) keeps working when there is only a
    # single user column, where temp[, -1] would collapse to a vector.
    temp[-1] <- lapply(temp[-1], as.Date, origin = "1970-01-01")
  }
  # Name the key explicitly instead of relying on the shared column names.
  out <- merge(out, temp, by = "t_id")
}

# Alphabetical column order puts t_id first and groups the columns by user
# prefix (u1_..., u2_..., u3_...).
out <- out[, sort(names(out))]
我有如下这个 data.frame:
# Example data in long format: one row per (transaction, user) pair.
t1 <- data.frame(
  t_id  = c("61", "61", "61", "62", "62", "63"),
  u_id  = c("84", "85", "86", "84", "87", "88"),
  type  = c("d", "d", "d", "s", "s", "d"),
  v1    = c(0.25, 0.25, 0.25, 0.35, 0.35, 0.45),
  v2    = c(0.30, 0.30, 0.40, 0.50, 0.50, 1.00),
  tdate = as.Date(c(
    "2015-11-01", "2015-11-02", "2015-11-03",
    "2015-10-01", "2015-10-02", "2015-09-01"
  ))
)
这是它的样子:
t_id u_id type v1 v2 tdate
1 61 84 d 0.25 0.3 2015-11-01
2 61 85 d 0.25 0.3 2015-11-02
3 61 86 d 0.25 0.4 2015-11-03
4 62 84 s 0.35 0.5 2015-10-01
5 62 87 s 0.35 0.5 2015-10-02
6 63 88 d 0.45 1.0 2015-09-01
其中 t_id 是 transaction_id(交易编号),u_id 是 user_id(用户编号)。我希望输出按 t_id 和 u_id 分组,并给各列的值加上用户前缀:第一个用户对应的列使用前缀 u1_,第二个用户对应的列使用前缀 u2_,依此类推。假设每笔交易不会超过 3 个用户。
输出应如下所示:
t_id u1_id u1_type u1_v1 u1_v2 u1_tdate u2_id u2_type u2_v1 u2_v2 u2_tdate u3_id u3_type u3_v1 u3_v2 u3_tdate
61 84 d 0.25 0.3 2015-11-01 85 d 0.25 0.3 2015-11-02 86 d 0.25 0.4 2015-11-03
62 84 s 0.35 0.5 2015-10-01 87 s 0.35 0.5 2015-10-02
63 88 d 0.45 1.0 2015-09-01
我试过 reshape,但没有成功。关于应该如何实现,有什么思路吗?
您无法创建这样的 data.frame(即各行的列数不同),但是您可以创建一个列表。
这非常接近:
# Build one single-row, wide data.frame per transaction (t_id). The columns of
# the k-th row of a transaction get the prefix "u<k>_"; the first column
# (t_id) is left unprefixed and serves as the merge key. Transactions have
# different numbers of users, so the result is a list named by t_id rather
# than one rectangular data.frame.
lapply(split(t1, t1$t_id), function(x) {
  # Prefix every column except the first one with "u<k>_".
  add_prefix <- function(row, k) {
    colnames(row)[-1] <- paste0("u", k, "_", colnames(row)[-1])
    row
  }

  # Zero or one row: nothing to merge, only the u1_ prefix applies.
  if (nrow(x) < 2L) {
    return(add_prefix(x, 1L))
  }

  # seq_len() replaces 1:nrow(x). The user index is handed to add_prefix()
  # explicitly, so no counter has to be mutated with `<<-` inside Reduce().
  rows <- split(x, seq_len(nrow(x)))
  prefixed <- Map(add_prefix, rows, seq_along(rows))

  # Join the one-row frames left to right on the transaction id.
  Reduce(function(left, right) merge(left, right, by = "t_id"), prefixed)
})
哦,是的,顺便说一句:我强烈支持@SabDeM 的评论。 ;)
我认为你能做的最好的办法,就是进行一系列 cast(dcast)操作:
library(reshape2)
# Example data in long format: one row per (transaction, user) pair.
# stringsAsFactors = FALSE keeps the text columns as character vectors.
t1 <- data.frame(
  t_id  = c("61", "61", "61", "62", "62", "63"),
  u_id  = c("84", "85", "86", "84", "87", "88"),
  type  = c("d", "d", "d", "s", "s", "d"),
  v1    = c(0.25, 0.25, 0.25, 0.35, 0.35, 0.45),
  v2    = c(0.30, 0.30, 0.40, 0.50, 0.50, 1.00),
  tdate = as.Date(c(
    "2015-11-01", "2015-11-02", "2015-11-03",
    "2015-10-01", "2015-10-02", "2015-09-01"
  )),
  stringsAsFactors = FALSE
)

# Every column except the key t_id is spread into one column per user.
vars <- names(t1)[-1]

# Per-transaction user label ("u1_", "u2_", ...) in row order within each
# t_id. seq_along() replaces seq(along = x), which relied on partial matching
# of the `along.with` argument.
t1$seq <- ave(
  t1$t_id, t1$t_id,
  FUN = function(x) paste0("u", seq_along(x), "_")
)

# Start from one row per transaction, then merge in one wide table per
# variable.
out <- data.frame(t_id = unique(t1$t_id))
for (i in vars) {
  temp <- dcast(t1, t_id ~ seq, value.var = i)
  # Column names become "<user label><variable>", e.g. "u1_v1".
  names(temp)[-1] <- paste0(names(temp)[-1], i)
  if (i == "tdate") {
    # NOTE(review): dcast() appears to hand the dates back as plain numbers
    # (days since 1970-01-01) -- confirm. as.Date() on numeric input needs an
    # explicit origin on R versions before 4.3, so it is supplied here.
    # List-style indexing (temp[-1]) keeps working when there is only a
    # single user column, where temp[, -1] would collapse to a vector.
    temp[-1] <- lapply(temp[-1], as.Date, origin = "1970-01-01")
  }
  # Name the key explicitly instead of relying on the shared column names.
  out <- merge(out, temp, by = "t_id")
}

# Alphabetical column order puts t_id first and groups the columns by user
# prefix (u1_..., u2_..., u3_...).
out <- out[, sort(names(out))]