如何将表格数据重塑为每组一行
How to reshape tabular data to one row per group
我是 R(以及编程)新手,正在寻找一种方法,把下面显示的 Table A 重新整理成 Table B 的形式。
Table A:
type x1 x2 x3
A 4 6 9
A 7 4 1
A 9 6 2
B 1 3 8
B 2 7 9
我正在寻找可以转换为以下内容的代码
Table B:
type x1 x2 x3 x1' x2' x3' x1'' x2'' x3''
A 4 6 9 7 4 1 9 6 2
B 1 3 8 2 7 9
真正的Table A 超过150000 行和36 列。具有 2100 个独特的 "type" 值。
感谢您的帮助。
-肖恩
可以试试下面的做法。这个解决方案不算简洁,只是给你一个提示,我认为还有很多地方可以改进。
不过最终还是不得不在结果中引入 NA :(
# Recreate the example table from inline text.
zz <- "type x1 x2 x3
A 4 6 9
A 7 4 1
A 9 6 2
B 1 3 8
B 2 7 9"
dA <- read.table(text = zz, header = TRUE)
# For each type, flatten its rows (row by row) into one vector.
# lapply() always returns a list; sapply() would collapse the result to a
# matrix whenever every type has the same number of rows, and the padding
# step below would then index single cells instead of whole vectors.
tmp <- lapply(
  unique(dA$type),
  function(x) as.vector(t(dA[dA$type == x, -1]))
)
# Indexing past the end of a vector yields NA, so this pads the shorter
# vectors up to the longest one; rbind then gives one row per type.
max_len <- max(lengths(tmp))
wide <- do.call(rbind, lapply(tmp, "[", seq_len(max_len)))
wide
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] 4 6 9 7 4 1 9 6 2
[2,] 1 3 8 2 7 9 NA NA NA
a <- data.frame(
  type = c("A", "A", "A", "B", "B"),
  x1 = c(4, 7, 9, 1, 2),
  x2 = c(6, 4, 6, 3, 7),
  x3 = c(9, 1, 2, 8, 9)
)
# Number the rows within each type; the number becomes the suffix of the
# wide columns (x1.1, x1.2, ...).
a$timevar <- ave(seq_along(a$type), a$type, FUN = seq_along)
# reshape() fills type/timevar combinations that do not exist with NA, so the
# groups do not have to be padded to equal size by hand. Padding rows with
# c(i, NA, NA, NA) is a character vector and would silently turn the numeric
# x columns into character (missing cells then print as <NA> instead of NA).
b <- reshape(a, timevar = "timevar", idvar = "type", direction = "wide")
b
type x1.1 x2.1 x3.1 x1.2 x2.2 x3.2 x1.3 x2.3 x3.3
1 A 4 6 9 7 4 1 9 6 2
4 B 1 3 8 2 7 9 <NA> <NA> <NA>
@Hack-R 答案的一个变体:
# Running row number within each type (character, because ave() is fed
# a character vector).
A$num <- ave(as.character(A$type), A$type, FUN = seq_along)
# Stack columns 2-4 into values/ind and keep columns 1 and 5 next to them.
tmp <- cbind(A[c(1, 5)], stack(A[2:4]))
# Key of the form "<ind>.<num>", one per future wide column.
tmp$time <- paste0(tmp$ind, ".", tmp$num)
reshape(
  tmp[c("type", "time", "values")],
  timevar = "time",
  idvar = "type",
  direction = "wide"
)
# type values.x1.1 values.x1.2 values.x1.3 values.x2.1 values.x2.2 values.x2.3 values.x3.1 values.x3.2 values.x3.3
#1 A 4 7 9 6 4 6 9 1 2
#4 B 1 2 NA 3 7 NA 8 9 NA
还有一个有趣的 dplyr 版本:
library(dplyr)
library(tidyr)
# Go long first, number each value within its type/variable pair, glue
# "<time>_<var>" into a single key and spread the keys out as columns.
A %>%
  gather(key = var, value = value, -type) %>%
  group_by(type, var) %>%
  mutate(time = row_number()) %>%
  ungroup() %>%
  unite(grpvar, time, var) %>%
  spread(key = grpvar, value = value)
#Source: local data frame [2 x 10]
#
# type 1_x1 1_x2 1_x3 2_x1 2_x2 2_x3 3_x1 3_x2 3_x3
# (chr) (int) (int) (int) (int) (int) (int) (int) (int) (int)
#1 A 4 6 9 7 4 1 9 6 2
#2 B 1 3 8 2 7 9 NA NA NA
对我来说,这个解决方案似乎非常简单
# split the data frame by type and flatten each piece column by column.
# Every value is named "<column><row-within-type>" (x11, x12, ...); the names
# are built explicitly because unlist() leaves out the row suffix when a
# type has only a single row.
ld <- lapply(split(d[-1], d[["type"]]), function(g) {
  setNames(
    unlist(g, use.names = FALSE),
    paste0(rep(names(g), each = nrow(g)), seq_len(nrow(g)))
  )
})
# the longest element carries the complete set of names.
# (Reduce(unique, ...) would pass the second name vector as unique()'s
# `incomparables` argument and only ever return the first group's names.)
ldNames <- names(ld[[which.max(lengths(ld))]])
# use the names to index each list element, which makes them
# all of equal length and suitable for row binding.
do.call(rbind, lapply(ld, function(x) setNames(x[ldNames], ldNames)))
# x11 x12 x13 x21 x22 x23 x31 x32 x33
# A 4 7 9 6 4 6 9 1 2
# B 1 2 NA 3 7 NA 8 9 NA
如果上面的输出顺序不满意,可以重新排列:
# save the output from above
d2 <- do.call(rbind, lapply(ld, function(x) x[ldNames]))
# reorder the names: ldNames is grouped by variable (x11, x12, ..., x21, ...).
# Lay the names out with one row per value column and read them back column
# by column, which interleaves the variables (x11, x21, x31, x12, ...).
# It has to be nrow, not ncol: with ncol the layout is only right when the
# largest group happens to have as many rows as there are value columns.
ldNames_sorted <- c(matrix(ldNames, nrow = ncol(d) - 1, byrow = TRUE))
# apply the new order.
d2 <- d2[, ldNames_sorted]
# x11 x21 x31 x12 x22 x32 x13 x23 x33
#A 4 6 9 7 4 1 9 6 2
#B 1 3 8 2 7 9 NA NA NA
要为类型添加列而不是使用行名称,一种方法是:
# Copy the row names of d2 into a leading "type" column.
data.frame(type = rownames(d2), d2)
虽然来得有点晚,但这也可以通过 data.table 包的 dcast 函数轻松完成,因为它允许同时使用多个 value.var:
library(data.table)
# Cast to wide: one row per type, and for each of x1, x2, x3 one column per
# row id within the type; the empty separator yields names like x11, x12.
dcast(
  setDT(d),
  type ~ rowid(type),
  value.var = c("x1", "x2", "x3"),
  sep = ""
)
给出:
type x11 x12 x13 x21 x22 x23 x31 x32 x33
1: A 4 7 9 6 4 6 9 1 2
2: B 1 2 NA 3 7 NA 8 9 NA
您也可以在 base R 中执行此操作:
# Row counter within each type; x1 is only the vector ave() runs over.
d$num <- with(d, ave(x1, type, FUN = seq_along))
# Spread to wide, using the counter as the column suffix with no separator.
reshape(d, timevar = "num", idvar = "type", direction = "wide", sep = "")
我是 R(以及编程)新手,正在寻找一种方法,把下面显示的 Table A 重新整理成 Table B 的形式。
Table A:
type x1 x2 x3
A 4 6 9
A 7 4 1
A 9 6 2
B 1 3 8
B 2 7 9
我正在寻找可以转换为以下内容的代码
Table B:
type x1 x2 x3 x1' x2' x3' x1'' x2'' x3''
A 4 6 9 7 4 1 9 6 2
B 1 3 8 2 7 9
真正的Table A 超过150000 行和36 列。具有 2100 个独特的 "type" 值。
感谢您的帮助。
-肖恩
可以试试下面的做法。这个解决方案不算简洁,只是给你一个提示,我认为还有很多地方可以改进。
不过最终还是不得不在结果中引入 NA :(
# Recreate the example table from inline text.
zz <- "type x1 x2 x3
A 4 6 9
A 7 4 1
A 9 6 2
B 1 3 8
B 2 7 9"
dA <- read.table(text = zz, header = TRUE)
# For each type, flatten its rows (row by row) into one vector.
# lapply() always returns a list; sapply() would collapse the result to a
# matrix whenever every type has the same number of rows, and the padding
# step below would then index single cells instead of whole vectors.
tmp <- lapply(
  unique(dA$type),
  function(x) as.vector(t(dA[dA$type == x, -1]))
)
# Indexing past the end of a vector yields NA, so this pads the shorter
# vectors up to the longest one; rbind then gives one row per type.
max_len <- max(lengths(tmp))
wide <- do.call(rbind, lapply(tmp, "[", seq_len(max_len)))
wide
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] 4 6 9 7 4 1 9 6 2
[2,] 1 3 8 2 7 9 NA NA NA
a <- data.frame(
  type = c("A", "A", "A", "B", "B"),
  x1 = c(4, 7, 9, 1, 2),
  x2 = c(6, 4, 6, 3, 7),
  x3 = c(9, 1, 2, 8, 9)
)
# Number the rows within each type; the number becomes the suffix of the
# wide columns (x1.1, x1.2, ...).
a$timevar <- ave(seq_along(a$type), a$type, FUN = seq_along)
# reshape() fills type/timevar combinations that do not exist with NA, so the
# groups do not have to be padded to equal size by hand. Padding rows with
# c(i, NA, NA, NA) is a character vector and would silently turn the numeric
# x columns into character (missing cells then print as <NA> instead of NA).
b <- reshape(a, timevar = "timevar", idvar = "type", direction = "wide")
b
type x1.1 x2.1 x3.1 x1.2 x2.2 x3.2 x1.3 x2.3 x3.3
1 A 4 6 9 7 4 1 9 6 2
4 B 1 3 8 2 7 9 <NA> <NA> <NA>
@Hack-R 答案的一个变体:
# Running row number within each type (character, because ave() is fed
# a character vector).
A$num <- ave(as.character(A$type), A$type, FUN = seq_along)
# Stack columns 2-4 into values/ind and keep columns 1 and 5 next to them.
tmp <- cbind(A[c(1, 5)], stack(A[2:4]))
# Key of the form "<ind>.<num>", one per future wide column.
tmp$time <- paste0(tmp$ind, ".", tmp$num)
reshape(
  tmp[c("type", "time", "values")],
  timevar = "time",
  idvar = "type",
  direction = "wide"
)
# type values.x1.1 values.x1.2 values.x1.3 values.x2.1 values.x2.2 values.x2.3 values.x3.1 values.x3.2 values.x3.3
#1 A 4 7 9 6 4 6 9 1 2
#4 B 1 2 NA 3 7 NA 8 9 NA
还有一个有趣的 dplyr 版本:
library(dplyr)
library(tidyr)
# Go long first, number each value within its type/variable pair, glue
# "<time>_<var>" into a single key and spread the keys out as columns.
A %>%
  gather(key = var, value = value, -type) %>%
  group_by(type, var) %>%
  mutate(time = row_number()) %>%
  ungroup() %>%
  unite(grpvar, time, var) %>%
  spread(key = grpvar, value = value)
#Source: local data frame [2 x 10]
#
# type 1_x1 1_x2 1_x3 2_x1 2_x2 2_x3 3_x1 3_x2 3_x3
# (chr) (int) (int) (int) (int) (int) (int) (int) (int) (int)
#1 A 4 6 9 7 4 1 9 6 2
#2 B 1 3 8 2 7 9 NA NA NA
对我来说,这个解决方案似乎非常简单
# split the data frame by type and flatten each piece column by column.
# Every value is named "<column><row-within-type>" (x11, x12, ...); the names
# are built explicitly because unlist() leaves out the row suffix when a
# type has only a single row.
ld <- lapply(split(d[-1], d[["type"]]), function(g) {
  setNames(
    unlist(g, use.names = FALSE),
    paste0(rep(names(g), each = nrow(g)), seq_len(nrow(g)))
  )
})
# the longest element carries the complete set of names.
# (Reduce(unique, ...) would pass the second name vector as unique()'s
# `incomparables` argument and only ever return the first group's names.)
ldNames <- names(ld[[which.max(lengths(ld))]])
# use the names to index each list element, which makes them
# all of equal length and suitable for row binding.
do.call(rbind, lapply(ld, function(x) setNames(x[ldNames], ldNames)))
# x11 x12 x13 x21 x22 x23 x31 x32 x33
# A 4 7 9 6 4 6 9 1 2
# B 1 2 NA 3 7 NA 8 9 NA
如果上面的输出顺序不满意,可以重新排列:
# save the output from above
d2 <- do.call(rbind, lapply(ld, function(x) x[ldNames]))
# reorder the names: ldNames is grouped by variable (x11, x12, ..., x21, ...).
# Lay the names out with one row per value column and read them back column
# by column, which interleaves the variables (x11, x21, x31, x12, ...).
# It has to be nrow, not ncol: with ncol the layout is only right when the
# largest group happens to have as many rows as there are value columns.
ldNames_sorted <- c(matrix(ldNames, nrow = ncol(d) - 1, byrow = TRUE))
# apply the new order.
d2 <- d2[, ldNames_sorted]
# x11 x21 x31 x12 x22 x32 x13 x23 x33
#A 4 6 9 7 4 1 9 6 2
#B 1 3 8 2 7 9 NA NA NA
要为类型添加列而不是使用行名称,一种方法是:
# Copy the row names of d2 into a leading "type" column.
data.frame(type = rownames(d2), d2)
虽然来得有点晚,但这也可以通过 data.table 包的 dcast 函数轻松完成,因为它允许同时使用多个 value.var:
library(data.table)
# Cast to wide: one row per type, and for each of x1, x2, x3 one column per
# row id within the type; the empty separator yields names like x11, x12.
dcast(
  setDT(d),
  type ~ rowid(type),
  value.var = c("x1", "x2", "x3"),
  sep = ""
)
给出:
type x11 x12 x13 x21 x22 x23 x31 x32 x33
1: A 4 7 9 6 4 6 9 1 2
2: B 1 2 NA 3 7 NA 8 9 NA
您也可以在 base R 中执行此操作:
# Row counter within each type; x1 is only the vector ave() runs over.
d$num <- with(d, ave(x1, type, FUN = seq_along))
# Spread to wide, using the counter as the column suffix with no separator.
reshape(d, timevar = "num", idvar = "type", direction = "wide", sep = "")