根据行名重新格式化数据表以在 R 中生成新列
Reformat data tables based on row names to generate new columns in R
我有一个看起来像
的数据框
m1 m2 m3
P001.st 60.00 2.0 1
P003.nd 14.30 2.077 1
P003.rt 29.60 2.077 1
P006.st 10.30 2.077 1
P006.nd 79.30 2.077 1
P008.nd 9.16 2.077 1
我想重新格式化 table 以便仅行名称的第一部分(句点之前,即 P001、P003 等)显示为行名称,并将具有相似名称的每个后续行附加到列。输出应该类似于
m1st m2st m3st m1nd m2nd m3nd m1rt m2rt m3rt
P001 60.00 2.0 1 0 0 0 0 0 0
P003 0 0 0 14.30 2.077 1 29.60 2.077 1
P006 10.30 2.077 1 79.30 2.077 1 0 0 0
P008 0 0 0 9.16 2.077 1 0 0 0
类似
的聚合函数
aggregate(value~name, df, I)
或来自 data.table 的方法,如
setDT(df)[, list(value=list(value)), by=name]
将不起作用,因为 row.names 不完全相同。将数百行与许多可变子类型匹配的任何建议(即,在句点之后:.nd、.st 等)。
试试这个:
library(tidyr)
library(dplyr)
library(reshape2)
library(stringr)
data <-
structure(list(m1 = c(60, 14.3, 29.6, 10.3, 79.3, 9.16),
m2 = c(2, 2.077, 2.077, 2.077, 2.077, 2.077),
m3 = c(1L, 1L, 1L, 1L, 1L, 1L)),
.Names = c("m1", "m2", "m3"),
class = "data.frame",
row.names = c("P001.st", "P003.nd", "P003.rt",
"P006.st", "P006.nd", "P008.nd"))
my_data <-
as_data_frame(cbind(col_01 = rownames(data), data)) %>%
melt(.) %>%
separate(., col_01, into = c("var_01", "var_02"), sep = "\.") %>%
mutate(my_var = str_c(variable, var_02)) %>%
select(var_01, my_var, value) %>%
arrange(var_01, my_var) %>%
spread(., my_var, value)
my_data
var_01 m1nd m1rt m1st m2nd m2rt m2st m3nd m3rt m3st
1 P001 NA NA 60.0 NA NA 2.000 NA NA 1
2 P003 14.30 29.6 NA 2.077 2.077 NA 1 1 NA
3 P006 79.30 NA 10.3 2.077 NA 2.077 1 NA 1
4 P008 9.16 NA NA 2.077 NA NA 1 NA NA
如果你想用0替换NAs,你可以这样做:
my_data[is.na(my_data)] <- 0
var_01 m1nd m1rt m1st m2nd m2rt m2st m3nd m3rt m3st
1 P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
2 P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
3 P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
4 P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
这是另一种方法:
library(dplyr)
library(tidyr)
(wide <- reshape(df %>% add_rownames() %>% separate(rowname, c("rowname", "id")),
idvar = "rowname",
timevar = "id",
direction = "wide",
sep = ""))
# rowname m1st m2st m3st m1nd m2nd m3nd m1rt m2rt m3rt
# 1 P001 60.0 2.000 1 NA NA NA NA NA NA
# 2 P003 NA NA NA 14.30 2.077 1 29.6 2.077 1
# 4 P006 10.3 2.077 1 79.30 2.077 1 NA NA NA
# 6 P008 NA NA NA 9.16 2.077 1 NA NA NA
wide[is.na(wide)] <- 0
rownames(wide) <- wide[, 1]
wide$rowname <- NULL
wide
# m1st m2st m3st m1nd m2nd m3nd m1rt m2rt m3rt
# P001 60.0 2.000 1 0.00 0.000 0 0.0 0.000 0
# P003 0.0 0.000 0 14.30 2.077 1 29.6 2.077 1
# P006 10.3 2.077 1 79.30 2.077 1 0.0 0.000 0
# P008 0.0 0.000 0 9.16 2.077 1 0.0 0.000 0
如果你的数据框叫做"data":
library(reshape2)
data$prefix <- gsub("(.*)\..*","\1",row.names(data))
data$suffix <- gsub(".*\.(.*)","\1",row.names(data))
data.melt <- melt(data)
data.melt
data.cast <- dcast(data.melt,prefix~variable+suffix,mean)
# set the row names to prefix
row.names(data.cast) <- data.cast$prefix
# get rid of the prefix column
data.cast <- data.cast[,-1]
data.cast
给予
Using prefix, suffix as id variables
m1_nd m1_rt m1_st m2_nd m2_rt m2_st m3_nd m3_rt m3_st
P001 NaN NaN 60.0 NaN NaN 2.000 NaN NaN 1
P003 14.30 29.6 NaN 2.077 2.077 NaN 1 1 NaN
P006 79.30 NaN 10.3 2.077 NaN 2.077 1 NaN 1
P008 9.16 NaN NaN 2.077 NaN NaN 1 NaN NaN
要更正列名和零而不是 NaN,请执行
names(data.cast) <- gsub("_","",names(data.cast))
apply(data.cast,c(1,2),function(x){as.numeric(ifelse(is.na(x),0,x)) })
得到
m1nd m1rt m1st m2nd m2rt m2st m3nd m3rt m3st
P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
dt = as.data.table(your_df, keep.rownames = T)
# split the row names into two id's
dt[, `:=`(id1 = sub('\..*', '', rn), id2 = sub('.*\.', '', rn), rn = NULL)]
# melt and dcast (need latest 1.9.5 or have to load reshape2 and use dcast.data.table)
dcast(melt(dt, id.vars = c('id1', 'id2')), id1 ~ variable + id2, fill = 0)
# id1 m1_nd m1_rt m1_st m2_nd m2_rt m2_st m3_nd m3_rt m3_st
#1: P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
#2: P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
#3: P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
#4: P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
使用 extract()
而不是 separate()
和更灵活的正则表达式,使用 tidyr
和 dplyr
:
df %>%
extract(id, c("id2", "var"), c("(P00.)\.(..)")) %>%
gather(variable,value,c(m1,m2,m3)) %>%
mutate(var=paste0(variable,".",var)) %>%
select(-variable) %>%
spread(var,value,fill=0)
id2 m1.nd m1.rt m1.st m2.nd m2.rt m2.st m3.nd m3.rt m3.st
1 P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
2 P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
3 P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
4 P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
我有一个看起来像
的数据框 m1 m2 m3
P001.st 60.00 2.0 1
P003.nd 14.30 2.077 1
P003.rt 29.60 2.077 1
P006.st 10.30 2.077 1
P006.nd 79.30 2.077 1
P008.nd 9.16 2.077 1
我想重新格式化 table 以便仅行名称的第一部分(句点之前,即 P001、P003 等)显示为行名称,并将具有相似名称的每个后续行附加到列。输出应该类似于
m1st m2st m3st m1nd m2nd m3nd m1rt m2rt m3rt
P001 60.00 2.0 1 0 0 0 0 0 0
P003 0 0 0 14.30 2.077 1 29.60 2.077 1
P006 10.30 2.077 1 79.30 2.077 1 0 0 0
P008 0 0 0 9.16 2.077 1 0 0 0
类似
的聚合函数aggregate(value~name, df, I)
或来自 data.table 的方法,如
setDT(df)[, list(value=list(value)), by=name]
将不起作用,因为 row.names 不完全相同。将数百行与许多可变子类型匹配的任何建议(即,在句点之后:.nd、.st 等)。
试试这个:
library(tidyr)
library(dplyr)
library(reshape2)
library(stringr)
data <-
structure(list(m1 = c(60, 14.3, 29.6, 10.3, 79.3, 9.16),
m2 = c(2, 2.077, 2.077, 2.077, 2.077, 2.077),
m3 = c(1L, 1L, 1L, 1L, 1L, 1L)),
.Names = c("m1", "m2", "m3"),
class = "data.frame",
row.names = c("P001.st", "P003.nd", "P003.rt",
"P006.st", "P006.nd", "P008.nd"))
my_data <-
as_data_frame(cbind(col_01 = rownames(data), data)) %>%
melt(.) %>%
separate(., col_01, into = c("var_01", "var_02"), sep = "\.") %>%
mutate(my_var = str_c(variable, var_02)) %>%
select(var_01, my_var, value) %>%
arrange(var_01, my_var) %>%
spread(., my_var, value)
my_data
var_01 m1nd m1rt m1st m2nd m2rt m2st m3nd m3rt m3st
1 P001 NA NA 60.0 NA NA 2.000 NA NA 1
2 P003 14.30 29.6 NA 2.077 2.077 NA 1 1 NA
3 P006 79.30 NA 10.3 2.077 NA 2.077 1 NA 1
4 P008 9.16 NA NA 2.077 NA NA 1 NA NA
如果你想用0替换NAs,你可以这样做:
my_data[is.na(my_data)] <- 0
var_01 m1nd m1rt m1st m2nd m2rt m2st m3nd m3rt m3st
1 P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
2 P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
3 P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
4 P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
这是另一种方法:
library(dplyr)
library(tidyr)
(wide <- reshape(df %>% add_rownames() %>% separate(rowname, c("rowname", "id")),
idvar = "rowname",
timevar = "id",
direction = "wide",
sep = ""))
# rowname m1st m2st m3st m1nd m2nd m3nd m1rt m2rt m3rt
# 1 P001 60.0 2.000 1 NA NA NA NA NA NA
# 2 P003 NA NA NA 14.30 2.077 1 29.6 2.077 1
# 4 P006 10.3 2.077 1 79.30 2.077 1 NA NA NA
# 6 P008 NA NA NA 9.16 2.077 1 NA NA NA
wide[is.na(wide)] <- 0
rownames(wide) <- wide[, 1]
wide$rowname <- NULL
wide
# m1st m2st m3st m1nd m2nd m3nd m1rt m2rt m3rt
# P001 60.0 2.000 1 0.00 0.000 0 0.0 0.000 0
# P003 0.0 0.000 0 14.30 2.077 1 29.6 2.077 1
# P006 10.3 2.077 1 79.30 2.077 1 0.0 0.000 0
# P008 0.0 0.000 0 9.16 2.077 1 0.0 0.000 0
如果你的数据框叫做"data":
library(reshape2)
data$prefix <- gsub("(.*)\..*","\1",row.names(data))
data$suffix <- gsub(".*\.(.*)","\1",row.names(data))
data.melt <- melt(data)
data.melt
data.cast <- dcast(data.melt,prefix~variable+suffix,mean)
# set the row names to prefix
row.names(data.cast) <- data.cast$prefix
# get rid of the prefix column
data.cast <- data.cast[,-1]
data.cast
给予
Using prefix, suffix as id variables
m1_nd m1_rt m1_st m2_nd m2_rt m2_st m3_nd m3_rt m3_st
P001 NaN NaN 60.0 NaN NaN 2.000 NaN NaN 1
P003 14.30 29.6 NaN 2.077 2.077 NaN 1 1 NaN
P006 79.30 NaN 10.3 2.077 NaN 2.077 1 NaN 1
P008 9.16 NaN NaN 2.077 NaN NaN 1 NaN NaN
要更正列名和零而不是 NaN,请执行
names(data.cast) <- gsub("_","",names(data.cast))
apply(data.cast,c(1,2),function(x){as.numeric(ifelse(is.na(x),0,x)) })
得到
m1nd m1rt m1st m2nd m2rt m2st m3nd m3rt m3st
P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
dt = as.data.table(your_df, keep.rownames = T)
# split the row names into two id's
dt[, `:=`(id1 = sub('\..*', '', rn), id2 = sub('.*\.', '', rn), rn = NULL)]
# melt and dcast (need latest 1.9.5 or have to load reshape2 and use dcast.data.table)
dcast(melt(dt, id.vars = c('id1', 'id2')), id1 ~ variable + id2, fill = 0)
# id1 m1_nd m1_rt m1_st m2_nd m2_rt m2_st m3_nd m3_rt m3_st
#1: P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
#2: P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
#3: P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
#4: P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
使用 extract()
而不是 separate()
和更灵活的正则表达式,使用 tidyr
和 dplyr
:
df %>%
extract(id, c("id2", "var"), c("(P00.)\.(..)")) %>%
gather(variable,value,c(m1,m2,m3)) %>%
mutate(var=paste0(variable,".",var)) %>%
select(-variable) %>%
spread(var,value,fill=0)
id2 m1.nd m1.rt m1.st m2.nd m2.rt m2.st m3.nd m3.rt m3.st
1 P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
2 P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
3 P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
4 P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0