从长格式重塑为宽格式
Reshape from long to wide format
例子
customer_code items
1 sugar
1 salt
2 sugar
2 accessories
3 salt
期望的输出
customer_code item item2 item3
1 sugar salt
2 sugar accessories
3 salt
你可以尝试使用函数 reshape:
如果希望列数与不同商品(items)的种类数一样多:
# Widen with base reshape(): one row per customer_code and, as the printed
# result below shows, one column per distinct value of items.
new_df <- reshape(
  data = df,
  direction = "wide",
  idvar = "customer_code",
  timevar = "items",
  v.names = "items"
)
new_df
# customer_code items.sugar items.salt items.accessories
#1 1 sugar salt <NA>
#3 2 sugar <NA> accessories
#5 3 <NA> salt <NA>
之后,您可以使用 colnames(new_df)[-1] <- paste0("item", 1:(ncol(new_df)-1)) 更改列名。
另一种选择:如果您希望列数等于单个客户最多拥有的商品数:
# Reshape each customer separately, then pad every piece to a common width so
# the pieces can be stacked with rbind(). The number of item columns equals the
# largest number of items held by any single customer.
df_split <- split(df, df[, 1])
df_split <- lapply(
  df_split, reshape,
  idvar = "customer_code", timevar = "items", v.names = "items",
  direction = "wide"
)
# vapply() guarantees exactly one integer per customer; sapply() would change
# its return type depending on the input.
max_item <- max(vapply(df_split, ncol, integer(1)))
df_split <- lapply(df_split, function(df) {
  # Pad narrower pieces with NA columns up to the common width.
  if (ncol(df) < max_item) {
    df <- cbind(df, matrix(NA, ncol = max_item - ncol(df)))
  }
  # seq_len() avoids the descending 1:0 sequence when there is no item column.
  colnames(df)[-1] <- paste0("item", seq_len(max_item - 1))
  df
})
new_df <- do.call("rbind", df_split)
new_df
# customer_code item1 item2
#1 1 sugar salt
#2 2 sugar accessories
#3 3 salt <NA>
您可以使用 tidyr 中的 spread:
library(dplyr)
library(tidyr)
# Label each distinct item as items1, items2, ... in order of first
# appearance, then spread those labels into columns. Cells with no item are
# filled with an empty string.
df1 %>%
  mutate(
    var = factor(
      items,
      levels = unique(items),
      labels = paste0("items", seq(n_distinct(items)))
    )
  ) %>%
  spread(var, items, fill = "")
# customer_code items1 items2 items3
#1 1 sugar salt
#2 2 sugar accessories
#3 3 salt
dplyr 包,尤其是 tidyr 包,可以解决这类问题。下面这段代码即可完成转换:
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later at the first function call.
library(tidyr)
library(dplyr)
# Spread items into one column per distinct item; each cell holds the item
# name itself, or NA when the customer does not have that item.
df_wide <- df %>%
  group_by(customer_code) %>%
  spread(items, items)
# customer_code accessories salt sugar
# 1 1 NA salt sugar
# 2 2 accessories NA sugar
# 3 3 NA salt NA
如有必要,可以按如下方式更改列名:
# Rename every column except the first to item1, item2, ...
# seq_len() is used instead of 1:n so that n == 0 cannot yield c(1, 0).
names(df_wide)[-1] <- paste0("item", seq_len(ncol(df_wide) - 1))
# customer_code item1 item2 item3
# 1 1 NA salt sugar
# 2 2 accessories NA sugar
# 3 3 NA salt NA
另外,也可以考虑下面这种输出形式(可能更方便):
# Presence/absence table: TRUE where the customer has the item, FALSE
# otherwise. TRUE/FALSE are spelled out because T and F are ordinary variables
# that can be reassigned.
df %>%
  mutate(present = TRUE) %>%
  spread(items, present, fill = FALSE)
# customer_code accessories salt sugar
# 1 1 FALSE TRUE TRUE
# 2 2 TRUE FALSE TRUE
# 3 3 FALSE TRUE FALSE
你也可以在这里使用一个简单的 dcast:
library(reshape2)
# Cast to wide with one column per distinct item, named "items_<item>".
dcast(
  data = df,
  formula = customer_code ~ paste("items", items, sep = "_"),
  value.var = "items"
)
# customer_code items_accessories items_salt items_sugar
# 1 1 <NA> salt sugar
# 2 2 accessories <NA> sugar
# 3 3 <NA> salt <NA>
或更接近您想要的输出
library(data.table)
# Convert df to a data.table in place, then label each distinct item as
# items1, items2, ... using the group counter .GRP.
setDT(df)
df[, indx := paste0("items", .GRP), by = items]
# Cast to wide, using the numbered labels as column names.
dcast(
  df,
  customer_code ~ indx,
  value.var = "items"
)
# customer_code items1 items2 items3
# 1: 1 sugar salt NA
# 2: 2 sugar NA accessories
# 3: 3 NA salt NA
例子
customer_code items
1 sugar
1 salt
2 sugar
2 accessories
3 salt
期望的输出
customer_code item item2 item3
1 sugar salt
2 sugar accessories
3 salt
你可以尝试使用函数 reshape:
如果希望列数与不同商品(items)的种类数一样多:
# Widen with base reshape(): one row per customer_code and, as the printed
# result below shows, one column per distinct value of items.
new_df <- reshape(
  data = df,
  direction = "wide",
  idvar = "customer_code",
  timevar = "items",
  v.names = "items"
)
new_df
# customer_code items.sugar items.salt items.accessories
#1 1 sugar salt <NA>
#3 2 sugar <NA> accessories
#5 3 <NA> salt <NA>
之后,您可以使用 colnames(new_df)[-1] <- paste0("item", 1:(ncol(new_df)-1)) 更改列名。
另一种选择:如果您希望列数等于单个客户最多拥有的商品数:
# Reshape each customer separately, then pad every piece to a common width so
# the pieces can be stacked with rbind(). The number of item columns equals the
# largest number of items held by any single customer.
df_split <- split(df, df[, 1])
df_split <- lapply(
  df_split, reshape,
  idvar = "customer_code", timevar = "items", v.names = "items",
  direction = "wide"
)
# vapply() guarantees exactly one integer per customer; sapply() would change
# its return type depending on the input.
max_item <- max(vapply(df_split, ncol, integer(1)))
df_split <- lapply(df_split, function(df) {
  # Pad narrower pieces with NA columns up to the common width.
  if (ncol(df) < max_item) {
    df <- cbind(df, matrix(NA, ncol = max_item - ncol(df)))
  }
  # seq_len() avoids the descending 1:0 sequence when there is no item column.
  colnames(df)[-1] <- paste0("item", seq_len(max_item - 1))
  df
})
new_df <- do.call("rbind", df_split)
new_df
# customer_code item1 item2
#1 1 sugar salt
#2 2 sugar accessories
#3 3 salt <NA>
您可以使用 tidyr 中的 spread:
library(dplyr)
library(tidyr)
# Label each distinct item as items1, items2, ... in order of first
# appearance, then spread those labels into columns. Cells with no item are
# filled with an empty string.
df1 %>%
  mutate(
    var = factor(
      items,
      levels = unique(items),
      labels = paste0("items", seq(n_distinct(items)))
    )
  ) %>%
  spread(var, items, fill = "")
# customer_code items1 items2 items3
#1 1 sugar salt
#2 2 sugar accessories
#3 3 salt
dplyr 包,尤其是 tidyr 包,可以解决这类问题。下面这段代码即可完成转换:
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later at the first function call.
library(tidyr)
library(dplyr)
# Spread items into one column per distinct item; each cell holds the item
# name itself, or NA when the customer does not have that item.
df_wide <- df %>%
  group_by(customer_code) %>%
  spread(items, items)
# customer_code accessories salt sugar
# 1 1 NA salt sugar
# 2 2 accessories NA sugar
# 3 3 NA salt NA
如有必要,可以按如下方式更改列名:
# Rename every column except the first to item1, item2, ...
# seq_len() is used instead of 1:n so that n == 0 cannot yield c(1, 0).
names(df_wide)[-1] <- paste0("item", seq_len(ncol(df_wide) - 1))
# customer_code item1 item2 item3
# 1 1 NA salt sugar
# 2 2 accessories NA sugar
# 3 3 NA salt NA
另外,也可以考虑下面这种输出形式(可能更方便):
# Presence/absence table: TRUE where the customer has the item, FALSE
# otherwise. TRUE/FALSE are spelled out because T and F are ordinary variables
# that can be reassigned.
df %>%
  mutate(present = TRUE) %>%
  spread(items, present, fill = FALSE)
# customer_code accessories salt sugar
# 1 1 FALSE TRUE TRUE
# 2 2 TRUE FALSE TRUE
# 3 3 FALSE TRUE FALSE
你也可以在这里使用一个简单的 dcast:
library(reshape2)
# Cast to wide with one column per distinct item, named "items_<item>".
dcast(
  data = df,
  formula = customer_code ~ paste("items", items, sep = "_"),
  value.var = "items"
)
# customer_code items_accessories items_salt items_sugar
# 1 1 <NA> salt sugar
# 2 2 accessories <NA> sugar
# 3 3 <NA> salt <NA>
或更接近您想要的输出
library(data.table)
# Convert df to a data.table in place, then label each distinct item as
# items1, items2, ... using the group counter .GRP.
setDT(df)
df[, indx := paste0("items", .GRP), by = items]
# Cast to wide, using the numbered labels as column names.
dcast(
  df,
  customer_code ~ indx,
  value.var = "items"
)
# customer_code items1 items2 items3
# 1: 1 sugar salt NA
# 2: 2 sugar NA accessories
# 3: 3 NA salt NA