在 R 中重新格式化数据框
Reformat dataframe in R
我有多个列,想重新格式化数据框以减少列数。
这是我的当前 df:
# Dataframe (wide input, as posted in the question)
# NOTE(review): illustrative pseudo-code rather than runnable R. It puts
# tribble-style `~name` headers inside data.frame(), no comma separates
# ~Machine2 and ~Machine1_adds, and the header lists 11 names while each data
# row supplies only 9 values. `~Saless2` is presumably a typo for Sales2 --
# confirm against the real data.
df <- data.frame(
~Location, ~Product_Name, ~Category, ~Machine1, ~Machine2 ~Machine1_adds, ~Machine2_adds, ~Sales1, ~Saless2, Spoils1, Spoils2
A, "Snickers", Candy, 0, 1, .5, , 2, 1
A, "Kitcat", Candy, 0, 1, , , 2, 1
A, "Pepsi", Bev, 1, 1, , , 3, 0
B, "Coke", Bev, 1, 0, , .45, 1, 1
B, "Gatoraid", Bev, 0, 1, , .45, 1, 0
B, "Sprite", Bev, 1, 1, , , 1, 0
)
df
我想重新整理数据框,使 Machine 成为一列、Sales 成为另一列、Spoils 成为最后一列,并把每一行的值相应地归入这些新列。期望的输出如下:
# Dataframe (desired long-format output, as posted in the question)
# NOTE(review): illustrative pseudo-code rather than runnable R -- it uses
# tribble-style `~name` headers inside data.frame(), unquoted values, and
# empty arguments for blank cells.
new_df <- data.frame(
~Location, ~Product_Name, ~Category, ~Machine, ~Machine_adds, ~Sales, Spoils
A, "Snickers", Candy, 1, 0, .5, 2
A, "Kitcat", Candy, 1, 0, , 2
A, "Pepsi", Bev, 1, 1, , 3
B, "Coke", Bev, 2, 1, .45, 1
B, "Gatoraid", Bev, 2, 0, .45, 1
B, "Sprite", Bev, 2, 1, , 0
)
new_df
我使用了 melt 函数,但结果中没有 'Machine' 列来表示数据来自哪台机器(Machine 1、2、3 等)。
下面是使用 data.table 中 melt 的一种做法:
library(data.table)
# Reshape the wide per-machine columns into long form with data.table.
# Each regex passed to patterns() selects one family of numbered columns
# (Machine*, Sales*, Spoils*); the n-th family is stacked into the n-th name
# given in `value.name`.
# `measure.vars` is spelled out in full rather than relying on partial
# matching of `measure`.
melt(
  setDT(df), # converts `df` to a data.table by reference
  measure.vars = patterns("^Machine", "^Sales", "^Spoils"),
  value.name = c("Machine_adds", "Sales", "Spoils")
)[, variable := NULL][] # drop melt's index column; trailing [] prints
# Location Product_Name Category Machine_adds Sales Spoils
# 1: A Snickers Candy 0 .5 2
# 2: A Kitcat Candy 0 2
# 3: A Pepsi Bev 1 3
# 4: B Coke Bev 1 1
# 5: B Gatoraid Bev 0 1
# 6: B Sprite Bev 1 1
# 7: A Snickers Candy 1 1
# 8: A Kitcat Candy 1 1
# 9: A Pepsi Bev 1 0
#10: B Coke Bev 0 .45 1
#11: B Gatoraid Bev 1 .45 0
#12: B Sprite Bev 1 0
更新
根据 OP 更新后的示例,如果同时存在 'Machine' 和 'Machine_adds' 列,我们可以将 patterns
稍作修改:
# creating new columns in the dataset
# (Machine1/Machine2 are copies of the *_adds columns so that both column
# families exist for the patterns below)
df[c("Machine1", "Machine2")] <- df[c("Machine1_adds", "Machine2_adds")]

# Backslashes must be doubled inside R string literals: "\d" is an invalid
# escape and fails to parse, "\\d" yields the regex \d.
# "^Machine\\d+$" matches only Machine1/Machine2, while
# "^Machine\\d+_adds$" matches only the *_adds columns, so the two families
# are stacked into separate output columns.
melt(
  setDT(df),
  measure.vars = patterns(
    "^Machine\\d+$",
    "^Machine\\d+_adds$",
    "^Sales",
    "^Spoils"
  ),
  value.name = c("Machine", "Machine_adds", "Sales", "Spoils")
)[, variable := NULL][] # drop melt's index column; trailing [] prints
或使用 tidyr
中的 pivot_longer
library(dplyr)
library(tidyr)
library(stringr)
# Reshape to long form with tidyr.
# Step 1: normalise the measure column names to "<stem>_<index>" by replacing
#   the first run of digits and everything after it with "_<digits>"
#   (e.g. Machine1_adds -> Machine_1, Sales1 -> Sales_1).
#   Backslashes are doubled because "\d" / "\1" are invalid escapes in an R
#   string literal.
# Step 2: split each name on "_"; the stem becomes the output column
#   (".value") and the index goes to a helper column `group`, dropped at the
#   end.
df %>%
  rename_with(
    ~ str_replace(.x, "(\\d+)_?.*", "_\\1"),
    .cols = 3:ncol(.)
  ) %>%
  pivot_longer(
    cols = matches("^(Machine|Sales|Spoils)"),
    names_to = c(".value", "group"),
    names_sep = "_"
  ) %>%
  select(-group)
# A tibble: 12 x 6
# Location Product_Name Category Machine Sales Spoils
# <chr> <chr> <chr> <dbl> <chr> <dbl>
# 1 A Snickers Candy 0 .5 2
# 2 A Snickers Candy 1 1
# 3 A Kitcat Candy 0 2
# 4 A Kitcat Candy 1 1
# 5 A Pepsi Bev 1 3
# 6 A Pepsi Bev 1 0
# 7 B Coke Bev 1 1
# 8 B Coke Bev 0 .45 1
# 9 B Gatoraid Bev 0 1
#10 B Gatoraid Bev 1 .45 0
#11 B Sprite Bev 1 1
#12 B Sprite Bev 1 0
更新
# Variant that keeps Machine and Machine_adds as separate output columns.
# Step 1: move the index of the *_adds columns to the end of the name
#   (Machine1_adds -> Machine_adds1).
# Step 2: put ":" before the index in every measure column
#   (Machine_adds1 -> Machine_adds:1, Sales1 -> Sales:1).
#   ":" is used as the separator because "_" now occurs inside "Machine_adds".
# Step 3: split on ":"; the stem becomes the output column (".value") and the
#   index goes to a helper column `group`, dropped at the end.
# Backslashes are doubled because "\d", "\w", "\1" and "\2" are invalid
# escapes in an R string literal.
df %>%
  rename_with(
    ~ str_replace(.x, "(\\d+)_(\\w+)$", "_\\2\\1"),
    .cols = matches("^Machine.*adds$")
  ) %>%
  rename_with(
    ~ str_replace(.x, "(\\d+)_?.*", ":\\1"),
    .cols = 3:ncol(.)
  ) %>%
  pivot_longer(
    cols = matches("^(Machine|Sales|Spoils)"),
    names_to = c(".value", "group"),
    names_sep = ":"
  ) %>%
  select(-group)
数据
# Example data in wide form: one row per product, with a numbered column per
# machine for adds, sales and spoils. The Sales columns are character because
# blank cells are stored as "".
df <- structure(
  list(
    Location = rep(c("A", "B"), each = 3),
    Product_Name = c(
      "Snickers", "Kitcat", "Pepsi",
      "Coke", "Gatoraid", "Sprite"
    ),
    Category = c(rep("Candy", 2), rep("Bev", 4)),
    Machine1_adds = c(0, 0, 1, 1, 0, 1),
    Machine2_adds = c(1, 1, 1, 0, 1, 1),
    Sales1 = c(".5", "", "", "", "", ""),
    Sales2 = c("", "", "", ".45", ".45", ""),
    Spoils1 = c(2, 2, 3, 1, 1, 1),
    Spoils2 = c(1, 1, 0, 1, 0, 0)
  ),
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)
根据您的数据,您需要的是 reshape 函数,并把变量按 list(c(4,5),c(6,7),c(8,9))
的方式分组对齐。
您可以使用:
# Base-R reshape to long form. The column groups are built from positions
# 4..ncol(df); for the 9-column df this gives the pairs (4,5), (6,7), (8,9).
# Arguments are named in full instead of relying on position and on partial
# matching of `dir`.
reshape(
  df,
  varying = t(matrix(4:ncol(df), nrow = 2)),
  idvar = 1:3,
  direction = "long"
)
或
# Same reshape with the column groups written out explicitly: each vector
# lists the wide columns that are stacked into one long column.
# Arguments are named in full instead of relying on position and on partial
# matching of `dir`.
reshape(
  df,
  varying = list(c(4, 5), c(6, 7), c(8, 9)),
  idvar = 1:3,
  direction = "long"
)
为了得到您想要的列名,可以使用 v.names
参数:
# Reshape to long form and name the stacked columns via `v.names` (one name
# per group in `varying`, in the same order).
# Arguments are named in full instead of relying on position and on partial
# matching of `dir`.
reshape(
  df,
  varying = list(c(4, 5), c(6, 7), c(8, 9)),
  idvar = 1:3,
  direction = "long",
  v.names = c("Machine_adds", "Sales", "Spoils")
)[-4] # -4 removes the time variable.
Location Product_Name Category Machine_adds Sales Spoils
A.Snickers.Candy.1 A Snickers Candy 0 .5 2
A.Kitcat.Candy.1 A Kitcat Candy 0 2
A.Pepsi.Bev.1 A Pepsi Bev 1 3
B.Coke.Bev.1 B Coke Bev 1 1
B.Gatoraid.Bev.1 B Gatoraid Bev 0 1
B.Sprite.Bev.1 B Sprite Bev 1 1
A.Snickers.Candy.2 A Snickers Candy 1 1
A.Kitcat.Candy.2 A Kitcat Candy 1 1
A.Pepsi.Bev.2 A Pepsi Bev 1 0
B.Coke.Bev.2 B Coke Bev 0 .45 1
B.Gatoraid.Bev.2 B Gatoraid Bev 1 .45 0
B.Sprite.Bev.2 B Sprite Bev 1 0
我有多个列,想重新格式化数据框以减少列数。
这是我的当前 df:
# Dataframe (wide input, as posted in the question)
# NOTE(review): illustrative pseudo-code rather than runnable R. It puts
# tribble-style `~name` headers inside data.frame(), no comma separates
# ~Machine2 and ~Machine1_adds, and the header lists 11 names while each data
# row supplies only 9 values. `~Saless2` is presumably a typo for Sales2 --
# confirm against the real data.
df <- data.frame(
~Location, ~Product_Name, ~Category, ~Machine1, ~Machine2 ~Machine1_adds, ~Machine2_adds, ~Sales1, ~Saless2, Spoils1, Spoils2
A, "Snickers", Candy, 0, 1, .5, , 2, 1
A, "Kitcat", Candy, 0, 1, , , 2, 1
A, "Pepsi", Bev, 1, 1, , , 3, 0
B, "Coke", Bev, 1, 0, , .45, 1, 1
B, "Gatoraid", Bev, 0, 1, , .45, 1, 0
B, "Sprite", Bev, 1, 1, , , 1, 0
)
df
我想重新整理数据框,使 Machine 成为一列、Sales 成为另一列、Spoils 成为最后一列,并把每一行的值相应地归入这些新列。期望的输出如下:
# Dataframe (desired long-format output, as posted in the question)
# NOTE(review): illustrative pseudo-code rather than runnable R -- it uses
# tribble-style `~name` headers inside data.frame(), unquoted values, and
# empty arguments for blank cells.
new_df <- data.frame(
~Location, ~Product_Name, ~Category, ~Machine, ~Machine_adds, ~Sales, Spoils
A, "Snickers", Candy, 1, 0, .5, 2
A, "Kitcat", Candy, 1, 0, , 2
A, "Pepsi", Bev, 1, 1, , 3
B, "Coke", Bev, 2, 1, .45, 1
B, "Gatoraid", Bev, 2, 0, .45, 1
B, "Sprite", Bev, 2, 1, , 0
)
new_df
我使用了 melt 函数,但结果中没有 'Machine' 列来表示数据来自哪台机器(Machine 1、2、3 等)。
下面是使用 data.table 中 melt 的一种做法:
library(data.table)
# Reshape the wide per-machine columns into long form with data.table.
# Each regex passed to patterns() selects one family of numbered columns
# (Machine*, Sales*, Spoils*); the n-th family is stacked into the n-th name
# given in `value.name`.
# `measure.vars` is spelled out in full rather than relying on partial
# matching of `measure`.
melt(
  setDT(df), # converts `df` to a data.table by reference
  measure.vars = patterns("^Machine", "^Sales", "^Spoils"),
  value.name = c("Machine_adds", "Sales", "Spoils")
)[, variable := NULL][] # drop melt's index column; trailing [] prints
# Location Product_Name Category Machine_adds Sales Spoils
# 1: A Snickers Candy 0 .5 2
# 2: A Kitcat Candy 0 2
# 3: A Pepsi Bev 1 3
# 4: B Coke Bev 1 1
# 5: B Gatoraid Bev 0 1
# 6: B Sprite Bev 1 1
# 7: A Snickers Candy 1 1
# 8: A Kitcat Candy 1 1
# 9: A Pepsi Bev 1 0
#10: B Coke Bev 0 .45 1
#11: B Gatoraid Bev 1 .45 0
#12: B Sprite Bev 1 0
更新
根据 OP 更新后的示例,如果同时存在 'Machine' 和 'Machine_adds' 列,我们可以将 patterns
稍作修改:
# creating new columns in the dataset
# (Machine1/Machine2 are copies of the *_adds columns so that both column
# families exist for the patterns below)
df[c("Machine1", "Machine2")] <- df[c("Machine1_adds", "Machine2_adds")]

# Backslashes must be doubled inside R string literals: "\d" is an invalid
# escape and fails to parse, "\\d" yields the regex \d.
# "^Machine\\d+$" matches only Machine1/Machine2, while
# "^Machine\\d+_adds$" matches only the *_adds columns, so the two families
# are stacked into separate output columns.
melt(
  setDT(df),
  measure.vars = patterns(
    "^Machine\\d+$",
    "^Machine\\d+_adds$",
    "^Sales",
    "^Spoils"
  ),
  value.name = c("Machine", "Machine_adds", "Sales", "Spoils")
)[, variable := NULL][] # drop melt's index column; trailing [] prints
或使用 tidyr
pivot_longer
library(dplyr)
library(tidyr)
library(stringr)
# Reshape to long form with tidyr.
# Step 1: normalise the measure column names to "<stem>_<index>" by replacing
#   the first run of digits and everything after it with "_<digits>"
#   (e.g. Machine1_adds -> Machine_1, Sales1 -> Sales_1).
#   Backslashes are doubled because "\d" / "\1" are invalid escapes in an R
#   string literal.
# Step 2: split each name on "_"; the stem becomes the output column
#   (".value") and the index goes to a helper column `group`, dropped at the
#   end.
df %>%
  rename_with(
    ~ str_replace(.x, "(\\d+)_?.*", "_\\1"),
    .cols = 3:ncol(.)
  ) %>%
  pivot_longer(
    cols = matches("^(Machine|Sales|Spoils)"),
    names_to = c(".value", "group"),
    names_sep = "_"
  ) %>%
  select(-group)
# A tibble: 12 x 6
# Location Product_Name Category Machine Sales Spoils
# <chr> <chr> <chr> <dbl> <chr> <dbl>
# 1 A Snickers Candy 0 .5 2
# 2 A Snickers Candy 1 1
# 3 A Kitcat Candy 0 2
# 4 A Kitcat Candy 1 1
# 5 A Pepsi Bev 1 3
# 6 A Pepsi Bev 1 0
# 7 B Coke Bev 1 1
# 8 B Coke Bev 0 .45 1
# 9 B Gatoraid Bev 0 1
#10 B Gatoraid Bev 1 .45 0
#11 B Sprite Bev 1 1
#12 B Sprite Bev 1 0
更新
# Variant that keeps Machine and Machine_adds as separate output columns.
# Step 1: move the index of the *_adds columns to the end of the name
#   (Machine1_adds -> Machine_adds1).
# Step 2: put ":" before the index in every measure column
#   (Machine_adds1 -> Machine_adds:1, Sales1 -> Sales:1).
#   ":" is used as the separator because "_" now occurs inside "Machine_adds".
# Step 3: split on ":"; the stem becomes the output column (".value") and the
#   index goes to a helper column `group`, dropped at the end.
# Backslashes are doubled because "\d", "\w", "\1" and "\2" are invalid
# escapes in an R string literal.
df %>%
  rename_with(
    ~ str_replace(.x, "(\\d+)_(\\w+)$", "_\\2\\1"),
    .cols = matches("^Machine.*adds$")
  ) %>%
  rename_with(
    ~ str_replace(.x, "(\\d+)_?.*", ":\\1"),
    .cols = 3:ncol(.)
  ) %>%
  pivot_longer(
    cols = matches("^(Machine|Sales|Spoils)"),
    names_to = c(".value", "group"),
    names_sep = ":"
  ) %>%
  select(-group)
数据
# Example data in wide form: one row per product, with a numbered column per
# machine for adds, sales and spoils. The Sales columns are character because
# blank cells are stored as "".
df <- structure(
  list(
    Location = rep(c("A", "B"), each = 3),
    Product_Name = c(
      "Snickers", "Kitcat", "Pepsi",
      "Coke", "Gatoraid", "Sprite"
    ),
    Category = c(rep("Candy", 2), rep("Bev", 4)),
    Machine1_adds = c(0, 0, 1, 1, 0, 1),
    Machine2_adds = c(1, 1, 1, 0, 1, 1),
    Sales1 = c(".5", "", "", "", "", ""),
    Sales2 = c("", "", "", ".45", ".45", ""),
    Spoils1 = c(2, 2, 3, 1, 1, 1),
    Spoils2 = c(1, 1, 0, 1, 0, 0)
  ),
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)
根据您的数据,您需要的是 reshape 函数,并把变量按 list(c(4,5),c(6,7),c(8,9))
的方式分组对齐。
您可以使用:
# Base-R reshape to long form. The column groups are built from positions
# 4..ncol(df); for the 9-column df this gives the pairs (4,5), (6,7), (8,9).
# Arguments are named in full instead of relying on position and on partial
# matching of `dir`.
reshape(
  df,
  varying = t(matrix(4:ncol(df), nrow = 2)),
  idvar = 1:3,
  direction = "long"
)
或
# Same reshape with the column groups written out explicitly: each vector
# lists the wide columns that are stacked into one long column.
# Arguments are named in full instead of relying on position and on partial
# matching of `dir`.
reshape(
  df,
  varying = list(c(4, 5), c(6, 7), c(8, 9)),
  idvar = 1:3,
  direction = "long"
)
为了得到您想要的列名,可以使用 v.names
参数:
# Reshape to long form and name the stacked columns via `v.names` (one name
# per group in `varying`, in the same order).
# Arguments are named in full instead of relying on position and on partial
# matching of `dir`.
reshape(
  df,
  varying = list(c(4, 5), c(6, 7), c(8, 9)),
  idvar = 1:3,
  direction = "long",
  v.names = c("Machine_adds", "Sales", "Spoils")
)[-4] # -4 removes the time variable.
Location Product_Name Category Machine_adds Sales Spoils
A.Snickers.Candy.1 A Snickers Candy 0 .5 2
A.Kitcat.Candy.1 A Kitcat Candy 0 2
A.Pepsi.Bev.1 A Pepsi Bev 1 3
B.Coke.Bev.1 B Coke Bev 1 1
B.Gatoraid.Bev.1 B Gatoraid Bev 0 1
B.Sprite.Bev.1 B Sprite Bev 1 1
A.Snickers.Candy.2 A Snickers Candy 1 1
A.Kitcat.Candy.2 A Kitcat Candy 1 1
A.Pepsi.Bev.2 A Pepsi Bev 1 0
B.Coke.Bev.2 B Coke Bev 0 .45 1
B.Gatoraid.Bev.2 B Gatoraid Bev 1 .45 0
B.Sprite.Bev.2 B Sprite Bev 1 0