对于不同的指标,按唯一变量名称对变量求和 - 需要找到唯一名称 before/after 变量名称中的前缀
Sum variables by unique variable names, for different metrics - requires finding unique names before/after a prefix in variable name
是否有一种方法可以对数据框中所有唯一变量名称(可口可乐和百事可乐等品牌)的变量(例如销售额和单位)求和。
为了提供帮助,这里有一些示例数据。
set.seed(123)
period <- seq(as.Date('2021/01/01'), as.Date('2021/01/07'), by="day")
Coke_Regular_Units <- sample(1000:2000, 7, replace = TRUE)
Coke_Diet_Units <- sample(1000:2000, 7, replace = TRUE)
Coke_Regular_Sales <- sample(500:1000,7, replace = TRUE)
Coke_Diet_Sales <- sample(500:1000, 7, replace = TRUE)
Pepsi_Regular_Units <- sample(1000:2000, 7, replace = TRUE)
Pepsi_Diet_Units <- sample(1000:2000, 7, replace = TRUE)
Pepsi_Regular_Sales <- sample(500:1000, 7, replace = TRUE)
Pepsi_Diet_Sales <- sample(500:1000, 7, replace = TRUE)
df <- data.frame(Coke_Regular_Units, Coke_Diet_Units, Coke_Regular_Sales, Coke_Diet_Sales,
Pepsi_Regular_Units, Pepsi_Diet_Units, Pepsi_Regular_Sales, Pepsi_Diet_Sales)
> head(df)
period Coke_Regular_Units Coke_Diet_Units Coke_Regular_Sales Coke_Diet_Sales Pepsi_Regular_Units
1 2021-01-01 1414 1117 589 847 1425
2 2021-01-02 1462 1298 590 636 1648
3 2021-01-03 1178 1228 755 976 1765
4 2021-01-04 1525 1243 696 854 1210
5 2021-01-05 1194 1013 998 827 1931
6 2021-01-06 1937 1373 590 525 1589
Pepsi_Diet_Units Pepsi_Regular_Sales Pepsi_Diet_Sales
1 1554 608 943
2 1870 762 808
3 1372 892 634
4 1843 924 808
5 1142 829 910
6 1543 522 723
我喜欢自动计算Coke_Sales、Coke_Units、Pepsi_Sales、Pepsi_Units、Regular_Sales和Diet_Units的代码。
我目前正在为每个变量这样做
library(dplyr)
df$Coke_Sales <- rowSums(Filter(is.numeric, select(df, (matches("Coke") & matches("Sales")))))
df$Coke_Units <- rowSums(Filter(is.numeric, select(df, (matches("Coke") & matches("Units")))))
这对于少量变量没问题,但我需要对 100 多个变量执行此操作。有什么功能可以做到这一点吗?它需要自动找到唯一的变量名称,如 Coke、Pepsi、Diet 和 Regular。指标是变量名称的最后一部分,因此不一定需要自动找到它,但会很棒。如果方便的话,指定指标就可以了,指标最多只有3个,但是有几百个品牌。
如果它不能自动化,有没有办法简化它,我在其中指定所需的变量。不完美,但仍然是一个进步。例如,包括这些代码行以指定要求和的变量和所需的指标。
VarsToSum <- c("Coke", "Pepsi", "Diet", "Regular")
Metrics <- c("Sales", "Units")
如果那样也无法完成,也许我需要分成更小的步骤,任何提示都会很棒。试着思考如何去做,我是否应该尝试在前缀“_”之前找到唯一名称,然后计算这些唯一名称的“销售额”和“单位”。这是最好的方法吗?还是我应该重塑数据?还有其他路线可以到达吗?
任何帮助或如何实现这一目标的指导将不胜感激。谢谢
这里有一个data.table
方法...
library( data.table )
setDT(df) #make it a data.table
#melt to long
ans <- melt( df, id.vars = "period", variable.factor = FALSE )
#split variable to 3 new columns
ans[, c("brand", "type", "what") := tstrsplit( variable, "_" ) ]
# > head(ans)
# period variable value brand type what
# 1: 2021-01-01 Coke_Regular_Units 1414 Coke Regular Units
# 2: 2021-01-02 Coke_Regular_Units 1462 Coke Regular Units
# 3: 2021-01-03 Coke_Regular_Units 1178 Coke Regular Units
# 4: 2021-01-04 Coke_Regular_Units 1525 Coke Regular Units
# 5: 2021-01-05 Coke_Regular_Units 1194 Coke Regular Units
# 6: 2021-01-06 Coke_Regular_Units 1937 Coke Regular Units
#summarise however you like
ans[, .(total = sum(value) ), by = .(brand, type, what)]
# brand type what total
# 1: Coke Regular Units 10527
# 2: Coke Diet Units 8936
# 3: Coke Regular Sales 5158
# 4: Coke Diet Sales 5171
# 5: Pepsi Regular Units 11160
# 6: Pepsi Diet Units 10813
# 7: Pepsi Regular Sales 5447
# 8: Pepsi Diet Sales 5491
使用 outer
来 paste
音节和 grep
。
sapply(outer(c("Coke", "Pepsi"), c("Sales", "Units"), paste, sep=".*"), function(x)
rowSums(df[grep(x, names(df))]))
# Coke.*Sales Pepsi.*Sales Coke.*Units Pepsi.*Units
# [1,] 1436 1551 2531 2979
# [2,] 1226 1570 2760 3518
# [3,] 1731 1526 2406 3137
# [4,] 1550 1732 2768 3053
# [5,] 1825 1739 2207 3073
# [6,] 1115 1245 3310 3132
# [7,] 1446 1575 3481 3081
这是一个与@Wimpel 类似的解决方案,但是 tidyverse
:
library(tidyverse)
summary_df <-
df %>%
pivot_longer(cols = ends_with("Sales") | ends_with("Units"),
names_to = c("brand", "type", ".value"),
names_pattern = "(.*)_(.*)_(.*)") %>%
group_by(brand) %>%
summarize(Sales = sum(Sales),
Units = sum(Units)) %>%
pivot_wider(names_from = "brand",
values_from = c("Sales", "Units"),
names_glue = "{brand}_{.value}")
summary_df
# # A tibble: 1 x 4
# Coke_Sales Pepsi_Sales Coke_Units Pepsi_Units
# <int> <int> <int> <int>
# 1 10329 10938 19463 21973
是否有一种方法可以对数据框中所有唯一变量名称(可口可乐和百事可乐等品牌)的变量(例如销售额和单位)求和。 为了提供帮助,这里有一些示例数据。
set.seed(123)
period <- seq(as.Date('2021/01/01'), as.Date('2021/01/07'), by="day")
Coke_Regular_Units <- sample(1000:2000, 7, replace = TRUE)
Coke_Diet_Units <- sample(1000:2000, 7, replace = TRUE)
Coke_Regular_Sales <- sample(500:1000,7, replace = TRUE)
Coke_Diet_Sales <- sample(500:1000, 7, replace = TRUE)
Pepsi_Regular_Units <- sample(1000:2000, 7, replace = TRUE)
Pepsi_Diet_Units <- sample(1000:2000, 7, replace = TRUE)
Pepsi_Regular_Sales <- sample(500:1000, 7, replace = TRUE)
Pepsi_Diet_Sales <- sample(500:1000, 7, replace = TRUE)
df <- data.frame(Coke_Regular_Units, Coke_Diet_Units, Coke_Regular_Sales, Coke_Diet_Sales,
Pepsi_Regular_Units, Pepsi_Diet_Units, Pepsi_Regular_Sales, Pepsi_Diet_Sales)
> head(df)
period Coke_Regular_Units Coke_Diet_Units Coke_Regular_Sales Coke_Diet_Sales Pepsi_Regular_Units
1 2021-01-01 1414 1117 589 847 1425
2 2021-01-02 1462 1298 590 636 1648
3 2021-01-03 1178 1228 755 976 1765
4 2021-01-04 1525 1243 696 854 1210
5 2021-01-05 1194 1013 998 827 1931
6 2021-01-06 1937 1373 590 525 1589
Pepsi_Diet_Units Pepsi_Regular_Sales Pepsi_Diet_Sales
1 1554 608 943
2 1870 762 808
3 1372 892 634
4 1843 924 808
5 1142 829 910
6 1543 522 723
我喜欢自动计算Coke_Sales、Coke_Units、Pepsi_Sales、Pepsi_Units、Regular_Sales和Diet_Units的代码。
我目前正在为每个变量这样做
library(dplyr)
df$Coke_Sales <- rowSums(Filter(is.numeric, select(df, (matches("Coke") & matches("Sales")))))
df$Coke_Units <- rowSums(Filter(is.numeric, select(df, (matches("Coke") & matches("Units")))))
这对于少量变量没问题,但我需要对 100 多个变量执行此操作。有什么功能可以做到这一点吗?它需要自动找到唯一的变量名称,如 Coke、Pepsi、Diet 和 Regular。指标是变量名称的最后一部分,因此不一定需要自动找到它,但会很棒。如果方便的话,指定指标就可以了,指标最多只有3个,但是有几百个品牌。
如果它不能自动化,有没有办法简化它,我在其中指定所需的变量。不完美,但仍然是一个进步。例如,包括这些代码行以指定要求和的变量和所需的指标。
VarsToSum <- c("Coke", "Pepsi", "Diet", "Regular")
Metrics <- c("Sales", "Units")
如果那样也无法完成,也许我需要分成更小的步骤,任何提示都会很棒。试着思考如何去做,我是否应该尝试在前缀“_”之前找到唯一名称,然后计算这些唯一名称的“销售额”和“单位”。这是最好的方法吗?还是我应该重塑数据?还有其他路线可以到达吗?
任何帮助或如何实现这一目标的指导将不胜感激。谢谢
这里有一个data.table
方法...
library( data.table )
setDT(df) #make it a data.table
#melt to long
ans <- melt( df, id.vars = "period", variable.factor = FALSE )
#split variable to 3 new columns
ans[, c("brand", "type", "what") := tstrsplit( variable, "_" ) ]
# > head(ans)
# period variable value brand type what
# 1: 2021-01-01 Coke_Regular_Units 1414 Coke Regular Units
# 2: 2021-01-02 Coke_Regular_Units 1462 Coke Regular Units
# 3: 2021-01-03 Coke_Regular_Units 1178 Coke Regular Units
# 4: 2021-01-04 Coke_Regular_Units 1525 Coke Regular Units
# 5: 2021-01-05 Coke_Regular_Units 1194 Coke Regular Units
# 6: 2021-01-06 Coke_Regular_Units 1937 Coke Regular Units
#summarise however you like
ans[, .(total = sum(value) ), by = .(brand, type, what)]
# brand type what total
# 1: Coke Regular Units 10527
# 2: Coke Diet Units 8936
# 3: Coke Regular Sales 5158
# 4: Coke Diet Sales 5171
# 5: Pepsi Regular Units 11160
# 6: Pepsi Diet Units 10813
# 7: Pepsi Regular Sales 5447
# 8: Pepsi Diet Sales 5491
使用 outer
来 paste
音节和 grep
。
sapply(outer(c("Coke", "Pepsi"), c("Sales", "Units"), paste, sep=".*"), function(x)
rowSums(df[grep(x, names(df))]))
# Coke.*Sales Pepsi.*Sales Coke.*Units Pepsi.*Units
# [1,] 1436 1551 2531 2979
# [2,] 1226 1570 2760 3518
# [3,] 1731 1526 2406 3137
# [4,] 1550 1732 2768 3053
# [5,] 1825 1739 2207 3073
# [6,] 1115 1245 3310 3132
# [7,] 1446 1575 3481 3081
这是一个与@Wimpel 类似的解决方案,但是 tidyverse
:
library(tidyverse)
summary_df <-
df %>%
pivot_longer(cols = ends_with("Sales") | ends_with("Units"),
names_to = c("brand", "type", ".value"),
names_pattern = "(.*)_(.*)_(.*)") %>%
group_by(brand) %>%
summarize(Sales = sum(Sales),
Units = sum(Units)) %>%
pivot_wider(names_from = "brand",
values_from = c("Sales", "Units"),
names_glue = "{brand}_{.value}")
summary_df
# # A tibble: 1 x 4
# Coke_Sales Pepsi_Sales Coke_Units Pepsi_Units
# <int> <int> <int> <int>
# 1 10329 10938 19463 21973