基于R中特定类别的其他列以百分比计算列
Calculating columns in percent based on other columns for particular category in R
我是 R 的初学者,需要帮助来完成这项任务。
带 dput 的虚拟数据集的输出(真实数据集真的很大):
# dput() dump of the sample data: a tibble-classed data frame with 10 rows and
# the columns CODE, CATEGORY, AMOUNT, PRICE and BRAND.
# NOTE(review): the snippets that follow refer to this object as `df`, but the
# dump itself is not assigned to any name here; bind it (df <- structure(...))
# before running them.
structure(list(CODE = c(453, 463, 476,
798, 885, 582, 626, 663, 457, 408
), CATEGORY = c("CIG", "BET", "CIG", "CIG", "ARI", "CRR", "ARI", "CIG",
"CIG", "BET"), AMOUNT = c(22, 5, 6, 52, 16, 11, 6, 70, 208, 5),
PRICE = c(5.56, 8.29, 3.89, 3.8, 4.05, 3.99, 3.55, 7.69, 6.75,
5.2), BRAND = c("ROTHMANS", "ALLINO", "MARLBORO", "ROTHMANS", "AURIELO",
"SOLINOS", "CHLEBLO", "MARLBORO", "LD", "SOLINOS"
)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
应该做什么:应该添加一个列,以百分比显示每个品牌的份额。首先,我所做的是以这种方式创建一个新列“VALUE”:
# VALUE = AMOUNT * PRICE, computed element-wise for every row of df.
df$VALUE <- df$AMOUNT * df$PRICE
现在,SHARE 列必须按以下方式创建:特定类别中特定品牌的 VALUE 总和除以整个类别的 VALUE 总和。例如,“ROTHMANS”属于 CIG 类别,它的 VALUE 总和为 319.92,而整个 CIG 类别的总和为 2285.56,所以 SHARE = 319.92 / 2285.56 ≈ 14%。每个品牌都应按同样的方式计算。我认为 dplyr 库可以适用,但找不到解决方案。
您可以先按 CATEGORY 和 BRAND 对 VALUE 求和(sum),再计算每个 BRAND 在其 CATEGORY 中所占的比例。
library(dplyr)
# Share of each BRAND within its CATEGORY, in percent.
df %>%
  group_by(CATEGORY, BRAND) %>%
  # Collapse to one row per CATEGORY/BRAND. The CATEGORY grouping is kept
  # explicitly ("drop_last") so that the percentages below are computed within
  # each category, instead of relying on summarise()'s implicit default (which
  # also prints an informational message about the remaining grouping).
  summarise(VALUE = sum(VALUE), .groups = "drop_last") %>%
  # prop.table() on a plain vector is x / sum(x), evaluated per CATEGORY here.
  mutate(SHARE = prop.table(VALUE) * 100) %>%
  ungroup()
# CATEGORY BRAND VALUE SHARE
# <chr> <chr> <dbl> <dbl>
#1 ARI AURIELO 64.8 75.3
#2 ARI CHLEBLO 21.3 24.7
#3 BET ALLINO 41.4 61.5
#4 BET SOLINOS 26 38.5
#5 CIG LD 1404 61.4
#6 CIG MARLBORO 562. 24.6
#7 CIG ROTHMANS 320. 14.0
#8 CRR SOLINOS 43.9 100
一个 data.table 解决方案可以是:
library(data.table)
# Total VALUE per CATEGORY/BRAND, then each brand's share of its category
# total in percent, rounded to two decimals.
# Note: setDT() turns `df` itself into a data.table (no copy is made).
res <- setDT(df)[
  , .(VALUE = sum(VALUE)), by = .(CATEGORY, BRAND)
][
  , SHARE := round(VALUE / sum(VALUE) * 100, 2), by = .(CATEGORY)
]
res
CATEGORY BRAND VALUE SHARE
1: CIG ROTHMANS 319.92 14.00
2: BET ALLINO 41.45 61.45
3: CIG MARLBORO 561.64 24.57
4: ARI AURIELO 64.80 75.26
5: CRR SOLINOS 43.89 100.00
6: ARI CHLEBLO 21.30 24.74
7: CIG LD 1404.00 61.43
8: BET SOLINOS 26.00 38.55
编辑
若要同时保留原始数据中的其他列,可以这样写:
# Per-brand totals and category shares (percent, two decimals), joined back
# onto the original rows so that CODE, AMOUNT and PRICE are kept.
# Note: setDT() turns `df` itself into a data.table (no copy is made).
res <- setDT(df)[
  , .(VALUE = sum(VALUE)), by = .(CATEGORY, BRAND)
][
  , SHARE := round(VALUE / sum(VALUE) * 100, 2), by = .(CATEGORY)
][
  setDT(df), on = c("BRAND", "CATEGORY")
][
  # The row-level VALUE of `df` comes through the join as i.VALUE; drop it so
  # only the aggregated VALUE remains.
  , !"i.VALUE"
]
res
CATEGORY BRAND VALUE SHARE CODE AMOUNT PRICE
1: CIG ROTHMANS 319.92 14.00 453 22 5.56
2: BET ALLINO 41.45 61.45 463 5 8.29
3: CIG MARLBORO 561.64 24.57 476 6 3.89
4: CIG ROTHMANS 319.92 14.00 798 52 3.80
5: ARI AURIELO 64.80 75.26 885 16 4.05
6: CRR SOLINOS 43.89 100.00 582 11 3.99
7: ARI CHLEBLO 21.30 24.74 626 6 3.55
8: CIG MARLBORO 561.64 24.57 663 70 7.69
9: CIG LD 1404.00 61.43 457 208 6.75
10: BET SOLINOS 26.00 38.55 408 5 5.20
我们可以使用base R
# Sum VALUE per CATEGORY/BRAND, then express each total as a percentage of
# its CATEGORY total.
transform(
  aggregate(VALUE ~ CATEGORY + BRAND, data = df, FUN = sum),
  SHARE = 100 * ave(VALUE, CATEGORY, FUN = proportions)
)
CATEGORY BRAND VALUE SHARE
1 BET ALLINO 41.45 61.45293
2 ARI AURIELO 64.80 75.26132
3 ARI CHLEBLO 21.30 24.73868
4 CIG LD 1404.00 61.42915
5 CIG MARLBORO 561.64 24.57341
6 CIG ROTHMANS 319.92 13.99744
7 BET SOLINOS 26.00 38.54707
8 CRR SOLINOS 43.89 100.00000
我是 R 的初学者,需要帮助来完成这项任务。 带 dput 的虚拟数据集的输出(真实数据集真的很大):
# dput() dump of the sample data: a tibble-classed data frame with 10 rows and
# the columns CODE, CATEGORY, AMOUNT, PRICE and BRAND.
# NOTE(review): the snippets that follow refer to this object as `df`, but the
# dump itself is not assigned to any name here; bind it (df <- structure(...))
# before running them.
structure(list(CODE = c(453, 463, 476,
798, 885, 582, 626, 663, 457, 408
), CATEGORY = c("CIG", "BET", "CIG", "CIG", "ARI", "CRR", "ARI", "CIG",
"CIG", "BET"), AMOUNT = c(22, 5, 6, 52, 16, 11, 6, 70, 208, 5),
PRICE = c(5.56, 8.29, 3.89, 3.8, 4.05, 3.99, 3.55, 7.69, 6.75,
5.2), BRAND = c("ROTHMANS", "ALLINO", "MARLBORO", "ROTHMANS", "AURIELO",
"SOLINOS", "CHLEBLO", "MARLBORO", "LD", "SOLINOS"
)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
应该做什么:应该添加一个列,以百分比显示每个品牌的份额。首先,我所做的是以这种方式创建一个新列“VALUE”:
# VALUE = AMOUNT * PRICE, computed element-wise for every row of df.
df$VALUE <- df$AMOUNT * df$PRICE
现在,SHARE 列必须按以下方式创建:特定类别中特定品牌的 VALUE 总和除以整个类别的 VALUE 总和。例如,“ROTHMANS”属于 CIG 类别,它的 VALUE 总和为 319.92,而整个 CIG 类别的总和为 2285.56,所以 SHARE = 319.92 / 2285.56 ≈ 14%。每个品牌都应按同样的方式计算。我认为 dplyr 库可以适用,但找不到解决方案。
您可以先按 CATEGORY 和 BRAND 对 VALUE 求和(sum),再计算每个 BRAND 在其 CATEGORY 中所占的比例。
library(dplyr)
# Share of each BRAND within its CATEGORY, in percent.
df %>%
  group_by(CATEGORY, BRAND) %>%
  # Collapse to one row per CATEGORY/BRAND. The CATEGORY grouping is kept
  # explicitly ("drop_last") so that the percentages below are computed within
  # each category, instead of relying on summarise()'s implicit default (which
  # also prints an informational message about the remaining grouping).
  summarise(VALUE = sum(VALUE), .groups = "drop_last") %>%
  # prop.table() on a plain vector is x / sum(x), evaluated per CATEGORY here.
  mutate(SHARE = prop.table(VALUE) * 100) %>%
  ungroup()
# CATEGORY BRAND VALUE SHARE
# <chr> <chr> <dbl> <dbl>
#1 ARI AURIELO 64.8 75.3
#2 ARI CHLEBLO 21.3 24.7
#3 BET ALLINO 41.4 61.5
#4 BET SOLINOS 26 38.5
#5 CIG LD 1404 61.4
#6 CIG MARLBORO 562. 24.6
#7 CIG ROTHMANS 320. 14.0
#8 CRR SOLINOS 43.9 100
一个 data.table 解决方案可以是:
library(data.table)
# Total VALUE per CATEGORY/BRAND, then each brand's share of its category
# total in percent, rounded to two decimals.
# Note: setDT() turns `df` itself into a data.table (no copy is made).
res <- setDT(df)[
  , .(VALUE = sum(VALUE)), by = .(CATEGORY, BRAND)
][
  , SHARE := round(VALUE / sum(VALUE) * 100, 2), by = .(CATEGORY)
]
res
CATEGORY BRAND VALUE SHARE
1: CIG ROTHMANS 319.92 14.00
2: BET ALLINO 41.45 61.45
3: CIG MARLBORO 561.64 24.57
4: ARI AURIELO 64.80 75.26
5: CRR SOLINOS 43.89 100.00
6: ARI CHLEBLO 21.30 24.74
7: CIG LD 1404.00 61.43
8: BET SOLINOS 26.00 38.55
编辑
若要同时保留原始数据中的其他列,可以这样写:
# Per-brand totals and category shares (percent, two decimals), joined back
# onto the original rows so that CODE, AMOUNT and PRICE are kept.
# Note: setDT() turns `df` itself into a data.table (no copy is made).
res <- setDT(df)[
  , .(VALUE = sum(VALUE)), by = .(CATEGORY, BRAND)
][
  , SHARE := round(VALUE / sum(VALUE) * 100, 2), by = .(CATEGORY)
][
  setDT(df), on = c("BRAND", "CATEGORY")
][
  # The row-level VALUE of `df` comes through the join as i.VALUE; drop it so
  # only the aggregated VALUE remains.
  , !"i.VALUE"
]
res
CATEGORY BRAND VALUE SHARE CODE AMOUNT PRICE
1: CIG ROTHMANS 319.92 14.00 453 22 5.56
2: BET ALLINO 41.45 61.45 463 5 8.29
3: CIG MARLBORO 561.64 24.57 476 6 3.89
4: CIG ROTHMANS 319.92 14.00 798 52 3.80
5: ARI AURIELO 64.80 75.26 885 16 4.05
6: CRR SOLINOS 43.89 100.00 582 11 3.99
7: ARI CHLEBLO 21.30 24.74 626 6 3.55
8: CIG MARLBORO 561.64 24.57 663 70 7.69
9: CIG LD 1404.00 61.43 457 208 6.75
10: BET SOLINOS 26.00 38.55 408 5 5.20
我们可以使用base R
# Sum VALUE per CATEGORY/BRAND, then express each total as a percentage of
# its CATEGORY total.
transform(
  aggregate(VALUE ~ CATEGORY + BRAND, data = df, FUN = sum),
  SHARE = 100 * ave(VALUE, CATEGORY, FUN = proportions)
)
CATEGORY BRAND VALUE SHARE
1 BET ALLINO 41.45 61.45293
2 ARI AURIELO 64.80 75.26132
3 ARI CHLEBLO 21.30 24.73868
4 CIG LD 1404.00 61.42915
5 CIG MARLBORO 561.64 24.57341
6 CIG ROTHMANS 319.92 13.99744
7 BET SOLINOS 26.00 38.54707
8 CRR SOLINOS 43.89 100.00000