R 中按百分比分组与 Excel 不同
Group by percentage in R is not same with Excel
我正在尝试计算每个帐户的金额占 R 中总金额的百分比。但是当我与 excel.i 比较时我没有得到正确的值相信我的逻辑中有一些遗漏。
> dput(test)
structure(list(Account_new = c("32001932", "31011217", "31011217",
"22006428", "22005443", "22005443", "31011217", "32002608", "22001044",
"22006428", "31004020", "32002608", "32002608", "32002692", "31001730",
"31011597", "32002387", "32000717", "32002788", "32002789", "22001054",
"31002915", "22002000", "31010427", "31007795", "23001237", "32002777",
"22000284", "31004020", "32002062", "22000400", "31008884", "32002432",
"31010427", "32002608", "32002608", "32002742", "22001061", "31005764",
"31007410", "31011634", "31007335", "32000741", "32000741", "32000741",
"32001742", "32002692", "32002692", "31001725", "31001730", "32001462",
"32001462", "31011597", "32000756", "32000756", "32000756", "32000756",
"32000756", "32000756", "32000756", "32000756", "32000756", "32000246",
"32000256", "31008560", "32000706", "32000706", "32000706", "32000706",
"32002777", "32000756", "22001054", "31002915", "31002915", "22006433",
"22001061", "22006293", "32002707", "32002237", "32002608", "32002608",
"22005443", "31010427", "22002000", "22001061", "31002915", "22006428",
"22006428", "32002608", "32002097", "32002097", "32002097", "32002097",
"32002097", "32002432", "22006293", "23000062", "32002782", "22005171",
"22001061"), New_amt_loc_curr = c(393.82, 12558.49, 1738.87,
22200.19, 101.19, 142.74, 68262.44, 5357.97, 382048.48, 56877.06,
433.71, 43696.82, 14350.07, 369.97, 199723.81, 549.2, 52893.4,
1617.03, 2093.81, 7607.58, 102152.99, 88716.88, 339.85, 51401.16,
373.28, 14166.36, 77.22, 2478.74, 1110.22, 2520.91, 7736.87,
501.7, 139.97, 16555.63, 25805.93, 10620.37, 34992.76, 6267.67,
225.22, 228.62, 155580.49, 13749.64, 610.14, 238.31, 41.47, 1041.44,
78.78, 994.85, 119598.63, 310084.4, 1263.4, 536.49, 3343.21,
18.16, 913.63, 315.72, 5449.66, 2999.34, 121.54, 929, 403.73,
44.71, 48085.22, 1911.95, 104957.41, 15732.01, 4356.91, 17642.22,
3060.26, 6042.52, 22.69, 119372.69, 92081.72, 6213.82, 7458.01,
6547.72, 10092.65, 560.75, 2402.59, 22804.72, 62681.66, 105.41,
25664.79, 14953.4, 2145.61, 98682.25, 61021.51, 7.48, 822.44,
67544.51, 24179.65, 23431.98, 46609.75, 67544.51, 612.36, 4528.94,
25492.2, 6017.25, 15312.28, 1610.53)), row.names = c(NA, -100L
), class = c("tbl_df", "tbl", "data.frame"))
我用于百分比的代码是:
summary<-sqldf("select Account_new,New_amt_loc_curr/sum(New_amt_loc_curr) as amnt_avg from test group by Account_new")
输出:
excel 中的输出是:
你可以这样做:
library(dplyr)
df <- df %>%
group_by(Account_new) %>%
mutate(perc = New_amt_loc_curr/sum(New_amt_loc_curr))
print(df)
Account_new New_amt_loc_curr perc
<chr> <dbl> <dbl>
1 32001932 394. 1
2 31011217 12558. 0.152
3 31011217 1739. 0.0211
4 22006428 22200. 0.158
5 22005443 101. 0.290
6 22005443 143. 0.409
SQL 似乎是在计算来自每条记录的每个 Account_new
的百分比,而不是来自每个 Account_new
的总数的百分比。从 R 中获取与您在 Excel 中看到的相同值的一种方法是:
aggregate(New_amt_loc_curr ~ Account_new, test,
function(x)(100*sum(x)/sum(test$New_amt_loc_curr)))
# Account_new New_amt_loc_curr
#1 22000284 0.086184065
#2 22000400 0.269005588
#3 22001044 13.283559869
#4 22001054 7.702293784
#5 22001061 0.576180570
#6 22002000 0.531735663
#7 22005171 0.532397323
#8 22005443 0.012146309
#9 22006293 0.508382512
#10 22006428 4.871396497
#11 22006433 0.259309819
#...
这可能更容易理解为两个步骤:
# Sum up the records for each account
res <- aggregate(New_amt_loc_curr ~ Account_new, test, sum)
# Convert the values to percentages
res$perc <- 100* res$New_amt_loc_curr/sum(res$New_amt_loc_curr)
在您的 R 代码中,您将每个 Account_new
的第一个除以该帐户的 New_amt_loc_curr
的总和。在 excel 中,您将每个账户的 New_amt_loc_curr
总和除以所有账户的总和:
> library(dplyr)
> test %>%
mutate(total = sum(New_amt_loc_curr)) %>%
group_by(Account_new) %>%
summarise(total_amt_per_account_pcg = 100*New_amt_loc_curr/total)
Account_new total_amt_per_account_pcg
<chr> <dbl>
1 22000284 0.0862
2 22000400 0.269
3 22001044 13.3
4 22001054 3.55
5 22001054 4.15
6 22001061 0.218
7 22001061 0.228
8 22001061 0.0746
9 22001061 0.0560
10 22002000 0.0118
现有的 SQL 代码正在按 Account_new 计算平均值:value/sum(values per group)
,相反,我们需要的是 sum(values per group)/sum(all values)
此处使用 sqldf:
更正了 SQL 代码
library(sqldf)
sqldf("SELECT Account_new,
sum(New_amt_loc_curr) / (
SELECT sum(New_amt_loc_curr)
FROM test
) * 100 as amnt_avg
FROM test
GROUP BY Account_new")
# Account_new amnt_avg
# 1 22000284 0.086184065
# 2 22000400 0.269005588
# 3 22001044 13.283559869
# 4 22001054 7.702293784
# 5 22001061 0.576180570
# 6 22002000 0.531735663
# 7 22005171 0.532397323
# 8 22005443 0.012146309
# 9 22006293 0.508382512
# 10 22006428 4.871396497
我正在尝试计算每个帐户的金额占 R 中总金额的百分比。但是当我与 excel.i 比较时我没有得到正确的值相信我的逻辑中有一些遗漏。
> dput(test)
structure(list(Account_new = c("32001932", "31011217", "31011217",
"22006428", "22005443", "22005443", "31011217", "32002608", "22001044",
"22006428", "31004020", "32002608", "32002608", "32002692", "31001730",
"31011597", "32002387", "32000717", "32002788", "32002789", "22001054",
"31002915", "22002000", "31010427", "31007795", "23001237", "32002777",
"22000284", "31004020", "32002062", "22000400", "31008884", "32002432",
"31010427", "32002608", "32002608", "32002742", "22001061", "31005764",
"31007410", "31011634", "31007335", "32000741", "32000741", "32000741",
"32001742", "32002692", "32002692", "31001725", "31001730", "32001462",
"32001462", "31011597", "32000756", "32000756", "32000756", "32000756",
"32000756", "32000756", "32000756", "32000756", "32000756", "32000246",
"32000256", "31008560", "32000706", "32000706", "32000706", "32000706",
"32002777", "32000756", "22001054", "31002915", "31002915", "22006433",
"22001061", "22006293", "32002707", "32002237", "32002608", "32002608",
"22005443", "31010427", "22002000", "22001061", "31002915", "22006428",
"22006428", "32002608", "32002097", "32002097", "32002097", "32002097",
"32002097", "32002432", "22006293", "23000062", "32002782", "22005171",
"22001061"), New_amt_loc_curr = c(393.82, 12558.49, 1738.87,
22200.19, 101.19, 142.74, 68262.44, 5357.97, 382048.48, 56877.06,
433.71, 43696.82, 14350.07, 369.97, 199723.81, 549.2, 52893.4,
1617.03, 2093.81, 7607.58, 102152.99, 88716.88, 339.85, 51401.16,
373.28, 14166.36, 77.22, 2478.74, 1110.22, 2520.91, 7736.87,
501.7, 139.97, 16555.63, 25805.93, 10620.37, 34992.76, 6267.67,
225.22, 228.62, 155580.49, 13749.64, 610.14, 238.31, 41.47, 1041.44,
78.78, 994.85, 119598.63, 310084.4, 1263.4, 536.49, 3343.21,
18.16, 913.63, 315.72, 5449.66, 2999.34, 121.54, 929, 403.73,
44.71, 48085.22, 1911.95, 104957.41, 15732.01, 4356.91, 17642.22,
3060.26, 6042.52, 22.69, 119372.69, 92081.72, 6213.82, 7458.01,
6547.72, 10092.65, 560.75, 2402.59, 22804.72, 62681.66, 105.41,
25664.79, 14953.4, 2145.61, 98682.25, 61021.51, 7.48, 822.44,
67544.51, 24179.65, 23431.98, 46609.75, 67544.51, 612.36, 4528.94,
25492.2, 6017.25, 15312.28, 1610.53)), row.names = c(NA, -100L
), class = c("tbl_df", "tbl", "data.frame"))
我用于百分比的代码是:
summary<-sqldf("select Account_new,New_amt_loc_curr/sum(New_amt_loc_curr) as amnt_avg from test group by Account_new")
输出:
excel 中的输出是:
你可以这样做:
library(dplyr)
df <- df %>%
group_by(Account_new) %>%
mutate(perc = New_amt_loc_curr/sum(New_amt_loc_curr))
print(df)
Account_new New_amt_loc_curr perc
<chr> <dbl> <dbl>
1 32001932 394. 1
2 31011217 12558. 0.152
3 31011217 1739. 0.0211
4 22006428 22200. 0.158
5 22005443 101. 0.290
6 22005443 143. 0.409
SQL 似乎是在计算来自每条记录的每个 Account_new
的百分比,而不是来自每个 Account_new
的总数的百分比。从 R 中获取与您在 Excel 中看到的相同值的一种方法是:
aggregate(New_amt_loc_curr ~ Account_new, test,
function(x)(100*sum(x)/sum(test$New_amt_loc_curr)))
# Account_new New_amt_loc_curr
#1 22000284 0.086184065
#2 22000400 0.269005588
#3 22001044 13.283559869
#4 22001054 7.702293784
#5 22001061 0.576180570
#6 22002000 0.531735663
#7 22005171 0.532397323
#8 22005443 0.012146309
#9 22006293 0.508382512
#10 22006428 4.871396497
#11 22006433 0.259309819
#...
这可能更容易理解为两个步骤:
# Sum up the records for each account
res <- aggregate(New_amt_loc_curr ~ Account_new, test, sum)
# Convert the values to percentages
res$perc <- 100* res$New_amt_loc_curr/sum(res$New_amt_loc_curr)
在您的 R 代码中,您将每个 Account_new
的第一个除以该帐户的 New_amt_loc_curr
的总和。在 excel 中,您将每个账户的 New_amt_loc_curr
总和除以所有账户的总和:
> library(dplyr)
> test %>%
mutate(total = sum(New_amt_loc_curr)) %>%
group_by(Account_new) %>%
summarise(total_amt_per_account_pcg = 100*New_amt_loc_curr/total)
Account_new total_amt_per_account_pcg
<chr> <dbl>
1 22000284 0.0862
2 22000400 0.269
3 22001044 13.3
4 22001054 3.55
5 22001054 4.15
6 22001061 0.218
7 22001061 0.228
8 22001061 0.0746
9 22001061 0.0560
10 22002000 0.0118
现有的 SQL 代码正在按 Account_new 计算平均值:value/sum(values per group)
,相反,我们需要的是 sum(values per group)/sum(all values)
此处使用 sqldf:
更正了 SQL 代码library(sqldf)
sqldf("SELECT Account_new,
sum(New_amt_loc_curr) / (
SELECT sum(New_amt_loc_curr)
FROM test
) * 100 as amnt_avg
FROM test
GROUP BY Account_new")
# Account_new amnt_avg
# 1 22000284 0.086184065
# 2 22000400 0.269005588
# 3 22001044 13.283559869
# 4 22001054 7.702293784
# 5 22001061 0.576180570
# 6 22002000 0.531735663
# 7 22005171 0.532397323
# 8 22005443 0.012146309
# 9 22006293 0.508382512
# 10 22006428 4.871396497