多种功能聚合
Multiple functions in aggregate
有没有可能从下面的数据帧df1
Branch Loan_Amount TAT
A 100 2.0
A 120 4.0
A 300 9.0
B 150 1.5
B 200 2.0
使用聚合函数,得到如下所示的输出数据帧 df2?
Branch Number_of_loans Loan_Amount Total_TAT
A 3 520 15.0
B 2 350 3.5
我知道我可以使用 nrow 来计算 number_of_loans 并合并,但我正在寻找更好的方法。
使用 dplyr,你可以这样做:
library(dplyr)

# One summary row per Branch: the number of loans plus the column totals.
d %>%
  group_by(Branch) %>%
  summarise(
    Number_of_loans = n(),
    Loan_Amount = sum(Loan_Amount),
    TAT = sum(TAT)
  )
输出
Source: local data frame [2 x 4]
Branch Number_of_loans Loan_Amount TAT
(fctr) (int) (int) (dbl)
1 A 3 520 15.0
2 B 2 350 3.5
数据
# Sample data: one row per loan (branch, loan amount, turnaround time).
# `header` is spelled out in full instead of relying on partial argument
# matching of `head`.
d <- read.table(text = "Branch Loan_Amount TAT
A 100 2.0
A 120 4.0
A 300 9.0
B 150 1.5
B 200 2.0", header = TRUE)
Base package:
# Per-branch totals: every non-grouping column is summed.
totals <- aggregate(. ~ Branch, data = df, FUN = sum)
# Per-branch loan count: length() of Loan_Amount within each group, kept as a
# one-column data frame and renamed.
counts <- aggregate(Loan_Amount ~ Branch, data = df, FUN = length)["Loan_Amount"]
names(counts) <- "Number_of_loans"
# Both results are ordered by Branch, so they can be bound side by side.
cbind(totals, counts)
输出
Branch Loan_Amount TAT Number_of_loans
1 A 520 15.0 3
2 B 350 3.5 2
使用 sqldf 包:
library(sqldf)
# Run a SQL GROUP BY against the data frame `df`: one result row per Branch
# holding the loan count and the totals of Loan_Amount and TAT.
sqldf("SELECT Branch, COUNT(Loan_Amount) Number_of_loans, SUM(Loan_Amount) Loan_Amount, SUM(TAT) TAT
FROM df
GROUP BY Branch")
输出
Branch Number_of_loans Loan_Amount TAT
1 A 3 520 15.0
2 B 2 350 3.5
数据
# Sample data: five loans across branches A and B. Branch is a factor and
# Loan_Amount is integer, as in the original dput() output.
df <- data.frame(
  Branch = factor(c("A", "A", "A", "B", "B"), levels = c("A", "B")),
  Loan_Amount = c(100L, 120L, 300L, 150L, 200L),
  TAT = c(2, 4, 9, 1.5, 2)
)
使用data.table
library(data.table)

# setDT() converts df to a data.table by reference; the grouped j-expression
# then returns one row per Branch with the row count (.N) and the two totals.
loans <- setDT(df)
loans[, .(Number_of_loans = .N,
          Loan_Amount = sum(Loan_Amount),
          Total_TAT = sum(TAT)),
      by = Branch]
# Output:
# Branch Number_of_loans Loan_Amount Total_TAT
# 1: A 3 520 15.0
# 2: B 2 350 3.5
这种做法比较取巧,效率也不高,但它确实可行,而且很有趣(它使用了 aggregate()):
# Sample data: one row per loan.
d <- read.table(text = "Branch Loan_Amount TAT
A 100 2.0
A 120 4.0
A 300 9.0
B 150 1.5
B 200 2.0", header = TRUE)

# aggregate() applies a single FUN to every column, so both statistics are
# packed into one "count|sum" string per group and unpacked afterwards.
df <- aggregate(. ~ Branch, data = d,
                FUN = function(x) paste0(length(x), "|", sum(x)))

# Split "count|sum" strings into a two-column character matrix.
# The bar is matched literally (fixed = TRUE); as a regular expression "|" is
# an empty alternation rather than a literal bar, and splitting on it breaks
# once a count has more than one digit.
split_pair <- function(x) {
  do.call(rbind, strsplit(as.character(x), "|", fixed = TRUE))
}

loan_parts <- split_pair(df$Loan_Amount)
tat_parts <- split_pair(df$TAT)

# The count is identical for every column, so it is taken from Loan_Amount.
df_ <- data.frame(
  Number_of_loans = as.numeric(loan_parts[, 1]),
  Loan_Amount = as.numeric(loan_parts[, 2]),
  Total_TAT = as.numeric(tat_parts[, 2])
)
cbind(df[, "Branch", drop = FALSE], df_)
生成所需的data.frame:
Branch Number_of_loans Loan_Amount Total_TAT
1 A 3 520 15.0
2 B 2 350 3.5
这是一个旧帖子,但它涉及的是一个很常见的操作;在我看来,这种操作本应有更简单的解决方案。
这是一个可能更简单的单行替代方案。
> aggregate2(df, x = c('Loan_Amount', 'TAT'), by = 'Branch',
FUN = list(total = sum, number = length))
Branch Loan_Amount.total TAT.total Loan_Amount.number TAT.number
1 A 520 15.0 3 3
2 B 350 3.5 2 2
aggregate2() 是 jumbled repo 中的一个函数,是我刚刚在基础函数 aggregate 之上构建的。它对每个 FUN 函数调用一次 aggregate,并在调用前后做一些额外处理。
与 aggregate 不同,它接受多个函数。
与 dplyr 解决方案不同,它会把所有这些函数应用于所有 x 变量,而不必为每个变量逐一编写(例如 Loan_Amount = sum(Loan_Amount))。
有没有可能从下面的数据帧df1
Branch Loan_Amount TAT
A 100 2.0
A 120 4.0
A 300 9.0
B 150 1.5
B 200 2.0
使用聚合函数,得到如下所示的输出数据帧 df2?
Branch Number_of_loans Loan_Amount Total_TAT
A 3 520 15.0
B 2 350 3.5
我知道我可以使用 nrow 来计算 number_of_loans 并合并,但我正在寻找更好的方法。
使用 dplyr,你可以这样做:
library(dplyr)

# One summary row per Branch: the number of loans plus the column totals.
d %>%
  group_by(Branch) %>%
  summarise(
    Number_of_loans = n(),
    Loan_Amount = sum(Loan_Amount),
    TAT = sum(TAT)
  )
输出
Source: local data frame [2 x 4]
Branch Number_of_loans Loan_Amount TAT
(fctr) (int) (int) (dbl)
1 A 3 520 15.0
2 B 2 350 3.5
数据
# Sample data: one row per loan (branch, loan amount, turnaround time).
# `header` is spelled out in full instead of relying on partial argument
# matching of `head`.
d <- read.table(text = "Branch Loan_Amount TAT
A 100 2.0
A 120 4.0
A 300 9.0
B 150 1.5
B 200 2.0", header = TRUE)
Base package:
# Per-branch totals: every non-grouping column is summed.
totals <- aggregate(. ~ Branch, data = df, FUN = sum)
# Per-branch loan count: length() of Loan_Amount within each group, kept as a
# one-column data frame and renamed.
counts <- aggregate(Loan_Amount ~ Branch, data = df, FUN = length)["Loan_Amount"]
names(counts) <- "Number_of_loans"
# Both results are ordered by Branch, so they can be bound side by side.
cbind(totals, counts)
输出
Branch Loan_Amount TAT Number_of_loans
1 A 520 15.0 3
2 B 350 3.5 2
使用 sqldf 包:
library(sqldf)
# Run a SQL GROUP BY against the data frame `df`: one result row per Branch
# holding the loan count and the totals of Loan_Amount and TAT.
sqldf("SELECT Branch, COUNT(Loan_Amount) Number_of_loans, SUM(Loan_Amount) Loan_Amount, SUM(TAT) TAT
FROM df
GROUP BY Branch")
输出
Branch Number_of_loans Loan_Amount TAT
1 A 3 520 15.0
2 B 2 350 3.5
数据
# Sample data: five loans across branches A and B. Branch is a factor and
# Loan_Amount is integer, as in the original dput() output.
df <- data.frame(
  Branch = factor(c("A", "A", "A", "B", "B"), levels = c("A", "B")),
  Loan_Amount = c(100L, 120L, 300L, 150L, 200L),
  TAT = c(2, 4, 9, 1.5, 2)
)
使用data.table
library(data.table)

# setDT() converts df to a data.table by reference; the grouped j-expression
# then returns one row per Branch with the row count (.N) and the two totals.
loans <- setDT(df)
loans[, .(Number_of_loans = .N,
          Loan_Amount = sum(Loan_Amount),
          Total_TAT = sum(TAT)),
      by = Branch]
# Output:
# Branch Number_of_loans Loan_Amount Total_TAT
# 1: A 3 520 15.0
# 2: B 2 350 3.5
这种做法比较取巧,效率也不高,但它确实可行,而且很有趣(它使用了 aggregate()):
# Sample data: one row per loan.
d <- read.table(text = "Branch Loan_Amount TAT
A 100 2.0
A 120 4.0
A 300 9.0
B 150 1.5
B 200 2.0", header = TRUE)

# aggregate() applies a single FUN to every column, so both statistics are
# packed into one "count|sum" string per group and unpacked afterwards.
df <- aggregate(. ~ Branch, data = d,
                FUN = function(x) paste0(length(x), "|", sum(x)))

# Split "count|sum" strings into a two-column character matrix.
# The bar is matched literally (fixed = TRUE); as a regular expression "|" is
# an empty alternation rather than a literal bar, and splitting on it breaks
# once a count has more than one digit.
split_pair <- function(x) {
  do.call(rbind, strsplit(as.character(x), "|", fixed = TRUE))
}

loan_parts <- split_pair(df$Loan_Amount)
tat_parts <- split_pair(df$TAT)

# The count is identical for every column, so it is taken from Loan_Amount.
df_ <- data.frame(
  Number_of_loans = as.numeric(loan_parts[, 1]),
  Loan_Amount = as.numeric(loan_parts[, 2]),
  Total_TAT = as.numeric(tat_parts[, 2])
)
cbind(df[, "Branch", drop = FALSE], df_)
生成所需的data.frame:
Branch Number_of_loans Loan_Amount Total_TAT
1 A 3 520 15.0
2 B 2 350 3.5
这是一个旧帖子,但它涉及的是一个很常见的操作;在我看来,这种操作本应有更简单的解决方案。
这是一个可能更简单的单行替代方案。
> aggregate2(df, x = c('Loan_Amount', 'TAT'), by = 'Branch',
FUN = list(total = sum, number = length))
Branch Loan_Amount.total TAT.total Loan_Amount.number TAT.number
1 A 520 15.0 3 3
2 B 350 3.5 2 2
aggregate2() 是 jumbled repo 中的一个函数,是我刚刚在基础函数 aggregate 之上构建的。它对每个 FUN 函数调用一次 aggregate,并在调用前后做一些额外处理。
与 aggregate 不同,它接受多个函数。
与 dplyr 解决方案不同,它会把所有这些函数应用于所有 x 变量,而不必为每个变量逐一编写(例如 Loan_Amount = sum(Loan_Amount))。