使用 dplyr 的条件累积和
conditional cumulative sum using dplyr
我的数据框看起来像这样,我想要两个单独的累积列,一个用于基金 A,另一个用于基金 B
Name Event SalesAmount Fund Cum-A(desired) Cum-B(desired)
John Webinar NA NA NA NA
John Sale 1000 A 1000 NA
John Sale 2000 B 1000 2000
John Sale 3000 A 4000 2000
John Email NA NA 4000 2000
Tom Webinar NA NA NA NA
Tom Sale 1000 A 1000 NA
Tom Sale 2000 B 1000 2000
Tom Sale 3000 A 4000 2000
Tom Email NA NA 4000 2000
I have tried:
# Asker's original attempt, kept verbatim.
# cumsum(SalesAmount) is evaluated over every row of the Name group, and
# ifelse() only chooses on which rows that running total is shown, so the
# value mixes both funds instead of accumulating each fund separately.
# NOTE(review): `Cum-A` / `Cum-B` are not syntactic R names; as written they
# would need backticks (or different names) for this call to parse.
df<-
df %>%
group_by(Name)%>%
mutate(Cum-A = as.numeric(ifelse(Fund=="A",cumsum(SalesAmount),0)))%>%
mutate(Cum-B = as.numeric(ifelse(Fund=="B",cumsum(SalesAmount),0)))
但这并不是我想要的结果:它计算的是两个基金合计的累积和,只不过仅在基金匹配的行中才显示该值。
请帮忙。
怎么样:
library(dplyr)
# Running total per person, one column per fund. Rows that belong to another
# fund, or to no fund at all (NA), contribute 0 to that fund's cumulative sum.
d %>%
  group_by(Name) %>%
  mutate(
    cA = cumsum(ifelse(Fund %in% "A", SalesAmount, 0)),
    cB = cumsum(ifelse(Fund %in% "B", SalesAmount, 0))
  )
输出:
Source: local data frame [10 x 8]
Groups: Name
Name Event SalesAmount Fund Cum.A.desired. Cum.B.desired. cA cB
1 John Webinar NA NA NA NA 0 0
2 John Sale 1000 A 1000 NA 1000 0
3 John Sale 2000 B 1000 2000 1000 2000
4 John Sale 3000 A 4000 2000 4000 2000
5 John Email NA NA 4000 2000 4000 2000
6 Tom Webinar NA NA NA NA 0 0
7 Tom Sale 1000 A 1000 NA 1000 0
8 Tom Sale 2000 B 1000 2000 1000 2000
9 Tom Sale 3000 A 4000 2000 4000 2000
10 Tom Email NA NA 4000 2000 4000 2000
如果需要,之后可以将结果列中的零替换为 NA:
# Turn running totals that are still exactly zero into NA.
result$cA <- replace(result$cA, result$cA == 0, NA)
result$cB <- replace(result$cB, result$cB == 0, NA)
您的输入数据集:
# Example input: two people with the same five-event history each.
# Name, Event and Fund are factors; the amount columns are integers.
d <- data.frame(
  Name = factor(rep(c("John", "Tom"), each = 5L), levels = c("John", "Tom")),
  Event = factor(
    rep(c("Webinar", "Sale", "Sale", "Sale", "Email"), times = 2L),
    levels = c("Email", "Sale", "Webinar")
  ),
  SalesAmount = rep(c(NA, 1000L, 2000L, 3000L, NA), times = 2L),
  Fund = factor(rep(c(NA, "A", "B", "A", NA), times = 2L), levels = c("A", "B")),
  Cum.A.desired. = rep(c(NA, 1000L, 1000L, 4000L, 4000L), times = 2L),
  Cum.B.desired. = rep(c(NA, NA, 2000L, 2000L, 2000L), times = 2L)
)
您可以将 @Marat 的回答全部合并到一个 mutate 调用中:
# One mutate() call does both steps for each fund: build the per-person
# running total (rows of other funds or with no fund add 0), then replace
# totals that are still zero with NA.
df %>%
  group_by(Name) %>%
  mutate(
    cA = cumsum(ifelse(Fund %in% "A", SalesAmount, 0)),
    cA = ifelse(cA != 0, cA, NA),
    cB = cumsum(ifelse(Fund %in% "B", SalesAmount, 0)),
    cB = ifelse(cB != 0, cB, NA)
  )
下面是一种可以推广到更多基金的方法,使用 zoo 和 data.table:
# prep
# library() stops right away if a package is missing; require() would only
# return FALSE and let the failure surface later at the first data.table call.
library(data.table)
library(zoo)
setDT(d)                         # turn d into a data.table in place
d[, Fund := as.character(Fund)]  # because factors are the worst
# Collect the set of funds that appear on Sale rows. Missing funds are
# skipped so that no junk "cNA" column is created by the steps that follow.
uf <- d[Event == "Sale" & !is.na(Fund), unique(Fund)]
首先,只针对相关的观测子集(各基金的销售行)计算并写入累计销售额:
# For each fund, write the per-person running sales total into a column named
# "c<fund>", touching only that fund's Sale rows; all other rows stay NA.
for (fund in uf) {
  target_col <- paste0("c", fund)
  d[Event == "Sale" & Fund == fund,
    (target_col) := cumsum(SalesAmount),
    by = Name]
}
然后,将每列中最后一个非缺失的观测值向前填充(LOCF):
# Carry each fund's last known running total forward within each person.
# na.rm = FALSE keeps the leading NAs so every column retains its length.
cum_cols <- paste0("c", uf)
d[,
  (cum_cols) := lapply(.SD, na.locf, na.rm = FALSE),
  .SDcols = cum_cols,
  by = Name]
我的数据框看起来像这样,我想要两个单独的累积列,一个用于基金 A,另一个用于基金 B
Name Event SalesAmount Fund Cum-A(desired) Cum-B(desired)
John Webinar NA NA NA NA
John Sale 1000 A 1000 NA
John Sale 2000 B 1000 2000
John Sale 3000 A 4000 2000
John Email NA NA 4000 2000
Tom Webinar NA NA NA NA
Tom Sale 1000 A 1000 NA
Tom Sale 2000 B 1000 2000
Tom Sale 3000 A 4000 2000
Tom Email NA NA 4000 2000
I have tried:
# Asker's original attempt, kept verbatim.
# cumsum(SalesAmount) is evaluated over every row of the Name group, and
# ifelse() only chooses on which rows that running total is shown, so the
# value mixes both funds instead of accumulating each fund separately.
# NOTE(review): `Cum-A` / `Cum-B` are not syntactic R names; as written they
# would need backticks (or different names) for this call to parse.
df<-
df %>%
group_by(Name)%>%
mutate(Cum-A = as.numeric(ifelse(Fund=="A",cumsum(SalesAmount),0)))%>%
mutate(Cum-B = as.numeric(ifelse(Fund=="B",cumsum(SalesAmount),0)))
但这并不是我想要的结果:它计算的是两个基金合计的累积和,只不过仅在基金匹配的行中才显示该值。请帮忙。
怎么样:
library(dplyr)
# Running total per person, one column per fund. Rows that belong to another
# fund, or to no fund at all (NA), contribute 0 to that fund's cumulative sum.
d %>%
  group_by(Name) %>%
  mutate(
    cA = cumsum(ifelse(Fund %in% "A", SalesAmount, 0)),
    cB = cumsum(ifelse(Fund %in% "B", SalesAmount, 0))
  )
输出:
Source: local data frame [10 x 8]
Groups: Name
Name Event SalesAmount Fund Cum.A.desired. Cum.B.desired. cA cB
1 John Webinar NA NA NA NA 0 0
2 John Sale 1000 A 1000 NA 1000 0
3 John Sale 2000 B 1000 2000 1000 2000
4 John Sale 3000 A 4000 2000 4000 2000
5 John Email NA NA 4000 2000 4000 2000
6 Tom Webinar NA NA NA NA 0 0
7 Tom Sale 1000 A 1000 NA 1000 0
8 Tom Sale 2000 B 1000 2000 1000 2000
9 Tom Sale 3000 A 4000 2000 4000 2000
10 Tom Email NA NA 4000 2000 4000 2000
如果需要,之后可以将结果列中的零替换为 NA:
# Turn running totals that are still exactly zero into NA.
result$cA <- replace(result$cA, result$cA == 0, NA)
result$cB <- replace(result$cB, result$cB == 0, NA)
您的输入数据集:
# Example input: two people with the same five-event history each.
# Name, Event and Fund are factors; the amount columns are integers.
d <- data.frame(
  Name = factor(rep(c("John", "Tom"), each = 5L), levels = c("John", "Tom")),
  Event = factor(
    rep(c("Webinar", "Sale", "Sale", "Sale", "Email"), times = 2L),
    levels = c("Email", "Sale", "Webinar")
  ),
  SalesAmount = rep(c(NA, 1000L, 2000L, 3000L, NA), times = 2L),
  Fund = factor(rep(c(NA, "A", "B", "A", NA), times = 2L), levels = c("A", "B")),
  Cum.A.desired. = rep(c(NA, 1000L, 1000L, 4000L, 4000L), times = 2L),
  Cum.B.desired. = rep(c(NA, NA, 2000L, 2000L, 2000L), times = 2L)
)
您可以将 @Marat 的回答全部合并到一个 mutate 调用中:
# One mutate() call does both steps for each fund: build the per-person
# running total (rows of other funds or with no fund add 0), then replace
# totals that are still zero with NA.
df %>%
  group_by(Name) %>%
  mutate(
    cA = cumsum(ifelse(Fund %in% "A", SalesAmount, 0)),
    cA = ifelse(cA != 0, cA, NA),
    cB = cumsum(ifelse(Fund %in% "B", SalesAmount, 0)),
    cB = ifelse(cB != 0, cB, NA)
  )
下面是一种可以推广到更多基金的方法,使用 zoo 和 data.table:
# prep
# library() stops right away if a package is missing; require() would only
# return FALSE and let the failure surface later at the first data.table call.
library(data.table)
library(zoo)
setDT(d)                         # turn d into a data.table in place
d[, Fund := as.character(Fund)]  # because factors are the worst
# Collect the set of funds that appear on Sale rows. Missing funds are
# skipped so that no junk "cNA" column is created by the steps that follow.
uf <- d[Event == "Sale" & !is.na(Fund), unique(Fund)]
首先,只针对相关的观测子集(各基金的销售行)计算并写入累计销售额:
# For each fund, write the per-person running sales total into a column named
# "c<fund>", touching only that fund's Sale rows; all other rows stay NA.
for (fund in uf) {
  target_col <- paste0("c", fund)
  d[Event == "Sale" & Fund == fund,
    (target_col) := cumsum(SalesAmount),
    by = Name]
}
然后,将每列中最后一个非缺失的观测值向前填充(LOCF):
# Carry each fund's last known running total forward within each person.
# na.rm = FALSE keeps the leading NAs so every column retains its length.
cum_cols <- paste0("c", uf)
d[,
  (cum_cols) := lapply(.SD, na.locf, na.rm = FALSE),
  .SDcols = cum_cols,
  by = Name]