我先用 group_by() 再用 summarise() 为数据框添加了总计行,但这样一来总计层面的百分比数据是错误的
I used group_by() then summarise() to add a total row to a data frame, but this means the % data at the total level is wrong
我先用 group_by(),再用 summarise() 配合 sum() 为数据框添加了总计行。但这样一来,总计层面的百分比数据是错误的(各酒店的百分比被直接相加了)。因此,我想用能得出正确结果的计算来覆盖百分比变量 'percentage absent staff'。问题在于这是一个很长的数据集,无法手动逐个修改。我正在寻找好的解决方案,比如循环或其他方法。
代码:
# Example data in long format: one row per Date x Asset x Variable.
# Two dates x two hotels x four variables = 16 rows. Each hotel reports three
# head counts plus its own absence percentage on each date.
Date <- rep(c("01/09/2020", "02/09/2020"), each = 8)
Asset <- rep(rep(c("Blue Hotel", "Green Hotel"), each = 4), times = 2)
# Every label must be a single-line string: a line break inside the literal
# would create a separate "percentage absent\nstaff" category that never
# equals "percentage absent staff".
Variable <- rep(
  c("hotel staff", "bar staff", "absent staff", "percentage absent staff"),
  times = 4
)
value <- c(5, 10, 3, 0.2, 4, 8, 2, 0.17, 5, 10, 3, 0.20, 6, 3, 3, 0.33)
df <- data.frame(Date, Asset, Variable, value)
# Create totals: add up `value` over all assets for each Date x Variable pair.
# Requires dplyr to be attached (%>%, group_by, summarise, ungroup).
# NOTE: the "percentage absent staff" rows are summed like every other row
# here, so the total for that Variable is a sum of percentages, not a ratio.
df2 <- df %>%
  group_by(Date, Variable) %>%
  # na.rm = FALSE: a missing value makes that group's total NA.
  summarise(value = sum(as.numeric(value), na.rm = FALSE)) %>%
  ungroup()
我不确定你想要哪一种计算,因为第一个“正确”的结果看起来是 absent_staff / (hotel_staff + bar_staff + absent_staff),而第二个“正确”的结果看起来是 absent_staff / (hotel_staff + bar_staff)。不过,你可以按自己的需要调整下面的解决方案。
# Totals per Date x Variable, with the percentage row recomputed from the
# summed head counts instead of being the sum of the per-hotel percentages.
# Requires dplyr to be attached (%>%, group_by, summarise, mutate, case_when).
df2 <- df %>%
  group_by(Date, Variable) %>%
  summarise(value = sum(as.numeric(value), na.rm = FALSE)) %>%
  ungroup() %>%
  # Regroup by Date only, so each group holds all totals for one day.
  group_by(Date) %>%
  mutate(
    value = case_when(
      # absent / (absent + bar + hotel). Remove "absent staff" from the
      # vector below to divide by hotel + bar staff only.
      Variable == "percentage absent staff" ~
        value[which(Variable == "absent staff")] /
          sum(value[which(Variable %in%
            c("absent staff", "bar staff", "hotel staff"))]),
      # Every other row keeps its summed value.
      TRUE ~ value
    )
  ) %>%
  # Drop the Date grouping so later verbs are not silently applied per group.
  ungroup()
df2
# # A tibble: 8 x 3
#   Date       Variable                value
#   <chr>      <chr>                   <dbl>
# 1 01/09/2020 absent staff            5
# 2 01/09/2020 bar staff               18
# 3 01/09/2020 hotel staff             9
# 4 01/09/2020 percentage absent staff 0.156
# 5 02/09/2020 absent staff            6
# 6 02/09/2020 bar staff               13
# 7 02/09/2020 hotel staff             11
# 8 02/09/2020 percentage absent staff 0.2
在上面的代码中,先按 Date 对汇总后的数据进行分组,然后用条件表达式替换 value。当 Variable 等于 "percentage absent staff" 时,value 会被替换为 "absent staff" 的值除以 "absent staff"、"bar staff"、"hotel staff" 三者的值之和。所以,如果你实际想要的是上面的第二种计算,只需把 "absent staff" 从这个向量中去掉即可。对于其他所有行,value 保持原值不变。
编辑
针对评论中的问题:如果同一个变量 Variable 中还有结构相同的 residents(住客)类取值,可以用同样的方式为它们再添加一个分支:
# Example data in long format: one row per Date x Asset x Variable
# (two dates x two hotels x four variables = 16 rows).
Date <- rep(c("01/09/2020", "02/09/2020"), each = 8)
Asset <- rep(rep(c("Blue Hotel", "Green Hotel"), each = 4), times = 2)
# The same four variables are reported for every Date x Asset combination.
Variable <- rep(
  c("hotel staff", "bar staff", "absent staff", "percentage absent staff"),
  times = 4
)
value <- c(5, 10, 3, 0.2, 4, 8, 2, 0.17, 5, 10, 3, 0.20, 6, 3, 3, 0.33)
df <- data.frame(Date, Asset, Variable, value)
# Build a matching set of "residents" rows: copy the staff rows, swap "staff"
# for "residents" in every Variable label, and replace the values with
# Poisson draws (lambda = 25). No seed is set here, so the draws are random.
dfr <- df
dfr[["Variable"]] <- gsub("staff", "residents", dfr[["Variable"]])
dfr[["value"]] <- rpois(n = nrow(dfr), lambda = 25)
# Stack the staff rows and the resident rows into one long data frame.
df <- bind_rows(df, dfr)
# Preview rows 1-5 and 17-21 of the combined data.
df[c(seq_len(5), 17:21), ]
# Totals: add up `value` over all assets for each Date x Variable pair.
# Requires dplyr to be attached (%>%, group_by, summarise, ungroup).
df2 <- df %>%
  group_by(Date, Variable) %>%
  # na.rm = FALSE: a missing value makes that group's total NA.
  summarise(value = sum(as.numeric(value), na.rm = FALSE)) %>%
  ungroup()
# Recompute both percentage rows from the per-day totals held in df2.
# df2 already has one row per Date x Variable, so it is used directly rather
# than being grouped and summed a second time.
# Requires dplyr to be attached (%>%, group_by, mutate, case_when, ungroup).
df2a <- df2 %>%
  # Group by Date so each group holds all totals for one day.
  group_by(Date) %>%
  mutate(
    value = case_when(
      # Staff: absent / (absent + bar + hotel).
      Variable == "percentage absent staff" ~
        value[which(Variable == "absent staff")] /
          sum(value[which(Variable %in%
            c("absent staff", "bar staff", "hotel staff"))]),
      # Residents: the same ratio over the resident categories.
      Variable == "percentage absent residents" ~
        value[which(Variable == "absent residents")] /
          sum(value[which(Variable %in%
            c("absent residents", "bar residents", "hotel residents"))]),
      # Every other row keeps its total.
      TRUE ~ value
    )
  ) %>%
  # Drop the Date grouping so later verbs are not silently applied per group.
  ungroup()
我先用 group_by(),再用 summarise() 配合 sum() 为数据框添加了总计行。但这样一来,总计层面的百分比数据是错误的(各酒店的百分比被直接相加了)。因此,我想用能得出正确结果的计算来覆盖百分比变量 'percentage absent staff'。问题在于这是一个很长的数据集,无法手动逐个修改。我正在寻找好的解决方案,比如循环或其他方法。
代码:
# Example data in long format: one row per Date x Asset x Variable.
# Two dates x two hotels x four variables = 16 rows. Each hotel reports three
# head counts plus its own absence percentage on each date.
Date <- rep(c("01/09/2020", "02/09/2020"), each = 8)
Asset <- rep(rep(c("Blue Hotel", "Green Hotel"), each = 4), times = 2)
# Every label must be a single-line string: a line break inside the literal
# would create a separate "percentage absent\nstaff" category that never
# equals "percentage absent staff".
Variable <- rep(
  c("hotel staff", "bar staff", "absent staff", "percentage absent staff"),
  times = 4
)
value <- c(5, 10, 3, 0.2, 4, 8, 2, 0.17, 5, 10, 3, 0.20, 6, 3, 3, 0.33)
df <- data.frame(Date, Asset, Variable, value)
# Create totals: add up `value` over all assets for each Date x Variable pair.
# Requires dplyr to be attached (%>%, group_by, summarise, ungroup).
# NOTE: the "percentage absent staff" rows are summed like every other row
# here, so the total for that Variable is a sum of percentages, not a ratio.
df2 <- df %>%
  group_by(Date, Variable) %>%
  # na.rm = FALSE: a missing value makes that group's total NA.
  summarise(value = sum(as.numeric(value), na.rm = FALSE)) %>%
  ungroup()
我不确定你想要哪一种计算,因为第一个“正确”的结果看起来是 absent_staff / (hotel_staff + bar_staff + absent_staff),而第二个“正确”的结果看起来是 absent_staff / (hotel_staff + bar_staff)。不过,你可以按自己的需要调整下面的解决方案。
# Totals per Date x Variable, with the percentage row recomputed from the
# summed head counts instead of being the sum of the per-hotel percentages.
# Requires dplyr to be attached (%>%, group_by, summarise, mutate, case_when).
df2 <- df %>%
  group_by(Date, Variable) %>%
  summarise(value = sum(as.numeric(value), na.rm = FALSE)) %>%
  ungroup() %>%
  # Regroup by Date only, so each group holds all totals for one day.
  group_by(Date) %>%
  mutate(
    value = case_when(
      # absent / (absent + bar + hotel). Remove "absent staff" from the
      # vector below to divide by hotel + bar staff only.
      Variable == "percentage absent staff" ~
        value[which(Variable == "absent staff")] /
          sum(value[which(Variable %in%
            c("absent staff", "bar staff", "hotel staff"))]),
      # Every other row keeps its summed value.
      TRUE ~ value
    )
  ) %>%
  # Drop the Date grouping so later verbs are not silently applied per group.
  ungroup()
df2
# # A tibble: 8 x 3
#   Date       Variable                value
#   <chr>      <chr>                   <dbl>
# 1 01/09/2020 absent staff            5
# 2 01/09/2020 bar staff               18
# 3 01/09/2020 hotel staff             9
# 4 01/09/2020 percentage absent staff 0.156
# 5 02/09/2020 absent staff            6
# 6 02/09/2020 bar staff               13
# 7 02/09/2020 hotel staff             11
# 8 02/09/2020 percentage absent staff 0.2
在上面的代码中,先按 Date 对汇总后的数据进行分组,然后用条件表达式替换 value。当 Variable 等于 "percentage absent staff" 时,value 会被替换为 "absent staff" 的值除以 "absent staff"、"bar staff"、"hotel staff" 三者的值之和。所以,如果你实际想要的是上面的第二种计算,只需把 "absent staff" 从这个向量中去掉即可。对于其他所有行,value 保持原值不变。
编辑
针对评论中的问题:如果同一个变量 Variable 中还有结构相同的 residents(住客)类取值,可以用同样的方式为它们再添加一个分支:
# Example data in long format: one row per Date x Asset x Variable
# (two dates x two hotels x four variables = 16 rows).
Date <- rep(c("01/09/2020", "02/09/2020"), each = 8)
Asset <- rep(rep(c("Blue Hotel", "Green Hotel"), each = 4), times = 2)
# The same four variables are reported for every Date x Asset combination.
Variable <- rep(
  c("hotel staff", "bar staff", "absent staff", "percentage absent staff"),
  times = 4
)
value <- c(5, 10, 3, 0.2, 4, 8, 2, 0.17, 5, 10, 3, 0.20, 6, 3, 3, 0.33)
df <- data.frame(Date, Asset, Variable, value)
# Build a matching set of "residents" rows: copy the staff rows, swap "staff"
# for "residents" in every Variable label, and replace the values with
# Poisson draws (lambda = 25). No seed is set here, so the draws are random.
dfr <- df
dfr[["Variable"]] <- gsub("staff", "residents", dfr[["Variable"]])
dfr[["value"]] <- rpois(n = nrow(dfr), lambda = 25)
# Stack the staff rows and the resident rows into one long data frame.
df <- bind_rows(df, dfr)
# Preview rows 1-5 and 17-21 of the combined data.
df[c(seq_len(5), 17:21), ]
# Totals: add up `value` over all assets for each Date x Variable pair.
# Requires dplyr to be attached (%>%, group_by, summarise, ungroup).
df2 <- df %>%
  group_by(Date, Variable) %>%
  # na.rm = FALSE: a missing value makes that group's total NA.
  summarise(value = sum(as.numeric(value), na.rm = FALSE)) %>%
  ungroup()
# Recompute both percentage rows from the per-day totals held in df2.
# df2 already has one row per Date x Variable, so it is used directly rather
# than being grouped and summed a second time.
# Requires dplyr to be attached (%>%, group_by, mutate, case_when, ungroup).
df2a <- df2 %>%
  # Group by Date so each group holds all totals for one day.
  group_by(Date) %>%
  mutate(
    value = case_when(
      # Staff: absent / (absent + bar + hotel).
      Variable == "percentage absent staff" ~
        value[which(Variable == "absent staff")] /
          sum(value[which(Variable %in%
            c("absent staff", "bar staff", "hotel staff"))]),
      # Residents: the same ratio over the resident categories.
      Variable == "percentage absent residents" ~
        value[which(Variable == "absent residents")] /
          sum(value[which(Variable %in%
            c("absent residents", "bar residents", "hotel residents"))]),
      # Every other row keeps its total.
      TRUE ~ value
    )
  ) %>%
  # Drop the Date grouping so later verbs are not silently applied per group.
  ungroup()