计算字符数并汇总每组的值
Count characters and summarize values per group
我似乎找不到适合我的问题的代码。
我想根据不同的条件创建组并汇总(求和、计数或长度)其他列。
我已经尝试 group_by 并根据不同的条件进行了总结,但还没有找到任何可行的方法。
我有一个 table 类似于:
# Example purchases: one row per purchased item, with the buyer's name,
# the purchase date, the item, its category and the Weight_kg value.
data <- data.frame(
  Name = rep(c("Anna", "Bella", "Camilla"), times = c(5, 3, 2)),
  Date = c(
    "1.1.2021", "1.1.2021", "2.1.2021", "3.1.2021", "3.1.2021",
    "1.1.2021", "5.1.2021", "5.1.2021", "7.1.2021", "8.1.2021"
  ),
  Item = c(
    "Apple", "Pear", "Zucini", "Apple", "Broccoli",
    "Apple", "Pear", "Apple", "Apple", "Tomato"
  ),
  Category = c(
    "Fruit", "Fruit", "Vegetable", "Fruit", "Vegetable",
    "Fruit", "Fruit", "Fruit", "Fruit", "Vegetable"
  ),
  Weight_kg = c(0.2, 0.3, 0.5, 0.4, 1.1, 1, 0.5, 0.8, 1.2, 0.5)
)
这将是我想要的输出:
# Target summary: one row per person.
desired_table <- data.frame(
  Name = c("Anna", "Bella", "Camilla"),
  Shopping_days = c(3, 2, 2),
  days_fruit = c(2, 2, 1),
  days_vegetables = c(2, 0, 1),
  Total_kg = c(2.5, 2.3, 1.7),
  Fruit_kg = c(0.9, 2.3, 1.2),
  Vegetables = c(1.6, 0, 0.5)
)
我已经尝试过许多与此代码类似但显然不起作用的代码变体:
data1 <- data %>%
group_by(Name) %>%
summarize(Shopping_days = length(unique(Date)),
days_fruit = length(unique(Date, Category='Fruit')),
days_vegetables = length(unique(Date, Category='Vegetables')),
Total_kg = sum(Weight_kg),
Fruit_kg = sum(Weight_kg, if Category=Fruit),
Vegetables_kg = sum(Weight_kg, if Category=Vegetables))
如有任何帮助,我们将不胜感激。
我建议先做三个单独的表,然后再合并
# Example purchases: one row per purchased item, with the buyer's name,
# the purchase date, the item, its category and the Weight_kg value.
data <- data.frame(
  Name = rep(c("Anna", "Bella", "Camilla"), times = c(5, 3, 2)),
  Date = c(
    "1.1.2021", "1.1.2021", "2.1.2021", "3.1.2021", "3.1.2021",
    "1.1.2021", "5.1.2021", "5.1.2021", "7.1.2021", "8.1.2021"
  ),
  Item = c(
    "Apple", "Pear", "Zucini", "Apple", "Broccoli",
    "Apple", "Pear", "Apple", "Apple", "Tomato"
  ),
  Category = c(
    "Fruit", "Fruit", "Vegetable", "Fruit", "Vegetable",
    "Fruit", "Fruit", "Fruit", "Fruit", "Vegetable"
  ),
  Weight_kg = c(0.2, 0.3, 0.5, 0.4, 1.1, 1, 0.5, 0.8, 1.2, 0.5)
)
library(tidyverse)
# Weight per person, one column per category, plus the total across categories.
tbl_kg <- data %>%
  pivot_wider(
    id_cols = Name,
    names_from = Category,
    values_from = Weight_kg,
    values_fn = sum,
    values_fill = 0,
    names_prefix = "kg_"
  ) %>%
  # Vectorised row total over the numeric (kg_*) columns.
  mutate(kg_total = rowSums(across(where(is.numeric))))

# Number of distinct shopping dates per person.
tbl_shoping <- data %>%
  group_by(Name) %>%
  summarise(shopping_days = n_distinct(Date))

# Number of distinct dates per person and category.
# Counting rows (values_fn = length on Category) counts purchased items, not
# days: Anna bought two fruits on 1.1.2021, which is still one fruit day.
tbl_category <- data %>%
  pivot_wider(
    id_cols = Name,
    names_from = Category,
    values_from = Date,
    values_fn = n_distinct,
    values_fill = 0L,
    names_prefix = "days_"
  )

# Merge the three per-person tables on Name.
l <- list(tbl_kg, tbl_shoping, tbl_category)
reduce(l, left_join, by = "Name")
#> # A tibble: 3 x 7
#>   Name    kg_Fruit kg_Vegetable kg_total shopping_days days_Fruit days_Vegetable
#>   <chr>      <dbl>        <dbl>    <dbl>         <int>      <int>          <int>
#> 1 Anna         0.9          1.6      2.5             3          2              2
#> 2 Bella        2.3          0        2.3             2          2              0
#> 3 Camilla      1.2          0.5      1.7             2          1              1
由 reprex package (v1.0.0)
于 2021 年 3 月 24 日创建
我认为没有非常简单的解决方案,因为您必须多次对数据帧进行分组和重新分组才能获得所需的输出。
你可以用一些 left_joins()
使用 dplyr
来做到这一点。
例如:
library(dplyr)
# Start from one row per person; every left_join() adds one summary column
# computed on its own filtered / de-duplicated copy of `data`.
data %>%
  distinct(Name) %>%
  left_join(
    data %>%
      distinct(Name, Date) %>%
      group_by(Name) %>%
      summarise(shopping_days = n()),
    by = "Name"
  ) %>%
  left_join(
    data %>%
      filter(Category == "Fruit") %>%
      distinct(Name, Date) %>%
      group_by(Name) %>%
      summarise(days_fruit = n()),
    by = "Name"
  ) %>%
  left_join(
    data %>%
      filter(Category == "Vegetable") %>%
      distinct(Name, Date) %>%
      group_by(Name) %>%
      summarise(days_vegetables = n()),
    by = "Name"
  ) %>%
  left_join(
    data %>%
      filter(Category == "Vegetable") %>%
      group_by(Name) %>%
      summarise(Vegetables = sum(Weight_kg)),
    by = "Name"
  ) %>%
  left_join(
    data %>%
      filter(Category == "Fruit") %>%
      group_by(Name) %>%
      summarise(Fruit_kg = sum(Weight_kg)),
    by = "Name"
  ) %>%
  left_join(
    data %>%
      group_by(Name) %>%
      summarise(Total_kg = sum(Weight_kg)),
    by = "Name"
  ) %>%
  # A person with no rows in a category (e.g. Bella and vegetables) has no
  # match in that join and would get NA; the wanted value is 0. Using 0L keeps
  # integer count columns integer and leaves double columns double.
  mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0L)))
在这里,您基本上为每个 left_join()
创建了新的数据帧,因此它们本身也可以保存为对象。
使用 group_by
和 summarise
:
library(dplyr)
# One row per person. Category-specific values are obtained by subsetting the
# column inside the group with a logical mask before aggregating.
data %>%
  group_by(Name) %>%
  summarise(
    Shopping_days = n_distinct(Date),
    # Days are counted on Date, not Item: buying the same item on two
    # different days is two days, and two items on one day is one day.
    days_fruit = n_distinct(Date[Category == "Fruit"]),
    days_vegetables = n_distinct(Date[Category == "Vegetable"]),
    Total_kg = sum(Weight_kg),
    Fruit_kg = sum(Weight_kg[Category == "Fruit"]),
    Vegetables_kg = sum(Weight_kg[Category == "Vegetable"])
  )
# Name Shopping_days days_fruit days_vegetables Total_kg Fruit_kg Vegetables_kg
# <chr> <int> <int> <int> <dbl> <dbl> <dbl>
#1 Anna 3 2 2 2.5 0.9 1.6
#2 Bella 2 2 0 2.3 2.3 0
#3 Camilla 2 1 1 1.7 1.2 0.5
只需按名称分组即可轻松获得两列:
# Per-person figures that do not depend on the category:
# number of distinct dates and the summed weight.
r1 <- summarize(
  group_by(data, Name),
  Shopping_days = n_distinct(Date),
  Total_kg = sum(Weight_kg)
)
# A tibble: 3 x 3
# Name Shopping_days Total_kg
# * <chr> <int> <dbl>
# 1 Anna 3 2.5
# 2 Bella 2 2.3
# 3 Camilla 2 1.7
其余列可以按名称和类别分组获得。
# Per person and category: distinct dates and summed weight.
# .groups = "drop" returns an ungrouped tibble; otherwise the result stays
# grouped by Name and summarize() prints a message about the grouping.
r2 <- data %>%
  group_by(Name, Category) %>%
  summarize(
    days = n_distinct(Date),
    weight = sum(Weight_kg),
    .groups = "drop"
  )
# A tibble: 5 x 4
#   Name    Category   days weight
#   <chr>   <chr>     <int>  <dbl>
# 1 Anna    Fruit         2    0.9
# 2 Anna    Vegetable     2    1.6
# 3 Bella   Fruit         2    2.3
# 4 Camilla Fruit         1    1.2
# 5 Camilla Vegetable     1    0.5
但最终结果不是您想要的格式。
因此,先对 r2 使用 pivot_wider,然后以 Name 列作为键,把它与第一个结果 left_join:
# Spread the per-category rows into columns (days_<Category>,
# weight_<Category>); combinations that never occur are filled with 0.
r2 <- r2 %>%
  pivot_wider(
    names_from = "Category",
    values_from = c("days", "weight"),
    values_fill = 0
  )
# Join on an explicit key instead of relying on the implicit common column.
result <- r1 %>% left_join(r2, by = "Name")
# A tibble: 3 x 7
# Name Shopping_days Total_kg days_Fruit days_Vegetable weight_Fruit weight_Vegetable
# <chr> <int> <dbl> <int> <int> <dbl> <dbl>
#1 Anna 3 2.5 2 2 0.9 1.6
#2 Bella 2 2.3 2 0 2.3 0
#3 Camilla 2 1.7 1 1 1.2 0.5
有点难看的基本解决方案(但仍然很通用):
# Per-person report in base R: for every category, and for all purchases
# combined ("Total"), the number of distinct dates and the summed weight.
#
# The earlier version removed rows with a repeated Date before aggregating, so
# a second item bought on an already-seen day lost both its weight and its
# category (Anna's fruit came out as 0.6 kg instead of 0.9 kg, and her
# vegetable days as 1 instead of 2). Dates are now de-duplicated within each
# group of rows and weights are summed over all rows.
categories <- sort(unique(df$Category))

# Distinct dates and total weight for a set of purchase rows. An empty set
# gives 0 for both, so categories a person never bought need no lookup table.
summarise_rows <- function(rows) {
  c(days = length(unique(rows$Date)), kg = sum(rows$Weight_kg))
}

# One-row data frame describing a single person's purchases.
report_for <- function(person) {
  per_category <- lapply(categories, function(category) {
    summarise_rows(person[person$Category == category, , drop = FALSE])
  })
  names(per_category) <- categories
  # "Total" is computed on all rows, so Total.days is the number of distinct
  # shopping days rather than the sum of the per-category day counts.
  stats <- c(per_category, list(Total = summarise_rows(person)))
  # Keep the column groups in alphabetical order (Fruit, Total, Vegetable).
  stats <- stats[order(names(stats))]
  # unlist() produces names such as "Fruit.days" and "Fruit.kg".
  data.frame(Name = person$Name[1], as.list(unlist(stats)))
}

# Create the report:
data.frame(
  do.call(rbind, lapply(split(df, df$Name), report_for)),
  row.names = NULL
)
数据:
# Example purchases used by the base R report: one row per purchased item.
# (Single assignment; the duplicated `df <- df <-` was redundant.)
df <- data.frame(
  Name = rep(c("Anna", "Bella", "Camilla"), times = c(5, 3, 2)),
  Date = c(
    "1.1.2021", "1.1.2021", "2.1.2021", "3.1.2021", "3.1.2021",
    "1.1.2021", "5.1.2021", "5.1.2021", "7.1.2021", "8.1.2021"
  ),
  Item = c(
    "Apple", "Pear", "Zucini", "Apple", "Broccoli",
    "Apple", "Pear", "Apple", "Apple", "Tomato"
  ),
  Category = c(
    "Fruit", "Fruit", "Vegetable", "Fruit", "Vegetable",
    "Fruit", "Fruit", "Fruit", "Fruit", "Vegetable"
  ),
  Weight_kg = c(0.2, 0.3, 0.5, 0.4, 1.1, 1, 0.5, 0.8, 1.2, 0.5)
)
使用 data.table
的替代方案
library(data.table)
setDT(data)
# Aggregate directly by Name: j returns one row per group, so there is no need
# to first add the summaries as columns of `data` with `:=` and then pick the
# first row of every group. `data` keeps its original five columns.
data[, .(
  Shopping_days = uniqueN(Date),
  day_fruits = uniqueN(Date[Category == "Fruit"]),
  day_vegetable = uniqueN(Date[Category == "Vegetable"]),
  Total_kg = sum(Weight_kg),
  Fruit_kg = sum(Weight_kg[Category == "Fruit"]),
  Vegetable_kg = sum(Weight_kg[Category == "Vegetable"])
), by = Name]
Name Shopping_days day_fruits day_vegetable Total_kg Fruit_kg Vegetable_kg
1: Anna 3 2 2 2.5 0.9 1.6
2: Bella 2 2 0 2.3 2.3 0.0
3: Camilla 2 1 1 1.7 1.2 0.5
我似乎找不到适合我的问题的代码。 我想根据不同的条件创建组并汇总(求和、计数或长度)其他列。
我已经尝试 group_by 并根据不同的条件进行了总结,但还没有找到任何可行的方法。
我有一个 table 类似于:
# Example purchases: one row per purchased item, with the buyer's name,
# the purchase date, the item, its category and the Weight_kg value.
data <- data.frame(
  Name = rep(c("Anna", "Bella", "Camilla"), times = c(5, 3, 2)),
  Date = c(
    "1.1.2021", "1.1.2021", "2.1.2021", "3.1.2021", "3.1.2021",
    "1.1.2021", "5.1.2021", "5.1.2021", "7.1.2021", "8.1.2021"
  ),
  Item = c(
    "Apple", "Pear", "Zucini", "Apple", "Broccoli",
    "Apple", "Pear", "Apple", "Apple", "Tomato"
  ),
  Category = c(
    "Fruit", "Fruit", "Vegetable", "Fruit", "Vegetable",
    "Fruit", "Fruit", "Fruit", "Fruit", "Vegetable"
  ),
  Weight_kg = c(0.2, 0.3, 0.5, 0.4, 1.1, 1, 0.5, 0.8, 1.2, 0.5)
)
这将是我想要的输出:
# Target summary: one row per person.
desired_table <- data.frame(
  Name = c("Anna", "Bella", "Camilla"),
  Shopping_days = c(3, 2, 2),
  days_fruit = c(2, 2, 1),
  days_vegetables = c(2, 0, 1),
  Total_kg = c(2.5, 2.3, 1.7),
  Fruit_kg = c(0.9, 2.3, 1.2),
  Vegetables = c(1.6, 0, 0.5)
)
我已经尝试过许多与此代码类似但显然不起作用的代码变体:
data1 <- data %>%
group_by(Name) %>%
summarize(Shopping_days = length(unique(Date)),
days_fruit = length(unique(Date, Category='Fruit')),
days_vegetables = length(unique(Date, Category='Vegetables')),
Total_kg = sum(Weight_kg),
Fruit_kg = sum(Weight_kg, if Category=Fruit),
Vegetables_kg = sum(Weight_kg, if Category=Vegetables))
如有任何帮助,我们将不胜感激。
我建议先做三个单独的表,然后再合并
# Example purchases: one row per purchased item, with the buyer's name,
# the purchase date, the item, its category and the Weight_kg value.
data <- data.frame(
  Name = rep(c("Anna", "Bella", "Camilla"), times = c(5, 3, 2)),
  Date = c(
    "1.1.2021", "1.1.2021", "2.1.2021", "3.1.2021", "3.1.2021",
    "1.1.2021", "5.1.2021", "5.1.2021", "7.1.2021", "8.1.2021"
  ),
  Item = c(
    "Apple", "Pear", "Zucini", "Apple", "Broccoli",
    "Apple", "Pear", "Apple", "Apple", "Tomato"
  ),
  Category = c(
    "Fruit", "Fruit", "Vegetable", "Fruit", "Vegetable",
    "Fruit", "Fruit", "Fruit", "Fruit", "Vegetable"
  ),
  Weight_kg = c(0.2, 0.3, 0.5, 0.4, 1.1, 1, 0.5, 0.8, 1.2, 0.5)
)
library(tidyverse)
# Weight per person, one column per category, plus the total across categories.
tbl_kg <- data %>%
  pivot_wider(
    id_cols = Name,
    names_from = Category,
    values_from = Weight_kg,
    values_fn = sum,
    values_fill = 0,
    names_prefix = "kg_"
  ) %>%
  # Vectorised row total over the numeric (kg_*) columns.
  mutate(kg_total = rowSums(across(where(is.numeric))))

# Number of distinct shopping dates per person.
tbl_shoping <- data %>%
  group_by(Name) %>%
  summarise(shopping_days = n_distinct(Date))

# Number of distinct dates per person and category.
# Counting rows (values_fn = length on Category) counts purchased items, not
# days: Anna bought two fruits on 1.1.2021, which is still one fruit day.
tbl_category <- data %>%
  pivot_wider(
    id_cols = Name,
    names_from = Category,
    values_from = Date,
    values_fn = n_distinct,
    values_fill = 0L,
    names_prefix = "days_"
  )

# Merge the three per-person tables on Name.
l <- list(tbl_kg, tbl_shoping, tbl_category)
reduce(l, left_join, by = "Name")
#> # A tibble: 3 x 7
#>   Name    kg_Fruit kg_Vegetable kg_total shopping_days days_Fruit days_Vegetable
#>   <chr>      <dbl>        <dbl>    <dbl>         <int>      <int>          <int>
#> 1 Anna         0.9          1.6      2.5             3          2              2
#> 2 Bella        2.3          0        2.3             2          2              0
#> 3 Camilla      1.2          0.5      1.7             2          1              1
由 reprex package (v1.0.0)
于 2021 年 3 月 24 日创建
我认为没有非常简单的解决方案,因为您必须多次对数据帧进行分组和重新分组才能获得所需的输出。
你可以用一些 left_joins()
使用 dplyr
来做到这一点。
例如:
library(dplyr)
# Start from one row per person; every left_join() adds one summary column
# computed on its own filtered / de-duplicated copy of `data`.
data %>%
  distinct(Name) %>%
  left_join(
    data %>%
      distinct(Name, Date) %>%
      group_by(Name) %>%
      summarise(shopping_days = n()),
    by = "Name"
  ) %>%
  left_join(
    data %>%
      filter(Category == "Fruit") %>%
      distinct(Name, Date) %>%
      group_by(Name) %>%
      summarise(days_fruit = n()),
    by = "Name"
  ) %>%
  left_join(
    data %>%
      filter(Category == "Vegetable") %>%
      distinct(Name, Date) %>%
      group_by(Name) %>%
      summarise(days_vegetables = n()),
    by = "Name"
  ) %>%
  left_join(
    data %>%
      filter(Category == "Vegetable") %>%
      group_by(Name) %>%
      summarise(Vegetables = sum(Weight_kg)),
    by = "Name"
  ) %>%
  left_join(
    data %>%
      filter(Category == "Fruit") %>%
      group_by(Name) %>%
      summarise(Fruit_kg = sum(Weight_kg)),
    by = "Name"
  ) %>%
  left_join(
    data %>%
      group_by(Name) %>%
      summarise(Total_kg = sum(Weight_kg)),
    by = "Name"
  ) %>%
  # A person with no rows in a category (e.g. Bella and vegetables) has no
  # match in that join and would get NA; the wanted value is 0. Using 0L keeps
  # integer count columns integer and leaves double columns double.
  mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0L)))
在这里,您基本上为每个 left_join()
创建了新的数据帧,因此它们本身也可以保存为对象。
使用 group_by
和 summarise
:
library(dplyr)
# One row per person. Category-specific values are obtained by subsetting the
# column inside the group with a logical mask before aggregating.
data %>%
  group_by(Name) %>%
  summarise(
    Shopping_days = n_distinct(Date),
    # Days are counted on Date, not Item: buying the same item on two
    # different days is two days, and two items on one day is one day.
    days_fruit = n_distinct(Date[Category == "Fruit"]),
    days_vegetables = n_distinct(Date[Category == "Vegetable"]),
    Total_kg = sum(Weight_kg),
    Fruit_kg = sum(Weight_kg[Category == "Fruit"]),
    Vegetables_kg = sum(Weight_kg[Category == "Vegetable"])
  )
# Name Shopping_days days_fruit days_vegetables Total_kg Fruit_kg Vegetables_kg
# <chr> <int> <int> <int> <dbl> <dbl> <dbl>
#1 Anna 3 2 2 2.5 0.9 1.6
#2 Bella 2 2 0 2.3 2.3 0
#3 Camilla 2 1 1 1.7 1.2 0.5
只需按名称分组即可轻松获得两列:
# Per-person figures that do not depend on the category:
# number of distinct dates and the summed weight.
r1 <- summarize(
  group_by(data, Name),
  Shopping_days = n_distinct(Date),
  Total_kg = sum(Weight_kg)
)
# A tibble: 3 x 3
# Name Shopping_days Total_kg
# * <chr> <int> <dbl>
# 1 Anna 3 2.5
# 2 Bella 2 2.3
# 3 Camilla 2 1.7
其余列可以按名称和类别分组获得。
# Per person and category: distinct dates and summed weight.
# .groups = "drop" returns an ungrouped tibble; otherwise the result stays
# grouped by Name and summarize() prints a message about the grouping.
r2 <- data %>%
  group_by(Name, Category) %>%
  summarize(
    days = n_distinct(Date),
    weight = sum(Weight_kg),
    .groups = "drop"
  )
# A tibble: 5 x 4
#   Name    Category   days weight
#   <chr>   <chr>     <int>  <dbl>
# 1 Anna    Fruit         2    0.9
# 2 Anna    Vegetable     2    1.6
# 3 Bella   Fruit         2    2.3
# 4 Camilla Fruit         1    1.2
# 5 Camilla Vegetable     1    0.5
但最终结果不是您想要的格式。
因此,先对 r2 使用 pivot_wider,然后以 Name 列作为键,把它与第一个结果 left_join:
# Spread the per-category rows into columns (days_<Category>,
# weight_<Category>); combinations that never occur are filled with 0.
r2 <- r2 %>%
  pivot_wider(
    names_from = "Category",
    values_from = c("days", "weight"),
    values_fill = 0
  )
# Join on an explicit key instead of relying on the implicit common column.
result <- r1 %>% left_join(r2, by = "Name")
# A tibble: 3 x 7
# Name Shopping_days Total_kg days_Fruit days_Vegetable weight_Fruit weight_Vegetable
# <chr> <int> <dbl> <int> <int> <dbl> <dbl>
#1 Anna 3 2.5 2 2 0.9 1.6
#2 Bella 2 2.3 2 0 2.3 0
#3 Camilla 2 1.7 1 1 1.2 0.5
有点难看的基本解决方案(但仍然很通用):
# Per-person report in base R: for every category, and for all purchases
# combined ("Total"), the number of distinct dates and the summed weight.
#
# The earlier version removed rows with a repeated Date before aggregating, so
# a second item bought on an already-seen day lost both its weight and its
# category (Anna's fruit came out as 0.6 kg instead of 0.9 kg, and her
# vegetable days as 1 instead of 2). Dates are now de-duplicated within each
# group of rows and weights are summed over all rows.
categories <- sort(unique(df$Category))

# Distinct dates and total weight for a set of purchase rows. An empty set
# gives 0 for both, so categories a person never bought need no lookup table.
summarise_rows <- function(rows) {
  c(days = length(unique(rows$Date)), kg = sum(rows$Weight_kg))
}

# One-row data frame describing a single person's purchases.
report_for <- function(person) {
  per_category <- lapply(categories, function(category) {
    summarise_rows(person[person$Category == category, , drop = FALSE])
  })
  names(per_category) <- categories
  # "Total" is computed on all rows, so Total.days is the number of distinct
  # shopping days rather than the sum of the per-category day counts.
  stats <- c(per_category, list(Total = summarise_rows(person)))
  # Keep the column groups in alphabetical order (Fruit, Total, Vegetable).
  stats <- stats[order(names(stats))]
  # unlist() produces names such as "Fruit.days" and "Fruit.kg".
  data.frame(Name = person$Name[1], as.list(unlist(stats)))
}

# Create the report:
data.frame(
  do.call(rbind, lapply(split(df, df$Name), report_for)),
  row.names = NULL
)
数据:
# Example purchases used by the base R report: one row per purchased item.
# (Single assignment; the duplicated `df <- df <-` was redundant.)
df <- data.frame(
  Name = rep(c("Anna", "Bella", "Camilla"), times = c(5, 3, 2)),
  Date = c(
    "1.1.2021", "1.1.2021", "2.1.2021", "3.1.2021", "3.1.2021",
    "1.1.2021", "5.1.2021", "5.1.2021", "7.1.2021", "8.1.2021"
  ),
  Item = c(
    "Apple", "Pear", "Zucini", "Apple", "Broccoli",
    "Apple", "Pear", "Apple", "Apple", "Tomato"
  ),
  Category = c(
    "Fruit", "Fruit", "Vegetable", "Fruit", "Vegetable",
    "Fruit", "Fruit", "Fruit", "Fruit", "Vegetable"
  ),
  Weight_kg = c(0.2, 0.3, 0.5, 0.4, 1.1, 1, 0.5, 0.8, 1.2, 0.5)
)
使用 data.table
的替代方案
library(data.table)
setDT(data)
# Aggregate directly by Name: j returns one row per group, so there is no need
# to first add the summaries as columns of `data` with `:=` and then pick the
# first row of every group. `data` keeps its original five columns.
data[, .(
  Shopping_days = uniqueN(Date),
  day_fruits = uniqueN(Date[Category == "Fruit"]),
  day_vegetable = uniqueN(Date[Category == "Vegetable"]),
  Total_kg = sum(Weight_kg),
  Fruit_kg = sum(Weight_kg[Category == "Fruit"]),
  Vegetable_kg = sum(Weight_kg[Category == "Vegetable"])
), by = Name]
Name Shopping_days day_fruits day_vegetable Total_kg Fruit_kg Vegetable_kg
1: Anna 3 2 2 2.5 0.9 1.6
2: Bella 2 2 0 2.3 2.3 0.0
3: Camilla 2 1 1 1.7 1.2 0.5