计算字符数并汇总每组的值

Count characters and summarize values per group

我似乎可以为我的问题找到合适的代码。 我想根据不同的条件创建组并汇总(求和、计数或长度)其他列。

我已经尝试 group_by 并根据不同的条件进行了总结,但还没有找到任何可行的方法。

我有一个 table 类似于:

data <- data.frame(Name= c('Anna', 'Anna', 'Anna', 'Anna', 'Anna',
                       'Bella', 'Bella', 'Bella', 'Camilla', 'Camilla'),
               Date= c('1.1.2021', '1.1.2021', '2.1.2021', '3.1.2021', '3.1.2021', 
                       '1.1.2021', '5.1.2021', '5.1.2021', '7.1.2021', '8.1.2021'),
               Item= c('Apple','Pear', 'Zucini','Apple', 'Broccoli',
                       'Apple','Pear','Apple','Apple', 'Tomato'),
               Category= c('Fruit', 'Fruit', 'Vegetable', 'Fruit', 'Vegetable', 
                           'Fruit', 'Fruit', 'Fruit', 'Fruit', 'Vegetable'),
               Weight_kg= c(0.2,0.3,0.5,0.4,1.1,
                            1,0.5,0.8,1.2,0.5)
               )

这将是我想要的输出:

desired_table <- data.frame(Name=c('Anna', 'Bella', 'Camilla'),
Shopping_days=c(3,2,2),
days_fruit=c(2,2,1),
days_vegetables=c(2,0,1),
Total_kg=c(2.5,2.3,1.7),
Fruit_kg=c(0.9,2.3,1.2),
Vegetables=c(1.6,0,0.5))

我已经尝试过许多与此代码类似但显然不起作用的代码变体:

data1 <- data %>%
group_by(Name) %>%
summarize(Shopping_days = length(unique(Date)),
days_fruit = length(unique(Date, Category='Fruit')),
days_vegetables = length(unique(Date, Category='Vegetables')),
Total_kg = sum(Weight_kg),
Fruit_kg = sum(Weight_kg, if Category=Fruit),
Vegetables_kg = sum(Weight_kg, if Category=Vegetables))

如有任何帮助,我们将不胜感激。

我建议先做三个单独的表,然后再合并

data <- data.frame(Name= c('Anna', 'Anna', 'Anna', 'Anna', 'Anna',
                           'Bella', 'Bella', 'Bella', 'Camilla', 'Camilla'),
                   Date= c('1.1.2021', '1.1.2021', '2.1.2021', '3.1.2021', '3.1.2021', 
                           '1.1.2021', '5.1.2021', '5.1.2021', '7.1.2021', '8.1.2021'),
                   Item= c('Apple','Pear', 'Zucini','Apple', 'Broccoli',
                           'Apple','Pear','Apple','Apple', 'Tomato'),
                   Category= c('Fruit', 'Fruit', 'Vegetable', 'Fruit', 'Vegetable', 
                               'Fruit', 'Fruit', 'Fruit', 'Fruit', 'Vegetable'),
                   Weight_kg= c(0.2,0.3,0.5,0.4,1.1,
                                1,0.5,0.8,1.2,0.5)
)

library(tidyverse)

tbl_kg <- data %>% 
  pivot_wider(
    id_cols = Name, 
    names_from = Category, 
    values_from = Weight_kg, 
    values_fill = 0, 
    values_fn = sum,
    names_prefix = "kg_"
    ) %>% 
  rowwise() %>% 
  mutate(kg_total = sum(c_across(where(is.numeric)))) %>% 
  ungroup()

tbl_shoping <- data %>% 
  group_by(Name) %>% 
  summarise(shopping_days = n_distinct(Date))

tbl_category <- data %>% 
  pivot_wider(
    id_cols = Name, 
    names_from = Category, 
    values_from = Category, 
    values_fn = length,
    values_fill = 0,
    names_prefix = "days_"
    )

l <- list(tbl_kg, tbl_shoping, tbl_category)

reduce(l, left_join, by = "Name")
#> # A tibble: 3 x 7
#>   Name    kg_Fruit kg_Vegetable kg_total shopping_days days_Fruit days_Vegetable
#>   <chr>      <dbl>        <dbl>    <dbl>         <int>      <int>          <int>
#> 1 Anna         0.9          1.6      2.5             3          3              2
#> 2 Bella        2.3          0        2.3             2          3              0
#> 3 Camilla      1.2          0.5      1.7             2          1              1

reprex package (v1.0.0)

于 2021 年 3 月 24 日创建

我认为没有非常简单的解决方案,因为您必须多次对数据帧进行分组和重新分组才能获得所需的输出。

你可以用一些 left_joins() 使用 dplyr 来做到这一点。

例如:

library(dplyr)

data %>% distinct(Name) %>% 
  left_join(
    data %>% select(Name, Date) %>% 
      distinct() %>% 
      group_by(Name) %>% 
      summarise(shopping_days = n()), by = 'Name') %>% 
  left_join(
    data %>% filter(Category == "Fruit") %>%
      select(Name, Date) %>% 
      distinct() %>% 
      group_by(Name) %>% 
      summarise(days_fruit = n()), by = 'Name') %>% 
  left_join(
    data %>% filter(Category == "Vegetable") %>%
      select(Name, Date) %>% 
      distinct() %>% 
      group_by(Name) %>% 
      summarise(days_vegetables = n()), by = 'Name') %>% 
  left_join(
    data %>% filter(Category == "Vegetable") %>% 
      group_by(Name) %>% 
      summarise(Vegetables = sum(Weight_kg)), by= 'Name') %>% 
  left_join(
    data %>% filter(Category == "Fruit") %>% 
      group_by(Name) %>% 
      summarise(Fruit_kg = sum(Weight_kg)), by= 'Name') %>% 
  left_join(
    data %>% 
      group_by(Name) %>% 
      summarise(Total_kg = sum(Weight_kg)), by= 'Name')

在这里,您基本上为每个 left_join() 创建了新的数据帧,因此它们本身也可以保存为对象。

使用 group_bysummarise :

library(dplyr)

data %>%
  group_by(Name) %>%
  summarise(Shopping_days = n_distinct(Date), 
            days_fruit = n_distinct(Item[Category == 'Fruit']), 
            days_vegetables = n_distinct(Item[Category == 'Vegetable']), 
            Total_kg = sum(Weight_kg), 
            Fruit_kg = sum(Weight_kg[Category == 'Fruit']), 
            Vegetables_kg = sum(Weight_kg[Category == 'Vegetable']))

#  Name    Shopping_days days_fruit days_vegetables Total_kg Fruit_kg Vegetables_kg
#  <chr>           <int>      <int>           <int>    <dbl>    <dbl>         <dbl>
#1 Anna                3          2               2      2.5      0.9           1.6
#2 Bella               2          2               0      2.3      2.3           0  
#3 Camilla             2          1               1      1.7      1.2           0.5

只需按名称分组即可轻松获得两列:

r1 <- data %>%
  group_by(Name) %>%
  summarize(Shopping_days = n_distinct(Date),
            Total_kg = sum(Weight_kg))

#  A tibble: 3 x 3
#    Name    Shopping_days Total_kg
#  * <chr>           <int>    <dbl>
#  1 Anna                3      2.5
#  2 Bella               2      2.3
#  3 Camilla             2      1.7

其余列可以按名称和类别分组获得。

r2 <- data %>% 
  group_by(Name, Category) %>% 
  summarize(
    days = n_distinct(Date),
    weight = sum(Weight_kg)
  ) 

# A tibble: 5 x 4
#  Groups:   Name [3]
#   Name    Category   days weight
#   <chr>   <chr>     <int>  <dbl>
# 1 Anna    Fruit         2    0.9
# 2 Anna    Vegetable     2    1.6
# 3 Bella   Fruit         2    2.3
# 4 Camilla Fruit         1    1.2
# 5 Camilla Vegetable     1    0.5

但最终结果不是您想要的格式。

因此,首先让 pivot_wider r2 和第一个结果比 left_join,使用名称列作为键:

r2 <- r2 %>% 
  pivot_wider(names_from = 'Category', values_from = c('days', 'weight'), values_fill = 0)

result = r1 %>% left_join(r2)


# A tibble: 3 x 7
#  Name    Shopping_days Total_kg days_Fruit days_Vegetable weight_Fruit weight_Vegetable
#  <chr>           <int>    <dbl>      <int>          <int>        <dbl>            <dbl>
#1 Anna                3      2.5          2              2          0.9              1.6
#2 Bella               2      2.3          2              0          2.3              0  
#3 Camilla             2      1.7          1              1          1.2              0.5

有点难看的基本解决方案(但仍然很通用):

# Create a lookup table to be used to ensure every category is represented:
lookup_df <- data.frame(
  Category = unique(df$Category), 
  Weight_kg.days = rep(0, length(unique(df$Category))), 
  Weight_kg.kg = rep(0, length(unique(df$Category)))
)

# Create the report:
data.frame(do.call(rbind, Map(function(r){
  s <- subset(r, !duplicated(Date), select = -Item)
  v <- do.call(data.frame, aggregate(Weight_kg~Category, s,
        FUN = function(u){c(days = length(u), kg = sum(u))}))
  w <- rbind(v, subset(lookup_df, !(Category %in% v$Category)))
  col_idx <- vapply(w, is.numeric, logical(1))
  x <- rbind(w, data.frame(Category = "Total", t(colSums(w[,col_idx]))))
  z <- do.call(cbind, lapply(with(x, split(x[,col_idx], Category)), function(y){
        setNames(y, paste0(y$Category, c("days", "kg")))}))
  cbind(Name = unique(s$Name), z)
}, with(df, split(df, Name)))), row.names = NULL)

数据:

df <- df <- data.frame(Name= c('Anna', 'Anna', 'Anna', 'Anna', 'Anna',
                               'Bella', 'Bella', 'Bella', 'Camilla', 'Camilla'),
                       Date= c('1.1.2021', '1.1.2021', '2.1.2021', '3.1.2021', '3.1.2021', 
                               '1.1.2021', '5.1.2021', '5.1.2021', '7.1.2021', '8.1.2021'),
                       Item= c('Apple','Pear', 'Zucini','Apple', 'Broccoli',
                               'Apple','Pear','Apple','Apple', 'Tomato'),
                       Category= c('Fruit', 'Fruit', 'Vegetable', 'Fruit', 'Vegetable', 
                                   'Fruit', 'Fruit', 'Fruit', 'Fruit', 'Vegetable'),
                       Weight_kg= c(0.2,0.3,0.5,0.4,1.1,
                                    1,0.5,0.8,1.2,0.5)
)

使用 data.table

的替代方案
library(data.table)
setDT(data)
#Computing the necessary columns by Name
data[, `:=`(Shopping_days = uniqueN(Date),
            day_fruits = uniqueN(Date[Category=="Fruit"]),
            day_vegetable = uniqueN(Date[Category=="Vegetable"]),
            Total_kg = sum(Weight_kg),
            Fruit_kg = sum(Weight_kg[Category=="Fruit"]),
            Vegetable_kg = sum(Weight_kg[Category=="Vegetable"])), Name]

#Taking the first element of the computed columns by Name
data[,.SD[1, .(Shopping_days,day_fruits,day_vegetable,Total_kg,Fruit_kg,Vegetable_kg)], Name]

      Name Shopping_days day_fruits day_vegetable Total_kg Fruit_kg Vegetable_kg
1:    Anna             3          2             2      2.5      0.9          1.6
2:   Bella             2          2             0      2.3      2.3          0.0
3: Camilla             2          1             1      1.7      1.2          0.5