如何在 R 数据框中添加子计数

Question

我在 R 中有下面提到的数据框：

dput(df)

structure(list(CustID = c("C-1", "C-2", 
"C-2", "C-2", "C-3", "C-3", 
"C-3", "C-4", "C-5"), DATE = c("2021-01-02 14:13:10", "2021-01-02 13:17:07", "2021-01-02 14:15:10", "2021-01-02 16:14:08", "2021-01-02 17:11:03", "2021-01-02 12:14:24", "2021-01-02 12:33:34", "2021-01-02 10:43:55", "2021-01-03 20:23:35"), TYPE = c("Demo", 
"Pro", "Pro", "Pro", "Pro", "Pro", "Pro", "Pro", "Pro"
), LogCat = c(NA, "SPR,DET,RTD", "SPR,DET,RTD", "SPR,DET,RTD", 
"DET", "DET", "DET", NA, " SPR, RTD "), PriceCode = c(NA,"KR", "SR", "DE", "KL", "ZT", "KR", "KR", "KR")), class = "data.frame", row.names = c(NA, 
-9L))

CusID     Date                   Type        LogCat             PriceCode
C-1       2021-01-02 14:13:10    Demo                           
C-2       2021-01-02 13:17:07    Pro         SPR, DET, RTD      KR
C-2       2021-01-02 14:15:10    Pro         SPR, DET, RTD      SR
C-2       2021-01-02 16:14:08    Pro         SPR, DET, RTD      DE
C-3       2021-01-02 17:11:03    Pro         DET                KL
C-3       2021-01-02 12:14:24    Pro         DET                ZT
C-3       2021-01-02 12:33:34    Pro         DET                KR
C-4       2021-01-02 10:43:55    Pro                            KR
C-5       2021-01-03 20:23:35    Pro         SPR, RTD           KR

我正在使用下面的代码，它给出了下面提到的输出。

library(tidyverse)
library(janitor)
df %>% mutate(DATE = as.Date(DATE)) %>% select(1:3) %>%
  unique() %>%
  tabyl(TYPE, DATE) %>%
  adorn_totals("row") %>%
  adorn_percentages("col") %>%
  adorn_pct_formatting(2) %>%
  adorn_ns("front")

  TYPE  2021-01-02  2021-01-03
  Demo 1  (25.00%) 0   (0.00%)
   Pro 3  (75.00%) 1 (100.00%)
 Total 4 (100.00%) 1 (100.00%)

在上面提到的输出中，我需要基于列 LogCat 和 PriceCode 的不同子计数。例如，在给出特定 Type 的总不同计数时，我需要得到有多少不同 CusID 具有 LogCat 和 PriceCode。另外，有多少不同的计数没有 LogCat 和 PriceCode.

其中，对于每种类型，我们需要检查以下条件。

LogCat - 唯一的 CusID 仅在 LogCat 列中具有值，并且 PriceCode 为 NA 或 Blank
PriceCode - 唯一的 CusID 仅在 PriceCode 列中具有值，并且 LogCat 为 NA 或空白
Both True - 唯一的 CusID 在 LogCat 和 PriceCode 列中都有值。
Both False - 唯一的 CusID 在 LogCat 和 PriceCode 列中都没有值。

这些值的百分比应该通过计算我们计算这些值的特定 Type 的非重复计数来计算。例如，在 Demo 下面，如果我们计算 Demo 类型的这些值，则第一个日期的分母应为 1，第二个日期的分母应为 0。

需要数据框<-

  TYPE           2021-01-02   2021-01-03
  Demo           1 (25.00%)   0 (0.00%)
  LogCat         0 (0.00%)    0 (0.00%)
  PriceCode      0 (0.00%)    0 (0.00%)
  Both True      0 (0.00%)    0 (0.00%)
  Both False     1 (100.00%)  0 (0.00%) 
  Pro            3 (75.00%)   1 (100.00%)
  LogCat         0 (0.00%)    0 (0.00%)
  PriceCode      1 (33.33%)   0 (0.00%)
  Both True      2 (66.66%)   1 (100.00%)
  Both False     0 (0.00%)    0 (0.00%)
  Total          4 (100.00%)  1 (100.00%)

Answer 1

包 pivottabler 可能对这里有帮助

df %>% mutate(DATE = as.Date(DATE)) %>%
  mutate(across(.cols = c(LogCat, PriceCode), ~ +!(is.na(.) | . == ""))) %>%
  mutate(dummy = case_when(LogCat + PriceCode == 0 ~ "Both False",
                           LogCat + PriceCode == 2 ~ "Both True",
                           LogCat == 1 ~ "Logcat",
                           TRUE ~ "PriceCode")) %>%
  mutate(dummy = factor(dummy, levels = c("Both False", "Both True", "Logcat", "PriceCode")),
         d2 = 1) %>%
  group_by(CustID, DATE, TYPE) %>%
  complete(dummy) %>% ungroup() %>% unique() %>%
  select(c(2:4,7)) -> df1

library(pivottabler)  

qpvt(df1, c("TYPE", "dummy"), "DATE", "sum(d2, na.rm = T)")

                   2021-01-02  2021-01-03  Total  
Demo   Both False           1                  1  
       Both True            0                  0  
       Logcat               0                  0  
       PriceCode            0                  0  
       Total                1                  1  
Pro    Both False           0           0      0  
       Both True            2           1      3  
       Logcat               0           0      0  
       PriceCode            1           0      1  
       Total                3           1      4  
Total                       4           1      5

或

qhpvt(df1, c("TYPE", "dummy"), "DATE", "sum(d2, na.rm = T)")

Answer 2

这是我从您的 OP 中了解到的内容 - 相当复杂的逻辑 ;)

library(tidyverse)
library(scales)

# grain level data-frame
grain_df <- df %>%
  # convert Date to Date
  mutate(DATE = as.Date(DATE)) %>%
  separate_rows(LogCat) %>%
  # Categorize customer into your define logic
  group_by(DATE, TYPE, CustID) %>%
  summarize(category = case_when(
    # all LogCat is missing & Price is present
    all(is.na(LogCat)) & any(!is.na(PriceCode)) ~ "Missing_LogCat",
    # LogCat is present & Price is missing
    any(!is.na(LogCat)) & all(is.na(PriceCode)) ~ "Missing_PriceCode",
    # Both present
    any(!is.na(LogCat)) & any(!is.na(PriceCode)) ~ "Both_True",
    # Both missing
    all(is.na(LogCat)) & all(is.na(PriceCode)) ~ "Both_True",
    TRUE ~ "No category yet"
  ), .groups = "drop") %>%
  # count the customer in each category by type & date
  group_by(DATE, TYPE, category) %>%
  summarize(count = n_distinct(CustID), .groups = "drop") %>%
  # compelete the data set with all combination of Date/type/category
  complete(DATE, TYPE, category, fill = list(count = 0))

# summarize and put in some formmating
summary_df <- grain_df %>%
  # calculate the percent
  group_by(DATE, TYPE) %>%
  mutate(percent = paste0(count, " (", percent(count / sum(count)), ")")) %>%
  ungroup() %>%
  select(-count) %>%
  # Pivot wider to have the format you want
  pivot_wider(names_from = DATE, values_from = percent, values_fill = "")

# calculate total row & formatting
total_rows <- grain_df %>%
  group_by(DATE) %>%
  summarize(percent = paste0(sum(count), " (100%)")) %>%
  mutate(TYPE = "Total", category = "") %>%
  pivot_wider(names_from = DATE, values_from = percent)

# combind two data to get a final df with total rows
bind_rows(summary_df,
  total_rows)
#> # A tibble: 5 x 4
#>   TYPE  category         `2021-01-02` `2021-01-03`
#>   <chr> <chr>            <chr>        <chr>       
#> 1 Demo  "Both_True"      1 (100%)     0 (NA)      
#> 2 Demo  "Missing_LogCat" 0 (0%)       0 (NA)      
#> 3 Pro   "Both_True"      2 (67%)      1 (100%)    
#> 4 Pro   "Missing_LogCat" 1 (33%)      0 (0%)      
#> 5 Total ""               4 (100%)     1 (100%)

^{由 reprex package (v2.0.0)}

于 2021-04-18 创建

这是一种在 RStudio

中生成一些枢轴样式 table 的方法

library(kableExtra)
kbl(summary_df, align = "c") %>%
  kable_paper(full_width = F) %>%
  column_spec(1:2, bold = T) %>%
  collapse_rows(columns = 1:2, target = 1, valign = "top")

这将在 View 选项卡中输出 RStudio 此 table

如何在 R 数据框中添加子计数

How to add sub count in R dataframe

r

dataframe

dplyr

tidyverse

janitor