如何在 R 数据框中添加子计数
How to add sub count in R dataframe
我在 R 中有下面提到的数据框:
dput(df)
structure(list(CustID = c("C-1", "C-2",
"C-2", "C-2", "C-3", "C-3",
"C-3", "C-4", "C-5"), DATE = c("2021-01-02 14:13:10", "2021-01-02 13:17:07", "2021-01-02 14:15:10", "2021-01-02 16:14:08", "2021-01-02 17:11:03", "2021-01-02 12:14:24", "2021-01-02 12:33:34", "2021-01-02 10:43:55", "2021-01-03 20:23:35"), TYPE = c("Demo",
"Pro", "Pro", "Pro", "Pro", "Pro", "Pro", "Pro", "Pro"
), LogCat = c(NA, "SPR,DET,RTD", "SPR,DET,RTD", "SPR,DET,RTD",
"DET", "DET", "DET", NA, " SPR, RTD "), PriceCode = c(NA,"KR", "SR", "DE", "KL", "ZT", "KR", "KR", "KR")), class = "data.frame", row.names = c(NA,
-9L))
CusID Date Type LogCat PriceCode
C-1 2021-01-02 14:13:10 Demo
C-2 2021-01-02 13:17:07 Pro SPR, DET, RTD KR
C-2 2021-01-02 14:15:10 Pro SPR, DET, RTD SR
C-2 2021-01-02 16:14:08 Pro SPR, DET, RTD DE
C-3 2021-01-02 17:11:03 Pro DET KL
C-3 2021-01-02 12:14:24 Pro DET ZT
C-3 2021-01-02 12:33:34 Pro DET KR
C-4 2021-01-02 10:43:55 Pro KR
C-5 2021-01-03 20:23:35 Pro SPR, RTD KR
我正在使用下面的代码,它给出了下面提到的输出。
library(tidyverse)
library(janitor)
df %>% mutate(DATE = as.Date(DATE)) %>% select(1:3) %>%
unique() %>%
tabyl(TYPE, DATE) %>%
adorn_totals("row") %>%
adorn_percentages("col") %>%
adorn_pct_formatting(2) %>%
adorn_ns("front")
TYPE 2021-01-02 2021-01-03
Demo 1 (25.00%) 0 (0.00%)
Pro 3 (75.00%) 1 (100.00%)
Total 4 (100.00%) 1 (100.00%)
在上面提到的输出中,我需要基于列 LogCat
和 PriceCode
的不同子计数。例如,在给出特定 Type
的总不同计数时,我需要得到有多少不同 CusID
具有 LogCat
和 PriceCode
。另外,有多少不同的计数没有 LogCat
和 PriceCode
.
其中,对于每种类型,我们需要检查以下条件。
LogCat
- 唯一的 CusID
仅在 LogCat
列中具有值,并且 PriceCode
为 NA 或 Blank
PriceCode
- 唯一的 CusID
仅在 PriceCode
列中具有值,并且 LogCat
为 NA 或空白
Both True
- 唯一的 CusID
在 LogCat
和 PriceCode
列中都有值。
Both False
- 唯一的 CusID
在 LogCat
和 PriceCode
列中都没有值。
这些值的百分比应该通过计算我们计算这些值的特定 Type
的非重复计数来计算。例如,在 Demo
下面,如果我们计算 Demo
类型的这些值,则第一个日期的分母应为 1,第二个日期的分母应为 0。
需要数据框<-
TYPE 2021-01-02 2021-01-03
Demo 1 (25.00%) 0 (0.00%)
LogCat 0 (0.00%) 0 (0.00%)
PriceCode 0 (0.00%) 0 (0.00%)
Both True 0 (0.00%) 0 (0.00%)
Both False 1 (100.00%) 0 (0.00%)
Pro 3 (75.00%) 1 (100.00%)
LogCat 0 (0.00%) 0 (0.00%)
PriceCode 1 (33.33%) 0 (0.00%)
Both True 2 (66.66%) 1 (100.00%)
Both False 0 (0.00%) 0 (0.00%)
Total 4 (100.00%) 1 (100.00%)
包 pivottabler
可能对这里有帮助
df %>% mutate(DATE = as.Date(DATE)) %>%
mutate(across(.cols = c(LogCat, PriceCode), ~ +!(is.na(.) | . == ""))) %>%
mutate(dummy = case_when(LogCat + PriceCode == 0 ~ "Both False",
LogCat + PriceCode == 2 ~ "Both True",
LogCat == 1 ~ "Logcat",
TRUE ~ "PriceCode")) %>%
mutate(dummy = factor(dummy, levels = c("Both False", "Both True", "Logcat", "PriceCode")),
d2 = 1) %>%
group_by(CustID, DATE, TYPE) %>%
complete(dummy) %>% ungroup() %>% unique() %>%
select(c(2:4,7)) -> df1
library(pivottabler)
qpvt(df1, c("TYPE", "dummy"), "DATE", "sum(d2, na.rm = T)")
2021-01-02 2021-01-03 Total
Demo Both False 1 1
Both True 0 0
Logcat 0 0
PriceCode 0 0
Total 1 1
Pro Both False 0 0 0
Both True 2 1 3
Logcat 0 0 0
PriceCode 1 0 1
Total 3 1 4
Total 4 1 5
或
qhpvt(df1, c("TYPE", "dummy"), "DATE", "sum(d2, na.rm = T)")
这是我从您的 OP 中了解到的内容 - 相当复杂的逻辑 ;)
library(tidyverse)
library(scales)
# grain level data-frame
grain_df <- df %>%
# convert Date to Date
mutate(DATE = as.Date(DATE)) %>%
separate_rows(LogCat) %>%
# Categorize customer into your define logic
group_by(DATE, TYPE, CustID) %>%
summarize(category = case_when(
# all LogCat is missing & Price is present
all(is.na(LogCat)) & any(!is.na(PriceCode)) ~ "Missing_LogCat",
# LogCat is present & Price is missing
any(!is.na(LogCat)) & all(is.na(PriceCode)) ~ "Missing_PriceCode",
# Both present
any(!is.na(LogCat)) & any(!is.na(PriceCode)) ~ "Both_True",
# Both missing
all(is.na(LogCat)) & all(is.na(PriceCode)) ~ "Both_True",
TRUE ~ "No category yet"
), .groups = "drop") %>%
# count the customer in each category by type & date
group_by(DATE, TYPE, category) %>%
summarize(count = n_distinct(CustID), .groups = "drop") %>%
# compelete the data set with all combination of Date/type/category
complete(DATE, TYPE, category, fill = list(count = 0))
# summarize and put in some formmating
summary_df <- grain_df %>%
# calculate the percent
group_by(DATE, TYPE) %>%
mutate(percent = paste0(count, " (", percent(count / sum(count)), ")")) %>%
ungroup() %>%
select(-count) %>%
# Pivot wider to have the format you want
pivot_wider(names_from = DATE, values_from = percent, values_fill = "")
# calculate total row & formatting
total_rows <- grain_df %>%
group_by(DATE) %>%
summarize(percent = paste0(sum(count), " (100%)")) %>%
mutate(TYPE = "Total", category = "") %>%
pivot_wider(names_from = DATE, values_from = percent)
# combind two data to get a final df with total rows
bind_rows(summary_df,
total_rows)
#> # A tibble: 5 x 4
#> TYPE category `2021-01-02` `2021-01-03`
#> <chr> <chr> <chr> <chr>
#> 1 Demo "Both_True" 1 (100%) 0 (NA)
#> 2 Demo "Missing_LogCat" 0 (0%) 0 (NA)
#> 3 Pro "Both_True" 2 (67%) 1 (100%)
#> 4 Pro "Missing_LogCat" 1 (33%) 0 (0%)
#> 5 Total "" 4 (100%) 1 (100%)
由 reprex package (v2.0.0)
于 2021-04-18 创建
这是一种在 RStudio
中生成一些枢轴样式 table 的方法
library(kableExtra)
kbl(summary_df, align = "c") %>%
kable_paper(full_width = F) %>%
column_spec(1:2, bold = T) %>%
collapse_rows(columns = 1:2, target = 1, valign = "top")
这将在 View
选项卡中输出 RStudio
此 table
我在 R 中有下面提到的数据框:
dput(df)
structure(list(CustID = c("C-1", "C-2",
"C-2", "C-2", "C-3", "C-3",
"C-3", "C-4", "C-5"), DATE = c("2021-01-02 14:13:10", "2021-01-02 13:17:07", "2021-01-02 14:15:10", "2021-01-02 16:14:08", "2021-01-02 17:11:03", "2021-01-02 12:14:24", "2021-01-02 12:33:34", "2021-01-02 10:43:55", "2021-01-03 20:23:35"), TYPE = c("Demo",
"Pro", "Pro", "Pro", "Pro", "Pro", "Pro", "Pro", "Pro"
), LogCat = c(NA, "SPR,DET,RTD", "SPR,DET,RTD", "SPR,DET,RTD",
"DET", "DET", "DET", NA, " SPR, RTD "), PriceCode = c(NA,"KR", "SR", "DE", "KL", "ZT", "KR", "KR", "KR")), class = "data.frame", row.names = c(NA,
-9L))
CusID Date Type LogCat PriceCode
C-1 2021-01-02 14:13:10 Demo
C-2 2021-01-02 13:17:07 Pro SPR, DET, RTD KR
C-2 2021-01-02 14:15:10 Pro SPR, DET, RTD SR
C-2 2021-01-02 16:14:08 Pro SPR, DET, RTD DE
C-3 2021-01-02 17:11:03 Pro DET KL
C-3 2021-01-02 12:14:24 Pro DET ZT
C-3 2021-01-02 12:33:34 Pro DET KR
C-4 2021-01-02 10:43:55 Pro KR
C-5 2021-01-03 20:23:35 Pro SPR, RTD KR
我正在使用下面的代码,它给出了下面提到的输出。
library(tidyverse)
library(janitor)
df %>% mutate(DATE = as.Date(DATE)) %>% select(1:3) %>%
unique() %>%
tabyl(TYPE, DATE) %>%
adorn_totals("row") %>%
adorn_percentages("col") %>%
adorn_pct_formatting(2) %>%
adorn_ns("front")
TYPE 2021-01-02 2021-01-03
Demo 1 (25.00%) 0 (0.00%)
Pro 3 (75.00%) 1 (100.00%)
Total 4 (100.00%) 1 (100.00%)
在上面提到的输出中,我需要基于列 LogCat
和 PriceCode
的不同子计数。例如,在给出特定 Type
的总不同计数时,我需要得到有多少不同 CusID
具有 LogCat
和 PriceCode
。另外,有多少不同的计数没有 LogCat
和 PriceCode
.
其中,对于每种类型,我们需要检查以下条件。
LogCat
- 唯一的CusID
仅在LogCat
列中具有值,并且PriceCode
为 NA 或 BlankPriceCode
- 唯一的CusID
仅在PriceCode
列中具有值,并且LogCat
为 NA 或空白Both True
- 唯一的CusID
在LogCat
和PriceCode
列中都有值。Both False
- 唯一的CusID
在LogCat
和PriceCode
列中都没有值。
这些值的百分比应该通过计算我们计算这些值的特定 Type
的非重复计数来计算。例如,在 Demo
下面,如果我们计算 Demo
类型的这些值,则第一个日期的分母应为 1,第二个日期的分母应为 0。
需要数据框<-
TYPE 2021-01-02 2021-01-03
Demo 1 (25.00%) 0 (0.00%)
LogCat 0 (0.00%) 0 (0.00%)
PriceCode 0 (0.00%) 0 (0.00%)
Both True 0 (0.00%) 0 (0.00%)
Both False 1 (100.00%) 0 (0.00%)
Pro 3 (75.00%) 1 (100.00%)
LogCat 0 (0.00%) 0 (0.00%)
PriceCode 1 (33.33%) 0 (0.00%)
Both True 2 (66.66%) 1 (100.00%)
Both False 0 (0.00%) 0 (0.00%)
Total 4 (100.00%) 1 (100.00%)
包 pivottabler
可能对这里有帮助
df %>% mutate(DATE = as.Date(DATE)) %>%
mutate(across(.cols = c(LogCat, PriceCode), ~ +!(is.na(.) | . == ""))) %>%
mutate(dummy = case_when(LogCat + PriceCode == 0 ~ "Both False",
LogCat + PriceCode == 2 ~ "Both True",
LogCat == 1 ~ "Logcat",
TRUE ~ "PriceCode")) %>%
mutate(dummy = factor(dummy, levels = c("Both False", "Both True", "Logcat", "PriceCode")),
d2 = 1) %>%
group_by(CustID, DATE, TYPE) %>%
complete(dummy) %>% ungroup() %>% unique() %>%
select(c(2:4,7)) -> df1
library(pivottabler)
qpvt(df1, c("TYPE", "dummy"), "DATE", "sum(d2, na.rm = T)")
2021-01-02 2021-01-03 Total
Demo Both False 1 1
Both True 0 0
Logcat 0 0
PriceCode 0 0
Total 1 1
Pro Both False 0 0 0
Both True 2 1 3
Logcat 0 0 0
PriceCode 1 0 1
Total 3 1 4
Total 4 1 5
或
qhpvt(df1, c("TYPE", "dummy"), "DATE", "sum(d2, na.rm = T)")
这是我从您的 OP 中了解到的内容 - 相当复杂的逻辑 ;)
library(tidyverse)
library(scales)
# grain level data-frame
grain_df <- df %>%
# convert Date to Date
mutate(DATE = as.Date(DATE)) %>%
separate_rows(LogCat) %>%
# Categorize customer into your define logic
group_by(DATE, TYPE, CustID) %>%
summarize(category = case_when(
# all LogCat is missing & Price is present
all(is.na(LogCat)) & any(!is.na(PriceCode)) ~ "Missing_LogCat",
# LogCat is present & Price is missing
any(!is.na(LogCat)) & all(is.na(PriceCode)) ~ "Missing_PriceCode",
# Both present
any(!is.na(LogCat)) & any(!is.na(PriceCode)) ~ "Both_True",
# Both missing
all(is.na(LogCat)) & all(is.na(PriceCode)) ~ "Both_True",
TRUE ~ "No category yet"
), .groups = "drop") %>%
# count the customer in each category by type & date
group_by(DATE, TYPE, category) %>%
summarize(count = n_distinct(CustID), .groups = "drop") %>%
# compelete the data set with all combination of Date/type/category
complete(DATE, TYPE, category, fill = list(count = 0))
# summarize and put in some formmating
summary_df <- grain_df %>%
# calculate the percent
group_by(DATE, TYPE) %>%
mutate(percent = paste0(count, " (", percent(count / sum(count)), ")")) %>%
ungroup() %>%
select(-count) %>%
# Pivot wider to have the format you want
pivot_wider(names_from = DATE, values_from = percent, values_fill = "")
# calculate total row & formatting
total_rows <- grain_df %>%
group_by(DATE) %>%
summarize(percent = paste0(sum(count), " (100%)")) %>%
mutate(TYPE = "Total", category = "") %>%
pivot_wider(names_from = DATE, values_from = percent)
# combind two data to get a final df with total rows
bind_rows(summary_df,
total_rows)
#> # A tibble: 5 x 4
#> TYPE category `2021-01-02` `2021-01-03`
#> <chr> <chr> <chr> <chr>
#> 1 Demo "Both_True" 1 (100%) 0 (NA)
#> 2 Demo "Missing_LogCat" 0 (0%) 0 (NA)
#> 3 Pro "Both_True" 2 (67%) 1 (100%)
#> 4 Pro "Missing_LogCat" 1 (33%) 0 (0%)
#> 5 Total "" 4 (100%) 1 (100%)
由 reprex package (v2.0.0)
于 2021-04-18 创建这是一种在 RStudio
中生成一些枢轴样式 table 的方法library(kableExtra)
kbl(summary_df, align = "c") %>%
kable_paper(full_width = F) %>%
column_spec(1:2, bold = T) %>%
collapse_rows(columns = 1:2, target = 1, valign = "top")
这将在 View
选项卡中输出 RStudio
此 table