在对来自另一列的完整组合进行分组后,R 创建具有比例列的摘要数据框

R create summary data frame that has a proportions column, after grouping of complete combinations from another column

这是我正在使用的数据框的屏幕截图。

我还包括了 dput(head(df, n = 50)).

structure(list(conversion_hash_id = c("0001cd061567445d62e91e9d5c28c004", 
"0001cd061567445d62e91e9d5c28c004", "0003bc99a5039e664fd71dab54d7d533", 
"0003c6d107ae3dc12174eee321ee4589", "0006ba6c297c921e745ebe624c61b3c2", 
"0007dbcbc304375c09d6cd650a144cd1", "00093c82a4a8fb0827833960fabb5e1a", 
"000a508ea789bbad8e9e832bb1c2b787", "000d383e749beb47f8c93ec3a05a1b4a", 
"00106ff98badafd53709fbb617052bb6", "0010a4d7e4f77d5c0ce684499a529ce2", 
"00131c89636a6d0f37ae93fe6da4144e", "001dcc2ff0655c7dd8243b86ad461cff", 
"002097d1eb159f92b43a6dc0510d9b08", "0020c1e9b159733d2d3ba9912c795382", 
"0020c1e9b159733d2d3ba9912c795382", "00212acae3e183252078bdfd2c3963e3", 
"00212acae3e183252078bdfd2c3963e3", "0023ff241f1c71743146300098021297", 
"002f421d2a3ad29afd9cb15807ce1f0f", "00316ec40c99e9d8b53b4834386b0476", 
"003593dce8e41b1a2a544b16196cc6a1", "00389f0ef15d13e4ec2eec7a1ac03240", 
"003c69e077ba1ccbf051a229f5fc627a", "00415d4cd157e3f256976d0e9f5dab19", 
"0041723c94fa240eea4e6245513e7213", "0043e1fd2c4ef8c1da1959a3f4dd0362", 
"0043e1fd2c4ef8c1da1959a3f4dd0362", "004a2bcbef69fb0d334b675583cdb873", 
"004df76075f83c1c6de314ff785aa0d7", "004df76075f83c1c6de314ff785aa0d7", 
"005090f2bf5796a2cbe693e43e70d653", "0051d01a74e467cb91d779ed064f7dda", 
"0052ef5a56f4a24011abe9a1a7242f49", "00592654753f3ef738a6f93227f3dd61", 
"005af6fca539bb39a4c88713b0c9df2b", "005d62d75f8ecf29e7a3c8571b6f9bf6", 
"005de06981b347a4af1f104fb3c7c6f2", "005dea5fc43528e8a4f00d172308cbea", 
"0060afd14a7edd637d99ded56429ab4a", "00669dae25369a8655199c54aec4c229", 
"00679670a8f30a3b59b9dd803c2aa48c", "006a9b088b768bbabb6d77cd6b44f763", 
"006af62acaac8b6d8b620a170f09e71b", "006c137bd6884d985e2e0724c45ad2ab", 
"006c83b0f6c0a8e3773a63c85f34c3c7", "006fc42950d4a610baaef0f66ded0819", 
"00716fcf53da8f347571e387e1beae6d", "00716fcf53da8f347571e387e1beae6d", 
"0073cee456597683aaecc9434a18eda3"), tier_1 = c("OTT", "Paid Search", 
"OTT", "Email", "Paid Search", "Paid Search", "Email", "Paid Social", 
"Paid Social", "Paid Search", "Direct", "Email", "Paid Search", 
"Paid Social", "Paid Search", "Paid Social", "Organic Search", 
"Paid Search", "Paid Social", "Organic Search", "Direct", "Paid Search", 
"Paid Video", "Direct", "Email", "Paid Search", "Affiliate", 
"Email", "Paid Search", "OTT", "Paid Search", "Paid Search", 
"Email", "Direct", "Direct", "Paid Search", "Organic Search", 
"Paid Search", "Paid Search", "Paid Search", "Paid Social", "Direct", 
"Organic Search", "Paid Search", "Email", "Paid Search", "Paid Social", 
"Paid Search", "Paid Social", "Organic Social"), normalized = c(0.4287711836, 
0.5712288164, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.7237434946, 
0.2762565054, 0.3000477093, 0.6999522907, 1, 1, 1, 1, 1, 1, 1, 
1, 0.5447557896, 0.4552442104, 1, 0.3493884477, 0.6506115523, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.700741097, 
0.299258903, 0.5210938212)), row.names = c(NA, -50L), groups = structure(list(
    conversion_hash_id = c("0001cd061567445d62e91e9d5c28c004", 
    "0003bc99a5039e664fd71dab54d7d533", "0003c6d107ae3dc12174eee321ee4589", 
    "0006ba6c297c921e745ebe624c61b3c2", "0007dbcbc304375c09d6cd650a144cd1", 
    "00093c82a4a8fb0827833960fabb5e1a", "000a508ea789bbad8e9e832bb1c2b787", 
    "000d383e749beb47f8c93ec3a05a1b4a", "00106ff98badafd53709fbb617052bb6", 
    "0010a4d7e4f77d5c0ce684499a529ce2", "00131c89636a6d0f37ae93fe6da4144e", 
    "001dcc2ff0655c7dd8243b86ad461cff", "002097d1eb159f92b43a6dc0510d9b08", 
    "0020c1e9b159733d2d3ba9912c795382", "00212acae3e183252078bdfd2c3963e3", 
    "0023ff241f1c71743146300098021297", "002f421d2a3ad29afd9cb15807ce1f0f", 
    "00316ec40c99e9d8b53b4834386b0476", "003593dce8e41b1a2a544b16196cc6a1", 
    "00389f0ef15d13e4ec2eec7a1ac03240", "003c69e077ba1ccbf051a229f5fc627a", 
    "00415d4cd157e3f256976d0e9f5dab19", "0041723c94fa240eea4e6245513e7213", 
    "0043e1fd2c4ef8c1da1959a3f4dd0362", "004a2bcbef69fb0d334b675583cdb873", 
    "004df76075f83c1c6de314ff785aa0d7", "005090f2bf5796a2cbe693e43e70d653", 
    "0051d01a74e467cb91d779ed064f7dda", "0052ef5a56f4a24011abe9a1a7242f49", 
    "00592654753f3ef738a6f93227f3dd61", "005af6fca539bb39a4c88713b0c9df2b", 
    "005d62d75f8ecf29e7a3c8571b6f9bf6", "005de06981b347a4af1f104fb3c7c6f2", 
    "005dea5fc43528e8a4f00d172308cbea", "0060afd14a7edd637d99ded56429ab4a", 
    "00669dae25369a8655199c54aec4c229", "00679670a8f30a3b59b9dd803c2aa48c", 
    "006a9b088b768bbabb6d77cd6b44f763", "006af62acaac8b6d8b620a170f09e71b", 
    "006c137bd6884d985e2e0724c45ad2ab", "006c83b0f6c0a8e3773a63c85f34c3c7", 
    "006fc42950d4a610baaef0f66ded0819", "00716fcf53da8f347571e387e1beae6d", 
    "0073cee456597683aaecc9434a18eda3"), .rows = structure(list(
        1:2, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 
        14L, 15:16, 17:18, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 
        26L, 27:28, 29L, 30:31, 32L, 33L, 34L, 35L, 36L, 37L, 
        38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48:49, 
        50L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -44L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

我需要使用一些我不知道如何创建的逻辑来创建摘要数据框。

我需要做的是将属于同一组 tier_1 类别的每个 conversion_hash_id 分组,并计算出现的总数。

例如,如果有 30 个不同的 conversion_hash_id 彼此不同,但每个都列出了两次,一次用于 OTT,一次用于付费搜索,那么我需要计算该组合出现的次数。

这是新数据框的样子。这些只是几个案例的编造数字。最终 table 的结构可能还有其他可能性。我没有在这里列出所有案例。

group                   count
OTT, Paid Search         30 
OTT, Paid Social         25

按 'conversion_hash_id' 分组后,summarise 将 'tier_1' 的 sorted unique 值作为单个字符串 pasteing (toString - paste(..., collapse=", ")) 然后获取 'group' 列的 count

library(dplyr)
df %>%
    group_by(conversion_hash_id) %>%
    summarise(group = toString(sort(unique(tier_1)))) %>%
    count(group)