Check if a value in a row is repeated between groups in R

I have a dataset of purchases made by different households at different retailers. For example:

Example Dataset

Using dput():

structure(list(household_code = c(76, 76, 76, 76, 76, 76, 76, 
76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 126, 
126, 126, 126, 126, 126, 126, 126, 126), trip_code_uc = c(1032497498L, 
1025776063L, 1029419047L, 1030418100L, 1029502602L, 1034153056L, 
1027035051L, 1027533991L, 1033515804L, 1032998207L, 1032066227L, 
1028192785L, 1033419039L, 1028730296L, 1027388499L, 1030652869L, 
1025638394L, 1034032718L, 1034032718L, 1025678520L, 1029490031L, 
1029898838L, 1028024134L, 1030324171L, 1031983761L, 1031983761L, 
1033767148L, 1023953965L, 1030954113L, 1030954113L, 1027392968L
), purchase_date = structure(c(1L, 2L, 23L, 50L, 52L, 74L, 77L, 
94L, 148L, 158L, 176L, 179L, 196L, 211L, 224L, 246L, 271L, 286L, 
286L, 309L, 329L, 346L, 2L, 9L, 46L, 46L, 50L, 58L, 66L, 66L, 
68L), .Label = c("2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04", 
"2012-01-05", "2012-01-06", "2012-01-07", "2012-01-08", "2012-01-09", 
"2012-01-10", "2012-01-11", "2012-01-12", "2012-01-13", "2012-01-14", 
"2012-01-15", "2012-01-16", "2012-01-17", "2012-01-18", "2012-01-19", 
"2012-01-20", "2012-01-21", "2012-01-22", "2012-01-23", "2012-01-24", 
"2012-01-25", "2012-01-26", "2012-01-27", "2012-01-28", "2012-01-29", 
"2012-01-30", "2012-01-31", "2012-02-01", "2012-02-02", "2012-02-03", 
"2012-02-04", "2012-02-05", "2012-02-06", "2012-02-07", "2012-02-08", 
"2012-02-09", "2012-02-10", "2012-02-11", "2012-02-12", "2012-02-13", 
"2012-02-14", "2012-02-15", "2012-02-16", "2012-02-17", "2012-02-18", 
"2012-02-19", "2012-02-20", "2012-02-21", "2012-02-22", "2012-02-23", 
"2012-02-24", "2012-02-25", "2012-02-26", "2012-02-27", "2012-02-28", 
"2012-02-29", "2012-03-01", "2012-03-02", "2012-03-03", "2012-03-04", 
"2012-03-05", "2012-03-06", "2012-03-07", "2012-03-08", "2012-03-09", 
"2012-03-10", "2012-03-11", "2012-03-12", "2012-03-13", "2012-03-14", 
"2012-03-15", "2012-03-16", "2012-03-17", "2012-03-18", "2012-03-19", 
"2012-03-20", "2012-03-21", "2012-03-22", "2012-03-23", "2012-03-24", 
"2012-03-25", "2012-03-26", "2012-03-27", "2012-03-28", "2012-03-29", 
"2012-03-30", "2012-03-31", "2012-04-01", "2012-04-02", "2012-04-03", 
"2012-04-04", "2012-04-05", "2012-04-06", "2012-04-07", "2012-04-08", 
"2012-04-09", "2012-04-10", "2012-04-11", "2012-04-12", "2012-04-13", 
"2012-04-14", "2012-04-15", "2012-04-16", "2012-04-17", "2012-04-18", 
"2012-04-19", "2012-04-20", "2012-04-21", "2012-04-22", "2012-04-23", 
"2012-04-24", "2012-04-25", "2012-04-26", "2012-04-27", "2012-04-28", 
"2012-04-29", "2012-04-30", "2012-05-01", "2012-05-02", "2012-05-03", 
"2012-05-04", "2012-05-05", "2012-05-06", "2012-05-07", "2012-05-08", 
"2012-05-09", "2012-05-10", "2012-05-11", "2012-05-12", "2012-05-13", 
"2012-05-14", "2012-05-15", "2012-05-16", "2012-05-17", "2012-05-18", 
"2012-05-19", "2012-05-20", "2012-05-21", "2012-05-22", "2012-05-23", 
"2012-05-24", "2012-05-25", "2012-05-26", "2012-05-27", "2012-05-28", 
"2012-05-29", "2012-05-30", "2012-05-31", "2012-06-01", "2012-06-02", 
"2012-06-03", "2012-06-04", "2012-06-05", "2012-06-06", "2012-06-07", 
"2012-06-08", "2012-06-09", "2012-06-10", "2012-06-11", "2012-06-12", 
"2012-06-13", "2012-06-14", "2012-06-15", "2012-06-16", "2012-06-17", 
"2012-06-18", "2012-06-19", "2012-06-20", "2012-06-21", "2012-06-22", 
"2012-06-23", "2012-06-24", "2012-06-25", "2012-06-26", "2012-06-27", 
"2012-06-28", "2012-06-29", "2012-06-30", "2012-07-01", "2012-07-02", 
"2012-07-03", "2012-07-04", "2012-07-05", "2012-07-06", "2012-07-07", 
"2012-07-08", "2012-07-09", "2012-07-10", "2012-07-11", "2012-07-12", 
"2012-07-13", "2012-07-14", "2012-07-15", "2012-07-16", "2012-07-17", 
"2012-07-18", "2012-07-19", "2012-07-20", "2012-07-21", "2012-07-22", 
"2012-07-23", "2012-07-24", "2012-07-25", "2012-07-26", "2012-07-27", 
"2012-07-28", "2012-07-29", "2012-07-30", "2012-07-31", "2012-08-01", 
"2012-08-02", "2012-08-03", "2012-08-04", "2012-08-05", "2012-08-06", 
"2012-08-07", "2012-08-08", "2012-08-09", "2012-08-10", "2012-08-11", 
"2012-08-12", "2012-08-13", "2012-08-14", "2012-08-15", "2012-08-16", 
"2012-08-17", "2012-08-18", "2012-08-19", "2012-08-20", "2012-08-21", 
"2012-08-22", "2012-08-23", "2012-08-24", "2012-08-25", "2012-08-26", 
"2012-08-27", "2012-08-28", "2012-08-29", "2012-08-30", "2012-08-31", 
"2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04", "2012-09-05", 
"2012-09-06", "2012-09-07", "2012-09-08", "2012-09-09", "2012-09-10", 
"2012-09-11", "2012-09-12", "2012-09-13", "2012-09-14", "2012-09-15", 
"2012-09-16", "2012-09-17", "2012-09-18", "2012-09-19", "2012-09-20", 
"2012-09-21", "2012-09-22", "2012-09-23", "2012-09-24", "2012-09-25", 
"2012-09-26", "2012-09-27", "2012-09-28", "2012-09-29", "2012-09-30", 
"2012-10-01", "2012-10-02", "2012-10-03", "2012-10-04", "2012-10-05", 
"2012-10-06", "2012-10-07", "2012-10-08", "2012-10-09", "2012-10-10", 
"2012-10-11", "2012-10-12", "2012-10-13", "2012-10-14", "2012-10-15", 
"2012-10-16", "2012-10-17", "2012-10-18", "2012-10-19", "2012-10-20", 
"2012-10-21", "2012-10-22", "2012-10-23", "2012-10-24", "2012-10-25", 
"2012-10-26", "2012-10-27", "2012-10-28", "2012-10-29", "2012-10-30", 
"2012-10-31", "2012-11-01", "2012-11-02", "2012-11-03", "2012-11-04", 
"2012-11-05", "2012-11-06", "2012-11-07", "2012-11-08", "2012-11-09", 
"2012-11-10", "2012-11-11", "2012-11-12", "2012-11-13", "2012-11-14", 
"2012-11-15", "2012-11-16", "2012-11-17", "2012-11-18", "2012-11-19", 
"2012-11-20", "2012-11-21", "2012-11-22", "2012-11-23", "2012-11-24", 
"2012-11-25", "2012-11-26", "2012-11-27", "2012-11-28", "2012-11-29", 
"2012-11-30", "2012-12-01", "2012-12-02", "2012-12-03", "2012-12-04", 
"2012-12-05", "2012-12-06", "2012-12-07", "2012-12-08", "2012-12-09", 
"2012-12-10", "2012-12-11", "2012-12-12", "2012-12-13", "2012-12-14", 
"2012-12-15", "2012-12-16", "2012-12-17", "2012-12-18", "2012-12-19", 
"2012-12-20", "2012-12-21", "2012-12-22", "2012-12-23", "2012-12-24", 
"2012-12-25", "2012-12-26", "2012-12-27", "2012-12-28", "2012-12-29"
), class = "factor"), retailer_code = c(11024, 11024, 11024, 
11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 
11024, 11024, 11024, 11024, 2353, 11024, 11024, 2353, 11024, 
11024, 63882, 650, 89960, 89960, 650, 89960, 89960, 89960, 650
), Overall_Brand = structure(c(19L, 74L, 19L, 48L, 19L, 48L, 
19L, 19L, 19L, 48L, 48L, 31L, 46L, 31L, 31L, 48L, 74L, 31L, 74L, 
19L, 31L, 19L, 48L, 48L, 31L, 31L, 48L, 31L, 31L, 48L, 48L), .Label = c("ABUNDANCE", 
"ALPEN", "AMERICAN BREAKFAST ", "ANNIE'S HOMEGROWN", "ARWHD MLS", 
"BARBARA'S", "BEAR NAKED", "BEAR RIVER", "BOB'S RED MILL", "BOKOMO COUNTRY", 
"BREAKFAST CHOICE", "BREAKFAST ZONE", "BROOKFARM MACADAMIA ", 
"BRUGGEN", "BUCKEYE HEROES", "CADIA", "CASCADIAN FARM ", "CHOCOLATE SPOONERS", 
"CTL BR", "DORSET", "ENJOY LIFE PERKY'S CRUNCHY FLX", "EREWHON", 
"F-FACTOR", "FAMILIA", "FIELD DAY", "FINAX", "FLEURY FLAKES MARC ANDRE", 
"FOOD FOR LIFE EZEKIEL 50", "FORRELLI ", "GEFEN KING", "GENERAL MILLS", 
"GERONIMO PEYTON HILLIS REDZONE", "GLUCERNA", "GLUTINO", "GOLDEN FOODS", 
"GRANVITA PUFFY'S", "GREENBRIER INT INC-NBL CRN FLK", "HEALTH VALLEY", 
"HODGSON MILL", "HOME FAVORITE", "HOSTESS TOASTED OATS", "HSP", 
"ISABEL'S WAY ", "JASPER", "JUSTIN VERLANDER'S FASTBLL FLK", 
"KASHI", "KAY'S NATURALS BETTER BALANCE", "Kellogg", "KIND", 
"KOZY SHACK READY GRAINS", "KRETSCHMER", "LADY LIBERTY", "LIEBER'S", 
"LIVING INTENTIONS SPRFD CRL", "LOVE GROWN FOODS", "MAIZORO", 
"MANISCHEWITZ", "MILL SELECT ", "MOTHER'S", "MULTIGRAIN SPNRS", 
"NASH BROTHERS", "NATURE'S PATH", "NESTLE", "NEW ENGLAND NATURALS", 
"NEWMAN'S OWN SWEET ENGH WHT PF", "NUTRISYSTEM NOURISH", "NUTRITIOUS LIVING", 
"PAMPA ", "PASKESZ CHOCO RIOS", "PBLC LB BR-NBL HY NT TSD OT DM", 
"PEACE", "POST", "PROTEIN PLUS CORN FLAKE", "QUAKER", "RALSTON", 
"RLSTN", "SALLY'S ", "SCRUMPTIOUS SPELNDID CRLS CBB", "SEITENBACHER MUESLI", 
"SIMPLY FIBER", "SKINNER'S RAISIN BRAN", "SORIANA WHOLE BRAN", 
"STREIT'S FRUIT & NUT MUESLI", "SUNBELT", "SWAD", "T. ABRAHAM'S", 
"TAANUG CORN FLAKES", "TASTY", "TEMMY'S", "THE", "THREE SISTERS", 
"TIKISS SWEETENED PUFFD WHL WHT", "TRU ROOTS", "VOTTO'S", "WEETABIX", 
"WHOLESOME GOODNESS", "WILD ROOTS ANCIENT ORIGINS", "WONDER CORN FLAKES", 
"YOG ACTIVE"), class = "factor")), row.names = c(90609L, 222436L, 
90606L, 688592L, 90607L, 688593L, 90605L, 90604L, 90608L, 668330L, 
321377L, 567447L, 945385L, 567445L, 567443L, 892854L, 583186L, 
567446L, 583185L, 168138L, 567444L, 60086L, 698120L, 698127L, 
3933L, 809409L, 698129L, 15286L, 15284L, 698116L, 319455L), class = "data.frame")

I have a variable for the household, a trip ID for the specific purchase occasion, the retailer, and the brand purchased. A household can buy more than one brand on a single trip. I want to count repeat purchases across two consecutive trips. For example, if a household bought both General Mills and Kellogg on trip 1 and only General Mills on trip 2, that counts as one repeat for General Mills.

Example output:

This is easy when there is only one purchase per trip. I do it with rleid() via the following code:

library(data.table)  # rleid() comes from data.table
e1 = transform(e1, brand_last_dum = ave(as.character(Brand), rleid(Household, Brand), FUN = seq_along))

However, it does not work when there is more than one purchase per trip. The data are arranged by purchase date, and buying the same brand more than once within a single trip gets treated as a separate purchase when counting the repeat for the next row. Please help.
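To make the issue concrete, here is a minimal sketch on toy data (the Trip column and all values are made up; Household/Brand mirror the code above):

library(data.table)  # for rleid()

# Toy data: the household buys brand A twice on trip 10, then once more on trip 11.
toy <- data.frame(Household = c(1, 1, 1),
                  Trip      = c(10, 10, 11),
                  Brand     = c("A", "A", "A"))
# rleid(Household, Brand) lumps all three rows into one run, so brand_last_dum
# becomes 1, 2, 3: the within-trip duplicate (row 2) and the genuine next-trip
# repeat (row 3) are indistinguishable.
transform(toy, brand_last_dum = ave(as.character(Brand), rleid(Household, Brand), FUN = seq_along))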

On second thought, things would be much easier if we introduce a function like this:

conditional_count <- function(x, fcond) {
  fcond <- rlang::as_function(fcond)
  x <- fcond(x)  # logical vector marking which elements satisfy the condition
  # number each run of consecutive TRUEs 1, 2, ...; FALSEs become 0
  x[x] <- with(rle(x), sequence(lengths[values]))
  x
}

fcond should be a function that takes only x and returns TRUE/FALSE. The output of conditional_count is a running count within each run of consecutive TRUEs, and 0 otherwise. For example,

> conditional_count(c(1,1,2,3,3,3,5), duplicated)
[1] 0 1 0 0 1 2 0

We can then get what you want in two steps. First, generate an rleid for each trip. Then, within each Household and Brand, count every trip id that is either identical to the previous one or exactly one larger. The first condition means the same brand was bought twice on one trip; the second means the same brand was found on the immediately preceding trip. Both count as a repeat under your requirement. For example, if a brand shows up in trips 1, 1, 2, 3, 3, 5, 6, the counts will be 0, 1, 2, 3, 4, 0, 1.
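As a quick sanity check of that counting rule, here is a sketch using conditional_count() from above together with dplyr's lag(), which is the same condition used in the code below:

library(dplyr)

conditional_count(c(1, 1, 2, 3, 3, 5, 6), ~ . - lag(., default = -1L) < 2L)
# [1] 0 1 2 3 4 0 1

The full code is then: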

library(dplyr)
library(data.table)  # for rleid()
# column names below (Household, `Trip ID`, Brand) follow the OP's example output

e1 %>% 
  group_by(Household) %>% 
  mutate(trip_seq = rleid(`Trip ID`)) %>% 
  group_by(Household, Brand) %>% 
  mutate(
    Repeat = conditional_count(trip_seq, ~. - lag(., default = -1L) < 2L),
    trip_seq = NULL
  )

Output:

# A tibble: 31 x 5
# Groups:   Household, Brand [7]
   Household  `Trip ID` Retailer Brand         Repeat
       <dbl>      <int>    <dbl> <fct>          <int>
 1        76 1032497498    11024 CTL BR             0
 2        76 1025776063    11024 QUAKER             0
 3        76 1029419047    11024 CTL BR             0
 4        76 1030418100    11024 Kellogg            0
 5        76 1029502602    11024 CTL BR             0
 6        76 1034153056    11024 Kellogg            0
 7        76 1027035051    11024 CTL BR             0
 8        76 1027533991    11024 CTL BR             1
 9        76 1033515804    11024 CTL BR             2
10        76 1032998207    11024 Kellogg            0
11        76 1032066227    11024 Kellogg            1
12        76 1028192785    11024 GENERAL MILLS      0
13        76 1033419039    11024 KASHI              0
14        76 1028730296    11024 GENERAL MILLS      0
15        76 1027388499    11024 GENERAL MILLS      1
16        76 1030652869    11024 Kellogg            0
17        76 1025638394     2353 QUAKER             0
18        76 1034032718    11024 GENERAL MILLS      0
19        76 1034032718    11024 QUAKER             1
20        76 1025678520     2353 CTL BR             0
21        76 1029490031    11024 GENERAL MILLS      0
22        76 1029898838    11024 CTL BR             0
23       126 1028024134    63882 Kellogg            0
24       126 1030324171      650 Kellogg            1
25       126 1031983761    89960 GENERAL MILLS      0
26       126 1031983761    89960 GENERAL MILLS      1
27       126 1033767148      650 Kellogg            0
28       126 1023953965    89960 GENERAL MILLS      0
29       126 1030954113    89960 GENERAL MILLS      1
30       126 1030954113    89960 Kellogg            0
31       126 1027392968      650 Kellogg            1

Update 2020-11-29: Adjusted the code so that a second (or later) purchase of the same brand within the same shopping trip is counted as a repeat purchase. Given this additional requirement, I was able to drop one of the joins added previously so that all purchases of a given brand within a trip are handled consistently.

library(dplyr)
data %>%
        # one row per distinct shopping trip, numbered in date order within each household
        arrange(.,household_code,purchase_date,trip_code_uc) %>% 
        group_by(household_code,purchase_date) %>%
        distinct(trip_code_uc) %>% 
        group_by(household_code) %>% 
        mutate(trip_seq = seq_along(household_code)) %>% 
        ungroup() %>% 
        # attach the trip sequence back onto the purchase-level data
        left_join(data,.) %>%
        arrange(household_code,Overall_Brand,trip_seq) %>%
        # repeat = same household and brand on the same trip or the immediately preceding trip
        mutate(repeat_purchase = if_else(household_code == lag(household_code) &
                                                 Overall_Brand == lag(Overall_Brand) & 
                                                 (trip_seq == lag(trip_seq) + 1 | trip_seq == lag(trip_seq)),
                                         TRUE,FALSE,missing=FALSE)) %>%
        arrange(.,household_code,purchase_date,Overall_Brand) -> result

print(as.data.frame(result[,c(1,3,5,6,7)]))

...and the output:

> print(as.data.frame(result[,c(1,3,5,6,7)]))
   household_code purchase_date Overall_Brand trip_seq repeat_purchase
1              76    2012-01-01        CTL BR        1           FALSE
2              76    2012-01-02        QUAKER        2           FALSE
3              76    2012-01-23        CTL BR        3           FALSE
4              76    2012-02-19       Kellogg        4           FALSE
5              76    2012-02-21        CTL BR        5           FALSE
6              76    2012-03-14       Kellogg        6           FALSE
7              76    2012-03-17        CTL BR        7           FALSE
8              76    2012-04-03        CTL BR        8            TRUE
9              76    2012-05-27        CTL BR        9            TRUE
10             76    2012-06-06       Kellogg       10           FALSE
11             76    2012-06-24       Kellogg       11            TRUE
12             76    2012-06-27 GENERAL MILLS       12           FALSE
13             76    2012-07-14         KASHI       13           FALSE
14             76    2012-07-29 GENERAL MILLS       14           FALSE
15             76    2012-08-11 GENERAL MILLS       15            TRUE
16             76    2012-09-02       Kellogg       16           FALSE
17             76    2012-09-27        QUAKER       17           FALSE
18             76    2012-10-12 GENERAL MILLS       18           FALSE
19             76    2012-10-12        QUAKER       18            TRUE
20             76    2012-11-04        CTL BR       19           FALSE
21             76    2012-11-24 GENERAL MILLS       20           FALSE
22             76    2012-12-11        CTL BR       21           FALSE
23            126    2012-01-02       Kellogg        1           FALSE
24            126    2012-01-09       Kellogg        2            TRUE
25            126    2012-02-15 GENERAL MILLS        3           FALSE
26            126    2012-02-15 GENERAL MILLS        3            TRUE
27            126    2012-02-19       Kellogg        4           FALSE
28            126    2012-02-27 GENERAL MILLS        5           FALSE
29            126    2012-03-06 GENERAL MILLS        6            TRUE
30            126    2012-03-06       Kellogg        6           FALSE
31            126    2012-03-08       Kellogg        7            TRUE
> 

The previous version is included below so that the comments remain relevant.

Update 2020-11-28: After receiving updated data that includes purchase_date, we changed the solution to use this information to generate trip_seq. We assume that if there are multiple trip_code_uc values within a single day, the shopping trips occurred in ascending order of trip_code_uc.

We use dplyr to identify the distinct trips for each household and assign a trip sequence. After merging the trip sequence back onto the original data, we evaluate whether the same brand was purchased on two consecutive shopping trips.

library(dplyr)
data %>%
     arrange(.,household_code,purchase_date,trip_code_uc) %>% 
     group_by(household_code,purchase_date) %>%
     distinct(trip_code_uc) %>% 
     group_by(household_code) %>% 
     mutate(trip_seq = seq_along(household_code)) %>% 
     ungroup() %>% 
     left_join(data,.) %>% 
     group_by(household_code,purchase_date,trip_seq) %>%
     distinct(Overall_Brand) %>%
     ungroup() %>%
     arrange(household_code,Overall_Brand,purchase_date,trip_seq) %>%
     mutate(repeat_purchase = if_else(household_code == lag(household_code) &
                                           Overall_Brand == lag(Overall_Brand) & 
                                           trip_seq == lag(trip_seq) + 1,
                                      TRUE,FALSE,missing=FALSE)) %>%
     left_join(data,.) -> result
result <- arrange(result,household_code,purchase_date,Overall_Brand)
print(as.data.frame(result[,c(1,3,5,6,7)]))

...and the output:

> print(as.data.frame(result[,c(1,3,5,6,7)]))
   household_code purchase_date Overall_Brand trip_seq repeat_purchase
1              76    2012-01-01        CTL BR        1           FALSE
2              76    2012-01-02        QUAKER        2           FALSE
3              76    2012-01-23        CTL BR        3           FALSE
4              76    2012-02-19       Kellogg        4           FALSE
5              76    2012-02-21        CTL BR        5           FALSE
6              76    2012-03-14       Kellogg        6           FALSE
7              76    2012-03-17        CTL BR        7           FALSE
8              76    2012-04-03        CTL BR        8            TRUE
9              76    2012-05-27        CTL BR        9            TRUE
10             76    2012-06-06       Kellogg       10           FALSE
11             76    2012-06-24       Kellogg       11            TRUE
12             76    2012-06-27 GENERAL MILLS       12           FALSE
13             76    2012-07-14         KASHI       13           FALSE
14             76    2012-07-29 GENERAL MILLS       14           FALSE
15             76    2012-08-11 GENERAL MILLS       15            TRUE
16             76    2012-09-02       Kellogg       16           FALSE
17             76    2012-09-27        QUAKER       17           FALSE
18             76    2012-10-12 GENERAL MILLS       18           FALSE
19             76    2012-10-12        QUAKER       18            TRUE
20             76    2012-11-04        CTL BR       19           FALSE
21             76    2012-11-24 GENERAL MILLS       20           FALSE
22             76    2012-12-11        CTL BR       21           FALSE
23            126    2012-01-02       Kellogg        1           FALSE
24            126    2012-01-09       Kellogg        2            TRUE
25            126    2012-02-15 GENERAL MILLS        3           FALSE
26            126    2012-02-15 GENERAL MILLS        3           FALSE
27            126    2012-02-19       Kellogg        4           FALSE
28            126    2012-02-27 GENERAL MILLS        5           FALSE
29            126    2012-03-06 GENERAL MILLS        6            TRUE
30            126    2012-03-06       Kellogg        6           FALSE
31            126    2012-03-08       Kellogg        7            TRUE

The sequence numbers matter because, if we simply sorted by household, brand, and trip code, we could not tell whether the next trip number is really the "next" purchase. This shows up in household 126's purchases of the Kellogg brand, which occur in trip sequences 1, 2, 4, 6, and 7: only the pairs (1, 2) and (6, 7) are consecutive, so only those count as repeats, as requested in the OP.
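A quick way to see this from the result produced above (a sketch; dplyr is already attached):

result %>%
  filter(household_code == 126, Overall_Brand == "Kellogg") %>%
  select(household_code, purchase_date, trip_seq, repeat_purchase)
# trip_seq 1, 2, 4, 6, 7 with repeat_purchase FALSE, TRUE, FALSE, FALSE, TRUE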

While I upvoted the solution proposed by Len Greski, I feel the code can be shortened as follows (no join is needed).

EDIT: On further thought, I came up with a solution that uses dense_rank instead of cur_group_id.

library(dplyr)

df_result <- df %>% group_by(household_code) %>%
  arrange(purchase_date, trip_code_uc) %>% 
  mutate(shop_id = dense_rank(paste(purchase_date, trip_code_uc)),  # one id per distinct shopping trip
         brand_seq = dense_rank(Overall_Brand)) %>%                 # one id per brand within the household
  group_by(household_code, brand_seq) %>%
  mutate(cond = ifelse(shop_id == lag(shop_id)  | shop_id == lag(shop_id)+1, TRUE, FALSE),
         cond = ifelse(is.na(cond), FALSE, cond)) %>%  # first purchase of a brand has no lag, so NA -> FALSE
  ungroup() %>%
  select(-shop_id, -brand_seq)

# Check for TRUE rows
> df_result %>% filter(cond == T)
# A tibble: 9 x 6
  household_code trip_code_uc purchase_date retailer_code Overall_Brand cond 
           <dbl>        <int> <fct>                 <dbl> <fct>         <lgl>
1            126   1030324171 2012-01-09              650 Kellogg       TRUE 
2            126   1031983761 2012-02-15            89960 GENERAL MILLS TRUE 
3            126   1030954113 2012-03-06            89960 GENERAL MILLS TRUE 
4            126   1027392968 2012-03-08              650 Kellogg       TRUE 
5             76   1027533991 2012-04-03            11024 CTL BR        TRUE 
6             76   1033515804 2012-05-27            11024 CTL BR        TRUE 
7             76   1032066227 2012-06-24            11024 Kellogg       TRUE 
8             76   1027388499 2012-08-11            11024 GENERAL MILLS TRUE 
9             76   1034032718 2012-10-12            11024 QUAKER        TRUE 

Note: I generated shop_id keeping in mind that one customer can make two separate shopping trips on the same day.
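As a small illustration (toy values, not from the data) of why pasting the date and the trip code yields one shop_id per distinct trip even when two trips fall on the same day:

library(dplyr)

dense_rank(paste(c("2012-01-01", "2012-01-01", "2012-01-02"),
                 c(1001, 1002, 1001)))
# [1] 1 2 3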

The previously proposed solution:

library(tidyverse)

df %>% group_by(household_code, purchase_date, trip_code_uc) %>%
  arrange(purchase_date, trip_code_uc) %>% 
  mutate(shop_id = cur_group_id()) %>%
  group_by(household_code, Overall_Brand) %>%
  mutate(brand_seq = cur_group_id()) %>%
  group_by(household_code, brand_seq) %>%
  mutate(cond = ifelse(shop_id == lag(shop_id)  | shop_id == lag(shop_id)+1, TRUE, FALSE),
         cond = ifelse(is.na(cond), FALSE, cond)) %>%
  ungroup() %>%
  select(-shop_id, -brand_seq)

# A tibble: 31 x 6
   household_code trip_code_uc purchase_date retailer_code Overall_Brand cond 
            <dbl>        <int> <fct>                 <dbl> <fct>         <lgl>
 1             76   1032497498 2012-01-01            11024 CTL BR        FALSE
 2             76   1025776063 2012-01-02            11024 QUAKER        FALSE
 3            126   1028024134 2012-01-02            63882 Kellogg       FALSE
 4            126   1030324171 2012-01-09              650 Kellogg       TRUE 
 5             76   1029419047 2012-01-23            11024 CTL BR        FALSE
 6            126   1031983761 2012-02-15            89960 GENERAL MILLS FALSE
 7            126   1031983761 2012-02-15            89960 GENERAL MILLS TRUE 
 8             76   1030418100 2012-02-19            11024 Kellogg       FALSE
 9            126   1033767148 2012-02-19              650 Kellogg       FALSE
10             76   1029502602 2012-02-21            11024 CTL BR        FALSE
# ... with 21 more rows

If we keep only the TRUE rows:

# A tibble: 9 x 6
  household_code trip_code_uc purchase_date retailer_code Overall_Brand cond 
           <dbl>        <int> <fct>                 <dbl> <fct>         <lgl>
1            126   1030324171 2012-01-09              650 Kellogg       TRUE 
2            126   1031983761 2012-02-15            89960 GENERAL MILLS TRUE 
3            126   1030954113 2012-03-06            89960 GENERAL MILLS TRUE 
4            126   1027392968 2012-03-08              650 Kellogg       TRUE 
5             76   1027533991 2012-04-03            11024 CTL BR        TRUE 
6             76   1033515804 2012-05-27            11024 CTL BR        TRUE 
7             76   1032066227 2012-06-24            11024 Kellogg       TRUE 
8             76   1027388499 2012-08-11            11024 GENERAL MILLS TRUE 
9             76   1034032718 2012-10-12            11024 QUAKER        TRUE 

In this solution, the group ids are not renumbered for each household. If you specifically want the numbering to restart within each household, this code may help:

df %>% 
  group_split(household_code) %>%
  purrr::map_df(~.x %>% arrange(purchase_date, trip_code_uc) %>%
                  group_by(purchase_date, trip_code_uc) %>% 
                  mutate(shop_id = cur_group_id()) %>%
                  group_by(Overall_Brand) %>%
                  mutate(brand_seq = cur_group_id())) %>%
  group_by(household_code, brand_seq) %>%
  mutate(cond = ifelse(shop_id == lag(shop_id)  | shop_id == lag(shop_id)+1, TRUE, FALSE),
         cond = ifelse(is.na(cond), FALSE, cond)) %>%
  ungroup()

# A tibble: 31 x 8
   household_code trip_code_uc purchase_date retailer_code Overall_Brand shop_id brand_seq cond 
            <dbl>        <int> <fct>                 <dbl> <fct>           <int>     <int> <lgl>
 1             76   1032497498 2012-01-01            11024 CTL BR              1         1 FALSE
 2             76   1025776063 2012-01-02            11024 QUAKER              2         5 FALSE
 3             76   1029419047 2012-01-23            11024 CTL BR              3         1 FALSE
 4             76   1030418100 2012-02-19            11024 Kellogg             4         4 FALSE
 5             76   1029502602 2012-02-21            11024 CTL BR              5         1 FALSE
 6             76   1034153056 2012-03-14            11024 Kellogg             6         4 FALSE
 7             76   1027035051 2012-03-17            11024 CTL BR              7         1 FALSE
 8             76   1027533991 2012-04-03            11024 CTL BR              8         1 TRUE 
 9             76   1033515804 2012-05-27            11024 CTL BR              9         1 TRUE 
10             76   1032998207 2012-06-06            11024 Kellogg            10         4 FALSE
# ... with 21 more rows