根据其他两列的范围信息创建和填充新列

Create and fill new columns based on range information from two other columns

我有以下数据:

df <- data.frame(group  = c(1, 1, 1, 2, 2, 2),
                 start  = c(2, 2, 2, 7, 7, 7),
                 stop   = c(4, 7, 8, 7, 8, 9),
                 unstop = c(5, 7, 10, 7, 9, 10))

我现在想做以下事情:

我有一个理论上的方法。问题是我的真实数据有 80k 行(由 60k 组组成),我需要创建大约 200 个这样的周列。对于下面的代码,即使过滤 10 行也只需要 ~30 秒。

所以我正在寻找更 elegent/smarter/FASTER 的解决方案。

预期结果:

# A tibble: 6 × 14
# Groups:   group [2]
  group start  stop unstop week_1 week_2 week_3 week_4 week_5 week_6 week_7 week_8 week_9 week_10
  <dbl> <dbl> <dbl>  <dbl>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>   <int>
1     1     2     4      5      0      1      1      1      0      0      0      0      0       0
2     1     2     7      7      0      0      0      0      0      0      1      1      0       0
3     1     2     8     10      0      0      0      0      0      0      0      0      0       1
4     2     7     7      7      0      0      0      0      0      0      1      0      0       0
5     2     7     8      9      0      0      0      0      0      0      0      1      1       0
6     2     7     9     10      0      0      0      0      0      0      0      0      0       1

下面是我通常如何处理它(当然不是手动定义每个 row_number。除此之外,代码也是错误的,没有给出预期的 0/1 值。它也抛出许多警告。最后,这段代码已经 运行s 几秒钟只是为了这个小测试数据。它会 运行 一个月我的 80k/200col 数据集。

add_weeks <- as_tibble(as.list(setNames(rep(0L, 10),
                                        paste0("week_", 1:10))))

df |> 
  bind_cols(add_weeks) |> 
  group_by(group) |> 
  mutate(across(num_range("week_", 1:10),
                ~ if_else(row_number() == 1 & str_extract(cur_column(), "\d+$") %in% start:stop,
                          1L,
                          .)),
         across(num_range("week_", 1:10),
                ~ if_else(row_number() == 2 & str_extract(cur_column(), "\d+$") %in% unstop:lead(stop),
                          1L,
                          .)),
         across(num_range("week_", 1:10),
                ~ if_else(row_number() == 3 & str_extract(cur_column(), "\d+$") %in% unstop:10,
                          1L,
                          .)))

现在测试代码。评论中所述策略的实施:

I’d make a matrix with names columns and assign with row and col indices. You can then either attach it as a matrix or convert to data frame.

Mat <- matrix(0, nrow(df), 10) # 200 for real case
maxwk <- 10
colnames(Mat) <- paste0("week", 1:maxwk)

# Add extra column that marks condition 
# If there are always exactly 3 row per group just rep(1:3, ngrps)

# Need to define a value for cond that identifies the three possibilities:

df$cond <- rep(1:3, length=nrow(df))  # assume all groups have exactly 3:

for ( r in 1:nrow(df) ) {
          # for first row in group
  if( df$cond[r] == 1){
     Idx <-  paste0("week", df$start[r]:df$stop[r] ) #start:stop
     Mat[r, Idx] <- 1; next}
          # second
  if( df$cond[r] == 2){ 
     Idx <-  paste0("week" , df$stop[r]:df$unstop[r] )#  stop:unstop
     Mat[r, Idx] <- 1; next}
          # third
  if( df$cond[r] == 3){
    Idx  <- paste0("week", df$unstop[r]:maxwk )    # unstop:max
    Mat[r, Idx] <- 1; next}
  }
df
  group start stop unstop cond
1     1     2    4      5    1
2     1     2    7      7    2
3     1     2    8     10    3
4     2     7    6      7    1
5     2     7    8      9    2
6     2     7    9     10    3
> Mat
     week1 week2 week3 week4 week5 week6 week7 week8 week9 week10
[1,]     0     1     1     1     0     0     0     0     0      0
[2,]     0     0     0     0     0     0     1     0     0      0
[3,]     0     0     0     0     0     0     0     0     0      1
[4,]     0     0     0     0     0     1     1     0     0      0
[5,]     0     0     0     0     0     0     0     1     1      0
[6,]     0     0     0     0     0     0     0     0     0      1

你可以cbind这些。

可能会有性能改进。可以使用 switch(cond, ...) 来分派到正确的逻辑而不是 if( cond == .){ ., next} 方法。这应该比使用 ifelseif_else 的代码快得多。如果您想了解它是如何实现的,请勾选总体策略,我会花时间添加备用代码。

  • 运行 设置两种方法最多 100 周后的基准。 * 警告来自问题中的代码:
> perf_results <- microbenchmark(
+     first.method    = do_first(df), sec.method=do_second(df), times=10)
There were 50 or more warnings (use warnings() to see the first 50)
> perf_results
Unit: microseconds
         expr         min        lq         mean       median          uq        max neval
 first.method 4385001.123 4416568.8 4581549.9624 4450691.5455 4615753.753 5350416.80    10
   sec.method     146.432     149.6     181.6137     188.2125     193.307     243.47    10

我想看看为一行选择正确算法的 switch 方法是否会提高性能。它做到了,而且在某种程度上让我感到惊讶。 switch 函数类似于 Pascal 和许多其他语言中的 case 函数。它有两种形式,其行为根据第一个参数 EXPR 是数字还是字符而不同。这里选择了“dispatch”版本,因为“cond”列是数字。

do_third= function(df){ Mat <- matrix(0, nrow(df), 100) # 200 for real case
maxwk <- 100
colnames(Mat) <- paste0("week", 1:maxwk)
df$cond <- rep(1:3, length=nrow(df))  # assume all groups have exactly 3: 
for( r in 1:nrow(df)) { switch( df[r,"cond"],      
         { # for first row in each group of 3
     Idx <-  paste0("week", df$start[r]:df$stop[r] ) #start:stop
     Mat[r, Idx] <- 1 }, 
          
          { # second row in group
     Idx <-  paste0("week" , df$stop[r]:df$unstop[r] )#  stop:unstop
     Mat[r, Idx] <- 1 },
          
          {# third
     Idx  <- paste0("week", df$unstop[r]:maxwk )    # unstop:max
     Mat[r, Idx] <- 1 } ) }
   }

新微基准:

perf_results
Unit: nanoseconds
         expr        min         lq         mean     median         uq        max neval cld
 first.method 4304901359 4351893534 4387626725.8 4372151785 4416247096 4543314742    10   b
   sec.method     162803     173855    2588492.1     215309     216878   24081195    10  a 
   third.meth         34         53        610.6        877        940        963    10  a 

FWIW,我正在 post 自己解决这个问题。显然,根据某些条件向 60k 数据帧添加 200 列非常慢。所以我做的是:

  • 通过 str_c 添加一个 chr 列,其中包含有关周的信息。
  • 创建一个只有分组变量和这个新信息的较小数据集。
  • 然后在这个week_info上使用separate_rows得到一个长格式的数据集。
  • 然后使用 pivot_wider 并将此信息与原始数据集结合起来。

请注意,这种方法有效,因为我在最初的 post 中没有提到我实际上想总结每组的周信息。所以最后我想每组一行。为了使我的问题简单明了,我没有将此添加到我的问题中。

话虽如此,@IRTFM 的解决方案仍然快了 3 倍。

df2 <- df |>
  group_by(group) |> 
  mutate(lead_stop = lead(stop, default = 0),
         n_rows = n(),
         row_number = row_number()) |> 
  ungroup() |> 
  rowwise() |> 
  mutate(split_weeks = case_when(n_rows == 1 & row_number == 1 ~ str_c(start:stop, collapse = ","),
                                 n_rows  > 1 & row_number == 1 ~ str_c(c(start:stop, unstop:lead_stop), collapse = ","),
                                 row_number == n_rows          ~ str_c(unstop:10, collapse = ","),
                                 TRUE                          ~ str_c(unstop:lead_stop, collapse = ",")))

df3 <- df2 |> 
  group_by(group) |> 
  summarize(split_weeks = unique(str_c(split_weeks, collapse = ","))) |> 
  separate_rows(split_weeks, sep = ",", convert = TRUE) |>
  distinct() |> 
  mutate(value = 1L) |>
  full_join(y = data.frame(split_weeks = 1:10)) |> 
  pivot_wider(names_from = split_weeks,
              names_prefix = "week_",
              values_from = value,
              values_fill = 0L,
              names_expand = TRUE) |> 
  filter(!is.na(group))

df4 <- df2 |> 
  ungroup() |> 
  select(-split_weeks, -n_rows) |> 
  pivot_wider(names_from = row_number, values_from = -group) |> 
  bind_cols(x = df3 |> select(-group), y = _)
library(tidyverse)

periods <- tibble(
  group  = c(1, 1, 1, 2, 2, 2),
  start  = c(2, 2, 2, 7, 7, 7), 
  stop   = c(4, 7, 8, 7, 8, 9), 
  unstop = c(5, 7, 10, 7, 9, 10)
)

LAST <- 10

我认为将组内 start/stop/unstop 逻辑重新编码为 每行单个 start/stop。我们称它们为 rstart/rstop。使用你的 规则,它们可以这样创建:

(periods <- periods %>% 
  group_by(group) %>% 
  transmute(
    period = row_number(),
    rstart = if_else(period == 1L, start, unstop),
    rstop  = if_else(period == 1L, stop,  lead(stop, default = LAST))
  ) %>% 
  ungroup()
)
#> # A tibble: 6 x 4
#>   group period rstart rstop
#>   <dbl>  <int>  <dbl> <dbl>
#> 1     1      1      2     4
#> 2     1      2      7     8
#> 3     1      3     10    10
#> 4     2      1      7     7
#> 5     2      2      9     9
#> 6     2      3     10    10

现在,我们可以通过 group_by -> 总结来生成活跃的伸展。在这里,我们 还添加一个指标列 active 以显示给定的周数是活跃的

(periods <- periods %>% 
  group_by(group, period) %>% 
  summarise(
    weeks = rstart:rstop, 
    active = 1L, 
    .groups = "drop"
  ) 
)
#> # A tibble: 9 x 4
#>   group period weeks active
#>   <dbl>  <int> <int>  <int>
#> 1     1      1     2      1
#> 2     1      1     3      1
#> 3     1      1     4      1
#> 4     1      2     7      1
#> 5     1      2     8      1
#> 6     1      3    10      1
#> 7     2      1     7      1
#> 8     2      2     9      1
#> 9     2      3    10      1

要在 pivot_wider 之后的输出中显示 non-observed 周, 我们可以将周列转换为一个因子并添加缺失的水平 fct_expand。我还添加了 fct_inseq 以确保列是 在输出中按预期排序。完成后,我们可以使用 pivot_wider 获得宽格式。注意 names_expand = TRUE 参数给了我们 我们添加到周列的级别。

periods %>% 
  mutate(
    weeks = as_factor(weeks) %>% 
      fct_expand(as.character(1:LAST)) %>% 
      fct_inseq()
  ) %>% 
  pivot_wider(
    names_from = weeks, 
    names_expand = TRUE,
    values_from = active,
    values_fill = 0L, 
    names_prefix = "week"
  )
#> # A tibble: 6 x 12
#>   group period week1 week2 week3 week4 week5 week6 week7 week8 week9 week10
#>   <dbl>  <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>  <int>
#> 1     1      1     0     1     1     1     0     0     0     0     0      0
#> 2     1      2     0     0     0     0     0     0     1     1     0      0
#> 3     1      3     0     0     0     0     0     0     0     0     0      1
#> 4     2      1     0     0     0     0     0     0     1     0     0      0
#> 5     2      2     0     0     0     0     0     0     0     0     1      0
#> 6     2      3     0     0     0     0     0     0     0     0     0      1

reprex package (v2.0.1)

于 2022-05-09 创建