如何分组并仅回填某些组

How to groupby and back-fill only certain groups

给定如下面板数据集:

df <- structure(list(date = c(1999L, 2000L, 2001L, 1999L, 2000L, 2001L, 
1999L, 2000L, 2001L), firms = c("A", "A", "A", "B", "B", "B", 
"C", "C", "C"), return = c(5L, NA, 6L, 9L, NA, 10L, 8L, NA, 3L
)), class = "data.frame", row.names = c(NA, -9L))

我可以使用 df %>% group_by(firms) %>% fill(return, .direction="up"):

分组 firms 并回填 return
   date firms return
  <int> <chr>  <int>
1  1999 A          5
2  2000 A          6
3  2001 A          6
4  1999 B          9
5  2000 B         10
6  2001 B         10
7  1999 C          8
8  2000 C          3
9  2001 C          3

但我想知道如何回填某些组,即 BC

预期结果:

  date firms return
1 1999     A      5
2 2000     A     NA
3 2001     A      6
4 1999     B      9
5 2000     B     10
6 2001     B     10
7 1999     C      8
8 2000     C      3
9 2001     C      3

一种选择是创建第二列,仅复制您要填充的组。然后,我使用 coalesce 将两列组合在一起。

library(tidyverse)

df %>% 
  mutate(return2 = ifelse(firms %in% c("B", "C"), return, NA)) %>% 
  group_by(firms) %>% 
  fill(return2, .direction="up") %>% 
  mutate(return = coalesce(return, return2)) %>% 
  select(-return2)

另一种选择是创建一个包含要填充的组的新数据框,然后将数据连接回原始数据框。然后,我将 coalesce 应用于以“return”开头的两列。

df %>% 
  filter(firms != "A") %>% 
  group_by(firms) %>% 
  fill(return, .direction="up") %>% 
  left_join(df, ., by = c("date", "firms")) %>% 
  mutate(return = coalesce(!!!select(., starts_with("return")))) %>% 
  select(-c(return.x, return.y))

另一种选择是将数据帧按组拆分为一个小标题列表。然后,我把select组填满,再绑定回去。

df %>%
    group_split(firms, .keep = TRUE) %>% 
    map_at(c(2:3), fill, return, .direction="up") %>% 
    map_dfr(., bind_rows)

输出

   date firms return
  <int> <chr>  <int>
1  1999 A          5
2  2000 A         NA
3  2001 A          6
4  1999 B          9
5  2000 B         10
6  2001 B         10
7  1999 C          8
8  2000 C          3
9  2001 C          3

这是一个简单直观的解决方案:

library(data.table)
df %>% group_by(firms) %>% 
mutate(return = ifelse(firms %in% c("B", "C"), nafill(return, type ="locf"), return))

>
  date firms return
1 1999     A      5
2 2000     A     NA
3 2001     A      6
4 1999     B      9
5 2000     B      9
6 2001     B     10
7 1999     C      8
8 2000     C      8
9 2001     C      3
> 

您可以对观察结果进行子集化,然后对列进行变异

library(data.table)
df <- data.table(df)
df[firms %in% c("B", "C"), return := nafill(return, type = "nocb"), by = firms]
# Function to fill value down: fill_down => function
fill_down <- function(vec){
  # Stop if vec isn't a vector:
  stopifnot(
    is.vector(vec)
  )
  # Explicitly define returned object: vector => env
  return(
    na.omit(vec)[cumsum(Negate(is.na)(vec))]
  )
}

# Function to negate logical %in% test: not_in => function
not_in <- function(x, y){
  # Stop if x and y arent vectors: 
  stopifnot(
    all(
      vapply(
        list(x, y), 
        is.vector, 
        logical(1), 
        USE.NAMES = FALSE
      )
    )
  )
  # Explicitly define the returned object: logical vector => env
  return(
    Negate(`%in%`)(x, y)
  )
}

# Function to assign index: assign_idx => function
assign_idx <- function(df){
  # Function to avoid namespace collision: .resolve_unused_vec_name => function 
  .resolve_unused_vec_name <- function(df, vec_name = "idx", i = 1){
    # Check if any data.frame vector names are named the idx_vec_name
    if(Negate(hasName)(df, vec_name)){
      # If not; use the idx_vec_name => charact scalar env
      return(vec_name)
      # Otherwise:
    }else{
      # Increment i: i => integer scalar
      i <- i + 1
      # If this is the first iteration:
      if(i == 1){
        # Create the name of the index vector: vec_name => character scalar
        vec_name <- paste(
          vec_name, 
          as.character(i),
          sep = "_"
        )
        # Otherwise
      }else{
        # Rename the index vector: vec_name => character scalar
        vec_name <- gsub(
          "\_\d+$",
          paste0(
            "_", 
            as.character(i)
          ),
          vec_name
        )
      }
      # Re-apply the function with the adjusted vector name: 
      return(
        .resolve_unused_vec_name(
          df, 
          vec_name = vec_name,
          i = i
        )
      )
    }
  }
  # Resolve an unused vector name: idx_vec_name => character scalar
  idx_vec_name <- .resolve_unused_vec_name(df)
  # Assign index to data.frame: 
  df[,idx_vec_name] <- seq_len(
    nrow(
      df
    )
  )
  # Return the data.frame and index vector name: 
  # list of data.frame and character scalar => env
  return(
    list(
      df, 
      idx_vec_name
    )
  )
}

# Function to stop if not list of data.frames: 
# stop_if_not_list_of_data.frames => function
stop_if_not_list_of_data.frames <- function(df_lst){
  # Stop if df_lst is not a list of data.frames: 
  return(
    stopifnot(
      all(
        c(
          vapply(
            df_lst, 
            is.data.frame, 
            logical(1), 
            USE.NAMES = FALSE
          ),
          is.list(df_lst)
        )    
      ) 
    )
  )
}

# Function to stop if the vectors aren't present in a given data.frame:
# stop_if_not_vectors_in_df => function
stop_if_not_vectors_in_df <- function(df, vec_to_check){
  # If the names aren't present as vector names in the data.frame 
  # stop execution: 
  return(
    stopifnot(
      all(
        vec_to_check %in% colnames(df)
      )
    )
  )
}

# Function create list of data.frames based on logical condition: 
# df_2_list_of_dfs => function
logically_split_df_2_list_of_dfs <- function(df, logical_vec){
  # Stop if df isn't a data.frame and logical_vector isn't of type logical
  stopifnot(
    all(
      c(
        is.data.frame(df),
        is.logical(logical_vec)
      )
    )
  )
  # Allocate some memory: df_lst => empty list
  df_lst <- vector(
    "list", 
    2
  )
  # Split the data.frame into a list: df_lst => list of data.frames
  df_lst <- with(
    df, 
    split(
      df, 
      logical_vec
    )
  )
  # Explicitly define the returned object: list of data.frames => env
  return(df_lst)
}

# Function to apply function on list of data.frames and combine result: 
# combine_list_2_df => function
combine_list_2_df <- function(
  df_lst, 
  combination_func = c(rbind, cbind)){
  # Stop if not a list of data.frames: 
  stop_if_not_list_of_data.frames(df_lst)
  # Resolve the desired funciton to combine the list: 
  # cmbn_func_resolved => function
  cmbn_func_resolved <- match.fun(combination_func)
  # Combine list data.frame: res => data.frame
  res <- data.frame(
    do.call(
      cmbn_func_resolved, 
      df_lst
    ),
    stringsAsFactors = FALSE,
    row.names = NULL
  )
  # Explicitly define the returned object: data.frame => env
  return(res)
}

# Function to parse ellipses: parse_ellipsis => function
parse_ellipsis <- function(..., truncate_arg_names = TRUE){
  # Test if all arguments are characters: arg_vec_is_char => logical scalar 
  arg_vec_is_char <- all(
    vapply(
      list(...), 
      is.character,
      logical(1), 
      USE.NAMES = FALSE
    )
  )
  # If the args provided are a character vector
  if(arg_vec_is_char){
    # Coerce to a character vector: vec_str => character vector
    vec_str <- as.character(
      unlist(
        list(...)
      )
    )
  }else{
    # Substitute the optional args: order_vecs => list of calls
    vecs <- substitute(
      list(...)
    )[-1]
    # Parse the calls: vec_str => character vector
    vec_str <- vapply(
      vecs, 
      deparse,
      character(1)
    )
  }
  # If we want to truncate the provided arguments: 
  if(truncate_arg_names){
    # Drop any prefixes of data.frame name etc: 
    # vec_str_wo_prefix => character vector
    vec_str <- gsub(
      ".*\$(\w+|\d+|\s+)$", 
      "\1", 
      vec_str
    )
  }
  # Explicitly define the returned object: character vector => env
  return(vec_str)
}

# Function to order a data.frame by a vector: order_by => function
order_by <- function(df, ..., decreasing = FALSE){
  # Stop if a data.frame hasnt been provided: 
  stopifnot(
    is.data.frame(df)
  )
  # Parse the ellipsis arguments: order_vec_str_wo_prefix => character vector
  order_vec_str_wo_prefix <- parse_ellipsis(...)
  # Stop if vectors aren't in data.frame: 
  stop_if_not_vectors_in_df(
    df, 
    order_vec_str_wo_prefix
  )
  # If order by descending: 
  if(decreasing){
    # Sort the data.frame by the given vectors: sort_order => integer vector
    sort_order <- rev(
      do.call(
        "order", 
        df[, order_vec_str_wo_prefix, drop = FALSE]
      )
    )
    # Otherwise: 
  }else{
    # Sort the data.frame by the given vectors: sort_order => integer vector
    sort_order <- do.call(
      "order", 
      df[, order_vec_str_wo_prefix, drop = FALSE]
    )
  }
  # Order the data.frame by the sort order: ordered_df => data.frame
  ordered_df <- df[sort_order,]
  # Explicitly define the returned object: data.frame => env
  return(ordered_df)
}

# Drop vectors in data.frame: drop_columns => function()
drop_columns <- function(df, ...){
  # Stop if a data.frame hasn't been provided: 
  stopifnot(
    is.data.frame(df)
  )
  # Parse the ellipsis argument: col_names_to_drop => character vector
  col_names_to_drop <- parse_ellipsis(...)
  # Stop if the vectors aren't in the data.frame:
  stop_if_not_vectors_in_df(
    df, 
    col_names_to_drop
  )
  # Drop the columns: df => data.frame
  df[,col_names_to_drop] <- NULL
  # Return the data.frame: data.frame => env
  return(df)
}

# Function to apply function to data.frame meeting logical test:
# lapply_to_true_df => function() 
lapply_to_true_df <- function(df_lst, function_to_apply = fill_down, ...){
  # Resolve the function to be applied to vectors: 
  function_resolved <- match.fun(function_to_apply)
  # Resolve the vectors apply the function to: 
  # vector_names_resolved => character vector
  vector_names_resolved <- parse_ellipsis(...)
  # Apply function to partition of data.frame in list that met logical 
  # condition: res => list of data.frames
  res <- lapply(
    seq_along(df_lst),
    function(i){
      if(names(df_lst)[i] == "TRUE"){
        df_lst[[i]][,vector_names_resolved] <- lapply(
          vector_names_resolved,
          function(x){
            function_resolved(
              as.vector(
                df_lst[[i]][, x, drop = TRUE]
              )
            )
          }
        )
      }
      df_lst[[i]]
    }
  )
  # Explicitly define the returned object: data.frame => env
  res
}

# Define the main function: main => function
main <- function(){
  # Input data: df => data.frame
  df <- structure(list(date = c(1999L, 2000L, 2001L, 1999L, 2000L, 2001L, 
    1999L, 2000L, 2001L), firms = c("A", "A", "A", "B", "B", "B", 
    "C", "C", "C"), return = c(5L, NA, 6L, 9L, NA, 10L, 8L, NA, 3L
    )), class = "data.frame", row.names = c(NA, -9L))
  # Allocate some memory for the index assigment: tmp_lst => empty list
  tmp_lst <- vector("list", 2)
  # Assign the index to the data.frame: tmp_lst => list of data.frame and character
  # scalar denoting the name of the index vector
  tmp_lst <- assign_idx(df)
  # Split the data.frame based on logical test: df_lst => list of data.frames
  df_lst <- with(
    tmp_lst[[1]], 
    logically_split_df_2_list_of_dfs(
      tmp_lst[[1]], 
      not_in(
        firms, 
        "A"
      )
    )
  )
  # Apply-Combine function and combine back to data.frame: res => data.frame
  res <- drop_columns(
    order_by(
      combine_list_2_df(
        lapply_to_true_df(
          df_lst,
          fill_down, 
          return
        ),
        rbind
      ),
      tmp_lst[[2]]
    ),
    tmp_lst[[2]]
  )
  # Output the result to the console: data.frame => stdout(console)
  return(res)
}

# Execute main if called:
if (sys.nframe() == 0){
  # Execute the main function: data.frame => stdout(console)
  main()
}