如何分组并仅回填某些组
How to groupby and back-fill only certain groups
给定如下面板数据集:
df <- structure(list(date = c(1999L, 2000L, 2001L, 1999L, 2000L, 2001L,
1999L, 2000L, 2001L), firms = c("A", "A", "A", "B", "B", "B",
"C", "C", "C"), return = c(5L, NA, 6L, 9L, NA, 10L, 8L, NA, 3L
)), class = "data.frame", row.names = c(NA, -9L))
我可以使用 df %>% group_by(firms) %>% fill(return, .direction="up")
:
分组 firms
并回填 return
date firms return
<int> <chr> <int>
1 1999 A 5
2 2000 A 6
3 2001 A 6
4 1999 B 9
5 2000 B 10
6 2001 B 10
7 1999 C 8
8 2000 C 3
9 2001 C 3
但我想知道如何回填某些组,即 B
和 C
?
预期结果:
date firms return
1 1999 A 5
2 2000 A NA
3 2001 A 6
4 1999 B 9
5 2000 B 10
6 2001 B 10
7 1999 C 8
8 2000 C 3
9 2001 C 3
一种选择是创建第二列,仅复制您要填充的组。然后,我使用 coalesce
将两列组合在一起。
library(tidyverse)
df %>%
mutate(return2 = ifelse(firms %in% c("B", "C"), return, NA)) %>%
group_by(firms) %>%
fill(return2, .direction="up") %>%
mutate(return = coalesce(return, return2)) %>%
select(-return2)
另一种选择是创建一个包含要填充的组的新数据框,然后将数据连接回原始数据框。然后,我将 coalesce
应用于以“return”开头的两列。
df %>%
filter(firms != "A") %>%
group_by(firms) %>%
fill(return, .direction="up") %>%
left_join(df, ., by = c("date", "firms")) %>%
mutate(return = coalesce(!!!select(., starts_with("return")))) %>%
select(-c(return.x, return.y))
另一种选择是将数据帧按组拆分为一个小标题列表。然后,我把select组填满,再绑定回去。
df %>%
group_split(firms, .keep = TRUE) %>%
map_at(c(2:3), fill, return, .direction="up") %>%
map_dfr(., bind_rows)
输出
date firms return
<int> <chr> <int>
1 1999 A 5
2 2000 A NA
3 2001 A 6
4 1999 B 9
5 2000 B 10
6 2001 B 10
7 1999 C 8
8 2000 C 3
9 2001 C 3
这是一个简单直观的解决方案:
library(data.table)
df %>% group_by(firms) %>%
mutate(return = ifelse(firms %in% c("B", "C"), nafill(return, type ="locf"), return))
>
date firms return
1 1999 A 5
2 2000 A NA
3 2001 A 6
4 1999 B 9
5 2000 B 9
6 2001 B 10
7 1999 C 8
8 2000 C 8
9 2001 C 3
>
您可以对观察结果进行子集化,然后对列进行变异
library(data.table)
df <- data.table(df)
df[firms %in% c("B", "C"), return := nafill(return, type = "nocb"), by = firms]
# Function to fill value down: fill_down => function
fill_down <- function(vec){
# Stop if vec isn't a vector:
stopifnot(
is.vector(vec)
)
# Explicitly define returned object: vector => env
return(
na.omit(vec)[cumsum(Negate(is.na)(vec))]
)
}
# Function to negate logical %in% test: not_in => function
not_in <- function(x, y){
# Stop if x and y arent vectors:
stopifnot(
all(
vapply(
list(x, y),
is.vector,
logical(1),
USE.NAMES = FALSE
)
)
)
# Explicitly define the returned object: logical vector => env
return(
Negate(`%in%`)(x, y)
)
}
# Function to assign index: assign_idx => function
assign_idx <- function(df){
# Function to avoid namespace collision: .resolve_unused_vec_name => function
.resolve_unused_vec_name <- function(df, vec_name = "idx", i = 1){
# Check if any data.frame vector names are named the idx_vec_name
if(Negate(hasName)(df, vec_name)){
# If not; use the idx_vec_name => charact scalar env
return(vec_name)
# Otherwise:
}else{
# Increment i: i => integer scalar
i <- i + 1
# If this is the first iteration:
if(i == 1){
# Create the name of the index vector: vec_name => character scalar
vec_name <- paste(
vec_name,
as.character(i),
sep = "_"
)
# Otherwise
}else{
# Rename the index vector: vec_name => character scalar
vec_name <- gsub(
"\_\d+$",
paste0(
"_",
as.character(i)
),
vec_name
)
}
# Re-apply the function with the adjusted vector name:
return(
.resolve_unused_vec_name(
df,
vec_name = vec_name,
i = i
)
)
}
}
# Resolve an unused vector name: idx_vec_name => character scalar
idx_vec_name <- .resolve_unused_vec_name(df)
# Assign index to data.frame:
df[,idx_vec_name] <- seq_len(
nrow(
df
)
)
# Return the data.frame and index vector name:
# list of data.frame and character scalar => env
return(
list(
df,
idx_vec_name
)
)
}
# Function to stop if not list of data.frames:
# stop_if_not_list_of_data.frames => function
stop_if_not_list_of_data.frames <- function(df_lst){
# Stop if df_lst is not a list of data.frames:
return(
stopifnot(
all(
c(
vapply(
df_lst,
is.data.frame,
logical(1),
USE.NAMES = FALSE
),
is.list(df_lst)
)
)
)
)
}
# Function to stop if the vectors aren't present in a given data.frame:
# stop_if_not_vectors_in_df => function
stop_if_not_vectors_in_df <- function(df, vec_to_check){
# If the names aren't present as vector names in the data.frame
# stop execution:
return(
stopifnot(
all(
vec_to_check %in% colnames(df)
)
)
)
}
# Function create list of data.frames based on logical condition:
# df_2_list_of_dfs => function
logically_split_df_2_list_of_dfs <- function(df, logical_vec){
# Stop if df isn't a data.frame and logical_vector isn't of type logical
stopifnot(
all(
c(
is.data.frame(df),
is.logical(logical_vec)
)
)
)
# Allocate some memory: df_lst => empty list
df_lst <- vector(
"list",
2
)
# Split the data.frame into a list: df_lst => list of data.frames
df_lst <- with(
df,
split(
df,
logical_vec
)
)
# Explicitly define the returned object: list of data.frames => env
return(df_lst)
}
# Function to apply function on list of data.frames and combine result:
# combine_list_2_df => function
combine_list_2_df <- function(
df_lst,
combination_func = c(rbind, cbind)){
# Stop if not a list of data.frames:
stop_if_not_list_of_data.frames(df_lst)
# Resolve the desired funciton to combine the list:
# cmbn_func_resolved => function
cmbn_func_resolved <- match.fun(combination_func)
# Combine list data.frame: res => data.frame
res <- data.frame(
do.call(
cmbn_func_resolved,
df_lst
),
stringsAsFactors = FALSE,
row.names = NULL
)
# Explicitly define the returned object: data.frame => env
return(res)
}
# Function to parse ellipses: parse_ellipsis => function
parse_ellipsis <- function(..., truncate_arg_names = TRUE){
# Test if all arguments are characters: arg_vec_is_char => logical scalar
arg_vec_is_char <- all(
vapply(
list(...),
is.character,
logical(1),
USE.NAMES = FALSE
)
)
# If the args provided are a character vector
if(arg_vec_is_char){
# Coerce to a character vector: vec_str => character vector
vec_str <- as.character(
unlist(
list(...)
)
)
}else{
# Substitute the optional args: order_vecs => list of calls
vecs <- substitute(
list(...)
)[-1]
# Parse the calls: vec_str => character vector
vec_str <- vapply(
vecs,
deparse,
character(1)
)
}
# If we want to truncate the provided arguments:
if(truncate_arg_names){
# Drop any prefixes of data.frame name etc:
# vec_str_wo_prefix => character vector
vec_str <- gsub(
".*\$(\w+|\d+|\s+)$",
"\1",
vec_str
)
}
# Explicitly define the returned object: character vector => env
return(vec_str)
}
# Function to order a data.frame by a vector: order_by => function
order_by <- function(df, ..., decreasing = FALSE){
# Stop if a data.frame hasnt been provided:
stopifnot(
is.data.frame(df)
)
# Parse the ellipsis arguments: order_vec_str_wo_prefix => character vector
order_vec_str_wo_prefix <- parse_ellipsis(...)
# Stop if vectors aren't in data.frame:
stop_if_not_vectors_in_df(
df,
order_vec_str_wo_prefix
)
# If order by descending:
if(decreasing){
# Sort the data.frame by the given vectors: sort_order => integer vector
sort_order <- rev(
do.call(
"order",
df[, order_vec_str_wo_prefix, drop = FALSE]
)
)
# Otherwise:
}else{
# Sort the data.frame by the given vectors: sort_order => integer vector
sort_order <- do.call(
"order",
df[, order_vec_str_wo_prefix, drop = FALSE]
)
}
# Order the data.frame by the sort order: ordered_df => data.frame
ordered_df <- df[sort_order,]
# Explicitly define the returned object: data.frame => env
return(ordered_df)
}
# Drop vectors in data.frame: drop_columns => function()
drop_columns <- function(df, ...){
# Stop if a data.frame hasn't been provided:
stopifnot(
is.data.frame(df)
)
# Parse the ellipsis argument: col_names_to_drop => character vector
col_names_to_drop <- parse_ellipsis(...)
# Stop if the vectors aren't in the data.frame:
stop_if_not_vectors_in_df(
df,
col_names_to_drop
)
# Drop the columns: df => data.frame
df[,col_names_to_drop] <- NULL
# Return the data.frame: data.frame => env
return(df)
}
# Function to apply function to data.frame meeting logical test:
# lapply_to_true_df => function()
lapply_to_true_df <- function(df_lst, function_to_apply = fill_down, ...){
# Resolve the function to be applied to vectors:
function_resolved <- match.fun(function_to_apply)
# Resolve the vectors apply the function to:
# vector_names_resolved => character vector
vector_names_resolved <- parse_ellipsis(...)
# Apply function to partition of data.frame in list that met logical
# condition: res => list of data.frames
res <- lapply(
seq_along(df_lst),
function(i){
if(names(df_lst)[i] == "TRUE"){
df_lst[[i]][,vector_names_resolved] <- lapply(
vector_names_resolved,
function(x){
function_resolved(
as.vector(
df_lst[[i]][, x, drop = TRUE]
)
)
}
)
}
df_lst[[i]]
}
)
# Explicitly define the returned object: data.frame => env
res
}
# Define the main function: main => function
main <- function(){
# Input data: df => data.frame
df <- structure(list(date = c(1999L, 2000L, 2001L, 1999L, 2000L, 2001L,
1999L, 2000L, 2001L), firms = c("A", "A", "A", "B", "B", "B",
"C", "C", "C"), return = c(5L, NA, 6L, 9L, NA, 10L, 8L, NA, 3L
)), class = "data.frame", row.names = c(NA, -9L))
# Allocate some memory for the index assigment: tmp_lst => empty list
tmp_lst <- vector("list", 2)
# Assign the index to the data.frame: tmp_lst => list of data.frame and character
# scalar denoting the name of the index vector
tmp_lst <- assign_idx(df)
# Split the data.frame based on logical test: df_lst => list of data.frames
df_lst <- with(
tmp_lst[[1]],
logically_split_df_2_list_of_dfs(
tmp_lst[[1]],
not_in(
firms,
"A"
)
)
)
# Apply-Combine function and combine back to data.frame: res => data.frame
res <- drop_columns(
order_by(
combine_list_2_df(
lapply_to_true_df(
df_lst,
fill_down,
return
),
rbind
),
tmp_lst[[2]]
),
tmp_lst[[2]]
)
# Output the result to the console: data.frame => stdout(console)
return(res)
}
# Execute main if called:
if (sys.nframe() == 0){
# Execute the main function: data.frame => stdout(console)
main()
}
给定如下面板数据集:
df <- structure(list(date = c(1999L, 2000L, 2001L, 1999L, 2000L, 2001L,
1999L, 2000L, 2001L), firms = c("A", "A", "A", "B", "B", "B",
"C", "C", "C"), return = c(5L, NA, 6L, 9L, NA, 10L, 8L, NA, 3L
)), class = "data.frame", row.names = c(NA, -9L))
我可以使用 df %>% group_by(firms) %>% fill(return, .direction="up")
:
firms
并回填 return
date firms return
<int> <chr> <int>
1 1999 A 5
2 2000 A 6
3 2001 A 6
4 1999 B 9
5 2000 B 10
6 2001 B 10
7 1999 C 8
8 2000 C 3
9 2001 C 3
但我想知道如何回填某些组,即 B
和 C
?
预期结果:
date firms return
1 1999 A 5
2 2000 A NA
3 2001 A 6
4 1999 B 9
5 2000 B 10
6 2001 B 10
7 1999 C 8
8 2000 C 3
9 2001 C 3
一种选择是创建第二列,仅复制您要填充的组。然后,我使用 coalesce
将两列组合在一起。
library(tidyverse)
df %>%
mutate(return2 = ifelse(firms %in% c("B", "C"), return, NA)) %>%
group_by(firms) %>%
fill(return2, .direction="up") %>%
mutate(return = coalesce(return, return2)) %>%
select(-return2)
另一种选择是创建一个包含要填充的组的新数据框,然后将数据连接回原始数据框。然后,我将 coalesce
应用于以“return”开头的两列。
df %>%
filter(firms != "A") %>%
group_by(firms) %>%
fill(return, .direction="up") %>%
left_join(df, ., by = c("date", "firms")) %>%
mutate(return = coalesce(!!!select(., starts_with("return")))) %>%
select(-c(return.x, return.y))
另一种选择是将数据帧按组拆分为一个小标题列表。然后,我把select组填满,再绑定回去。
df %>%
group_split(firms, .keep = TRUE) %>%
map_at(c(2:3), fill, return, .direction="up") %>%
map_dfr(., bind_rows)
输出
date firms return
<int> <chr> <int>
1 1999 A 5
2 2000 A NA
3 2001 A 6
4 1999 B 9
5 2000 B 10
6 2001 B 10
7 1999 C 8
8 2000 C 3
9 2001 C 3
这是一个简单直观的解决方案:
library(data.table)
df %>% group_by(firms) %>%
mutate(return = ifelse(firms %in% c("B", "C"), nafill(return, type ="locf"), return))
>
date firms return
1 1999 A 5
2 2000 A NA
3 2001 A 6
4 1999 B 9
5 2000 B 9
6 2001 B 10
7 1999 C 8
8 2000 C 8
9 2001 C 3
>
您可以对观察结果进行子集化,然后对列进行变异
library(data.table)
df <- data.table(df)
df[firms %in% c("B", "C"), return := nafill(return, type = "nocb"), by = firms]
# Function to fill value down: fill_down => function
fill_down <- function(vec){
# Stop if vec isn't a vector:
stopifnot(
is.vector(vec)
)
# Explicitly define returned object: vector => env
return(
na.omit(vec)[cumsum(Negate(is.na)(vec))]
)
}
# Function to negate logical %in% test: not_in => function
not_in <- function(x, y){
# Stop if x and y arent vectors:
stopifnot(
all(
vapply(
list(x, y),
is.vector,
logical(1),
USE.NAMES = FALSE
)
)
)
# Explicitly define the returned object: logical vector => env
return(
Negate(`%in%`)(x, y)
)
}
# Function to assign index: assign_idx => function
assign_idx <- function(df){
# Function to avoid namespace collision: .resolve_unused_vec_name => function
.resolve_unused_vec_name <- function(df, vec_name = "idx", i = 1){
# Check if any data.frame vector names are named the idx_vec_name
if(Negate(hasName)(df, vec_name)){
# If not; use the idx_vec_name => charact scalar env
return(vec_name)
# Otherwise:
}else{
# Increment i: i => integer scalar
i <- i + 1
# If this is the first iteration:
if(i == 1){
# Create the name of the index vector: vec_name => character scalar
vec_name <- paste(
vec_name,
as.character(i),
sep = "_"
)
# Otherwise
}else{
# Rename the index vector: vec_name => character scalar
vec_name <- gsub(
"\_\d+$",
paste0(
"_",
as.character(i)
),
vec_name
)
}
# Re-apply the function with the adjusted vector name:
return(
.resolve_unused_vec_name(
df,
vec_name = vec_name,
i = i
)
)
}
}
# Resolve an unused vector name: idx_vec_name => character scalar
idx_vec_name <- .resolve_unused_vec_name(df)
# Assign index to data.frame:
df[,idx_vec_name] <- seq_len(
nrow(
df
)
)
# Return the data.frame and index vector name:
# list of data.frame and character scalar => env
return(
list(
df,
idx_vec_name
)
)
}
# Function to stop if not list of data.frames:
# stop_if_not_list_of_data.frames => function
stop_if_not_list_of_data.frames <- function(df_lst){
# Stop if df_lst is not a list of data.frames:
return(
stopifnot(
all(
c(
vapply(
df_lst,
is.data.frame,
logical(1),
USE.NAMES = FALSE
),
is.list(df_lst)
)
)
)
)
}
# Function to stop if the vectors aren't present in a given data.frame:
# stop_if_not_vectors_in_df => function
stop_if_not_vectors_in_df <- function(df, vec_to_check){
# If the names aren't present as vector names in the data.frame
# stop execution:
return(
stopifnot(
all(
vec_to_check %in% colnames(df)
)
)
)
}
# Function create list of data.frames based on logical condition:
# df_2_list_of_dfs => function
logically_split_df_2_list_of_dfs <- function(df, logical_vec){
# Stop if df isn't a data.frame and logical_vector isn't of type logical
stopifnot(
all(
c(
is.data.frame(df),
is.logical(logical_vec)
)
)
)
# Allocate some memory: df_lst => empty list
df_lst <- vector(
"list",
2
)
# Split the data.frame into a list: df_lst => list of data.frames
df_lst <- with(
df,
split(
df,
logical_vec
)
)
# Explicitly define the returned object: list of data.frames => env
return(df_lst)
}
# Function to apply function on list of data.frames and combine result:
# combine_list_2_df => function
combine_list_2_df <- function(
df_lst,
combination_func = c(rbind, cbind)){
# Stop if not a list of data.frames:
stop_if_not_list_of_data.frames(df_lst)
# Resolve the desired funciton to combine the list:
# cmbn_func_resolved => function
cmbn_func_resolved <- match.fun(combination_func)
# Combine list data.frame: res => data.frame
res <- data.frame(
do.call(
cmbn_func_resolved,
df_lst
),
stringsAsFactors = FALSE,
row.names = NULL
)
# Explicitly define the returned object: data.frame => env
return(res)
}
# Function to parse ellipses: parse_ellipsis => function
parse_ellipsis <- function(..., truncate_arg_names = TRUE){
# Test if all arguments are characters: arg_vec_is_char => logical scalar
arg_vec_is_char <- all(
vapply(
list(...),
is.character,
logical(1),
USE.NAMES = FALSE
)
)
# If the args provided are a character vector
if(arg_vec_is_char){
# Coerce to a character vector: vec_str => character vector
vec_str <- as.character(
unlist(
list(...)
)
)
}else{
# Substitute the optional args: order_vecs => list of calls
vecs <- substitute(
list(...)
)[-1]
# Parse the calls: vec_str => character vector
vec_str <- vapply(
vecs,
deparse,
character(1)
)
}
# If we want to truncate the provided arguments:
if(truncate_arg_names){
# Drop any prefixes of data.frame name etc:
# vec_str_wo_prefix => character vector
vec_str <- gsub(
".*\$(\w+|\d+|\s+)$",
"\1",
vec_str
)
}
# Explicitly define the returned object: character vector => env
return(vec_str)
}
# Function to order a data.frame by a vector: order_by => function
order_by <- function(df, ..., decreasing = FALSE){
# Stop if a data.frame hasnt been provided:
stopifnot(
is.data.frame(df)
)
# Parse the ellipsis arguments: order_vec_str_wo_prefix => character vector
order_vec_str_wo_prefix <- parse_ellipsis(...)
# Stop if vectors aren't in data.frame:
stop_if_not_vectors_in_df(
df,
order_vec_str_wo_prefix
)
# If order by descending:
if(decreasing){
# Sort the data.frame by the given vectors: sort_order => integer vector
sort_order <- rev(
do.call(
"order",
df[, order_vec_str_wo_prefix, drop = FALSE]
)
)
# Otherwise:
}else{
# Sort the data.frame by the given vectors: sort_order => integer vector
sort_order <- do.call(
"order",
df[, order_vec_str_wo_prefix, drop = FALSE]
)
}
# Order the data.frame by the sort order: ordered_df => data.frame
ordered_df <- df[sort_order,]
# Explicitly define the returned object: data.frame => env
return(ordered_df)
}
# Drop vectors in data.frame: drop_columns => function()
drop_columns <- function(df, ...){
# Stop if a data.frame hasn't been provided:
stopifnot(
is.data.frame(df)
)
# Parse the ellipsis argument: col_names_to_drop => character vector
col_names_to_drop <- parse_ellipsis(...)
# Stop if the vectors aren't in the data.frame:
stop_if_not_vectors_in_df(
df,
col_names_to_drop
)
# Drop the columns: df => data.frame
df[,col_names_to_drop] <- NULL
# Return the data.frame: data.frame => env
return(df)
}
# Function to apply function to data.frame meeting logical test:
# lapply_to_true_df => function()
lapply_to_true_df <- function(df_lst, function_to_apply = fill_down, ...){
# Resolve the function to be applied to vectors:
function_resolved <- match.fun(function_to_apply)
# Resolve the vectors apply the function to:
# vector_names_resolved => character vector
vector_names_resolved <- parse_ellipsis(...)
# Apply function to partition of data.frame in list that met logical
# condition: res => list of data.frames
res <- lapply(
seq_along(df_lst),
function(i){
if(names(df_lst)[i] == "TRUE"){
df_lst[[i]][,vector_names_resolved] <- lapply(
vector_names_resolved,
function(x){
function_resolved(
as.vector(
df_lst[[i]][, x, drop = TRUE]
)
)
}
)
}
df_lst[[i]]
}
)
# Explicitly define the returned object: data.frame => env
res
}
# Define the main function: main => function
main <- function(){
# Input data: df => data.frame
df <- structure(list(date = c(1999L, 2000L, 2001L, 1999L, 2000L, 2001L,
1999L, 2000L, 2001L), firms = c("A", "A", "A", "B", "B", "B",
"C", "C", "C"), return = c(5L, NA, 6L, 9L, NA, 10L, 8L, NA, 3L
)), class = "data.frame", row.names = c(NA, -9L))
# Allocate some memory for the index assigment: tmp_lst => empty list
tmp_lst <- vector("list", 2)
# Assign the index to the data.frame: tmp_lst => list of data.frame and character
# scalar denoting the name of the index vector
tmp_lst <- assign_idx(df)
# Split the data.frame based on logical test: df_lst => list of data.frames
df_lst <- with(
tmp_lst[[1]],
logically_split_df_2_list_of_dfs(
tmp_lst[[1]],
not_in(
firms,
"A"
)
)
)
# Apply-Combine function and combine back to data.frame: res => data.frame
res <- drop_columns(
order_by(
combine_list_2_df(
lapply_to_true_df(
df_lst,
fill_down,
return
),
rbind
),
tmp_lst[[2]]
),
tmp_lst[[2]]
)
# Output the result to the console: data.frame => stdout(console)
return(res)
}
# Execute main if called:
if (sys.nframe() == 0){
# Execute the main function: data.frame => stdout(console)
main()
}