数据管理:用 R 扁平化数据

Data management: flatten data with R

我有下面这个记录保单变更情况的数据框:

# Example policies: one row per policy with its issue date (`date_new`), its
# end date (`date_end`) and, in `expend`, the id of the earlier policy that it
# extends ("" when it extends none).
Df <- data.frame(
  Id_policy = c("A_001", "A_002", "A_003", "B_001", "B_002"),
  date_new = c("20200101", "20200115", "20200304", "20200110", "20200215"),
  date_end = c("20200503", "20200608", "20210101", "20200403", "20200503"),
  expend = c("", "A_001", "A_002", "", "")
)

看起来像这样:

  Id_policy date_new date_end expend
     A_001 20200101 20200503       
     A_002 20200115 20200608  A_001
     A_003 20200304 20210101  A_002
     B_001 20200110 20200403       
     B_002 20200215 20200503       

"Id_policy"指具体保单,"date_new"保单签发日期, "date_end" 保单终止日期。但是,有时政策会延长。在这种情况下,将设置新策略,变量“expend”提供它更改的先前策略的名称。

这里的想法是扁平化数据集:每张原始保单只保留一行,其终止日期取最后一次延期的终止日期。所以,输出将是这样的:

  Id_policy date_new date_end expend
     A_001 20200101 20210101       
     B_001 20200110 20200403       
     B_002 20200215 20200503     

有人遇到过类似的问题吗?

下面的做法应该可行:外层 for 循环遍历 Df 中的每个保单 ID,内层 while 循环查找原始保单的最后一次延期。

# Example input: one row per policy. `expend` holds the id of the policy that
# this row extends, or "" when the row starts a new (original) policy.
Df <- data.frame(Id_policy = c("A_001", "A_002", "A_003", "B_001", "B_002"),
                 date_new = c("20200101", "20200115", "20200304", "20200110", "20200215"),
                 date_end = c("20200503", "20200608", "20210101", "20200403", "20200503"),
                 expend = c("", "A_001", "A_002", "", ""),
                 stringsAsFactors = FALSE)

# Collapse every chain of policy extensions into a single row.
#
# Args:
#   policies: data frame with columns Id_policy, date_new, date_end, expend.
#
# Returns:
#   A data frame with one row per original policy (rows whose `expend` is ""
#   or NA), in input order. Each row is the original policy's row with
#   `date_end` replaced by the `date_end` of the last extension in its chain.
#   Rows that extend a policy not present in `policies` are dropped.
#
# Raises:
#   An error if following the extensions revisits a row (duplicated ids).
flatten_policies <- function(policies) {
  ids <- as.character(policies$Id_policy)
  parent <- as.character(policies$expend)
  parent[is.na(parent)] <- ""

  # Preallocate one slot per input row instead of growing a data frame.
  kept <- vector("list", nrow(policies))

  for (i in seq_len(nrow(policies))) {
    # Rows that extend another policy are folded into their original policy.
    if (parent[i] != "") {
      next
    }

    # Follow the chain one link at a time until no row extends the current one.
    j <- i
    visited <- i
    found_last <- FALSE
    while (!found_last) {
      nxt <- which(nzchar(parent) & parent == ids[j])
      if (length(nxt) == 0L) {
        found_last <- TRUE
      } else {
        # If several rows extend the same policy, follow the first one.
        nxt <- nxt[1L]
        if (nxt %in% visited) {
          stop("Extension chain starting at '", ids[i], "' loops back on itself",
               call. = FALSE)
        }
        visited <- c(visited, nxt)
        j <- nxt
      }
    }

    flat_row <- policies[i, , drop = FALSE]
    flat_row$date_end <- policies$date_end[j]
    kept[[i]] <- flat_row
  }

  kept <- kept[!vapply(kept, is.null, logical(1))]
  if (length(kept) == 0L) {
    return(policies[0, , drop = FALSE])
  }

  flattened <- do.call(rbind, kept)
  rownames(flattened) <- NULL
  flattened
}

final_df <- flatten_policies(Df)

final_df

 Id_policy date_new date_end expend
1     A_001 20200101 20210101       
2     B_001 20200110 20200403       
3     B_002 20200215 20200503

这是一个解决方案:使用 igraph 创建一个由 id 构成的有向网络,并使用 data.table 进行一些绑定和连接。我在各步骤之间以注释形式给出了中间结果,以说明每一步的作用。

library( data.table )
library( igraph )
setDT(Df)

# Build the graph: the first three columns of Df describe the vertices, and
# every row with a non-empty `expend` yields an edge expend -> Id_policy.
nodes <- Df[, 1:3]
links <- Df[expend != "", .(from = expend, to = Id_policy)]
g <- graph_from_data_frame(links, directed = TRUE, vertices = nodes)
plot(g)

# Vertices with no incoming edge are the start points of the chains.
in.nodes <- V(g)[degree(g, mode = "in") == 0]
# For each start point, collect the names of the vertices in its subcomponent.
L <- lapply(in.nodes, function(v) names(subcomponent(g, v)))
# $A_001
# [1] "A_001" "A_002" "A_003"
# $B_001
# [1] "B_001"
# $B_002
# [1] "B_002"
# Keep only the first and the last id of every chain.
L2 <- lapply(L, function(chain) {
  data.table(start = head(chain, 1), end = tail(chain, 1))
})
# Stack the per-chain tables; the list names become the Id_policy column.
ans <- rbindlist(L2, idcol = "Id_policy", use.names = TRUE, fill = TRUE)
#    Id_policy start   end
# 1:     A_001 A_001 A_003
# 2:     B_001 B_001 B_001
# 3:     B_002 B_002 B_002

# Replace the start / end ids with the matching dates looked up in Df.
ans[Df, start := i.date_new, on = c(start = "Id_policy")][]
ans[Df, end := i.date_end, on = c(end = "Id_policy")][]
# Id_policy    start      end
# 1:     A_001 20200101 20210101
# 2:     B_001 20200110 20200403
# 3:     B_002 20200215 20200503

一种方法是将此视为网络问题并使用 igraph 函数(相关帖子,例如“根据多个列制作 group_indices”)。

  1. 将缺失的'expend'设置为'Id_policy'

  2. 使用graph_from_data_frame创建图形,其中'expend'和'Id_policy'列被视为边列表。

  3. 使用components获取图的连通分量,即'Id_policy'直接或间接连通。

  4. 选取 membership 元素,获取“每个顶点所属的簇 id”。

  5. 将 membership 连接(join)到原始数据。

  6. 按 membership 分组,提取所需数据。

我使用 data.table 进行数据整理步骤,但这当然也可以用 base R 或 dplyr 完成。

library(data.table)
library(igraph)

# Convert Df to a data.table by reference so that `:=` can update it in place.
setDT(Df)
# Policies that extend nothing point at themselves, so every policy shows up
# in the edge list below, as a self-loop if it has no predecessor.
Df[expend == "", expend := Id_policy]

# Each (expend, Id_policy) pair is an edge from a policy to its extension.
g <- graph_from_data_frame(Df[, .(expend, Id_policy)])
# Named vector mapping each vertex name to the id of its connected component.
mem <- components(g)$membership

# Attach the component id to every policy by joining on the vertex names.
Df[.(names(mem)), on = .(Id_policy), mem := mem]

# One row per component: id and start date of its first row, end date of its
# last row. Relies on the rows of each chain being in chronological order.
Df[, .(Id_policy = Id_policy[1],
       date_new = first(date_new),
       date_end = last(date_end)),
   by = mem]
#    mem Id_policy date_new date_end
# 1:   1     A_001 20200101 20210101
# 2:   2     B_001 20200110 20200403
# 3:   3     B_002 20200215 20200503