数据管理:用 R 扁平化数据
Data management: flatten data with R
我有以下收集政策演变的数据框:
Df <- data.frame(Id_policy = c("A_001", "A_002", "A_003","B_001","B_002"),
date_new = c("20200101","20200115","20200304","20200110","20200215"),
date_end = c("20200503","20200608","20210101","20200403","20200503"),
expend = c("","A_001","A_002","",""))
看起来像这样:
Id_policy date_new date_end expend
A_001 20200101 20200503
A_002 20200115 20200608 A_001
A_003 20200304 20210101 A_002
B_001 20200110 20200403
B_002 20200215 20200503
"Id_policy"指具体保单,"date_new"保单签发日期, "date_end" 保单终止日期。但是,有时政策会延长。在这种情况下,将设置新策略,变量“expend”提供它更改的先前策略的名称。
这里的想法是扁平化数据集,因此我们只保留与不同政策相对应的行。所以,输出将是这样的:
Id_policy date_new date_end expend
A_001 20200101 20210101
B_001 20200110 20200403
B_002 20200215 20200503
有人遇到过类似的问题吗?
外层 for 循环遍历 Df 中的每个策略 ID,内层 while 循环查找原始策略的最后一个扩展应该有效
Df <- data.frame(Id_policy = c("A_001", "A_002", "A_003","B_001","B_002"),
date_new = c("20200101","20200115","20200304","20200110","20200215"),
date_end = c("20200503","20200608","20210101","20200403","20200503"),
expend = c("","A_001","A_002","",""),
stringsAsFactors = F)
final_df <- data.frame(matrix(nrow = 0, ncol = 0), stringsAsFactors = F)
for (i in seq_len(nrow(Df))) {
# Check to see if the current policy ID is in the column expend
if (Df$Id_policy[i] %in% Df$expend || !Df$expend[i] == "") {
# Loop through expend policy until last one is found
found_last <- F
j <- i
end_date <- ""
c_policy_id <- Df$Id_policy[j]
expended_id <- Df$Id_policy[which(Df$expend == c_policy_id)]
if (length(expended_id) > 0) {
if (expended_id %in% Df$expend) {
while(!found_last) {
c_policy_id <- Df$Id_policy[j]
expended_id <- Df$Id_policy[which(Df$expend == c_policy_id)]
if (length(expended_id) > 0) {
if (expended_id %in% Df$expend) {
j <- which(Df$expend == expended_id)
}
}else{
end_date <- Df$date_end[j]
found_last <- T
}
}
if (!end_date == "") {
# Add to final df when found the last one
final_df <- bind_rows(final_df, data.frame(Id_policy = Df$Id_policy[i],
date_new = Df$date_new[i],
date_end = end_date,
expend = ""))
}
}
}
}else{
final_df <- bind_rows(final_df, Df[i, ])
}
}
final_df
Id_policy date_new date_end expend
1 A_001 20200101 20210101
2 B_001 20200110 20200403
3 B_002 20200215 20200503
这是一个解决方案,使用 igraph
创建一个 id 的定向网络,并使用 data.table
进行一些绑定和连接。
我一直在结果之间显示每个步骤的作用。
library( data.table )
library( igraph )
setDT(Df)
#create nodes and links
nodes <- Df[,1:3]
links <- Df[ !expend == "", .(from = expend, to = Id_policy) ]
g = graph_from_data_frame( links, vertices = nodes, directed = TRUE )
plot(g)
#find nodes without incoming (these are startpoints of paths)
in.nodes <- V(g)[degree(g, mode = 'in') == 0]
#define sumcomponents of the graph by looping the in.nodes
L <- lapply( in.nodes, function(x) names( subcomponent(g, x) ) )
# $A_001
# [1] "A_001" "A_002" "A_003"
# $B_001
# [1] "B_001"
# $B_002
# [1] "B_002"
L2 <- lapply( L, function(x) {
#get first and last element
dt <- data.table( start = x[1], end = x[ length(x) ] )
})
#bind list together to a single data.table
ans <- rbindlist( L2, use.names = TRUE, fill = TRUE, idcol = "Id_policy" )
# Id_policy start end
# 1: A_001 A_001 A_003
# 2: B_001 B_001 B_001
# 3: B_002 B_002 B_002
#update ans with values from original Df for start and end
ans[ Df, `:=`( start = i.date_new ), on = .(start = Id_policy) ][]
ans[ Df, `:=`( end = i.date_end ), on = .(end = Id_policy) ][]
# Id_policy start end
# 1: A_001 20200101 20210101
# 2: B_001 20200110 20200403
# 3: B_002 20200215 20200503
一种方法是将此视为网络问题并使用 igraph
函数(相关帖子,例如 根据多个列制作 group_indices
; ).
将缺失的'expend'设置为'Id_policy'
使用graph_from_data_frame
创建图形,其中'expend'和'Id_policy'列被视为边列表。
使用components
获取图的连通分量,即'Id_policy'直接或间接连通。
Select membership
元素获取“每个顶点所属的簇id”。
加入原始数据会员。
获取按成员分组的相关数据。
我使用 data.table
进行数据整理步骤,但这当然也可以在 base
或 dplyr
中完成。
library(data.table)
library(igraph)
setDT(Df)
Df[expend == "", expend := Id_policy]
g = graph_from_data_frame(Df[ , .(expend, Id_policy)])
mem = components(g)$membership
Df[.(names(mem)), on = .(Id_policy), mem := mem]
Df[ , .(Id_policy = Id_policy[1],
date_new = first(date_new),
date_end = last(date_end), by = mem]
# mem Id_policy date_new date_end
# 1: 1 A_001 20200101 20210101
# 2: 2 B_001 20200110 20200403
# 3: 3 B_002 20200215 20200503
我有以下收集政策演变的数据框:
Df <- data.frame(Id_policy = c("A_001", "A_002", "A_003","B_001","B_002"),
date_new = c("20200101","20200115","20200304","20200110","20200215"),
date_end = c("20200503","20200608","20210101","20200403","20200503"),
expend = c("","A_001","A_002","",""))
看起来像这样:
Id_policy date_new date_end expend
A_001 20200101 20200503
A_002 20200115 20200608 A_001
A_003 20200304 20210101 A_002
B_001 20200110 20200403
B_002 20200215 20200503
"Id_policy"指具体保单,"date_new"保单签发日期, "date_end" 保单终止日期。但是,有时政策会延长。在这种情况下,将设置新策略,变量“expend”提供它更改的先前策略的名称。
这里的想法是扁平化数据集,因此我们只保留与不同政策相对应的行。所以,输出将是这样的:
Id_policy date_new date_end expend
A_001 20200101 20210101
B_001 20200110 20200403
B_002 20200215 20200503
有人遇到过类似的问题吗?
外层 for 循环遍历 Df 中的每个策略 ID,内层 while 循环查找原始策略的最后一个扩展应该有效
Df <- data.frame(Id_policy = c("A_001", "A_002", "A_003","B_001","B_002"),
date_new = c("20200101","20200115","20200304","20200110","20200215"),
date_end = c("20200503","20200608","20210101","20200403","20200503"),
expend = c("","A_001","A_002","",""),
stringsAsFactors = F)
final_df <- data.frame(matrix(nrow = 0, ncol = 0), stringsAsFactors = F)
for (i in seq_len(nrow(Df))) {
# Check to see if the current policy ID is in the column expend
if (Df$Id_policy[i] %in% Df$expend || !Df$expend[i] == "") {
# Loop through expend policy until last one is found
found_last <- F
j <- i
end_date <- ""
c_policy_id <- Df$Id_policy[j]
expended_id <- Df$Id_policy[which(Df$expend == c_policy_id)]
if (length(expended_id) > 0) {
if (expended_id %in% Df$expend) {
while(!found_last) {
c_policy_id <- Df$Id_policy[j]
expended_id <- Df$Id_policy[which(Df$expend == c_policy_id)]
if (length(expended_id) > 0) {
if (expended_id %in% Df$expend) {
j <- which(Df$expend == expended_id)
}
}else{
end_date <- Df$date_end[j]
found_last <- T
}
}
if (!end_date == "") {
# Add to final df when found the last one
final_df <- bind_rows(final_df, data.frame(Id_policy = Df$Id_policy[i],
date_new = Df$date_new[i],
date_end = end_date,
expend = ""))
}
}
}
}else{
final_df <- bind_rows(final_df, Df[i, ])
}
}
final_df
Id_policy date_new date_end expend
1 A_001 20200101 20210101
2 B_001 20200110 20200403
3 B_002 20200215 20200503
这是一个解决方案,使用 igraph
创建一个 id 的定向网络,并使用 data.table
进行一些绑定和连接。
我一直在结果之间显示每个步骤的作用。
library( data.table )
library( igraph )
setDT(Df)
#create nodes and links
nodes <- Df[,1:3]
links <- Df[ !expend == "", .(from = expend, to = Id_policy) ]
g = graph_from_data_frame( links, vertices = nodes, directed = TRUE )
plot(g)
#find nodes without incoming (these are startpoints of paths)
in.nodes <- V(g)[degree(g, mode = 'in') == 0]
#define sumcomponents of the graph by looping the in.nodes
L <- lapply( in.nodes, function(x) names( subcomponent(g, x) ) )
# $A_001
# [1] "A_001" "A_002" "A_003"
# $B_001
# [1] "B_001"
# $B_002
# [1] "B_002"
L2 <- lapply( L, function(x) {
#get first and last element
dt <- data.table( start = x[1], end = x[ length(x) ] )
})
#bind list together to a single data.table
ans <- rbindlist( L2, use.names = TRUE, fill = TRUE, idcol = "Id_policy" )
# Id_policy start end
# 1: A_001 A_001 A_003
# 2: B_001 B_001 B_001
# 3: B_002 B_002 B_002
#update ans with values from original Df for start and end
ans[ Df, `:=`( start = i.date_new ), on = .(start = Id_policy) ][]
ans[ Df, `:=`( end = i.date_end ), on = .(end = Id_policy) ][]
# Id_policy start end
# 1: A_001 20200101 20210101
# 2: B_001 20200110 20200403
# 3: B_002 20200215 20200503
一种方法是将此视为网络问题并使用 igraph
函数(相关帖子,例如 根据多个列制作 group_indices
;
将缺失的'expend'设置为'Id_policy'
使用
graph_from_data_frame
创建图形,其中'expend'和'Id_policy'列被视为边列表。使用
components
获取图的连通分量,即'Id_policy'直接或间接连通。Select
membership
元素获取“每个顶点所属的簇id”。加入原始数据会员。
获取按成员分组的相关数据。
我使用 data.table
进行数据整理步骤,但这当然也可以在 base
或 dplyr
中完成。
library(data.table)
library(igraph)
setDT(Df)
Df[expend == "", expend := Id_policy]
g = graph_from_data_frame(Df[ , .(expend, Id_policy)])
mem = components(g)$membership
Df[.(names(mem)), on = .(Id_policy), mem := mem]
Df[ , .(Id_policy = Id_policy[1],
date_new = first(date_new),
date_end = last(date_end), by = mem]
# mem Id_policy date_new date_end
# 1: 1 A_001 20200101 20210101
# 2: 2 B_001 20200110 20200403
# 3: 3 B_002 20200215 20200503