复杂的长到宽整形算法
Complex long to wide reshape algorithm
我遇到一个问题,我需要根据 ID1 和 ID2 将长格式数据 table 重新整形为具有非重叠条目的宽格式。逻辑相当复杂,取决于 3 列(“Seq”、“ID1”和“ID2”)。
属于 ID1 的 Value_1 如果 'overlaps' 与 ID2 相加,反之亦然,但仅适用于不同的 ID。
请参阅下面的输入示例和输出,希望能说明问题。
输入:
df <- structure(list(Seq = c(9143L, 916L, 9293L, 9301L, 9302L, 9304L,
9305L, 9306L, 9307L, 931L, 9311L), ID1 = c("ID1_1", "ID1_1",
NA, "ID1_2", "ID1_2", NA, "ID1_3", "ID1_3", "ID1_3", "ID1_4",
"ID1_4"), value_1 = c(30L, 30L, NA, 30L, 30L, NA, 30L, 30L, 30L,
50L, 50L), ID2 = c(NA, NA, "ID2_1", "ID2_2", "ID2_3", "ID2_4",
"ID2_4", "ID2_4", "ID2_4", "ID2_4", "ID2_5"), value_2 = c(NA,
NA, 33L, 200L, 46L, 58L, 58L, 58L, 58L, 58L, 46L)), class = "data.frame", row.names = c(NA,
-11L))
输出:
(例如,注意最后一行,value_1 = 80,因为 30+50 属于 ID1_3 和 ID1_4 的值之和)
我使用了 data.table 包中的 rleid()
函数,这是一个计算 运行 长度编码的有趣函数。这样做
library(data.table)
library(dplyr)
df %>%
mutate(d = cumsum( c(0, diff(rleid(ID1))) != 0 & c(0, diff(rleid(ID2))) != 0),
value_1 = value_1 * c(1, diff(rleid(ID1))),
value_2 = value_2 * c(1, diff(rleid(ID2)))) %>% group_by(d) %>%
summarise(Seq = toString(Seq),
value_1 = sum(value_1, na.rm = T),
value_2 = sum(value_2, na.rm = T)) %>%
ungroup() %>% select(-d)
# A tibble: 4 x 3
Seq value_1 value_2
<chr> <int> <int>
1 9143, 916 30 0
2 9293 0 33
3 9301, 9302 30 246
4 9304, 9305, 9306, 9307, 931, 9311 80 104
旧答案
df %>% group_by(d = cumsum( c(0, diff(rleid(ID1))) != 0 & c(0, diff(rleid(ID2))) != 0)) %>%
summarise(Seq = toString(Seq),
value_1 = sum(unique(value_1), na.rm = T),
value_2 = sum(unique(value_2), na.rm = T)) %>%
ungroup() %>% select(-d)
首先,我非常喜欢 AnilGoyal 的解决方案。我可以看到我需要开始使用 data.table 包。
话虽如此,我正在研究一种没有 data.table 的 dplyr 方法,这显然更加冗长。此外,我花了一段时间才弄清楚如何处理重复值。乘以 changei
列(0 或 1)删除了重复项。以下是我的方法:
df %>%
mutate_if(is.numeric, replace_na, 0) %>%
mutate_if(is.character, replace_na, "NA") %>%
mutate(
change1 = ID1 != lag(ID1, default = "Start"),
value_1 = value_1 * change1,
change2 = ID2 != lag(ID2, default = "Start"),
value_2 = value_2 * change2,
change = cumsum(change1 & change2)
) %>%
group_by(change) %>%
summarise(
Seq = toString(Seq),
value_1 = sum(value_1, na.rm = T),
value_2 = sum(value_2, na.rm = T)
) %>%
ungroup()
结果是:
df
# change Seq value_1 value_2
# <int> <chr> <dbl> <dbl>
# 1 1 9143, 916 30 0
# 2 2 9293 0 33
# 3 3 9301, 9302 30 246
# 4 4 9304, 9305, 9306, 9307, 931, 9311 80 104
没有上面那么简洁,而是一个Base R的解决方案none the less:
# Function to calculate the aggregate value: .agg_func => function()
.agg_func <- function(df, id_col, value_col){
sbst <- subset(
df,
!(is.na(df[,id_col])) & !(duplicated(df[,id_col])),
select = c(id_col, value_col)
)
return(sum(sbst[,value_col], na.rm = TRUE))
}
# Function to group data by ids: .grouping_func => function()
.grouping_func <- function(df, id_col){
r_l_e <- rle(df[,id_col])
rle_id <- rep(seq_along(r_l_e$values), times = r_l_e$lengths)
return(c(0, diff(rle_id)) != 0)
}
# Group the data: grpd_df => data.frame
grpd_df <- transform(
df,
grp = cumsum(
apply(
vapply(
names(df)[startsWith(names(df), "ID")],
function(x).grouping_func(df, x),
logical(nrow(df))
),
1,
all
)
)
)
# Split-apply-combine the aggregate function to the grouped data:
data.frame(do.call(rbind, lapply(with(grpd_df, split(grpd_df, grp)), function(s){
data.frame(
Seq = toString(s$Seq),
value_1 = .agg_func(s, "ID1", "value_1"),
value_2 = .agg_func(s, "ID2", "value_2")
)
}
)
), row.names = NULL, stringsAsFactors = FALSE
)
我遇到一个问题,我需要根据 ID1 和 ID2 将长格式数据 table 重新整形为具有非重叠条目的宽格式。逻辑相当复杂,取决于 3 列(“Seq”、“ID1”和“ID2”)。
属于 ID1 的Value_1 如果 'overlaps' 与 ID2 相加,反之亦然,但仅适用于不同的 ID。
请参阅下面的输入示例和输出,希望能说明问题。
输入:
df <- structure(list(Seq = c(9143L, 916L, 9293L, 9301L, 9302L, 9304L,
9305L, 9306L, 9307L, 931L, 9311L), ID1 = c("ID1_1", "ID1_1",
NA, "ID1_2", "ID1_2", NA, "ID1_3", "ID1_3", "ID1_3", "ID1_4",
"ID1_4"), value_1 = c(30L, 30L, NA, 30L, 30L, NA, 30L, 30L, 30L,
50L, 50L), ID2 = c(NA, NA, "ID2_1", "ID2_2", "ID2_3", "ID2_4",
"ID2_4", "ID2_4", "ID2_4", "ID2_4", "ID2_5"), value_2 = c(NA,
NA, 33L, 200L, 46L, 58L, 58L, 58L, 58L, 58L, 46L)), class = "data.frame", row.names = c(NA,
-11L))
输出:
(例如,注意最后一行,value_1 = 80,因为 30+50 属于 ID1_3 和 ID1_4 的值之和)
我使用了 data.table 包中的 rleid()
函数,这是一个计算 运行 长度编码的有趣函数。这样做
library(data.table)
library(dplyr)
df %>%
mutate(d = cumsum( c(0, diff(rleid(ID1))) != 0 & c(0, diff(rleid(ID2))) != 0),
value_1 = value_1 * c(1, diff(rleid(ID1))),
value_2 = value_2 * c(1, diff(rleid(ID2)))) %>% group_by(d) %>%
summarise(Seq = toString(Seq),
value_1 = sum(value_1, na.rm = T),
value_2 = sum(value_2, na.rm = T)) %>%
ungroup() %>% select(-d)
# A tibble: 4 x 3
Seq value_1 value_2
<chr> <int> <int>
1 9143, 916 30 0
2 9293 0 33
3 9301, 9302 30 246
4 9304, 9305, 9306, 9307, 931, 9311 80 104
旧答案
df %>% group_by(d = cumsum( c(0, diff(rleid(ID1))) != 0 & c(0, diff(rleid(ID2))) != 0)) %>%
summarise(Seq = toString(Seq),
value_1 = sum(unique(value_1), na.rm = T),
value_2 = sum(unique(value_2), na.rm = T)) %>%
ungroup() %>% select(-d)
首先,我非常喜欢 AnilGoyal 的解决方案。我可以看到我需要开始使用 data.table 包。
话虽如此,我正在研究一种没有 data.table 的 dplyr 方法,这显然更加冗长。此外,我花了一段时间才弄清楚如何处理重复值。乘以 changei
列(0 或 1)删除了重复项。以下是我的方法:
df %>%
mutate_if(is.numeric, replace_na, 0) %>%
mutate_if(is.character, replace_na, "NA") %>%
mutate(
change1 = ID1 != lag(ID1, default = "Start"),
value_1 = value_1 * change1,
change2 = ID2 != lag(ID2, default = "Start"),
value_2 = value_2 * change2,
change = cumsum(change1 & change2)
) %>%
group_by(change) %>%
summarise(
Seq = toString(Seq),
value_1 = sum(value_1, na.rm = T),
value_2 = sum(value_2, na.rm = T)
) %>%
ungroup()
结果是:
df
# change Seq value_1 value_2
# <int> <chr> <dbl> <dbl>
# 1 1 9143, 916 30 0
# 2 2 9293 0 33
# 3 3 9301, 9302 30 246
# 4 4 9304, 9305, 9306, 9307, 931, 9311 80 104
没有上面那么简洁,而是一个Base R的解决方案none the less:
# Function to calculate the aggregate value: .agg_func => function()
.agg_func <- function(df, id_col, value_col){
sbst <- subset(
df,
!(is.na(df[,id_col])) & !(duplicated(df[,id_col])),
select = c(id_col, value_col)
)
return(sum(sbst[,value_col], na.rm = TRUE))
}
# Function to group data by ids: .grouping_func => function()
.grouping_func <- function(df, id_col){
r_l_e <- rle(df[,id_col])
rle_id <- rep(seq_along(r_l_e$values), times = r_l_e$lengths)
return(c(0, diff(rle_id)) != 0)
}
# Group the data: grpd_df => data.frame
grpd_df <- transform(
df,
grp = cumsum(
apply(
vapply(
names(df)[startsWith(names(df), "ID")],
function(x).grouping_func(df, x),
logical(nrow(df))
),
1,
all
)
)
)
# Split-apply-combine the aggregate function to the grouped data:
data.frame(do.call(rbind, lapply(with(grpd_df, split(grpd_df, grp)), function(s){
data.frame(
Seq = toString(s$Seq),
value_1 = .agg_func(s, "ID1", "value_1"),
value_2 = .agg_func(s, "ID2", "value_2")
)
}
)
), row.names = NULL, stringsAsFactors = FALSE
)