省略某些值时如何创建桑基图
How to create a sankey diagram when certain values are omitted
我需要在 R 中用 plotly 创建一张跨越 3 年的桑基图。我的 group 列应该作为节点(1 == worst、2 == bad、3 == good、4 == best),但是在 2019 年和 2020 年我还需要一个额外的节点 5 == not available。
我的数据非常大,所以我只给你看一小段:
# Sample customer data, one table per year. Each row is one customer
# (GPNRPlan is the customer id) and `group` holds the customer's group code.

# Customers observed in 2018.
dt.2018 <- structure(
  list(
    Year = rep(2018L, 10L),
    GPNRPlan = c(100236L, 101554L, 111328L, 124213L, 127434L,
                 128509L, 130058L, 130192L, 130224L, 130309L),
    TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB",
                "Below TB", "Over TB", "Below TB", "Below TB", "Below TB"),
    Qeff = c(-0.01, 0, 0, 0, 0, 0, 0, 0, -0.01, -0.01),
    group = c(1, 1, 3, 4, 2, 2, 1, 4, 2, 3)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)

# Customers observed in 2019.
dt.2019 <- structure(
  list(
    Year = rep(2019L, 10L),
    GPNRPlan = c(100236L, 101554L, 111328L, 124213L, 127434L,
                 128003L, 128509L, 130058L, 130192L, 130351L),
    TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB",
                "Over TB", "In TB", "Over TB", "Below TB", "Over TB"),
    Qeff = c(-0.01, 0.04, -0.01, 0, 0, 0, 0, 0, 0, 0),
    group = c(1, 2, 3, 1, 2, 4, 1, 1, 3, 2)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)

# Customers observed in 2020.
# The compact row.names attribute is set here as well, matching dt.2018 and
# dt.2019; without it base R reports this object as a 0-row data.frame.
dt.2020 <- structure(
  list(
    Year = rep(2020L, 10L),
    GPNRPlan = c(100236L, 111328L, 128003L, 130058L, 130192L,
                 133874L, 135886L, 137792L, 138153L, 142309L),
    TB.Info = c("Below TB", "In TB", "Over TB", "Below TB", "Below TB",
                "Over TB", "Below TB", "Over TB", "Over TB", "In TB"),
    Qeff = c(0, -0.01, 0, 0, -0.01, 0.02, -0.01, -0.01, 0.01, 0),
    group = c(2, 3, 1, 4, 2, 3, 1, 1, 2, 4)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)
现在我想看看 2018 年的客户(客户 ID == GPNRPlan)在 2019 年是留在原来的组还是换了组;如果他们在 2019 年不再出现,则应归入组 5,也就是 not available。从 2019 到 2020 也应做同样的处理。
这该如何实现?
能否在同一张桑基图中同时展示从 2018 到 2020 的变化?
所以我这里这个样本的桑基图看起来像这样(手工制作):
这主要是正确格式化数据的问题。
我把不同年份的 data.table 连接(join)起来,从而得到 NA 值。
此外,请查看不同的 arrangement(排列)选项。我认为你要求的输出无法 100% 实现——要么节点会重叠,要么使用 "snap" 时节点的顺序会被改变。
library(data.table)
library(plotly)
library(scales)
# Sample customer data, one table per year. Each row is one customer
# (GPNRPlan is the customer id) and `group` holds the customer's group code.

# Customers observed in 2018.
dt.2018 <- structure(
  list(
    Year = rep(2018L, 10L),
    GPNRPlan = c(100236L, 101554L, 111328L, 124213L, 127434L,
                 128509L, 130058L, 130192L, 130224L, 130309L),
    TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB",
                "Below TB", "Over TB", "Below TB", "Below TB", "Below TB"),
    Qeff = c(-0.01, 0, 0, 0, 0, 0, 0, 0, -0.01, -0.01),
    group = c(1, 1, 3, 4, 2, 2, 1, 4, 2, 3)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)

# Customers observed in 2019.
dt.2019 <- structure(
  list(
    Year = rep(2019L, 10L),
    GPNRPlan = c(100236L, 101554L, 111328L, 124213L, 127434L,
                 128003L, 128509L, 130058L, 130192L, 130351L),
    TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB",
                "Over TB", "In TB", "Over TB", "Below TB", "Over TB"),
    Qeff = c(-0.01, 0.04, -0.01, 0, 0, 0, 0, 0, 0, 0),
    group = c(1, 2, 3, 1, 2, 4, 1, 1, 3, 2)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)

# Customers observed in 2020.
# The compact row.names attribute is set here as well, matching dt.2018 and
# dt.2019; without it base R reports this object as a 0-row data.frame.
dt.2020 <- structure(
  list(
    Year = rep(2020L, 10L),
    GPNRPlan = c(100236L, 111328L, 128003L, 130058L, 130192L,
                 133874L, 135886L, 137792L, 138153L, 142309L),
    TB.Info = c("Below TB", "In TB", "Over TB", "Below TB", "Below TB",
                "Over TB", "Below TB", "Over TB", "Over TB", "In TB"),
    Qeff = c(0, -0.01, 0, 0, -0.01, 0.02, -0.01, -0.01, 0.01, 0),
    group = c(2, 3, 1, 4, 2, 3, 1, 1, 2, 4)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)
# Lookup table: group code -> display name and colour. The string "NA" is the
# code that paste0() produces for customers with no match in the target year.
lookUpDT <- data.table(
  group = c(as.character(1:4), "NA"),
  group_name = c("worst", "bad", "good", "best", "not available"),
  color = c("red", "orange", "yellow", "green", "darkgrey")
)

# One row per customer and year transition (2018 -> 2019, 2019 -> 2020).
# The left joins keep customers that are missing in the target year; their
# target columns are NA, so Year.target is filled in explicitly.
sankeyDT <- rbindlist(list(
  merge.data.table(
    dt.2018, dt.2019,
    by = "GPNRPlan", all.x = TRUE, suffixes = c(".source", ".target")
  )[, Year.target := 2019],
  merge.data.table(
    dt.2019, dt.2020,
    by = "GPNRPlan", all.x = TRUE, suffixes = c(".source", ".target")
  )[, Year.target := 2020]
))

# Node ids have the form "<year>_<group>"; a missing target group becomes
# "<year>_NA".
sankeyDT[, `:=`(
  node_id.source = paste0(Year.source, "_", group.source),
  node_id.target = paste0(Year.target, "_", group.target)
)]

# Convert the group codes to character so they can be joined to lookUpDT.
charCols <- c("group.source", "group.target")
for (col in charCols) {
  set(sankeyDT, j = col, value = as.character(sankeyDT[[col]]))
}

# Attach the name and colour of the source group to every link.
sankeyDT <- merge.data.table(
  sankeyDT, lookUpDT,
  by.x = "group.source", by.y = "group"
)

# Node table: one row per distinct node id appearing as source or target.
sankeyLabelsDT <- data.table(
  node_id = sort(
    unique(c(sankeyDT$node_id.source, sankeyDT$node_id.target)),
    na.last = TRUE
  )
)
sankeyLabelsDT[, c("year", "group") := tstrsplit(node_id, "_", fixed = TRUE)]

# Column index per year and row index per group, rescaled to plot coordinates.
sankeyLabelsDT[, x_scale := .GRP, by = year]
sankeyLabelsDT[, y_scale := .GRP, by = group]
sankeyLabelsDT[, x_scale := rescale(x_scale, to = c(0, 0.9))]
sankeyLabelsDT[, y_scale := rescale(y_scale, to = c(0.2, 0.75))]

# Add display name and colour, then build the node labels.
sankeyLabelsDT <- merge.data.table(sankeyLabelsDT, lookUpDT, by = "group")
sankeyLabelsDT[, label := paste(year, group_name, sep = " - ")]

# Final node order; link indices are later derived from this order.
setorder(sankeyLabelsDT, year, group, na.last = TRUE)
# Build the Sankey trace: nodes come from sankeyLabelsDT, links from sankeyDT
# (one link of value 1 per customer and year transition).
fig <- plot_ly(
  data = sankeyDT,
  type = "sankey",
  # Possible arrangements: snap - perpendicular - freeform - fixed
  arrangement = "perpendicular",
  orientation = "h",
  node = list(
    label = sankeyLabelsDT[["label"]],
    color = sankeyLabelsDT[["color"]],
    x = sankeyLabelsDT[["x_scale"]],
    y = sankeyLabelsDT[["y_scale"]],
    pad = 10 # 10 pixels
  ),
  link = list(
    # match() is 1-based; subtract 1 to get zero-based node indices.
    source = match(sankeyDT[["node_id.source"]], sankeyLabelsDT[["node_id"]]) - 1,
    target = match(sankeyDT[["node_id.target"]], sankeyLabelsDT[["node_id"]]) - 1,
    value = rep(1, nrow(sankeyDT)),
    label = paste("customer:", sankeyDT[["GPNRPlan"]]),
    color = sankeyDT[["color"]] # default: grey
  )
)

# Title and font size for the whole figure.
fig <- layout(
  fig,
  title = "Sankey Diagram",
  font = list(size = 10)
)

# Display the figure.
fig
我需要在 R 中用 plotly 创建一张跨越 3 年的桑基图。我的 group 列应该作为节点(1 == worst、2 == bad、3 == good、4 == best),但是在 2019 年和 2020 年我还需要一个额外的节点 5 == not available。
我的数据非常大,所以我只给你看一小段:
# Sample customer data, one table per year. Each row is one customer
# (GPNRPlan is the customer id) and `group` holds the customer's group code.

# Customers observed in 2018.
dt.2018 <- structure(
  list(
    Year = rep(2018L, 10L),
    GPNRPlan = c(100236L, 101554L, 111328L, 124213L, 127434L,
                 128509L, 130058L, 130192L, 130224L, 130309L),
    TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB",
                "Below TB", "Over TB", "Below TB", "Below TB", "Below TB"),
    Qeff = c(-0.01, 0, 0, 0, 0, 0, 0, 0, -0.01, -0.01),
    group = c(1, 1, 3, 4, 2, 2, 1, 4, 2, 3)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)

# Customers observed in 2019.
dt.2019 <- structure(
  list(
    Year = rep(2019L, 10L),
    GPNRPlan = c(100236L, 101554L, 111328L, 124213L, 127434L,
                 128003L, 128509L, 130058L, 130192L, 130351L),
    TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB",
                "Over TB", "In TB", "Over TB", "Below TB", "Over TB"),
    Qeff = c(-0.01, 0.04, -0.01, 0, 0, 0, 0, 0, 0, 0),
    group = c(1, 2, 3, 1, 2, 4, 1, 1, 3, 2)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)

# Customers observed in 2020.
# The compact row.names attribute is set here as well, matching dt.2018 and
# dt.2019; without it base R reports this object as a 0-row data.frame.
dt.2020 <- structure(
  list(
    Year = rep(2020L, 10L),
    GPNRPlan = c(100236L, 111328L, 128003L, 130058L, 130192L,
                 133874L, 135886L, 137792L, 138153L, 142309L),
    TB.Info = c("Below TB", "In TB", "Over TB", "Below TB", "Below TB",
                "Over TB", "Below TB", "Over TB", "Over TB", "In TB"),
    Qeff = c(0, -0.01, 0, 0, -0.01, 0.02, -0.01, -0.01, 0.01, 0),
    group = c(2, 3, 1, 4, 2, 3, 1, 1, 2, 4)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)
现在我想看看 2018 年的客户(客户 ID == GPNRPlan)在 2019 年是留在原来的组还是换了组;如果他们在 2019 年不再出现,则应归入组 5,也就是 not available。从 2019 到 2020 也应做同样的处理。
这该如何实现?
能否在同一张桑基图中同时展示从 2018 到 2020 的变化?
所以我这里这个样本的桑基图看起来像这样(手工制作):
这主要是正确格式化数据的问题。
我把不同年份的 data.table 连接(join)起来,从而得到 NA 值。
此外,请查看不同的 arrangement(排列)选项。我认为你要求的输出无法 100% 实现——要么节点会重叠,要么使用 "snap" 时节点的顺序会被改变。
library(data.table)
library(plotly)
library(scales)
# Sample customer data, one table per year. Each row is one customer
# (GPNRPlan is the customer id) and `group` holds the customer's group code.

# Customers observed in 2018.
dt.2018 <- structure(
  list(
    Year = rep(2018L, 10L),
    GPNRPlan = c(100236L, 101554L, 111328L, 124213L, 127434L,
                 128509L, 130058L, 130192L, 130224L, 130309L),
    TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB",
                "Below TB", "Over TB", "Below TB", "Below TB", "Below TB"),
    Qeff = c(-0.01, 0, 0, 0, 0, 0, 0, 0, -0.01, -0.01),
    group = c(1, 1, 3, 4, 2, 2, 1, 4, 2, 3)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)

# Customers observed in 2019.
dt.2019 <- structure(
  list(
    Year = rep(2019L, 10L),
    GPNRPlan = c(100236L, 101554L, 111328L, 124213L, 127434L,
                 128003L, 128509L, 130058L, 130192L, 130351L),
    TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB",
                "Over TB", "In TB", "Over TB", "Below TB", "Over TB"),
    Qeff = c(-0.01, 0.04, -0.01, 0, 0, 0, 0, 0, 0, 0),
    group = c(1, 2, 3, 1, 2, 4, 1, 1, 3, 2)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)

# Customers observed in 2020.
# The compact row.names attribute is set here as well, matching dt.2018 and
# dt.2019; without it base R reports this object as a 0-row data.frame.
dt.2020 <- structure(
  list(
    Year = rep(2020L, 10L),
    GPNRPlan = c(100236L, 111328L, 128003L, 130058L, 130192L,
                 133874L, 135886L, 137792L, 138153L, 142309L),
    TB.Info = c("Below TB", "In TB", "Over TB", "Below TB", "Below TB",
                "Over TB", "Below TB", "Over TB", "Over TB", "In TB"),
    Qeff = c(0, -0.01, 0, 0, -0.01, 0.02, -0.01, -0.01, 0.01, 0),
    group = c(2, 3, 1, 4, 2, 3, 1, 1, 2, 4)
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -10L)
)
# Lookup table: group code -> display name and colour. The string "NA" is the
# code that paste0() produces for customers with no match in the target year.
lookUpDT <- data.table(
  group = c(as.character(1:4), "NA"),
  group_name = c("worst", "bad", "good", "best", "not available"),
  color = c("red", "orange", "yellow", "green", "darkgrey")
)

# One row per customer and year transition (2018 -> 2019, 2019 -> 2020).
# The left joins keep customers that are missing in the target year; their
# target columns are NA, so Year.target is filled in explicitly.
sankeyDT <- rbindlist(list(
  merge.data.table(
    dt.2018, dt.2019,
    by = "GPNRPlan", all.x = TRUE, suffixes = c(".source", ".target")
  )[, Year.target := 2019],
  merge.data.table(
    dt.2019, dt.2020,
    by = "GPNRPlan", all.x = TRUE, suffixes = c(".source", ".target")
  )[, Year.target := 2020]
))

# Node ids have the form "<year>_<group>"; a missing target group becomes
# "<year>_NA".
sankeyDT[, `:=`(
  node_id.source = paste0(Year.source, "_", group.source),
  node_id.target = paste0(Year.target, "_", group.target)
)]

# Convert the group codes to character so they can be joined to lookUpDT.
charCols <- c("group.source", "group.target")
for (col in charCols) {
  set(sankeyDT, j = col, value = as.character(sankeyDT[[col]]))
}

# Attach the name and colour of the source group to every link.
sankeyDT <- merge.data.table(
  sankeyDT, lookUpDT,
  by.x = "group.source", by.y = "group"
)

# Node table: one row per distinct node id appearing as source or target.
sankeyLabelsDT <- data.table(
  node_id = sort(
    unique(c(sankeyDT$node_id.source, sankeyDT$node_id.target)),
    na.last = TRUE
  )
)
sankeyLabelsDT[, c("year", "group") := tstrsplit(node_id, "_", fixed = TRUE)]

# Column index per year and row index per group, rescaled to plot coordinates.
sankeyLabelsDT[, x_scale := .GRP, by = year]
sankeyLabelsDT[, y_scale := .GRP, by = group]
sankeyLabelsDT[, x_scale := rescale(x_scale, to = c(0, 0.9))]
sankeyLabelsDT[, y_scale := rescale(y_scale, to = c(0.2, 0.75))]

# Add display name and colour, then build the node labels.
sankeyLabelsDT <- merge.data.table(sankeyLabelsDT, lookUpDT, by = "group")
sankeyLabelsDT[, label := paste(year, group_name, sep = " - ")]

# Final node order; link indices are later derived from this order.
setorder(sankeyLabelsDT, year, group, na.last = TRUE)
# Build the Sankey trace: nodes come from sankeyLabelsDT, links from sankeyDT
# (one link of value 1 per customer and year transition).
fig <- plot_ly(
  data = sankeyDT,
  type = "sankey",
  # Possible arrangements: snap - perpendicular - freeform - fixed
  arrangement = "perpendicular",
  orientation = "h",
  node = list(
    label = sankeyLabelsDT[["label"]],
    color = sankeyLabelsDT[["color"]],
    x = sankeyLabelsDT[["x_scale"]],
    y = sankeyLabelsDT[["y_scale"]],
    pad = 10 # 10 pixels
  ),
  link = list(
    # match() is 1-based; subtract 1 to get zero-based node indices.
    source = match(sankeyDT[["node_id.source"]], sankeyLabelsDT[["node_id"]]) - 1,
    target = match(sankeyDT[["node_id.target"]], sankeyLabelsDT[["node_id"]]) - 1,
    value = rep(1, nrow(sankeyDT)),
    label = paste("customer:", sankeyDT[["GPNRPlan"]]),
    color = sankeyDT[["color"]] # default: grey
  )
)

# Title and font size for the whole figure.
fig <- layout(
  fig,
  title = "Sankey Diagram",
  font = list(size = 10)
)

# Display the figure.
fig