如何为 R 中的桑基图准备输入数据?
How to prepare input data for a sankey diagrams in R?
我正在尝试生成一个 sankey diagram in R, which is also referred as a river plot. I've seen this question Sankey Diagrams in R?,其中列出了生成桑基图的各种包。因为我有输入数据并且知道不同的 tools/packages 我可以制作这样的图表但是我的问题是:我如何为这样的输入数据准备?
假设我们想展示用户如何在 10 天内在不同状态之间迁移,并且开始数据集如下所示:
data.frame(userID = 1:100,
day1_state = sample(letters[1:8], replace = TRUE, size = 100),
day2_state = sample(letters[1:8], replace = TRUE, size = 100),
day3_state = sample(letters[1:8], replace = TRUE, size = 100),
day4_state = sample(letters[1:8], replace = TRUE, size = 100),
day5_state = sample(letters[1:8], replace = TRUE, size = 100),
day6_state = sample(letters[1:8], replace = TRUE, size = 100),
day7_state = sample(letters[1:8], replace = TRUE, size = 100),
day8_state = sample(letters[1:8], replace = TRUE, size = 100),
day9_state = sample(letters[1:8], replace = TRUE, size = 100),
day10_state = sample(letters[1:8], replace = TRUE, size = 100)
) -> dt
现在,如果想用 networkD3
package 创建桑基图,应该如何将此 dt
data.frame 转换为所需的输入
这样我们就可以像这个例子一样输入
library(networkD3)
URL <- paste0(
"https://cdn.rawgit.com/christophergandrud/networkD3/",
"master/JSONdata/energy.json")
Energy <- jsonlite::fromJSON(URL)
# Plot
sankeyNetwork(Links = Energy$links, Nodes = Energy$nodes, Source = "source",
Target = "target", Value = "value", NodeID = "name",
units = "TWh", fontSize = 12, nodeWidth = 30)
编辑
我发现了这样的脚本,它在其他情况下准备数据并复制了它,所以我认为它现在可能会被关闭:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
我发现了这样的脚本,它在其他情况下准备数据并复制了它,所以我认为它现在可能会被关闭:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
然后这段代码为问题data.frame
生成这样的sankey图
fixtable <- function(...) {
tab <- table(...)
if (substr(colnames(tab)[1],1,1) == "_" &
substr(rownames(tab)[1],1,1) == "_") {
tab2 <- tab
colnames(tab2) <- sapply(strsplit(colnames(tab2), split=" "), `[`, 1)
rownames(tab2) <- sapply(strsplit(rownames(tab2), split=" "), `[`, 1)
tab2[1,1] <- 0
# mandat w klubie
for (par in names(which(tab2[1,] > 0))) {
delta = min(tab2[par, 1], tab2[1, par])
tab2[par, par] = tab2[par, par] + delta
tab2[1, par] = tab2[1, par] - delta
tab2[par, 1] = tab2[par, 1] - delta
}
# przechodzi przez niezalezy
for (par in names(which(tab2[1,] > 0))) {
tab2["niez.", par] = tab2["niez.", par] + tab2[1, par]
tab2[1, par] = 0
}
for (par in names(which(tab2[,1] > 0))) {
tab2[par, "niez."] = tab2[par, "niez."] + tab2[par, 1]
tab2[par, 1] = 0
}
tab[] <- tab2[]
}
tab
}
flow2 <- rbind(
data.frame(fixtable(z = paste0(dat$day1_state, " day1"), do = paste0(dat$day2_state, " day2"))),
data.frame(fixtable(z = paste0(dat$day2_state, " day2"), do = paste0(dat$day3_state, " day3"))),
data.frame(fixtable(z = paste0(dat$day3_state, " day3"), do = paste0(dat$day4_state, " day4"))),
data.frame(fixtable(z = paste0(dat$day4_state, " day4"), do = paste0(dat$day5_state, " day5"))),
data.frame(fixtable(z = paste0(dat$day5_state, " day5"), do = paste0(dat$day6_state, " day6"))),
data.frame(fixtable(z = paste0(dat$day6_state, " day6"), do = paste0(dat$day7_state, " day7"))),
data.frame(fixtable(z = paste0(dat$day7_state, " day7"), do = paste0(dat$day8_state, " day8"))),
data.frame(fixtable(z = paste0(dat$day8_state, " day8"), do = paste0(dat$day9_state, " day9"))),
data.frame(fixtable(z = paste0(dat$day9_state, " day9"), do = paste0(dat$day10_state, " day10"))))
flow2 <- flow2[flow2[,3] > 0,]
nodes2 <- data.frame(name=unique(c(levels(factor(flow2[,1])), levels(factor(flow2[,2])))))
nam2 <- seq_along(nodes2[,1])-1
names(nam2) <- nodes2[,1]
links2 <- data.frame(source = nam2[as.character(flow2[,1])],
target = nam2[as.character(flow2[,2])],
value = flow2[,3])
sankeyNetwork(Links = links, Nodes = nodes,
Source = "source", Target = "target",
Value = "value", NodeID = "name",
fontFamily = "Arial", fontSize = 12, nodeWidth = 40,
colourScale = "d3.scale.category20()")
我想我最好 post 在这里说明如何用 tidyverse
魔法完成。
library(ggplot2)
library(ggalluvial)
library(tidyr)
library(dplyr)
library(stringr)
# The actual data preperation happens here
dt_new <- dt %>%
gather(day, state, -userID) %>% # Long format
mutate(day = str_match(day, "[0-9]+")[,1]) %>% # Get the numbers
mutate(day = as.integer(day), # Convert to proper data types
state = as.factor(state))
这是数据 dt_new
的样子
userID day state
1 1 1 d
2 2 1 d
3 3 1 g
4 4 1 a
5 5 1 a
6 6 1 d
7 7 1 d
8 8 1 b
9 9 1 d
10 10 1 e
...
现在绘制桑基图:
ggplot(dt_new,
aes(x = day, stratum = state, alluvium = userID, fill = state, label = state)) +
geom_stratum() +
geom_text(stat = "stratum") +
geom_flow()
这是输出
我正在尝试生成一个 sankey diagram in R, which is also referred as a river plot. I've seen this question Sankey Diagrams in R?,其中列出了生成桑基图的各种包。因为我有输入数据并且知道不同的 tools/packages 我可以制作这样的图表但是我的问题是:我如何为这样的输入数据准备?
假设我们想展示用户如何在 10 天内在不同状态之间迁移,并且开始数据集如下所示:
data.frame(userID = 1:100,
day1_state = sample(letters[1:8], replace = TRUE, size = 100),
day2_state = sample(letters[1:8], replace = TRUE, size = 100),
day3_state = sample(letters[1:8], replace = TRUE, size = 100),
day4_state = sample(letters[1:8], replace = TRUE, size = 100),
day5_state = sample(letters[1:8], replace = TRUE, size = 100),
day6_state = sample(letters[1:8], replace = TRUE, size = 100),
day7_state = sample(letters[1:8], replace = TRUE, size = 100),
day8_state = sample(letters[1:8], replace = TRUE, size = 100),
day9_state = sample(letters[1:8], replace = TRUE, size = 100),
day10_state = sample(letters[1:8], replace = TRUE, size = 100)
) -> dt
现在,如果想用 networkD3
package 创建桑基图,应该如何将此 dt
data.frame 转换为所需的输入
这样我们就可以像这个例子一样输入
library(networkD3)
URL <- paste0(
"https://cdn.rawgit.com/christophergandrud/networkD3/",
"master/JSONdata/energy.json")
Energy <- jsonlite::fromJSON(URL)
# Plot
sankeyNetwork(Links = Energy$links, Nodes = Energy$nodes, Source = "source",
Target = "target", Value = "value", NodeID = "name",
units = "TWh", fontSize = 12, nodeWidth = 30)
编辑
我发现了这样的脚本,它在其他情况下准备数据并复制了它,所以我认为它现在可能会被关闭:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
我发现了这样的脚本,它在其他情况下准备数据并复制了它,所以我认为它现在可能会被关闭:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
然后这段代码为问题data.frame
生成这样的sankey图fixtable <- function(...) {
tab <- table(...)
if (substr(colnames(tab)[1],1,1) == "_" &
substr(rownames(tab)[1],1,1) == "_") {
tab2 <- tab
colnames(tab2) <- sapply(strsplit(colnames(tab2), split=" "), `[`, 1)
rownames(tab2) <- sapply(strsplit(rownames(tab2), split=" "), `[`, 1)
tab2[1,1] <- 0
# mandat w klubie
for (par in names(which(tab2[1,] > 0))) {
delta = min(tab2[par, 1], tab2[1, par])
tab2[par, par] = tab2[par, par] + delta
tab2[1, par] = tab2[1, par] - delta
tab2[par, 1] = tab2[par, 1] - delta
}
# przechodzi przez niezalezy
for (par in names(which(tab2[1,] > 0))) {
tab2["niez.", par] = tab2["niez.", par] + tab2[1, par]
tab2[1, par] = 0
}
for (par in names(which(tab2[,1] > 0))) {
tab2[par, "niez."] = tab2[par, "niez."] + tab2[par, 1]
tab2[par, 1] = 0
}
tab[] <- tab2[]
}
tab
}
flow2 <- rbind(
data.frame(fixtable(z = paste0(dat$day1_state, " day1"), do = paste0(dat$day2_state, " day2"))),
data.frame(fixtable(z = paste0(dat$day2_state, " day2"), do = paste0(dat$day3_state, " day3"))),
data.frame(fixtable(z = paste0(dat$day3_state, " day3"), do = paste0(dat$day4_state, " day4"))),
data.frame(fixtable(z = paste0(dat$day4_state, " day4"), do = paste0(dat$day5_state, " day5"))),
data.frame(fixtable(z = paste0(dat$day5_state, " day5"), do = paste0(dat$day6_state, " day6"))),
data.frame(fixtable(z = paste0(dat$day6_state, " day6"), do = paste0(dat$day7_state, " day7"))),
data.frame(fixtable(z = paste0(dat$day7_state, " day7"), do = paste0(dat$day8_state, " day8"))),
data.frame(fixtable(z = paste0(dat$day8_state, " day8"), do = paste0(dat$day9_state, " day9"))),
data.frame(fixtable(z = paste0(dat$day9_state, " day9"), do = paste0(dat$day10_state, " day10"))))
flow2 <- flow2[flow2[,3] > 0,]
nodes2 <- data.frame(name=unique(c(levels(factor(flow2[,1])), levels(factor(flow2[,2])))))
nam2 <- seq_along(nodes2[,1])-1
names(nam2) <- nodes2[,1]
links2 <- data.frame(source = nam2[as.character(flow2[,1])],
target = nam2[as.character(flow2[,2])],
value = flow2[,3])
sankeyNetwork(Links = links, Nodes = nodes,
Source = "source", Target = "target",
Value = "value", NodeID = "name",
fontFamily = "Arial", fontSize = 12, nodeWidth = 40,
colourScale = "d3.scale.category20()")
tidyverse
魔法完成。
library(ggplot2)
library(ggalluvial)
library(tidyr)
library(dplyr)
library(stringr)
# The actual data preperation happens here
dt_new <- dt %>%
gather(day, state, -userID) %>% # Long format
mutate(day = str_match(day, "[0-9]+")[,1]) %>% # Get the numbers
mutate(day = as.integer(day), # Convert to proper data types
state = as.factor(state))
这是数据 dt_new
的样子
userID day state
1 1 1 d
2 2 1 d
3 3 1 g
4 4 1 a
5 5 1 a
6 6 1 d
7 7 1 d
8 8 1 b
9 9 1 d
10 10 1 e
...
现在绘制桑基图:
ggplot(dt_new,
aes(x = day, stratum = state, alluvium = userID, fill = state, label = state)) +
geom_stratum() +
geom_text(stat = "stratum") +
geom_flow()
这是输出