R 中的桑基图 - 数据准备
sankey diagram in R - data preparation
我有以下数据框,其中每个患者都是一行(我只显示其中的一个样本):
df = structure(list(firstY = c("N/A", "1", "3a", "3a", "3b", "1",
"2", "1", "5", "3b"), secondY = c("N/A", "1", "2", "3a", "4",
"1", "N/A", "1", "5", "3b"), ThirdY = c("N/A", "1", "N/A", "3b",
"4", "1", "N/A", "1", "N/A", "3b"), FourthY = c("N/A", "1", "N/A",
"3a", "4", "1", "N/A", "1", "N/A", "3a"), FifthY = c("N/A", "1",
"N/A", "2", "5", "1", "N/A", "N/A", "N/A", "3b")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
我想绘制一个 Sankey 图,它显示每个患者随时间的轨迹,我知道我必须创建节点和链接,但我在将数据转换为必要的格式时遇到了问题完成这个。具体来说,最有问题的问题是统计每个轨迹有多少患者,例如第一年有多少患者从stage 1到stage 2,以及所有其他组合。
如能提供数据准备方面的帮助,我们将不胜感激。
Alluvial 包虽然易于理解,但在数据量很大的情况下并不能很好地应对。
不太清楚你想要实现什么,因为你没有提到你想要使用的包,但看看你的数据,这似乎有帮助,如果你可以使用 alluvial
包裹:
library(alluvial) # sankey plots
library(dplyr) # data manipulation
alluvial
函数可以像您一样使用宽格式数据,但它需要一个频率列,因此我们可以创建它,然后绘制:
dats_all <- df %>% # data
group_by( firstY, secondY, ThirdY, FourthY, FifthY) %>% # group them
summarise(Freq = n()) # add frequencies
# now plot it
alluvial( dats_all[,1:5], freq=dats_all$Freq, border=NA )
另一方面,如果您想使用特定的包,您应该指定是哪个包。
编辑
使用 network3D 有点棘手,但您也许可以从中获得一些不错的结果。您需要 links 和节点,并让它们匹配,所以首先我们可以创建 links:
# put your df in two columns, and preserve the ordering in many levels (columns) with paste0
links <- data.frame(source = c(paste0(df$firstY,'_1'),paste0(df$secondY,'_2'),paste0(df$ThirdY,'_3'),paste0(df$FourthY,'_4')),
target = c(paste0(df$secondY,'_2'),paste0(df$ThirdY,'_3'),paste0(df$FourthY,'_4'),paste0(df$FifthY,'_5')))
# now convert as character
links$source <- as.character(links$source)
links$target<- as.character(links$target)
现在节点是 link 中的每个元素 unique()
方式:
nodes <- data.frame(name = unique(c(links$source, links$target)))
现在我们需要每个节点都有一个link(或vice-versa),所以我们匹配它们并进行数字转换。注意最后的-1,因为networkD3是0索引,表示数字(索引)从0开始。
links$source <- match(links$source, nodes$name) - 1
links$target <- match(links$target, nodes$name) - 1
links$value <- 1 # add also a value
现在您应该准备好绘制桑基图了:
sankeyNetwork(Links = links, Nodes = nodes, Source = 'source',
Target = 'target', Value = 'value', NodeID = 'name')
使用 ggforce:
library(ggforce)
library(dplyr)
xx <- df %>%
count(firstY, secondY, ThirdY, FourthY, FifthY, name = "value") %>%
gather_set_data(1:5) %>%
mutate(x = factor(x, levels = colnames(df)))
ggplot(xx, aes(x, id = id, split = y, value = value)) +
geom_parallel_sets(alpha = 0.3, axis.width = 0.1) +
geom_parallel_sets_axes(axis.width = 0.3) +
geom_parallel_sets_labels(colour = "white")
一个tidyverse way with networkd3
library(tidyr)
library(dplyr)
library(networkD3)
df <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
firstY secondY ThirdY FourthY FifthY
N/A N/A N/A N/A N/A
1 1 1 1 1
3a 2 N/A N/A N/A
3a 3a 3b 3a 2
3b 4 4 4 5
1 1 1 1 1
2 N/A N/A N/A N/A
1 1 1 1 N/A
5 5 N/A N/A N/A
3b 3b 3b 3a 3b
")
links <-
df %>%
mutate(row = row_number()) %>% # add a row id
pivot_longer(-row, names_to = "col", values_to = "source") %>% # gather all columns
mutate(col = match(col, names(df))) %>% # convert col names to col ids
mutate(source = paste0(source, '_', col)) %>% # add col id to node names
group_by(row) %>%
mutate(target = lead(source, order_by = col)) %>% # get target from following node in row
ungroup() %>%
filter(!is.na(target)) %>% # remove links from last column in original data
group_by(source, target) %>%
summarise(value = n(), .groups = "drop") # aggregate and count similar links
# create nodes data frame from unque nodes found in links data frame
nodes <- data.frame(id = unique(c(links$source, links$target)),
stringsAsFactors = FALSE)
# remove column id from node names
nodes$name <- sub('_[0-9]*$', '', nodes$id)
# create node ids in links data to the 0-based index of the nodes in the nodes data frame
links$source_id <- match(links$source, nodes$id) - 1
links$target_id <- match(links$target, nodes$id) - 1
sankeyNetwork(Links = links, Nodes = nodes, Source = 'source_id',
Target = 'target_id', Value = 'value', NodeID = 'name')
我有以下数据框,其中每个患者都是一行(我只显示其中的一个样本):
df = structure(list(firstY = c("N/A", "1", "3a", "3a", "3b", "1",
"2", "1", "5", "3b"), secondY = c("N/A", "1", "2", "3a", "4",
"1", "N/A", "1", "5", "3b"), ThirdY = c("N/A", "1", "N/A", "3b",
"4", "1", "N/A", "1", "N/A", "3b"), FourthY = c("N/A", "1", "N/A",
"3a", "4", "1", "N/A", "1", "N/A", "3a"), FifthY = c("N/A", "1",
"N/A", "2", "5", "1", "N/A", "N/A", "N/A", "3b")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
我想绘制一个 Sankey 图,它显示每个患者随时间的轨迹,我知道我必须创建节点和链接,但我在将数据转换为必要的格式时遇到了问题完成这个。具体来说,最有问题的问题是统计每个轨迹有多少患者,例如第一年有多少患者从stage 1到stage 2,以及所有其他组合。
如能提供数据准备方面的帮助,我们将不胜感激。
Alluvial 包虽然易于理解,但在数据量很大的情况下并不能很好地应对。
不太清楚你想要实现什么,因为你没有提到你想要使用的包,但看看你的数据,这似乎有帮助,如果你可以使用 alluvial
包裹:
library(alluvial) # sankey plots
library(dplyr) # data manipulation
alluvial
函数可以像您一样使用宽格式数据,但它需要一个频率列,因此我们可以创建它,然后绘制:
dats_all <- df %>% # data
group_by( firstY, secondY, ThirdY, FourthY, FifthY) %>% # group them
summarise(Freq = n()) # add frequencies
# now plot it
alluvial( dats_all[,1:5], freq=dats_all$Freq, border=NA )
另一方面,如果您想使用特定的包,您应该指定是哪个包。
编辑
使用 network3D 有点棘手,但您也许可以从中获得一些不错的结果。您需要 links 和节点,并让它们匹配,所以首先我们可以创建 links:
# put your df in two columns, and preserve the ordering in many levels (columns) with paste0
links <- data.frame(source = c(paste0(df$firstY,'_1'),paste0(df$secondY,'_2'),paste0(df$ThirdY,'_3'),paste0(df$FourthY,'_4')),
target = c(paste0(df$secondY,'_2'),paste0(df$ThirdY,'_3'),paste0(df$FourthY,'_4'),paste0(df$FifthY,'_5')))
# now convert as character
links$source <- as.character(links$source)
links$target<- as.character(links$target)
现在节点是 link 中的每个元素 unique()
方式:
nodes <- data.frame(name = unique(c(links$source, links$target)))
现在我们需要每个节点都有一个link(或vice-versa),所以我们匹配它们并进行数字转换。注意最后的-1,因为networkD3是0索引,表示数字(索引)从0开始。
links$source <- match(links$source, nodes$name) - 1
links$target <- match(links$target, nodes$name) - 1
links$value <- 1 # add also a value
现在您应该准备好绘制桑基图了:
sankeyNetwork(Links = links, Nodes = nodes, Source = 'source',
Target = 'target', Value = 'value', NodeID = 'name')
使用 ggforce:
library(ggforce)
library(dplyr)
xx <- df %>%
count(firstY, secondY, ThirdY, FourthY, FifthY, name = "value") %>%
gather_set_data(1:5) %>%
mutate(x = factor(x, levels = colnames(df)))
ggplot(xx, aes(x, id = id, split = y, value = value)) +
geom_parallel_sets(alpha = 0.3, axis.width = 0.1) +
geom_parallel_sets_axes(axis.width = 0.3) +
geom_parallel_sets_labels(colour = "white")
一个tidyverse way with networkd3
library(tidyr)
library(dplyr)
library(networkD3)
df <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
firstY secondY ThirdY FourthY FifthY
N/A N/A N/A N/A N/A
1 1 1 1 1
3a 2 N/A N/A N/A
3a 3a 3b 3a 2
3b 4 4 4 5
1 1 1 1 1
2 N/A N/A N/A N/A
1 1 1 1 N/A
5 5 N/A N/A N/A
3b 3b 3b 3a 3b
")
links <-
df %>%
mutate(row = row_number()) %>% # add a row id
pivot_longer(-row, names_to = "col", values_to = "source") %>% # gather all columns
mutate(col = match(col, names(df))) %>% # convert col names to col ids
mutate(source = paste0(source, '_', col)) %>% # add col id to node names
group_by(row) %>%
mutate(target = lead(source, order_by = col)) %>% # get target from following node in row
ungroup() %>%
filter(!is.na(target)) %>% # remove links from last column in original data
group_by(source, target) %>%
summarise(value = n(), .groups = "drop") # aggregate and count similar links
# create nodes data frame from unque nodes found in links data frame
nodes <- data.frame(id = unique(c(links$source, links$target)),
stringsAsFactors = FALSE)
# remove column id from node names
nodes$name <- sub('_[0-9]*$', '', nodes$id)
# create node ids in links data to the 0-based index of the nodes in the nodes data frame
links$source_id <- match(links$source, nodes$id) - 1
links$target_id <- match(links$target, nodes$id) - 1
sankeyNetwork(Links = links, Nodes = nodes, Source = 'source_id',
Target = 'target_id', Value = 'value', NodeID = 'name')