在 R 中创建一个包含整洁数据的桑基图
Create a Sankey Diagram with tidy data in R
我使用 dplyr 和 tidyr 函数在 R 中 'tidied' 我的数据,并创建了一个如下所示的数据框:
# Example data for the three-part Sankey diagram (PROD, REJECT, ALT_PROD),
# with the size of each flow in VALUE.
df <- data.frame(
  PROD     = rep("A", 4),
  REJECT   = rep(c("YES", "NO"), each = 2),
  ALT_PROD = LETTERS[1:4],
  VALUE    = c(100, 50, 400, 500)
)
我希望根据上述值绘制一个 3 部分桑基图。我发现的大多数示例都使用 2 部分图(从 -> 到),但我希望包括中间部分“REJECT”。
我还发现了一些包含多个部分的示例,但由于我在 R 方面缺乏经验,我无法照着这些示例进行操作。
我可以选择使用 flipPlots 包,但由于包更新问题,我无法从 GitHub 安装该包:
Error: Failed to install 'flipPlots' from GitHub:
Failed to install 'flipTransformations' from GitHub:
Failed to install 'flipFormat' from GitHub:
(converted from warning) cannot remove prior installation of package ‘jsonlite’
我以前使用 networkD3 包创建了一个两部分的图,我真的很想更好地理解如何扩展它来构建一个 3 部分的图。
您可以尝试使用在这个 Kaggle notebook 中定义的 sankey_from_data_frame() 函数。它需要 dplyr、tidyr、purrr、tidygraph 和 networkD3。
我最近也遇到了 flipPlots 的安装问题,所以也许可以暂时不使用它。
library(dplyr)
library(tidyr)
library(purrr)
library(tidygraph)
library(networkD3)
## copy the code from the Kaggle notebook here
## sankey_from_data_frame <- ...
然后:
# Draw the diagram from df; val_col = VALUE presumably names the column that
# holds the flow sizes (the function itself comes from the Kaggle notebook).
sankey_from_data_frame(
  data = df,
  val_col = VALUE
)
生成:
请注意“loop-back”边,这是由 PROD 和 ALT_PROD 中的相同名称产生的。如果您希望 ALT_PROD value = A 位于右侧,一种解决方案是重命名 PROD 值:
# Prefix the PROD labels so that "A" in PROD and "A" in ALT_PROD no longer
# share a name, which is what produced the loop-back edge.
# Uses `df`, the data frame defined in this example; the previous `df1` was
# never defined and made the call fail with "object 'df1' not found".
sankey_from_data_frame(
  data = mutate(df, PROD = paste0("PROD ", PROD)),
  val_col = VALUE
)
结果:
您需要构建一个符合 'source'、'target'、... 风格的链接数据框。在您的情况下,每个连续的列(除了 VALUE 列)都是前一列的目标。您可以根据各列的先后顺序推断每个步骤的顺序,从而重塑数据...
library(networkD3)
library(dplyr)
library(tidyr)
# Example data: the columns PROD, REJECT and ALT_PROD are the three stages of
# the diagram, VALUE is the size of the flow for that row.
df <- data.frame(
  PROD     = rep("A", 4),
  REJECT   = rep(c("YES", "NO"), each = 2),
  ALT_PROD = LETTERS[1:4],
  VALUE    = c(100, 50, 400, 500)
)
# Reshape df into a link table: one row per source -> target pair of adjacent
# columns, with the VALUE flowing between the two summed up.
links <-
  df %>%
  as_tibble() %>%
  # remember which original row every value came from
  mutate(row = row_number()) %>%
  # stack every column except row and VALUE into column/source pairs
  pivot_longer(
    cols = -c(row, VALUE),
    names_to = "column",
    values_to = "source"
  ) %>%
  mutate(
    # replace each column name by its position in df
    column = match(column, names(df)),
    # tag the label with that position so equal labels in different columns
    # (e.g. "A" in PROD and in ALT_PROD) stay distinct nodes
    source = paste0(source, "__", column)
  ) %>%
  group_by(row) %>%
  # within a row, the target is the label taken from the next column
  mutate(target = lead(source, order_by = column)) %>%
  # the last column has no following column, so its target is NA
  drop_na(target, source) %>%
  group_by(source, target) %>%
  summarise(value = sum(VALUE), .groups = "drop")
# Node table: every distinct tagged label that occurs as a source or a target.
nodes <- data.frame(name = unique(c(links$source, links$target)))

# sankeyNetwork() documents Links as a data frame; pass a plain data.frame
# rather than the tibble produced by summarise(), so networkD3 does not have
# to convert it itself.
links <- as.data.frame(links)

# Replace the labels by their position in the node table. The `- 1` turns the
# one-based result of match() into a zero-based index.
links$source <- match(links$source, nodes$name) - 1
links$target <- match(links$target, nodes$name) - 1

# Strip the "__<column number>" tag so only the original label is displayed.
nodes$name <- sub('__[0-9]+$', '', nodes$name)

sankeyNetwork(Links = links, Nodes = nodes, Source = "source",
              Target = "target", Value = "value", NodeID = "name")
为了使流程更加清晰,下面是您需要构建的 links 数据框在流程中每个重要步骤之后的样子...
df %>%
as_tibble() %>%
mutate(row = row_number())
#> # A tibble: 4 x 5
#> PROD REJECT ALT_PROD VALUE row
#> <chr> <chr> <chr> <dbl> <int>
#> 1 A YES A 100 1
#> 2 A YES B 50 2
#> 3 A NO C 400 3
#> 4 A NO D 500 4
df %>%
as_tibble() %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(-row, -VALUE),
names_to = 'column', values_to = 'source')
#> # A tibble: 12 x 4
#> VALUE row column source
#> <dbl> <int> <chr> <chr>
#> 1 100 1 PROD A
#> 2 100 1 REJECT YES
#> 3 100 1 ALT_PROD A
#> 4 50 2 PROD A
#> 5 50 2 REJECT YES
#> 6 50 2 ALT_PROD B
#> 7 400 3 PROD A
#> 8 400 3 REJECT NO
#> 9 400 3 ALT_PROD C
#> 10 500 4 PROD A
#> 11 500 4 REJECT NO
#> 12 500 4 ALT_PROD D
df %>%
as_tibble() %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(-row, -VALUE),
names_to = 'column', values_to = 'source') %>%
mutate(column = match(column, names(df))) %>%
mutate(source = paste0(source, '__', column))
#> # A tibble: 12 x 4
#> VALUE row column source
#> <dbl> <int> <int> <chr>
#> 1 100 1 1 A__1
#> 2 100 1 2 YES__2
#> 3 100 1 3 A__3
#> 4 50 2 1 A__1
#> 5 50 2 2 YES__2
#> 6 50 2 3 B__3
#> 7 400 3 1 A__1
#> 8 400 3 2 NO__2
#> 9 400 3 3 C__3
#> 10 500 4 1 A__1
#> 11 500 4 2 NO__2
#> 12 500 4 3 D__3
df %>%
as_tibble() %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(-row, -VALUE),
names_to = 'column', values_to = 'source') %>%
mutate(column = match(column, names(df))) %>%
mutate(source = paste0(source, '__', column)) %>%
group_by(row) %>%
mutate(target = lead(source, order_by = column))
#> # A tibble: 12 x 5
#> # Groups: row [4]
#> VALUE row column source target
#> <dbl> <int> <int> <chr> <chr>
#> 1 100 1 1 A__1 YES__2
#> 2 100 1 2 YES__2 A__3
#> 3 100 1 3 A__3 <NA>
#> 4 50 2 1 A__1 YES__2
#> 5 50 2 2 YES__2 B__3
#> 6 50 2 3 B__3 <NA>
#> 7 400 3 1 A__1 NO__2
#> 8 400 3 2 NO__2 C__3
#> 9 400 3 3 C__3 <NA>
#> 10 500 4 1 A__1 NO__2
#> 11 500 4 2 NO__2 D__3
#> 12 500 4 3 D__3 <NA>
df %>%
as_tibble() %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(-row, -VALUE),
names_to = 'column', values_to = 'source') %>%
mutate(column = match(column, names(df))) %>%
mutate(source = paste0(source, '__', column)) %>%
group_by(row) %>%
mutate(target = lead(source, order_by = column)) %>%
drop_na(target, source)
#> # A tibble: 8 x 5
#> # Groups: row [4]
#> VALUE row column source target
#> <dbl> <int> <int> <chr> <chr>
#> 1 100 1 1 A__1 YES__2
#> 2 100 1 2 YES__2 A__3
#> 3 50 2 1 A__1 YES__2
#> 4 50 2 2 YES__2 B__3
#> 5 400 3 1 A__1 NO__2
#> 6 400 3 2 NO__2 C__3
#> 7 500 4 1 A__1 NO__2
#> 8 500 4 2 NO__2 D__3
df %>%
as_tibble() %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(-row, -VALUE),
names_to = 'column', values_to = 'source') %>%
mutate(column = match(column, names(df))) %>%
mutate(source = paste0(source, '__', column)) %>%
group_by(row) %>%
mutate(target = lead(source, order_by = column)) %>%
drop_na(target, source) %>%
group_by(source, target) %>%
summarise(value = sum(VALUE), .groups = 'drop')
#> # A tibble: 6 x 3
#> source target value
#> <chr> <chr> <dbl>
#> 1 A__1 NO__2 900
#> 2 A__1 YES__2 150
#> 3 NO__2 C__3 400
#> 4 NO__2 D__3 500
#> 5 YES__2 A__3 100
#> 6 YES__2 B__3 50
我使用 dplyr 和 tidyr 函数在 R 中 'tidied' 我的数据,并创建了一个如下所示的数据框:
# Example data for the three-part Sankey diagram (PROD, REJECT, ALT_PROD),
# with the size of each flow in VALUE.
df <- data.frame(
  PROD     = rep("A", 4),
  REJECT   = rep(c("YES", "NO"), each = 2),
  ALT_PROD = LETTERS[1:4],
  VALUE    = c(100, 50, 400, 500)
)
我希望根据上述值绘制一个 3 部分桑基图。我发现的大多数示例都使用 2 部分图(从 -> 到),但我希望包括中间部分“REJECT”。我还发现了一些包含多个部分的示例,但由于我在 R 方面缺乏经验,我无法照着这些示例进行操作。
我可以选择使用 flipPlots 包,但由于包更新问题,我无法从 GitHub 安装该包:
Error: Failed to install 'flipPlots' from GitHub:
Failed to install 'flipTransformations' from GitHub:
Failed to install 'flipFormat' from GitHub:
(converted from warning) cannot remove prior installation of package ‘jsonlite’
我以前使用 networkD3 包创建了一个两部分的图,我真的很想更好地理解如何扩展它来构建一个 3 部分的图。
您可以尝试使用在这个 Kaggle notebook 中定义的 sankey_from_data_frame() 函数。它需要 dplyr、tidyr、purrr、tidygraph 和 networkD3。
我最近也遇到了 flipPlots 的安装问题,所以也许可以暂时不使用它。
library(dplyr)
library(tidyr)
library(purrr)
library(tidygraph)
library(networkD3)
## copy the code from the Kaggle notebook here
## sankey_from_data_frame <- ...
然后:
# Draw the diagram from df; val_col = VALUE presumably names the column that
# holds the flow sizes (the function itself comes from the Kaggle notebook).
sankey_from_data_frame(
  data = df,
  val_col = VALUE
)
生成:
请注意“loop-back”边,这是由 PROD 和 ALT_PROD 中的相同名称产生的。如果您希望 ALT_PROD value = A 位于右侧,一种解决方案是重命名 PROD 值:
# Prefix the PROD labels so that "A" in PROD and "A" in ALT_PROD no longer
# share a name, which is what produced the loop-back edge.
# Uses `df`, the data frame defined in this example; the previous `df1` was
# never defined and made the call fail with "object 'df1' not found".
sankey_from_data_frame(
  data = mutate(df, PROD = paste0("PROD ", PROD)),
  val_col = VALUE
)
结果:
您需要构建一个符合 'source'、'target'、... 风格的链接数据框。在您的情况下,每个连续的列(除了 VALUE 列)都是前一列的目标。您可以根据各列的先后顺序推断每个步骤的顺序,从而重塑数据...
library(networkD3)
library(dplyr)
library(tidyr)
# Example data: the columns PROD, REJECT and ALT_PROD are the three stages of
# the diagram, VALUE is the size of the flow for that row.
df <- data.frame(
  PROD     = rep("A", 4),
  REJECT   = rep(c("YES", "NO"), each = 2),
  ALT_PROD = LETTERS[1:4],
  VALUE    = c(100, 50, 400, 500)
)
# Reshape df into a link table: one row per source -> target pair of adjacent
# columns, with the VALUE flowing between the two summed up.
links <-
  df %>%
  as_tibble() %>%
  # remember which original row every value came from
  mutate(row = row_number()) %>%
  # stack every column except row and VALUE into column/source pairs
  pivot_longer(
    cols = -c(row, VALUE),
    names_to = "column",
    values_to = "source"
  ) %>%
  mutate(
    # replace each column name by its position in df
    column = match(column, names(df)),
    # tag the label with that position so equal labels in different columns
    # (e.g. "A" in PROD and in ALT_PROD) stay distinct nodes
    source = paste0(source, "__", column)
  ) %>%
  group_by(row) %>%
  # within a row, the target is the label taken from the next column
  mutate(target = lead(source, order_by = column)) %>%
  # the last column has no following column, so its target is NA
  drop_na(target, source) %>%
  group_by(source, target) %>%
  summarise(value = sum(VALUE), .groups = "drop")
# Node table: every distinct tagged label that occurs as a source or a target.
nodes <- data.frame(name = unique(c(links$source, links$target)))

# sankeyNetwork() documents Links as a data frame; pass a plain data.frame
# rather than the tibble produced by summarise(), so networkD3 does not have
# to convert it itself.
links <- as.data.frame(links)

# Replace the labels by their position in the node table. The `- 1` turns the
# one-based result of match() into a zero-based index.
links$source <- match(links$source, nodes$name) - 1
links$target <- match(links$target, nodes$name) - 1

# Strip the "__<column number>" tag so only the original label is displayed.
nodes$name <- sub('__[0-9]+$', '', nodes$name)

sankeyNetwork(Links = links, Nodes = nodes, Source = "source",
              Target = "target", Value = "value", NodeID = "name")
为了使流程更加清晰,下面是您需要构建的 links 数据框在流程中每个重要步骤之后的样子...
df %>%
as_tibble() %>%
mutate(row = row_number())
#> # A tibble: 4 x 5
#> PROD REJECT ALT_PROD VALUE row
#> <chr> <chr> <chr> <dbl> <int>
#> 1 A YES A 100 1
#> 2 A YES B 50 2
#> 3 A NO C 400 3
#> 4 A NO D 500 4
df %>%
as_tibble() %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(-row, -VALUE),
names_to = 'column', values_to = 'source')
#> # A tibble: 12 x 4
#> VALUE row column source
#> <dbl> <int> <chr> <chr>
#> 1 100 1 PROD A
#> 2 100 1 REJECT YES
#> 3 100 1 ALT_PROD A
#> 4 50 2 PROD A
#> 5 50 2 REJECT YES
#> 6 50 2 ALT_PROD B
#> 7 400 3 PROD A
#> 8 400 3 REJECT NO
#> 9 400 3 ALT_PROD C
#> 10 500 4 PROD A
#> 11 500 4 REJECT NO
#> 12 500 4 ALT_PROD D
df %>%
as_tibble() %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(-row, -VALUE),
names_to = 'column', values_to = 'source') %>%
mutate(column = match(column, names(df))) %>%
mutate(source = paste0(source, '__', column))
#> # A tibble: 12 x 4
#> VALUE row column source
#> <dbl> <int> <int> <chr>
#> 1 100 1 1 A__1
#> 2 100 1 2 YES__2
#> 3 100 1 3 A__3
#> 4 50 2 1 A__1
#> 5 50 2 2 YES__2
#> 6 50 2 3 B__3
#> 7 400 3 1 A__1
#> 8 400 3 2 NO__2
#> 9 400 3 3 C__3
#> 10 500 4 1 A__1
#> 11 500 4 2 NO__2
#> 12 500 4 3 D__3
df %>%
as_tibble() %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(-row, -VALUE),
names_to = 'column', values_to = 'source') %>%
mutate(column = match(column, names(df))) %>%
mutate(source = paste0(source, '__', column)) %>%
group_by(row) %>%
mutate(target = lead(source, order_by = column))
#> # A tibble: 12 x 5
#> # Groups: row [4]
#> VALUE row column source target
#> <dbl> <int> <int> <chr> <chr>
#> 1 100 1 1 A__1 YES__2
#> 2 100 1 2 YES__2 A__3
#> 3 100 1 3 A__3 <NA>
#> 4 50 2 1 A__1 YES__2
#> 5 50 2 2 YES__2 B__3
#> 6 50 2 3 B__3 <NA>
#> 7 400 3 1 A__1 NO__2
#> 8 400 3 2 NO__2 C__3
#> 9 400 3 3 C__3 <NA>
#> 10 500 4 1 A__1 NO__2
#> 11 500 4 2 NO__2 D__3
#> 12 500 4 3 D__3 <NA>
df %>%
as_tibble() %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(-row, -VALUE),
names_to = 'column', values_to = 'source') %>%
mutate(column = match(column, names(df))) %>%
mutate(source = paste0(source, '__', column)) %>%
group_by(row) %>%
mutate(target = lead(source, order_by = column)) %>%
drop_na(target, source)
#> # A tibble: 8 x 5
#> # Groups: row [4]
#> VALUE row column source target
#> <dbl> <int> <int> <chr> <chr>
#> 1 100 1 1 A__1 YES__2
#> 2 100 1 2 YES__2 A__3
#> 3 50 2 1 A__1 YES__2
#> 4 50 2 2 YES__2 B__3
#> 5 400 3 1 A__1 NO__2
#> 6 400 3 2 NO__2 C__3
#> 7 500 4 1 A__1 NO__2
#> 8 500 4 2 NO__2 D__3
df %>%
as_tibble() %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(-row, -VALUE),
names_to = 'column', values_to = 'source') %>%
mutate(column = match(column, names(df))) %>%
mutate(source = paste0(source, '__', column)) %>%
group_by(row) %>%
mutate(target = lead(source, order_by = column)) %>%
drop_na(target, source) %>%
group_by(source, target) %>%
summarise(value = sum(VALUE), .groups = 'drop')
#> # A tibble: 6 x 3
#> source target value
#> <chr> <chr> <dbl>
#> 1 A__1 NO__2 900
#> 2 A__1 YES__2 150
#> 3 NO__2 C__3 400
#> 4 NO__2 D__3 500
#> 5 YES__2 A__3 100
#> 6 YES__2 B__3 50