使用 ggplot2 绘制 TraMineR 序列图
TraMineR sequence plot with ggplot2
我是 TraMineR 包的新手,想使用 ggplot 创建状态分布图。下面的图是用 TraMineR 包创建的,但我该如何提取数据并用 ggplot 绘制它?另外,我还想更改坐标轴和颜色,该怎么做?
示例代码:
# Close the current graphics device only if one is open; calling dev.off()
# while only the null device (device 1) is active raises an error.
if (dev.cur() > 1L) {
  dev.off()
}
# State distribution plot of the first ten sequences of df_new.seq.
seqdplot(df_new.seq[1:10, ], border = 0,
         axes = TRUE, yaxis = TRUE, xaxis = TRUE, ylab = "",
         cex.legend = 0.5, ncol = 6, legend.prop = 0.11)
示例数据:
# Sample data from the question in structure(...) form: a state sequence
# object (classes "stslist" and "data.frame") with three rows and ten
# columns named `04:00` through `05:30`. Every cell stores factor code 19,
# which is the level "OT"; "OT" is the 19th `alphabet` entry and "Other" the
# 19th `labels` entry. The attributes also hold the colour palette (`cpal`)
# and the `void` ("%") and `nr` ("*") markers.
# Note: the value is not assigned to a name here.
structure(list(`04:00` = structure(c(19L, 19L, 19L), .Label = c("PC",
"SL", "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`04:10` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`04:20` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`04:30` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`04:40` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`04:50` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`05:00` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`05:10` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`05:20` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`05:30` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor")), row.names = c(NA,
3L), start = 1, missing = NA, void = "%", nr = "*", alphabet = c("PC",
"SL", "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT"), class = c("stslist",
"data.frame"), labels = c("Personal care", "Sleep", "Eating",
"Work", "Study", "Dishwash", "Food preparation", "Household upkeep",
"Laundry", "Ironing", "Housework", "Childcare", "Care for adults",
"Leisure", "Computing", "TV", "Radio and music", "Travel", "Other"
), cpal = c("#FFB3B5", "#F8B8A2", "#EDBE91", "#DDC485", "#CBCA82",
"#B5D087", "#9DD594", "#84D8A6", "#6ED9B9", "#61D9CD", "#66D7DF",
"#7AD3ED", "#96CCF8", "#B3C5FD", "#CEBDFD", "#E3B6F7", "#F3B1EC",
"#FDAFDC", "#FFB0CA"), missing.color = "darkgrey", xtstep = 19, tick.last = FALSE, Version = "2.2-2")
seqplot(seqdplot 是其 type="d" 时的别名)的联机帮助页面指出:
A State distribution plot (type="d") represents the sequence of the
cross-sectional state frequencies by position (time point) computed by
the seqstatd function and rendered with the plot.stslist.statd method.
Such plots are also known as chronograms.
所以,seqdplot 所使用的数据可以通过函数 seqstatd 得到。实际上,这些分布保存在其返回结果的 Frequencies 元素中。
您的示例数据仅包含三个长度为 10 的序列,每个序列都只有一个处于状态 'OT' 的时段(spell)。我把它存储在 s.spl 中:
# Display the three sample sequences held in s.spl
s.spl
# Sequence
# 1 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT
# 2 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT
# 3 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT
位置分布为
# Cross-sectional state frequencies at each position of the sample sequences
sd <- seqstatd(s.spl)
# Show the table of state frequencies by time point
sd[["Frequencies"]]
# 04:00 04:10 04:20 04:30 04:40 04:50 05:00 05:10 05:20 05:30
# PC 0 0 0 0 0 0 0 0 0 0
# SL 0 0 0 0 0 0 0 0 0 0
# EA 0 0 0 0 0 0 0 0 0 0
# WR 0 0 0 0 0 0 0 0 0 0
# ST 0 0 0 0 0 0 0 0 0 0
# DI 0 0 0 0 0 0 0 0 0 0
# FP 0 0 0 0 0 0 0 0 0 0
# FO 0 0 0 0 0 0 0 0 0 0
# LA 0 0 0 0 0 0 0 0 0 0
# IR 0 0 0 0 0 0 0 0 0 0
# HO 0 0 0 0 0 0 0 0 0 0
# CH 0 0 0 0 0 0 0 0 0 0
# CA 0 0 0 0 0 0 0 0 0 0
# LE 0 0 0 0 0 0 0 0 0 0
# CO 0 0 0 0 0 0 0 0 0 0
# TV 0 0 0 0 0 0 0 0 0 0
# RA 0 0 0 0 0 0 0 0 0 0
# TR 0 0 0 0 0 0 0 0 0 0
# OT 1 1 1 1 1 1 1 1 1 1
如果想用 ggplot 重写 TraMineR 的绘图工具,祝你好运。
正如 @Gilbert 所指出的,您的示例数据并不是很有用。因此,我使用 TraMineR 包自带的示例数据来作答。
那么让我们开始设置数据
library(TraMineR)

# Load the biofam example data
data(biofam)

# Keep a reproducible random subset of 300 rows
set.seed(10)
biofam <- biofam[sample(nrow(biofam), size = 300), ]

# Long labels for the eight states
biofam.lab <- c(
  "Parent", "Left", "Married", "Left+Marr",
  "Child", "Left+Child", "Left+Marr+Child", "Divorced"
)

# Build the state sequence object from columns 10 to 25
biofam.seq <- seqdef(biofam, var = 10:25, labels = biofam.lab)
下面进入正题。对于 {TraMineR}
的 seqplot
,这是一项相当容易的工作。
# Convenient out-of-the-box TraMineR plot: state distribution plot
# (chronogram) of the biofam sequence object
seqdplot(biofam.seq)
如果您想改用 {ggplot2},则必须重塑数据。正如 @Gilbert 所指出的,您首先必须从 seqstatd 的结果中提取状态分布。第二步,您必须将所得数据重塑为 tidy(长)格式,例如使用 pivot_longer。
# as_tibble(), %>% and pivot_longer() are tidyverse functions, so the
# tidyverse has to be attached before this step
library(tidyverse)

# Extract the state distributions and reshape them to one row per
# state/time combination
dplotdata <- seqstatd(biofam.seq)$Frequencies %>%
  as_tibble(rownames = "state") %>%
  pivot_longer(
    cols = -state,
    names_to = "time",
    # Drop the leading "a" of the column names and store time as integer
    names_prefix = "a",
    names_transform = list(time = as.integer)
  )
dplotdata
# # A tibble: 128 x 3
# state time value
# <chr> <int> <dbl>
# 1 0 15 0.987
# 2 0 16 0.95
# 3 0 17 0.937
# 4 0 18 0.897
# 5 0 19 0.83
# 6 0 20 0.74
# 7 0 21 0.63
# 8 0 22 0.54
# 9 0 23 0.45
# 10 0 24 0.373
# # ... with 118 more rows
完成此操作后,您可以使用 {ggplot2}
重新创建 seqdplot
的输出。请注意,我必须颠倒填充变量(状态)的顺序才能获得与 seqdplot
相同的输出。如果堆叠顺序与您无关,可以缩短代码。
library(tidyverse)
library(glue)

# Colour palette and state alphabet stored as attributes of the sequence object
cpal <- attr(biofam.seq, "cpal", exact = TRUE)
state_levels <- attr(biofam.seq, "alphabet", exact = TRUE)

# Stacked bar chart of the state distribution by time point.
# The fill variable is a factor whose levels are the alphabet in reverse
# order, so the stacking matches seqdplot(). Reversing the levels (rather
# than calling rev() on the data column) keeps every frequency attached to
# its own state even when the rows are not sorted by state or the states
# have unequal row counts.
ggplot(
  dplotdata,
  aes(x = time, y = value, fill = factor(state, levels = rev(state_levels)))
) +
  geom_col(width = 1, colour = "black") +
  # Palette and labels are reversed to line up with the reversed levels
  scale_fill_manual(values = rev(cpal), labels = rev(biofam.lab)) +
  scale_y_continuous(expand = expansion(add = c(0.01, 0))) +
  scale_x_continuous(expand = expansion(add = 0.15)) +
  labs(x = "", y = glue("Rel. Freq. (n={nrow(biofam.seq)})")) +
  # Show the legend keys in alphabet order again
  guides(fill = guide_legend(reverse = TRUE)) +
  theme_minimal() +
  theme(legend.position = "bottom", legend.title = element_blank())
如您所见,ggplot2 方案需要一些额外的工作;如果您对 {ggplot2} 比对 R 的 {base} 绘图更熟悉,这些投入可能是值得的。不过,鉴于 seqplot 的便利性,我建议在转向 {ggplot2} 之前,总是先从这些图开始。
我是 TraMineR 包的新手,想使用 ggplot 创建状态分布图。下面的图是用 TraMineR 包创建的,但我该如何提取数据并用 ggplot 绘制它?另外,我还想更改坐标轴和颜色,该怎么做?
示例代码:
# Close the current graphics device only if one is open; calling dev.off()
# while only the null device (device 1) is active raises an error.
if (dev.cur() > 1L) {
  dev.off()
}
# State distribution plot of the first ten sequences of df_new.seq.
seqdplot(df_new.seq[1:10, ], border = 0,
         axes = TRUE, yaxis = TRUE, xaxis = TRUE, ylab = "",
         cex.legend = 0.5, ncol = 6, legend.prop = 0.11)
示例数据:
# Sample data from the question in structure(...) form: a state sequence
# object (classes "stslist" and "data.frame") with three rows and ten
# columns named `04:00` through `05:30`. Every cell stores factor code 19,
# which is the level "OT"; "OT" is the 19th `alphabet` entry and "Other" the
# 19th `labels` entry. The attributes also hold the colour palette (`cpal`)
# and the `void` ("%") and `nr` ("*") markers.
# Note: the value is not assigned to a name here.
structure(list(`04:00` = structure(c(19L, 19L, 19L), .Label = c("PC",
"SL", "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`04:10` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`04:20` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`04:30` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`04:40` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`04:50` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`05:00` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`05:10` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`05:20` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"),
`05:30` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL",
"EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor")), row.names = c(NA,
3L), start = 1, missing = NA, void = "%", nr = "*", alphabet = c("PC",
"SL", "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH",
"CA", "LE", "CO", "TV", "RA", "TR", "OT"), class = c("stslist",
"data.frame"), labels = c("Personal care", "Sleep", "Eating",
"Work", "Study", "Dishwash", "Food preparation", "Household upkeep",
"Laundry", "Ironing", "Housework", "Childcare", "Care for adults",
"Leisure", "Computing", "TV", "Radio and music", "Travel", "Other"
), cpal = c("#FFB3B5", "#F8B8A2", "#EDBE91", "#DDC485", "#CBCA82",
"#B5D087", "#9DD594", "#84D8A6", "#6ED9B9", "#61D9CD", "#66D7DF",
"#7AD3ED", "#96CCF8", "#B3C5FD", "#CEBDFD", "#E3B6F7", "#F3B1EC",
"#FDAFDC", "#FFB0CA"), missing.color = "darkgrey", xtstep = 19, tick.last = FALSE, Version = "2.2-2")
seqplot(seqdplot 是其 type="d" 时的别名)的联机帮助页面指出:
A State distribution plot (type="d") represents the sequence of the cross-sectional state frequencies by position (time point) computed by the seqstatd function and rendered with the plot.stslist.statd method. Such plots are also known as chronograms.
所以,seqdplot 所使用的数据可以通过函数 seqstatd 得到。实际上,这些分布保存在其返回结果的 Frequencies 元素中。
您的示例数据仅包含三个长度为 10 的序列,每个序列都只有一个处于状态 'OT' 的时段(spell)。我把它存储在 s.spl 中:
# Display the three sample sequences held in s.spl
s.spl
# Sequence
# 1 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT
# 2 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT
# 3 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT
位置分布为
# Cross-sectional state frequencies at each position of the sample sequences
sd <- seqstatd(s.spl)
# Show the table of state frequencies by time point
sd[["Frequencies"]]
# 04:00 04:10 04:20 04:30 04:40 04:50 05:00 05:10 05:20 05:30
# PC 0 0 0 0 0 0 0 0 0 0
# SL 0 0 0 0 0 0 0 0 0 0
# EA 0 0 0 0 0 0 0 0 0 0
# WR 0 0 0 0 0 0 0 0 0 0
# ST 0 0 0 0 0 0 0 0 0 0
# DI 0 0 0 0 0 0 0 0 0 0
# FP 0 0 0 0 0 0 0 0 0 0
# FO 0 0 0 0 0 0 0 0 0 0
# LA 0 0 0 0 0 0 0 0 0 0
# IR 0 0 0 0 0 0 0 0 0 0
# HO 0 0 0 0 0 0 0 0 0 0
# CH 0 0 0 0 0 0 0 0 0 0
# CA 0 0 0 0 0 0 0 0 0 0
# LE 0 0 0 0 0 0 0 0 0 0
# CO 0 0 0 0 0 0 0 0 0 0
# TV 0 0 0 0 0 0 0 0 0 0
# RA 0 0 0 0 0 0 0 0 0 0
# TR 0 0 0 0 0 0 0 0 0 0
# OT 1 1 1 1 1 1 1 1 1 1
如果想用 ggplot 重写 TraMineR 的绘图工具,祝你好运。
正如 @Gilbert 所指出的,您的示例数据并不是很有用。因此,我使用 TraMineR 包自带的示例数据来作答。
那么让我们开始设置数据
library(TraMineR)

# Load the biofam example data
data(biofam)

# Keep a reproducible random subset of 300 rows
set.seed(10)
biofam <- biofam[sample(nrow(biofam), size = 300), ]

# Long labels for the eight states
biofam.lab <- c(
  "Parent", "Left", "Married", "Left+Marr",
  "Child", "Left+Child", "Left+Marr+Child", "Divorced"
)

# Build the state sequence object from columns 10 to 25
biofam.seq <- seqdef(biofam, var = 10:25, labels = biofam.lab)
下面进入正题。对于 {TraMineR}
的 seqplot
,这是一项相当容易的工作。
# Convenient out-of-the-box TraMineR plot: state distribution plot
# (chronogram) of the biofam sequence object
seqdplot(biofam.seq)
如果您想改用 {ggplot2},则必须重塑数据。正如 @Gilbert 所指出的,您首先必须从 seqstatd 的结果中提取状态分布。第二步,您必须将所得数据重塑为 tidy(长)格式,例如使用 pivot_longer。
# as_tibble(), %>% and pivot_longer() are tidyverse functions, so the
# tidyverse has to be attached before this step
library(tidyverse)

# Extract the state distributions and reshape them to one row per
# state/time combination
dplotdata <- seqstatd(biofam.seq)$Frequencies %>%
  as_tibble(rownames = "state") %>%
  pivot_longer(
    cols = -state,
    names_to = "time",
    # Drop the leading "a" of the column names and store time as integer
    names_prefix = "a",
    names_transform = list(time = as.integer)
  )
dplotdata
# # A tibble: 128 x 3
# state time value
# <chr> <int> <dbl>
# 1 0 15 0.987
# 2 0 16 0.95
# 3 0 17 0.937
# 4 0 18 0.897
# 5 0 19 0.83
# 6 0 20 0.74
# 7 0 21 0.63
# 8 0 22 0.54
# 9 0 23 0.45
# 10 0 24 0.373
# # ... with 118 more rows
完成此操作后,您可以使用 {ggplot2}
重新创建 seqdplot
的输出。请注意,我必须颠倒填充变量(状态)的顺序才能获得与 seqdplot
相同的输出。如果堆叠顺序与您无关,可以缩短代码。
library(tidyverse)
library(glue)

# Colour palette and state alphabet stored as attributes of the sequence object
cpal <- attr(biofam.seq, "cpal", exact = TRUE)
state_levels <- attr(biofam.seq, "alphabet", exact = TRUE)

# Stacked bar chart of the state distribution by time point.
# The fill variable is a factor whose levels are the alphabet in reverse
# order, so the stacking matches seqdplot(). Reversing the levels (rather
# than calling rev() on the data column) keeps every frequency attached to
# its own state even when the rows are not sorted by state or the states
# have unequal row counts.
ggplot(
  dplotdata,
  aes(x = time, y = value, fill = factor(state, levels = rev(state_levels)))
) +
  geom_col(width = 1, colour = "black") +
  # Palette and labels are reversed to line up with the reversed levels
  scale_fill_manual(values = rev(cpal), labels = rev(biofam.lab)) +
  scale_y_continuous(expand = expansion(add = c(0.01, 0))) +
  scale_x_continuous(expand = expansion(add = 0.15)) +
  labs(x = "", y = glue("Rel. Freq. (n={nrow(biofam.seq)})")) +
  # Show the legend keys in alphabet order again
  guides(fill = guide_legend(reverse = TRUE)) +
  theme_minimal() +
  theme(legend.position = "bottom", legend.title = element_blank())
如您所见,ggplot2 方案需要一些额外的工作;如果您对 {ggplot2}
比对 R 的 {base} 绘图
更熟悉,这些投入可能是值得的。鉴于 seqplot
的便利性,我建议在转向 {ggplot2}
.