使用 ggplot2 绘制 TraMineR 序列图

Question

我是 TraMineR 包的新手，想使用 ggplot 创建状态分布图。下面的图是用 TraMineR 包创建的，但我该如何提取数据并用 ggplot 绘制它？另外，我还想更改坐标轴和颜色。

示例代码：

# Close a leftover graphics device, if there is one. Calling dev.off() while
# only the null device (device 1) is open raises an error.
if (dev.cur() > 1L) dev.off()

# State distribution plot for rows 1 to 10 of the sequence object.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
seqdplot(df_new.seq[1:10, ], border = 0,
         axes = TRUE, yaxis = TRUE, xaxis = TRUE, ylab = "",
         cex.legend = 0.5, ncol = 6, legend.prop = 0.11)

示例数据：

structure(list(`04:00` = structure(c(19L, 19L, 19L), .Label = c("PC", 
"SL", "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `04:10` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `04:20` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `04:30` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `04:40` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `04:50` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `05:00` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `05:10` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `05:20` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `05:30` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor")), row.names = c(NA, 
3L), start = 1, missing = NA, void = "%", nr = "*", alphabet = c("PC", 
"SL", "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
"CA", "LE", "CO", "TV", "RA", "TR", "OT"), class = c("stslist", 
"data.frame"), labels = c("Personal care", "Sleep", "Eating", 
"Work", "Study", "Dishwash", "Food preparation", "Household upkeep", 
"Laundry", "Ironing", "Housework", "Childcare", "Care for adults", 
"Leisure", "Computing", "TV", "Radio and music", "Travel", "Other"
), cpal = c("#FFB3B5", "#F8B8A2", "#EDBE91", "#DDC485", "#CBCA82", 
"#B5D087", "#9DD594", "#84D8A6", "#6ED9B9", "#61D9CD", "#66D7DF", 
"#7AD3ED", "#96CCF8", "#B3C5FD", "#CEBDFD", "#E3B6F7", "#F3B1EC", 
"#FDAFDC", "#FFB0CA"), missing.color = "darkgrey", xtstep = 19, tick.last = FALSE, Version = "2.2-2")

Answer 1

seqplot（其中seqdplot是type="d"的别名）的联机帮助页面指出

A State distribution plot (type="d") represents the sequence of the cross-sectional state frequencies by position (time point) computed by the seqstatd function and rendered with the plot.stslist.statd method. Such plots are also known as chronograms.

因此，可以用函数 seqstatd 得到 seqdplot 所使用的数据。具体来说，分布存放在返回对象的 Frequencies 元素中。

您的示例数据仅包含三个长度为 10 的序列，每个序列都只有一个处于状态 'OT' 的持续段（spell）。我把它存储在 s.spl 中：

s.spl
#   Sequence                     
# 1 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT
# 2 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT
# 3 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT

位置分布为

# Cross-sectional state distribution at each position of the sequences
sd <- seqstatd(s.spl)
# Frequencies element: one row per state, one column per position
sd$Frequencies
#    04:00 04:10 04:20 04:30 04:40 04:50 05:00 05:10 05:20 05:30
# PC     0     0     0     0     0     0     0     0     0     0
# SL     0     0     0     0     0     0     0     0     0     0
# EA     0     0     0     0     0     0     0     0     0     0
# WR     0     0     0     0     0     0     0     0     0     0
# ST     0     0     0     0     0     0     0     0     0     0
# DI     0     0     0     0     0     0     0     0     0     0
# FP     0     0     0     0     0     0     0     0     0     0
# FO     0     0     0     0     0     0     0     0     0     0
# LA     0     0     0     0     0     0     0     0     0     0
# IR     0     0     0     0     0     0     0     0     0     0
# HO     0     0     0     0     0     0     0     0     0     0
# CH     0     0     0     0     0     0     0     0     0     0
# CA     0     0     0     0     0     0     0     0     0     0
# LE     0     0     0     0     0     0     0     0     0     0
# CO     0     0     0     0     0     0     0     0     0     0
# TV     0     0     0     0     0     0     0     0     0     0
# RA     0     0     0     0     0     0     0     0     0     0
# TR     0     0     0     0     0     0     0     0     0     0
# OT     1     1     1     1     1     1     1     1     1     1

如果想用 ggplot 重写 TraMineR 的绘图功能，祝你好运。

Answer 2

正如 @Gilbert 所指出的，您的示例数据并不是很有用。因此，我在回答中使用了 TraMineR 附带的示例数据。

那么让我们开始设置数据

library(TraMineR)

# Load the biofam example data set
data(biofam)

# Keep a reproducible random sample of 300 cases
set.seed(10)
biofam <- biofam[sample.int(nrow(biofam), 300), ]

# Long labels for the states
biofam.lab <- c(
  "Parent", "Left", "Married", "Left+Marr",
  "Child", "Left+Child", "Left+Marr+Child", "Divorced"
)

# Build the state sequence object from columns 10 to 25
biofam.seq <- seqdef(biofam, var = 10:25, labels = biofam.lab)

下面进入正题。对于 {TraMineR} 的 seqplot，这是一项相当容易的工作。

# State distribution plot straight from TraMineR, using its default settings
seqdplot(biofam.seq)

如果您想改用 {ggplot2}，则必须重塑数据。正如 @Gilbert 所指出的，您首先必须用 seqstatd 提取状态分布。第二步，您必须将结果数据重塑为 tidy（长）格式，例如使用 pivot_longer。

# as_tibble(), %>% and pivot_longer() come from the tidyverse, so it has to be
# attached before this step runs.
library(tidyverse)

# Extract the state distributions (states in rows, positions in columns) and
# reshape them to long format: one row per state/time combination.
# The position columns are named with an "a" prefix; the prefix is dropped and
# the remainder is stored as an integer in `time`.
dplotdata <- as_tibble(seqstatd(biofam.seq)$Frequencies,
                       rownames = "state") %>%
  pivot_longer(cols = -1,
               names_to = "time",
               names_prefix = "a",
               names_transform = list(time = as.integer))

dplotdata

# # A tibble: 128 x 3
#    state  time value
#    <chr> <int> <dbl>
#  1 0        15 0.987
#  2 0        16 0.95 
#  3 0        17 0.937
#  4 0        18 0.897
#  5 0        19 0.83 
#  6 0        20 0.74 
#  7 0        21 0.63 
#  8 0        22 0.54 
#  9 0        23 0.45 
# 10 0        24 0.373
# # ... with 118 more rows

完成此操作后，您可以使用 {ggplot2} 重新创建 seqdplot 的输出。请注意，我必须颠倒填充变量（状态）的顺序才能获得与 seqdplot 相同的输出。如果堆叠顺序与您无关，可以缩短代码。

library(tidyverse)
library(glue)

# Colour palette stored as an attribute of the sequence object
cpal <- attr(biofam.seq, "cpal")

# Reverse the order of the factor LEVELS rather than the data values.
# position_stack() puts the first level on top, so reversing the levels places
# the first state of the alphabet at the bottom of each bar, as seqdplot()
# does. Unlike rev(state), this does not depend on the rows of dplotdata being
# sorted by state with the same number of rows per state.
state_levels <- rev(alphabet(biofam.seq))

# Stacked bars of the relative state frequencies at each time point
ggplot(dplotdata,
       aes(x = time, y = value,
           fill = factor(state, levels = state_levels))) +
  geom_col(width = 1, colour = "black") +
  # colours and labels are reversed to line up with the reversed levels
  scale_fill_manual(values = rev(cpal),
                    labels = rev(biofam.lab)) +
  scale_y_continuous(expand = expansion(add = c(.01, 0))) +
  scale_x_continuous(expand = expansion(add = .15)) +
  labs(x = "", y = glue("Rel. Freq. (n={nrow(biofam.seq)})")) +
  # show the legend in alphabet order again
  guides(fill = guide_legend(reverse = TRUE)) +
  theme_minimal() +
  theme(legend.position = "bottom",
        legend.title = element_blank())

如您所见，ggplot2 方案需要一些额外的工作；如果您对 {ggplot2} 比对 R 的 {base} 绘图更熟悉，这些投入可能是值得的。鉴于 seqplot 的便利性，我建议总是先从这些图开始，然后再转向 {ggplot2}。

使用 ggplot2 的 TramineR 序列图

TramineR sequence plot with ggplot2

r

dataframe

ggplot2

traminer