使用 ggplot2 绘制 TraMineR 序列图

TraMineR sequence plot with ggplot2

我是 TraMineR 包的新手,想使用 ggplot 创建状态分布图。下面的图是用 TraMineR 包创建的,但我该如何提取数据并用 ggplot 绘制它?另外,我还想更改坐标轴和颜色。

示例代码:

# Close the active graphics device before plotting. The call is skipped when
# only the null device (device 1) is open, because dev.off() raises an error
# in that case.
if (dev.cur() > 1L) {
  dev.off()
}

# State distribution plot of the first 10 sequences.
# TRUE is spelled out instead of T, since T is an ordinary variable that can
# be reassigned.
seqdplot(df_new.seq[1:10, ],
  border = 0,
  axes = TRUE, yaxis = TRUE, xaxis = TRUE, ylab = "",
  cex.legend = 0.5, ncol = 6, legend.prop = 0.11
)

示例数据:

# Sample data: a structure() literal (as produced by dput()) describing a
# TraMineR state sequence object of class "stslist".
# It holds 3 sequences over 10 time points named 04:00 to 05:30 in ten-minute
# steps. Every position is factor level 19, i.e. state "OT" (label "Other").
# The expression is not assigned to a name here.
structure(list(`04:00` = structure(c(19L, 19L, 19L), .Label = c("PC", 
"SL", "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
"CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `04:10` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `04:20` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `04:30` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `04:40` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `04:50` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `05:00` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `05:10` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `05:20` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor"), 
    `05:30` = structure(c(19L, 19L, 19L), .Label = c("PC", "SL", 
    "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
    "CA", "LE", "CO", "TV", "RA", "TR", "OT", "*", "%"), class = "factor")), row.names = c(NA, 
3L), start = 1, missing = NA, void = "%", nr = "*", alphabet = c("PC", 
"SL", "EA", "WR", "ST", "DI", "FP", "FO", "LA", "IR", "HO", "CH", 
"CA", "LE", "CO", "TV", "RA", "TR", "OT"), class = c("stslist", 
"data.frame"), labels = c("Personal care", "Sleep", "Eating", 
"Work", "Study", "Dishwash", "Food preparation", "Household upkeep", 
"Laundry", "Ironing", "Housework", "Childcare", "Care for adults", 
"Leisure", "Computing", "TV", "Radio and music", "Travel", "Other"
), cpal = c("#FFB3B5", "#F8B8A2", "#EDBE91", "#DDC485", "#CBCA82", 
"#B5D087", "#9DD594", "#84D8A6", "#6ED9B9", "#61D9CD", "#66D7DF", 
"#7AD3ED", "#96CCF8", "#B3C5FD", "#CEBDFD", "#E3B6F7", "#F3B1EC", 
"#FDAFDC", "#FFB0CA"), missing.color = "darkgrey", xtstep = 19, tick.last = FALSE, Version = "2.2-2")

seqplot(seqdplot 是其 type="d" 的别名)的联机帮助页面指出:

A State distribution plot (type="d") represents the sequence of the cross-sectional state frequencies by position (time point) computed by the seqstatd function and rendered with the plot.stslist.statd method. Such plots are also known as chronograms.

因此,可以用函数 seqstatd 获得 seqdplot 所绘制的数据。实际上,这些分布存储在返回对象的 Frequencies 元素中。

您的示例数据仅包含三条长度为 10 的序列,每条序列都只有一个处于状态 'OT' 的时段(spell)。我把它存储在 s.spl 中:

# Display the stored example sequences: three rows, each ten "OT" states.
print(s.spl)
#   Sequence                     
# 1 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT
# 2 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT
# 3 OT-OT-OT-OT-OT-OT-OT-OT-OT-OT

位置分布为

# Cross-sectional state distribution of s.spl, one column per time point.
sd <- seqstatd(s.spl)

# The state-by-position table is the "Frequencies" element of the result.
sd[["Frequencies"]]
#    04:00 04:10 04:20 04:30 04:40 04:50 05:00 05:10 05:20 05:30
# PC     0     0     0     0     0     0     0     0     0     0
# SL     0     0     0     0     0     0     0     0     0     0
# EA     0     0     0     0     0     0     0     0     0     0
# WR     0     0     0     0     0     0     0     0     0     0
# ST     0     0     0     0     0     0     0     0     0     0
# DI     0     0     0     0     0     0     0     0     0     0
# FP     0     0     0     0     0     0     0     0     0     0
# FO     0     0     0     0     0     0     0     0     0     0
# LA     0     0     0     0     0     0     0     0     0     0
# IR     0     0     0     0     0     0     0     0     0     0
# HO     0     0     0     0     0     0     0     0     0     0
# CH     0     0     0     0     0     0     0     0     0     0
# CA     0     0     0     0     0     0     0     0     0     0
# LE     0     0     0     0     0     0     0     0     0     0
# CO     0     0     0     0     0     0     0     0     0     0
# TV     0     0     0     0     0     0     0     0     0     0
# RA     0     0     0     0     0     0     0     0     0     0
# TR     0     0     0     0     0     0     0     0     0     0
# OT     1     1     1     1     1     1     1     1     1     1

如果想用 ggplot 重写 TraMineR 的绘图工具,祝你好运。

正如 @Gilbert 所指出的,您的示例数据并不是很有用。因此,我在回答中使用 TraMineR 包自带的示例数据。

那么让我们开始设置数据

library(TraMineR)

# Load the biofam example data set
data(biofam)

# Keep a reproducible random sample of 300 cases
set.seed(10)
biofam <- biofam[sample(nrow(biofam), size = 300), ]

# Labels attached to the states when the sequence object is built
biofam.lab <- c(
  "Parent", "Left", "Married", "Left+Marr",
  "Child", "Left+Child", "Left+Marr+Child", "Divorced"
)

# Build the state sequence object from columns 10 to 25
biofam.seq <- seqdef(biofam, var = 10:25, labels = biofam.lab)

下面进入正题。使用 {TraMineR} 的 seqplot,这是一项相当容易的工作。

# Out-of-the-box TraMineR state distribution plot.
# seqdplot() is the alias of seqplot() with type = "d".
seqplot(biofam.seq, type = "d")

如果您想改用 {ggplot2},则必须重塑数据。正如@Gilbert 所指出的,您首先必须从 seqstatd 中提取状态分布。在第二步中,您必须将结果数据重塑为 tidy 格式,例如通过使用 pivot_longer.

# as_tibble(), pivot_longer() and %>% come from the tidyverse, so it must be
# attached before this step runs.
library(tidyverse)

# Extract and reshape state distributions.
# seqstatd()$Frequencies has one row per state and one column per position;
# it is reshaped to one row per (state, time) pair.
dplotdata <- seqstatd(biofam.seq)$Frequencies %>%
  as_tibble(rownames = "state") %>%
  pivot_longer(
    cols = -state,
    names_to = "time",
    # Drop the "a" prefix of the position names so they parse as integers
    names_prefix = "a",
    names_transform = list(time = as.integer)
  )

dplotdata

# # A tibble: 128 x 3
#    state  time value
#    <chr> <int> <dbl>
#  1 0        15 0.987
#  2 0        16 0.95
#  3 0        17 0.937
#  4 0        18 0.897
#  5 0        19 0.83
#  6 0        20 0.74
#  7 0        21 0.63
#  8 0        22 0.54
#  9 0        23 0.45
# 10 0        24 0.373
# # ... with 118 more rows

完成此操作后,您可以使用 {ggplot2} 重新创建 seqdplot 的输出。请注意,我必须颠倒填充变量(状态)的顺序才能获得与 seqdplot 相同的输出。如果堆叠顺序与您无关,可以缩短代码。

library(tidyverse)
library(glue)

# Colour palette stored on the sequence object (exact attribute name, no
# partial matching)
cpal <- attr(biofam.seq, "cpal", exact = TRUE)

# Reverse the order of the states through explicit factor levels.
# Reversing the column itself (rev(state)) only pairs the right colour with
# the right bar as long as the rows are sorted by state and every state has
# the same number of rows; reversed levels do not depend on row order.
state_levels <- rev(alphabet(biofam.seq))

# Stacked bars of the state distribution at each time point, using the
# palette and labels of the sequence object so the result matches seqdplot.
ggplot(
  dplotdata,
  aes(x = time, y = value, fill = factor(state, levels = state_levels))
) +
  geom_col(width = 1, colour = "black") +
  # Palette and labels are reversed to line up with the reversed levels
  scale_fill_manual(
    values = rev(cpal),
    labels = rev(biofam.lab)
  ) +
  scale_y_continuous(expand = expansion(add = c(.01, 0))) +
  scale_x_continuous(expand = expansion(add = .15)) +
  labs(x = "", y = glue("Rel. Freq. (n={nrow(biofam.seq)})")) +
  # Flip the legend back so it lists the states in their original order
  guides(fill = guide_legend(reverse = TRUE)) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank()
  )

如您所见,ggplot2 解决方案需要一些额外的工作;如果您对 {ggplot2} 比对 R 的 {base} plot 更熟悉,这些投入可能是值得的。鉴于 seqplot 的便利性,我建议总是先从这些图开始,然后再转向 {ggplot2}。