如何将会话路径数据折叠成从-到路径以可视化网络数据?

How to collapse session path data into from-to paths for visualizing network data?

有哪些转换会话路径数据的方法,例如:

df
#   Session Link1 Link2 Link3 Link4 Link5
# 1       1     A     B                  
# 2       2     C                        
# 3       3     D     A     B            
# 4       4     C     F     G     H     J
# 5       5     A     B     C            

转换成如下所示的数据集:

desired
#    Session From   To
# 1        1    A    B
# 2        2    C <NA>
# 3        3    D    A
# 4        3    A    B
# 5        4    C    F
# 6        4    F    G
# 7        4    G    H
# 8        4    H    J
# 9        5    A    B
# 10       5    B    C

可复现的示例数据:

# Wide input: one row per session, up to five visited links in order.
# An empty string means the session had no link in that position.
df <- data.frame(
  Session = 1:5,
  Link1 = factor(c("A", "C", "D", "C", "A"), levels = c("A", "C", "D")),
  Link2 = factor(c("B", "", "A", "F", "B"), levels = c("", "A", "B", "F")),
  Link3 = factor(c("", "", "B", "G", "C"), levels = c("", "B", "C", "G")),
  Link4 = factor(c("", "", "", "H", ""), levels = c("", "H")),
  Link5 = factor(c("", "", "", "J", ""), levels = c("", "J"))
)

# Expected long output: one row per consecutive (From, To) pair in a session.
# A session with a single link keeps one row whose To is NA.
desired <- data.frame(
  Session = rep(1:5, times = c(1L, 1L, 2L, 4L, 2L)),
  From = factor(
    c("A", "C", "D", "A", "C", "F", "G", "H", "A", "B"),
    levels = c("A", "B", "C", "D", "F", "G", "H")
  ),
  To = factor(
    c("B", NA, "A", "B", "F", "G", "H", "J", "B", "C"),
    levels = c("A", "B", "C", "F", "G", "H", "J")
  )
)

我们可以使用 data.table。先用 setDT(df) 将 'data.frame' 转换为 'data.table',再用 melt 并把 'Session' 指定为 id 列,将 'wide' 格式重塑为 'long' 格式。去掉 'value' 为空的行 [value!='']。按 'Session' 分组,对只有一行的 'Session'(if...else)在 'value' 后补一个 'NA';最后仍按 'Session' 分组,分别去掉 'V1' 的最后一个和第一个元素,得到 'From' 和 'To' 两列。
 library(data.table)#v1.9.5+
 # Reshape wide -> long, drop the empty links, then pair every link with its
 # successor inside each session.
 # NOTE: setDT() converts `df` to a data.table by reference.
 # `id.vars` is spelled out in full instead of relying on partial matching.
 melt(setDT(df), id.vars = "Session")[value != ""][,
   # Pad single-link sessions with NA so they still yield one From/To row.
   if (.N == 1L) c(value, NA) else value, by = Session][,
     # From drops the last element of the group, To drops the first.
     list(From = V1[-.N], To = V1[-1L]), by = Session]
 #   Session From To
 #1:       1    A  B
 #2:       2    C NA
 #3:       3    D  A
 #4:       3    A  B
 #5:       4    C  F
 #6:       4    F  G
 #7:       4    G  H
 #8:       4    H  J
 #9:       5    A  B
 #10:      5    B  C

上面的代码可以在 melt 步骤之后简化为一个代码块。注意 tmp[-.N] 在这里行不通:对只有一行的会话,tmp 被补上了 NA,长度为 2 而 .N 为 1,tmp[-.N] 会删掉第一个元素而不是最后一个。所以我用了 tmp[1:(.N-1)]。

# Single-pass variant: pad and pair inside one grouped `j` expression.
# NOTE: setDT() converts `df` to a data.table by reference.
# `id.vars` is spelled out in full instead of relying on partial matching.
melt(setDT(df), id.vars = "Session")[value != "", {
  # Pad single-link sessions with NA so they still yield one From/To row.
  tmp <- if (.N == 1L) c(value, NA) else value
  # `tmp` is one element longer than .N in the padded case, so its last
  # element is dropped by position in `tmp`, not via .N. head(tmp, -1L)
  # replaces the former tmp[1:(.N - 1)], which evaluated to tmp[c(1, 0)] for
  # single-link sessions and only worked because R ignores a zero index.
  list(From = head(tmp, -1L), To = tmp[-1L])
}, by = Session]
#    Session From To
#1:       1    A  B
#2:       2    C NA
#3:       3    D  A
#4:       3    A  B
#5:       4    C  F
#6:       4    F  G
#7:       4    G  H
#8:       4    H  J
#9:       5    A  B
#10:      5    B  C

受 @akrun 的启发,这是我自己对这个问题的尝试。需要说明的是,结果有所调整:还包含了每个会话末尾的从-到路径(此时 to 为空):

library(dplyr)
library(tidyr)

# Stack the Link columns into long form, then, per session, pair each link
# (`to`) with the one before it (`from`).
df %>%
  gather(key = "Link_Num", value = "Value", -Session) %>%
  group_by(Session) %>%
  mutate(
    to = Value,
    from = lag(Value)
  ) %>%
  # Link1 rows have no predecessor; rows whose predecessor is "" lie past the
  # end of the session's path.
  filter(Link_Num != "Link1", from != "") %>%
  select(Session, from, to, Link_Num) %>%
  arrange(Session)

产生:

   Session from to Link_Num
1        1    A  B    Link2
2        1    B       Link3
3        2    C       Link2
4        3    D  A    Link2
5        3    A  B    Link3
6        3    B       Link4
7        4    C  F    Link2
8        4    F  G    Link3
9        4    G  H    Link4
10       4    H  J    Link5
11       5    A  B    Link2
12       5    B  C    Link3
13       5    C       Link4

另一种方法,使用 `melt`(由 reshape2 或 data.table 提供,并非 dplyr 函数)和 dplyr 的 `lead`:

library(dplyr)
# Pair each link with the next one using an ungrouped lead().
# A trailing blank "spacer" column makes every session's run of rows end in
# "", so lead() never pairs the last link of one session with the first link
# of the next. The spacer is added inside the pipeline so that the shared
# `df` is not permanently modified.
df %>%
  mutate(spacer = "") %>%
  melt(id.vars = "Session") %>%
  arrange(Session) %>%
  mutate(To = lead(value)) %>%
  # Keep real link -> link pairs, plus the Link1 row of single-link sessions.
  filter(To != "" & value != "" | To == "" & variable == "Link1") %>%
  # replace() keeps the column's type and attributes, unlike ifelse().
  mutate(To = replace(To, To == "", NA)) %>%
  select(-variable)
#    Session value   To
# 1        1     A    B
# 2        2     C <NA>
# 3        3     D    A
# 4        3     A    B
# 5        4     C    F
# 6        4     F    G
# 7        4     G    H
# 8        4     H    J
# 9        5     A    B
# 10       5     B    C