字母序列的转换频率

Transition frequencies for a sequence of letters

我想将转换概率构建为变量并附加行。例如在我的例子中,我有

A<- c('A-B-C-D', 'A-B-C-A', 'A-B-A-B')
B<- c('project1', 'project2', 'project3')
df<- data.frame(A, B)

我想为每一行创建过渡频率,例如,项目 1 的 A-B 过渡将是

 df$A-B   df$A-A   
   1       0 

有什么建议吗?

下面的代码不够优雅,但它可以工作,至少在这个规模上是这样。

A<- c('A-B-C-D', 'A-B-C-A', 'A-B-A-B')
B<- c('project1', 'project2', 'project3')
df<- data.frame(A, B)
df1 <- df %>% mutate(A = str_split(A, "-")) %>% unnest(A) %>% 
  mutate(A = factor(A, levels=c("A", "B", "C", "D")))
df1 <- df1 %>% 
  group_by(B) %>% 
  mutate(lagA = lag(A), 
         obs = 1) %>% 
  na.omit

all_combs <- expand.grid(B=unique(df1$B), A = sort(unique(df1$A)), 
                         lagA = sort(unique(df1$A)))

df2 <- full_join(all_combs, df1) %>% 
  mutate(obs = ifelse(is.na(obs), 0, 1), 
         pair = paste(lagA, A, sep="-")) %>%
  select(-c(lagA, A)) %>% 
  group_by(B, pair) %>% 
  summarise(obs = sum(obs)) %>% 
  ungroup %>% 
  group_by(B) %>% 
  mutate(prob = obs/sum(obs)) %>% 
  select(-obs)

df2w <- df2 %>%
  pivot_wider(names_from="pair", values_from="prob")

df <- left_join(df, df2w)

df
#         A        B A-A       A-B A-C A-D       B-A B-B       B-C B-D       C-A C-B C-C       C-D
# 1 A-B-C-D project1   0 0.3333333   0   0 0.0000000   0 0.3333333   0 0.0000000   0   0 0.3333333
# 2 A-B-C-A project2   0 0.3333333   0   0 0.0000000   0 0.3333333   0 0.3333333   0   0 0.0000000
# 3 A-B-A-B project3   0 0.6666667   0   0 0.3333333   0 0.0000000   0 0.0000000   0   0 0.0000000
#   D-A D-B D-C D-D
# 1   0   0   0   0
# 2   0   0   0   0
# 3   0   0   0   0

使用 data.table 数据越大应该会更快:

library(data.table)

dt <- data.table(A = c('A-B-C-D', 'A-B-C-A', 'A-B-A-B'),
                 B = c('project1', 'project2', 'project3'))

dt <- dt[, strsplit(A, "-", fixed = TRUE), by = .(A, B)
         ][, .(pattern = head(paste(V1, shift(V1, -1), sep = "-"), -1)), by = .(A, B)
           ][, .(patternCnt = stringr::str_count(A, pattern)), by = .(A, B, pattern)
             ][, dcast(.SD, A + B ~ pattern, fun.aggregate = sum, value.var = "patternCnt") ]

dt
#          A        B A-B B-A B-C C-A C-D
# 1: A-B-A-B project3   2   1   0   0   0
# 2: A-B-C-A project2   1   0   1   1   0
# 3: A-B-C-D project1   1   0   1   0   1

因为你有 87K 个字母,也许可以跳过最后的 dcast 步骤,这样输出将采用更易于管理的长格式:

#          A        B pattern patternCnt
# 1: A-B-C-D project1     A-B          1
# 2: A-B-C-D project1     B-C          1
# 3: A-B-C-D project1     C-D          1
# 4: A-B-C-A project2     A-B          1
# 5: A-B-C-A project2     B-C          1
# 6: A-B-C-A project2     C-A          1
# 7: A-B-A-B project3     A-B          2
# 8: A-B-A-B project3     B-A          1

(下面是一个慢速的原始答案,OP在评论中提到了字符串可以有87K个字母。)

循环并计算字符串匹配数:

#pairs
x <- c("A-B", "B-C")

cbind(df, t(sapply(df$A, function(i){
  sapply(x, function(j){
    stringr::str_count(i, j)
  })
})))

#               A        B A-B B-C
# A-B-C-D A-B-C-D project1   1   1
# A-B-C-A A-B-C-A project2   1   1
# A-B-A-B A-B-A-B project3   2   0

使用 dplyrtidyrpurrr 的方法 - 使用 gtools

的排列生成
library(dplyr)
library(tidyr)
library(gtools)
library(purrr)

# Generate all possible permutations
possible_transition <- 
  permutations(n = 4, r = 2, v = possible_stage, repeats.allowed = TRUE) %>%
  as_tibble() %>%
  mutate(transition = paste0(V1, "-", V2))

# Take only the transition from the generated permuation
transition_vector <- possible_transition[["transition"]]
# Walk through each row of original df and
# count all the transition available for each project
pmap_dfr(df, .f = function(...) {
  current <- tibble(...)
  transition_count <- map_dfr(transition_vector, .f = function(x) {
    tibble(transition = x, count = str_count(current$A, x))
  })
  transition_count %>%
    mutate(project = current[["B"]]) %>%
    mutate(value = current[["A"]])
}) %>% filter(count > 0)

输出

# A tibble: 8 x 4
  transition count project  value  
  <chr>      <int> <chr>    <chr>  
1 A-B            1 project1 A-B-C-D
2 B-C            1 project1 A-B-C-D
3 C-D            1 project1 A-B-C-D
4 A-B            1 project2 A-B-C-A
5 B-C            1 project2 A-B-C-A
6 C-A            1 project2 A-B-C-A
7 A-B            2 project3 A-B-A-B
8 B-A            1 project3 A-B-A-B

基本 R 选项

cbind(df,
      do.call(rbind,
              lapply(df$A,
                     function(x)
                       sapply(do.call(outer, c(
                         rep(list(unique(unlist(
                           strsplit(df$A, "-")
                         ))), 2),
                         FUN = function(...)
                           paste(..., sep = "-")
                       )),
                       function(y)
                         lengths(regmatches(x, gregexpr(
                           y, x
                         )))))))

给予

        A        B A-A B-A C-A D-A A-B B-B C-B D-B A-C B-C C-C D-C A-D B-D C-D D-D
1 A-B-C-D project1   0   0   0   0   1   0   0   0   0   1   0   0   0   0   1   0
2 A-B-C-A project2   0   0   1   0   1   0   0   0   0   1   0   0   0   0   0   0
3 A-B-A-B project3   0   1   0   0   2   0   0   0   0   0   0   0   0   0   0   0