字母序列的转换频率
Transition frequencies for a sequence of letters
我想将转换概率构建为变量并附加行。例如在我的例子中,我有
A<- c('A-B-C-D', 'A-B-C-A', 'A-B-A-B')
B<- c('project1', 'project2', 'project3')
df<- data.frame(A, B)
我想为每一行创建过渡频率,例如,项目 1 的 A-B 过渡将是
df$A-B df$A-A
1 0
有什么建议吗?
下面的代码不够优雅,但它可以工作,至少在这个规模上是这样。
A<- c('A-B-C-D', 'A-B-C-A', 'A-B-A-B')
B<- c('project1', 'project2', 'project3')
df<- data.frame(A, B)
df1 <- df %>% mutate(A = str_split(A, "-")) %>% unnest(A) %>%
mutate(A = factor(A, levels=c("A", "B", "C", "D")))
df1 <- df1 %>%
group_by(B) %>%
mutate(lagA = lag(A),
obs = 1) %>%
na.omit
all_combs <- expand.grid(B=unique(df1$B), A = sort(unique(df1$A)),
lagA = sort(unique(df1$A)))
df2 <- full_join(all_combs, df1) %>%
mutate(obs = ifelse(is.na(obs), 0, 1),
pair = paste(lagA, A, sep="-")) %>%
select(-c(lagA, A)) %>%
group_by(B, pair) %>%
summarise(obs = sum(obs)) %>%
ungroup %>%
group_by(B) %>%
mutate(prob = obs/sum(obs)) %>%
select(-obs)
df2w <- df2 %>%
pivot_wider(names_from="pair", values_from="prob")
df <- left_join(df, df2w)
df
# A B A-A A-B A-C A-D B-A B-B B-C B-D C-A C-B C-C C-D
# 1 A-B-C-D project1 0 0.3333333 0 0 0.0000000 0 0.3333333 0 0.0000000 0 0 0.3333333
# 2 A-B-C-A project2 0 0.3333333 0 0 0.0000000 0 0.3333333 0 0.3333333 0 0 0.0000000
# 3 A-B-A-B project3 0 0.6666667 0 0 0.3333333 0 0.0000000 0 0.0000000 0 0 0.0000000
# D-A D-B D-C D-D
# 1 0 0 0 0
# 2 0 0 0 0
# 3 0 0 0 0
使用 data.table 数据越大应该会更快:
library(data.table)
dt <- data.table(A = c('A-B-C-D', 'A-B-C-A', 'A-B-A-B'),
B = c('project1', 'project2', 'project3'))
dt <- dt[, strsplit(A, "-", fixed = TRUE), by = .(A, B)
][, .(pattern = head(paste(V1, shift(V1, -1), sep = "-"), -1)), by = .(A, B)
][, .(patternCnt = stringr::str_count(A, pattern)), by = .(A, B, pattern)
][, dcast(.SD, A + B ~ pattern, fun.aggregate = sum, value.var = "patternCnt") ]
dt
# A B A-B B-A B-C C-A C-D
# 1: A-B-A-B project3 2 1 0 0 0
# 2: A-B-C-A project2 1 0 1 1 0
# 3: A-B-C-D project1 1 0 1 0 1
因为你有 87K 个字母,也许可以跳过最后的 dcast 步骤,这样输出将采用更易于管理的长格式:
# A B pattern patternCnt
# 1: A-B-C-D project1 A-B 1
# 2: A-B-C-D project1 B-C 1
# 3: A-B-C-D project1 C-D 1
# 4: A-B-C-A project2 A-B 1
# 5: A-B-C-A project2 B-C 1
# 6: A-B-C-A project2 C-A 1
# 7: A-B-A-B project3 A-B 2
# 8: A-B-A-B project3 B-A 1
(下面是一个慢速的原始答案,OP在评论中提到了字符串可以有87K个字母。)
循环并计算字符串匹配数:
#pairs
x <- c("A-B", "B-C")
cbind(df, t(sapply(df$A, function(i){
sapply(x, function(j){
stringr::str_count(i, j)
})
})))
# A B A-B B-C
# A-B-C-D A-B-C-D project1 1 1
# A-B-C-A A-B-C-A project2 1 1
# A-B-A-B A-B-A-B project3 2 0
使用 dplyr
、tidyr
和 purrr
的方法 - 使用 gtools
的排列生成
library(dplyr)
library(tidyr)
library(gtools)
library(purrr)
# Generate all possible permutations
possible_transition <-
permutations(n = 4, r = 2, v = possible_stage, repeats.allowed = TRUE) %>%
as_tibble() %>%
mutate(transition = paste0(V1, "-", V2))
# Take only the transition from the generated permuation
transition_vector <- possible_transition[["transition"]]
# Walk through each row of original df and
# count all the transition available for each project
pmap_dfr(df, .f = function(...) {
current <- tibble(...)
transition_count <- map_dfr(transition_vector, .f = function(x) {
tibble(transition = x, count = str_count(current$A, x))
})
transition_count %>%
mutate(project = current[["B"]]) %>%
mutate(value = current[["A"]])
}) %>% filter(count > 0)
输出
# A tibble: 8 x 4
transition count project value
<chr> <int> <chr> <chr>
1 A-B 1 project1 A-B-C-D
2 B-C 1 project1 A-B-C-D
3 C-D 1 project1 A-B-C-D
4 A-B 1 project2 A-B-C-A
5 B-C 1 project2 A-B-C-A
6 C-A 1 project2 A-B-C-A
7 A-B 2 project3 A-B-A-B
8 B-A 1 project3 A-B-A-B
基本 R 选项
cbind(df,
do.call(rbind,
lapply(df$A,
function(x)
sapply(do.call(outer, c(
rep(list(unique(unlist(
strsplit(df$A, "-")
))), 2),
FUN = function(...)
paste(..., sep = "-")
)),
function(y)
lengths(regmatches(x, gregexpr(
y, x
)))))))
给予
A B A-A B-A C-A D-A A-B B-B C-B D-B A-C B-C C-C D-C A-D B-D C-D D-D
1 A-B-C-D project1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0
2 A-B-C-A project2 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0
3 A-B-A-B project3 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0
我想将转换概率构建为变量并附加行。例如在我的例子中,我有
A<- c('A-B-C-D', 'A-B-C-A', 'A-B-A-B')
B<- c('project1', 'project2', 'project3')
df<- data.frame(A, B)
我想为每一行创建过渡频率,例如,项目 1 的 A-B 过渡将是
df$A-B df$A-A
1 0
有什么建议吗?
下面的代码不够优雅,但它可以工作,至少在这个规模上是这样。
A<- c('A-B-C-D', 'A-B-C-A', 'A-B-A-B')
B<- c('project1', 'project2', 'project3')
df<- data.frame(A, B)
df1 <- df %>% mutate(A = str_split(A, "-")) %>% unnest(A) %>%
mutate(A = factor(A, levels=c("A", "B", "C", "D")))
df1 <- df1 %>%
group_by(B) %>%
mutate(lagA = lag(A),
obs = 1) %>%
na.omit
all_combs <- expand.grid(B=unique(df1$B), A = sort(unique(df1$A)),
lagA = sort(unique(df1$A)))
df2 <- full_join(all_combs, df1) %>%
mutate(obs = ifelse(is.na(obs), 0, 1),
pair = paste(lagA, A, sep="-")) %>%
select(-c(lagA, A)) %>%
group_by(B, pair) %>%
summarise(obs = sum(obs)) %>%
ungroup %>%
group_by(B) %>%
mutate(prob = obs/sum(obs)) %>%
select(-obs)
df2w <- df2 %>%
pivot_wider(names_from="pair", values_from="prob")
df <- left_join(df, df2w)
df
# A B A-A A-B A-C A-D B-A B-B B-C B-D C-A C-B C-C C-D
# 1 A-B-C-D project1 0 0.3333333 0 0 0.0000000 0 0.3333333 0 0.0000000 0 0 0.3333333
# 2 A-B-C-A project2 0 0.3333333 0 0 0.0000000 0 0.3333333 0 0.3333333 0 0 0.0000000
# 3 A-B-A-B project3 0 0.6666667 0 0 0.3333333 0 0.0000000 0 0.0000000 0 0 0.0000000
# D-A D-B D-C D-D
# 1 0 0 0 0
# 2 0 0 0 0
# 3 0 0 0 0
使用 data.table 数据越大应该会更快:
library(data.table)
dt <- data.table(A = c('A-B-C-D', 'A-B-C-A', 'A-B-A-B'),
B = c('project1', 'project2', 'project3'))
dt <- dt[, strsplit(A, "-", fixed = TRUE), by = .(A, B)
][, .(pattern = head(paste(V1, shift(V1, -1), sep = "-"), -1)), by = .(A, B)
][, .(patternCnt = stringr::str_count(A, pattern)), by = .(A, B, pattern)
][, dcast(.SD, A + B ~ pattern, fun.aggregate = sum, value.var = "patternCnt") ]
dt
# A B A-B B-A B-C C-A C-D
# 1: A-B-A-B project3 2 1 0 0 0
# 2: A-B-C-A project2 1 0 1 1 0
# 3: A-B-C-D project1 1 0 1 0 1
因为你有 87K 个字母,也许可以跳过最后的 dcast 步骤,这样输出将采用更易于管理的长格式:
# A B pattern patternCnt
# 1: A-B-C-D project1 A-B 1
# 2: A-B-C-D project1 B-C 1
# 3: A-B-C-D project1 C-D 1
# 4: A-B-C-A project2 A-B 1
# 5: A-B-C-A project2 B-C 1
# 6: A-B-C-A project2 C-A 1
# 7: A-B-A-B project3 A-B 2
# 8: A-B-A-B project3 B-A 1
(下面是一个慢速的原始答案,OP在评论中提到了字符串可以有87K个字母。)
循环并计算字符串匹配数:
#pairs
x <- c("A-B", "B-C")
cbind(df, t(sapply(df$A, function(i){
sapply(x, function(j){
stringr::str_count(i, j)
})
})))
# A B A-B B-C
# A-B-C-D A-B-C-D project1 1 1
# A-B-C-A A-B-C-A project2 1 1
# A-B-A-B A-B-A-B project3 2 0
使用 dplyr
、tidyr
和 purrr
的方法 - 使用 gtools
library(dplyr)
library(tidyr)
library(gtools)
library(purrr)
# Generate all possible permutations
possible_transition <-
permutations(n = 4, r = 2, v = possible_stage, repeats.allowed = TRUE) %>%
as_tibble() %>%
mutate(transition = paste0(V1, "-", V2))
# Take only the transition from the generated permuation
transition_vector <- possible_transition[["transition"]]
# Walk through each row of original df and
# count all the transition available for each project
pmap_dfr(df, .f = function(...) {
current <- tibble(...)
transition_count <- map_dfr(transition_vector, .f = function(x) {
tibble(transition = x, count = str_count(current$A, x))
})
transition_count %>%
mutate(project = current[["B"]]) %>%
mutate(value = current[["A"]])
}) %>% filter(count > 0)
输出
# A tibble: 8 x 4
transition count project value
<chr> <int> <chr> <chr>
1 A-B 1 project1 A-B-C-D
2 B-C 1 project1 A-B-C-D
3 C-D 1 project1 A-B-C-D
4 A-B 1 project2 A-B-C-A
5 B-C 1 project2 A-B-C-A
6 C-A 1 project2 A-B-C-A
7 A-B 2 project3 A-B-A-B
8 B-A 1 project3 A-B-A-B
基本 R 选项
cbind(df,
do.call(rbind,
lapply(df$A,
function(x)
sapply(do.call(outer, c(
rep(list(unique(unlist(
strsplit(df$A, "-")
))), 2),
FUN = function(...)
paste(..., sep = "-")
)),
function(y)
lengths(regmatches(x, gregexpr(
y, x
)))))))
给予
A B A-A B-A C-A D-A A-B B-B C-B D-B A-C B-C C-C D-C A-D B-D C-D D-D
1 A-B-C-D project1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0
2 A-B-C-A project2 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0
3 A-B-A-B project3 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0