计算 String 中元素的出现次数
Count appearance of elements in String
我得到了以下数据集:
# Example data (apparently dput() output): a data.table with an integer ID
# column and a character chain column; steps in a chain are separated by " > ".
structure(list(ID = c(5L, 6L, 7L, 8L, 10L), chain = c("x49",
"x43", "x32 > x42 > x49 > x45 > x20 > x50 > x38", "x54 > x44",
"x38 > x38")), row.names = c(NA, -5L), class = c("data.table",
"data.frame"))
ID chain
1: 5 x49
2: 6 x43
3: 7 x32 > x42 > x49 > x45 > x20 > x50 > x38
4: 8 x54 > x44
5: 10 x38 > x38
chain 列表示产品的购买过程,但缺少一些信息(start 和 buy)。目标是对链中的每个值进行两次计数(一次作为起点 origin,即 from;一次作为终点 destination,即 to)。为了能够做到这一点,我需要重组数据集。
例如重组链 x54 > x44
应该是这样的:
from to
1 start x54
2 x54 x44
3 x44 buy
整个结果应该是这样的:
from to
1 start x49
2 x49 buy
3 start x43
4 x43 buy
5 start x32
6 x32 x42
7 x42 x49
8 x49 x45
9 x45 x20
10 x20 x50
11 x50 x38
12 x38 buy
13 start x54
14 x54 x44
15 x44 buy
16 start x38
17 x38 x38
18 x38 buy
我已经试过了,但我不确定这是否是个好主意(也不知道如何继续)。
# Split every chain on ">"; the pieces keep their surrounding spaces.
# Note that this overwrites df with the resulting list of character vectors.
df <- strsplit(df$chain, ">")
# Strip leading/trailing whitespace from each piece (the result is not stored).
lapply(df, trimws)
性能可能很重要,因为链可能会变得很长(30 个项目)并且整个数据集有 10 万行。
基础 R 的方法是按 " > " 拆分字符串,然后创建一个组合所有值的数据框。
# Split every chain once, then build both columns with a single data.frame()
# call instead of rbind-ing one small data frame per chain, which becomes
# very slow for ~100k chains.
steps <- strsplit(df$chain, " > ", fixed = TRUE)
data.frame(
  # each chain contributes "start" followed by its steps ...
  from = unlist(lapply(steps, function(x) c("start", x)), use.names = FALSE),
  # ... paired with its steps followed by "buy"
  to = unlist(lapply(steps, function(x) c(x, "buy")), use.names = FALSE),
  stringsAsFactors = FALSE
)
# from to
#1 start x49
#2 x49 buy
#3 start x43
#4 x43 buy
#5 start x32
#6 x32 x42
#7 x42 x49
#8 x49 x45
#9 x45 x20
#10 x20 x50
#11 x50 x38
#12 x38 buy
#13 start x54
#14 x54 x44
#15 x44 buy
#16 start x38
#17 x38 x38
#18 x38 buy
使用类似思路的 tidyverse 方法是:
library(tidyverse)
# Turn each split chain into a small from/to tibble and row-bind them all.
df$chain %>%
  str_split(" > ") %>%
  map_dfr(function(steps) {
    tibble(from = c("start", steps), to = c(steps, "buy"))
  })
我们可以用 str_c 在字符串的开头和结尾分别粘贴 "start" 和 "buy",再用 tidyverse 中的 separate_rows 扩展数据集:
library(tidyverse)
# Wrap every chain in the artificial "start"/"buy" steps, put one step per row,
# then pair each step with the step that follows it within the same ID.
# NOTE: assumes each ID identifies exactly one chain -- confirm for real data.
dt %>%
  mutate(chain = str_c("start > ", chain, " > buy")) %>%
  # split on the literal separator only; the default `sep` regex would also
  # split step names that contain spaces or punctuation
  separate_rows(chain, sep = " > ") %>%
  group_by(ID) %>%
  transmute(from = chain, to = lead(chain)) %>%
  # the last step of each chain ("buy") has no successor, so drop that row
  filter(!is.na(to)) %>%
  ungroup() %>%
  select(-ID)
# A tibble: 18 x 2
# from to
# <chr> <chr>
# 1 start x49
# 2 x49 buy
# 3 start x43
# 4 x43 buy
# 5 start x32
# 6 x32 x42
# 7 x42 x49
# 8 x49 x45
# 9 x45 x20
#10 x20 x50
#11 x50 x38
#12 x38 buy
#13 start x54
#14 x54 x44
#15 x44 buy
#16 start x38
#17 x38 x38
#18 x38 buy
我得到了以下数据集:
# Example data (apparently dput() output): a data.table with an integer ID
# column and a character chain column; steps in a chain are separated by " > ".
structure(list(ID = c(5L, 6L, 7L, 8L, 10L), chain = c("x49",
"x43", "x32 > x42 > x49 > x45 > x20 > x50 > x38", "x54 > x44",
"x38 > x38")), row.names = c(NA, -5L), class = c("data.table",
"data.frame"))
ID chain
1: 5 x49
2: 6 x43
3: 7 x32 > x42 > x49 > x45 > x20 > x50 > x38
4: 8 x54 > x44
5: 10 x38 > x38
chain 列表示产品的购买过程,但缺少一些信息(start 和 buy)。目标是对链中的每个值进行两次计数(一次作为起点 origin,即 from;一次作为终点 destination,即 to)。为了能够做到这一点,我需要重组数据集。
例如重组链 x54 > x44
应该是这样的:
from to
1 start x54
2 x54 x44
3 x44 buy
整个结果应该是这样的:
from to
1 start x49
2 x49 buy
3 start x43
4 x43 buy
5 start x32
6 x32 x42
7 x42 x49
8 x49 x45
9 x45 x20
10 x20 x50
11 x50 x38
12 x38 buy
13 start x54
14 x54 x44
15 x44 buy
16 start x38
17 x38 x38
18 x38 buy
我已经试过了,但我不确定这是否是个好主意(也不知道如何继续)。
# Split every chain on ">"; the pieces keep their surrounding spaces.
# Note that this overwrites df with the resulting list of character vectors.
df <- strsplit(df$chain, ">")
# Strip leading/trailing whitespace from each piece (the result is not stored).
lapply(df, trimws)
性能可能很重要,因为链可能会变得很长(30 个项目)并且整个数据集有 10 万行。
基础 R 的方法是按 " > " 拆分字符串,然后创建一个组合所有值的数据框。
# Split every chain once, then build both columns with a single data.frame()
# call instead of rbind-ing one small data frame per chain, which becomes
# very slow for ~100k chains.
steps <- strsplit(df$chain, " > ", fixed = TRUE)
data.frame(
  # each chain contributes "start" followed by its steps ...
  from = unlist(lapply(steps, function(x) c("start", x)), use.names = FALSE),
  # ... paired with its steps followed by "buy"
  to = unlist(lapply(steps, function(x) c(x, "buy")), use.names = FALSE),
  stringsAsFactors = FALSE
)
# from to
#1 start x49
#2 x49 buy
#3 start x43
#4 x43 buy
#5 start x32
#6 x32 x42
#7 x42 x49
#8 x49 x45
#9 x45 x20
#10 x20 x50
#11 x50 x38
#12 x38 buy
#13 start x54
#14 x54 x44
#15 x44 buy
#16 start x38
#17 x38 x38
#18 x38 buy
使用类似思路的 tidyverse 方法是:
library(tidyverse)
# Turn each split chain into a small from/to tibble and row-bind them all.
df$chain %>%
  str_split(" > ") %>%
  map_dfr(function(steps) {
    tibble(from = c("start", steps), to = c(steps, "buy"))
  })
我们可以用 str_c 在字符串的开头和结尾分别粘贴 "start" 和 "buy",再用 tidyverse 中的 separate_rows 扩展数据集:
library(tidyverse)
# Wrap every chain in the artificial "start"/"buy" steps, put one step per row,
# then pair each step with the step that follows it within the same ID.
# NOTE: assumes each ID identifies exactly one chain -- confirm for real data.
dt %>%
  mutate(chain = str_c("start > ", chain, " > buy")) %>%
  # split on the literal separator only; the default `sep` regex would also
  # split step names that contain spaces or punctuation
  separate_rows(chain, sep = " > ") %>%
  group_by(ID) %>%
  transmute(from = chain, to = lead(chain)) %>%
  # the last step of each chain ("buy") has no successor, so drop that row
  filter(!is.na(to)) %>%
  ungroup() %>%
  select(-ID)
# A tibble: 18 x 2
# from to
# <chr> <chr>
# 1 start x49
# 2 x49 buy
# 3 start x43
# 4 x43 buy
# 5 start x32
# 6 x32 x42
# 7 x42 x49
# 8 x49 x45
# 9 x45 x20
#10 x20 x50
#11 x50 x38
#12 x38 buy
#13 start x54
#14 x54 x44
#15 x44 buy
#16 start x38
#17 x38 x38
#18 x38 buy