合并具有多个分隔符的列
Merge columns with multiple delimiters
如何合并分隔符数量可变的列,以便获得类似于输出的内容(假设所有内容都是字符)?
dt1
letter
1 a
2 b+c
3 c
4 d+e+f+g
5 a+g+e
dt2
letter number
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
> output
letter number
1 a 1
2 b+c 2+3
3 c 3
4 d+e+f+g 4+5+6+7
5 a+g+e 1+7+5
# Input table: each row holds one or more letters joined by "+".
dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e")
)

# Lookup table mapping every single letter to its number (stored as text).
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# Desired result: the letters of dt1 plus the matching numbers, joined by "+"
# in the same order as the letters.
output <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  number = c("1", "2+3", "3", "4+5+6+7", "1+7+5")
)
library(tidyverse)

dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# Named character vector (names = letters, values = numbers). Built once
# instead of calling deframe(dt2) again for every single letter.
lookup <- deframe(dt2)

dt1 %>%
  as_tibble() %>%
  mutate(
    number = map_chr(
      # fixed("+") splits on a literal plus sign; one character vector per row.
      str_split(letter, fixed("+")),
      # Indexing by name is vectorised and keeps the order of the letters.
      ~ paste0(lookup[.x], collapse = "+")
    )
  )
#> # A tibble: 5 x 2
#> letter number
#> <chr> <chr>
#> 1 a 1
#> 2 b+c 2+3
#> 3 c 3
#> 4 d+e+f+g 4+5+6+7
#> 5 a+g+e 1+7+5
由 reprex 包 (v2.0.1) 于 2021-12-14 创建
一个基础 R 的解决方案如下:
# Split every entry on the literal "+" and look each letter up in dt2.
# match() returns the positions in the order of the split letters, so
# "a+g+e" yields "1+7+5"; a plain `%in%` filter would return the numbers in
# dt2's row order ("1+5+7") and drop repeated letters.
# vapply() guarantees a character vector even when dt1 has zero rows.
dt1$res <- vapply(
  strsplit(dt1$letter, "+", fixed = TRUE),
  function(parts) paste(dt2$number[match(parts, dt2$letter)], collapse = "+"),
  character(1)
)
#    letter     res
#1       a       1
#2     b+c     2+3
#3       c       3
#4 d+e+f+g 4+5+6+7
#5   a+g+e   1+7+5
使用 tidyverse 的解决方案:
library(tidyverse)

output <- dt1 %>%
  # Row id so the split pieces can be put back together afterwards;
  # row_number() is also safe for a zero-row input, unlike 1:n().
  mutate(ID = row_number()) %>%
  # `sep` is a regular expression, so the plus sign must be escaped, and the
  # backslash itself must be doubled inside an R string literal.
  separate_rows(letter, sep = "\\+") %>%
  left_join(dt2, by = "letter") %>%
  group_by(ID) %>%
  # Collapse every non-grouping column (letter and number) back to one string.
  summarize(across(everything(), ~ paste0(.x, collapse = "+"))) %>%
  ungroup() %>%
  select(-ID)
output
# # A tibble: 5 x 2
# letter number
# <chr> <chr>
# 1 a 1
# 2 b+c 2+3
# 3 c 3
# 4 d+e+f+g 4+5+6+7
# 5 a+g+e 1+7+5
dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

library(tidyverse)

dt1 %>%
  as_tibble() %>%
  mutate(
    number = map_chr(
      # str_split() is vectorised, so no rowwise() / temporary column is needed.
      # fixed("+") matches a literal plus sign without any regex escaping.
      str_split(letter, pattern = fixed("+")),
      # match() gives row positions in dt2; index dt2$number with them so the
      # result does not rely on the numbers happening to equal the positions.
      ~ paste0(dt2$number[match(.x, dt2$letter)], collapse = "+")
    )
  )
#> # A tibble: 5 x 2
#> letter number
#> <chr> <chr>
#> 1 a 1
#> 2 b+c 2+3
#> 3 c 3
#> 4 d+e+f+g 4+5+6+7
#> 5 a+g+e 1+7+5
由 reprex 包 (v2.0.1) 于 2021-12-14 创建
无需拆分任何数据,因为您只是想把特定的字母替换为特定的数字。
dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  stringsAsFactors = FALSE
)
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7"),
  stringsAsFactors = FALSE
)

library(dplyr)   # provides %>% and mutate(), which the pipeline below needs
library(stringi)

# With vectorize_all = FALSE every pattern/replacement pair is applied in turn
# to each string. The letters are used as regular expressions here, so this
# relies on them containing no regex metacharacters, and on no replacement
# text itself matching a later pattern.
dt1 %>%
  mutate(
    number = stri_replace_all_regex(
      letter, dt2$letter, dt2$number,
      vectorize_all = FALSE
    )
  )
letter number
1 a 1
2 b+c 2+3
3 c 3
4 d+e+f+g 4+5+6+7
5 a+g+e 1+7+5
另一个解决方案可能更短
library(dplyr)    # %>% and mutate()
library(stringr)  # str_replace_all()

dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  stringsAsFactors = FALSE
)

# Named vector: names are the patterns (letters), values the replacements.
v <- c("1", "2", "3", "4", "5", "6", "7")
names(v) <- c("a", "b", "c", "d", "e", "f", "g")

# str_replace_all() accepts a named vector and applies every
# pattern = replacement pair to each string.
dt1 %>% mutate(number = str_replace_all(letter, v))
一种使用 library(stringi) 的快速且可读的方法:
library(stringi)

# Strings to translate: letters joined by "+".
dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e")
)
# Translation table: one row per letter with its number as text.
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# vectorize_all = FALSE applies every pattern/replacement pair to each element
# of the input instead of pairing them up element by element.
dt1[["number"]] <- stri_replace_all_fixed(
  dt1[["letter"]],
  pattern = dt2[["letter"]],
  replacement = dt2[["number"]],
  vectorize_all = FALSE
)
dt1
结果:
> dt1
letter number
1 a 1
2 b+c 2+3
3 c 3
4 d+e+f+g 4+5+6+7
5 a+g+e 1+7+5
另请参阅相关帖子(原文此处的链接已丢失)。
编辑:当前可用答案的基准:
Unit: microseconds
expr min lq mean median uq max neval
Sotos 2689.6 2689.6 2689.6 2689.6 2689.6 2689.6 1
ismirsehregal 26.4 26.4 26.4 26.4 26.4 26.4 1
www 42247.8 42247.8 42247.8 42247.8 42247.8 42247.8 1
MerijnvanTilborg 1723.5 1723.5 1723.5 1723.5 1723.5 1723.5 1
YuriySaraykin 21859.2 21859.2 21859.2 21859.2 21859.2 21859.2 1
danlooo 4165.7 4165.7 4165.7 4165.7 4165.7 4165.7 1
要重现基准:
library(microbenchmark)
library(tidyverse)
library(stringi)

dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# One named expression per answer; the names are the answer authors.
# The expressions are kept as posted so the timings describe those answers.
# The only changes are "\\+" (a single backslash before "+" is not a valid R
# string escape, so the script would not even parse) and FALSE instead of F.
microbenchmark(
  Sotos = {
    # NOTE: `%in%` yields the numbers in dt2 row order, not in letter order.
    sapply(strsplit(dt1$letter, "+", fixed = TRUE), function(i) {
      paste(dt2$number[dt2$letter %in% i], collapse = "+")
    })
  },
  ismirsehregal = {
    stri_replace_all_fixed(
      dt1$letter,
      pattern = dt2$letter,
      replacement = dt2$number,
      vectorize_all = FALSE
    )
  },
  www = {
    dt1 %>%
      mutate(ID = 1:n()) %>%
      separate_rows(letter, sep = "\\+") %>%
      left_join(dt2, by = "letter") %>%
      group_by(ID) %>%
      summarize(across(.fns = ~ paste0(., collapse = "+"))) %>%
      ungroup() %>%
      select(-ID)
  },
  MerijnvanTilborg = {
    dt1 %>%
      mutate(
        MerijnvanTilborg = stri_replace_all_regex(
          letter, dt2$letter, dt2$number,
          vectorize_all = FALSE
        )
      )
  },
  YuriySaraykin = {
    dt1 %>%
      rowwise() %>%
      mutate(tmp = str_split(letter, pattern = "\\+")) %>%
      ungroup() %>%
      mutate(
        number = map_chr(tmp, ~ paste0(match(.x, dt2$letter), collapse = "+"))
      ) %>%
      select(-tmp)
  },
  danlooo = {
    dt1 %>%
      as_tibble() %>%
      mutate(number = letter %>% map_chr(
        ~ .x %>%
          str_split("[+]") %>%
          simplify() %>%
          map_chr(~ deframe(dt2)[.x]) %>%
          paste0(collapse = "+")
      ))
  },
  # A single evaluation per expression, matching the reported table
  # (neval = 1); raise this for more stable timings.
  times = 1L
)
如何合并分隔符数量可变的列,以便获得类似于输出的内容(假设所有内容都是字符)?
dt1
letter
1 a
2 b+c
3 c
4 d+e+f+g
5 a+g+e
dt2
letter number
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
> output
letter number
1 a 1
2 b+c 2+3
3 c 3
4 d+e+f+g 4+5+6+7
5 a+g+e 1+7+5
# Input table: each row holds one or more letters joined by "+".
dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e")
)

# Lookup table mapping every single letter to its number (stored as text).
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# Desired result: the letters of dt1 plus the matching numbers, joined by "+"
# in the same order as the letters.
output <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  number = c("1", "2+3", "3", "4+5+6+7", "1+7+5")
)
library(tidyverse)

dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# Named character vector (names = letters, values = numbers). Built once
# instead of calling deframe(dt2) again for every single letter.
lookup <- deframe(dt2)

dt1 %>%
  as_tibble() %>%
  mutate(
    number = map_chr(
      # fixed("+") splits on a literal plus sign; one character vector per row.
      str_split(letter, fixed("+")),
      # Indexing by name is vectorised and keeps the order of the letters.
      ~ paste0(lookup[.x], collapse = "+")
    )
  )
#> # A tibble: 5 x 2
#> letter number
#> <chr> <chr>
#> 1 a 1
#> 2 b+c 2+3
#> 3 c 3
#> 4 d+e+f+g 4+5+6+7
#> 5 a+g+e 1+7+5
由 reprex 包 (v2.0.1) 于 2021-12-14 创建
一个基础 R 的解决方案如下:
# Split every entry on the literal "+" and look each letter up in dt2.
# match() returns the positions in the order of the split letters, so
# "a+g+e" yields "1+7+5"; a plain `%in%` filter would return the numbers in
# dt2's row order ("1+5+7") and drop repeated letters.
# vapply() guarantees a character vector even when dt1 has zero rows.
dt1$res <- vapply(
  strsplit(dt1$letter, "+", fixed = TRUE),
  function(parts) paste(dt2$number[match(parts, dt2$letter)], collapse = "+"),
  character(1)
)
#    letter     res
#1       a       1
#2     b+c     2+3
#3       c       3
#4 d+e+f+g 4+5+6+7
#5   a+g+e   1+7+5
使用 tidyverse 的解决方案:
library(tidyverse)

output <- dt1 %>%
  # Row id so the split pieces can be put back together afterwards;
  # row_number() is also safe for a zero-row input, unlike 1:n().
  mutate(ID = row_number()) %>%
  # `sep` is a regular expression, so the plus sign must be escaped, and the
  # backslash itself must be doubled inside an R string literal.
  separate_rows(letter, sep = "\\+") %>%
  left_join(dt2, by = "letter") %>%
  group_by(ID) %>%
  # Collapse every non-grouping column (letter and number) back to one string.
  summarize(across(everything(), ~ paste0(.x, collapse = "+"))) %>%
  ungroup() %>%
  select(-ID)
output
# # A tibble: 5 x 2
# letter number
# <chr> <chr>
# 1 a 1
# 2 b+c 2+3
# 3 c 3
# 4 d+e+f+g 4+5+6+7
# 5 a+g+e 1+7+5
dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

library(tidyverse)

dt1 %>%
  as_tibble() %>%
  mutate(
    number = map_chr(
      # str_split() is vectorised, so no rowwise() / temporary column is needed.
      # fixed("+") matches a literal plus sign without any regex escaping.
      str_split(letter, pattern = fixed("+")),
      # match() gives row positions in dt2; index dt2$number with them so the
      # result does not rely on the numbers happening to equal the positions.
      ~ paste0(dt2$number[match(.x, dt2$letter)], collapse = "+")
    )
  )
#> # A tibble: 5 x 2
#> letter number
#> <chr> <chr>
#> 1 a 1
#> 2 b+c 2+3
#> 3 c 3
#> 4 d+e+f+g 4+5+6+7
#> 5 a+g+e 1+7+5
由 reprex 包 (v2.0.1) 于 2021-12-14 创建
无需拆分任何数据,因为您只是想把特定的字母替换为特定的数字。
dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  stringsAsFactors = FALSE
)
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7"),
  stringsAsFactors = FALSE
)

library(dplyr)   # provides %>% and mutate(), which the pipeline below needs
library(stringi)

# With vectorize_all = FALSE every pattern/replacement pair is applied in turn
# to each string. The letters are used as regular expressions here, so this
# relies on them containing no regex metacharacters, and on no replacement
# text itself matching a later pattern.
dt1 %>%
  mutate(
    number = stri_replace_all_regex(
      letter, dt2$letter, dt2$number,
      vectorize_all = FALSE
    )
  )
letter number
1 a 1
2 b+c 2+3
3 c 3
4 d+e+f+g 4+5+6+7
5 a+g+e 1+7+5
另一个解决方案可能更短
library(dplyr)    # %>% and mutate()
library(stringr)  # str_replace_all()

dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  stringsAsFactors = FALSE
)

# Named vector: names are the patterns (letters), values the replacements.
v <- c("1", "2", "3", "4", "5", "6", "7")
names(v) <- c("a", "b", "c", "d", "e", "f", "g")

# str_replace_all() accepts a named vector and applies every
# pattern = replacement pair to each string.
dt1 %>% mutate(number = str_replace_all(letter, v))
一种使用 library(stringi) 的快速且可读的方法:
library(stringi)

# Strings to translate: letters joined by "+".
dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e")
)
# Translation table: one row per letter with its number as text.
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# vectorize_all = FALSE applies every pattern/replacement pair to each element
# of the input instead of pairing them up element by element.
dt1[["number"]] <- stri_replace_all_fixed(
  dt1[["letter"]],
  pattern = dt2[["letter"]],
  replacement = dt2[["number"]],
  vectorize_all = FALSE
)
dt1
结果:
> dt1
letter number
1 a 1
2 b+c 2+3
3 c 3
4 d+e+f+g 4+5+6+7
5 a+g+e 1+7+5
另请参阅相关帖子(原文此处的链接已丢失)。
编辑:当前可用答案的基准:
Unit: microseconds
expr min lq mean median uq max neval
Sotos 2689.6 2689.6 2689.6 2689.6 2689.6 2689.6 1
ismirsehregal 26.4 26.4 26.4 26.4 26.4 26.4 1
www 42247.8 42247.8 42247.8 42247.8 42247.8 42247.8 1
MerijnvanTilborg 1723.5 1723.5 1723.5 1723.5 1723.5 1723.5 1
YuriySaraykin 21859.2 21859.2 21859.2 21859.2 21859.2 21859.2 1
danlooo 4165.7 4165.7 4165.7 4165.7 4165.7 4165.7 1
要重现基准:
library(microbenchmark)
library(tidyverse)
library(stringi)

dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# One named expression per answer; the names are the answer authors.
# The expressions are kept as posted so the timings describe those answers.
# The only changes are "\\+" (a single backslash before "+" is not a valid R
# string escape, so the script would not even parse) and FALSE instead of F.
microbenchmark(
  Sotos = {
    # NOTE: `%in%` yields the numbers in dt2 row order, not in letter order.
    sapply(strsplit(dt1$letter, "+", fixed = TRUE), function(i) {
      paste(dt2$number[dt2$letter %in% i], collapse = "+")
    })
  },
  ismirsehregal = {
    stri_replace_all_fixed(
      dt1$letter,
      pattern = dt2$letter,
      replacement = dt2$number,
      vectorize_all = FALSE
    )
  },
  www = {
    dt1 %>%
      mutate(ID = 1:n()) %>%
      separate_rows(letter, sep = "\\+") %>%
      left_join(dt2, by = "letter") %>%
      group_by(ID) %>%
      summarize(across(.fns = ~ paste0(., collapse = "+"))) %>%
      ungroup() %>%
      select(-ID)
  },
  MerijnvanTilborg = {
    dt1 %>%
      mutate(
        MerijnvanTilborg = stri_replace_all_regex(
          letter, dt2$letter, dt2$number,
          vectorize_all = FALSE
        )
      )
  },
  YuriySaraykin = {
    dt1 %>%
      rowwise() %>%
      mutate(tmp = str_split(letter, pattern = "\\+")) %>%
      ungroup() %>%
      mutate(
        number = map_chr(tmp, ~ paste0(match(.x, dt2$letter), collapse = "+"))
      ) %>%
      select(-tmp)
  },
  danlooo = {
    dt1 %>%
      as_tibble() %>%
      mutate(number = letter %>% map_chr(
        ~ .x %>%
          str_split("[+]") %>%
          simplify() %>%
          map_chr(~ deframe(dt2)[.x]) %>%
          paste0(collapse = "+")
      ))
  },
  # A single evaluation per expression, matching the reported table
  # (neval = 1); raise this for more stable timings.
  times = 1L
)