根据另一个变量 R 的位置替换字符串中的字符
replace characters in string based on positions from another variable R
我有以下数据框 xo。对于每一行,我想按顺序查找并替换 positions_of_Ns_to_remove 中列出的位置。示例中的 results 新变量应按顺序排列,并删除所有 R。在这种情况下我无法根据角色本身进行搜索 - 它必须基于角色的位置。
p <- data.frame(locus = c("1","2","3"), positions_of_Ns_to_remove = c("12,17,43,100","30,60,61,62",NA))
x <- data.frame(locus = c("1","1","2","3"), sequence = c("xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR","xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR","xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxRRRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx","xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"))
xo <- merge(x, p, by = c("locus"), all.x = T)
> xo
locus sequence positions_of_Ns_to_remove
1 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR 12,17,43,100
2 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR 12,17,43,100
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxRRRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 30,60,61,62
4 3 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx <NA>
如果 xo 中只有 1 行,则此方法有效,但当有多行时则无效。我想使用 tidyverse 函数/管道并尽可能避免循环。
xo %>% dplyr::mutate(new_sequence = paste(
replace( unlist(strsplit(sequence, "")), as.integer(unlist(strsplit(positions_of_Ns_to_remove,","))), "" ),
collapse = "")
)
我想要的:
locus new_sequence positions_of_Ns_to_remove
1 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 12,17,43,100
2 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 12,17,43,100
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 30,60,61,62
4 3 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx <NA>
您可以构建自定义函数并将其应用于您的数据:
library(stringr)
# cuts the n-th character out of the string
remove_pos <- function(string, n) {
n <- as.integer(n)
n <- n[order(n, decreasing = TRUE)]
len <- nchar(string)
output <- string
for (i in n) {
output <- paste0(
str_sub(output, start = 1L, end = i - 1L),
str_sub(output, start = i + 1, end = len)
)
}
return(output)
}
xo %>%
mutate(positions = str_split(positions_of_Ns_to_remove, ",")) %>%
group_by(locus, n=row_number()) %>%
mutate(
new_seq = ifelse(!is.na(positions_of_Ns_to_remove),
remove_pos(sequence, unlist(positions)),
sequence)
) %>%
select(-positions) %>%
ungroup()
哪个returns
# A tibble: 5 x 4
locus sequence positions_of_Ns_to~ new_seq
<chr> <chr> <chr> <chr>
1 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxx~ 12,17,43,100 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
2 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxx~ 12,17,43,100 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxx~ 30,60,61,62 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
4 3 Rxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~ 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
5 4 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~ NA xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
我有以下数据框 xo。对于每一行,我想按顺序查找并替换 positions_of_Ns_to_remove 中列出的位置。示例中的 results 新变量应按顺序排列,并删除所有 R。在这种情况下我无法根据角色本身进行搜索 - 它必须基于角色的位置。
p <- data.frame(locus = c("1","2","3"), positions_of_Ns_to_remove = c("12,17,43,100","30,60,61,62",NA))
x <- data.frame(locus = c("1","1","2","3"), sequence = c("xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR","xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR","xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxRRRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx","xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"))
xo <- merge(x, p, by = c("locus"), all.x = T)
> xo
locus sequence positions_of_Ns_to_remove
1 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR 12,17,43,100
2 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR 12,17,43,100
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxRRRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 30,60,61,62
4 3 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx <NA>
如果 xo 中只有 1 行,则此方法有效,但当有多行时则无效。我想使用 tidyverse 函数/管道并尽可能避免循环。
xo %>% dplyr::mutate(new_sequence = paste(
replace( unlist(strsplit(sequence, "")), as.integer(unlist(strsplit(positions_of_Ns_to_remove,","))), "" ),
collapse = "")
)
我想要的:
locus new_sequence positions_of_Ns_to_remove
1 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 12,17,43,100
2 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 12,17,43,100
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 30,60,61,62
4 3 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx <NA>
您可以构建自定义函数并将其应用于您的数据:
library(stringr)
# cuts the n-th character out of the string
remove_pos <- function(string, n) {
n <- as.integer(n)
n <- n[order(n, decreasing = TRUE)]
len <- nchar(string)
output <- string
for (i in n) {
output <- paste0(
str_sub(output, start = 1L, end = i - 1L),
str_sub(output, start = i + 1, end = len)
)
}
return(output)
}
xo %>%
mutate(positions = str_split(positions_of_Ns_to_remove, ",")) %>%
group_by(locus, n=row_number()) %>%
mutate(
new_seq = ifelse(!is.na(positions_of_Ns_to_remove),
remove_pos(sequence, unlist(positions)),
sequence)
) %>%
select(-positions) %>%
ungroup()
哪个returns
# A tibble: 5 x 4
locus sequence positions_of_Ns_to~ new_seq
<chr> <chr> <chr> <chr>
1 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxx~ 12,17,43,100 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
2 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxx~ 12,17,43,100 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxx~ 30,60,61,62 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
4 3 Rxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~ 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
5 4 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~ NA xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~