根据与另一列的部分匹配创建新列
Create new column based on partial match with another column
我想根据与另一列的部分匹配,为数据框创建一个新列。问题是我的值只能部分匹配:名称末尾的后缀 _3p 或 _5p 仅存在于原始数据框中,而不存在于我用来比对的另一列中。
我使用的代码本应可以工作,但由于只是部分匹配,我被卡住了。
> head(df)
# A tibble: 6 x 2
microRNAs `number of targets`
<chr> <int>
1 bantam|LQNS02278082.1_33125_3p 128
2 bantam|LQNS02278082.1_33125_5p 8
3 Dpu-Mir-10-P2_LQNS02277998.1_30984_3p 44
4 Dpu-Mir-10-P2_LQNS02277998.1_30984_5p 78
5 Dpu-Mir-10-P3_LQNS02277998.1_30988_3p 1076
6 Dpu-Mir-10-P3_LQNS02277998.1_30988_5p 309
> dput(head(df))
structure(list(microRNAs = c("bantam|LQNS02278082.1_33125_3p",
"bantam|LQNS02278082.1_33125_5p", "Dpu-Mir-10-P2_LQNS02277998.1_30984_3p",
"Dpu-Mir-10-P2_LQNS02277998.1_30984_5p", "Dpu-Mir-10-P3_LQNS02277998.1_30988_3p",
"Dpu-Mir-10-P3_LQNS02277998.1_30988_5p"), `number of targets` = c(128L,
8L, 44L, 78L, 1076L, 309L)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
#matches to look for
unique
1 miR-9|LQNS02278094.1_36129
2 LQNS02278139.1_39527
3 LQNS02278139.1_39523
4 LQNS02278075.1_32386
5 Dpu-Mir-10-P3_LQNS02277998.1_30988
> dput(head(unique))
structure(list(unique = c("miR-9|LQNS02278094.1_36129",
"LQNS02278139.1_39527", "LQNS02278139.1_39523", "LQNS02278075.1_32386",
"Dpu-Mir-10-P3_LQNS02277998.1_30988")), row.names = c(NA,
6L), class = "data.frame")
# Create the new Yes/No column from an exact membership test
df[["new"]] <- ifelse(df[["microRNAs"]] %in% unique[["unique"]], "Yes", "No")
# Every row comes out as "No": the names in df carry a _3p/_5p suffix that the
# lookup values lack, so an exact match never succeeds.
我们可以使用 fuzzyjoin
中的 regex_left_join
library(fuzzyjoin)
# regex_left_join() would interpret every ID in `unique$unique` as a regular
# expression: the "|" in "miR-9|LQNS02278094.1_36129" becomes alternation (so
# any name containing "miR-9" matches) and "." matches any character.
# fuzzy_left_join() with an explicit match function compares the IDs literally
# instead: a row matches when its name, minus a trailing _3p/_5p, equals the ID.
same_id <- function(x, y) sub("_[35]p$", "", x) == y
# Left join: rows of df without a match get NA in the joined `unique` column.
fuzzy_left_join(
  df, unique,
  by = c("microRNAs" = "unique"),
  match_fun = same_id
)
使用 data.table
的快速解决方案。
library(data.table)
# convert data.frame to data.table (setDT modifies df by reference)
setDT(df)
# create a helper column without the strand suffix; only a trailing "_3p" or
# "_5p" is removed, so names that lack the suffix are left intact instead of
# losing their last three characters
df[, microRNAs_short := sub("_[35]p$", "", microRNAs)]
# check values in common; the lookup table shown in the question is the data
# frame named `unique` (there is no `df2` defined anywhere)
df[, new := fifelse(microRNAs_short %in% unique$unique, "Yes", "No")]
我想根据与另一列的部分匹配,为数据框创建一个新列。问题是我的值只能部分匹配:名称末尾的后缀 _3p 或 _5p 仅存在于原始数据框中,而不存在于我用来比对的另一列中。
我使用的代码本应可以工作,但由于只是部分匹配,我被卡住了。
> head(df)
# A tibble: 6 x 2
microRNAs `number of targets`
<chr> <int>
1 bantam|LQNS02278082.1_33125_3p 128
2 bantam|LQNS02278082.1_33125_5p 8
3 Dpu-Mir-10-P2_LQNS02277998.1_30984_3p 44
4 Dpu-Mir-10-P2_LQNS02277998.1_30984_5p 78
5 Dpu-Mir-10-P3_LQNS02277998.1_30988_3p 1076
6 Dpu-Mir-10-P3_LQNS02277998.1_30988_5p 309
> dput(head(df))
structure(list(microRNAs = c("bantam|LQNS02278082.1_33125_3p",
"bantam|LQNS02278082.1_33125_5p", "Dpu-Mir-10-P2_LQNS02277998.1_30984_3p",
"Dpu-Mir-10-P2_LQNS02277998.1_30984_5p", "Dpu-Mir-10-P3_LQNS02277998.1_30988_3p",
"Dpu-Mir-10-P3_LQNS02277998.1_30988_5p"), `number of targets` = c(128L,
8L, 44L, 78L, 1076L, 309L)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
#matches to look for
unique
1 miR-9|LQNS02278094.1_36129
2 LQNS02278139.1_39527
3 LQNS02278139.1_39523
4 LQNS02278075.1_32386
5 Dpu-Mir-10-P3_LQNS02277998.1_30988
> dput(head(unique))
structure(list(unique = c("miR-9|LQNS02278094.1_36129",
"LQNS02278139.1_39527", "LQNS02278139.1_39523", "LQNS02278075.1_32386",
"Dpu-Mir-10-P3_LQNS02277998.1_30988")), row.names = c(NA,
6L), class = "data.frame")
# Create the new Yes/No column from an exact membership test
df[["new"]] <- ifelse(df[["microRNAs"]] %in% unique[["unique"]], "Yes", "No")
# Every row comes out as "No": the names in df carry a _3p/_5p suffix that the
# lookup values lack, so an exact match never succeeds.
我们可以使用 fuzzyjoin
中的 regex_left_join
library(fuzzyjoin)
# regex_left_join() would interpret every ID in `unique$unique` as a regular
# expression: the "|" in "miR-9|LQNS02278094.1_36129" becomes alternation (so
# any name containing "miR-9" matches) and "." matches any character.
# fuzzy_left_join() with an explicit match function compares the IDs literally
# instead: a row matches when its name, minus a trailing _3p/_5p, equals the ID.
same_id <- function(x, y) sub("_[35]p$", "", x) == y
# Left join: rows of df without a match get NA in the joined `unique` column.
fuzzy_left_join(
  df, unique,
  by = c("microRNAs" = "unique"),
  match_fun = same_id
)
使用 data.table
的快速解决方案。
library(data.table)
# convert data.frame to data.table (setDT modifies df by reference)
setDT(df)
# create a helper column without the strand suffix; only a trailing "_3p" or
# "_5p" is removed, so names that lack the suffix are left intact instead of
# losing their last three characters
df[, microRNAs_short := sub("_[35]p$", "", microRNAs)]
# check values in common; the lookup table shown in the question is the data
# frame named `unique` (there is no `df2` defined anywhere)
df[, new := fifelse(microRNAs_short %in% unique$unique, "Yes", "No")]