如何根据组内的子串匹配两个数据框
How to match two data frames based on substrings within groups
我想合并两个按相同标识符分组的数据框。第一个数据框中的变量 (value) 应作为子字符串与第二个数据框中的变量 (valueB) 进行匹配,但匹配仅限于同一组内。
我已经能够完成变量之间的匹配,但很难把匹配限制在各个分组之内。以下是示例数据框和匹配代码:
# Sample data. Per the question, df1$value holds the search terms and
# df2$valueB the longer strings that should contain them; both frames share
# the `report` and `identifier` columns.
df1 <- data.frame(
  report = rep("Report1", 6),
  identifier = rep(c("Abraham", "Barack"), each = 3),
  variableA = rep(paste0("V", 1:3), times = 2),
  value = c("CDKN2A/B", "PALB2", "KRAS", "TP53", "RB1", "KRAS")
)
df2 <- data.frame(
  report = rep("Report1", 7),
  identifier = rep(c("Abraham", "Barack"), times = c(4, 3)),
  variableB = paste0("F", c(1:4, 1:3)),
  valueB = c(
    "CDKN2A/B LOSS", "PALB2 P1111FS*13", "KRAS G12R", "PALB2 N540FS*1",
    "RB1 SPLICE SITE 2325+1G>A", "KRAS G13C", "TP53 C238F"
  )
)
这是我试过的代码,但它无法按组进行匹配:
# Ungrouped attempt: for every df1$value, collect the df2 rows whose valueB
# matches it (grep treats the value as a regular expression).
# NOTE: `identifier` is never consulted here, so matches can cross groups.
# simplify = FALSE keeps the result a list in all cases; plain sapply() would
# collapse it to a vector/matrix whenever every pattern hits the same number
# of rows, which breaks the per-element bookkeeping below.
idx2 <- sapply(df1$value, grep, df2$valueB, simplify = FALSE)
# Repeat each df1 row index once per matching df2 row.
idx1 <- rep(seq_along(idx2), lengths(idx2))
# Bind the matched df1 rows side by side with their df2 counterparts.
idx3 <- cbind(df1[idx1, , drop = FALSE], df2[unlist(idx2), , drop = FALSE])
预期输出为(数据框代码)
# Expected result: one row per df2 row, paired with the df1 row of the same
# identifier whose `value` occurs inside `valueB`.
df3 <- data.frame(
  report = rep("Report1", 7),
  identifier = rep(c("Abraham", "Barack"), times = c(4, 3)),
  variableA = paste0("V", c(1, 2, 3, 2, 1, 2, 3)),
  value = c("CDKN2A/B", "PALB2", "KRAS", "PALB2", "TP53", "RB1", "KRAS"),
  variableB = paste0("F", c(1:4, 1:3)),
  valueB = c(
    "CDKN2A/B LOSS", "PALB2 P1111FS*13", "KRAS G12R", "PALB2 N540FS*1",
    "TP53 C238F", "RB1 SPLICE SITE 2325+1G>A", "KRAS G13C"
  )
)
结果数据帧
report identifier variableA value variableB valueB
Report1 Abraham V1 CDKN2A/B F1 CDKN2A/B LOSS
Report1 Abraham V2 PALB2 F2 PALB2 P1111FS*13
Report1 Abraham V3 KRAS F3 KRAS G12R
Report1 Abraham V2 PALB2 F4 PALB2 N540FS*1
Report1 Barack V1 TP53 F1 TP53 C238F
Report1 Barack V2 RB1 F2 RB1 SPLICE SITE 2325+1G>A
Report1 Barack V3 KRAS F3 KRAS G13C
希望这是有道理的,非常感谢您的帮助!
您可以使用 fuzzyjoin 软件包来实现:
library(fuzzyjoin)
library(stringr)
library(dplyr)

# Join df2 to df1 with one matching rule per key pair: df2$valueB must match
# df1$value (str_detect, value used as the pattern), and `identifier` must be
# equal (`==`), which keeps matches inside a group.
# df1 names its search column `value` (it has no `valueA` column), so `value`
# is the name used in `by` and in select(), and in the resulting column header.
fuzzy_inner_join(
  df2, df1,
  by = c("valueB" = "value", "identifier" = "identifier"),
  match_fun = list(str_detect, `==`)
) %>%
  select(report.x, identifier.x, variableA, value, variableB, valueB)
report.x identifier.x variableA valueA variableB valueB
1 Report1 Abraham V1 CDKN2A/B F1 CDKN2A/B LOSS
2 Report1 Abraham V2 PALB2 F2 PALB2 P1111FS*13
3 Report1 Abraham V3 KRAS F3 KRAS G12R
4 Report1 Abraham V2 PALB2 F4 PALB2 N540FS*1
5 Report1 Barack V2 RB1 F1 RB1 SPLICE SITE 2325+1G>A
6 Report1 Barack V3 KRAS F2 KRAS G13C
7 Report1 Barack V1 TP53 F3 TP53 C238F
这样您就可以对不同的列应用不同的匹配函数。在本例中,我们对模糊匹配列使用 str_detect(),对分组列使用 ==。
我们可以使用 str_extract 提取公共字符串,然后再进行合并,即:
library(stringr)
# Pull the df1 search term out of each df2$valueB, then do an ordinary merge;
# having `identifier` among the keys keeps matches inside a group.
# str_extract() is already vectorised over its input, so it is called once on
# the whole column with a single alternation pattern instead of rebuilding the
# pattern for every row inside sapply(). The df1 values are used as regular
# expressions, so values containing regex metacharacters would need escaping.
merge(
  df1,
  transform(
    df2,
    value = str_extract(df2$valueB, paste(unique(df1$value), collapse = "|"))
  ),
  by = c("value", "identifier", "report")
)
# value identifier report variableA variableB valueB
# 1 CDKN2A/B Abraham Report1 V1 F1 CDKN2A/B LOSS
# 2 KRAS Abraham Report1 V3 F3 KRAS G12R
# 3 KRAS Barack Report1 V3 F2 KRAS G13C
# 4 PALB2 Abraham Report1 V2 F2 PALB2 P1111FS*13
# 5 PALB2 Abraham Report1 V2 F4 PALB2 N540FS*1
# 6 RB1 Barack Report1 V2 F1 RB1 SPLICE SITE 2325+1G>A
# 7 TP53 Barack Report1 V1 F3 TP53 C238F
我想合并两个按相同标识符分组的数据框。第一个数据框中的变量 (value) 应作为子字符串与第二个数据框中的变量 (valueB) 进行匹配,但匹配仅限于同一组内。
我已经能够完成变量之间的匹配,但很难把匹配限制在各个分组之内。以下是示例数据框和匹配代码:
# Sample data. Per the question, df1$value holds the search terms and
# df2$valueB the longer strings that should contain them; both frames share
# the `report` and `identifier` columns.
df1 <- data.frame(
  report = rep("Report1", 6),
  identifier = rep(c("Abraham", "Barack"), each = 3),
  variableA = rep(paste0("V", 1:3), times = 2),
  value = c("CDKN2A/B", "PALB2", "KRAS", "TP53", "RB1", "KRAS")
)
df2 <- data.frame(
  report = rep("Report1", 7),
  identifier = rep(c("Abraham", "Barack"), times = c(4, 3)),
  variableB = paste0("F", c(1:4, 1:3)),
  valueB = c(
    "CDKN2A/B LOSS", "PALB2 P1111FS*13", "KRAS G12R", "PALB2 N540FS*1",
    "RB1 SPLICE SITE 2325+1G>A", "KRAS G13C", "TP53 C238F"
  )
)
这是我试过的代码,但它无法按组进行匹配:
# Ungrouped attempt: for every df1$value, collect the df2 rows whose valueB
# matches it (grep treats the value as a regular expression).
# NOTE: `identifier` is never consulted here, so matches can cross groups.
# simplify = FALSE keeps the result a list in all cases; plain sapply() would
# collapse it to a vector/matrix whenever every pattern hits the same number
# of rows, which breaks the per-element bookkeeping below.
idx2 <- sapply(df1$value, grep, df2$valueB, simplify = FALSE)
# Repeat each df1 row index once per matching df2 row.
idx1 <- rep(seq_along(idx2), lengths(idx2))
# Bind the matched df1 rows side by side with their df2 counterparts.
idx3 <- cbind(df1[idx1, , drop = FALSE], df2[unlist(idx2), , drop = FALSE])
预期输出为(数据框代码)
# Expected result: one row per df2 row, paired with the df1 row of the same
# identifier whose `value` occurs inside `valueB`.
df3 <- data.frame(
  report = rep("Report1", 7),
  identifier = rep(c("Abraham", "Barack"), times = c(4, 3)),
  variableA = paste0("V", c(1, 2, 3, 2, 1, 2, 3)),
  value = c("CDKN2A/B", "PALB2", "KRAS", "PALB2", "TP53", "RB1", "KRAS"),
  variableB = paste0("F", c(1:4, 1:3)),
  valueB = c(
    "CDKN2A/B LOSS", "PALB2 P1111FS*13", "KRAS G12R", "PALB2 N540FS*1",
    "TP53 C238F", "RB1 SPLICE SITE 2325+1G>A", "KRAS G13C"
  )
)
结果数据帧
report identifier variableA value variableB valueB
Report1 Abraham V1 CDKN2A/B F1 CDKN2A/B LOSS
Report1 Abraham V2 PALB2 F2 PALB2 P1111FS*13
Report1 Abraham V3 KRAS F3 KRAS G12R
Report1 Abraham V2 PALB2 F4 PALB2 N540FS*1
Report1 Barack V1 TP53 F1 TP53 C238F
Report1 Barack V2 RB1 F2 RB1 SPLICE SITE 2325+1G>A
Report1 Barack V3 KRAS F3 KRAS G13C
希望这是有道理的,非常感谢您的帮助!
您可以使用 fuzzyjoin 软件包来实现:
library(fuzzyjoin)
library(stringr)
library(dplyr)

# Join df2 to df1 with one matching rule per key pair: df2$valueB must match
# df1$value (str_detect, value used as the pattern), and `identifier` must be
# equal (`==`), which keeps matches inside a group.
# df1 names its search column `value` (it has no `valueA` column), so `value`
# is the name used in `by` and in select(), and in the resulting column header.
fuzzy_inner_join(
  df2, df1,
  by = c("valueB" = "value", "identifier" = "identifier"),
  match_fun = list(str_detect, `==`)
) %>%
  select(report.x, identifier.x, variableA, value, variableB, valueB)
report.x identifier.x variableA valueA variableB valueB
1 Report1 Abraham V1 CDKN2A/B F1 CDKN2A/B LOSS
2 Report1 Abraham V2 PALB2 F2 PALB2 P1111FS*13
3 Report1 Abraham V3 KRAS F3 KRAS G12R
4 Report1 Abraham V2 PALB2 F4 PALB2 N540FS*1
5 Report1 Barack V2 RB1 F1 RB1 SPLICE SITE 2325+1G>A
6 Report1 Barack V3 KRAS F2 KRAS G13C
7 Report1 Barack V1 TP53 F3 TP53 C238F
这样您就可以对不同的列应用不同的匹配函数。在本例中,我们对模糊匹配列使用 str_detect(),对分组列使用 ==。
我们可以使用 str_extract 提取公共字符串,然后再进行合并,即:
library(stringr)
# Pull the df1 search term out of each df2$valueB, then do an ordinary merge;
# having `identifier` among the keys keeps matches inside a group.
# str_extract() is already vectorised over its input, so it is called once on
# the whole column with a single alternation pattern instead of rebuilding the
# pattern for every row inside sapply(). The df1 values are used as regular
# expressions, so values containing regex metacharacters would need escaping.
merge(
  df1,
  transform(
    df2,
    value = str_extract(df2$valueB, paste(unique(df1$value), collapse = "|"))
  ),
  by = c("value", "identifier", "report")
)
# value identifier report variableA variableB valueB
# 1 CDKN2A/B Abraham Report1 V1 F1 CDKN2A/B LOSS
# 2 KRAS Abraham Report1 V3 F3 KRAS G12R
# 3 KRAS Barack Report1 V3 F2 KRAS G13C
# 4 PALB2 Abraham Report1 V2 F2 PALB2 P1111FS*13
# 5 PALB2 Abraham Report1 V2 F4 PALB2 N540FS*1
# 6 RB1 Barack Report1 V2 F1 RB1 SPLICE SITE 2325+1G>A
# 7 TP53 Barack Report1 V1 F3 TP53 C238F