如何匹配列中的重复值、检查条件并在 R 中输出最终结果?
How can I match duplicated values in a column, examine the condition and output a final result in R?
我有一个这样的字典表(dictionary table):
| ID | Position | Region |
|---|---|---|
| a | 1-50 | D1a |
| a | 80-100 | D2a |
| a | 250-300 | D3a |
| b | 50-100 | D1b |
| b | 150-180 | D2b |
| c | 1-20 | D1c |
| c | 50-80 | D2c |
| c | 100-200 | D3c |
| c | 250-300 | D4c |
还有一个这样的目标表(target table):
| ID | Position |
|---|---|
| a | 28 |
| a | 85 |
| a | 320 |
| b | 55 |
| b | 100 |
| c | 18 |
| c | 45 |
| c | 180 |
| c | 270 |
逻辑是:对目标表中的每一行,检查其 Position 数值是否落在字典表中同一 ID 的某个 Position 范围内;如果是,则输出该范围对应的 Region 值。
我首先想到 R 包 `hash` 应该可以,但后来发现哈希键必须唯一,而我的 ID 是重复的。
如何先按 ID 匹配,然后用 `between()` 或其他函数把目标表中的每个 Position 映射到各自的 Region?
这是您的预期结果吗?
library(data.table)
# Dictionary table: one row per ID / Position range / Region, with the range
# stored as a "start-end" string.
dt1 <- fread(text = "
ID Position Region
a 1-50 D1a
a 80-100 D2a
a 250-300 D3a
b 50-100 D1b
b 150-180 D2b
c 1-20 D1c
c 50-80 D2c
c 100-200 D3c
c 250-300 D4c
")
# Target table: one numeric Position per row, each tagged with an ID.
dt2 <- fread(text = "
ID Position
a 28
a 85
a 320
b 55
b 100
c 18
c 45
c 180
c 270
")
# Range lookup by ID. Comparing dt2$Position against dt1's ranges row by row
# only pairs rows that happen to share an index (e.g. b/100 is tested against
# b's 150-180 row and wrongly yields NA), so use a non-equi join instead.
# Split "start-end" into integer bounds on a copy so dt1 itself is untouched.
ranges <- copy(dt1)[
  , c("start", "end") := tstrsplit(Position, "-", type.convert = TRUE)
]
# One output row per dt2 row (one per matching range if ranges overlap);
# Region is NA when no range of that ID contains the Position.
ranges[
  dt2,
  on = .(ID, start <= Position, end >= Position),
  .(ID = i.ID, Position = i.Position, Region = x.Region)
]
#        ID Position Region
#    <char>    <int> <char>
# 1:      a       28    D1a
# 2:      a       85    D2a
# 3:      a      320   <NA>
# 4:      b       55    D1b
# 5:      b      100    D1b
# 6:      c       18    D1c
# 7:      c       45   <NA>
# 8:      c      180    D3c
# 9:      c      270    D4c
也许可以像这样使用 tidyverse 的方法(`tidyr::separate`)。
不过这只是一个快速的回答,我不确定是否正确理解了您数据之间的关系。
library(tidyverse)
# Dictionary of regions: each ID owns several "min-max" position ranges.
one <- tibble::tibble(
  ID = c("a", "a", "a", "b", "b", "c", "c", "c", "c"),
  Position = c(
    "1-50", "80-100", "250-300", "50-100", "150-180",
    "1-20", "50-80", "100-200", "250-300"
  ),
  Region = c("D1a", "D2a", "D3a", "D1b", "D2b", "D1c", "D2c", "D3c", "D4c")
)
# Targets: integer positions, each tagged with an ID.
two <- tibble::tibble(
  ID = c("a", "a", "a", "b", "b", "c", "c", "c", "c"),
  Position = c(28L, 85L, 320L, 55L, 100L, 18L, 45L, 180L, 270L)
)
# Split the "min-max" string into integer bounds; convert = TRUE replaces the
# superseded mutate_at(..., as.integer) step.
one_ <- one %>%
  tidyr::separate(Position, c("p_min", "p_max"), sep = "-", convert = TRUE)
two %>%
  # rn tags each target row so it can be de-duplicated after the join.
  mutate(rn = row_number()) %>%
  # Pair every target with all ranges of the same ID. The key is explicit, so
  # dplyr no longer prints the 'Joining, by = "ID"' message.
  left_join(one_, by = "ID") %>%
  mutate(in_between = (Position >= p_min & Position <= p_max)) %>%
  # Keep only pairs whose range contains the position; targets that fall in
  # no range (a/320, c/45) are dropped here.
  filter(in_between) %>%
  # At most one row per target, even if several ranges matched.
  distinct(rn, .keep_all = TRUE)
# # A tibble: 7 × 7
#   ID    Position    rn p_min p_max Region in_between
#   <chr>    <int> <int> <int> <int> <chr>  <lgl>
# 1 a           28     1     1    50 D1a    TRUE
# 2 a           85     2    80   100 D2a    TRUE
# 3 b           55     4    50   100 D1b    TRUE
# 4 b          100     5    50   100 D1b    TRUE
# 5 c           18     6     1    20 D1c    TRUE
# 6 c          180     8   100   200 D3c    TRUE
# 7 c          270     9   250   300 D4c    TRUE
将 Position 列拆分为两列后,您可以使用 fuzzyjoin 包执行范围连接。
以下使用 @Guillaume 回答中的数据(`one` 和 `two`):
library(dplyr)
library(fuzzyjoin)
library(tidyr)
# Range join: split Position into integer min/max bounds, then keep every row
# of `two` and attach the range (if any) with the same ID that satisfies
# min <= Position <= max.
one %>%
  separate(col = Position, into = c("min", "max"), sep = "-", convert = TRUE) %>%
  fuzzy_right_join(
    y = two,
    by = c("ID", "min" = "Position", "max" = "Position"),
    match_fun = list(`==`, `<=`, `>=`)
  ) %>%
  # ID.y is the ID coming from `two`, so unmatched targets keep their ID.
  select(ID = ID.y, Position, min, max, Region)
# ID Position min max Region
# <chr> <int> <int> <int> <chr>
#1 a 28 1 50 D1a
#2 a 85 80 100 D2a
#3 a 320 NA NA NA
#4 b 55 50 100 D1b
#5 b 100 50 100 D1b
#6 c 18 1 20 D1c
#7 c 45 NA NA NA
#8 c 180 100 200 D3c
#9 c 270 250 300 D4c
根据您期望的输出,也可以改用 `fuzzy_inner_join`、`fuzzy_left_join` 或 `fuzzy_full_join`。
我有一个这样的字典表(dictionary table):
ID | Position | Region |
---|---|---|
a | 1-50 | D1a |
a | 80-100 | D2a |
a | 250-300 | D3a |
b | 50-100 | D1b |
b | 150-180 | D2b |
c | 1-20 | D1c |
c | 50-80 | D2c |
c | 100-200 | D3c |
c | 250-300 | D4c |
还有一个这样的目标表(target table):
ID | Position |
---|---|
a | 28 |
a | 85 |
a | 320 |
b | 55 |
b | 100 |
c | 18 |
c | 45 |
c | 180 |
c | 270 |
逻辑是:对目标表中的每一行,检查其 Position 数值是否落在字典表中同一 ID 的某个 Position 范围内;如果是,则输出该范围对应的 Region 值。
我首先想到 R 包 `hash` 应该可以,但后来发现哈希键必须唯一,而我的 ID 是重复的。
如何先按 ID 匹配,然后用 `between()` 或其他函数把目标表中的每个 Position 映射到各自的 Region?
这是您的预期结果吗?
library(data.table)
# Dictionary table: one row per ID / Position range / Region, with the range
# stored as a "start-end" string.
dt1 <- fread(text = "
ID Position Region
a 1-50 D1a
a 80-100 D2a
a 250-300 D3a
b 50-100 D1b
b 150-180 D2b
c 1-20 D1c
c 50-80 D2c
c 100-200 D3c
c 250-300 D4c
")
# Target table: one numeric Position per row, each tagged with an ID.
dt2 <- fread(text = "
ID Position
a 28
a 85
a 320
b 55
b 100
c 18
c 45
c 180
c 270
")
# Range lookup by ID. Comparing dt2$Position against dt1's ranges row by row
# only pairs rows that happen to share an index (e.g. b/100 is tested against
# b's 150-180 row and wrongly yields NA), so use a non-equi join instead.
# Split "start-end" into integer bounds on a copy so dt1 itself is untouched.
ranges <- copy(dt1)[
  , c("start", "end") := tstrsplit(Position, "-", type.convert = TRUE)
]
# One output row per dt2 row (one per matching range if ranges overlap);
# Region is NA when no range of that ID contains the Position.
ranges[
  dt2,
  on = .(ID, start <= Position, end >= Position),
  .(ID = i.ID, Position = i.Position, Region = x.Region)
]
#        ID Position Region
#    <char>    <int> <char>
# 1:      a       28    D1a
# 2:      a       85    D2a
# 3:      a      320   <NA>
# 4:      b       55    D1b
# 5:      b      100    D1b
# 6:      c       18    D1c
# 7:      c       45   <NA>
# 8:      c      180    D3c
# 9:      c      270    D4c
也许可以像这样使用 tidyverse 的方法(`tidyr::separate`)。
不过这只是一个快速的回答,我不确定是否正确理解了您数据之间的关系。
library(tidyverse)
# Dictionary of regions: each ID owns several "min-max" position ranges.
one <- tibble::tibble(
  ID = c("a", "a", "a", "b", "b", "c", "c", "c", "c"),
  Position = c(
    "1-50", "80-100", "250-300", "50-100", "150-180",
    "1-20", "50-80", "100-200", "250-300"
  ),
  Region = c("D1a", "D2a", "D3a", "D1b", "D2b", "D1c", "D2c", "D3c", "D4c")
)
# Targets: integer positions, each tagged with an ID.
two <- tibble::tibble(
  ID = c("a", "a", "a", "b", "b", "c", "c", "c", "c"),
  Position = c(28L, 85L, 320L, 55L, 100L, 18L, 45L, 180L, 270L)
)
# Split the "min-max" string into integer bounds; convert = TRUE replaces the
# superseded mutate_at(..., as.integer) step.
one_ <- one %>%
  tidyr::separate(Position, c("p_min", "p_max"), sep = "-", convert = TRUE)
two %>%
  # rn tags each target row so it can be de-duplicated after the join.
  mutate(rn = row_number()) %>%
  # Pair every target with all ranges of the same ID. The key is explicit, so
  # dplyr no longer prints the 'Joining, by = "ID"' message.
  left_join(one_, by = "ID") %>%
  mutate(in_between = (Position >= p_min & Position <= p_max)) %>%
  # Keep only pairs whose range contains the position; targets that fall in
  # no range (a/320, c/45) are dropped here.
  filter(in_between) %>%
  # At most one row per target, even if several ranges matched.
  distinct(rn, .keep_all = TRUE)
# # A tibble: 7 × 7
#   ID    Position    rn p_min p_max Region in_between
#   <chr>    <int> <int> <int> <int> <chr>  <lgl>
# 1 a           28     1     1    50 D1a    TRUE
# 2 a           85     2    80   100 D2a    TRUE
# 3 b           55     4    50   100 D1b    TRUE
# 4 b          100     5    50   100 D1b    TRUE
# 5 c           18     6     1    20 D1c    TRUE
# 6 c          180     8   100   200 D3c    TRUE
# 7 c          270     9   250   300 D4c    TRUE
将 Position 列拆分为两列后,您可以使用 fuzzyjoin 包执行范围连接。
以下使用 @Guillaume 回答中的数据(`one` 和 `two`):
library(dplyr)
library(fuzzyjoin)
library(tidyr)
# Range join: split Position into integer min/max bounds, then keep every row
# of `two` and attach the range (if any) with the same ID that satisfies
# min <= Position <= max.
one %>%
  separate(col = Position, into = c("min", "max"), sep = "-", convert = TRUE) %>%
  fuzzy_right_join(
    y = two,
    by = c("ID", "min" = "Position", "max" = "Position"),
    match_fun = list(`==`, `<=`, `>=`)
  ) %>%
  # ID.y is the ID coming from `two`, so unmatched targets keep their ID.
  select(ID = ID.y, Position, min, max, Region)
# ID Position min max Region
# <chr> <int> <int> <int> <chr>
#1 a 28 1 50 D1a
#2 a 85 80 100 D2a
#3 a 320 NA NA NA
#4 b 55 50 100 D1b
#5 b 100 50 100 D1b
#6 c 18 1 20 D1c
#7 c 45 NA NA NA
#8 c 180 100 200 D3c
#9 c 270 250 300 D4c
根据您期望的输出,也可以改用 `fuzzy_inner_join`、`fuzzy_left_join` 或 `fuzzy_full_join`。