使用值范围连接两个数据框

Joining two data frames using range of values

我有两个数据集需要连接。income_range 是主数据集,我想根据收入落在哪个范围内,把 data_occ 连接到 income_range 上。如果同一范围内有不止一个观测值(收入),我想取其中收入最低的那一个。

我尝试使用 data.table,但遇到了问题。如果可能的话,我还想保留两个数据框中的所有列。

输出数据集应该只有 7 个观测值。

library(data.table)
library(dplyr)

# Income brackets for France: each row holds the lower and upper bound of one
# range together with its `perct` value.
income_range <- data.frame(
  id        = "France",
  inc_lower = c(10, 21, 31, 41, 51, 61, 71),
  inc_high  = c(20, 30, 40, 50, 60, 70, 80),
  perct     = c(1, 2, 3, 4, 5, 6, 7)
)

# 100 occupation records: 50 for France followed by 50 for Belgium.
# Only 50 distinct incomes are drawn (no seed is set, so they differ between
# runs); the same 50 values are repeated for the second half of the rows.
data_occ <- data.frame(
  id     = rep(c("France", "Belgium"), each = 50),
  income = rep(sample(10:80, 50), times = 2),
  occ    = rep(c("manager", "clerk", "manual", "skilled", "office"), each = 20)
)

# Convert both data frames to data.tables in place (by reference), so the
# data.table join syntax used below can be applied to them.
setDT(income_range)
setDT(data_occ)

第一次尝试。

# Non-equi join of income_range (x) against data_occ (i): rows are matched on
# equal id and inc_lower <= income <= inc_high.
# NOTE(review): in an x[i] join the result is driven by the rows of `i`, so
# this presumably returns a row for every data_occ record (unmatched ones
# filled with NA) rather than one row per income range; confirm against the
# expected 7-row output.
df2 <- income_range [data_occ , 
            on = .(id, inc_lower <= income, inc_high >= income),
            .(id, income, inc_lower,inc_high,perct,occ)]

提前致谢。

由于您标记了 dplyr,这里有一个使用该库的可能解决方案:

library('fuzzyjoin')

# Left-join every income bracket to the occupation records whose id is equal
# and whose income lies within [inc_lower, inc_high]. The three comparison
# functions in `match_fun` are applied to the three column pairs in `by`,
# in order.
joined <- fuzzy_left_join(
  income_range,
  data_occ,
  by = c(id = "id", inc_lower = "income", inc_high = "income"),
  match_fun = list(`==`, `<=`, `>=`)
) %>%
  # `id` exists in both inputs, so the join returns id.x and id.y; keep a
  # single `id` column taken from income_range.
  select(-id.y) %>%
  rename(id = id.x)

# For each perct bracket keep the matched row with the lowest income:
# order all rows by income, then take the leading row of every perct group.
# The result is still grouped by perct.
result <- joined %>%
  arrange(income) %>%
  group_by(perct) %>%
  slice_head(n = 1)

以及(中间)结果:

> head(joined)
      id inc_lower inc_high perct income     occ
1 France        10       20     1     10 manager
2 France        10       20     1     19 manager
3 France        10       20     1     14 manager
4 France        10       20     1     11 manager
5 France        10       20     1     17 manager
6 France        10       20     1     12 manager

> result    
# A tibble: 7 x 6
# Groups:   perct [7]
  id     inc_lower inc_high perct income occ    
  <chr>      <dbl>    <dbl> <dbl>  <int> <chr>  
1 France        10       20     1     10 manager
2 France        21       30     2     21 manual 
3 France        31       40     3     31 manual 
4 France        41       50     4     43 manager
5 France        51       60     5     51 clerk  
6 France        61       70     6     61 manager
7 France        71       80     7     71 manager

为了便于理解,我添加了中间数据框 joined。您可以省略它,只需用 %>% 将这两个命令链连接在一起即可。

这是一种 data.table 方法:

# A non-equi join compares columns on both sides, so temporarily copy
# `income` into inc_lower and inc_high on data_occ (added by reference).
cols <- c("inc_lower", "inc_high")
data_occ[, (cols) := income]

# Sort by income so that, with mult = "first", each income range is paired
# with its lowest matching income. income_range is the `i` table, so the
# result has one row per income range.
result <- data_occ[order(income)][
  income_range,
  on = .(id, inc_lower >= inc_lower, inc_high <= inc_high),
  mult = "first"
]

# Remove the temporary helper columns again.
data_occ[, (cols) := NULL]


#        id income     occ inc_lower inc_high perct
# 1: France     10   clerk        10       20     1
# 2: France     21 manager        21       30     2
# 3: France     31   clerk        31       40     3
# 4: France     41   clerk        41       50     4
# 5: France     51   clerk        51       60     5
# 6: France     62 manager        61       70     6
# 7: France     71 manager        71       80     7