R data.table 按条件连接

R Data.Table Join on Conditionals

我有两个 table,想用与以下 SQL 等价的方式把它们连接起来:连接条件有多个,而且不只是相等条件。

# Load packages with library() so a missing package stops the script right
# away; require() would only warn and return FALSE, failing later on.
library(sqldf)
library(data.table)

# Left-hand table of the join (aliased `dtone` in the SQL below).
dt <- data.table(
  num = c(1, 2, 3, 4, 5, 6),
  char = c('A', 'A', 'A', 'B', 'B', 'B'),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE))

# Right-hand table (aliased `dttwo`): same columns, different `num` order.
dt_two <- data.table(
  num = c(6, 1, 5, 2, 4, 3),
  char = c('A', 'A', 'A', 'B', 'B', 'B'),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE))

# Reference result: inner join on equal `char` AND
# (dtone.num >= dttwo.num OR dtone.bool), then per dtone row the sum and
# minimum of the matching dttwo.num values.
dt_out_sql <- sqldf('
    select dtone.num, dtone.char, dtone.bool, SUM(dttwo.num) as SUM,  
   MIN(dttwo.num) as MIN
    from dt as dtone INNER join dt_two as dttwo on 
    (dtone.char = dttwo.char) and 
    (dtone.num >= dttwo.num OR dtone.bool)
GROUP BY dtone.num, dtone.char, dtone.bool')

出于性能和灵活性方面的考虑,我想避免使用 SQL 方案。同样,我也不想先做交叉连接(cross join)再进行过滤/聚合——那样会生成一个中间 table,其中包含大量之后还得过滤掉的多余记录。

非常感谢!

更新——我最初的例子是仓促完成的。在我的实际问题中,我没有进行自连接。

这是一种方法:

library(data.table)

# Key both tables on `char` so the join below matches rows with equal `char`.
# Note: setkey() reorders each table by reference.
setkey(dt, char)
setkey(dt_two, char)

# For every row of `dt` (by = .EACHI), take the `dt_two` rows sharing its
# `char`, keep the `num` values that satisfy the non-equality condition
# (dt's bool is TRUE, or dt's num >= dt_two's num) and aggregate them.
# Inside j, `i.`-prefixed columns come from `dt`; unprefixed ones from `dt_two`.
# nomatch = 0L drops `dt` rows whose `char` is absent from `dt_two`, matching
# the INNER JOIN semantics of the SQL version.
dt_two[dt, {
  val <- num[i.bool | i.num >= num]
  # Return nothing when no partner row qualifies: the SQL inner join yields no
  # row in that case, and min() of an empty vector would give Inf + a warning.
  if (length(val) > 0L) {
    list(num = i.num, bool = i.bool, sum = sum(val), min = min(val))
  }
}, by = .EACHI, nomatch = 0L]
#    char num  bool sum min
# 1:    A   1  TRUE  12   1
# 2:    A   2 FALSE   1   1
# 3:    A   3  TRUE  12   1
# 4:    B   4 FALSE   9   2
# 5:    B   5  TRUE   9   2
# 6:    B   6 FALSE   9   2

想了解 by=.EACHI 的更多信息,请参阅 this post(在关于连接的 vignette 文档完成之前,可先看这篇)。

HTH

它有点难看但有效:

library(data.table)
library(sqldf)

# Left-hand table of the join (aliased `dtone` in the SQL below).
dt <- data.table(num=c(1, 2, 3, 4, 5, 6), 
                 char=c('A', 'A', 'A', 'B', 'B', 'B'), 
                 bool=c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE))

# Right-hand table (aliased `dttwo`).
dt_two <- data.table(
  num =c(6, 1, 5, 2, 4, 3), 
  char=c('A', 'A', 'A', 'B', 'B', 'B'), 
  bool=c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE))


# Reference result computed with SQL.
dt_out_sql <- sqldf('
    select dtone.num,
            dtone.char,
            dtone.bool,
            SUM(dttwo.num) as SUM,  
            MIN(dttwo.num) as MIN
    from    dt as dtone
    INNER join dt_two as dttwo on 
          (dtone.char = dttwo.char) and 
          (dtone.num >= dttwo.num OR dtone.bool)
    GROUP BY dtone.num, dtone.char, dtone.bool
  ')

# Convert the sqldf result to a data.table (by reference) for the comparison.
setDT(dt_out_sql)

# Key both tables on `char`; the join below matches on this key.
setkey(dt, char)
setkey(dt_two, char)

# Step 1: equi-join on `char`, keeping every pairing within a `char` group
#         (allow.cartesian = TRUE permits the many-to-many result;
#         nomatch = 0 makes it an inner join). Unprefixed columns come from
#         `dt`, `i.`-prefixed ones from `dt_two`.
# Step 2: apply the non-equality part of the condition as a filter, then
#         aggregate per original `dt` row.
dt_out_r <- dt[dt_two,
               list(dtone.num = num,
                    dttwo.num = i.num,
                    char,
                    bool) ,
               nomatch = 0, allow.cartesian = TRUE
               ][
                 dtone.num >= dttwo.num | bool,
                 list(SUM = sum(dttwo.num),
                      MIN = min(dttwo.num)),
                 by = list(num = dtone.num,
                           char,
                           bool)
                 ]

# Sort the result so its row order is deterministic before comparing.
setkey(dt_out_r, num, char, bool)


# Compare values only; attributes (e.g. keys) differ between the two results.
all.equal(dt_out_sql, dt_out_r, check.attributes = FALSE)

自 data.table 1.9.8 起,如果连接条件可以放宽(例如只保留比较条件),可以直接使用简洁的非等值连接(non-equi join)语法,如下所示:

# In X[Y, on = .(...)] the left side of each condition names X's column and
# the right side names Y's. Here X is dt_two and Y is dt, so the SQL condition
# dtone.num >= dttwo.num is written as dt_two's num <= dt's num.
dt_two[dt, on = .(char, num <= num)]