R data.table 按条件连接
R Data.Table Join on Conditionals
我有两个表,想以等价于下面这条 SQL 的方式把它们连接起来:连接时使用多个条件,而不仅仅是相等条件。
# Reproducible setup for the question: two small tables plus the SQL query
# whose result the data.table solutions are expected to reproduce.
# library() is used instead of require() so that a missing package stops the
# script right here instead of failing later with "could not find function".
library(sqldf)
library(data.table)

# Left-hand table (aliased `dtone` in the SQL below).
dt <- data.table(
  num = c(1, 2, 3, 4, 5, 6),
  char = c("A", "A", "A", "B", "B", "B"),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)

# Right-hand table (aliased `dttwo` in the SQL below).
dt_two <- data.table(
  num = c(6, 1, 5, 2, 4, 3),
  char = c("A", "A", "A", "B", "B", "B"),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)

# Reference result: for every row of `dt`, sum and minimum of the `dt_two`
# values with the same `char` that satisfy
# (dtone.num >= dttwo.num OR dtone.bool).
dt_out_sql <- sqldf('
select dtone.num, dtone.char, dtone.bool, SUM(dttwo.num) as SUM,
MIN(dttwo.num) as MIN
from dt as dtone INNER join dt_two as dttwo on
(dtone.char = dttwo.char) and
(dtone.num >= dttwo.num OR dtone.bool)
GROUP BY dtone.num, dtone.char, dtone.bool')
出于性能和灵活性方面的考虑,我想避免使用 SQL 方案。先做交叉连接、再进行过滤/聚合(filtering/aggregating)的做法我也想避免——它会生成一个中间表,其中包含大量最终要被过滤掉的多余记录。
非常感谢!
更新——我最初的例子是仓促完成的。在我的实际问题中,我没有进行自连接。
这是一种方法:
library(data.table)

# Key both tables on `char` so that dt_two[dt, ...] handles the equality part
# of the join on that column. Note that setkey() reorders each table in place.
setkey(dt, char)
setkey(dt_two, char)

# by = .EACHI evaluates j once per row of `dt`. Inside j, `num` is the vector
# of dt_two values sharing that row's `char`; the `i.`-prefixed names are the
# values of the current `dt` row.
dt_two[dt, {
  # Remaining part of the SQL ON clause: dtone.num >= dttwo.num OR dtone.bool.
  val <- num[i.bool | i.num >= num]
  # INNER JOIN semantics: emit no row when nothing qualifies. Without this
  # guard such a row would come back with sum = 0 and min = Inf (plus a
  # warning from min() on an empty vector).
  if (length(val) > 0L) {
    list(num = i.num, bool = i.bool, sum = sum(val), min = min(val))
  }
}, by = .EACHI, nomatch = 0L] # nomatch = 0L drops dt rows whose char is absent
#    char num  bool sum min
# 1:    A   1  TRUE  12   1
# 2:    A   2 FALSE   1   1
# 3:    A   3  TRUE  12   1
# 4:    B   4 FALSE   9   2
# 5:    B   5  TRUE   9   2
# 6:    B   6 FALSE   9   2
想进一步了解 by=.EACHI,请参阅 this post(在关于连接的 vignette 文档写完之前,可先参考该帖)。
HTH
它有点难看但有效:
# Self-contained check that a join-then-filter data.table pipeline reproduces
# the SQL result.
library(data.table)
library(sqldf)

dt <- data.table(
  num = c(1, 2, 3, 4, 5, 6),
  char = c("A", "A", "A", "B", "B", "B"),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)
dt_two <- data.table(
  num = c(6, 1, 5, 2, 4, 3),
  char = c("A", "A", "A", "B", "B", "B"),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)

# Reference result computed with SQL.
dt_out_sql <- sqldf('
select dtone.num,
dtone.char,
dtone.bool,
SUM(dttwo.num) as SUM,
MIN(dttwo.num) as MIN
from dt as dtone
INNER join dt_two as dttwo on
(dtone.char = dttwo.char) and
(dtone.num >= dttwo.num OR dtone.bool)
GROUP BY dtone.num, dtone.char, dtone.bool
')
# Convert the sqldf result to a data.table in place for the comparison below.
setDT(dt_out_sql)

setkey(dt, char)
setkey(dt_two, char)

dt_out_r <- dt[dt_two,
  # Step 1: equi-join on the key (`char`), keeping every dt/dt_two pairing.
  # `num` and `bool` are taken from `dt`; `i.num` is the value from `dt_two`.
  list(dtone.num = num, dttwo.num = i.num, char, bool),
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  nomatch = 0, allow.cartesian = TRUE
][
  # Step 2: apply the non-equality part of the ON clause, then aggregate
  # once per original `dt` row.
  dtone.num >= dttwo.num | bool,
  list(SUM = sum(dttwo.num), MIN = min(dttwo.num)),
  by = list(num = dtone.num, char, bool)
]

# Sort the data.table result by the grouping columns before comparing it with
# the SQL result.
setkey(dt_out_r, num, char, bool)
all.equal(dt_out_sql, dt_out_r, check.attributes = FALSE)
自 data.table 1.9.8 起,提供了简洁的非等值连接(non-equi join)语法;如果连接条件可以放宽,可以写成下面这样:
# Non-equi join. In `on`, the left side of each operator names a column of
# dt_two (the outer table) and the right side a column of dt (the table in i).
# The SQL condition dtone.num >= dttwo.num is therefore written as
# dt_two's num <= dt's num. The "OR dtone.bool" part of the SQL is not
# expressible here, which is the relaxation of the join condition.
dt_two[dt, on = .(char, num <= num)]
我有两个表,想以等价于下面这条 SQL 的方式把它们连接起来:连接时使用多个条件,而不仅仅是相等条件。
# Reproducible setup for the question: two small tables plus the SQL query
# whose result the data.table solutions are expected to reproduce.
# library() is used instead of require() so that a missing package stops the
# script right here instead of failing later with "could not find function".
library(sqldf)
library(data.table)

# Left-hand table (aliased `dtone` in the SQL below).
dt <- data.table(
  num = c(1, 2, 3, 4, 5, 6),
  char = c("A", "A", "A", "B", "B", "B"),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)

# Right-hand table (aliased `dttwo` in the SQL below).
dt_two <- data.table(
  num = c(6, 1, 5, 2, 4, 3),
  char = c("A", "A", "A", "B", "B", "B"),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)

# Reference result: for every row of `dt`, sum and minimum of the `dt_two`
# values with the same `char` that satisfy
# (dtone.num >= dttwo.num OR dtone.bool).
dt_out_sql <- sqldf('
select dtone.num, dtone.char, dtone.bool, SUM(dttwo.num) as SUM,
MIN(dttwo.num) as MIN
from dt as dtone INNER join dt_two as dttwo on
(dtone.char = dttwo.char) and
(dtone.num >= dttwo.num OR dtone.bool)
GROUP BY dtone.num, dtone.char, dtone.bool')
出于性能和灵活性方面的考虑,我想避免使用 SQL 方案。先做交叉连接、再进行过滤/聚合(filtering/aggregating)的做法我也想避免——它会生成一个中间表,其中包含大量最终要被过滤掉的多余记录。
非常感谢!
更新——我最初的例子是仓促完成的。在我的实际问题中,我没有进行自连接。
这是一种方法:
library(data.table)

# Key both tables on `char` so that dt_two[dt, ...] handles the equality part
# of the join on that column. Note that setkey() reorders each table in place.
setkey(dt, char)
setkey(dt_two, char)

# by = .EACHI evaluates j once per row of `dt`. Inside j, `num` is the vector
# of dt_two values sharing that row's `char`; the `i.`-prefixed names are the
# values of the current `dt` row.
dt_two[dt, {
  # Remaining part of the SQL ON clause: dtone.num >= dttwo.num OR dtone.bool.
  val <- num[i.bool | i.num >= num]
  # INNER JOIN semantics: emit no row when nothing qualifies. Without this
  # guard such a row would come back with sum = 0 and min = Inf (plus a
  # warning from min() on an empty vector).
  if (length(val) > 0L) {
    list(num = i.num, bool = i.bool, sum = sum(val), min = min(val))
  }
}, by = .EACHI, nomatch = 0L] # nomatch = 0L drops dt rows whose char is absent
#    char num  bool sum min
# 1:    A   1  TRUE  12   1
# 2:    A   2 FALSE   1   1
# 3:    A   3  TRUE  12   1
# 4:    B   4 FALSE   9   2
# 5:    B   5  TRUE   9   2
# 6:    B   6 FALSE   9   2
想进一步了解 by=.EACHI,请参阅 this post(在关于连接的 vignette 文档写完之前,可先参考该帖)。
HTH
它有点难看但有效:
# Self-contained check that a join-then-filter data.table pipeline reproduces
# the SQL result.
library(data.table)
library(sqldf)

dt <- data.table(
  num = c(1, 2, 3, 4, 5, 6),
  char = c("A", "A", "A", "B", "B", "B"),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)
dt_two <- data.table(
  num = c(6, 1, 5, 2, 4, 3),
  char = c("A", "A", "A", "B", "B", "B"),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)

# Reference result computed with SQL.
dt_out_sql <- sqldf('
select dtone.num,
dtone.char,
dtone.bool,
SUM(dttwo.num) as SUM,
MIN(dttwo.num) as MIN
from dt as dtone
INNER join dt_two as dttwo on
(dtone.char = dttwo.char) and
(dtone.num >= dttwo.num OR dtone.bool)
GROUP BY dtone.num, dtone.char, dtone.bool
')
# Convert the sqldf result to a data.table in place for the comparison below.
setDT(dt_out_sql)

setkey(dt, char)
setkey(dt_two, char)

dt_out_r <- dt[dt_two,
  # Step 1: equi-join on the key (`char`), keeping every dt/dt_two pairing.
  # `num` and `bool` are taken from `dt`; `i.num` is the value from `dt_two`.
  list(dtone.num = num, dttwo.num = i.num, char, bool),
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  nomatch = 0, allow.cartesian = TRUE
][
  # Step 2: apply the non-equality part of the ON clause, then aggregate
  # once per original `dt` row.
  dtone.num >= dttwo.num | bool,
  list(SUM = sum(dttwo.num), MIN = min(dttwo.num)),
  by = list(num = dtone.num, char, bool)
]

# Sort the data.table result by the grouping columns before comparing it with
# the SQL result.
setkey(dt_out_r, num, char, bool)
all.equal(dt_out_sql, dt_out_r, check.attributes = FALSE)
自 data.table 1.9.8 起,提供了简洁的非等值连接(non-equi join)语法;如果连接条件可以放宽,可以写成下面这样:
# Non-equi join. In `on`, the left side of each operator names a column of
# dt_two (the outer table) and the right side a column of dt (the table in i).
# The SQL condition dtone.num >= dttwo.num is therefore written as
# dt_two's num <= dt's num. The "OR dtone.bool" part of the SQL is not
# expressible here, which is the relaxation of the join condition.
dt_two[dt, on = .(char, num <= num)]