data.table 基于多个条件的聚合
data.table aggregation based on multiple criteria
我正在尝试计算一组 fid 中有多少 pid 的 yob 小于人的 yob。第二个问题是关于独特的pid。根据@langtang 的努力和我自己的思考更新问题:
#Libraries:
library(data.table)
library(tictoc)
#Make it replicable:
set.seed(1)
#Define parameters of the simulation:
pid<-1:1000
fid<-1:5
time_periods<-1:12
yob<-sample(seq(1900,2010),length(pid),replace = TRUE)
#Obtain in how many firms a given pid works in a givem month:
nr_firms_pid_time<-sample(1:length(fid),length(pid),replace = TRUE)
#This means:
#First pid: works in first firm;
#Second pid: works in first four firms;
#Third pid: works in first firm;
#Fourth pid: works in two firms.
#Aux functions:
function_rep<-function(x){
rep(1:12,x)
}
function_seq<-function(x){
1:x
}
#Create panel
data_panel<-data.table(pid = rep(pid,nr_firms_pid_time*length(time_periods)))
data_panel[,yearmonth:=do.call(c,sapply(nr_firms_pid_time,function_rep))]
data_panel[,fid:=rep(do.call(c,sapply(nr_firms_pid_time,function_seq)),each = 12)]
#Merge in yob:
data_yob<-data.table(pid = pid,yob = yob)
data_panel<-merge(data_panel,data_yob,by = c("pid"),all.x = TRUE)
#Remove not needed stuff:
rm(pid)
rm(fid)
rm(time_periods)
rm(yob)
rm(data_yob)
#Solution 1 (terribly slow):
# make a small function that counts the number of coworkers with
# earlier dob than this individual
older_coworkers = function(id,yrmonth) {
#First obtain firms in which a worker works in a given month:
id_firms<-data_panel[pid==id&yearmonth==yrmonth,fid]
#Then extract data at a given month:
data_func<-data_panel[(fid %in% id_firms)&(yearmonth==yrmonth)]
#Then extract his dob:
dob_to_use<-unique(data_func[pid==id,yob])
sum(data_func[pid!=id]$yob<dob_to_use)
}
older_coworkers_unique = function(id,yrmonth) {
#First obtain firms in which a worker works in a given month:
id_firms<-data_panel[pid==id&yearmonth==yrmonth,fid]
#Then extract data at a given month:
data_func<-data_panel[(fid %in% id_firms)&(yearmonth==yrmonth)]
#Then extract his dob:
dob_to_use<-unique(data_func[pid==id,yob])
#Get UNIQUE number of coworkers:
sum(unique(data_func[pid!=id],by = c("pid"))$yob<dob_to_use)
}
#Works but is terrible slow:
tic()
sol_1<-data_panel[, .(older_coworkers(.BY$pid,.BY$yearmonth)),by = c("pid","yearmonth")]
toc()
#Solution 2 (better but do not like it, what if I want unique older coworkers)
function_older<-function(x){
noc<-lapply(
1:length(x),
function(i){
sum(x[-i]<x[i])
}
)
unlist(noc)
}
#This is fast but I cannot get unique number:
tic()
sol_2<-data_panel[,.(pid,function_older(yob)),by = c("fid","yearmonth")][,sum(V2),by = c("pid","yearmonth")][order(pid,yearmonth)]
toc()
#Everything works:
identical(sol_1,sol_2)
问题是如何以非常快的方式实施 older_coworkers_unique
。任何建议将不胜感激。
更新,基于 OP 的新的可重现数据集
- 如果你想让one-liner重现上面的
sol_2
,你可以这样做:
data_panel[data_panel, on=.(yearmonth, fid, yob<yob )][, .N, by=.(i.pid, yearmonth)]
解释:
以上是使用 non-equi 联接,这在使用 data.table 时可能是一种有用的方法。我加入 data_panel
本身,要求 yearmonth 和 fid 相等,但出生年份(join 左侧)小于出生年份(join 右侧)。这将 return 一个 data.table 其中公司和年月匹配,但每个年长的同事 (pid
) 都匹配他们的年轻同事 (i.pid
)。因此,我们可以计算每个年轻同事 (i.pid
) 和 yearmonth
的行数 (.N)。这产生与上面的 sol_1
和 sol_2
相同的结果。您评论说您希望找到独特的同事,因此下面的第二种方法通过在选项 2 中使用 len(unique(pid))
来做到这一点。
- 可以使用相同的 non-equi 加入方法来获取 独特的 年长同事,如下所示:
data_panel[data_panel, on=.(yearmonth, fid, yob<yob )] %>%
.[, .(older_coworkers = length(unique(pid))), by=.(i.pid, yearmonth)]
先前的响应,基于 OP 的原始非常小的示例数据集
我不确定您希望输出的确切内容。但是,在您的示例数据中,我首先删除了重复行(因为我不明白为什么它在那里(参见我上面的评论)),然后我应用了一个函数来计算每个 pid/fid/ym.
# make your example data unique
data=unique(data)
# make a small function that counts the number of coworkers with
# earlier dob than this individual
older_coworkers = function(birth,firm,yrmonth,id) {
data[dob<birth & fid==firm & ym==yrmonth & pid!=id,.N]
}
# apply the function to the data
data[, .(num_older_coworkers = older_coworkers(dob,.BY$fid, .BY$ym, .BY$pid)), by=.(pid,fid,ym)]
输出:
pid fid ym num_older_coworkers
1: 1 1 200801 1
2: 1 2 200802 0
3: 2 1 200801 0
4: 3 2 200801 0
公司 1 的人员 1 在 2008-01 月份有一位年长的同事 -- 即,公司 1 的人员 2 在 2008-01 年。
公司2的人1(1950年出生)也有一个年长的同事,即公司2的人3(1930年出生),但结果显示为0,因为人1在公司 2 年(即 2008-01)与潜在年长同事的年(即 2008-02)不匹配。
我正在尝试计算一组 fid 中有多少 pid 的 yob 小于人的 yob。第二个问题是关于独特的pid。根据@langtang 的努力和我自己的思考更新问题:
#Libraries:
library(data.table)
library(tictoc)
#Make it replicable:
set.seed(1)
#Define parameters of the simulation:
pid<-1:1000
fid<-1:5
time_periods<-1:12
yob<-sample(seq(1900,2010),length(pid),replace = TRUE)
#Obtain in how many firms a given pid works in a givem month:
nr_firms_pid_time<-sample(1:length(fid),length(pid),replace = TRUE)
#This means:
#First pid: works in first firm;
#Second pid: works in first four firms;
#Third pid: works in first firm;
#Fourth pid: works in two firms.
#Aux functions:
function_rep<-function(x){
rep(1:12,x)
}
function_seq<-function(x){
1:x
}
#Create panel
data_panel<-data.table(pid = rep(pid,nr_firms_pid_time*length(time_periods)))
data_panel[,yearmonth:=do.call(c,sapply(nr_firms_pid_time,function_rep))]
data_panel[,fid:=rep(do.call(c,sapply(nr_firms_pid_time,function_seq)),each = 12)]
#Merge in yob:
data_yob<-data.table(pid = pid,yob = yob)
data_panel<-merge(data_panel,data_yob,by = c("pid"),all.x = TRUE)
#Remove not needed stuff:
rm(pid)
rm(fid)
rm(time_periods)
rm(yob)
rm(data_yob)
#Solution 1 (terribly slow):
# make a small function that counts the number of coworkers with
# earlier dob than this individual
older_coworkers = function(id,yrmonth) {
#First obtain firms in which a worker works in a given month:
id_firms<-data_panel[pid==id&yearmonth==yrmonth,fid]
#Then extract data at a given month:
data_func<-data_panel[(fid %in% id_firms)&(yearmonth==yrmonth)]
#Then extract his dob:
dob_to_use<-unique(data_func[pid==id,yob])
sum(data_func[pid!=id]$yob<dob_to_use)
}
older_coworkers_unique = function(id,yrmonth) {
#First obtain firms in which a worker works in a given month:
id_firms<-data_panel[pid==id&yearmonth==yrmonth,fid]
#Then extract data at a given month:
data_func<-data_panel[(fid %in% id_firms)&(yearmonth==yrmonth)]
#Then extract his dob:
dob_to_use<-unique(data_func[pid==id,yob])
#Get UNIQUE number of coworkers:
sum(unique(data_func[pid!=id],by = c("pid"))$yob<dob_to_use)
}
#Works but is terrible slow:
tic()
sol_1<-data_panel[, .(older_coworkers(.BY$pid,.BY$yearmonth)),by = c("pid","yearmonth")]
toc()
#Solution 2 (better but do not like it, what if I want unique older coworkers)
function_older<-function(x){
noc<-lapply(
1:length(x),
function(i){
sum(x[-i]<x[i])
}
)
unlist(noc)
}
#This is fast but I cannot get unique number:
tic()
sol_2<-data_panel[,.(pid,function_older(yob)),by = c("fid","yearmonth")][,sum(V2),by = c("pid","yearmonth")][order(pid,yearmonth)]
toc()
#Everything works:
identical(sol_1,sol_2)
问题是如何以非常快的方式实施 older_coworkers_unique
。任何建议将不胜感激。
更新,基于 OP 的新的可重现数据集
- 如果你想让one-liner重现上面的
sol_2
,你可以这样做:
data_panel[data_panel, on=.(yearmonth, fid, yob<yob )][, .N, by=.(i.pid, yearmonth)]
解释:
以上是使用 non-equi 联接,这在使用 data.table 时可能是一种有用的方法。我加入 data_panel
本身,要求 yearmonth 和 fid 相等,但出生年份(join 左侧)小于出生年份(join 右侧)。这将 return 一个 data.table 其中公司和年月匹配,但每个年长的同事 (pid
) 都匹配他们的年轻同事 (i.pid
)。因此,我们可以计算每个年轻同事 (i.pid
) 和 yearmonth
的行数 (.N)。这产生与上面的 sol_1
和 sol_2
相同的结果。您评论说您希望找到独特的同事,因此下面的第二种方法通过在选项 2 中使用 len(unique(pid))
来做到这一点。
- 可以使用相同的 non-equi 加入方法来获取 独特的 年长同事,如下所示:
data_panel[data_panel, on=.(yearmonth, fid, yob<yob )] %>%
.[, .(older_coworkers = length(unique(pid))), by=.(i.pid, yearmonth)]
先前的响应,基于 OP 的原始非常小的示例数据集
我不确定您希望输出的确切内容。但是,在您的示例数据中,我首先删除了重复行(因为我不明白为什么它在那里(参见我上面的评论)),然后我应用了一个函数来计算每个 pid/fid/ym.
# make your example data unique
data=unique(data)
# make a small function that counts the number of coworkers with
# earlier dob than this individual
older_coworkers = function(birth,firm,yrmonth,id) {
data[dob<birth & fid==firm & ym==yrmonth & pid!=id,.N]
}
# apply the function to the data
data[, .(num_older_coworkers = older_coworkers(dob,.BY$fid, .BY$ym, .BY$pid)), by=.(pid,fid,ym)]
输出:
pid fid ym num_older_coworkers
1: 1 1 200801 1
2: 1 2 200802 0
3: 2 1 200801 0
4: 3 2 200801 0
公司 1 的人员 1 在 2008-01 月份有一位年长的同事 -- 即,公司 1 的人员 2 在 2008-01 年。
公司2的人1(1950年出生)也有一个年长的同事,即公司2的人3(1930年出生),但结果显示为0,因为人1在公司 2 年(即 2008-01)与潜在年长同事的年(即 2008-02)不匹配。