在 R 中使用 data.table 来消除内部 for 循环
Using data.table in R to eliminate inner for loop
我在 R 中有一个内部 for 循环,我已将其确定为代码中的重大瓶颈。该脚本模拟了时变政策对成年之前个人的影响。外循环遍历我想研究的出生队列列表(yob = 1910, ..., 1930 等)。内部循环按年龄从 a = 5 计数到 a = 17。CSL.details 是一个 data.table,它以变量的形式包含我正在研究的每条法律的详细信息,这些变量随年份(year = 出生年份 + a)而变化。要按出生队列了解该政策的总体影响,我需要在每个年龄 a 上跟踪 ca_years1、ca_years2、ca_years3 和 ca_years4 这四个累计值。
# For every birth cohort in `yob`, walk through the ages in `ages`, look up the
# law row of CSL.details in force in year `birthyear + a`, and accumulate four
# running counters. The finished counters are written to the matching
# (statename, yob) row(s) of CSL.exposures.
#
# Counters, as implemented below:
#   ca_years1  ages with entryage <= a < exitage
#   ca_years2  same, but not counted once the work-permit exemption holds
#   ca_years3  same, but not counted once the early-dropout exemption holds
#   ca_years4  ages with workage <= a < contage (workage > 0) that are neither
#              covered by the continuation exemption nor already counted in
#              ca_years2
ages <- seq.int(5, 17)
state <- "Massachusetts"
yob <- seq.int(1910, 1930)
total_cols <- c("ca_years1", "ca_years2", "ca_years3", "ca_years4")

for (birthyear in yob) {
  ca_years1 <- 0
  ca_years2 <- 0
  ca_years3 <- 0
  ca_years4 <- 0

  for (a in ages) {
    thisyear <- birthyear + a

    # Grab the law for the given state, cohort and year. Inside `[`, `yob` and
    # `year` resolve to columns of CSL.details; `state`, `birthyear` and
    # `thisyear` are taken from the calling scope.
    thislaw <- CSL.details[
      statename == state & yob == birthyear & thisyear == year
    ]
    # No law recorded for this cohort/year: the counters stay unchanged.
    if (nrow(thislaw) == 0) next

    # Exemptions depend on the totals accumulated *before* this age.
    exempt_workpermit <- (ca_years2 >= thislaw$workyrs &
      a >= thislaw$workage & thislaw$workage > 0)
    exempt_yearstodropout <- (ca_years3 >= thislaw$earlyyrs &
      a >= thislaw$earlyyrs_condition & thislaw$earlyyrs > 0)
    exempt_cont <- ((ca_years2 + ca_years4) >= thislaw$contyrs &
      thislaw$contyrs > 0)

    # Increment each counter when school is required.
    in_school <- thislaw$entryage <= a && a < thislaw$exitage
    if (in_school) {
      ca_years1 <- ca_years1 + 1
      if (!exempt_workpermit) {
        ca_years2 <- ca_years2 + 1
      }
      if (!exempt_yearstodropout) {
        ca_years3 <- ca_years3 + 1
      }
    }

    # The last term excludes ages that were already counted in ca_years2.
    if (thislaw$contage > a &&
      a >= thislaw$workage &&
      !exempt_cont &&
      thislaw$workage > 0 &&
      !(in_school && !exempt_workpermit)) {
      ca_years4 <- ca_years4 + 1
    }
  }

  # Write all four totals in a single by-reference update. The former
  # `CSL.exposures[...]$col = value` form copied the whole table once per
  # column. The values are passed as a list built outside `[` so that the
  # names on the right-hand side cannot be mistaken for the columns
  # ca_years1..ca_years4 of CSL.exposures.
  cohort_totals <- list(ca_years1, ca_years2, ca_years3, ca_years4)
  CSL.exposures[
    statename == state & yob == birthyear,
    (total_cols) := cohort_totals
  ]
}
是否有可以替换内循环的 data.table 解决方案?我是一名中级 R 编码员,不太清楚该如何入手。虽然我更喜欢 data.table,但如果 dplyr 类型的解决方案能显著加快代码速度,我也愿意接受。
编辑:下面是 CSL.details 的示例,以复制粘贴的 data.table 形式给出。
statename year yob statefip entryage exitage earlyyrs earlyyrs_condition workage workyrs contage contyrs statecompschoolyr
1: Massachusetts 1913 1800 25 7 16 4 14 14 4 16 0 1852
2: Massachusetts 1913 1801 25 7 16 4 14 14 4 16 0 1852
3: Massachusetts 1913 1802 25 7 16 4 14 14 4 16 0 1852
4: Massachusetts 1913 1803 25 7 16 4 14 14 4 16 0 1852
5: Massachusetts 1913 1804 25 7 16 4 14 14 4 16 0 1852
我设法重构了代码来解决这个问题。关键思想是利用 state(代码中的 `statename`)和 `yob` 作为分组变量(因为所有计算都发生在 state 和 yob 的配对之内)。这完全消除了外部循环,只需要一个按年龄迭代的循环。我把这个答案留在这里仅供参考,但我不确定它对 stackoverflow.com 社区是否有更广泛的借鉴意义,所以请随时删除。时间节省了大约 95%,主要是因为它减少了调用 data.table 的开销时间。
# Vectorised version of the cohort-by-cohort simulation: instead of looping
# over birth cohorts, every (statename, yob) row of CSL.exposures is advanced
# by one year of age per iteration. CSL.exposures is expected to already hold
# the running totals ca_years1..ca_years4 (presumably initialised to 0 -
# verify against the setup code).

# Per-age law values copied onto CSL.exposures by the update join below.
law_cols <- c(
  "exempt_workpermit", "exempt_yearstodropout", "exempt_cont",
  "entryage", "exitage", "contage", "workage"
)
total_cols <- c("ca_years1", "ca_years2", "ca_years3", "ca_years4")

for (a in ages) {
  # Grab the running totals of years compelled, by state and year of birth,
  # and copy them onto the law rows of the same state and cohort.
  CSL.details[
    CSL.exposures,
    on = .(statename, yob),
    `:=`(
      ca_years1 = i.ca_years1,
      ca_years2 = i.ca_years2,
      ca_years3 = i.ca_years3,
      ca_years4 = i.ca_years4
    )
  ]

  # Create the exemptions at age `a` from the totals accumulated so far. Only
  # the law rows in force in year `yob + a` are touched. The expressions are
  # elementwise, so no `by =` grouping is needed.
  CSL.details[
    year == a + yob,
    `:=`(
      exempt_workpermit =
        (ca_years2 >= workyrs & a >= workage & workage > 0),
      exempt_yearstodropout =
        (ca_years3 >= earlyyrs & a >= earlyyrs_condition & earlyyrs > 0),
      exempt_cont = ((ca_years2 + ca_years4) >= contyrs & contyrs > 0)
    )
  ]

  # An update join only writes to matched rows. Drop the per-age columns
  # first, so that a cohort without a law row at this age sees NA (and is
  # therefore left unchanged below) instead of the values joined at the
  # previous age.
  stale_cols <- intersect(law_cols, names(CSL.exposures))
  if (length(stale_cols) > 0L) {
    CSL.exposures[, (stale_cols) := NULL]
  }

  CSL.exposures[
    CSL.details[year == a + yob],
    on = .(yob, statename),
    `:=`(
      exempt_workpermit = i.exempt_workpermit,
      exempt_yearstodropout = i.exempt_yearstodropout,
      exempt_cont = i.exempt_cont,
      entryage = i.entryage,
      exitage = i.exitage,
      contage = i.contage,
      workage = i.workage
    )
  ]

  # Advance the totals. `cond %in% TRUE` is FALSE when `cond` is NA, so rows
  # with missing law values keep their totals; this replaces the per-group
  # `fifelse(cond, x + 1, x, na = x)`.
  CSL.exposures[, (total_cols) := {
    in_school <- entryage <= a & a < exitage
    # The last term excludes ages that are already counted in ca_years2.
    in_cont <- contage > a & a >= workage & !exempt_cont & workage > 0 &
      !(in_school & !exempt_workpermit)
    list(
      ca_years1 + (in_school %in% TRUE),
      ca_years2 + ((in_school & !exempt_workpermit) %in% TRUE),
      ca_years3 + ((in_school & !exempt_yearstodropout) %in% TRUE),
      ca_years4 + (in_cont %in% TRUE)
    )
  }]
}
我在 R 中有一个内部 for 循环,我已将其确定为代码中的重大瓶颈。该脚本模拟了时变政策对成年之前个人的影响。外循环遍历我想研究的出生队列列表(yob = 1910, ..., 1930 等)。内部循环按年龄从 a = 5 计数到 a = 17。CSL.details 是一个 data.table,它以变量的形式包含我正在研究的每条法律的详细信息,这些变量随年份(year = 出生年份 + a)而变化。要按出生队列了解该政策的总体影响,我需要在每个年龄 a 上跟踪 ca_years1、ca_years2、ca_years3 和 ca_years4 这四个累计值。
# For every birth cohort in `yob`, walk through the ages in `ages`, look up the
# law row of CSL.details in force in year `birthyear + a`, and accumulate four
# running counters. The finished counters are written to the matching
# (statename, yob) row(s) of CSL.exposures.
#
# Counters, as implemented below:
#   ca_years1  ages with entryage <= a < exitage
#   ca_years2  same, but not counted once the work-permit exemption holds
#   ca_years3  same, but not counted once the early-dropout exemption holds
#   ca_years4  ages with workage <= a < contage (workage > 0) that are neither
#              covered by the continuation exemption nor already counted in
#              ca_years2
ages <- seq.int(5, 17)
state <- "Massachusetts"
yob <- seq.int(1910, 1930)
total_cols <- c("ca_years1", "ca_years2", "ca_years3", "ca_years4")

for (birthyear in yob) {
  ca_years1 <- 0
  ca_years2 <- 0
  ca_years3 <- 0
  ca_years4 <- 0

  for (a in ages) {
    thisyear <- birthyear + a

    # Grab the law for the given state, cohort and year. Inside `[`, `yob` and
    # `year` resolve to columns of CSL.details; `state`, `birthyear` and
    # `thisyear` are taken from the calling scope.
    thislaw <- CSL.details[
      statename == state & yob == birthyear & thisyear == year
    ]
    # No law recorded for this cohort/year: the counters stay unchanged.
    if (nrow(thislaw) == 0) next

    # Exemptions depend on the totals accumulated *before* this age.
    exempt_workpermit <- (ca_years2 >= thislaw$workyrs &
      a >= thislaw$workage & thislaw$workage > 0)
    exempt_yearstodropout <- (ca_years3 >= thislaw$earlyyrs &
      a >= thislaw$earlyyrs_condition & thislaw$earlyyrs > 0)
    exempt_cont <- ((ca_years2 + ca_years4) >= thislaw$contyrs &
      thislaw$contyrs > 0)

    # Increment each counter when school is required.
    in_school <- thislaw$entryage <= a && a < thislaw$exitage
    if (in_school) {
      ca_years1 <- ca_years1 + 1
      if (!exempt_workpermit) {
        ca_years2 <- ca_years2 + 1
      }
      if (!exempt_yearstodropout) {
        ca_years3 <- ca_years3 + 1
      }
    }

    # The last term excludes ages that were already counted in ca_years2.
    if (thislaw$contage > a &&
      a >= thislaw$workage &&
      !exempt_cont &&
      thislaw$workage > 0 &&
      !(in_school && !exempt_workpermit)) {
      ca_years4 <- ca_years4 + 1
    }
  }

  # Write all four totals in a single by-reference update. The former
  # `CSL.exposures[...]$col = value` form copied the whole table once per
  # column. The values are passed as a list built outside `[` so that the
  # names on the right-hand side cannot be mistaken for the columns
  # ca_years1..ca_years4 of CSL.exposures.
  cohort_totals <- list(ca_years1, ca_years2, ca_years3, ca_years4)
  CSL.exposures[
    statename == state & yob == birthyear,
    (total_cols) := cohort_totals
  ]
}
是否有可以替换内循环的 data.table 解决方案?我是一名中级 R 编码员,不太清楚该如何入手。虽然我更喜欢 data.table,但如果 dplyr 类型的解决方案能显著加快代码速度,我也愿意接受。
编辑:下面是 CSL.details 的示例,以复制粘贴的 data.table 形式给出。
statename year yob statefip entryage exitage earlyyrs earlyyrs_condition workage workyrs contage contyrs statecompschoolyr
1: Massachusetts 1913 1800 25 7 16 4 14 14 4 16 0 1852
2: Massachusetts 1913 1801 25 7 16 4 14 14 4 16 0 1852
3: Massachusetts 1913 1802 25 7 16 4 14 14 4 16 0 1852
4: Massachusetts 1913 1803 25 7 16 4 14 14 4 16 0 1852
5: Massachusetts 1913 1804 25 7 16 4 14 14 4 16 0 1852
我设法重构了代码来解决这个问题。关键思想是利用 state(代码中的 `statename`)和 `yob` 作为分组变量(因为所有计算都发生在 state 和 yob 的配对之内)。这完全消除了外部循环,只需要一个按年龄迭代的循环。我把这个答案留在这里仅供参考,但我不确定它对 stackoverflow.com 社区是否有更广泛的借鉴意义,所以请随时删除。时间节省了大约 95%,主要是因为它减少了调用 data.table 的开销时间。
# Vectorised version of the cohort-by-cohort simulation: instead of looping
# over birth cohorts, every (statename, yob) row of CSL.exposures is advanced
# by one year of age per iteration. CSL.exposures is expected to already hold
# the running totals ca_years1..ca_years4 (presumably initialised to 0 -
# verify against the setup code).

# Per-age law values copied onto CSL.exposures by the update join below.
law_cols <- c(
  "exempt_workpermit", "exempt_yearstodropout", "exempt_cont",
  "entryage", "exitage", "contage", "workage"
)
total_cols <- c("ca_years1", "ca_years2", "ca_years3", "ca_years4")

for (a in ages) {
  # Grab the running totals of years compelled, by state and year of birth,
  # and copy them onto the law rows of the same state and cohort.
  CSL.details[
    CSL.exposures,
    on = .(statename, yob),
    `:=`(
      ca_years1 = i.ca_years1,
      ca_years2 = i.ca_years2,
      ca_years3 = i.ca_years3,
      ca_years4 = i.ca_years4
    )
  ]

  # Create the exemptions at age `a` from the totals accumulated so far. Only
  # the law rows in force in year `yob + a` are touched. The expressions are
  # elementwise, so no `by =` grouping is needed.
  CSL.details[
    year == a + yob,
    `:=`(
      exempt_workpermit =
        (ca_years2 >= workyrs & a >= workage & workage > 0),
      exempt_yearstodropout =
        (ca_years3 >= earlyyrs & a >= earlyyrs_condition & earlyyrs > 0),
      exempt_cont = ((ca_years2 + ca_years4) >= contyrs & contyrs > 0)
    )
  ]

  # An update join only writes to matched rows. Drop the per-age columns
  # first, so that a cohort without a law row at this age sees NA (and is
  # therefore left unchanged below) instead of the values joined at the
  # previous age.
  stale_cols <- intersect(law_cols, names(CSL.exposures))
  if (length(stale_cols) > 0L) {
    CSL.exposures[, (stale_cols) := NULL]
  }

  CSL.exposures[
    CSL.details[year == a + yob],
    on = .(yob, statename),
    `:=`(
      exempt_workpermit = i.exempt_workpermit,
      exempt_yearstodropout = i.exempt_yearstodropout,
      exempt_cont = i.exempt_cont,
      entryage = i.entryage,
      exitage = i.exitage,
      contage = i.contage,
      workage = i.workage
    )
  ]

  # Advance the totals. `cond %in% TRUE` is FALSE when `cond` is NA, so rows
  # with missing law values keep their totals; this replaces the per-group
  # `fifelse(cond, x + 1, x, na = x)`.
  CSL.exposures[, (total_cols) := {
    in_school <- entryage <= a & a < exitage
    # The last term excludes ages that are already counted in ca_years2.
    in_cont <- contage > a & a >= workage & !exempt_cont & workage > 0 &
      !(in_school & !exempt_workpermit)
    list(
      ca_years1 + (in_school %in% TRUE),
      ca_years2 + ((in_school & !exempt_workpermit) %in% TRUE),
      ca_years3 + ((in_school & !exempt_yearstodropout) %in% TRUE),
      ca_years4 + (in_cont %in% TRUE)
    )
  }]
}