正在从 data.table 中提取折叠的摘要报告
Pulling collapsed summary report from data.table
正在尝试更新和总结更大的data.table;数据如下
# Reproducible copy of the migration sub-table `in_1314_sub` (from dput()).
# The `.internal.selfref = <pointer: ...>` attribute that dput() prints for a
# data.table is not parseable R, so it is omitted here. After loading
# data.table, run setDT(in_1314_sub) so the self-reference is rebuilt before
# the table is modified by reference.
in_1314_sub <- structure(
  list(
    y1_countyname = c(
      "Montgomery County", "Elmore County", "Dallas County",
      "Chilton County", "Jefferson County", "Escambia County",
      "Escambia County", "Harris County"
    ),
    n2 = c(867L, 835L, 115L, 169L, 75L, 599L, 144L, 90L),
    y2_geoid = c(
      "01001", "01001", "01001", "01001", "01001", "01003", "01003", "01003"
    ),
    y1_geoid = c(
      "01101", "01051", "01047", "01021", "01073", "12033", "01053", "48201"
    ),
    y2_ling_zo = c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L),
    y1_ling_zo = c(9L, 9L, 9L, 11L, 11L, 7L, 7L, 12L),
    ling_mig = c(0, 0, 0, 1, 1, 1, 1, 1),
    grp_y2_geoid = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L),
    i7 = c(0L, 0L, 0L, 0L, 0L, 599L, 144L, 0L),
    i9 = c(867L, 835L, 115L, 0L, 0L, 0L, 0L, 0L),
    i11 = c(0L, 0L, 0L, 169L, 75L, 0L, 0L, 0L),
    i12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 90L)
  ),
  row.names = c(NA, -8L),
  class = c("data.table", "data.frame"),
  sorted = c("y2_geoid", "y1_ling_zo")
)
> in_1314_sub
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo ling_mig
1: Montgomery County 867 01001 01101 9 9 0
2: Elmore County 835 01001 01051 9 9 0
3: Dallas County 115 01001 01047 9 9 0
4: Chilton County 169 01001 01021 9 11 1
5: Jefferson County 75 01001 01073 9 11 1
6: Escambia County 599 01003 12033 9 7 1
7: Escambia County 144 01003 01053 9 7 1
8: Harris County 90 01003 48201 9 12 1
grp_y2_geoid i7 i9 i11 i12
1: 1 0 867 0 0
2: 1 0 835 0 0
3: 1 0 115 0 0
4: 1 0 0 169 0
5: 1 0 0 75 0
6: 2 599 0 0 0
7: 2 144 0 0 0
8: 2 0 0 0 90
我要附加到的报告:
# Reproducible copy of the report table `in_1314_non_sof` (from dput()), one
# row per destination county. The `.internal.selfref = <pointer: ...>`
# attribute that dput() prints for a data.table is not parseable R, so it is
# omitted here. After loading data.table, run setDT(in_1314_non_sof) so the
# self-reference is rebuilt before columns are added by reference.
in_1314_non_sof <- structure(
  list(
    y1_countyname = c(
      "Autauga County Non-migrants", "Baldwin County Non-migrants"
    ),
    n2 = c(41198L, 148883L),
    y2_geoid = c("01001", "01003"),
    y1_geoid = c("01001", "01003"),
    y2_ling_zo = c(9L, 9L),
    y1_ling_zo = c(9L, 9L),
    ling_mig = c(0, 0),
    nm_7 = c(NA_integer_, NA_integer_),
    nm_9 = c(41198L, 148883L),
    nm_11 = c(NA_integer_, NA_integer_),
    nm_12 = c(NA_integer_, NA_integer_),
    nm_15 = c(NA_integer_, NA_integer_)
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
> in_1314_non_sof[, c(1:12)]
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo
1: Autauga County Non-migrants 41198 01001 01001 9 9
2: Baldwin County Non-migrants 148883 01003 01003 9 9
ling_mig nm_7 nm_9 nm_11 nm_12 nm_15
1: 0 NA 41198 NA NA NA
2: 0 NA 148883 NA NA NA
使用这个:
# Attempt from the question: add columns i7, i9, i11 and i12 to
# in_1314_non_sof by reference, each holding the sum of in_1314_sub$n2 for one
# y1_ling_zo value (7, 9, 11, 12).
# NOTE(review): every sum() indexes the whole of in_1314_sub rather than the
# rows belonging to the current group, so each group receives the same
# totals. The `by` vector only labels the rows of in_1314_non_sof; it does not
# subset in_1314_sub.
in_1314_non_sof[, c('i7', 'i9', 'i11', 'i12'):=
.(sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 7L]),
sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 9L]),
sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 11L]),
sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 12L])),
by = .(unique(in_1314_sub$grp_y2_geoid))]
结果:
in_1314_non_sof[, -c(8:12)]
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo
1: Autauga County Non-migrants 41198 01001 01001 9 9
2: Baldwin County Non-migrants 148883 01003 01003 9 9
ling_mig i7 i9 i11 i12
1: 0 743 1817 244 90
2: 0 743 1817 244 90
希望的结果是:
i7 i9 i11 i12
0 1817 244 0
743 0 0 90
我还缺少什么才能得到希望的结果?我不认为这是已有问题的重复;不过,也许我漏掉了 by 的某种有用用法。
根据更新后的帖子,可以使用更自然的连接(join)方式:
# Sum the zone columns i7 through i12 of in_1314_sub within each y2_geoid,
# then join those sums onto in_1314_non_sof (minus columns 8 through 12) by
# y2_geoid.
in_1314_non_sof[, -c(8:12)][
  in_1314_sub[, lapply(.SD, sum), by = y2_geoid, .SDcols = i7:i12],
  on = "y2_geoid"
]
输出
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo ling_mig i7 i9 i11 i12
<char> <int> <char> <char> <int> <int> <num> <int> <int> <int> <int>
1: Autauga County Non-migrants 41198 01001 01001 9 9 0 0 1817 244 0
2: Baldwin County Non-migrants 148883 01003 01003 9 9 0 743 0 0 90
如果你还没有 i7:i12 这几列,而需要从 n2 计算得到它们,可以这样做:
# Build the per-zone counts from n2 when the i7..i12 columns do not exist yet:
# total n2 per destination geoid (y2_geoid) and origin zone (y1_ling_zo), then
# spread to one column per zone.
cts_by_geoid <- dcast(
  data = in_1314_sub[, .(n2 = sum(n2)), by = .(y2_geoid, y1_ling_zo)],
  # Casting on the integer zone keeps the columns in numeric order; casting on
  # the pasted names would sort them as text ("i11" before "i7").
  formula = y2_geoid ~ y1_ling_zo,
  value.var = "n2",
  # A geoid/zone pair with no rows is a count of 0, not NA.
  fill = 0L
)
# Prefix the zone columns with "i" (e.g. "7" becomes "i7").
zone_cols <- setdiff(names(cts_by_geoid), "y2_geoid")
setnames(cts_by_geoid, zone_cols, paste0("i", zone_cols))
# Drop the nm_* columns (positions 8 through 12, not just 8 and 12) and join
# the counts onto the report table by y2_geoid.
in_1314_non_sof[, -c(8:12)][cts_by_geoid, on = .(y2_geoid)]
输出,如上
正在尝试更新和总结更大的data.table;数据如下
# Reproducible copy of the migration sub-table `in_1314_sub` (from dput()).
# The `.internal.selfref = <pointer: ...>` attribute that dput() prints for a
# data.table is not parseable R, so it is omitted here. After loading
# data.table, run setDT(in_1314_sub) so the self-reference is rebuilt before
# the table is modified by reference.
in_1314_sub <- structure(
  list(
    y1_countyname = c(
      "Montgomery County", "Elmore County", "Dallas County",
      "Chilton County", "Jefferson County", "Escambia County",
      "Escambia County", "Harris County"
    ),
    n2 = c(867L, 835L, 115L, 169L, 75L, 599L, 144L, 90L),
    y2_geoid = c(
      "01001", "01001", "01001", "01001", "01001", "01003", "01003", "01003"
    ),
    y1_geoid = c(
      "01101", "01051", "01047", "01021", "01073", "12033", "01053", "48201"
    ),
    y2_ling_zo = c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L),
    y1_ling_zo = c(9L, 9L, 9L, 11L, 11L, 7L, 7L, 12L),
    ling_mig = c(0, 0, 0, 1, 1, 1, 1, 1),
    grp_y2_geoid = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L),
    i7 = c(0L, 0L, 0L, 0L, 0L, 599L, 144L, 0L),
    i9 = c(867L, 835L, 115L, 0L, 0L, 0L, 0L, 0L),
    i11 = c(0L, 0L, 0L, 169L, 75L, 0L, 0L, 0L),
    i12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 90L)
  ),
  row.names = c(NA, -8L),
  class = c("data.table", "data.frame"),
  sorted = c("y2_geoid", "y1_ling_zo")
)
> in_1314_sub
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo ling_mig
1: Montgomery County 867 01001 01101 9 9 0
2: Elmore County 835 01001 01051 9 9 0
3: Dallas County 115 01001 01047 9 9 0
4: Chilton County 169 01001 01021 9 11 1
5: Jefferson County 75 01001 01073 9 11 1
6: Escambia County 599 01003 12033 9 7 1
7: Escambia County 144 01003 01053 9 7 1
8: Harris County 90 01003 48201 9 12 1
grp_y2_geoid i7 i9 i11 i12
1: 1 0 867 0 0
2: 1 0 835 0 0
3: 1 0 115 0 0
4: 1 0 0 169 0
5: 1 0 0 75 0
6: 2 599 0 0 0
7: 2 144 0 0 0
8: 2 0 0 0 90
我要附加到的报告:
# Reproducible copy of the report table `in_1314_non_sof` (from dput()), one
# row per destination county. The `.internal.selfref = <pointer: ...>`
# attribute that dput() prints for a data.table is not parseable R, so it is
# omitted here. After loading data.table, run setDT(in_1314_non_sof) so the
# self-reference is rebuilt before columns are added by reference.
in_1314_non_sof <- structure(
  list(
    y1_countyname = c(
      "Autauga County Non-migrants", "Baldwin County Non-migrants"
    ),
    n2 = c(41198L, 148883L),
    y2_geoid = c("01001", "01003"),
    y1_geoid = c("01001", "01003"),
    y2_ling_zo = c(9L, 9L),
    y1_ling_zo = c(9L, 9L),
    ling_mig = c(0, 0),
    nm_7 = c(NA_integer_, NA_integer_),
    nm_9 = c(41198L, 148883L),
    nm_11 = c(NA_integer_, NA_integer_),
    nm_12 = c(NA_integer_, NA_integer_),
    nm_15 = c(NA_integer_, NA_integer_)
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
> in_1314_non_sof[, c(1:12)]
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo
1: Autauga County Non-migrants 41198 01001 01001 9 9
2: Baldwin County Non-migrants 148883 01003 01003 9 9
ling_mig nm_7 nm_9 nm_11 nm_12 nm_15
1: 0 NA 41198 NA NA NA
2: 0 NA 148883 NA NA NA
使用这个:
# Attempt from the question: add columns i7, i9, i11 and i12 to
# in_1314_non_sof by reference, each holding the sum of in_1314_sub$n2 for one
# y1_ling_zo value (7, 9, 11, 12).
# NOTE(review): every sum() indexes the whole of in_1314_sub rather than the
# rows belonging to the current group, so each group receives the same
# totals. The `by` vector only labels the rows of in_1314_non_sof; it does not
# subset in_1314_sub.
in_1314_non_sof[, c('i7', 'i9', 'i11', 'i12'):=
.(sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 7L]),
sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 9L]),
sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 11L]),
sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 12L])),
by = .(unique(in_1314_sub$grp_y2_geoid))]
结果:
in_1314_non_sof[, -c(8:12)]
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo
1: Autauga County Non-migrants 41198 01001 01001 9 9
2: Baldwin County Non-migrants 148883 01003 01003 9 9
ling_mig i7 i9 i11 i12
1: 0 743 1817 244 90
2: 0 743 1817 244 90
希望的结果是:
i7 i9 i11 i12
0 1817 244 0
743 0 0 90
我还缺少什么才能得到希望的结果?我不认为这是已有问题的重复;不过,也许我漏掉了 by 的某种有用用法。
根据更新后的帖子,可以使用更自然的连接(join)方式:
# Sum the zone columns i7 through i12 of in_1314_sub within each y2_geoid,
# then join those sums onto in_1314_non_sof (minus columns 8 through 12) by
# y2_geoid.
in_1314_non_sof[, -c(8:12)][
  in_1314_sub[, lapply(.SD, sum), by = y2_geoid, .SDcols = i7:i12],
  on = "y2_geoid"
]
输出
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo ling_mig i7 i9 i11 i12
<char> <int> <char> <char> <int> <int> <num> <int> <int> <int> <int>
1: Autauga County Non-migrants 41198 01001 01001 9 9 0 0 1817 244 0
2: Baldwin County Non-migrants 148883 01003 01003 9 9 0 743 0 0 90
如果你还没有 i7:i12 这几列,而需要从 n2 计算得到它们,可以这样做:
# Build the per-zone counts from n2 when the i7..i12 columns do not exist yet:
# total n2 per destination geoid (y2_geoid) and origin zone (y1_ling_zo), then
# spread to one column per zone.
cts_by_geoid <- dcast(
  data = in_1314_sub[, .(n2 = sum(n2)), by = .(y2_geoid, y1_ling_zo)],
  # Casting on the integer zone keeps the columns in numeric order; casting on
  # the pasted names would sort them as text ("i11" before "i7").
  formula = y2_geoid ~ y1_ling_zo,
  value.var = "n2",
  # A geoid/zone pair with no rows is a count of 0, not NA.
  fill = 0L
)
# Prefix the zone columns with "i" (e.g. "7" becomes "i7").
zone_cols <- setdiff(names(cts_by_geoid), "y2_geoid")
setnames(cts_by_geoid, zone_cols, paste0("i", zone_cols))
# Drop the nm_* columns (positions 8 through 12, not just 8 and 12) and join
# the counts onto the report table by y2_geoid.
in_1314_non_sof[, -c(8:12)][cts_by_geoid, on = .(y2_geoid)]
输出,如上