从 data.table 中提取汇总后的摘要报告

Question

我正在尝试对一个更大的 data.table 进行更新和汇总；数据如下：

    structure(list(y1_countyname = c("Montgomery County", "Elmore County", 
"Dallas County", "Chilton County", "Jefferson County", "Escambia County", 
"Escambia County", "Harris County"), n2 = c(867L, 835L, 115L, 
169L, 75L, 599L, 144L, 90L), y2_geoid = c("01001", "01001", "01001", 
"01001", "01001", "01003", "01003", "01003"), y1_geoid = c("01101", 
"01051", "01047", "01021", "01073", "12033", "01053", "48201"
), y2_ling_zo = c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), y1_ling_zo = c(9L, 
9L, 9L, 11L, 11L, 7L, 7L, 12L), ling_mig = c(0, 0, 0, 1, 1, 1, 
1, 1), grp_y2_geoid = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), i7 = c(0L, 
0L, 0L, 0L, 0L, 599L, 144L, 0L), i9 = c(867L, 835L, 115L, 0L, 
0L, 0L, 0L, 0L), i11 = c(0L, 0L, 0L, 169L, 75L, 0L, 0L, 0L), 
    i12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 90L)), row.names = c(NA, 
-8L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x55ab0e577580>, sorted = c("y2_geoid", 
"y1_ling_zo"))

> in_1314_sub
       y1_countyname  n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo ling_mig
1: Montgomery County 867    01001    01101          9          9        0
2:     Elmore County 835    01001    01051          9          9        0
3:     Dallas County 115    01001    01047          9          9        0
4:    Chilton County 169    01001    01021          9         11        1
5:  Jefferson County  75    01001    01073          9         11        1
6:   Escambia County 599    01003    12033          9          7        1
7:   Escambia County 144    01003    01053          9          7        1
8:     Harris County  90    01003    48201          9         12        1
   grp_y2_geoid  i7  i9 i11 i12
1:            1   0 867   0   0
2:            1   0 835   0   0
3:            1   0 115   0   0
4:            1   0   0 169   0
5:            1   0   0  75   0
6:            2 599   0   0   0
7:            2 144   0   0   0
8:            2   0   0   0  90

我要附加到的报告：

structure(list(y1_countyname = c("Autauga County Non-migrants", 
"Baldwin County Non-migrants"), n2 = c(41198L, 148883L), y2_geoid = c("01001", 
"01003"), y1_geoid = c("01001", "01003"), y2_ling_zo = c(9L, 
9L), y1_ling_zo = c(9L, 9L), ling_mig = c(0, 0), nm_7 = c(NA_integer_, 
NA_integer_), nm_9 = c(41198L, 148883L), nm_11 = c(NA_integer_, 
NA_integer_), nm_12 = c(NA_integer_, NA_integer_), nm_15 = c(NA_integer_, 
NA_integer_)), row.names = c(NA, -2L), class = c("data.table", 
"data.frame"), .internal.selfref = <pointer: 0x55ab0e577580>)

> in_1314_non_sof[, c(1:12)]
                 y1_countyname     n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo
1: Autauga County Non-migrants  41198    01001    01001          9          9
2: Baldwin County Non-migrants 148883    01003    01003          9          9
   ling_mig nm_7   nm_9 nm_11 nm_12 nm_15
1:        0   NA  41198    NA    NA    NA
2:        0   NA 148883    NA    NA    NA

使用这个：

# Add columns i7, i9, i11 and i12 to in_1314_non_sof by reference (`:=`).
# NOTE(review): every sum() below subsets in_1314_sub directly via `$`, so it
# totals n2 over all rows with that y1_ling_zo value regardless of the `by`
# group -- consistent with each row receiving the same four totals.
in_1314_non_sof[, c('i7', 'i9', 'i11', 'i12'):= 
.(sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 7L]),
 sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 9L]),
 sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 11L]),
 sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 12L])),
 # Grouping vector is built from the other table's grp_y2_geoid values; it
 # only labels rows of in_1314_non_sof and does not filter in_1314_sub.
 by = .(unique(in_1314_sub$grp_y2_geoid))]

结果：

in_1314_non_sof[, -c(8:12)]
                 y1_countyname     n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo
1: Autauga County Non-migrants  41198    01001    01001          9          9
2: Baldwin County Non-migrants 148883    01003    01003          9          9
   ling_mig  i7   i9 i11 i12
1:        0 743 1817 244  90
2:        0 743 1817 244  90

希望的结果是：

  i7   i9    i11  i12
  0   1817   244   0
 743   0      0    90

我还缺少什么才能得到希望的结果？我认为这个问题不是已有问题的重复；不过，也许我漏掉了与 by 相关的某个有用用法。

Answer 1

根据更新后的帖子，并使用更自然的连接（join）方式：

# Drop columns 8 through 12 of the report, then join the per-y2_geoid sums
# of columns i7..i12 from in_1314_sub onto it, matching on y2_geoid.
in_1314_non_sof[, -c(8:12)][
  in_1314_sub[, lapply(.SD, sum), by = y2_geoid, .SDcols = i7:i12],
  on = .(y2_geoid)
]

输出

                 y1_countyname     n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo ling_mig    i7    i9   i11   i12
                        <char>  <int>   <char>   <char>      <int>      <int>    <num> <int> <int> <int> <int>
1: Autauga County Non-migrants  41198    01001    01001          9          9        0     0  1817   244     0
2: Baldwin County Non-migrants 148883    01003    01003          9          9        0   743     0     0    90

如果你还没有 i7:i12 这几列，而需要从 n2 出发把它们计算出来，可以这样做：

# Get counts by geoid: total n2 for every (y2_geoid, y1_ling_zo) pair, then
# reshape to one row per y2_geoid with one "i<zone>" column per zone.
# fill = 0L makes county/zone pairs that have no rows come out as 0 rather
# than NA, matching the desired report.
cts_by_geoid <- dcast(
  data = in_1314_sub[, .(n2 = sum(n2)),
                     by = .(y2_geoid, g = paste0("i", y1_ling_zo))],
  formula = y2_geoid ~ g,
  value.var = "n2",
  fill = 0L
)

# dcast sorts the new column names as text (i11, i12, i7, i9); restore the
# numeric zone order (i7, i9, i11, i12).
zone_cols <- paste0("i", sort(unique(in_1314_sub$y1_ling_zo)))
setcolorder(cts_by_geoid, c("y2_geoid", zone_cols))

# Merge with the non_sof table. Columns 8 through 12 are dropped as a range
# (8:12); c(8, 12) would remove only the 8th and 12th columns.
in_1314_non_sof[, -c(8:12)][cts_by_geoid, on = .(y2_geoid)]

输出，如上

从 data.table 中提取汇总后的摘要报告

Pulling collapsed summary report from data.table

r

extract

collapse

data.table