R Data.Table 聚合
R Data.Table Aggregate
library(data.table)
set.seed(01)
DATA = data.table("STUDENT" = c(1:2000),
"GROUP" = c(sample(letters[1:3], size=2000, r = T)),
"CLASS" = c(sample(1:6, size=2000, r = T)),
"SCORE1" = c(sample(50:100,size=2000,r=T)),
"SCORE2" = c(sample(40:100, size=2000, r = T)))
假设您有这些数据,并且您希望聚合或折叠多个 data.table...
a) GROUP
平均 SCORE1 和平均 SCORE2
b) 平均 SCORE1 和平均 SCORE2 CLASS
c) 按 GROUP 和按 CLASS
平均 SCORE1 和平均 SCORE2
d) CLASS
的 GROUP 计数
有没有更有效的方法使用data.table创建这4个data.table?或者你只是建议制作所有这些并存储为 4 个单独的 data.tables?
它们都有不同的结构
(一)
DATA[, .(mSCORE1 = mean(SCORE1), mSCORE2= mean(SCORE2)), by=.(GROUP)]
GROUP mSCORE1 mSCORE2
1: a 74.59971 69.98240
2: c 75.12943 71.30817
3: b 74.95964 69.97608
(b 和 d 合并)
DATA[, .(mSCORE1 = mean(SCORE1), mSCORE2= mean(SCORE2),nGROUPS =uniqueN(GROUP)), by=.(CLASS)]
CLASS mSCORE1 mSCORE2 nGROUPS
1: 6 75.01685 70.14607 3
2: 3 75.76025 70.18927 3
3: 4 74.25532 70.74164 3
4: 5 75.77126 69.46921 3
5: 1 73.28797 71.56646 3
6: 2 75.17595 70.44282 3
(c)
DATA[, lapply(.SD, mean, na.rm=T), by=.(GROUP,CLASS),.SDcols = patterns("SC")]
GROUP CLASS SCORE1 SCORE2
1: a 6 75.27027 69.00901
2: c 3 78.05660 70.18868
3: a 4 75.72727 68.95868
4: b 6 74.20455 70.78788
5: a 5 75.94915 69.78814
6: c 1 73.93043 72.63478
7: c 2 75.18812 71.80198
8: b 2 76.46667 67.89167
9: c 5 75.28814 70.63559
10: a 1 72.43519 69.72222
11: a 2 73.87500 71.85000
12: b 1 73.48387 72.38710
13: b 5 76.11429 67.80000
14: b 3 75.07477 69.84112
15: c 6 75.71681 70.51327
16: b 4 74.26786 71.41964
17: a 3 74.12500 70.54808
18: c 4 72.38542 72.19792
如果你想把所有这些都放在一个 table 中,当然你会在组行和 class 之间复制 class-specific 个摘要和 group-specific 个摘要。这是单个管道中的插图
DATA[, .(mSCORE1_grpclass = mean(SCORE1), mSCORE2_grpclass= mean(SCORE2), NSTUDENTS=.N), by=.(GROUP,CLASS)] %>%
.[,`:=`(
mSCORE1_grp=sum(NSTUDENTS*mSCORE1_grpclass)/sum(NSTUDENTS),
mSCORE2_grp=sum(NSTUDENTS*mSCORE2_grpclass)/sum(NSTUDENTS)),by=.(GROUP)] %>%
.[,`:=`(
mSCORE1_class=sum(NSTUDENTS*mSCORE1_grpclass)/sum(NSTUDENTS),
mSCORE2_class=sum(NSTUDENTS*mSCORE2_grpclass)/sum(NSTUDENTS),
nGROUPS_class = uniqueN(GROUP)),by=.(CLASS)] %>%
.[]
输出:
GROUP CLASS mSCORE1_grpclass mSCORE2_grpclass NSTUDENTS mSCORE1_grp mSCORE2_grp mSCORE1_class mSCORE2_class nGROUPS_class
1: a 6 75.27027 69.00901 111 74.59971 69.98240 75.01685 70.14607 3
2: c 3 78.05660 70.18868 106 75.12943 71.30817 75.76025 70.18927 3
3: a 4 75.72727 68.95868 121 74.59971 69.98240 74.25532 70.74164 3
4: b 6 74.20455 70.78788 132 74.95964 69.97608 75.01685 70.14607 3
5: a 5 75.94915 69.78814 118 74.59971 69.98240 75.77126 69.46921 3
6: c 1 73.93043 72.63478 115 75.12943 71.30817 73.28797 71.56646 3
7: c 2 75.18812 71.80198 101 75.12943 71.30817 75.17595 70.44282 3
8: b 2 76.46667 67.89167 120 74.95964 69.97608 75.17595 70.44282 3
9: c 5 75.28814 70.63559 118 75.12943 71.30817 75.77126 69.46921 3
10: a 1 72.43519 69.72222 108 74.59971 69.98240 73.28797 71.56646 3
11: a 2 73.87500 71.85000 120 74.59971 69.98240 75.17595 70.44282 3
12: b 1 73.48387 72.38710 93 74.95964 69.97608 73.28797 71.56646 3
13: b 5 76.11429 67.80000 105 74.95964 69.97608 75.77126 69.46921 3
14: b 3 75.07477 69.84112 107 74.95964 69.97608 75.76025 70.18927 3
15: c 6 75.71681 70.51327 113 75.12943 71.30817 75.01685 70.14607 3
16: b 4 74.26786 71.41964 112 74.95964 69.97608 74.25532 70.74164 3
17: a 3 74.12500 70.54808 104 74.59971 69.98240 75.76025 70.18927 3
18: c 4 72.38542 72.19792 96 75.12943 71.30817 74.25532 70.74164 3
library(data.table)
set.seed(01)
DATA = data.table("STUDENT" = c(1:2000),
"GROUP" = c(sample(letters[1:3], size=2000, r = T)),
"CLASS" = c(sample(1:6, size=2000, r = T)),
"SCORE1" = c(sample(50:100,size=2000,r=T)),
"SCORE2" = c(sample(40:100, size=2000, r = T)))
假设您有这些数据,并且您希望聚合或折叠多个 data.table...
a) GROUP
平均 SCORE1 和平均 SCORE2b) 平均 SCORE1 和平均 SCORE2 CLASS
c) 按 GROUP 和按 CLASS
平均 SCORE1 和平均 SCORE2d) CLASS
的 GROUP 计数有没有更有效的方法使用data.table创建这4个data.table?或者你只是建议制作所有这些并存储为 4 个单独的 data.tables?
它们都有不同的结构
(一)
DATA[, .(mSCORE1 = mean(SCORE1), mSCORE2= mean(SCORE2)), by=.(GROUP)]
GROUP mSCORE1 mSCORE2
1: a 74.59971 69.98240
2: c 75.12943 71.30817
3: b 74.95964 69.97608
(b 和 d 合并)
DATA[, .(mSCORE1 = mean(SCORE1), mSCORE2= mean(SCORE2),nGROUPS =uniqueN(GROUP)), by=.(CLASS)]
CLASS mSCORE1 mSCORE2 nGROUPS
1: 6 75.01685 70.14607 3
2: 3 75.76025 70.18927 3
3: 4 74.25532 70.74164 3
4: 5 75.77126 69.46921 3
5: 1 73.28797 71.56646 3
6: 2 75.17595 70.44282 3
(c)
DATA[, lapply(.SD, mean, na.rm=T), by=.(GROUP,CLASS),.SDcols = patterns("SC")]
GROUP CLASS SCORE1 SCORE2
1: a 6 75.27027 69.00901
2: c 3 78.05660 70.18868
3: a 4 75.72727 68.95868
4: b 6 74.20455 70.78788
5: a 5 75.94915 69.78814
6: c 1 73.93043 72.63478
7: c 2 75.18812 71.80198
8: b 2 76.46667 67.89167
9: c 5 75.28814 70.63559
10: a 1 72.43519 69.72222
11: a 2 73.87500 71.85000
12: b 1 73.48387 72.38710
13: b 5 76.11429 67.80000
14: b 3 75.07477 69.84112
15: c 6 75.71681 70.51327
16: b 4 74.26786 71.41964
17: a 3 74.12500 70.54808
18: c 4 72.38542 72.19792
如果你想把所有这些都放在一个 table 中,当然你会在组行和 class 之间复制 class-specific 个摘要和 group-specific 个摘要。这是单个管道中的插图
DATA[, .(mSCORE1_grpclass = mean(SCORE1), mSCORE2_grpclass= mean(SCORE2), NSTUDENTS=.N), by=.(GROUP,CLASS)] %>%
.[,`:=`(
mSCORE1_grp=sum(NSTUDENTS*mSCORE1_grpclass)/sum(NSTUDENTS),
mSCORE2_grp=sum(NSTUDENTS*mSCORE2_grpclass)/sum(NSTUDENTS)),by=.(GROUP)] %>%
.[,`:=`(
mSCORE1_class=sum(NSTUDENTS*mSCORE1_grpclass)/sum(NSTUDENTS),
mSCORE2_class=sum(NSTUDENTS*mSCORE2_grpclass)/sum(NSTUDENTS),
nGROUPS_class = uniqueN(GROUP)),by=.(CLASS)] %>%
.[]
输出:
GROUP CLASS mSCORE1_grpclass mSCORE2_grpclass NSTUDENTS mSCORE1_grp mSCORE2_grp mSCORE1_class mSCORE2_class nGROUPS_class
1: a 6 75.27027 69.00901 111 74.59971 69.98240 75.01685 70.14607 3
2: c 3 78.05660 70.18868 106 75.12943 71.30817 75.76025 70.18927 3
3: a 4 75.72727 68.95868 121 74.59971 69.98240 74.25532 70.74164 3
4: b 6 74.20455 70.78788 132 74.95964 69.97608 75.01685 70.14607 3
5: a 5 75.94915 69.78814 118 74.59971 69.98240 75.77126 69.46921 3
6: c 1 73.93043 72.63478 115 75.12943 71.30817 73.28797 71.56646 3
7: c 2 75.18812 71.80198 101 75.12943 71.30817 75.17595 70.44282 3
8: b 2 76.46667 67.89167 120 74.95964 69.97608 75.17595 70.44282 3
9: c 5 75.28814 70.63559 118 75.12943 71.30817 75.77126 69.46921 3
10: a 1 72.43519 69.72222 108 74.59971 69.98240 73.28797 71.56646 3
11: a 2 73.87500 71.85000 120 74.59971 69.98240 75.17595 70.44282 3
12: b 1 73.48387 72.38710 93 74.95964 69.97608 73.28797 71.56646 3
13: b 5 76.11429 67.80000 105 74.95964 69.97608 75.77126 69.46921 3
14: b 3 75.07477 69.84112 107 74.95964 69.97608 75.76025 70.18927 3
15: c 6 75.71681 70.51327 113 75.12943 71.30817 75.01685 70.14607 3
16: b 4 74.26786 71.41964 112 74.95964 69.97608 74.25532 70.74164 3
17: a 3 74.12500 70.54808 104 74.59971 69.98240 75.76025 70.18927 3
18: c 4 72.38542 72.19792 96 75.12943 71.30817 74.25532 70.74164 3