如何用两个分类变量做一个交叉表,但用第三个变量的平均值填充它

How to do a crosstab with two categorical variables but populate it with the mean of the third variable

library(ggplot2)
data(diamonds)
str(diamonds)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

这是我的交叉表

table(diamonds$cut,diamonds$color)
##            
##                D    E    F    G    H    I    J
##   Fair       163  224  312  314  303  175  119
##   Good       662  933  909  871  702  522  307
##   Very Good 1513 2400 2164 2299 1824 1204  678
##   Premium   1603 2337 2331 2924 2360 1428  808
##   Ideal     2834 3903 3826 4884 3115 2093  896

但是我想要的不是计数,而是平均价格(或均值(价格)甚至最大值(价格)

我尝试了 Hmisc 包,但它以更长的格式给我数据,我需要它以上面的 table 格式

  summarize(diamonds$price,llist(diamonds$color,diamonds$clarity),max)

    ##    diamonds$color diamonds$clarity diamonds$price
## 1               D               I1          15964
## 4               D              SI2          18693
## 3               D              SI1          18468
## 6               D              VS2          18318
## 5               D              VS1          17936
## 8               D             VVS2          17545
## 7               D             VVS1          17932
## 2               D               IF          18542
## 9               E               I1          11548
## 12              E              SI2          18477
## 11              E              SI1          18731
## 14              E              VS2          18557
## 13              E              VS1          18729
## 16              E             VVS2          18188
## 15              E             VVS1          16256
## 10              E               IF          18700
## 17              F               I1          10685
## 20              F              SI2          18784
## 19              F              SI1          18759
## 22              F              VS2          18791
## 21              F              VS1          18780
## 24              F             VVS2          18614
## 23              F             VVS1          18777
## 18              F               IF          18552
## 25              G               I1          13203
## 28              G              SI2          18804
## 27              G              SI1          18818
## 30              G              VS2          18700
## 29              G              VS1          18419
## 32              G             VVS2          18768
## 31              G             VVS1          18445
## 26              G               IF          18806
## 33              H               I1          17329
## 36              H              SI2          18745
## 35              H              SI1          18803
## 38              H              VS2          18659
## 37              H              VS1          18522
## 40              H             VVS2          17267
## 39              H             VVS1          14603
## 34              H               IF          16300
## 41              I               I1          16193
## 44              I              SI2          18756
## 43              I              SI1          18797
## 46              I              VS2          18823
## 45              I              VS1          18795
## 48              I             VVS2          15952
## 47              I             VVS1          15654
## 42              I               IF          12725
## 49              J               I1          18531
## 52              J              SI2          18710
## 51              J              SI1          18508
## 54              J              VS2          18701
## 53              J              VS1          18706
## 56              J             VVS2          17214
## 55              J             VVS1          17891
## 50              J               IF          18594

尝试

library(reshape2)
acast(diamonds, cut~color, value.var='price', mean)
#                D        E        F        G        H        I        J
#Fair      4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#Good      3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#Premium   3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#Ideal     2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186

或使用base R

 with(diamonds, tapply(price, list(cut,color), FUN= mean))
 #                 D        E        F        G        H        I        J
 #Fair      4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
 #Good      3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
 #Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
 #Premium   3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
 #Ideal     2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186

或者按照@DavidArenburg 的建议

  xtabs(price ~ cut + color, diamonds)/table(diamonds[c('cut', 'color')])
  #            color
  #cut                D        E        F        G        H        I        J
  #Fair      4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
  #Good      3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
  #Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
  #Premium   3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
  #Ideal     2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186

您也可以从 data.table 的开发版本中尝试 dcast,即 v1.9.5。

 library(data.table)
 dcast(as.data.table(diamonds), cut~color, value.var='price', mean)

如果分组变量为'clarity'和'color'

 with(diamonds, tapply(price, list(clarity,color), FUN = mean))

其他功能,将tapply中的FUNacast/dcast中的fun.aggregate改为

尝试:

library(dplyr)
library(tidyr)

diamonds %>%
  group_by(cut, color) %>%
  summarise(price = mean(price)) %>%
  spread(color, price)

给出:

#Source: local data frame [5 x 8]
#
#        cut        D        E        F        G        H        I        J
#1      Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#2      Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#3 Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#4   Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#5     Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186

另一种选择

library(plyr) 
library(tidyr)    

spread(ddply(diamonds, .(cut, color), summarize, new = mean(price)), color, new)

#       cut        D        E        F        G        H        I        J
#1      Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#2      Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#3 Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#4   Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#5     Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186