如何用两个分类变量做一个交叉表,但用第三个变量的平均值填充它
How to do a crosstab with two categorical variables but populate it with the mean of the third variable
# Attach ggplot2, load the diamonds dataset into the workspace, and print
# the type and first few values of each column.
library(ggplot2)
data(diamonds)
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
这是我的交叉表
# Two-way frequency table: number of diamonds per cut (rows) and
# color (columns).
table(diamonds[["cut"]], diamonds[["color"]])
##
## D E F G H I J
## Fair 163 224 312 314 303 175 119
## Good 662 933 909 871 702 522 307
## Very Good 1513 2400 2164 2299 1824 1204 678
## Premium 1603 2337 2331 2924 2360 1428 808
## Ideal 2834 3903 3826 4884 3115 2093 896
但是我想要的不是计数,而是平均价格,即 mean(price),甚至是最大值 max(price)。
我尝试了 Hmisc 包,但它返回的是长格式的数据,而我需要的是上面那种 table 格式。
# Maximum price for every color x clarity combination, returned in long
# format (one row per combination). summarize() and llist() belong to
# Hmisc; they are namespace-qualified because Hmisc is never attached in
# this script and plyr/dplyr export a different function of the same name.
Hmisc::summarize(
  diamonds$price,
  Hmisc::llist(diamonds$color, diamonds$clarity),
  max
)
## diamonds$color diamonds$clarity diamonds$price
## 1 D I1 15964
## 4 D SI2 18693
## 3 D SI1 18468
## 6 D VS2 18318
## 5 D VS1 17936
## 8 D VVS2 17545
## 7 D VVS1 17932
## 2 D IF 18542
## 9 E I1 11548
## 12 E SI2 18477
## 11 E SI1 18731
## 14 E VS2 18557
## 13 E VS1 18729
## 16 E VVS2 18188
## 15 E VVS1 16256
## 10 E IF 18700
## 17 F I1 10685
## 20 F SI2 18784
## 19 F SI1 18759
## 22 F VS2 18791
## 21 F VS1 18780
## 24 F VVS2 18614
## 23 F VVS1 18777
## 18 F IF 18552
## 25 G I1 13203
## 28 G SI2 18804
## 27 G SI1 18818
## 30 G VS2 18700
## 29 G VS1 18419
## 32 G VVS2 18768
## 31 G VVS1 18445
## 26 G IF 18806
## 33 H I1 17329
## 36 H SI2 18745
## 35 H SI1 18803
## 38 H VS2 18659
## 37 H VS1 18522
## 40 H VVS2 17267
## 39 H VVS1 14603
## 34 H IF 16300
## 41 I I1 16193
## 44 I SI2 18756
## 43 I SI1 18797
## 46 I VS2 18823
## 45 I VS1 18795
## 48 I VVS2 15952
## 47 I VVS1 15654
## 42 I IF 12725
## 49 J I1 18531
## 52 J SI2 18710
## 51 J SI1 18508
## 54 J VS2 18701
## 53 J VS1 18706
## 56 J VVS2 17214
## 55 J VVS1 17891
## 50 J IF 18594
尝试
# Cast to a wide cut-by-color layout, aggregating price with mean(); the
# aggregation function is passed by name instead of by position.
library(reshape2)
acast(diamonds, cut ~ color, fun.aggregate = mean, value.var = "price")
# D E F G H I J
#Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
或使用base R
# Base R: mean price per cell, grouped by cut (rows) and color (columns).
tapply(diamonds$price, list(diamonds$cut, diamonds$color), mean)
# D E F G H I J
#Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
或者按照@DavidArenburg 的建议
# Per-cell sum of price divided by per-cell count gives the per-cell mean.
xtabs(price ~ cut + color, data = diamonds) /
  table(diamonds[c("cut", "color")])
# color
#cut D E F G H I J
#Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
您也可以尝试 data.table 开发版本(即 v1.9.5)中的 dcast。
# Convert diamonds to a data.table, then cast it to one row per cut and one
# column per color, aggregating price with mean().
library(data.table)
dcast(as.data.table(diamonds), cut ~ color, fun.aggregate = mean,
      value.var = "price")
如果分组变量为'clarity'和'color'
# Mean price per cell, grouped by clarity (rows) and color (columns).
tapply(diamonds$price, list(diamonds$clarity, diamonds$color), mean)
如需使用其他函数(例如 max),只需相应修改 tapply 中的 FUN,或 acast/dcast 中的 fun.aggregate。
尝试:
# Mean price for every cut x color combination, reshaped so that each color
# becomes its own column (one row per cut).
library(dplyr)
library(tidyr)
diamonds %>%
  group_by(cut, color) %>%
  # .groups = "drop" returns an ungrouped result instead of one that is
  # still grouped by cut.
  summarise(price = mean(price), .groups = "drop") %>%
  # pivot_wider() is the current replacement for the superseded spread().
  pivot_wider(names_from = color, values_from = price)
给出:
#Source: local data frame [5 x 8]
#
# cut D E F G H I J
#1 Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#2 Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#3 Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#4 Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#5 Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
另一种选择
# Same cut-by-color table of mean prices, built with plyr + tidyr.
# plyr is called through its namespace rather than attached: attaching plyr
# after dplyr masks summarise() and breaks grouped dplyr code that runs
# afterwards.
library(tidyr)
# as.data.frame() keeps the plain data.frame result that spread() produced;
# pivot_wider() is the current replacement for the superseded spread().
as.data.frame(
  pivot_wider(
    plyr::ddply(diamonds, plyr::.(cut, color), plyr::summarize,
                new = mean(price)),
    names_from = color,
    values_from = new
  )
)
# cut D E F G H I J
#1 Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#2 Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#3 Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#4 Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#5 Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
# Attach ggplot2, load the diamonds dataset into the workspace, and print
# the type and first few values of each column.
library(ggplot2)
data(diamonds)
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
这是我的交叉表
# Two-way frequency table: number of diamonds per cut (rows) and
# color (columns).
table(diamonds[["cut"]], diamonds[["color"]])
##
## D E F G H I J
## Fair 163 224 312 314 303 175 119
## Good 662 933 909 871 702 522 307
## Very Good 1513 2400 2164 2299 1824 1204 678
## Premium 1603 2337 2331 2924 2360 1428 808
## Ideal 2834 3903 3826 4884 3115 2093 896
但是我想要的不是计数,而是平均价格,即 mean(price),甚至是最大值 max(price)。
我尝试了 Hmisc 包,但它返回的是长格式的数据,而我需要的是上面那种 table 格式。
# Maximum price for every color x clarity combination, returned in long
# format (one row per combination). summarize() and llist() belong to
# Hmisc; they are namespace-qualified because Hmisc is never attached in
# this script and plyr/dplyr export a different function of the same name.
Hmisc::summarize(
  diamonds$price,
  Hmisc::llist(diamonds$color, diamonds$clarity),
  max
)
## diamonds$color diamonds$clarity diamonds$price
## 1 D I1 15964
## 4 D SI2 18693
## 3 D SI1 18468
## 6 D VS2 18318
## 5 D VS1 17936
## 8 D VVS2 17545
## 7 D VVS1 17932
## 2 D IF 18542
## 9 E I1 11548
## 12 E SI2 18477
## 11 E SI1 18731
## 14 E VS2 18557
## 13 E VS1 18729
## 16 E VVS2 18188
## 15 E VVS1 16256
## 10 E IF 18700
## 17 F I1 10685
## 20 F SI2 18784
## 19 F SI1 18759
## 22 F VS2 18791
## 21 F VS1 18780
## 24 F VVS2 18614
## 23 F VVS1 18777
## 18 F IF 18552
## 25 G I1 13203
## 28 G SI2 18804
## 27 G SI1 18818
## 30 G VS2 18700
## 29 G VS1 18419
## 32 G VVS2 18768
## 31 G VVS1 18445
## 26 G IF 18806
## 33 H I1 17329
## 36 H SI2 18745
## 35 H SI1 18803
## 38 H VS2 18659
## 37 H VS1 18522
## 40 H VVS2 17267
## 39 H VVS1 14603
## 34 H IF 16300
## 41 I I1 16193
## 44 I SI2 18756
## 43 I SI1 18797
## 46 I VS2 18823
## 45 I VS1 18795
## 48 I VVS2 15952
## 47 I VVS1 15654
## 42 I IF 12725
## 49 J I1 18531
## 52 J SI2 18710
## 51 J SI1 18508
## 54 J VS2 18701
## 53 J VS1 18706
## 56 J VVS2 17214
## 55 J VVS1 17891
## 50 J IF 18594
尝试
# Cast to a wide cut-by-color layout, aggregating price with mean(); the
# aggregation function is passed by name instead of by position.
library(reshape2)
acast(diamonds, cut ~ color, fun.aggregate = mean, value.var = "price")
# D E F G H I J
#Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
或使用base R
# Base R: mean price per cell, grouped by cut (rows) and color (columns).
tapply(diamonds$price, list(diamonds$cut, diamonds$color), mean)
# D E F G H I J
#Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
或者按照@DavidArenburg 的建议
# Per-cell sum of price divided by per-cell count gives the per-cell mean.
xtabs(price ~ cut + color, data = diamonds) /
  table(diamonds[c("cut", "color")])
# color
#cut D E F G H I J
#Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
您也可以尝试 data.table 开发版本(即 v1.9.5)中的 dcast。
# Convert diamonds to a data.table, then cast it to one row per cut and one
# column per color, aggregating price with mean().
library(data.table)
dcast(as.data.table(diamonds), cut ~ color, fun.aggregate = mean,
      value.var = "price")
如果分组变量为'clarity'和'color'
# Mean price per cell, grouped by clarity (rows) and color (columns).
tapply(diamonds$price, list(diamonds$clarity, diamonds$color), mean)
如需使用其他函数(例如 max),只需相应修改 tapply 中的 FUN,或 acast/dcast 中的 fun.aggregate。
尝试:
# Mean price for every cut x color combination, reshaped so that each color
# becomes its own column (one row per cut).
library(dplyr)
library(tidyr)
diamonds %>%
  group_by(cut, color) %>%
  # .groups = "drop" returns an ungrouped result instead of one that is
  # still grouped by cut.
  summarise(price = mean(price), .groups = "drop") %>%
  # pivot_wider() is the current replacement for the superseded spread().
  pivot_wider(names_from = color, values_from = price)
给出:
#Source: local data frame [5 x 8]
#
# cut D E F G H I J
#1 Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#2 Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#3 Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#4 Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#5 Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
另一种选择
# Same cut-by-color table of mean prices, built with plyr + tidyr.
# plyr is called through its namespace rather than attached: attaching plyr
# after dplyr masks summarise() and breaks grouped dplyr code that runs
# afterwards.
library(tidyr)
# as.data.frame() keeps the plain data.frame result that spread() produced;
# pivot_wider() is the current replacement for the superseded spread().
as.data.frame(
  pivot_wider(
    plyr::ddply(diamonds, plyr::.(cut, color), plyr::summarize,
                new = mean(price)),
    names_from = color,
    values_from = new
  )
)
# cut D E F G H I J
#1 Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#2 Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#3 Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#4 Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#5 Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186