如何动态地进行集群明智的汇总统计

how to do the cluster wise summary statistics dynamically

假设我将此数据的簇映射到每一行,如下所示:

head(movies.imp)
##   num_voted_users num_user_for_reviews num_critic_for_reviews
## 1          886204                 3054                    723
## 2          471220                 1238                    302
## 3          275868                  994                    602
## 4         1144337                 2701                    813
## 5               8                  127                     37
## 6          212204                  738                    462
##   movie_facebook_likes cast_total_facebook_likes cluster
## 1                33000                      4834       1
## 2                    0                     48350       1
## 3                85000                     11700       1
## 4               164000                    106759       1
## 5                    0                       143       2
## 6                24000                      1873       1

为了获得平均集群,我这样做:

a<-as.data.frame(aggregate( .~ cluster, FUN=mean, data=movies.imp))
a
##   cluster num_voted_users num_user_for_reviews num_critic_for_reviews
## 1       1       316561.46            831.60526              326.61773
## 2       2        44934.26            180.05922              109.69265
## 3       3        29020.10             80.20408               28.57143
##   movie_facebook_likes cast_total_facebook_likes
## 1            33647.263                 28282.450
## 2             3119.099                  6641.746
## 3             6843.327                  2426.755

然后我转换成long:

library(tidyr)
long_df.a <- gather(data=a, value = mean, key=variablenames,  num_voted_users,num_user_for_reviews,num_critic_for_reviews,movie_facebook_likes,cast_total_facebook_likes)
long_df.a
long_df.a[,1]<-as.factor(long_df.a[,1]) # converting into a factor
long_df.a[,2]<-as.factor(long_df.a[,2])# converting into a factor

然后我做同样的过程来获得中值、最小值、最大值和标准簇

为以上所有描述性统计计算创建一个数据框

  dflong<-cbind(long_df.a,long_df.b[,3],long_df.c[,3],long_df.d[,3],long_df.e[,3])

 dflong<-dflong%>% set_names(c("cluster","variablenames","mean","median","min","max","sd")) # renaming the columns
 head(dflong)  

 ##   cluster        variablenames         mean   median      min     max
 ## 1       1         num_voted_users   316561.45706 263332.5 246 1689764
 ## 2       2      num_voted_users      44934.26451  25256.5   5  469561
 ## 3       3      num_voted_users      29020.10204  9277.0    15  213483
 ## 4       1     num_user_for_reviews  831.60526    642.0     1    5060
 ## 5       2 num_user_for_reviews    180.05922      129.0     1    1690
 ## 6       3 num_user_for_reviews     80.20408      42.0      1     394
 ##            sd
 ## 1 231350.9509
 ## 2  53613.7994
 ## 3  48491.2638
## 4    659.0410
## 5    181.3630
## 6    105.1669   

按集群对数据进行子集化

 dflong.1<-dflong %>% filter(cluster==1)
 dflong.2<-dflong %>% filter(cluster==2)
 dflong.3<-dflong %>% filter(cluster==3)

 dflong.combined<-rbind(dflong.1,dflong.2,dflong.3)
 head(dflong.combined)  # final required output

##    cluster             variablenames         mean   median   min     max
## 1        1           num_voted_users   316561.45706 263332.5 246 1689764
## 2        1      num_user_for_reviews    831.60526    642.0    1    5060
## 3        1      num_critic_for_reviews    326.61773    307.5  2     813
## 4        1      movie_facebook_likes  33647.26316  23000.0    0  349000
## 5        1 cast_total_facebook_likes  28282.45014  21095.0    44  656730
## 6        2           num_voted_users  44934.26451  25256.5     5  469561

##              sd
## 1  231350.95086
## 2     659.04103
## 3     142.47953
## 4   37698.06583
## 5   37395.59205
## 6   53613.79942

所以我正在以一种非最佳的方式做事来获取集群汇总统计信息....需要有关如何对动态使用循环进行排序或应用函数以在较少的代码行中获得最终输出的帮助... ..

我以 mtcars 数据集为例。假设 cyl 变量等同于 cluster。您可以在一行代码中获取所有汇总统计信息:

d <- mtcars
s <- d %>% group_by(cyl) %>% 
  summarise_all(c("mean", "median", "min", "max", "sd"))
#     cyl mpg_mean disp_mean   hp_mean drat_mean  wt_mean qsec_mean   vs_mean
#   <dbl>    <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>     <dbl>
# 1     4 26.66364  105.1364  82.63636  4.070909 2.285727  19.13727 0.9090909
# 2     6 19.74286  183.3143 122.28571  3.585714 3.117143  17.97714 0.5714286
# 3     8 15.10000  353.1000 209.21429  3.229286 3.999214  16.77214 0.0000000
# # ... with 43 more variables: ...

我们只剩下重塑数据框以使其成为我们想要的形式:

s <- gather(s, key, value, -cyl)
s <- separate(s, key, c("variable", "stat"))
d.combined <- spread(s, key = stat, value = value)
# # A tibble: 30 × 7
#      cyl variable    max        mean median    min         sd
# *  <dbl>    <chr>  <dbl>       <dbl>  <dbl>  <dbl>      <dbl>
# 1      4       am   1.00   0.7272727   1.00  0.000  0.4670994
# 2      4     carb   2.00   1.5454545   2.00  1.000  0.5222330
# 3      4     disp 146.70 105.1363636 108.00 71.100 26.8715937
# 4      4     drat   4.93   4.0709091   4.08  3.690  0.3654711
# 5      4     gear   5.00   4.0909091   4.00  3.000  0.5393599
# 6      4       hp 113.00  82.6363636  91.00 52.000 20.9345300
# 7      4      mpg  33.90  26.6636364  26.00 21.400  4.5098277
# 8      4     qsec  22.90  19.1372727  18.90 16.700  1.6824452
# 9      4       vs   1.00   0.9090909   1.00  0.000  0.3015113
# 10     4       wt   3.19   2.2857273   2.20  1.513  0.5695637
# # ... with 20 more rows