计算给定条件的百分比

Question

我是这个网站的新手，也是编码方面的新手。我想知道你们中是否有人可以帮助我

我需要计算 Top 5 电影，通过评分分布，计算每部电影评分为 4 星或更高的百分比。

到目前为止，我只能使用 dplyr 计算出现次数。

是否可以使用 dplyr 计算它（类似于我的编码）？

我不确定我是否需要变异来提出解决方案，或者是否有其他方法可以这样做。

到目前为止我的代码：

dfAux1 <- na.omit(dfAux)
dfAux1 %>%
  group_by(movie) %>%
  summarise(tot = n()) %>%
  arrange(desc(tot))%>%
  head(5)

结果应该是这样的：

**Expected result**:
0.7000000, 'The Shawshank Redemption'
0.5333333, 'Star Wars IV - A New Hope'
0.5000000, 'Gladiator'
0.4444444, 'Blade Runner'
0.4375000, 'The Silence of the Lambs'

到目前为止，这是我的结果：

# A tibble: 5 x 2
                              movie   tot
                             <fctr> <int>
1                         Toy Story    17
2          The Silence of the Lambs    16
3         Star Wars IV - A New Hope    15
4 Star Wars VI - Return of the Jedi    14
5                  Independence Day    13

编辑：

str(dfAux1)
'data.frame':   241 obs. of  2 variables:
 $ Rating: int  1 5 4 2 4 5 4 2 3 2 ...
 $ movie : Factor w/ 20 levels "Star Wars IV - A New Hope",..: 1 1 1 1 1 1 1 1 1 1 ...
 - attr(*, "na.action")=Class 'omit'  Named int [1:159] 3 4 7 16 17 23 27 28 34 36 ...
  .. ..- attr(*, "names")= chr [1:159] "3" "4" "7" "16" ...

dput(dfAux1)
structure(list(Rating = c(1L, 5L, 4L, 2L, 4L, 5L, 4L, 2L, 3L, 
2L, 3L, 4L, 4L, 5L, 1L, 5L, 3L, 3L, 3L, 4L, 1L, 2L, 1L, 5L, 3L, 
4L, 5L, 1L, 2L, 2L, 4L, 4L, 3L, 5L, 2L, 3L, 1L, 1L, 2L, 2L, 5L, 
1L, 4L, 1L, 4L, 5L, 5L, 5L, 4L, 4L, 4L, 2L, 4L, 1L, 3L, 2L, 3L, 
2L, 4L, 2L, 5L, 3L, 4L, 1L, 5L, 4L, 2L, 1L, 1L, 4L, 2L, 4L, 5L, 
5L, 2L, 1L, 4L, 2L, 1L, 4L, 2L, 3L, 2L, 4L, 4L, 5L, 2L, 4L, 3L, 
2L, 2L, 4L, 2L, 2L, 2L, 3L, 4L, 1L, 5L, 4L, 3L, 5L, 2L, 1L, 3L, 
4L, 4L, 2L, 3L, 4L, 1L, 3L, 2L, 5L, 3L, 2L, 3L, 4L, 1L, 1L, 4L, 
1L, 4L, 5L, 1L, 3L, 2L, 2L, 3L, 5L, 5L, 1L, 2L, 3L, 5L, 2L, 3L, 
1L, 2L, 1L, 4L, 1L, 2L, 2L, 3L, 3L, 2L, 1L, 1L, 1L, 5L, 2L, 4L, 
1L, 4L, 3L, 1L, 2L, 2L, 3L, 4L, 2L, 3L, 2L, 4L, 3L, 4L, 3L, 2L, 
2L, 4L, 5L, 2L, 1L, 5L, 1L, 4L, 5L, 2L, 3L, 3L, 2L, 5L, 5L, 4L, 
1L, 3L, 1L, 2L, 1L, 5L, 5L, 2L, 4L, 2L, 4L, 2L, 5L, 2L, 5L, 5L, 
1L, 5L, 1L, 3L, 2L, 2L, 3L, 5L, 1L, 3L, 1L, 5L, 3L, 3L, 1L, 2L, 
4L, 1L, 5L, 3L, 1L, 1L, 5L, 5L, 1L, 5L, 3L, 3L, 2L, 3L, 3L, 2L, 
2L, 2L, 5L, 4L, 2L, 1L, 4L, 5L), movie = structure(c(1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 
14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 
15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 
16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 
17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 
18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L), .Label = c("Star Wars IV - A New Hope", 
"Star Wars VI - Return of the Jedi", "Forrest Gump", "The Shawshank Redemption", 
"The Silence of the Lambs", "Gladiator", "Toy Story", "Saving Private Ryan", 
"Pulp Fiction", "Stand by Me", "Shakespeare in Love", "Total Recall", 
"Independence Day", "Blade Runner", "Groundhog Day", "The Matrix", 
"Schindler's List", "The Sixth Sense", "Raiders of the Lost Ark", 
"Babe"), class = "factor")), .Names = c("Rating", "movie"), row.names = c(1L, 
2L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 18L, 19L, 20L, 
21L, 22L, 24L, 25L, 26L, 29L, 30L, 31L, 32L, 33L, 35L, 38L, 39L, 
40L, 41L, 45L, 46L, 47L, 51L, 52L, 54L, 56L, 58L, 60L, 62L, 63L, 
65L, 66L, 67L, 69L, 70L, 73L, 78L, 80L, 81L, 82L, 83L, 85L, 87L, 
88L, 89L, 90L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 100L, 101L, 
102L, 104L, 105L, 107L, 108L, 109L, 111L, 115L, 116L, 118L, 119L, 
121L, 122L, 123L, 124L, 126L, 128L, 129L, 130L, 131L, 132L, 133L, 
134L, 135L, 137L, 138L, 139L, 140L, 141L, 144L, 145L, 146L, 147L, 
149L, 150L, 153L, 156L, 159L, 160L, 164L, 166L, 167L, 168L, 170L, 
172L, 175L, 177L, 178L, 179L, 180L, 181L, 182L, 183L, 185L, 186L, 
189L, 194L, 195L, 196L, 199L, 200L, 201L, 202L, 205L, 206L, 207L, 
209L, 212L, 216L, 217L, 219L, 220L, 222L, 223L, 224L, 225L, 226L, 
228L, 229L, 231L, 233L, 234L, 235L, 239L, 241L, 242L, 243L, 244L, 
246L, 248L, 249L, 250L, 251L, 252L, 253L, 254L, 255L, 261L, 263L, 
264L, 265L, 267L, 268L, 274L, 278L, 280L, 282L, 283L, 284L, 286L, 
288L, 289L, 292L, 293L, 294L, 295L, 296L, 300L, 301L, 303L, 305L, 
307L, 310L, 311L, 312L, 314L, 316L, 317L, 319L, 320L, 321L, 322L, 
323L, 324L, 325L, 328L, 330L, 334L, 335L, 336L, 338L, 340L, 341L, 
342L, 343L, 344L, 345L, 346L, 348L, 350L, 351L, 356L, 358L, 360L, 
362L, 363L, 364L, 367L, 368L, 371L, 373L, 375L, 376L, 378L, 380L, 
383L, 384L, 386L, 387L, 389L, 391L, 392L, 395L, 396L, 398L), class = "data.frame", na.action = structure(c(3L, 
4L, 7L, 16L, 17L, 23L, 27L, 28L, 34L, 36L, 37L, 42L, 43L, 44L, 
48L, 49L, 50L, 53L, 55L, 57L, 59L, 61L, 64L, 68L, 71L, 72L, 74L, 
75L, 76L, 77L, 79L, 84L, 86L, 91L, 99L, 103L, 106L, 110L, 112L, 
113L, 114L, 117L, 120L, 125L, 127L, 136L, 142L, 143L, 148L, 151L, 
152L, 154L, 155L, 157L, 158L, 161L, 162L, 163L, 165L, 169L, 171L, 
173L, 174L, 176L, 184L, 187L, 188L, 190L, 191L, 192L, 193L, 197L, 
198L, 203L, 204L, 208L, 210L, 211L, 213L, 214L, 215L, 218L, 221L, 
227L, 230L, 232L, 236L, 237L, 238L, 240L, 245L, 247L, 256L, 257L, 
258L, 259L, 260L, 262L, 266L, 269L, 270L, 271L, 272L, 273L, 275L, 
276L, 277L, 279L, 281L, 285L, 287L, 290L, 291L, 297L, 298L, 299L, 
302L, 304L, 306L, 308L, 309L, 313L, 315L, 318L, 326L, 327L, 329L, 
331L, 332L, 333L, 337L, 339L, 347L, 349L, 352L, 353L, 354L, 355L, 
357L, 359L, 361L, 365L, 366L, 369L, 370L, 372L, 374L, 377L, 379L, 
381L, 382L, 385L, 388L, 390L, 393L, 394L, 397L, 399L, 400L), .Names = c("3", 
"4", "7", "16", "17", "23", "27", "28", "34", "36", "37", "42", 
"43", "44", "48", "49", "50", "53", "55", "57", "59", "61", "64", 
"68", "71", "72", "74", "75", "76", "77", "79", "84", "86", "91", 
"99", "103", "106", "110", "112", "113", "114", "117", "120", 
"125", "127", "136", "142", "143", "148", "151", "152", "154", 
"155", "157", "158", "161", "162", "163", "165", "169", "171", 
"173", "174", "176", "184", "187", "188", "190", "191", "192", 
"193", "197", "198", "203", "204", "208", "210", "211", "213", 
"214", "215", "218", "221", "227", "230", "232", "236", "237", 
"238", "240", "245", "247", "256", "257", "258", "259", "260", 
"262", "266", "269", "270", "271", "272", "273", "275", "276", 
"277", "279", "281", "285", "287", "290", "291", "297", "298", 
"299", "302", "304", "306", "308", "309", "313", "315", "318", 
"326", "327", "329", "331", "332", "333", "337", "339", "347", 
"349", "352", "353", "354", "355", "357", "359", "361", "365", 
"366", "369", "370", "372", "374", "377", "379", "381", "382", 
"385", "388", "390", "393", "394", "397", "399", "400"), class = "omit"))

Answer 1

我正在使用 data.table 而不是 dplyr

library(data.table)
setDT(dfAux1)  # make dfAux1 as data table by reference

# calculate total number by movies, then compute percent for `Rating >= 4` by movies and then sort `tot` by descending order and also eliminating duplicates in movies using `.SD[1]` which gives the first row in each movie. 
dfAux1[, .(Rating, tot = .N), by = movie ][Rating >= 4, .(percent = .N/tot, tot), by = movie ][order(-tot), .SD[1], by = movie]

#                                movie    percent tot
# 1:                         Toy Story 0.35294118  17
# 2:          The Silence of the Lambs 0.43750000  16
# 3:         Star Wars IV - A New Hope 0.53333333  15
# 4: Star Wars VI - Return of the Jedi 0.35714286  14
# 5:                  Independence Day 0.30769231  13
# 6:                         Gladiator 0.50000000  12
# 7:                      Total Recall 0.08333333  12
# 8:                     Groundhog Day 0.41666667  12
# 9:                        The Matrix 0.41666667  12
# 10:                  Schindler's List 0.33333333  12
# 11:                   The Sixth Sense 0.33333333  12
# 12:               Saving Private Ryan 0.36363636  11
# 13:                      Pulp Fiction 0.36363636  11
# 14:                       Stand by Me 0.36363636  11
# 15:               Shakespeare in Love 0.27272727  11
# 16:           Raiders of the Lost Ark 0.27272727  11
# 17:                      Forrest Gump 0.30000000  10
# 18:          The Shawshank Redemption 0.70000000  10
# 19:                              Babe 0.40000000  10
# 20:                      Blade Runner 0.44444444   9

Answer 2

概述

我使用 dplyr 包按 movie 列对数据进行分组，并根据 rating 列执行计算。

在 summarise() 中，我创建了三个新列：

Total_Review：统计每个movie.
FourPlus_Rating：计算 Rating 值为 4 或更高的评论子集。
Per_FourPlus_Rating：将 FourPlus_Rating 除以 Total_Review。

然后我按照Per_FourPlus_Rating降序排列了日期。最后，我调用 head() to specify that I only want the tibble 到 return 前 5 行。

可重现的例子

# install necessary package
install.packages( pkgs = "dplyr" )

# load necessary package
library( dplyr )


# view first six rows
head( x = df )
#   Rating                     movie
# 1      1 Star Wars IV - A New Hope
# 2      5 Star Wars IV - A New Hope
# 5      4 Star Wars IV - A New Hope
# 6      2 Star Wars IV - A New Hope
# 8      4 Star Wars IV - A New Hope
# 9      5 Star Wars IV - A New Hope

# perform calculations using 
# dplyr functions
df %>%
  group_by( movie ) %>%
  summarise( Total_Review              = n()
             , FourPlus_Rating         = length( Rating[ which( Rating >= 4 ) ] )
             , Per_FourPlus_Rating     = length( Rating[ which( Rating >= 4 ) ] ) / n() ) %>%
  arrange( desc( Per_FourPlus_Rating ) ) %>%
  head( n = 5 )
# A tibble: 5 x 4
# movie               Total_Review FourPlus_Rating Per_FourPlus_Rati…
# <fct>                      <int>           <int>              <dbl>
# 1 The Shawshank Rede…           10               7              0.700
# 2 Star Wars IV - A N…           15               8              0.533
# 3 Gladiator                     12               6              0.500
# 4 Blade Runner                   9               4              0.444
# 5 The Silence of the…           16               7              0.438

# end of script #

Answer 3

使用 data.table 和来自 OP 的数据的单行解决方案可以是：

library(data.table)
setDT(dfAux1)[, .(pct = sum(Rating>=4)/.N), by=movie][order(-pct)][1:5]
                  movie        pct
1:  The Shawshank Redemption 0.7000000
2: Star Wars IV - A New Hope 0.5333333
3:                 Gladiator 0.5000000
4:              Blade Runner 0.4444444
5:  The Silence of the Lambs 0.4375000

Answer 4

这是一个 dplyr 解决方案：

    dfAuxhigh=filter(dfAux1,Rating>=4)%>%group_by(movie)%>%summarize(percentHigh=n())
dfAux=dfAux1%>%group_by(movie)%>%summarize(percentAll=n())
result<-merge(dfAuxhigh,dfAux,by="movie")%>%mutate(percentage=percentHigh/percentAll)
result<-result[order(result$percentage,decreasing = T)[1:5],c(1,4)]

Answer 5

library(tidyverse)

df %>% 
  group_by(movie, Rating) %>% 
  summarise(n = n()) %>%           #< get freq of movies
  mutate(freq = n/sum(n)) %>%      #< find perc for each rating, by movie
  filter(Rating >=4) %>%           #< filter for desired rating (4 or above) 
  summarise(freq = sum(freq)) %>%  #< summarize again
  top_n(5) %>%                     
  arrange(desc(freq)) %>% 
  mutate(freq = paste0(round(freq*100, 2), "%"))

#>   movie                     freq  
#> 1 The Shawshank Redemption  70%  
#> 2 Star Wars IV - A New Hope 53.33%
#> 3 Gladiator                 50%   
#> 4 Blade Runner              44.44%
#> 5 The Silence of the Lambs  43.75%

计算给定条件的百分比

Calculate percentage given condition

conditional

r

percentage

dplyr

概述

可重现的例子