如何在 R 中对数据框进行分组,然后统计各个不同取值出现的次数

How to group a dataframe, and then count the distinct values in R

我在 R 中有一个数据框,它有 43 个变量和超过 80 行。我想根据一个变量——地理区域对数据进行分组,然后计算一个变量的不同值(有多少个 0、1、2、3 和 NA 等)。

我知道 tidyverse 中的 group_by 和 summarize 函数,也知道可以在其中使用 "sum" 和 "mean" 等函数,但我想做的是计数。

我试过了 est1 <- df %>% group_by(region) %>% summarize(count)

数据如下所示:

    iso3      Country WHOregion       WBIncomeGroup UrbanSanPol UrbanSanWom UrbanSanExt RuralSanPol RuralSanWom
  <chr>        <chr>     <chr>               <chr>       <chr>       <chr>       <chr>       <chr>       <chr>
1   AFG  Afghanistan      EMRO          Low income           0        <NA>        <NA>           1           1
2   ALB      Albania      EURO Upper middle income           1           0           0           1           0
3   ARG    Argentina      PAHO Upper middle income           1           0         0.5           1           0
4   AZE   Azerbaijan      EURO Upper middle income           1           1         0.5           1           1
5   BDI      Burundi      AFRO          Low income           1           1         0.5           1           1
6   BFA Burkina Faso      AFRO          Low income           1           1           1           1           1

但上面那次尝试得到的并不是我想要的结果。有人可以帮忙吗?

structure(list(iso3 = c("AFG", "ALB", "ARG", "AZE", "BDI", "BFA", 
"BGD", "BIH", "BLR", "BOL"), Country = c("Afghanistan", "Albania", 
"Argentina", "Azerbaijan", "Burundi", "Burkina Faso", "Bangladesh", 
"Bosnia and Herzegovina", "Belarus", "Bolivia (Plurinational State of)"
), WHOregion = c("EMRO", "EURO", "PAHO", "EURO", "AFRO", "AFRO", 
"SEARO", "EURO", "EURO", "PAHO"), WBIncomeGroup = c("Low income", 
"Upper middle income", "Upper middle income", "Upper middle income", 
"Low income", "Low income", "Lower middle income", "Upper middle income", 
"Upper middle income", "Lower middle income"), UrbanSanPol = c("0", 
"1", "1", "1", "1", "1", "1", "1", "1", "1"), UrbanSanWom = c(NA, 
"0", "0", "1", "1", "1", "1", "0", NA, "0"), UrbanSanExt = c(NA, 
"0", "0.5", "0.5", "0.5", "1", "0.5", "0", "0.5", "0"), RuralSanPol = c("1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1"), RuralSanWom = c("1", 
"0", "0", "1", "1", "1", "1", "0", NA, "0"), RuralSanExt = c("0.5", 
"0", "0", "0.5", "0.5", "1", "0.5", "0", "0.5", "0.5"), UrbanDWPol = c("0", 
"1", "1", "1", "1", "1", "1", "1", "1", "1"), UrbanDWWom = c(NA, 
"0", "0", "1", "1", "1", "1", "0", NA, "0"), UrbanDWExt = c(NA, 
"0", "0.5", "1", "0", "0.5", "0.5", "0.5", "0.5", "0"), RuralDWPol = c("1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1"), RuralDWWom = c("1", 
"0", "0", "1", "1", "1", "1", "0", NA, "0"), RuralDWExt = c("0.5", 
"0", "0", "1", "0.5", "1", "0.5", "0.5", "0.5", "0.5"), HygienePol = c("1", 
"1", "0", "1", "1", "1", "1", "1", "1", "0"), HygieneWom = c("1", 
NA, NA, "1", "1", "1", "1", "0", NA, "0"), HygieneExt = c("0.5", 
NA, NA, "0", "0.5", "0", "0.5", "0", "0.5", "0"), WASHHealthPol = c("1", 
"1", "0", "1", "1", "1", "1", "1", "0", "0"), WASHHealthWom = c("0", 
NA, NA, "1", "1", "1", "1", "0", NA, "0"), WASHHealthExt = c("0", 
NA, "0.5", "1", "0", "0.5", "0", "0", NA, "0"), WpollutionPol = c("1", 
"1", "1", "1", "1", "1", "1", "1", "1", "0"), WpollutionWom = c("1", 
NA, "0", "1", "1", "1", "1", "0", NA, "0"), WpollutionExt = c("0", 
NA, "0", "1", "0", "0.5", "0", "0", "0.5", "0"), WQMPol = c("1", 
"1", "1", "1", "1", "1", "1", "1", "1", "0"), WQMWom = c("1", 
NA, "0", "1", "1", "1", "1", "0", NA, "0"), WQMExt = c("0", NA, 
"0", "1", "0", "0.5", "0", "0", "0.5", "0"), WatRightPol = c("0", 
"1", "1", "1", NA, "1", "1", "1", "1", "1"), WatRightWom = c("0", 
NA, "0", "1", NA, "1", "1", "0", NA, "0"), WatRightExt = c("0", 
NA, "0.5", "1", NA, "1", "0", "0", "0.5", "0.5"), WRMPol = c("1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1"), WRMWom = c("0", 
NA, "0", "1", "1", "1", "1", "0", NA, "0"), WRMExt = c("0", NA, 
"0.5", "1", "0.5", "1", "0", "0", "0.5", "0"), EnvProtPol = c("1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1"), EnvProtWom = c("0", 
NA, "0", "1", "1", "1", "1", "0", NA, "0"), EnvProtExt = c("0", 
NA, "0", "1", "0", "1", "0", "0", "0.5", "0"), `SDG regions` = c("Central Asia (M49) and Southern Asia (MDG=M49)", 
"Northern America (M49) and Europe (M49)", "Latin America and the Caribbean (MDG=M49)", 
"Western Asia (M49) and Northern Africa (M49)", "Sub-Saharan Africa (M49)", 
"Sub-Saharan Africa (M49)", "Central Asia (M49) and Southern Asia (MDG=M49)", 
"Northern America (M49) and Europe (M49)", "Northern America (M49) and Europe (M49)", 
"Latin America and the Caribbean (MDG=M49)"), M49_level1 = c("Asia (M49)", 
"Europe (M49)", "Latin America and the Caribbean (MDG=M49)", 
"Asia (M49)", "Sub-Saharan Africa (M49)", "Sub-Saharan Africa (M49)", 
"Asia (M49)", "Europe (M49)", "Europe (M49)", "Latin America and the Caribbean (MDG=M49)"
), M49_level2 = c("Southern Asia (MDG=M49)", "Southern Europe (M49)", 
"South America (M49)", "Western Asia (M49)", "Eastern Africa (M49)", 
"Western Africa (M49)", "Southern Asia (MDG=M49)", "Southern Europe (M49)", 
"Eastern Europe (M49)", "South America (M49)"), LDCs = c("Least Developed Countries (LDCs)", 
NA, NA, NA, "Least Developed Countries (LDCs)", "Least Developed Countries (LDCs)", 
"Least Developed Countries (LDCs)", NA, NA, NA), LLDCS_SIDS = c("Landlocked developing countries (LLDCs)", 
NA, NA, "Landlocked developing countries (LLDCs)", "Landlocked developing countries (LLDCs)", 
"Landlocked developing countries (LLDCs)", NA, NA, NA, "Landlocked developing countries (LLDCs)"
), `Income group` = c("Low income", "Upper middle income", "Upper middle income", 
"Upper middle income", "Low income", "Low income", "Lower middle income", 
"Upper middle income", "Upper middle income", "Lower middle income"
)), .Names = c("iso3", "Country", "WHOregion", "WBIncomeGroup", 
"UrbanSanPol", "UrbanSanWom", "UrbanSanExt", "RuralSanPol", "RuralSanWom", 
"RuralSanExt", "UrbanDWPol", "UrbanDWWom", "UrbanDWExt", "RuralDWPol", 
"RuralDWWom", "RuralDWExt", "HygienePol", "HygieneWom", "HygieneExt", 
"WASHHealthPol", "WASHHealthWom", "WASHHealthExt", "WpollutionPol", 
"WpollutionWom", "WpollutionExt", "WQMPol", "WQMWom", "WQMExt", 
"WatRightPol", "WatRightWom", "WatRightExt", "WRMPol", "WRMWom", 
"WRMExt", "EnvProtPol", "EnvProtWom", "EnvProtExt", "SDG regions", 
"M49_level1", "M49_level2", "LDCs", "LLDCS_SIDS", "Income group"
), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))

在此处输入代码

Imagine I have three columns. The first is a list of countries (France, Germany, etc). The second is a list of regions (Asia, Europe), the third is a discrete value for each country (number of Olympic Gold medals). I want to group all the data by region, and count how many times for each region, 0 occurs, 1 occurs, 2 occurs.

根据你在评论中所说的,以及我的理解:

第一种解释:按大陆和国家分组,统计每组的行数。

# Interpretation 1: number of rows for each continent/country pair.
df %>%
  # Keep only the three columns relevant to this example.
  select(continent, countries, medals) %>%
  group_by(continent, countries) %>%
  # n() is the size of the current group.
  summarise(count = n())

另一种解释:你想要的是每个大陆和它获得的不同奖牌数。

数字是该国获得的奥运奖牌数。

# Toy data. The matrix is filled column by column, so the 18 values become
# three columns that as.data.frame() names V1, V2 and V3:
#   V1 = region, V2 = country, V3 = number of medals.
# c() mixes strings and numbers, so the medal counts are stored as text.
df <- as.data.frame(
  matrix(
    c(
      "Asia", "Asia", "Asia", "Asia", "Europe", "Europe",
      "India", "China", "Bangladesh", "Japan", "Spain", "Italy",
      6, 3, 4, 4, 3, 3
    ),
    ncol = 3
  )
)

# Count the rows for every (region, medal count) pair, then spread the medal
# counts into columns: one row per region, one column per distinct V3 value.
# Pairs that never occur show up as NA.
df %>%
  group_by(V1, V3) %>%
  summarise(count = n()) %>%
  spread(V3, count)

得到如下输出:

# A tibble: 2 x 4
# Groups:   V1 [2]
      V1   `3`   `4`   `6`
* <fctr> <int> <int> <int>
1   Asia     1     2     1
2 Europe     2    NA    NA

试试这个;需要 dplyr 和 tidyr(二者都包含在 tidyverse 中):

# Number of distinct values in each column of input_df.
distinct_cnt <- input_df %>%
  # Stack every column into (variable, value) pairs: one row per cell.
  gather(key = variable, value = value) %>%
  group_by(variable) %>%
  # The summary column is left unnamed, so it is called `n_distinct(value)`.
  summarise(n_distinct(value))