使用 tidyverse 通过 nest 获得描述性结果,然后计算我们有多少观察符合这些标准
Using tidyverse to get descriptive results with nest and then count how many observations we have matching these criteria
假设我有一个来自普通学校的数据集,来自不同生活地区的学生在数学、英语和科学方面进行了测试。如果你的分数比平均值低 1SD,你需要重新测试,如果你的分数比平均值低 2SD,你就会失败。
我可以轻松计算均值、标准差和这些截断值。我正在使用 tidyverse
包中的 nest
。但是,我想知道有多少学生低于平均值 1SD 和低于平均值 2SD。
但是,我不知道如何以简单的方式对这些结果进行 count
计算。
请检查数据集和我用来实现描述性结果的代码:
library(tidyverse)
set.seed(123)
ds <- data.frame(quest = c(2,4,6),
living_area = c("rural","urban","mixed"),
math_sum = rnorm(120, 10,1),
english_sum = rnorm(120, 10,1),
science_sum = rnorm(120, 10,1)
)
ds %>%
select(quest, ends_with("sum")) %>% #get variable names
pivot_longer(-quest) %>% #tranform into long format
nest_by(quest, name) %>% #nest
mutate(
n = map_dbl(data, ~nrow(data.frame(.))), #compute sample size
mean = map_dbl(data, ~mean(.)), #get the means
sd = map_dbl(data, ~sd(.)), #get sd
below = mean-sd, #1 below
failed = mean-2*sd)
ds %>%
filter(quest == 2 & english_sum <= 9.19) %>% nrow()
ds %>%
filter(quest == 2 & english_sum <= 9.39) %>% nrow()
ds %>%
filter(quest == 2 & english_sum <= 8.73) %>% nrow()
你可以这样做。
library(tidyverse)
ds_sum <- ds %>%
select(quest, ends_with("sum")) %>% #get variable names
pivot_longer(-quest) %>% #tranform into long format
nest_by(quest, name) %>% #nest
mutate(
n = map_dbl(data, ~ nrow(data.frame(.))),
#compute sample size
mean = map_dbl(data, ~ mean(.)),
#get the means
sd = map_dbl(data, ~ sd(.)),
#get sd
below = mean - sd,
#1 below
failed = mean - 2 * sd
) %>%
unnest(data) %>%
rowwise() %>%
mutate(
`1sd_below` = ifelse(value <= below & value > failed, 1, 0),
`2sd_below` = ifelse(value <= failed, 1, 0)
) %>%
select(-value) %>%
ungroup() %>%
group_by(across(-c(`1sd_below`, `2sd_below`))) %>%
summarise(across(c(`1sd_below`, `2sd_below`), sum))
输出
# A tibble: 9 × 9
quest name n mean sd below failed `1sd_below` `2sd_below`
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 english_sum 40 10.0 0.839 9.19 8.35 6 0
2 2 math_sum 40 10.2 0.805 9.39 8.59 7 0
3 2 science_sum 40 9.92 1.19 8.73 7.54 8 0
4 4 english_sum 40 10.0 1.08 8.94 7.87 6 0
5 4 math_sum 40 9.90 0.870 9.03 8.16 6 0
6 4 science_sum 40 9.96 0.882 9.07 8.19 5 1
7 6 english_sum 40 9.87 1.03 8.83 7.80 7 0
8 6 math_sum 40 9.95 0.992 8.96 7.96 5 1
9 6 science_sum 40 10.4 0.967 9.41 8.44 4 1
然后,如果您想以更宽的格式可视化数据,那么您可以这样做。
ds_sum %>%
mutate(row = row_number()) %>%
select(row, quest, name, `1sd_below`, `2sd_below`) %>%
pivot_longer(cols = `1sd_below`:`2sd_below`, names_to = 'SD') %>%
unite("new_name", name, SD, sep = "_") %>%
pivot_wider(names_from = "new_name", values_from = "value") %>%
select(-row) %>%
replace(is.na(.), 0)
输出
# A tibble: 9 × 7
quest english_sum_1sd_below english_sum_2sd_below math_sum_1sd_below math_sum_2sd_below science_sum_1sd_below science_sum_2sd_below
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 6 0 0 0 0 0
2 2 0 0 7 0 0 0
3 2 0 0 0 0 8 0
4 4 6 0 0 0 0 0
5 4 0 0 6 0 0 0
6 4 0 0 0 0 5 1
7 6 7 0 0 0 0 0
8 6 0 0 5 1 0 0
9 6 0 0 0 0 4 1
我们可以使用 data
列来查看有多少学生低于 1 和 2 标准差。
将这两行添加到 mutate
调用中:
oneSd_below = sum((mean - sd) > data[[1]]),
twoSd_below = sum((mean - 2*sd) > data[[1]])
library(tidyverse)
set.seed(123)
ds <- data.frame(quest = c(2,4,6),
living_area = c("rural","urban","mixed"),
math_sum = rnorm(120, 10,1),
english_sum = rnorm(120, 10,1),
science_sum = rnorm(120, 10,1)
) %>% as_tibble()
ds %>%
select(quest, ends_with("sum")) %>% #get variable names
pivot_longer(-quest) %>% #tranform into long format
nest_by(quest, name) %>%
mutate(
n = map_dbl(data, ~ nrow(data.frame(.))),
#compute sample size
mean = map_dbl(data, ~ mean(.)),
#get the means
sd = map_dbl(data, ~ sd(.)),
#get sd
below = mean - sd,
#1 below
failed = mean - 2 * sd,
oneSd_below = sum((mean - sd) > data[[1]]),
twoSd_below = sum((mean - 2*sd) > data[[1]])
)
#> # A tibble: 9 × 10
#> # Rowwise: quest, name
#> quest name data n mean sd below failed oneSd_below twoSd_below
#> <dbl> <chr> <list<ti> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
#> 1 2 englis… [40 × 1] 40 10.0 0.839 9.19 8.35 6 0
#> 2 2 math_s… [40 × 1] 40 10.2 0.805 9.39 8.59 7 0
#> 3 2 scienc… [40 × 1] 40 9.92 1.19 8.73 7.54 8 0
#> 4 4 englis… [40 × 1] 40 10.0 1.08 8.94 7.87 6 0
#> 5 4 math_s… [40 × 1] 40 9.90 0.870 9.03 8.16 6 0
#> 6 4 scienc… [40 × 1] 40 9.96 0.882 9.07 8.19 6 1
#> 7 6 englis… [40 × 1] 40 9.87 1.03 8.83 7.80 7 0
#> 8 6 math_s… [40 × 1] 40 9.95 0.992 8.96 7.96 6 1
#> 9 6 scienc… [40 × 1] 40 10.4 0.967 9.41 8.44 5 1
由 reprex package (v2.0.1)
于 2021-12-25 创建
假设我有一个来自普通学校的数据集,来自不同生活地区的学生在数学、英语和科学方面进行了测试。如果你的分数比平均值低 1SD,你需要重新测试,如果你的分数比平均值低 2SD,你就会失败。
我可以轻松计算均值、标准差和这些截断值。我正在使用 tidyverse
包中的 nest
。但是,我想知道有多少学生低于平均值 1SD 和低于平均值 2SD。
但是,我不知道如何以简单的方式对这些结果进行 count
计算。
请检查数据集和我用来实现描述性结果的代码:
library(tidyverse)
set.seed(123)
ds <- data.frame(quest = c(2,4,6),
living_area = c("rural","urban","mixed"),
math_sum = rnorm(120, 10,1),
english_sum = rnorm(120, 10,1),
science_sum = rnorm(120, 10,1)
)
ds %>%
select(quest, ends_with("sum")) %>% #get variable names
pivot_longer(-quest) %>% #tranform into long format
nest_by(quest, name) %>% #nest
mutate(
n = map_dbl(data, ~nrow(data.frame(.))), #compute sample size
mean = map_dbl(data, ~mean(.)), #get the means
sd = map_dbl(data, ~sd(.)), #get sd
below = mean-sd, #1 below
failed = mean-2*sd)
ds %>%
filter(quest == 2 & english_sum <= 9.19) %>% nrow()
ds %>%
filter(quest == 2 & english_sum <= 9.39) %>% nrow()
ds %>%
filter(quest == 2 & english_sum <= 8.73) %>% nrow()
你可以这样做。
library(tidyverse)
ds_sum <- ds %>%
select(quest, ends_with("sum")) %>% #get variable names
pivot_longer(-quest) %>% #tranform into long format
nest_by(quest, name) %>% #nest
mutate(
n = map_dbl(data, ~ nrow(data.frame(.))),
#compute sample size
mean = map_dbl(data, ~ mean(.)),
#get the means
sd = map_dbl(data, ~ sd(.)),
#get sd
below = mean - sd,
#1 below
failed = mean - 2 * sd
) %>%
unnest(data) %>%
rowwise() %>%
mutate(
`1sd_below` = ifelse(value <= below & value > failed, 1, 0),
`2sd_below` = ifelse(value <= failed, 1, 0)
) %>%
select(-value) %>%
ungroup() %>%
group_by(across(-c(`1sd_below`, `2sd_below`))) %>%
summarise(across(c(`1sd_below`, `2sd_below`), sum))
输出
# A tibble: 9 × 9
quest name n mean sd below failed `1sd_below` `2sd_below`
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 english_sum 40 10.0 0.839 9.19 8.35 6 0
2 2 math_sum 40 10.2 0.805 9.39 8.59 7 0
3 2 science_sum 40 9.92 1.19 8.73 7.54 8 0
4 4 english_sum 40 10.0 1.08 8.94 7.87 6 0
5 4 math_sum 40 9.90 0.870 9.03 8.16 6 0
6 4 science_sum 40 9.96 0.882 9.07 8.19 5 1
7 6 english_sum 40 9.87 1.03 8.83 7.80 7 0
8 6 math_sum 40 9.95 0.992 8.96 7.96 5 1
9 6 science_sum 40 10.4 0.967 9.41 8.44 4 1
然后,如果您想以更宽的格式可视化数据,那么您可以这样做。
ds_sum %>%
mutate(row = row_number()) %>%
select(row, quest, name, `1sd_below`, `2sd_below`) %>%
pivot_longer(cols = `1sd_below`:`2sd_below`, names_to = 'SD') %>%
unite("new_name", name, SD, sep = "_") %>%
pivot_wider(names_from = "new_name", values_from = "value") %>%
select(-row) %>%
replace(is.na(.), 0)
输出
# A tibble: 9 × 7
quest english_sum_1sd_below english_sum_2sd_below math_sum_1sd_below math_sum_2sd_below science_sum_1sd_below science_sum_2sd_below
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 6 0 0 0 0 0
2 2 0 0 7 0 0 0
3 2 0 0 0 0 8 0
4 4 6 0 0 0 0 0
5 4 0 0 6 0 0 0
6 4 0 0 0 0 5 1
7 6 7 0 0 0 0 0
8 6 0 0 5 1 0 0
9 6 0 0 0 0 4 1
我们可以使用 data
列来查看有多少学生低于 1 和 2 标准差。
将这两行添加到 mutate
调用中:
oneSd_below = sum((mean - sd) > data[[1]]),
twoSd_below = sum((mean - 2*sd) > data[[1]])
library(tidyverse)
set.seed(123)
ds <- data.frame(quest = c(2,4,6),
living_area = c("rural","urban","mixed"),
math_sum = rnorm(120, 10,1),
english_sum = rnorm(120, 10,1),
science_sum = rnorm(120, 10,1)
) %>% as_tibble()
ds %>%
select(quest, ends_with("sum")) %>% #get variable names
pivot_longer(-quest) %>% #tranform into long format
nest_by(quest, name) %>%
mutate(
n = map_dbl(data, ~ nrow(data.frame(.))),
#compute sample size
mean = map_dbl(data, ~ mean(.)),
#get the means
sd = map_dbl(data, ~ sd(.)),
#get sd
below = mean - sd,
#1 below
failed = mean - 2 * sd,
oneSd_below = sum((mean - sd) > data[[1]]),
twoSd_below = sum((mean - 2*sd) > data[[1]])
)
#> # A tibble: 9 × 10
#> # Rowwise: quest, name
#> quest name data n mean sd below failed oneSd_below twoSd_below
#> <dbl> <chr> <list<ti> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
#> 1 2 englis… [40 × 1] 40 10.0 0.839 9.19 8.35 6 0
#> 2 2 math_s… [40 × 1] 40 10.2 0.805 9.39 8.59 7 0
#> 3 2 scienc… [40 × 1] 40 9.92 1.19 8.73 7.54 8 0
#> 4 4 englis… [40 × 1] 40 10.0 1.08 8.94 7.87 6 0
#> 5 4 math_s… [40 × 1] 40 9.90 0.870 9.03 8.16 6 0
#> 6 4 scienc… [40 × 1] 40 9.96 0.882 9.07 8.19 6 1
#> 7 6 englis… [40 × 1] 40 9.87 1.03 8.83 7.80 7 0
#> 8 6 math_s… [40 × 1] 40 9.95 0.992 8.96 7.96 6 1
#> 9 6 scienc… [40 × 1] 40 10.4 0.967 9.41 8.44 5 1
由 reprex package (v2.0.1)
于 2021-12-25 创建