如何使用 tidycensus 和 tidyverse 准确地汇总子组的误差范围(MOE)?
How do I accurately aggregate subgroup margin of error values using tidycensus and tidyverse?
我正在尝试使用 R 和美国社区调查(ACS)的数据,计算明尼苏达州每个县按种族划分的 20 岁以下人口。我知道可以通过 tidycensus 获取各种族、各年龄组的 B01001H 变量来完成这项工作。然而,我需要把每个种族群体中 20 岁以下人群对应的所有变量汇总起来。根据此文档(https://www.census.gov/content/dam/Census/library/publications/2018/acs/acs_general_handbook_2018_ch08.pdf),汇总估计值只需将各子组的值相加,但汇总误差范围(MOE)则需要按以下公式计算:
MOE = sqrt(moe_1^2 + moe_2^2 + ... + moe_n^2)
其中 moe_1, ..., moe_n 是各子组的 MOE。那么,我该如何使用 tidyverse 准确地计算这个汇总后的 MOE 值呢?
到目前为止我的代码是这样的:
## age race
# ACS variable codes to request. The prefix of each name (w, b, AN, AS, H) is
# what the case_when() below uses to assign the Race label.
# NOTE(review): the vector name suggests these are the male cells only --
# confirm that codes _003 to _006 cover the full under-20 population you want.
age_vars_male <- c(
  w1 = "B01001H_003", w2 = "B01001H_004", w3 = "B01001H_005", w4 = "B01001H_006",
  b1 = "B01001B_003", b2 = "B01001B_004", b3 = "B01001B_005", b4 = "B01001B_006",
  AN1 = "B01001C_003", AN2 = "B01001C_004", AN3 = "B01001C_005", AN4 = "B01001C_006",
  AS1 = "B01001D_003", AS2 = "B01001D_004", AS3 = "B01001D_005", AS4 = "B01001D_006",
  H1 = "B01001I_003", H2 = "B01001I_004", H3 = "B01001I_005", H4 = "B01001I_006"
)

## obtaining variables listed above for MN counties
pop_un20 <- get_acs(
  geography = "county",
  variables = age_vars_male,
  state = "MN",
  geometry = TRUE
)

# Label every row with its race group and keep the squared MOE as a helper
# column for the aggregation below.
pop_un20 <- pop_un20 %>%
  mutate(
    Race = case_when(
      variable %in% c("w1", "w2", "w3", "w4") ~ "White",
      variable %in% c("b1", "b2", "b3", "b4") ~ "Black",
      variable %in% c("AN1", "AN2", "AN3", "AN4") ~ "AI/AN",
      variable %in% c("AS1", "AS2", "AS3", "AS4") ~ "Asian",
      variable %in% c("H1", "H2", "H3", "H4") ~ "Hispanic/Latino"
    ),
    moe_sqrd = moe^2
  ) %>%
  select(-variable)

# Aggregate MOE per county and race: square root of the sum of squared MOEs.
# Geometry is dropped *before* grouping so summarise() does not have to
# combine the polygons of every group only for them to be discarded.
moe_aggregate <- pop_un20 %>%
  st_set_geometry(NULL) %>%
  group_by(NAME, Race) %>%
  summarise(moe_aggregate = sqrt(sum(moe_sqrd, na.rm = TRUE))) %>%
  ungroup()

# Aggregate estimate per county and race: plain sum of the subgroup estimates.
est_aggregate <- pop_un20 %>%
  st_set_geometry(NULL) %>%
  group_by(NAME, Race) %>%
  summarise(estimate_aggregate = sum(estimate, na.rm = TRUE)) %>%
  ungroup()

# Attach both aggregates to the spatial data and drop the per-variable
# columns. Note the minus sign on moe_sqrd: without it the helper column
# would be kept instead of removed.
pop_under20 <- pop_un20 %>%
  right_join(moe_aggregate, by = c("NAME", "Race")) %>%
  right_join(est_aggregate, by = c("NAME", "Race")) %>%
  select(-estimate, -moe, -moe_sqrd)
我已经得到了想要的结果:先为 MOE 的平方创建一列,然后按县(NAME)和种族分组求和,再取平方根。但是,有没有办法一步完成这些操作?
您可以使用 mutate 代替 summarise,直接在数据中添加新列。
library(dplyr)

# Add the aggregated MOE and estimate as new columns in a single step.
# The MOE is squared inline, so no pre-computed helper column is needed.
# mutate() keeps one row per original variable; every row of a county/race
# group carries the same aggregated values.
pop_under20 <- pop_un20 %>%
  group_by(NAME, Race) %>%
  mutate(
    moe_aggregate = sqrt(sum(moe^2, na.rm = TRUE)),
    estimate_aggregate = sum(estimate, na.rm = TRUE)
  ) %>%
  # Drop the grouping so later verbs do not silently operate per group.
  ungroup()
tidycensus 有一个函数 moe_sum(),可以为您完成这项计算。将您的代码调整如下:
library(tidycensus)
library(tidyverse)

# ACS variable codes to request; the prefix of each name (w, b, AN, AS, H)
# determines the Race label assigned further down.
age_vars_male <- c(
  w1 = "B01001H_003", w2 = "B01001H_004", w3 = "B01001H_005", w4 = "B01001H_006",
  b1 = "B01001B_003", b2 = "B01001B_004", b3 = "B01001B_005", b4 = "B01001B_006",
  AN1 = "B01001C_003", AN2 = "B01001C_004", AN3 = "B01001C_005", AN4 = "B01001C_006",
  AS1 = "B01001D_003", AS2 = "B01001D_004", AS3 = "B01001D_005", AS4 = "B01001D_006",
  H1 = "B01001I_003", H2 = "B01001I_004", H3 = "B01001I_005", H4 = "B01001I_006"
)

## obtaining variables listed above for MN counties
pop_un20 <- get_acs(
  geography = "county",
  variables = age_vars_male,
  state = "MN"
)

# Label each row with its race group, then collapse to one row per county and
# race: estimates are summed and the MOEs are combined with moe_sum().
pop_un20_grouped <- pop_un20 %>%
  mutate(
    Race = case_when(
      variable %in% c("w1", "w2", "w3", "w4") ~ "White",
      variable %in% c("b1", "b2", "b3", "b4") ~ "Black",
      variable %in% c("AN1", "AN2", "AN3", "AN4") ~ "AI/AN",
      variable %in% c("AS1", "AS2", "AS3", "AS4") ~ "Asian",
      variable %in% c("H1", "H2", "H3", "H4") ~ "Hispanic/Latino"
    )
  ) %>%
  group_by(NAME, Race) %>%
  summarize(
    group_estimate = sum(estimate, na.rm = TRUE),
    group_moe = moe_sum(moe = moe, estimate = estimate, na.rm = TRUE)
  )

# Print the result.
pop_un20_grouped
# A tibble: 435 x 4
# Groups: NAME [87]
NAME Race group_estimate group_moe
<chr> <chr> <dbl> <dbl>
1 Aitkin County, Minnesota AI/AN 70 24.5
2 Aitkin County, Minnesota Asian 3 14.9
3 Aitkin County, Minnesota Black 5 15.1
4 Aitkin County, Minnesota Hispanic/Latino 71 22.6
5 Aitkin County, Minnesota White 1223 59.9
6 Anoka County, Minnesota AI/AN 322 97.9
7 Anoka County, Minnesota Asian 1983 219.
8 Anoka County, Minnesota Black 4015 303.
9 Anoka County, Minnesota Hispanic/Latino 3176 219.
10 Anoka County, Minnesota White 30557 478.
# … with 425 more rows
我正在尝试使用 R 和美国社区调查(ACS)的数据,计算明尼苏达州每个县按种族划分的 20 岁以下人口。我知道可以通过 tidycensus 获取各种族、各年龄组的 B01001H 变量来完成这项工作。然而,我需要把每个种族群体中 20 岁以下人群对应的所有变量汇总起来。根据此文档(https://www.census.gov/content/dam/Census/library/publications/2018/acs/acs_general_handbook_2018_ch08.pdf),汇总估计值只需将各子组的值相加,但汇总误差范围(MOE)则需要按以下公式计算:
MOE = sqrt(moe_1^2 + moe_2^2 + ... + moe_n^2)
其中 moe_1, ..., moe_n 是各子组的 MOE。那么,我该如何使用 tidyverse 准确地计算这个汇总后的 MOE 值呢?
到目前为止我的代码是这样的:
## age race
# ACS variable codes to request. The prefix of each name (w, b, AN, AS, H) is
# what the case_when() below uses to assign the Race label.
# NOTE(review): the vector name suggests these are the male cells only --
# confirm that codes _003 to _006 cover the full under-20 population you want.
age_vars_male <- c(
  w1 = "B01001H_003", w2 = "B01001H_004", w3 = "B01001H_005", w4 = "B01001H_006",
  b1 = "B01001B_003", b2 = "B01001B_004", b3 = "B01001B_005", b4 = "B01001B_006",
  AN1 = "B01001C_003", AN2 = "B01001C_004", AN3 = "B01001C_005", AN4 = "B01001C_006",
  AS1 = "B01001D_003", AS2 = "B01001D_004", AS3 = "B01001D_005", AS4 = "B01001D_006",
  H1 = "B01001I_003", H2 = "B01001I_004", H3 = "B01001I_005", H4 = "B01001I_006"
)

## obtaining variables listed above for MN counties
pop_un20 <- get_acs(
  geography = "county",
  variables = age_vars_male,
  state = "MN",
  geometry = TRUE
)

# Label every row with its race group and keep the squared MOE as a helper
# column for the aggregation below.
pop_un20 <- pop_un20 %>%
  mutate(
    Race = case_when(
      variable %in% c("w1", "w2", "w3", "w4") ~ "White",
      variable %in% c("b1", "b2", "b3", "b4") ~ "Black",
      variable %in% c("AN1", "AN2", "AN3", "AN4") ~ "AI/AN",
      variable %in% c("AS1", "AS2", "AS3", "AS4") ~ "Asian",
      variable %in% c("H1", "H2", "H3", "H4") ~ "Hispanic/Latino"
    ),
    moe_sqrd = moe^2
  ) %>%
  select(-variable)

# Aggregate MOE per county and race: square root of the sum of squared MOEs.
# Geometry is dropped *before* grouping so summarise() does not have to
# combine the polygons of every group only for them to be discarded.
moe_aggregate <- pop_un20 %>%
  st_set_geometry(NULL) %>%
  group_by(NAME, Race) %>%
  summarise(moe_aggregate = sqrt(sum(moe_sqrd, na.rm = TRUE))) %>%
  ungroup()

# Aggregate estimate per county and race: plain sum of the subgroup estimates.
est_aggregate <- pop_un20 %>%
  st_set_geometry(NULL) %>%
  group_by(NAME, Race) %>%
  summarise(estimate_aggregate = sum(estimate, na.rm = TRUE)) %>%
  ungroup()

# Attach both aggregates to the spatial data and drop the per-variable
# columns. Note the minus sign on moe_sqrd: without it the helper column
# would be kept instead of removed.
pop_under20 <- pop_un20 %>%
  right_join(moe_aggregate, by = c("NAME", "Race")) %>%
  right_join(est_aggregate, by = c("NAME", "Race")) %>%
  select(-estimate, -moe, -moe_sqrd)
我已经得到了想要的结果:先为 MOE 的平方创建一列,然后按县(NAME)和种族分组求和,再取平方根。但是,有没有办法一步完成这些操作?
您可以使用 mutate 代替 summarise,直接在数据中添加新列。
library(dplyr)

# Add the aggregated MOE and estimate as new columns in a single step.
# The MOE is squared inline, so no pre-computed helper column is needed.
# mutate() keeps one row per original variable; every row of a county/race
# group carries the same aggregated values.
pop_under20 <- pop_un20 %>%
  group_by(NAME, Race) %>%
  mutate(
    moe_aggregate = sqrt(sum(moe^2, na.rm = TRUE)),
    estimate_aggregate = sum(estimate, na.rm = TRUE)
  ) %>%
  # Drop the grouping so later verbs do not silently operate per group.
  ungroup()
tidycensus 有一个函数 moe_sum(),可以为您完成这项计算。将您的代码调整如下:
library(tidycensus)
library(tidyverse)

# ACS variable codes to request; the prefix of each name (w, b, AN, AS, H)
# determines the Race label assigned further down.
age_vars_male <- c(
  w1 = "B01001H_003", w2 = "B01001H_004", w3 = "B01001H_005", w4 = "B01001H_006",
  b1 = "B01001B_003", b2 = "B01001B_004", b3 = "B01001B_005", b4 = "B01001B_006",
  AN1 = "B01001C_003", AN2 = "B01001C_004", AN3 = "B01001C_005", AN4 = "B01001C_006",
  AS1 = "B01001D_003", AS2 = "B01001D_004", AS3 = "B01001D_005", AS4 = "B01001D_006",
  H1 = "B01001I_003", H2 = "B01001I_004", H3 = "B01001I_005", H4 = "B01001I_006"
)

## obtaining variables listed above for MN counties
pop_un20 <- get_acs(
  geography = "county",
  variables = age_vars_male,
  state = "MN"
)

# Label each row with its race group, then collapse to one row per county and
# race: estimates are summed and the MOEs are combined with moe_sum().
pop_un20_grouped <- pop_un20 %>%
  mutate(
    Race = case_when(
      variable %in% c("w1", "w2", "w3", "w4") ~ "White",
      variable %in% c("b1", "b2", "b3", "b4") ~ "Black",
      variable %in% c("AN1", "AN2", "AN3", "AN4") ~ "AI/AN",
      variable %in% c("AS1", "AS2", "AS3", "AS4") ~ "Asian",
      variable %in% c("H1", "H2", "H3", "H4") ~ "Hispanic/Latino"
    )
  ) %>%
  group_by(NAME, Race) %>%
  summarize(
    group_estimate = sum(estimate, na.rm = TRUE),
    group_moe = moe_sum(moe = moe, estimate = estimate, na.rm = TRUE)
  )

# Print the result.
pop_un20_grouped
# A tibble: 435 x 4
# Groups: NAME [87]
NAME Race group_estimate group_moe
<chr> <chr> <dbl> <dbl>
1 Aitkin County, Minnesota AI/AN 70 24.5
2 Aitkin County, Minnesota Asian 3 14.9
3 Aitkin County, Minnesota Black 5 15.1
4 Aitkin County, Minnesota Hispanic/Latino 71 22.6
5 Aitkin County, Minnesota White 1223 59.9
6 Anoka County, Minnesota AI/AN 322 97.9
7 Anoka County, Minnesota Asian 1983 219.
8 Anoka County, Minnesota Black 4015 303.
9 Anoka County, Minnesota Hispanic/Latino 3176 219.
10 Anoka County, Minnesota White 30557 478.
# … with 425 more rows