在数据子集上使用 fct_collapse

Using fct_collapse on a subset of data

我正在尝试建立一个预测模型。我的特征之一是美国各州和领地的标识符。原始列表有 62 个唯一值,我可以使用 fct_collapse 将它们减少到 5 个值:
# One row per state / territory identifier (62 unique values).
dat <- tibble(
  state = c(
    "AA", "AE", "AK", "AL", "AP", "AR", "AS", "AZ",
    "CA", "CO", "CT", "DC", "DE", "FL", "FM", "GA",
    "GU", "HI", "IA", "ID", "IL", "IN", "KS", "KY",
    "LA", "MA", "MD", "ME", "MH", "MI", "MN", "MO",
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM",
    "None", "NV", "NY", "OH", "OK", "OR", "PA", "PR",
    "RI", "SC", "SD", "TN", "TX",
    "UNITED STATES MINOR OUTLYING ISLANDS", "UT",
    "VA", "VI", "VT", "WA", "WI", "WV", "WY"
  )
)

# Collapse the 62 identifiers into five census-region levels.
dat[["census_region"]] <- fct_collapse(
  dat[["state"]],
  northeast = c("CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA"),
  midwest = c(
    "IL", "IN", "MI", "OH", "WI", "IA", "KS", "MN", "MO", "NE", "ND", "SD"
  ),
  south = c(
    "DE", "FL", "GA", "MD", "NC", "SC", "VA", "DC", "WV", "AL", "KY", "MS",
    "TN", "AR", "LA", "OK", "TX"
  ),
  west = c(
    "AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY", "AK", "CA", "HI", "OR",
    "WA"
  ),
  other = c(
    "AA", "AE", "AP", "AS", "FM", "GU", "MH", "None", "PR",
    "UNITED STATES MINOR OUTLYING ISLANDS", "VI"
  )
)

tail(dat, 10)

# A tibble: 10 x 2

state census_region
TX south
UNITED STATES MINOR OUTLYING ISLANDS other
UT west
VA south
VI other
VT northeast
WA west
WI midwest
WV south
WY west

我现在正在尝试验证模型,但较小的数据集并不包含全部 62 个唯一的州标识符:

# Smaller validation set: only 26 of the 62 identifiers are present.
dat_2 <- tibble(
  state = c(
    "ID", "IL", "IN", "KS", "KY",
    "LA", "MA", "MD", "ME", "MH", "MI", "MN", "MO",
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM",
    "None", "NV", "NY", "OH", "OK"
  )
)

现在,如果我尝试在较小的数据集上使用 fct_collapse:

# Same five-region mapping as before, applied to the smaller data set.
# Many of the listed identifiers do not occur in `dat_2$state`, which is
# what makes this call emit the "Unknown levels" warning.
dat_2[["census_region"]] <- fct_collapse(
  dat_2[["state"]],
  northeast = c("CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA"),
  midwest = c(
    "IL", "IN", "MI", "OH", "WI", "IA", "KS", "MN", "MO", "NE", "ND", "SD"
  ),
  south = c(
    "DE", "FL", "GA", "MD", "NC", "SC", "VA", "DC", "WV", "AL", "KY", "MS",
    "TN", "AR", "LA", "OK", "TX"
  ),
  west = c(
    "AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY", "AK", "CA", "HI", "OR",
    "WA"
  ),
  other = c(
    "AA", "AE", "AP", "AS", "FM", "GU", "MH", "None", "PR",
    "UNITED STATES MINOR OUTLYING ISLANDS", "VI"
  )
)

我得到了以下警告:

Warning message: Unknown levels in `f`: CT, RI, VT, PA, WI, IA, SD, DE, FL, GA, SC, VA, DC, WV, AL, TN, AR, TX, AZ, CO, UT, WY, AK, CA, HI, OR, WA, AA, AE, AP, AS, FM, GU, PR, UNITED STATES MINOR OUTLYING ISLANDS, VI

我还做过类似的事情:按照美国行政管理和预算局(OMB)的定义,将各州和领地划分到以罗马数字编号的区域中。我的目标是将 62 个虚拟变量减少到更易于管理的数量。

问题:forcats 包中(更具体地说是 fct_collapse)是否有一个选项,可以只对数据中实际存在的值进行分配,并跳过“未知级别”(unknown levels)?

您可以考虑以不同的方式解决这个问题,只需按照下面的方法 dat_2 |> left_join(dat) 即可。

这会从 dat 中取出与较小样本中的 state 相匹配的 census_region,并且该列仍然是一个因子。

library(tidyverse)

# Full data set: one row per state / territory identifier (62 values).
dat <- tibble(
  state = c(
    "AA", "AE", "AK", "AL", "AP", "AR", "AS", "AZ",
    "CA", "CO", "CT", "DC", "DE", "FL", "FM", "GA",
    "GU", "HI", "IA", "ID", "IL", "IN", "KS", "KY",
    "LA", "MA", "MD", "ME", "MH", "MI", "MN", "MO",
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM",
    "None", "NV", "NY", "OH", "OK", "OR", "PA", "PR",
    "RI", "SC", "SD", "TN", "TX",
    "UNITED STATES MINOR OUTLYING ISLANDS", "UT",
    "VA", "VI", "VT", "WA", "WI", "WV", "WY"
  )
)

# Collapse the identifiers into five census-region levels. Every listed
# identifier occurs in `dat$state`, so no "Unknown levels" warning is raised.
dat$census_region <- fct_collapse(
  dat$state,
  northeast = c("CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA"),
  midwest = c(
    "IL", "IN", "MI", "OH", "WI", "IA", "KS", "MN", "MO", "NE", "ND", "SD"
  ),
  south = c(
    "DE", "FL", "GA", "MD", "NC", "SC", "VA", "DC", "WV", "AL", "KY", "MS",
    "TN", "AR", "LA", "OK", "TX"
  ),
  west = c(
    "AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY", "AK", "CA", "HI", "OR",
    "WA"
  ),
  other = c(
    "AA", "AE", "AP", "AS", "FM", "GU", "MH", "None", "PR",
    "UNITED STATES MINOR OUTLYING ISLANDS", "VI"
  )
)

# Smaller validation set: only a subset of the identifiers is present.
dat_2 <- tibble(
  state = c(
    "ID", "IL", "IN", "KS", "KY",
    "LA", "MA", "MD", "ME", "MH", "MI", "MN", "MO",
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM",
    "None", "NV", "NY", "OH", "OK"
  )
)

# Look the region up in `dat` instead of calling fct_collapse() again, so
# identifiers that are absent from `dat_2` are simply never referenced.
# The join key is named explicitly: this avoids the "Joining, by" message
# and keeps the join from matching on any other columns the two tables
# might share.
dat_2 |> left_join(dat, by = "state")
#> # A tibble: 26 × 2
#>    state census_region
#>    <chr> <fct>        
#>  1 ID    west         
#>  2 IL    midwest      
#>  3 IN    midwest      
#>  4 KS    midwest      
#>  5 KY    south        
#>  6 LA    south        
#>  7 MA    northeast    
#>  8 MD    south        
#>  9 ME    northeast    
#> 10 MH    other        
#> # … with 16 more rows

由 reprex 包 (v2.0.1) 于 2022-05-19 创建