R:更复杂的 pivot_longer 参数用法

R: harder pivot_longer arguments

我有一个数据框,其列名如下:


# One-row example of the wide layout: `state`, `county` and `kids_count`,
# plus kfr_<race>_pooled_<percentile> columns and their `_se` counterparts.
# Each value line below sits under the header line in the same position.
df <- tribble(
  ~state, ~county,
  ~kfr_asian_pooled_p25, ~kfr_asian_pooled_p75,
  ~kfr_black_pooled_p25, ~kfr_black_pooled_p75,
  ~kfr_pooled_pooled_p25, ~kfr_pooled_pooled_p75,
  ~kfr_white_pooled_p25, ~kfr_white_pooled_p75,
  ~kids_count,
  ~kfr_asian_pooled_p25_se, ~kfr_asian_pooled_p75_se,
  ~kfr_black_pooled_p25_se, ~kfr_black_pooled_p75_se,
  ~kfr_white_pooled_p25_se, ~kfr_white_pooled_p75_se,
  1, 2,
  3, 4,
  5, 6,
  7, 8,
  9, 10,
  11,
  12, 13,
  14, 15,
  16, 17
)

当然,实际的数据集非常大,但我想做的是充分发挥 pivot_longer 的潜力。也就是说,一次性选取多个变量,并把它们的值连同列名一起转换成长格式。

# Sketch of the desired long layout: one row per race / percentile / se
# combination, with the measurement in a single `value` column.
# NOTE(review): `value` is a bare symbol that is never assigned in this
# snippet, so this looks like an illustrative placeholder rather than code
# meant to be run; it also reuses the name `df` — confirm before executing.
df <- tribble(~ state, ~ county, ~ kids_count,   ~ race, ~ percentile,   ~ se, ~ value,
                     1,       2,            3,  "asian",        "p25",   TRUE,   value,
                     2,       3,            4,  "black",        "p25",   TRUE,   value,
                     3,       4,            5,  "white",        "p25",   TRUE,   value,
                     1,       2,            3,  "asian",        "p75",   TRUE,   value,
                     2,       3,            4,  "black",        "p75",   TRUE,   value,
                     3,       4,            5,  "white",        "p75",   TRUE,   value,
                     1,       2,            3,  "asian",        "p25",  FALSE,   value,
                     2,       3,            4,  "black",        "p25",  FALSE,   value,
                     3,       4,            5,  "white",        "p25",  FALSE,   value,
                     1,       2,            3,  "asian",        "p75",  FALSE,   value,
                     2,       3,            4,  "black",        "p75",  FALSE,   value,
                     3,       4,            5,  "white",        "p75",  FALSE,   value)

非常感谢任何帮助!

如果我没有理解错你的需求,你可以这样做:

(首先,快速解释一下我使用的 pivot_longer() 选项)

  • cols 选择您要转换(pivot)的列
  • names_to 包含多个元素时,可以把原始列名拆分到多个新列中
  • names_prefix 用于删除待转换列名中共有的前缀
  • names_sep 用于指明原始列名应按什么分隔符拆分成 names_to 中的多个名称
# Reshape every kfr_* column to long form: strip the "kfr_" prefix, then split
# what is left of each column name on "_" into race / pooled / percentile / se.
df %>%
  pivot_longer(
    cols = starts_with("kfr"),
    names_prefix = "kfr_",
    names_sep = "_",
    names_to = c("race", "pooled", "percentile", "se")
  ) %>%
  # The constant "pooled" piece is not needed in the result.
  select(!pooled) %>%
  # Turn the `se` piece into a flag: TRUE where it is present, FALSE where NA.
  mutate(se = !is.na(se))


# A tibble: 14 x 7
   state county kids_count race   percentile se    value
   <dbl>  <dbl>      <dbl> <chr>  <chr>      <lgl> <dbl>
 1     1      2         11 asian  p25        FALSE     3
 2     1      2         11 asian  p75        FALSE     4
 3     1      2         11 black  p25        FALSE     5
 4     1      2         11 black  p75        FALSE     6
 5     1      2         11 pooled p25        FALSE     7
 6     1      2         11 pooled p75        FALSE     8
 7     1      2         11 white  p25        FALSE     9
 8     1      2         11 white  p75        FALSE    10
 9     1      2         11 asian  p25        TRUE     12
10     1      2         11 asian  p75        TRUE     13
11     1      2         11 black  p25        TRUE     14
12     1      2         11 black  p75        TRUE     15
13     1      2         11 white  p25        TRUE     16
14     1      2         11 white  p75        TRUE     17

看起来您想只对百分位列使用 pivot_longer,然后拆分列名来创建单独的列(如果我理解正确的话)。tidyselect 函数(例如 matches)对于选择名称相似的列非常有用。我还使用了 stringr 包中的 str_split 来帮助创建新的字符列。如果列的顺序对您很重要,您可以随时使用 select 或 relocate 重新排列各列。

library(tidyverse)
library(stringr)

# Pivot only the percentile columns (with or without a trailing "_se"), then
# derive `se`, `race` and `percentile` from the pieces of the old column name.
df_long <-
  df %>%
  # The pattern is anchored on purpose: "[p25]|[p75]" is a pair of character
  # classes and would select any column whose name merely contains "p", "2",
  # "5" or "7", not just the *_p25 / *_p75 columns.
  pivot_longer(
    cols = matches("_p(25|75)(_se)?$"),
    names_to = "percentile"
  ) %>%
  mutate(
    # Only a trailing "_se" marks a standard-error column; an unanchored "se"
    # would also fire on names that happen to contain those letters elsewhere.
    se = grepl("_se$", percentile),
    # Names look like kfr_<race>_pooled_<percentile>[_se]: piece 2 is the race
    # and piece 4 the percentile. `percentile` is overwritten last because the
    # two expressions above still need the full original name.
    race = stringr::str_split(percentile, "_", simplify = TRUE)[, 2],
    percentile = stringr::str_split(percentile, "_", simplify = TRUE)[, 4]
  )

df_long
#> # A tibble: 14 x 7
#>    state county kids_count percentile value se    race  
#>    <dbl>  <dbl>      <dbl> <chr>      <dbl> <lgl> <chr> 
#>  1     1      2         11 p25            3 FALSE asian 
#>  2     1      2         11 p75            4 FALSE asian 
#>  3     1      2         11 p25            5 FALSE black 
#>  4     1      2         11 p75            6 FALSE black 
#>  5     1      2         11 p25            7 FALSE pooled
#>  6     1      2         11 p75            8 FALSE pooled
#>  7     1      2         11 p25            9 FALSE white 
#>  8     1      2         11 p75           10 FALSE white 
#>  9     1      2         11 p25           12 TRUE  asian 
#> 10     1      2         11 p75           13 TRUE  asian 
#> 11     1      2         11 p25           14 TRUE  black 
#> 12     1      2         11 p75           15 TRUE  black 
#> 13     1      2         11 p25           16 TRUE  white 
#> 14     1      2         11 p75           17 TRUE  white

由 reprex 包 (v2.0.1) 于 2022-02-18 创建

另一种做法是使用 separate():

library(tidyverse)

# Pivot every non-id column into name/value pairs, then split each former
# column name on "_". Pieces 1 and 3 are dropped by the NA entries in `into`.
df %>%
  pivot_longer(-c("state", "county", "kids_count")) %>%
  separate(
    name,
    into = c(NA, "race", NA, "percentile", "se"),
    sep = "_",
    # Names without a trailing "_se" have only four pieces. fill = "right"
    # pads the missing fifth piece with NA explicitly instead of warning.
    fill = "right"
  ) %>%
  # `se` is NA exactly where the name had no "_se" suffix.
  mutate(se = !is.na(se))

# # A tibble: 14 x 7
#   state county kids_count race   percentile se    value
#    <dbl>  <dbl>      <dbl> <chr>  <chr>      <lgl> <dbl>
#  1     1      2         11 asian  p25        FALSE     3
#  2     1      2         11 asian  p75        FALSE     4
#  3     1      2         11 black  p25        FALSE     5
#  4     1      2         11 black  p75        FALSE     6
#  5     1      2         11 pooled p25        FALSE     7
#  6     1      2         11 pooled p75        FALSE     8
#  7     1      2         11 white  p25        FALSE     9
#  8     1      2         11 white  p75        FALSE    10
#  9     1      2         11 asian  p25        TRUE     12
# 10     1      2         11 asian  p75        TRUE     13
# 11     1      2         11 black  p25        TRUE     14
# 12     1      2         11 black  p75        TRUE     15
# 13     1      2         11 white  p25        TRUE     16
# 14     1      2         11 white  p75        TRUE     17

我们可以将 tidyr 的 pivot_longer 与 splitstackshape 的 cSplit 结合使用:

library(splitstackshape)
library(dplyr)
library(tidyr)

# Go long on everything except the id columns, let cSplit() break `name` on
# "_" into numbered name_<i> columns, then keep and rename the useful pieces.
df %>%
  pivot_longer(cols = -c(state, county, kids_count)) %>%
  cSplit(splitCols = "name", sep = "_") %>%
  select(
    state,
    county,
    kids_count,
    race = name_2,
    percentile = name_4,
    se = name_5,
    value
  )
    state county kids_count   race percentile   se value
 1:     1      2         11  asian        p25 <NA>     3
 2:     1      2         11  asian        p75 <NA>     4
 3:     1      2         11  black        p25 <NA>     5
 4:     1      2         11  black        p75 <NA>     6
 5:     1      2         11 pooled        p25 <NA>     7
 6:     1      2         11 pooled        p75 <NA>     8
 7:     1      2         11  white        p25 <NA>     9
 8:     1      2         11  white        p75 <NA>    10
 9:     1      2         11  asian        p25   se    12
10:     1      2         11  asian        p75   se    13
11:     1      2         11  black        p25   se    14
12:     1      2         11  black        p75   se    15
13:     1      2         11  white        p25   se    16
14:     1      2         11  white        p75   se    17