在 R 中仅将字段里具有特定字符串特征的部分值透视(pivot)为更宽的格式

Pivot only some values with distinct string characteristics from a field to a wider format R

我有一个结构如下的数据框:

   country      variable  value
1      ARG    variable_1 0.2340
2      ARG variable_1_se 0.0063
3      ARG variable_1_cv 0.0008
4      ARG    variable_2 0.5320
5      ARG variable_2_se 0.0023
6      ARG variable_2_cv 0.0004
7      BOL    variable_1 0.3240
8      BOL variable_1_se 0.0013
9      BOL variable_1_cv 0.0004
10     BOL    variable_2 0.6380
11     BOL variable_2_se 0.0053
12     BOL variable_2_cv 0.0009

我想将 se 和 cv 值拉出到更宽的格式,因此它看起来像这样:

  country   variable value     se    cv
1     ARG variable_1 0.234 0.0063 8e-04
2     ARG variable_2 0.532 0.0023 4e-04
3     BOL variable_1 0.324 0.0013 4e-04
4     BOL variable_2 0.638 0.0053 9e-04

我正在使用 tidyverse 中的 pivot_wider(),但我正在努力解决一些问题:

  1. 我无法仅提取 SE 和 CV 值,而将变量保留在原位。
  2. 我有很多带有相应 SE 和 CV 值的变量,所以我想要一种允许我按 ends_with("_se") 或 "_cv" 这类后缀来指定的方法。

这里是重现 dfs 的代码:

  # Long-format input: for each country, every variable_<n> contributes three
  # rows -- the estimate itself plus its "_se" and "_cv" companions.
  df_original <- data.frame(
    country = rep(c("ARG", "BOL"), each = 6L),
    variable = rep(
      c("variable_1", "variable_1_se", "variable_1_cv",
        "variable_2", "variable_2_se", "variable_2_cv"),
      times = 2L
    ),
    value = c(0.234, 0.0063, 0.0008, 0.532, 0.0023, 0.0004,
              0.324, 0.0013, 0.0004, 0.638, 0.0053, 0.0009),
    stringsAsFactors = FALSE
  )

  # Target wide layout: one row per country/variable, with the estimate in
  # `value` and its companions spread into the `se` and `cv` columns.
  df_desired <- data.frame(
    country = rep(c("ARG", "BOL"), each = 2L),
    variable = rep(c("variable_1", "variable_2"), times = 2L),
    value = c(0.234, 0.532, 0.324, 0.638),
    se = c(0.0063, 0.0023, 0.0013, 0.0053),
    cv = c(0.0008, 0.0004, 0.0004, 0.0009),
    stringsAsFactors = FALSE
  )
  

我们可以先用 str_extract 提取子字符串,然后再执行 pivot_wider

library(dplyr)
library(stringr)
library(tidyr)
# Pivot the "_se"/"_cv" rows into their own columns while keeping the base
# estimate in `value`.
# NOTE: backslashes must be doubled inside R string literals ("\\w", "\\d");
# a single "\w" is an unrecognized escape and fails at parse time.
df_original %>%
  # colnm = base variable name shared by the estimate and its se/cv rows
  mutate(colnm = str_extract(variable, "^\\w+_\\d+")) %>%
  group_by(country, colnm) %>%
  # copy the base estimate onto every row of its country/variable group
  mutate(value2 = value[variable == colnm]) %>%
  # drop the base row itself; matching by name instead of position keeps this
  # correct even when the base row is not the first row of its group
  filter(variable != colnm) %>%
  ungroup() %>%
  # strip the base-name prefix so only the suffix ("se"/"cv") remains
  mutate(variable = str_remove(variable, "^\\w+_\\d+_")) %>%
  pivot_wider(names_from = variable, values_from = value) %>%
  rename(variable = colnm, value = value2)

-输出

# A tibble: 4 × 5
  country variable   value     se     cv
  <chr>   <chr>      <dbl>  <dbl>  <dbl>
1 ARG     variable_1 0.234 0.0063 0.0008
2 ARG     variable_2 0.532 0.0023 0.0004
3 BOL     variable_1 0.324 0.0013 0.0004
4 BOL     variable_2 0.638 0.0053 0.0009

或使用 tidyr 中的 extract:

# Split `variable` into the base name and an optional suffix ("se"/"cv"; empty
# for the base estimate), then spread the suffix rows into columns.
# NOTE: regex backslashes are doubled ("\\w", "\\d") because a single "\w"
# is not a valid escape in an R string literal.
df_original %>%
  tidyr::extract(
    variable,
    into = c("variable", "colnm"),
    regex = "^(\\w+_\\d+)_?([a-z]*)$"
  ) %>%
  group_by(country, variable) %>%
  # value2 keeps each row's own number; value is overwritten with the group's
  # base estimate (the row whose suffix is empty)
  mutate(value2 = value, value = value[!nzchar(colnm)]) %>%
  ungroup() %>%
  # keep only the se/cv rows; the base estimate now lives in `value`
  filter(nzchar(colnm)) %>%
  pivot_wider(names_from = colnm, values_from = value2)

另一个解决方案:

# Give the base rows an explicit "_value" suffix so every name has the shape
# <name>_<num>_<measure>, split on "_", pivot on the measure, then rebuild the
# variable name.
# NOTE: the regex backslash is doubled ("\\d") as required in R string
# literals; "\\d+" also covers indices with more than one digit.
df_original %>%
  mutate(
    variable = ifelse(
      grepl("variable_\\d+$", variable),
      paste(variable, "value", sep = "_"),
      variable
    )
  ) %>%
  separate(variable, c("variable", "num", "measure"), sep = "_") %>%
  pivot_wider(names_from = measure, values_from = value) %>%
  mutate(variable = paste(variable, num, sep = "_")) %>%
  select(-num)

  country variable   value     se     cv
  <chr>   <chr>      <dbl>  <dbl>  <dbl>
1 ARG     variable_1 0.234 0.0063 0.0008
2 ARG     variable_2 0.532 0.0023 0.0004
3 BOL     variable_1 0.324 0.0013 0.0004
4 BOL     variable_2 0.638 0.0053 0.0009

library(dplyr)
library(tidyr)

解决方案

# Split `variable` on "_" into up to three parts; base rows have only two
# parts, so their third part (se_cv) is NA and gets relabelled "value" before
# pivoting.
df_original %>%
  separate(
    variable,
    sep = "_",
    into = c("name", "variable", "se_cv"),
    # base rows have no third piece: fill with NA on the right without the
    # "missing pieces" warning
    fill = "right"
  ) %>%
  mutate(
    variable = paste0(name, "_", variable),
    se_cv = case_when(
      is.na(se_cv) ~ "value",
      TRUE ~ se_cv  # TRUE rather than T: T is an ordinary, reassignable variable
    )
  ) %>%
  select(-name) %>%
  pivot_wider(names_from = se_cv, values_from = value)

输出

# A tibble: 4 x 5
  country variable   value     se     cv
  <chr>   <chr>      <dbl>  <dbl>  <dbl>
1 ARG     variable_1 0.234 0.0063 0.0008
2 ARG     variable_2 0.532 0.0023 0.0004
3 BOL     variable_1 0.324 0.0013 0.0004
4 BOL     variable_2 0.638 0.0053 0.0009

有趣的问题: 这是另一种方法:

  1. 我们仅筛选 variable 列中形如 variable_1、variable_2... 的行来创建 df_variable
  2. 在这里创建一个标识符 row
  3. 接下来我们在原始数据框中用 case_when 定义 se 和 cv,并过滤出这些行
  4. 分组并添加标识符 row
  5. pivot_wider
  6. 最后是 right_join

library(tidyverse)

# Base estimates only (names ending in _<digits>), tagged with their numeric
# index so the se/cv rows can be joined back on.
# NOTE: the data frame in this document is `df_original`; a bare `df` would
# resolve to the stats::df function. Regex backslashes are doubled as required
# in R string literals.
df_variable <- df_original %>%
  filter(str_detect(variable, "\\w+_\\d+$")) %>%
  mutate(row = parse_number(variable))

# se/cv rows: collapse the names to "se"/"cv", number them within each
# country/measure, pivot wide, and join the base estimates back by that number.
# NOTE(review): row_number() is positional -- this assumes the se/cv rows of
# each country appear in the same order as variable_1, variable_2, ...
df_original %>%
  mutate(
    variable = case_when(
      str_detect(variable, "_se") ~ "se",
      str_detect(variable, "_cv") ~ "cv",
      TRUE ~ variable
    )
  ) %>%
  filter(str_detect(variable, "se|cv")) %>%
  group_by(country, variable) %>%
  mutate(row = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  ) %>%
  right_join(df_variable, by = c("country", "row")) %>%
  select(-row)
  country     se     cv variable   value
  <chr>    <dbl>  <dbl> <chr>      <dbl>
1 ARG     0.0063 0.0008 variable_1 0.234
2 ARG     0.0023 0.0004 variable_2 0.532
3 BOL     0.0013 0.0004 variable_1 0.324
4 BOL     0.0053 0.0009 variable_2 0.638
# Split `variable` at the underscore that follows a digit, so "variable_1_se"
# becomes ("variable_1", "se") and "variable_1" keeps an NA suffix, which is
# then labelled "value" before pivoting.
# NOTE: the lookbehind needs a doubled backslash ("\\d") in an R string
# literal; a single "\d" is an unrecognized escape and fails at parse time.
df_original %>%
  separate(variable, c("variable", "name"), sep = "(?<=\\d)_", fill = "right") %>%
  mutate(name = replace_na(name, "value")) %>%
  # pivot_wider() defaults: names_from = name, values_from = value
  pivot_wider()

# A tibble: 4 x 5
  country variable   value     se     cv
  <chr>   <chr>      <dbl>  <dbl>  <dbl>
1 ARG     variable_1 0.234 0.0063 0.0008
2 ARG     variable_2 0.532 0.0023 0.0004
3 BOL     variable_1 0.324 0.0013 0.0004
4 BOL     variable_2 0.638 0.0053 0.0009