在 R 中仅将某个字段里具有特定字符串特征(后缀)的部分值转换为宽格式
Pivot only some values with distinct string characteristics from a field to a wider format R
我有一个结构如下的数据框:
country variable value
1 ARG variable_1 0.2340
2 ARG variable_1_se 0.0063
3 ARG variable_1_cv 0.0008
4 ARG variable_2 0.5320
5 ARG variable_2_se 0.0023
6 ARG variable_2_cv 0.0004
7 BOL variable_1 0.3240
8 BOL variable_1_se 0.0013
9 BOL variable_1_cv 0.0004
10 BOL variable_2 0.6380
11 BOL variable_2_se 0.0053
12 BOL variable_2_cv 0.0009
我想将 se 和 cv 值拉出到更宽的格式,因此它看起来像这样:
country variable value se cv
1 ARG variable_1 0.234 0.0063 8e-04
2 ARG variable_2 0.532 0.0023 4e-04
3 BOL variable_1 0.324 0.0013 4e-04
4 BOL variable_2 0.638 0.0053 9e-04
我正在使用 tidyverse 中的 pivot_wider(),但遇到了以下几个问题:
- 我无法仅提取 SE 和 CV 值,而将变量保留在原位。
- 我有很多变量与相关的 SE 和 CV 值,所以我想要一些允许我指定 ends_with("_se") 或 "_cv" 的东西。
这里是重现 dfs 的代码:
# Long-format input: for every country, each variable has one estimate row
# plus a "_se" and a "_cv" row.
df_original <- data.frame(
  country = rep(c("ARG", "BOL"), each = 6L),
  variable = rep(
    c("variable_1", "variable_1_se", "variable_1_cv",
      "variable_2", "variable_2_se", "variable_2_cv"),
    times = 2L
  ),
  value = c(.234, .0063, .0008, .532, .0023, .0004,
            0.324, .0013, .0004, .638, .0053, .0009),
  stringsAsFactors = FALSE
)

# Target wide format: one row per country/variable, with the estimate and its
# se / cv side by side.
df_desired <- data.frame(
  country = rep(c("ARG", "BOL"), each = 2L),
  variable = rep(c("variable_1", "variable_2"), times = 2L),
  value = c(.234, .532, .324, .638),
  se = c(.0063, .0023, .0013, .0053),
  cv = c(.0008, .0004, .0004, .0009),
  stringsAsFactors = FALSE
)
我们可以先提取子字符串,然后再执行 pivot_wider:
library(dplyr)
library(stringr)
library(tidyr)
# Split each `variable` into its base name (e.g. "variable_1") and its
# statistic suffix ("se" / "cv"), carry the base row's value alongside every
# row of the group, then spread the suffix rows into columns.
df_original %>%
  # Backslashes must be doubled inside an R string literal; "\w" on its own
  # is a parse error.
  mutate(colnm = str_extract(variable, "^\\w+_\\d+")) %>%
  group_by(country, colnm) %>%
  # The estimate is the row whose full name equals the base name.
  mutate(value2 = value[variable == colnm]) %>%
  # Drop the base row by name rather than by position, so the result does not
  # depend on the base row being first within its group.
  filter(variable != colnm) %>%
  ungroup() %>%
  # Keep only the suffix ("se" / "cv") as the future column name.
  mutate(variable = str_remove(variable, "^\\w+_\\d+_")) %>%
  pivot_wider(names_from = variable, values_from = value) %>%
  rename(variable = colnm, value = value2)
-输出
# A tibble: 4 × 5
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009
或者使用 tidyr 中的 extract():
# Use tidyr::extract() to split `variable` into the base name and an optional
# statistic suffix in one step, then widen the suffix rows.
df_original %>%
  # Capture group 1 is the base name, group 2 the (possibly empty) suffix.
  # Backslashes are doubled because this is an R string literal.
  tidyr::extract(variable, into = c("variable", "colnm"),
                 "^(\\w+_\\d+)_?([a-z]*)$") %>%
  group_by(country, variable) %>%
  # Keep the original values in `value2` and broadcast the estimate (the row
  # with an empty suffix) over the whole group as `value`.
  mutate(value2 = value, value = value[!nzchar(colnm)]) %>%
  ungroup() %>%
  # Only suffix rows remain to be spread into columns.
  filter(nzchar(colnm)) %>%
  pivot_wider(names_from = colnm, values_from = value2)
另一个解决方案:
# Give the estimate rows an explicit "_value" suffix so every name has three
# "_"-separated pieces, split them, and widen on the suffix.
df_original %>%
  # "\\d+$" (escaped for an R string) also covers multi-digit indices such as
  # "variable_10".
  mutate(variable = ifelse(grepl("variable_\\d+$", variable),
                           paste(variable, "value", sep = "_"),
                           variable)) %>%
  separate(variable, c("variable", "num", "measure"), sep = "_") %>%
  pivot_wider(names_from = measure, values_from = value) %>%
  # Rebuild the base name from its prefix and index.
  mutate(variable = paste(variable, num, sep = "_")) %>%
  select(-num)
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009
包
library(dplyr)
library(tidyr)
解决方案
# Split `variable` on "_" into prefix, index and an optional statistic suffix,
# rebuild the base name, and spread the statistic labels into columns.
df_original %>%
  # Estimate rows have only two pieces; fill = "right" pads the missing suffix
  # with NA instead of emitting a "missing pieces" warning.
  separate(variable, sep = "_", into = c("name", "variable", "se_cv"),
           fill = "right") %>%
  mutate(variable = paste0(name, "_", variable),
         # A missing suffix marks the row that holds the estimate itself.
         se_cv = case_when(is.na(se_cv) ~ "value",
                           TRUE ~ se_cv)) %>%
  select(-name) %>%
  pivot_wider(names_from = se_cv,
              values_from = value)
输出
# A tibble: 4 x 5
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009
有趣的问题!这是另一种方法:
1. 仅筛选 variable 列中形如 variable_1、variable_2 …… 的行来创建 df_variable,并在其中创建标识符 row
2. 接下来在原数据框中用 case_when 定义并筛选出 se 和 cv
3. 分组并添加标识符 row
4. 执行 pivot_wider
5. 最后执行 right_join
library(tidyverse)
# Estimate rows only (names ending in "_<digits>"); `row` is the numeric index
# parsed from the name and serves as the join key below.
# The input is `df_original`; a bare `df` would resolve to stats::df.
df_variable <- df_original %>%
  filter(str_detect(variable, "\\w+_\\d+$")) %>%
  mutate(row = parse_number(variable))

# Collapse the statistic rows to their suffix, number them within each
# country/statistic, widen, and attach the estimates.
df_original %>%
  # Patterns are anchored so only a trailing "_se" / "_cv" is relabelled.
  mutate(variable = case_when(str_detect(variable, "_se$") ~ "se",
                              str_detect(variable, "_cv$") ~ "cv",
                              TRUE ~ variable)) %>%
  # Exact match: a substring test would also keep any base name that merely
  # contains "se" or "cv".
  filter(variable %in% c("se", "cv")) %>%
  group_by(country, variable) %>%
  # NOTE(review): row_number() matches df_variable$row only if the statistic
  # rows appear in ascending index order starting at 1 within each country.
  mutate(row = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  ) %>%
  right_join(df_variable, by = c("country", "row")) %>%
  select(-row)
country se cv variable value
<chr> <dbl> <dbl> <chr> <dbl>
1 ARG 0.0063 0.0008 variable_1 0.234
2 ARG 0.0023 0.0004 variable_2 0.532
3 BOL 0.0013 0.0004 variable_1 0.324
4 BOL 0.0053 0.0009 variable_2 0.638
# Split `variable` at the "_" that follows the numeric index, label estimate
# rows as "value", and widen.
df_original %>%
  # The lookbehind keeps the digit in the base name; the backslash is doubled
  # because this is an R string literal. fill = "right" leaves NA in `name`
  # for estimate rows, which have no suffix.
  separate(variable, c("variable", "name"), "(?<=\\d)_", fill = "right") %>%
  mutate(name = replace_na(name, "value")) %>%
  # Spelled out for readability; these are pivot_wider()'s defaults.
  pivot_wider(names_from = name, values_from = value)
# A tibble: 4 x 5
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009
我有一个结构如下的数据框:
country variable value
1 ARG variable_1 0.2340
2 ARG variable_1_se 0.0063
3 ARG variable_1_cv 0.0008
4 ARG variable_2 0.5320
5 ARG variable_2_se 0.0023
6 ARG variable_2_cv 0.0004
7 BOL variable_1 0.3240
8 BOL variable_1_se 0.0013
9 BOL variable_1_cv 0.0004
10 BOL variable_2 0.6380
11 BOL variable_2_se 0.0053
12 BOL variable_2_cv 0.0009
我想将 se 和 cv 值拉出到更宽的格式,因此它看起来像这样:
country variable value se cv
1 ARG variable_1 0.234 0.0063 8e-04
2 ARG variable_2 0.532 0.0023 4e-04
3 BOL variable_1 0.324 0.0013 4e-04
4 BOL variable_2 0.638 0.0053 9e-04
我正在使用 tidyverse 中的 pivot_wider(),但遇到了以下几个问题:
- 我无法仅提取 SE 和 CV 值,而将变量保留在原位。
- 我有很多变量与相关的 SE 和 CV 值,所以我想要一些允许我指定 ends_with("_se") 或 "_cv" 的东西。
这里是重现 dfs 的代码:
# Long-format input: for every country, each variable has one estimate row
# plus a "_se" and a "_cv" row.
df_original <- data.frame(
  country = rep(c("ARG", "BOL"), each = 6L),
  variable = rep(
    c("variable_1", "variable_1_se", "variable_1_cv",
      "variable_2", "variable_2_se", "variable_2_cv"),
    times = 2L
  ),
  value = c(.234, .0063, .0008, .532, .0023, .0004,
            0.324, .0013, .0004, .638, .0053, .0009),
  stringsAsFactors = FALSE
)

# Target wide format: one row per country/variable, with the estimate and its
# se / cv side by side.
df_desired <- data.frame(
  country = rep(c("ARG", "BOL"), each = 2L),
  variable = rep(c("variable_1", "variable_2"), times = 2L),
  value = c(.234, .532, .324, .638),
  se = c(.0063, .0023, .0013, .0053),
  cv = c(.0008, .0004, .0004, .0009),
  stringsAsFactors = FALSE
)
我们可以先提取子字符串,然后再执行 pivot_wider:
library(dplyr)
library(stringr)
library(tidyr)
# Split each `variable` into its base name (e.g. "variable_1") and its
# statistic suffix ("se" / "cv"), carry the base row's value alongside every
# row of the group, then spread the suffix rows into columns.
df_original %>%
  # Backslashes must be doubled inside an R string literal; "\w" on its own
  # is a parse error.
  mutate(colnm = str_extract(variable, "^\\w+_\\d+")) %>%
  group_by(country, colnm) %>%
  # The estimate is the row whose full name equals the base name.
  mutate(value2 = value[variable == colnm]) %>%
  # Drop the base row by name rather than by position, so the result does not
  # depend on the base row being first within its group.
  filter(variable != colnm) %>%
  ungroup() %>%
  # Keep only the suffix ("se" / "cv") as the future column name.
  mutate(variable = str_remove(variable, "^\\w+_\\d+_")) %>%
  pivot_wider(names_from = variable, values_from = value) %>%
  rename(variable = colnm, value = value2)
-输出
# A tibble: 4 × 5
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009
或者使用 tidyr 中的 extract():
# Use tidyr::extract() to split `variable` into the base name and an optional
# statistic suffix in one step, then widen the suffix rows.
df_original %>%
  # Capture group 1 is the base name, group 2 the (possibly empty) suffix.
  # Backslashes are doubled because this is an R string literal.
  tidyr::extract(variable, into = c("variable", "colnm"),
                 "^(\\w+_\\d+)_?([a-z]*)$") %>%
  group_by(country, variable) %>%
  # Keep the original values in `value2` and broadcast the estimate (the row
  # with an empty suffix) over the whole group as `value`.
  mutate(value2 = value, value = value[!nzchar(colnm)]) %>%
  ungroup() %>%
  # Only suffix rows remain to be spread into columns.
  filter(nzchar(colnm)) %>%
  pivot_wider(names_from = colnm, values_from = value2)
另一个解决方案:
# Give the estimate rows an explicit "_value" suffix so every name has three
# "_"-separated pieces, split them, and widen on the suffix.
df_original %>%
  # "\\d+$" (escaped for an R string) also covers multi-digit indices such as
  # "variable_10".
  mutate(variable = ifelse(grepl("variable_\\d+$", variable),
                           paste(variable, "value", sep = "_"),
                           variable)) %>%
  separate(variable, c("variable", "num", "measure"), sep = "_") %>%
  pivot_wider(names_from = measure, values_from = value) %>%
  # Rebuild the base name from its prefix and index.
  mutate(variable = paste(variable, num, sep = "_")) %>%
  select(-num)
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009
包
library(dplyr)
library(tidyr)
解决方案
# Split `variable` on "_" into prefix, index and an optional statistic suffix,
# rebuild the base name, and spread the statistic labels into columns.
df_original %>%
  # Estimate rows have only two pieces; fill = "right" pads the missing suffix
  # with NA instead of emitting a "missing pieces" warning.
  separate(variable, sep = "_", into = c("name", "variable", "se_cv"),
           fill = "right") %>%
  mutate(variable = paste0(name, "_", variable),
         # A missing suffix marks the row that holds the estimate itself.
         se_cv = case_when(is.na(se_cv) ~ "value",
                           TRUE ~ se_cv)) %>%
  select(-name) %>%
  pivot_wider(names_from = se_cv,
              values_from = value)
输出
# A tibble: 4 x 5
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009
有趣的问题!这是另一种方法:
1. 仅筛选 variable 列中形如 variable_1、variable_2 …… 的行来创建 df_variable,并在其中创建标识符 row
2. 接下来在原数据框中用 case_when 定义并筛选出 se 和 cv
3. 分组并添加标识符 row
4. 执行 pivot_wider
5. 最后执行 right_join
library(tidyverse)
# Estimate rows only (names ending in "_<digits>"); `row` is the numeric index
# parsed from the name and serves as the join key below.
# The input is `df_original`; a bare `df` would resolve to stats::df.
df_variable <- df_original %>%
  filter(str_detect(variable, "\\w+_\\d+$")) %>%
  mutate(row = parse_number(variable))

# Collapse the statistic rows to their suffix, number them within each
# country/statistic, widen, and attach the estimates.
df_original %>%
  # Patterns are anchored so only a trailing "_se" / "_cv" is relabelled.
  mutate(variable = case_when(str_detect(variable, "_se$") ~ "se",
                              str_detect(variable, "_cv$") ~ "cv",
                              TRUE ~ variable)) %>%
  # Exact match: a substring test would also keep any base name that merely
  # contains "se" or "cv".
  filter(variable %in% c("se", "cv")) %>%
  group_by(country, variable) %>%
  # NOTE(review): row_number() matches df_variable$row only if the statistic
  # rows appear in ascending index order starting at 1 within each country.
  mutate(row = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  ) %>%
  right_join(df_variable, by = c("country", "row")) %>%
  select(-row)
country se cv variable value
<chr> <dbl> <dbl> <chr> <dbl>
1 ARG 0.0063 0.0008 variable_1 0.234
2 ARG 0.0023 0.0004 variable_2 0.532
3 BOL 0.0013 0.0004 variable_1 0.324
4 BOL 0.0053 0.0009 variable_2 0.638
# Split `variable` at the "_" that follows the numeric index, label estimate
# rows as "value", and widen.
df_original %>%
  # The lookbehind keeps the digit in the base name; the backslash is doubled
  # because this is an R string literal. fill = "right" leaves NA in `name`
  # for estimate rows, which have no suffix.
  separate(variable, c("variable", "name"), "(?<=\\d)_", fill = "right") %>%
  mutate(name = replace_na(name, "value")) %>%
  # Spelled out for readability; these are pivot_wider()'s defaults.
  pivot_wider(names_from = name, values_from = value)
# A tibble: 4 x 5
country variable value se cv
<chr> <chr> <dbl> <dbl> <dbl>
1 ARG variable_1 0.234 0.0063 0.0008
2 ARG variable_2 0.532 0.0023 0.0004
3 BOL variable_1 0.324 0.0013 0.0004
4 BOL variable_2 0.638 0.0053 0.0009