有没有办法用 dplyr (tidyverse) 遍历我的数据集,找出以相同后缀结尾的列,然后只保留其中一列?

Is there a way to dplyr (tidyverse) map my dataset, find columns ending with the same suffix then keep just one?

假设我开发了两种不同的测量工具(a3 和 a4)来评估某种心理属性,例如沟通技巧。a3 是该工具的第一个版本,共有 6 个项目(1、2、3、4、5、6)。a4 是 a3 的更新版本,其中只开发了部分项目,用来替换 a3 版本中对应的项目。因此,虽然我的数据集始终包含 a3 的项目 1 到 6,但 a4 版本中我只能有项目 1、2 和 5。

因为我有这两种测量工具,所以我按 domain_version_item(领域_版本_项目)的格式给各列命名。因此,我有一个这样的数据集:

> asq_online %>% 
+   names
 [1] "id"           "age_interval" "com_a3_1"     "com_a3_2"     "com_a3_3"    
 [6] "com_a3_4"     "com_a3_5"     "com_a3_6"     "com_a4_1"     "com_a4_2"

如果我只想返回 a3 版本,没问题:

+   select(starts_with("com_a3")) %>% 
+   names
[1] "com_a3_1" "com_a3_2" "com_a3_3" "com_a3_4" "com_a3_5" "com_a3_6"
> 

如果我的目标只是返回 a4 版本,也没问题:

> asq_online %>% 
+   select(starts_with("com_a4")) %>% 
+   names
[1] "com_a4_1" "com_a4_2"

所以,既然我知道 a4 版本是用来替换 a3 版本中的项目 1 和 2 的,我最终应该得到由 com_a4_1、com_a4_2、com_a3_3、com_a3_4、com_a3_5、com_a3_6 组成的数据集。

我正在努力构建这种语法。我看到我必须比较具有相同后缀的项目,然后将其替换为这样的打印:

"com_a4_1" "com_a4_2" "com_a3_3" "com_a3_4" "com_a3_5" "com_a3_6"

这是我的数据集:

# Example data: five respondents, the six a3 items and the two a4 items
# (items 1 and 2) that are available in the newer version.
asq_online <- structure(
  list(
    id           = c(1, 2, 3, 4, 5),
    age_interval = c(12, 12, 12, 12, 12),
    com_a3_1     = c(0, 0, 5, 0, 10),
    com_a3_2     = c(5, 0, 10, 0, 5),
    com_a3_3     = c(10, 10, 10, 0, 5),
    com_a3_4     = c(5, 0, 0, 10, 10),
    com_a3_5     = c(5, 5, 0, 10, 10),
    com_a3_6     = c(5, 10, 0, 5, 5),
    com_a4_1     = c(10, 5, 5, 10, 10),
    com_a4_2     = c(10, 5, 0, 0, 10)
  ),
  row.names = c(NA, -5L),
  class = c("tbl_df", "tbl", "data.frame")
)

如果我遗漏了任何信息,请告诉我。我想留在 tidyverse 环境中。 谢谢。

为了最有效地使用 tidyverse,在将数据转换为长格式后对行进行操作通常会更容易(而不是编写一些逻辑或循环来帮助选择列)

这是我的解决方案:首先用 pivot_longer 把数据整理成长格式;然后按 id、age_interval、domain、item 进行 group_by,再在 mutate 中使用 max 和 if_else 找出每个 id-age_interval-domain-item 组中最高版本对应的值;接着用 filter 删除来自过时版本的行。根据您接下来要做什么,您可以保留长格式,也可以用 pivot_wider 转回原来的宽格式。

library(tidyverse)

# Example data from the question: two id columns followed by the item columns,
# named <domain>_<version>_<item>.
asq_online <- structure(
  list(
    id = c(1, 2, 3, 4, 5),
    age_interval = c(12, 12, 12, 12, 12),
    com_a3_1 = c(0, 0, 5, 0, 10),
    com_a3_2 = c(5, 0, 10, 0, 5),
    com_a3_3 = c(10, 10, 10, 0, 5),
    com_a3_4 = c(5, 0, 0, 10, 10),
    com_a3_5 = c(5, 5, 0, 10, 10),
    com_a3_6 = c(5, 10, 0, 5, 5),
    com_a4_1 = c(10, 5, 5, 10, 10),
    com_a4_2 = c(10, 5, 0, 0, 10)
  ),
  row.names = c(NA, -5L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Reshape to long format: one row per id x domain x version x item.
# Column names such as "com_a3_1" are split on "_" into domain/version/item.
asq_online_long <- asq_online %>%
  pivot_longer(
    cols = -(1:2), # every column except the first two (id, age_interval)
    names_to = c("domain", "version", "item"),
    names_sep = "_"
  ) %>%
  group_by(id, age_interval, domain, item) %>%
  # `version` is character, so max() compares strings ("a4" > "a3").
  mutate(latest_version_available = max(version)) %>%
  # Keep the value only on the row that belongs to the latest version;
  # rows from older versions get NA and can be filtered out later.
  mutate(
    value_from_latest_version = if_else(
      version == latest_version_available, value, NA_real_
    )
  )
# Print the result
asq_online_long
#> # A tibble: 40 x 8
#> # Groups:   id, age_interval, domain, item [30]
#>       id age_interval domain version item  value latest_version_~
#>    <dbl>        <dbl> <chr>  <chr>   <chr> <dbl> <chr>           
#>  1     1           12 com    a3      1         0 a4              
#>  2     1           12 com    a3      2         5 a4              
#>  3     1           12 com    a3      3        10 a3              
#>  4     1           12 com    a3      4         5 a3              
#>  5     1           12 com    a3      5         5 a3              
#>  6     1           12 com    a3      6         5 a3              
#>  7     1           12 com    a4      1        10 a4              
#>  8     1           12 com    a4      2        10 a4              
#>  9     2           12 com    a3      1         0 a4              
#> 10     2           12 com    a3      2         0 a4              
#> # ... with 30 more rows, and 1 more variable:
#> #   value_from_latest_version <dbl>

# if you need it back in original format
# Drop the rows that come from outdated versions, then spread the remaining
# values back into one column per domain_version_item.
asq_online_latest_versions_wide <- asq_online_long %>%
  filter(!is.na(value_from_latest_version)) %>%
  # Discard the helper column and the raw value, then let the
  # latest-version value take over the name `value`.
  select(-c(latest_version_available, value)) %>%
  rename(value = value_from_latest_version) %>%
  pivot_wider(
    names_from = c("domain", "version", "item"),
    values_from = value
  )
# Print the result
asq_online_latest_versions_wide
#> # A tibble: 5 x 8
#> # Groups:   id, age_interval [5]
#>      id age_interval com_a3_3 com_a3_4 com_a3_5 com_a3_6 com_a4_1 com_a4_2
#>   <dbl>        <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
#> 1     1           12       10        5        5        5       10       10
#> 2     2           12       10        0        5       10        5        5
#> 3     3           12       10        0        0        0        5        0
#> 4     4           12        0       10       10        5       10        0
#> 5     5           12        5       10       10        5       10       10

由 reprex package (v0.3.0) 于 2019-12-01 创建
# Group the columns by their trailing item number and keep one column per item.
# Columns whose name has no trailing number (id, age_interval) fall into an NA
# group, which split.default() drops, so only item columns are returned.
# Note: the regex backslash must be doubled ("\\d") inside an R string.
asq_online %>%
  split.default(str_extract(names(.), "\\d+$")) %>%
  # Within a group the columns keep their original order (a3 before a4), so the
  # last column is the newest version; a group with a single column is kept
  # unchanged. Assumes versions appear in ascending order in the data.
  map_dfc(~ .x[ncol(.x)])
# A tibble: 5 x 6
  com_a4_1 com_a4_2 com_a3_3 com_a3_4 com_a3_5 com_a3_6
     <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
1       10       10       10        5        5        5
2        5        5       10        0        5       10
3        5        0       10        0        0        0
4       10        0        0       10       10        5
5       10       10        5       10       10        5

另一种使用 tidyverse 的方法:

# Get the names of the a4 columns (the newer items that replace a3 items)
a4_cols <- asq_online %>%
  select(starts_with("com_a4")) %>%
  colnames()
# Extract the item number at the end of each a4 column name.
# The backslash must be doubled ("\\d") inside an R string.
a4_items <- as.numeric(stringr::str_extract(a4_cols, "\\d+$"))
# Keep only the item numbers that may replace an a3 item (1, 2 or 5).
# %in% is used instead of == so vectors of different lengths are never
# recycled against each other and no NA index can appear.
replaced_items <- a4_items[a4_items %in% c(1, 2, 5)]
# Item k of a3 sits at column position k + 2, because the first two columns
# (id, age_interval) come before com_a3_1 and are always kept.
remov <- replaced_items + 2
# Remove the superseded a3 columns
asq_online %>% select(-all_of(remov))

输出:

# A tibble: 5 x 8
     id age_interval com_a3_3 com_a3_4 com_a3_5 com_a3_6 com_a4_1 com_a4_2
  <dbl>        <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
1     1           12       10        5        5        5       10       10
2     2           12       10        0        5       10        5        5
3     3           12       10        0        0        0        5        0
4     4           12        0       10       10        5       10        0
5     5           12        5       10       10        5       10       10