根据列名中的条件子字符串计算列值

Compute for column values based on conditional substrings in the column names

我有一个包含数百列的数据框。 仅出于示例目的,我将展示一个玩具数据框。

TPT_A_2 | TPT_B_2 | TPT_C_2 | TPT_A_4 | TPT_B_4 | TPT_C_4 | TPT_A_6 | TPT_B_6 | TPT_C_6 | 
 100        100       100       200       200      200       400       400        400   

我想计算那些初始子字符串与名称(TPT_A、TPT_B..)相同且以 2 和 4 结尾的变量的平均值。 所以我会得到类似的东西:

TPT_A_mean | TPT_B_mean | TPT_C_mean | TPT_A_6 | TPT_B_6 | TPT_C_6 | 
  150           150          150         400      400        400  

此数据为:

row1 <- c("TPT_A_2", "TPT_B_2", "TPT_C_2","TPT_A_4", "TPT_B_4", "TPT_C_4", "TPT_A_6", "TPT_B_6", "TPT_C_6")
row2 <- c(100, 100, 100, 200, 200, 200, 400, 40, 400)   
data <- as.data.frame(rbind(row1, row2))
colnames(data) <- as.character(data[1,])
data <- data[-1,]

一个选项是使用 tidyr 中的数据透视函数使数据变长并在 tidyverse 中从那里开始工作:

library(tidyverse) # some prefer to call only the needed packages instead of the whole tidyverse

data %>%
    # make the data long
    tidyr::pivot_longer(1:last_col()) %>% 
    # cut the before column names to desired length and check for 2 or 4 to paste mean else 6
    dplyr::mutate(grp = paste0(stringr::str_sub(name, 1, 5),
                               "_",
                               ifelse(stringr::str_detect(name, pattern = "2|4"), "mean", "6"))) %>% 
    # build groupings
    dplyr::group_by(grp) %>% 
    # caluclate mean
    dplyr::summarise(means = mean(as.numeric(value))) %>%
    # make table wide again
    tidyr::pivot_wider(names_from = "grp", values_from = "means")

# A tibble: 1 x 6
  TPT_A_6 TPT_A_mean TPT_B_6 TPT_B_mean TPT_C_6 TPT_C_mean
    <dbl>      <dbl>   <dbl>      <dbl>   <dbl>      <dbl>
1     400        150      40        150     400        150

您的样本数据中有一个小错字,因此 TPT_B_6 列是 40 而不是 400

首先,您生成帧的方法是一种反模式,导致您的数字被转换为字符串。

str(dat)
# 'data.frame': 1 obs. of  9 variables:
#  $ TPT_A_2: chr "100"
#  $ TPT_B_2: chr "100"
#  $ TPT_C_2: chr "100"
#  $ TPT_A_4: chr "200"
#  $ TPT_B_4: chr "200"
#  $ TPT_C_4: chr "200"
#  $ TPT_A_6: chr "400"
#  $ TPT_B_6: chr "40"
#  $ TPT_C_6: chr "400"

在这种情况下,我们可以使用:

row1 <- c("TPT_A_2", "TPT_B_2", "TPT_C_2","TPT_A_4", "TPT_B_4", "TPT_C_4", "TPT_A_6", "TPT_B_6", "TPT_C_6")
row2 <- c(100, 100, 100, 200, 200, 200, 400, 40, 400)   
dat <- as.data.frame(setNames(as.list(row2),row1))
str(dat)
# 'data.frame': 1 obs. of  9 variables:
#  $ TPT_A_2: num 100
#  $ TPT_B_2: num 100
#  $ TPT_C_2: num 100
#  $ TPT_A_4: num 200
#  $ TPT_B_4: num 200
#  $ TPT_C_4: num 200
#  $ TPT_A_6: num 400
#  $ TPT_B_6: num 40
#  $ TPT_C_6: num 400

从这里...

基础 R

dat2a <- subset(dat, select = grepl("TPT_[ABC]_[24]", colnames(dat)))
dat2b <- subset(dat, select = !grepl("TPT_[ABC]_[24]", colnames(dat)))
cbind(
  dat2b, 
  lapply(split.default(dat2a, gsub("_[24]$", "", colnames(dat2a))),
         function(z) mean(unlist(z)))
)
#   TPT_A_6 TPT_B_6 TPT_C_6 TPT_A TPT_B TPT_C
# 1     400      40     400   150   150   150

dplyr

library(dplyr)
library(purrr) # imap
dat %>%
  split.default(., gsub("_[24]$", "",  colnames(.))) %>%
  imap(., function(x, nm)  {
    if (ncol(x) > 1) {
      setNames(data.frame(mean(unlist(x))), paste0(nm, "_mean"))
    } else x
  }) %>%
  bind_cols()
#   TPT_A_mean TPT_A_6 TPT_B_mean TPT_B_6 TPT_C_mean TPT_C_6
# 1        150     400        150      40        150     400

另一个可能的解决方案:

library(tidyverse)

row1 <- c("TPT_A_2", "TPT_B_2", "TPT_C_2","TPT_A_4", "TPT_B_4", "TPT_C_4", "TPT_A_6", "TPT_B_6", "TPT_C_6")
row2 <- c(100, 100, 100, 200, 200, 200, 400, 400, 400)   
data <- as.data.frame(rbind(row1, row2))
colnames(data) <- as.character(data[1,])
data <- data[-1,]

data %>% 
  pivot_longer(everything()) %>%
  mutate(value = as.numeric(value)) %>% 
  mutate(aux = if_else(str_detect(name, "2$|4$"), 1, 0),
         name1 = str_extract(name, "TPT_[A-Z]")) %>% 
  group_by(name1, aux) %>% 
  summarise(
    avg = if_else(aux == 0, value, mean(value)), 
    names2 = if_else(aux == 0, name, str_c(name1, "_mean")),
    .groups = "drop") %>% 
  distinct %>% select(-name1,-aux) %>%  
  pivot_wider(names_from = names2, values_from = avg) %>% 
  select(ends_with("mean"), matches("\d$"))

#> # A tibble: 1 × 6
#>   TPT_A_mean TPT_B_mean TPT_C_mean TPT_A_6 TPT_B_6 TPT_C_6
#>        <dbl>      <dbl>      <dbl>   <dbl>   <dbl>   <dbl>
#> 1        150        150        150     400     400     400