根据几个条件更改变量的优雅方法？

Question

我正在尝试根据多种条件更改变量“曝光”。

例如：如果stimulus_content为“neg”，条件为“neg”，set为“A”，则对于 stimulus_no 为 X1、X2、... 或 X5 的行，变量“exposure”应更改为“long”。对于 stimulus_no 为 X6、X7、...或 X10 的行，变量“exposure 应更改为”short。依此类推...

希望下面的代码能让问题更清楚。

首先，这是大概的数据集：

n <- 6
dataset <- data.frame(
participant = rep(1:n, each=40),
condition = rep(c("pos","neg"), each=40),
set = rep(c("A","B"), each=40),
stimulus_content = rep(c("pos","neg"), each=2),
stimulus_no = rep(c("X1","X10","X11","X12","X13","X14","X15","X16","X17","X18","X19","X2","X20","X3","X4","X5","X6","X7","X8","X9"), each=2),
exposure = NA)

我们尝试的第一件事是通过循环。为了简单起见，只包含循环的一部分。它不会 return 错误，但也不会执行任何操作。

for (i in 1:length(longdat[,1])){
  if(longdat[i,"stimulus_content"] == "pos") { 
    if(longdat[i,"condition"] == "pos") {
      if(longdat[i,"set"] == "A") {     
        for(stimulus_no in c("X1","X2","X3","X4","X5")){longdat[i,"exposure"] == "long"}
        for(stimulus_no in c("X6","X7","X8","X9","X10")){longdat[i,"exposure"] == "short"}
        for(stimulus_no in c("X11","X12","X13","X14","X15","X16","X17","X18","X19","X20")){longdat[i,"exposure"] == "none"}
      } else { #for condition = pos and set != A            
        for(stimulus_no in c("X11","X12","X13","X14","X15")){longdat[i,"exposure"] == "long"}
        for(stimulus_no in c("X16","X17","X18","X19","X20")){longdat[i,"exposure"] == "short"}
        for(stimulus_no in c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10")){longdat[i,"exposure"] == "none"}
      }
    }
  }
}

接下来，我们尝试通过 mutate 和 case_when。这段代码确实做了它应该做的，但它有将近 100 行长！请在下面找到摘录。

longdat2 <- longdat %>%
  mutate(exposure = case_when(
    # Condition pos, set A
    stimulus_no=="X1" & stimulus_content=="pos" & condition=="pos" & set=="A" ~ "long",
    stimulus_no=="X2" & stimulus_content=="pos" & condition=="pos" & set=="A" ~ "long",
    # ...
    stimulus_no=="X9" & stimulus_content=="pos" & condition=="pos" & set=="A" ~ "short",
    stimulus_no=="X10" & stimulus_content=="pos" & condition=="pos" & set=="A" ~ "short",
    stimulus_no=="X11" & stimulus_content=="pos" & condition=="pos" & set=="A" ~ "none",
    # ... accordingly for condition pos and set B, and for condition neg and set A
    # and eventually for condition neg and set B
    stimulus_no=="X18" & stimulus_content=="neg" & condition=="neg" & set=="B" ~ "short",
    stimulus_no=="X19" & stimulus_content=="neg" & condition=="neg" & set=="B" ~ "short",
    stimulus_no=="X20" & stimulus_content=="neg" & condition=="neg" & set=="B" ~ "short",
  )
)

如果有人设法发现循环中的错误或者可以告诉我第二个（或第一个）选项的更简洁版本，我将不胜感激！

非常感谢！

Answer 1

您可以使用 %in% 运算符和 else 部分的逆条件来简化您的第二个解决方案：

dataset2 <- dataset %>%
  mutate(exposure = case_when(
    # Condition pos, set A
    (stimulus_content=="pos" & condition=="pos" & set=="A") & stimulus_no %in% c("X1","X2","X3","X4","X5") ~ "long",
    (stimulus_content=="pos" & condition=="pos" & set=="A") & stimulus_no %in% c("X6","X7","X8","X9","X10") ~ "short",
    (stimulus_content=="pos" & condition=="pos" & set=="A") & stimulus_no %in% c("X11","X12","X13","X14","X15","X16","X17","X18","X19","X20") ~ "none",
    # else
    !(stimulus_content=="pos" & condition=="pos" & set=="A") & stimulus_no %in% c("X11","X12","X13","X14","X15") ~ "long",
    !(stimulus_content=="pos" & condition=="pos" & set=="A") & stimulus_no %in% c("X16","X17","X18","X19","X20") ~ "short",
    !(stimulus_content=="pos" & condition=="pos" & set=="A") & stimulus_no %in% c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10") ~ "none"
  )
)

编辑

对于循环的解决方案：

dataset3 <- dataset
for (i in 1:length(dataset3[,1])){
  if(dataset3[i,"stimulus_content"] == "pos" & dataset3[i,"condition"] == "pos" & dataset3[i,"set"] == "A") {    
    if(dataset3[i,"stimulus_no"] %in% c("X1","X2","X3","X4","X5")) {dataset3[i,"exposure"] <- "long"}
    if(dataset3[i,"stimulus_no"] %in% c("X6","X7","X8","X9","X10")) {dataset3[i,"exposure"] <- "short"}
    if(dataset3[i,"stimulus_no"] %in% c("X11","X12","X13","X14","X15","X16","X17","X18","X19","X20")){dataset3[i,"exposure"] <- "none"}
  } else {       
    if(dataset3[i,"stimulus_no"] %in% c("X11","X12","X13","X14","X15")) {dataset3[i,"exposure"] <- "long"}
    if(dataset3[i,"stimulus_no"] %in% c("X16","X17","X18","X19","X20")) {dataset3[i,"exposure"] <- "short"}
    if(dataset3[i,"stimulus_no"] %in% c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10")) {dataset3[i,"exposure"] <- "none"}
  }
}

compareDF::compare_df(dataset3, dataset2, rownames)
#> Error in stop_or_warn("The two data frames are the same!", stop_on_error): The two data frames are the same!

并避免循环，就像@g-grothendieck 但更接近您的代码：

dataset4 <- within(dataset, {
  # Condition pos, set A
  exposure[(stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X1","X2","X3","X4","X5")] <- "long"
  exposure[(stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X6","X7","X8","X9","X10")] <- "short"
  exposure[(stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X11","X12","X13","X14","X15","X16","X17","X18","X19","X20")] <- "none"
  
  # else     
  exposure[!(stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X11","X12","X13","X14","X15")] <- "long"
  exposure[!(stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X16","X17","X18","X19","X20")] <- "short"
  exposure[!(stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10")] <- "none"
})

compareDF::compare_df(dataset4, dataset2, rownames)
#> Error in stop_or_warn("The two data frames are the same!", stop_on_error): The two data frames are the same!

或

dataset5 <- within(dataset, {
  # Condition pos, set A
  exposure <- ifelse((stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X1","X2","X3","X4","X5"), "long", exposure)
  exposure <- ifelse((stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X6","X7","X8","X9","X10"), "short", exposure)
  exposure <- ifelse((stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X11","X12","X13","X14","X15","X16","X17","X18","X19","X20"), "none", exposure)
  
  # else     
  exposure <- ifelse(!(stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X11","X12","X13","X14","X15"), "long", exposure)
  exposure <- ifelse(!(stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X16","X17","X18","X19","X20"), "short", exposure)
  exposure <- ifelse(!(stimulus_content == "pos" & condition == "pos" & set == "A") & stimulus_no %in% c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10"), "none", exposure)
})

compareDF::compare_df(dataset5, dataset2, rownames)
#> Error in stop_or_warn("The two data frames are the same!", stop_on_error): The two data frames are the same!

此致，

Answer 2

1) grep 创建一个code，将要匹配的列粘贴在一起，然后用正则表达式匹配得到简洁的表达式。没有使用包。请注意，[^A] 将匹配任何不是 A 的单个字符。如果只有 A 和 B，则可以使用 B。 X1[1-5] 将匹配 X11, ..., X15。 X[6-9]|X10 将匹配 X6, ..., X10。 $ 匹配字符串的结尾。如果要保留 code 列，请省略 code <- NULL 行。

dataset2 <- within(dataset, {
  code <- paste(stimulus_content, condition, set, stimulus_no)
  exposure[grep("pos pos A X[1-5]$", code)] <- "long"
  exposure[grep("pos pos A (X[6-9]|X10)$", code)] <- "short"
  exposure[grep("pos pos A (X1[1-9]|X20)$", code)] <- "none"
  exposure[grep("pos pos [^A] X1[1-5]$", code)] <- "long"
  exposure[grep("pos pos [^A] (X1[6-9]|X20)$", code)] <- "short"
  exposure[grep("pos pos [^A] (X[1-9]|X10)$", code)] <- "none"
  code <- NULL
})

2) Between 另一种方法，同样只使用基数 R，是定义一个 Between 函数来检查它的第一个非数字和数字部分参数分别假设数字部分必须在指定范围内，非数字部分等于第四个参数（默认为 "X" 因此为了简洁起见，我们可以在调用中省略它）。然后使用within如图：

Between <- function(x, lo, hi, alpha = "X") {
  nonno <- gsub("\d", "", x)
  no = as.numeric(gsub("\D", "", x))
  no >= lo & no <= hi & nonno == alpha
}

dataset3 <- within(dataset, {

  cond1 <- stimulus_content == "pos" & condition == "pos" & set == "A"
  exposure[cond1 & Between(stimulus_no, 1, 5)] <- "long"
  exposure[cond1 & Between(stimulus_no, 6, 10)] <- "short"
  exposure[cond1 & Between(stimulus_no, 11, 20)] <- "none"

  cond2 <- stimulus_content == "pos" & condition == "pos" & set != "A"
  exposure[cond2 & Between(stimulus_no, 11, 15)] <- "long"
  exposure[cond2 & Between(stimulus_no, 16, 20)] <- "short"
  exposure[cond2 & Between(stimulus_no, 1, 10)] <- "none"

  cond1 <- cond2 <- NULL
})

根据几个条件更改变量的优雅方法？

Elegant way to change a variable according to several conditions?

loops

for-loop

if-statement

r

case-when