R中数据框中数百个变量的计数、条件和星座变量
Counting, conditionals and constellation variable for hundreds of variables in a data frame in R
我正在处理一个数据集,我需要在其中同时评估数百列,以创建按行计算的新变量。我有三个新变量,一个需要“或”运算符来决定 ~100 列中是否有任何“是”。第二个需要计算变量中我总共有多少个“是”,第三个需要创建一个星座变量,向我显示具有“是”值的变量的名称,所有这些都是按行显示的。我有前两个的代码,但第三个我卡住了。此外,出于示例目的,我仅使用了几个变量,但我需要使用约 100 个变量。我的代码如下:
# Example data - the real data set has ~100 variables like these.
test.data <- data.frame(
  var1 = c("yes", "no", "no", "N/A", NA, NA),
  var2 = c(NA, NA, "yes", "no", "yes", NA),
  var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
  var4 = c("N/A", "yes", "no", "no", "yes", NA)
)

# Code for the first two variables: is.positive and number.pos.
# Every column is listed by hand, so this does not scale to ~100 variables.
library(dplyr) # provides %>%, mutate(), rowwise() and c_across()

final.data <- data.frame(
  test.data %>%
    mutate(
      # `%in%` never returns NA (unlike `==`), so a row that mixes "no" with
      # missing values reaches the inner ifelse() and is scored 0. NA is
      # reserved for rows where every variable is missing (NA or "N/A").
      is.positive = ifelse(
        var1 %in% "yes" | var2 %in% "yes" | var3 %in% "yes" | var4 %in% "yes",
        1,
        ifelse(
          (is.na(var1) | var1 == "N/A") &
            (is.na(var2) | var2 == "N/A") &
            (is.na(var3) | var3 == "N/A") &
            (is.na(var4) | var4 == "N/A"),
          NA,
          0
        )
      )
    ) %>%
    rowwise() %>%
    mutate(
      # Count the "yes" cells in each row; missing cells are ignored.
      number.pos = sum(
        c_across(c(var1, var2, var3, var4)) == "yes",
        na.rm = TRUE
      )
    )
)
您可以先创建一个列表列,存放每一行中取值为“是”(阳性)的变量名,然后再由它派生出其他变量。
library(tidyverse)

test.data <- data.frame(
  var1 = c("yes", "no", "no", "N/A", NA, NA),
  var2 = c(NA, NA, "yes", "no", "yes", NA),
  var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
  var4 = c("N/A", "yes", "no", "no", "yes", NA)
)

# Names of the columns to evaluate. This is the only place the column range
# is spelled out, so scaling to ~100 variables means editing this selection.
nv <- test.data %>%
  select(var1:var4) %>%
  names()

out <- test.data %>%
  rowwise() %>%
  mutate(
    # which() drops NA comparisons, so missing cells never count as "yes".
    which_pos = list(nv[which(c_across(all_of(nv)) == "yes")]),
    # In a row-wise mutate, which_pos is this row's character vector.
    num.positive = length(which_pos),
    is.positive = num.positive > 0
  ) %>%
  # Drop the row-wise grouping so later verbs work on whole columns again.
  ungroup()
out
#> # A tibble: 6 × 7
#> var1 var2 var3 var4 which_pos num.positive is.positive
#> <chr> <chr> <chr> <chr> <list> <int> <lgl>
#> 1 yes <NA> yes N/A <chr [2]> 2 TRUE
#> 2 no <NA> yes yes <chr [2]> 2 TRUE
#> 3 no yes yes no <chr [2]> 2 TRUE
#> 4 N/A no no no <chr [0]> 0 FALSE
#> 5 <NA> yes yes yes <chr [3]> 3 TRUE
#> 6 <NA> <NA> N/A <NA> <chr [0]> 0 FALSE
out$which_pos
#> [[1]]
#> [1] "var1" "var3"
#>
#> [[2]]
#> [1] "var3" "var4"
#>
#> [[3]]
#> [1] "var2" "var3"
#>
#> [[4]]
#> character(0)
#>
#> [[5]]
#> [1] "var2" "var3" "var4"
#>
#> [[6]]
#> character(0)
由 reprex package (v2.0.1)
于 2022-05-26 创建
如果您想用一个普通列(而不是列表列)来标识哪些变量为阳性,只需把变量名粘贴在一起,生成一个以逗号分隔的名称字符串:
library(tidyverse)

test.data <- data.frame(
  var1 = c("yes", "no", "no", "N/A", NA, NA),
  var2 = c(NA, NA, "yes", "no", "yes", NA),
  var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
  var4 = c("N/A", "yes", "no", "no", "yes", NA)
)

# Names of the columns to evaluate. This is the only place the column range
# is spelled out, so scaling to ~100 variables means editing this selection.
nv <- test.data %>%
  select(var1:var4) %>%
  names()

out <- test.data %>%
  rowwise() %>%
  mutate(
    # Comma-separated names of the "yes" columns; "" when the row has none.
    which_pos = paste(
      nv[which(c_across(all_of(nv)) == "yes")],
      collapse = ","
    ),
    # na.rm = TRUE keeps missing cells from turning the count into NA.
    num.positive = sum(c_across(all_of(nv)) == "yes", na.rm = TRUE),
    is.positive = num.positive > 0
  ) %>%
  # Drop the row-wise grouping so later verbs work on whole columns again.
  ungroup()
out
#> # A tibble: 6 × 7
#> var1 var2 var3 var4 which_pos num.positive is.positive
#> <chr> <chr> <chr> <chr> <chr> <int> <lgl>
#> 1 yes <NA> yes N/A "var1,var3" 2 TRUE
#> 2 no <NA> yes yes "var3,var4" 2 TRUE
#> 3 no yes yes no "var2,var3" 2 TRUE
#> 4 N/A no no no "" 0 FALSE
#> 5 <NA> yes yes yes "var2,var3,var4" 3 TRUE
#> 6 <NA> <NA> N/A <NA> "" 0 FALSE
由 reprex package (v2.0.1)
于 2022-05-26 创建
列表列在后续分析中可能更便于使用(如果需要的话),而以逗号分隔的变量可能更便于目视检查。
使用基础 R:
# Same example data as above, repeated so this snippet runs on its own.
test.data <- data.frame(
  var1 = c("yes", "no", "no", "N/A", NA, NA),
  var2 = c(NA, NA, "yes", "no", "yes", NA),
  var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
  var4 = c("N/A", "yes", "no", "no", "yes", NA)
)

# Treat the literal "N/A" like a real missing value.
is.na(test.data) <- test.data == "N/A"
# Capture the source column names before any derived column is appended.
vars <- names(test.data)
# Logical matrix: TRUE where a cell is "yes", NA where the cell is missing.
idx <- test.data == "yes"
test.data["num.positive"] <- rowSums(idx, na.rm = TRUE)
# Unary plus turns the logical into 0/1.
test.data["is.positive"] <- +(test.data[["num.positive"]] > 0)
# (row, col) position of every "yes"; which() skips the NA cells.
idx2 <- data.frame(which(idx, arr.ind = TRUE))
# Group the positive variable names by row. split() returns an empty list
# when nothing is "yes", where aggregate() stops with "no rows to aggregate".
pos <- split(vars[idx2$col], idx2$row)
df1 <- data.frame(
  row = as.integer(names(pos)),
  col = vapply(pos, paste, character(1), collapse = "-", USE.NAMES = FALSE)
)
# Left join on `row`, so rows without any "yes" get NA in `col`.
df2 <- merge(
  cbind(test.data, row = seq_len(nrow(test.data))),
  df1,
  all.x = TRUE
)
df2
row var1 var2 var3 var4 num.positive is.positive col
1 1 yes <NA> yes <NA> 2 1 var1-var3
2 2 no <NA> yes yes 2 1 var3-var4
3 3 no yes yes no 2 1 var2-var3
4 4 <NA> no no no 0 0 <NA>
5 5 <NA> yes yes yes 3 1 var2-var3-var4
6 6 <NA> <NA> <NA> <NA> 0 0 <NA>
我正在处理一个数据集,我需要在其中同时评估数百列,以创建按行计算的新变量。我有三个新变量,一个需要“或”运算符来决定 ~100 列中是否有任何“是”。第二个需要计算变量中我总共有多少个“是”,第三个需要创建一个星座变量,向我显示具有“是”值的变量的名称,所有这些都是按行显示的。我有前两个的代码,但第三个我卡住了。此外,出于示例目的,我仅使用了几个变量,但我需要使用约 100 个变量。我的代码如下:
# Example data - the real data set has ~100 variables like these.
test.data <- data.frame(
  var1 = c("yes", "no", "no", "N/A", NA, NA),
  var2 = c(NA, NA, "yes", "no", "yes", NA),
  var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
  var4 = c("N/A", "yes", "no", "no", "yes", NA)
)

# Code for the first two variables: is.positive and number.pos.
# Every column is listed by hand, so this does not scale to ~100 variables.
library(dplyr) # provides %>%, mutate(), rowwise() and c_across()

final.data <- data.frame(
  test.data %>%
    mutate(
      # `%in%` never returns NA (unlike `==`), so a row that mixes "no" with
      # missing values reaches the inner ifelse() and is scored 0. NA is
      # reserved for rows where every variable is missing (NA or "N/A").
      is.positive = ifelse(
        var1 %in% "yes" | var2 %in% "yes" | var3 %in% "yes" | var4 %in% "yes",
        1,
        ifelse(
          (is.na(var1) | var1 == "N/A") &
            (is.na(var2) | var2 == "N/A") &
            (is.na(var3) | var3 == "N/A") &
            (is.na(var4) | var4 == "N/A"),
          NA,
          0
        )
      )
    ) %>%
    rowwise() %>%
    mutate(
      # Count the "yes" cells in each row; missing cells are ignored.
      number.pos = sum(
        c_across(c(var1, var2, var3, var4)) == "yes",
        na.rm = TRUE
      )
    )
)
您可以先创建一个列表列,存放每一行中取值为“是”(阳性)的变量名,然后再由它派生出其他变量。
library(tidyverse)

test.data <- data.frame(
  var1 = c("yes", "no", "no", "N/A", NA, NA),
  var2 = c(NA, NA, "yes", "no", "yes", NA),
  var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
  var4 = c("N/A", "yes", "no", "no", "yes", NA)
)

# Names of the columns to evaluate. This is the only place the column range
# is spelled out, so scaling to ~100 variables means editing this selection.
nv <- test.data %>%
  select(var1:var4) %>%
  names()

out <- test.data %>%
  rowwise() %>%
  mutate(
    # which() drops NA comparisons, so missing cells never count as "yes".
    which_pos = list(nv[which(c_across(all_of(nv)) == "yes")]),
    # In a row-wise mutate, which_pos is this row's character vector.
    num.positive = length(which_pos),
    is.positive = num.positive > 0
  ) %>%
  # Drop the row-wise grouping so later verbs work on whole columns again.
  ungroup()
out
#> # A tibble: 6 × 7
#> var1 var2 var3 var4 which_pos num.positive is.positive
#> <chr> <chr> <chr> <chr> <list> <int> <lgl>
#> 1 yes <NA> yes N/A <chr [2]> 2 TRUE
#> 2 no <NA> yes yes <chr [2]> 2 TRUE
#> 3 no yes yes no <chr [2]> 2 TRUE
#> 4 N/A no no no <chr [0]> 0 FALSE
#> 5 <NA> yes yes yes <chr [3]> 3 TRUE
#> 6 <NA> <NA> N/A <NA> <chr [0]> 0 FALSE
out$which_pos
#> [[1]]
#> [1] "var1" "var3"
#>
#> [[2]]
#> [1] "var3" "var4"
#>
#> [[3]]
#> [1] "var2" "var3"
#>
#> [[4]]
#> character(0)
#>
#> [[5]]
#> [1] "var2" "var3" "var4"
#>
#> [[6]]
#> character(0)
由 reprex package (v2.0.1)
于 2022-05-26 创建
如果您想用一个普通列(而不是列表列)来标识哪些变量为阳性,只需把变量名粘贴在一起,生成一个以逗号分隔的名称字符串:
library(tidyverse)

test.data <- data.frame(
  var1 = c("yes", "no", "no", "N/A", NA, NA),
  var2 = c(NA, NA, "yes", "no", "yes", NA),
  var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
  var4 = c("N/A", "yes", "no", "no", "yes", NA)
)

# Names of the columns to evaluate. This is the only place the column range
# is spelled out, so scaling to ~100 variables means editing this selection.
nv <- test.data %>%
  select(var1:var4) %>%
  names()

out <- test.data %>%
  rowwise() %>%
  mutate(
    # Comma-separated names of the "yes" columns; "" when the row has none.
    which_pos = paste(
      nv[which(c_across(all_of(nv)) == "yes")],
      collapse = ","
    ),
    # na.rm = TRUE keeps missing cells from turning the count into NA.
    num.positive = sum(c_across(all_of(nv)) == "yes", na.rm = TRUE),
    is.positive = num.positive > 0
  ) %>%
  # Drop the row-wise grouping so later verbs work on whole columns again.
  ungroup()
out
#> # A tibble: 6 × 7
#> var1 var2 var3 var4 which_pos num.positive is.positive
#> <chr> <chr> <chr> <chr> <chr> <int> <lgl>
#> 1 yes <NA> yes N/A "var1,var3" 2 TRUE
#> 2 no <NA> yes yes "var3,var4" 2 TRUE
#> 3 no yes yes no "var2,var3" 2 TRUE
#> 4 N/A no no no "" 0 FALSE
#> 5 <NA> yes yes yes "var2,var3,var4" 3 TRUE
#> 6 <NA> <NA> N/A <NA> "" 0 FALSE
由 reprex package (v2.0.1)
于 2022-05-26 创建
列表列在后续分析中可能更便于使用(如果需要的话),而以逗号分隔的变量可能更便于目视检查。
使用基础 R:
# Same example data as above, repeated so this snippet runs on its own.
test.data <- data.frame(
  var1 = c("yes", "no", "no", "N/A", NA, NA),
  var2 = c(NA, NA, "yes", "no", "yes", NA),
  var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
  var4 = c("N/A", "yes", "no", "no", "yes", NA)
)

# Treat the literal "N/A" like a real missing value.
is.na(test.data) <- test.data == "N/A"
# Capture the source column names before any derived column is appended.
vars <- names(test.data)
# Logical matrix: TRUE where a cell is "yes", NA where the cell is missing.
idx <- test.data == "yes"
test.data["num.positive"] <- rowSums(idx, na.rm = TRUE)
# Unary plus turns the logical into 0/1.
test.data["is.positive"] <- +(test.data[["num.positive"]] > 0)
# (row, col) position of every "yes"; which() skips the NA cells.
idx2 <- data.frame(which(idx, arr.ind = TRUE))
# Group the positive variable names by row. split() returns an empty list
# when nothing is "yes", where aggregate() stops with "no rows to aggregate".
pos <- split(vars[idx2$col], idx2$row)
df1 <- data.frame(
  row = as.integer(names(pos)),
  col = vapply(pos, paste, character(1), collapse = "-", USE.NAMES = FALSE)
)
# Left join on `row`, so rows without any "yes" get NA in `col`.
df2 <- merge(
  cbind(test.data, row = seq_len(nrow(test.data))),
  df1,
  all.x = TRUE
)
df2
row var1 var2 var3 var4 num.positive is.positive col
1 1 yes <NA> yes <NA> 2 1 var1-var3
2 2 no <NA> yes yes 2 1 var3-var4
3 3 no yes yes no 2 1 var2-var3
4 4 <NA> no no no 0 0 <NA>
5 5 <NA> yes yes yes 3 1 var2-var3-var4
6 6 <NA> <NA> <NA> <NA> 0 0 <NA>