如何按行统计字符串值的个数并计算百分比(仅考虑非空列)
How to count string values row-wise and compute their percentage (taking only non-empty columns into account)
对于以下数据框:
# Example data: a label column `test` plus one character column per year.
# Cells hold "true", "false", or "" (empty string).
df <- structure(
  list(
    test = c("A", "B", "C"),
    `2019` = c("true", "", "false"),
    `2020` = c("false", "true", "true"),
    `2021` = c("true", "false", "true"),
    `2022` = c("", "false", "false")
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)
输出:
  test  2019  2020  2021  2022
1    A  true false  true
2    B        true false false
3    C false  true  true false
我需要统计从 2019 列到 2022 列中 true 的数量,然后计算 true 在非 NA(或非空)值中所占的百分比:
预期的结果可能是这样的:
  test  2019  2020  2021  2022 true_pct
1    A  true false  true           0.67  # 2/3
2    B        true false false     0.33  # 1/3
3    C false  true  true false     0.50  # 2/4
下面的代码返回的结果是错误的: 因为数据中有空字符串,它把所有年份的列都计入了分母。我需要一个过滤掩码,例如 !is.na(df[,2:5]) | df[,2:5] != '':
# Number of "true" cells in each row (all columns except the first).
df$count <- rowSums(df[-1] == "true")
# Number of non-NA cells in columns 2:5. Empty strings ("") are not NA, so
# they are still counted here; the output below shows not_na = 4 in every row.
df$not_na <- rowSums(!is.na(df[,2:5]))
# df$not_na <- rowSums(!complete.cases(df[,2:5]))
# Share of "true" among the cells counted in not_na.
df$true_pct <- df$count/df$not_na
输出:
  test  2019  2020  2021  2022 count not_na true_pct
1    A  true false  true           2      4     0.50
2    B        true false false     1      4     0.25
3    C false  true  true false     2      4     0.50
请分享更好的解决方案,谢谢。
更新: 对于以下数据集,代码似乎没有返回正确的结果:
# Example data with an additional `2018` column that contains NA values.
df <- structure(list(test = c("A", "B", "C"), `2018` = c("true", NA, NA
), `2019` = c("true", "", "false"), `2020` = c("false", "true",
"true"), `2021` = c("true", "false", "true"), `2022` = c("",
"false", "false")), class = "data.frame", row.names = c(NA, -3L
))
# All columns except the first (`test`).
tmp <- df[-1]
# "true" count per row divided by the count of cells that are neither "" nor NA.
# The numerator's rowSums() is called without na.rm, so a row that contains NA
# yields NA (rows B and C with this data).
df$true_pct <- rowSums(tmp == "true")/rowSums(tmp != "" & !is.na(tmp))
df
或:
# "true" count per row; without na.rm a row containing NA sums to NA.
df$count <- rowSums(df[-1] == "true")
# Cells in columns 2:6 that are exactly "true" or "false". %in% returns FALSE
# (never NA) for NA and "", so those cells are simply not counted.
df$not_na <- rowSums(sapply(df[, 2:6], function(x){x %in% c("true", "false")}))
# NA in count propagates into true_pct.
df$true_pct <- df$count/df$not_na
df
生成相同的结果:
  test 2018  2019  2020  2021  2022 true_pct
1    A true  true false  true           0.75
2    B <NA>        true false false       NA
3    C <NA> false  true  true false       NA
你可以使用:
# Keep only the value columns (everything except the first column, `test`).
tmp <- df[-1]
# Per row: count of "true" divided by the count of cells that are neither NA
# nor "". na.rm = TRUE is required in the numerator because `tmp == "true"` is
# NA wherever tmp is NA, and one NA would turn the whole row sum into NA.
# The denominator mask is never NA (NA & FALSE is FALSE), so it needs no na.rm.
# A row with no non-empty cells gives 0/0, i.e. NaN.
df$true_pct <- rowSums(tmp == "true", na.rm = TRUE) /
  rowSums(tmp != "" & !is.na(tmp))
df
#   test  2019  2020  2021  2022  true_pct
# 1    A  true false  true       0.6666667
# 2    B        true false false 0.3333333
# 3    C false  true  true false 0.5000000
使用 rowSums 并设置 na.rm = TRUE 怎么样:
# Share of "true" among the cells that are neither NA nor "", per row.
# local() keeps the intermediate values out of the calling environment.
df$true_pct <- local({
  answers <- df[-1]
  n_true <- rowSums(answers == "true", na.rm = TRUE)
  n_filled <- rowSums(!is.na(answers) & answers != "", na.rm = TRUE)
  n_true / n_filled
})
或者使用这个基于 dplyr 和 apply 的解决方案:
library(dplyr)
library(stringr)

# Collapse every column except the first into one space-separated string per
# row, then divide the number of "true" matches by the number of
# "true"/"false" matches. NA is pasted as "NA" and "" adds no text, so
# neither matches the pattern.
df %>%
  mutate(
    true_pct = apply(.[-1], 1, paste, collapse = " "),
    true_pct = str_count(true_pct, "true") /
      str_count(true_pct, "true|false")
  )
  test 2018  2019  2020  2021  2022  true_pct
1    A true  true false  true       0.7500000
2    B <NA>        true false false 0.3333333
3    C <NA> false  true  true false 0.5000000
数据:
# Example data: a label column `test` plus one character column per year.
# Cells hold "true", "false", "" (empty string), or NA.
df <- structure(
  list(
    test = c("A", "B", "C"),
    `2018` = c("true", NA, NA),
    `2019` = c("true", "", "false"),
    `2020` = c("false", "true", "true"),
    `2021` = c("true", "false", "true"),
    `2022` = c("", "false", "false")
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)
对于以下数据框:
# Example data: a label column `test` plus one character column per year.
# Cells hold "true", "false", or "" (empty string).
df <- structure(
  list(
    test = c("A", "B", "C"),
    `2019` = c("true", "", "false"),
    `2020` = c("false", "true", "true"),
    `2021` = c("true", "false", "true"),
    `2022` = c("", "false", "false")
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)
输出:
  test  2019  2020  2021  2022
1    A  true false  true
2    B        true false false
3    C false  true  true false
我需要统计从 2019 列到 2022 列中 true 的数量,然后计算 true 在非 NA(或非空)值中所占的百分比:
预期的结果可能是这样的:
  test  2019  2020  2021  2022 true_pct
1    A  true false  true           0.67  # 2/3
2    B        true false false     0.33  # 1/3
3    C false  true  true false     0.50  # 2/4
下面的代码返回的结果是错误的: 因为数据中有空字符串,它把所有年份的列都计入了分母。我需要一个过滤掩码,例如 !is.na(df[,2:5]) | df[,2:5] != '':
# Number of "true" cells in each row (all columns except the first).
df$count <- rowSums(df[-1] == "true")
# Number of non-NA cells in columns 2:5. Empty strings ("") are not NA, so
# they are still counted here; the output below shows not_na = 4 in every row.
df$not_na <- rowSums(!is.na(df[,2:5]))
# df$not_na <- rowSums(!complete.cases(df[,2:5]))
# Share of "true" among the cells counted in not_na.
df$true_pct <- df$count/df$not_na
输出:
  test  2019  2020  2021  2022 count not_na true_pct
1    A  true false  true           2      4     0.50
2    B        true false false     1      4     0.25
3    C false  true  true false     2      4     0.50
请分享更好的解决方案,谢谢。
更新: 对于以下数据集,代码似乎没有返回正确的结果:
# Example data with an additional `2018` column that contains NA values.
df <- structure(list(test = c("A", "B", "C"), `2018` = c("true", NA, NA
), `2019` = c("true", "", "false"), `2020` = c("false", "true",
"true"), `2021` = c("true", "false", "true"), `2022` = c("",
"false", "false")), class = "data.frame", row.names = c(NA, -3L
))
# All columns except the first (`test`).
tmp <- df[-1]
# "true" count per row divided by the count of cells that are neither "" nor NA.
# The numerator's rowSums() is called without na.rm, so a row that contains NA
# yields NA (rows B and C with this data).
df$true_pct <- rowSums(tmp == "true")/rowSums(tmp != "" & !is.na(tmp))
df
或:
# "true" count per row; without na.rm a row containing NA sums to NA.
df$count <- rowSums(df[-1] == "true")
# Cells in columns 2:6 that are exactly "true" or "false". %in% returns FALSE
# (never NA) for NA and "", so those cells are simply not counted.
df$not_na <- rowSums(sapply(df[, 2:6], function(x){x %in% c("true", "false")}))
# NA in count propagates into true_pct.
df$true_pct <- df$count/df$not_na
df
生成相同的结果:
  test 2018  2019  2020  2021  2022 true_pct
1    A true  true false  true           0.75
2    B <NA>        true false false       NA
3    C <NA> false  true  true false       NA
你可以使用:
# Keep only the value columns (everything except the first column, `test`).
tmp <- df[-1]
# Per row: count of "true" divided by the count of cells that are neither NA
# nor "". na.rm = TRUE is required in the numerator because `tmp == "true"` is
# NA wherever tmp is NA, and one NA would turn the whole row sum into NA.
# The denominator mask is never NA (NA & FALSE is FALSE), so it needs no na.rm.
# A row with no non-empty cells gives 0/0, i.e. NaN.
df$true_pct <- rowSums(tmp == "true", na.rm = TRUE) /
  rowSums(tmp != "" & !is.na(tmp))
df
#   test  2019  2020  2021  2022  true_pct
# 1    A  true false  true       0.6666667
# 2    B        true false false 0.3333333
# 3    C false  true  true false 0.5000000
使用 rowSums 并设置 na.rm = TRUE 怎么样:
# Share of "true" among the cells that are neither NA nor "", per row.
# local() keeps the intermediate values out of the calling environment.
df$true_pct <- local({
  answers <- df[-1]
  n_true <- rowSums(answers == "true", na.rm = TRUE)
  n_filled <- rowSums(!is.na(answers) & answers != "", na.rm = TRUE)
  n_true / n_filled
})
或者使用这个基于 dplyr 和 apply 的解决方案:
library(dplyr)
library(stringr)

# Collapse every column except the first into one space-separated string per
# row, then divide the number of "true" matches by the number of
# "true"/"false" matches. NA is pasted as "NA" and "" adds no text, so
# neither matches the pattern.
df %>%
  mutate(
    true_pct = apply(.[-1], 1, paste, collapse = " "),
    true_pct = str_count(true_pct, "true") /
      str_count(true_pct, "true|false")
  )
  test 2018  2019  2020  2021  2022  true_pct
1    A true  true false  true       0.7500000
2    B <NA>        true false false 0.3333333
3    C <NA> false  true  true false 0.5000000
数据:
# Example data: a label column `test` plus one character column per year.
# Cells hold "true", "false", "" (empty string), or NA.
df <- structure(
  list(
    test = c("A", "B", "C"),
    `2018` = c("true", NA, NA),
    `2019` = c("true", "", "false"),
    `2020` = c("false", "true", "true"),
    `2021` = c("true", "false", "true"),
    `2022` = c("", "false", "false")
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)