如何在R中为一行中的多个值设置多个条件?
How to set multiple conditions for multiple values in a row in R?
我有一个遗传数据集,其中每一行描述一个基因,并且有一个 beta 列,其中包含多个 beta 值,我已将它们压缩到同一行的一个单元格中(一个基因内的多个变体会给出多个变体级别的 beta 值)。Beta 是基因在某种情况下可能产生的效应大小,因此绝对值较大的负值和正值都很重要。我正在尝试编写代码,以 -0.5 和 0.5 为阈值,为每个基因选出最大的负值或最大的正值。
我尝试编码的规则是:
如果某个基因(行)有小于 -0.5 的值且没有大于 0.5 的值,则仅保留最大的负值(即绝对值最大的负值)。
如果它有大于 0.5 的值且没有小于 -0.5 的值,则只保留最大的正值。
如果它没有小于 -0.5 或大于 0.5 的值,则保留最大值。
如果它同时有小于 -0.5 和大于 0.5 的值,则保留最大值。
例如我的数据是这样的:
Gene Beta(s)
ACE 0.01, -0.6, 0.4
BRCA 0.7, -0.2, 0.2
ZAP70 0.001, 0.02, -0.003
P53 0.8, -0.6, 0.001
预期输出(根据设定的条件选出最大的负值或正值):
Gene Beta(s)
ACE -0.6
BRCA 0.7
ZAP70 0.02
P53 0.8
我有生物学背景,刚接触 R,所以不确定如何编写代码。目前我使用下面的函数来选出每个基因的最大或最小 beta 值,但我不知道如何修改它们以加入更多条件:
# Largest value of `x`, ignoring NAs. Returns NA when `x` is empty or entirely
# NA (plain max() would warn and return -Inf in that case).
max2 <- function(x) if (all(is.na(x))) NA_real_ else max(x, na.rm = TRUE)

# Smallest value of `x`, ignoring NAs. Returns NA when `x` is empty or
# entirely NA.
min2 <- function(x) if (all(is.na(x))) NA_real_ else min(x, na.rm = TRUE)

# Pull every run of digits, "." and "-" out of each string in `col` and convert
# it to numeric. Returns a list holding one numeric vector per element of `col`.
# The pattern has no backslash: "\." is not a valid escape in an R string
# literal, and "." is already literal inside a character class.
.extract_numbers <- function(col) {
  col <- as.character(col)
  lapply(regmatches(col, gregexpr("[0-9.-]+", col)), as.numeric)
}

# Per-element maximum of the numbers embedded in a character column.
# Always returns a numeric vector the same length as `col`.
getmax <- function(col) vapply(.extract_numbers(col), max2, numeric(1))

# Per-element minimum of the numbers embedded in a character column.
# Always returns a numeric vector the same length as `col`.
getmin <- function(col) vapply(.extract_numbers(col), min2, numeric(1))
# Replace the second column (the beta strings) with each gene's maximum beta.
test <- mutate_at(df, names(df)[2], getmax)
任何关于如何设置多个条件语句的正确方向的帮助将不胜感激。
示例数据:
dput(df)
structure(list(Gene = c("ACE", "BRCA", "ZAP70", "P53"), `Beta(s)` = c("0.01, -0.6, 0.4",
"0.7, -0.2, 0.2", "0.001, 0.02, -0.003", "0.8, -0.6, 0.001")), row.names = c(NA,
-4L), class = c("data.table", "data.frame"))
虽然逻辑对我来说不是很清楚,但很可能是这样的:
library(tidyverse)
library(stringr)
# Split the comma-separated betas into three numeric columns, take the row-wise
# extremes, then apply the selection rules from the question:
#   - a value below -0.5 and nothing above 0.5 -> keep the most negative value
#   - every other case                         -> keep the largest value
# (Picking the largest absolute value instead would return -0.3 for betas
# 0.1, -0.3, which contradicts "no value beyond +/-0.5: keep the largest".)
df %>%
  separate("Beta(s)", sep = ",", into = str_c("v", 1:3)) %>%
  mutate_at(vars(starts_with("v")), as.numeric) %>%
  mutate(vmax = pmax(v1, v2, v3), vmin = pmin(v1, v2, v3)) %>%
  mutate(want = case_when(
    vmin < -0.5 & vmax <= 0.5 ~ vmin,
    TRUE ~ vmax
  )) %>%
  select(Gene, want)
# Gene want
# 1 ACE -0.60
# 2 BRCA 0.70
# 3 ZAP70 0.02
# 4 P53 0.80
## edited (handling multiple columns and NA):
# Same selection as above, but for any number of betas per gene and tolerant
# of missing values.
df %>%
  bind_cols(
    df %>%
      pull("Beta(s)") %>%
      # Character matrix with one column per comma-separated value.
      str_split(",", simplify = TRUE) %>%
      `colnames<-`(str_c("v", seq_len(NCOL(.)))) %>%
      as_tibble() %>%
      # The regex backslash must be doubled inside an R string ("\\s").
      # Stripping whitespace and literal "NA" leaves "" for missing entries,
      # which as.numeric() turns into NA.
      mutate_all(~ as.numeric(str_remove_all(str_remove_all(., "\\s"), "NA"))) %>%
      mutate(
        vmax = pmap_dbl(., pmax, na.rm = TRUE),
        vmin = pmap_dbl(., pmin, na.rm = TRUE)
      )
  ) %>%
  # A value below -0.5 with nothing above 0.5 keeps the most negative value;
  # every other case keeps the largest value.
  mutate(want = case_when(
    vmin < -0.5 & vmax <= 0.5 ~ vmin,
    TRUE ~ vmax
  )) %>%
  select(Gene, want)
这是一个 data.table 解决方案,它应该可以快速运行并且不受提供的 beta 数量的影响。
library( data.table )
library( matrixStats )
# Convert df to a data.table by reference.
setDT(df)
# Split `Beta(s)` on commas into numeric columns Beta1, Beta2, ...; the number
# of columns follows the data rather than being hard-coded.
df[, paste0("Beta", seq_along(tstrsplit(df$`Beta(s)`, ","))) :=
     lapply(tstrsplit(`Beta(s)`, ","), as.numeric)][]
# Gene Beta(s) Beta1 Beta2 Beta3
# 1: ACE 0.01, -0.6, 0.4 0.010 -0.60 0.400
# 2: BRCA 0.7, -0.2, 0.2 0.700 -0.20 0.200
# 3: ZAP70 0.001, 0.02, -0.003 0.001 0.02 -0.003
# 4: P53 0.8, -0.6, 0.001 0.800 -0.60 0.001
# Apply the selection rules by reference, using rowMins()/rowMaxs() from the
# matrixStats package on the Beta1..BetaN columns. The row extremes are
# computed once, and the four rules collapse to two outcomes:
#   - a value below -0.5 and nothing above 0.5 -> keep the most negative value
#   - every other case (only > 0.5, neither, or both) -> keep the largest value
df[, Beta.final := {
  beta_mat <- as.matrix(.SD)
  row_min <- rowMins(beta_mat, na.rm = TRUE)
  row_max <- rowMaxs(beta_mat, na.rm = TRUE)
  picked <- ifelse(row_min < -0.5 & row_max <= 0.5, row_min, row_max)
  # A gene with no usable beta has nothing to report: force NA rather than
  # relying on whatever the row-wise min/max return for an all-NA row.
  picked[rowSums(!is.na(beta_mat)) == 0] <- NA_real_
  picked
}, .SDcols = patterns("^Beta[0-9]")]
输出:
# Final result: one selected beta per gene, shown under the original column name.
df[, list(Gene, `Beta(s)` = Beta.final)][]
# Gene Beta(s)
# 1: ACE -0.60
# 2: BRCA 0.70
# 3: ZAP70 0.02
# 4: P53 0.80
我有一个遗传数据集,其中每一行描述一个基因,并且有一个 beta 列,其中包含多个 beta 值,我已将它们压缩到同一行的一个单元格中(一个基因内的多个变体会给出多个变体级别的 beta 值)。Beta 是基因在某种情况下可能产生的效应大小,因此绝对值较大的负值和正值都很重要。我正在尝试编写代码,以 -0.5 和 0.5 为阈值,为每个基因选出最大的负值或最大的正值。
我尝试编码的规则是:
如果某个基因(行)有小于 -0.5 的值且没有大于 0.5 的值,则仅保留最大的负值(即绝对值最大的负值)。
如果它有大于 0.5 的值且没有小于 -0.5 的值,则只保留最大的正值。
如果它没有小于 -0.5 或大于 0.5 的值,则保留最大值。
如果它同时有小于 -0.5 和大于 0.5 的值,则保留最大值。
例如我的数据是这样的:
Gene Beta(s)
ACE 0.01, -0.6, 0.4
BRCA 0.7, -0.2, 0.2
ZAP70 0.001, 0.02, -0.003
P53 0.8, -0.6, 0.001
预期输出(根据设定的条件选出最大的负值或正值):
Gene Beta(s)
ACE -0.6
BRCA 0.7
ZAP70 0.02
P53 0.8
我有生物学背景,刚接触 R,所以不确定如何编写代码。目前我使用下面的函数来选出每个基因的最大或最小 beta 值,但我不知道如何修改它们以加入更多条件:
# Largest value of `x`, ignoring NAs. Returns NA when `x` is empty or entirely
# NA (plain max() would warn and return -Inf in that case).
max2 <- function(x) if (all(is.na(x))) NA_real_ else max(x, na.rm = TRUE)

# Smallest value of `x`, ignoring NAs. Returns NA when `x` is empty or
# entirely NA.
min2 <- function(x) if (all(is.na(x))) NA_real_ else min(x, na.rm = TRUE)

# Pull every run of digits, "." and "-" out of each string in `col` and convert
# it to numeric. Returns a list holding one numeric vector per element of `col`.
# The pattern has no backslash: "\." is not a valid escape in an R string
# literal, and "." is already literal inside a character class.
.extract_numbers <- function(col) {
  col <- as.character(col)
  lapply(regmatches(col, gregexpr("[0-9.-]+", col)), as.numeric)
}

# Per-element maximum of the numbers embedded in a character column.
# Always returns a numeric vector the same length as `col`.
getmax <- function(col) vapply(.extract_numbers(col), max2, numeric(1))

# Per-element minimum of the numbers embedded in a character column.
# Always returns a numeric vector the same length as `col`.
getmin <- function(col) vapply(.extract_numbers(col), min2, numeric(1))
# Replace the second column (the beta strings) with each gene's maximum beta.
test <- mutate_at(df, names(df)[2], getmax)
任何关于如何设置多个条件语句的正确方向的帮助将不胜感激。
示例数据:
dput(df)
structure(list(Gene = c("ACE", "BRCA", "ZAP70", "P53"), `Beta(s)` = c("0.01, -0.6, 0.4",
"0.7, -0.2, 0.2", "0.001, 0.02, -0.003", "0.8, -0.6, 0.001")), row.names = c(NA,
-4L), class = c("data.table", "data.frame"))
虽然逻辑对我来说不是很清楚,但很可能是这样的:
library(tidyverse)
library(stringr)
# Split the comma-separated betas into three numeric columns, take the row-wise
# extremes, then apply the selection rules from the question:
#   - a value below -0.5 and nothing above 0.5 -> keep the most negative value
#   - every other case                         -> keep the largest value
# (Picking the largest absolute value instead would return -0.3 for betas
# 0.1, -0.3, which contradicts "no value beyond +/-0.5: keep the largest".)
df %>%
  separate("Beta(s)", sep = ",", into = str_c("v", 1:3)) %>%
  mutate_at(vars(starts_with("v")), as.numeric) %>%
  mutate(vmax = pmax(v1, v2, v3), vmin = pmin(v1, v2, v3)) %>%
  mutate(want = case_when(
    vmin < -0.5 & vmax <= 0.5 ~ vmin,
    TRUE ~ vmax
  )) %>%
  select(Gene, want)
# Gene want
# 1 ACE -0.60
# 2 BRCA 0.70
# 3 ZAP70 0.02
# 4 P53 0.80
## edited (handling multiple columns and NA):
# Same selection as above, but for any number of betas per gene and tolerant
# of missing values.
df %>%
  bind_cols(
    df %>%
      pull("Beta(s)") %>%
      # Character matrix with one column per comma-separated value.
      str_split(",", simplify = TRUE) %>%
      `colnames<-`(str_c("v", seq_len(NCOL(.)))) %>%
      as_tibble() %>%
      # The regex backslash must be doubled inside an R string ("\\s").
      # Stripping whitespace and literal "NA" leaves "" for missing entries,
      # which as.numeric() turns into NA.
      mutate_all(~ as.numeric(str_remove_all(str_remove_all(., "\\s"), "NA"))) %>%
      mutate(
        vmax = pmap_dbl(., pmax, na.rm = TRUE),
        vmin = pmap_dbl(., pmin, na.rm = TRUE)
      )
  ) %>%
  # A value below -0.5 with nothing above 0.5 keeps the most negative value;
  # every other case keeps the largest value.
  mutate(want = case_when(
    vmin < -0.5 & vmax <= 0.5 ~ vmin,
    TRUE ~ vmax
  )) %>%
  select(Gene, want)
这是一个 data.table 解决方案,它应该可以快速运行并且不受提供的 beta 数量的影响。
library( data.table )
library( matrixStats )
# Convert df to a data.table by reference.
setDT(df)
# Split `Beta(s)` on commas into numeric columns Beta1, Beta2, ...; the number
# of columns follows the data rather than being hard-coded.
df[, paste0("Beta", seq_along(tstrsplit(df$`Beta(s)`, ","))) :=
     lapply(tstrsplit(`Beta(s)`, ","), as.numeric)][]
# Gene Beta(s) Beta1 Beta2 Beta3
# 1: ACE 0.01, -0.6, 0.4 0.010 -0.60 0.400
# 2: BRCA 0.7, -0.2, 0.2 0.700 -0.20 0.200
# 3: ZAP70 0.001, 0.02, -0.003 0.001 0.02 -0.003
# 4: P53 0.8, -0.6, 0.001 0.800 -0.60 0.001
# Apply the selection rules by reference, using rowMins()/rowMaxs() from the
# matrixStats package on the Beta1..BetaN columns. The row extremes are
# computed once, and the four rules collapse to two outcomes:
#   - a value below -0.5 and nothing above 0.5 -> keep the most negative value
#   - every other case (only > 0.5, neither, or both) -> keep the largest value
df[, Beta.final := {
  beta_mat <- as.matrix(.SD)
  row_min <- rowMins(beta_mat, na.rm = TRUE)
  row_max <- rowMaxs(beta_mat, na.rm = TRUE)
  picked <- ifelse(row_min < -0.5 & row_max <= 0.5, row_min, row_max)
  # A gene with no usable beta has nothing to report: force NA rather than
  # relying on whatever the row-wise min/max return for an all-NA row.
  picked[rowSums(!is.na(beta_mat)) == 0] <- NA_real_
  picked
}, .SDcols = patterns("^Beta[0-9]")]
输出:
# Final result: one selected beta per gene, shown under the original column name.
df[, list(Gene, `Beta(s)` = Beta.final)][]
# Gene Beta(s)
# 1: ACE -0.60
# 2: BRCA 0.70
# 3: ZAP70 0.02
# 4: P53 0.80