根据 "closest value" 测试有选择地包括列
Selectively including columns based on a "closest value" test
我有一个 df 看起来像:
SNP FRQ ACB ASW BEB CDX ACB_alt ASW_alt BEB_alt CDX_alt P EFF
rs10007883 0.3588 0.53645 0.54918 0.19186 0.10752 0.46354 0.45081 0.80813 0.89247 0.013510 -0.000152
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516 0.39583 0.52459 0.83720 0.85483 0.019823 0.009342
rs10010325 0.5277 0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397 0.9182352 -0.09135
rs10010809 0.3958 0.375 0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408 0.0001923 -0.09402
rs10015151 0.6939 0.57291 0.44262 0.54651 0.36559 0.42708 0.55737 0.45348 0.63440 0.0192341 0.00012
rs10016978 0.5633 0.5625 0.56557 0.42441 0.16666 0.43750 0.43442 0.57558 0.83333 0.0153223 -0.00543
我想制作一个基于 "closer to" 测试有条件构建的新数据框。我想测试一下 FRQ 列是否更接近 ACB 列或 ACB_alt 列。如果 FRQ 更接近 ACB 列,我希望列 "SNP, FRQ, ACB, ASW, BEB, and CDX" 保留在新的 data.frame 中。如果 FRQ 列更接近 ACB_alt,我希望将列 "SNP, FRQ, ACB_alt,ASW_alt, BEB_alt, and CDX_alt" 保留在新数据框中。但是,我希望新数据框中新列的名称保持 "ACB, ASW,BEB,and CDX" 即使我使用的是 alt 值。
让我们以前两行为例。第 1 行中的 FRQ 是 0.3588。由于 0.3588 更接近 ACB_alt (0.46354) 而不是 ACB (0.53645),我希望新数据框的第一行由替代值组成:
SNP FRQ ACB ASW BEB CDX
rs10007883 0.3588 0.46354 0.45081 0.80813 0.89247
第 2 行中的 FRQ 是 0.8654。由于 0.8654 比 ACB_alt (0.39583) 更接近 ACB (0.60416),我希望新数据框的第二行由常规值组成:
SNP FRQ ACB ASW BEB CDX
rs10007883 0.3588 0.46354 0.45081 0.80813 0.89247
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516
谁能帮我解决这个问题?
EDIT:: 我添加了一个名为 "EFF" 的列。如果我使用正常值,我希望 EFF 保持不变。如果我使用替代值,我希望 EFF 翻转它的符号。
您可以通过筛选行和重命名列来分别创建 "normal" 数据框和 "alt" 数据框,然后用 bind_rows() 将两个数据框合并在一起。
我们在开头添加一个 rowname 列,这样就可以在最后恢复原来的行顺序。
library('tidyverse')
# Example data: one row per SNP with the reference frequency (FRQ), the
# per-population frequencies (ACB, ASW, BEB, CDX) and their "_alt" counterparts.
# NOTE(review): read_table2() is deprecated as of readr 2.0.0 in favour of
# read_table(); it is kept here so the snippet also parses on older readr.
df <- read_table2("SNP FRQ ACB ASW BEB CDX ACB_alt ASW_alt BEB_alt CDX_alt
rs10007883 0.3588 0.53645 0.54918 0.19186 0.10752 0.46354 0.45081 0.80813 0.89247
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516 0.39583 0.52459 0.83720 0.85483
rs10010325 0.5277 0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397
rs10010809 0.3958 0.375 0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408
rs10015151 0.6939 0.57291 0.44262 0.54651 0.36559 0.42708 0.55737 0.45348 0.63440
rs10016978 0.5633 0.5625 0.56557 0.42441 0.16666 0.43750 0.43442 0.57558 0.83333"
)
# Record each row's original position in a `rowname` column so the input order
# can be restored after the rows are split and re-combined. add_rownames() is
# deprecated; rownames_to_column() is its replacement and produces the same
# character column ("1", "2", ...).
df <- rownames_to_column(df, var = "rowname")
# "Regular" subset: rows whose FRQ is at least as close to ACB as to ACB_alt
# (ties count as regular). Every "_alt" column is dropped for these rows.
df_non_alt <- select(
  filter(df, abs(FRQ - ACB) <= abs(FRQ - ACB_alt)),
  -ends_with("_alt")
)
df_non_alt
#> # A tibble: 4 x 7
#> rowname SNP FRQ ACB ASW BEB CDX
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2 rs10009522 0.8654 0.60416 0.475400 0.16279 0.14516
#> 2 4 rs10010809 0.3958 0.37500 0.401639 0.29069 0.15591
#> 3 5 rs10015151 0.6939 0.57291 0.442620 0.54651 0.36559
#> 4 6 rs10016978 0.5633 0.56250 0.565570 0.42441 0.16666
# "Alt" subset: rows whose FRQ is strictly closer to ACB_alt than to ACB.
# Keep the identifier columns plus the "_alt" frequencies, then strip the
# suffix so the column names line up with df_non_alt.
# The pattern is anchored ("_alt$") and applied once per name, so only the
# trailing suffix is removed and an "_alt" elsewhere in a name is left intact.
df_alt <- df %>%
  filter(abs(FRQ - ACB) > abs(FRQ - ACB_alt)) %>%
  select(rowname, SNP, FRQ, ends_with('_alt')) %>%
  rename_all(~ sub('_alt$', '', .))
df_alt
#> # A tibble: 2 x 7
#> rowname SNP FRQ ACB ASW BEB CDX
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 rs10007883 0.3588 0.46354 0.45081 0.80813 0.89247
#> 2 3 rs10010325 0.5277 0.54166 0.53278 0.54651 0.41397
# Stack the two subsets and restore the original row order.
# `rowname` holds the row positions as text, so sorting it directly would be
# lexicographic ("10" before "2") once there are ten or more rows; convert to
# integer for the sort, then drop the helper column.
bind_rows(df_non_alt, df_alt) %>%
  arrange(as.integer(rowname)) %>%
  select(-rowname)
#> # A tibble: 6 x 6
#> SNP FRQ ACB ASW BEB CDX
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 rs10007883 0.3588 0.46354 0.450810 0.80813 0.89247
#> 2 rs10009522 0.8654 0.60416 0.475400 0.16279 0.14516
#> 3 rs10010325 0.5277 0.54166 0.532780 0.54651 0.41397
#> 4 rs10010809 0.3958 0.37500 0.401639 0.29069 0.15591
#> 5 rs10015151 0.6939 0.57291 0.442620 0.54651 0.36559
#> 6 rs10016978 0.5633 0.56250 0.565570 0.42441 0.16666
Paul 的代码使用了比我拼凑出来的更好的方法(直到现在我才听说过 filter :/),但我想还是把我的方法贴出来。我的做法很朴素:基本上是遍历每一行,并将结果追加到一个空数据框中。
编辑:我已经按照要求包含了 "EFF" 列,该函数现在翻转了替代案例的符号。
library('tidyverse')
# Example data for the row-by-row approach: the same SNP frequency table as in
# the question, plus an EFF column (set to 1 for every row) so that the sign
# flip applied to "alt" rows is easy to see in the result.
# NOTE(review): read_table2() is deprecated as of readr 2.0.0 in favour of
# read_table() -- consider switching once readr >= 2.0 can be assumed.
df <- read_table2("SNP FRQ ACB ASW BEB CDX ACB_alt ASW_alt BEB_alt CDX_alt EFF
rs10007883 0.3588 0.53645 0.54918 0.19186 0.10752 0.46354 0.45081 0.80813 0.89247 1
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516 0.39583 0.52459 0.83720 0.85483 1
rs10010325 0.5277 0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397 1
rs10010809 0.3958 0.375 0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408 1
rs10015151 0.6939 0.57291 0.44262 0.54651 0.36559 0.42708 0.55737 0.45348 0.63440 1
rs10016978 0.5633 0.5625 0.56557 0.42441 0.16666 0.43750 0.43442 0.57558 0.83333 1")
# Empty accumulator for the result. Its column names define the output layout
# that compareValues() selects and renames to before appending each row.
new_df <- data.frame(
  SNP = character(0),
  FRQ = numeric(0),
  ACB = numeric(0),
  ASW = numeric(0),
  BEB = numeric(0),
  CDX = numeric(0),
  EFF = numeric(0)
)
# Append one input row to the global `new_df`, choosing between the regular
# and the "_alt" frequency columns.
#
# row: a one-row data frame with columns SNP, FRQ, ACB, ASW, BEB, CDX, their
#      "_alt" counterparts and EFF.
#
# If FRQ is at least as close to ACB as to ACB_alt, the regular columns are
# kept unchanged. Otherwise the "_alt" columns are used, renamed to the
# regular names, and the sign of EFF is flipped.
# Side effect: `new_df` in the enclosing environment is replaced by itself
# with the selected row bound to the end.
compareValues <- function(row) {
  out_cols <- colnames(new_df)
  dist_regular <- abs(row$FRQ[1] - row$ACB[1])
  dist_alt <- abs(row$FRQ[1] - row$ACB_alt[1])

  if (dist_regular > dist_alt) {
    # FRQ is strictly closer to ACB_alt: take the alt values and negate EFF.
    picked <- row[, c("SNP", "FRQ", "ACB_alt", "ASW_alt", "BEB_alt", "CDX_alt", "EFF")]
    picked$EFF <- -1 * picked$EFF
    colnames(picked) <- out_cols
  } else {
    picked <- row[, out_cols]
  }

  new_df <<- rbind(new_df, picked)
}
# Call compareValues() once per row of df (each row is its own group); the
# selected rows accumulate in new_df as a side effect of compareValues().
# seq_len(nrow(df)) is used instead of 1:nrow(df), which yields c(1, 0) for a
# zero-row data frame. invisible() stops the `by` object returned by by() from
# being auto-printed at top level; only new_df is of interest.
invisible(by(df, seq_len(nrow(df)), compareValues))
print(new_df)
# A tibble: 6 x 7
# SNP FRQ ACB ASW BEB CDX EFF
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 rs10007883 0.359 0.464 0.451 0.808 0.892 -1.00
# 2 rs10009522 0.865 0.604 0.475 0.163 0.145 1.00
# 3 rs10010325 0.528 0.542 0.533 0.547 0.414 -1.00
# 4 rs10010809 0.396 0.375 0.402 0.291 0.156 1.00
# 5 rs10015151 0.694 0.573 0.443 0.547 0.366 1.00
# 6 rs10016978 0.563 0.562 0.566 0.424 0.167 1.00
解决此问题的另一种方法是使用 data.table 包。以最简单的形式使用 data.table 的写法如下:
library(data.table)
# Example data parsed from inline text; the first line supplies the column
# names and SNP is kept as character rather than factor.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "SNP FRQ ACB ASW BEB CDX ACB_alt ASW_alt BEB_alt CDX_alt
rs10007883 0.3588 0.53645 0.54918 0.19186 0.10752 0.46354 0.45081 0.80813 0.89247
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516 0.39583 0.52459 0.83720 0.85483
rs10010325 0.5277 0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397
rs10010809 0.3958 0.375 0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408
rs10016978 0.5633 0.5625 0.56557 0.42441 0.16666 0.43750 0.43442 0.57558 0.83333"
)
dt <- as.data.table(df)
# For every row, take the regular frequencies when FRQ is at least as close to
# ACB as to ACB_alt, otherwise the "_alt" ones. The row-wise mask is computed
# once and reused for all four populations.
dt[, {
  use_regular <- abs(FRQ - ACB) <= abs(FRQ - ACB_alt)
  list(
    SNP = SNP,
    FRQ = FRQ,
    ACB = ifelse(use_regular, ACB, ACB_alt),
    ASW = ifelse(use_regular, ASW, ASW_alt),
    BEB = ifelse(use_regular, BEB, BEB_alt),
    CDX = ifelse(use_regular, CDX, CDX_alt)
  )
}]
#Result:
# SNP FRQ ACB ASW BEB CDX
#1: rs10007883 0.3588 0.46354 0.450810 0.80813 0.89247
#2: rs10009522 0.8654 0.60416 0.475400 0.16279 0.14516
#3: rs10010325 0.5277 0.54166 0.532780 0.54651 0.41397
#4: rs10010809 0.3958 0.37500 0.401639 0.29069 0.15591
#5: rs10016978 0.5633 0.56250 0.565570 0.42441 0.16666
我有一个 df 看起来像:
SNP FRQ ACB ASW BEB CDX ACB_alt ASW_alt BEB_alt CDX_alt P EFF
rs10007883 0.3588 0.53645 0.54918 0.19186 0.10752 0.46354 0.45081 0.80813 0.89247 0.013510 -0.000152
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516 0.39583 0.52459 0.83720 0.85483 0.019823 0.009342
rs10010325 0.5277 0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397 0.9182352 -0.09135
rs10010809 0.3958 0.375 0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408 0.0001923 -0.09402
rs10015151 0.6939 0.57291 0.44262 0.54651 0.36559 0.42708 0.55737 0.45348 0.63440 0.0192341 0.00012
rs10016978 0.5633 0.5625 0.56557 0.42441 0.16666 0.43750 0.43442 0.57558 0.83333 0.0153223 -0.00543
我想制作一个基于 "closer to" 测试有条件构建的新数据框。我想测试一下 FRQ 列是否更接近 ACB 列或 ACB_alt 列。如果 FRQ 更接近 ACB 列,我希望列 "SNP, FRQ, ACB, ASW, BEB, and CDX" 保留在新的 data.frame 中。如果 FRQ 列更接近 ACB_alt,我希望将列 "SNP, FRQ, ACB_alt,ASW_alt, BEB_alt, and CDX_alt" 保留在新数据框中。但是,我希望新数据框中新列的名称保持 "ACB, ASW,BEB,and CDX" 即使我使用的是 alt 值。
让我们以前两行为例。第 1 行中的 FRQ 是 0.3588。由于 0.3588 更接近 ACB_alt (0.46354) 而不是 ACB (0.53645),我希望新数据框的第一行由替代值组成:
SNP FRQ ACB ASW BEB CDX
rs10007883 0.3588 0.46354 0.45081 0.80813 0.89247
第 2 行中的 FRQ 是 0.8654。由于 0.8654 比 ACB_alt (0.39583) 更接近 ACB (0.60416),我希望新数据框的第二行由常规值组成:
SNP FRQ ACB ASW BEB CDX
rs10007883 0.3588 0.46354 0.45081 0.80813 0.89247
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516
谁能帮我解决这个问题?
EDIT:: 我添加了一个名为 "EFF" 的列。如果我使用正常值,我希望 EFF 保持不变。如果我使用替代值,我希望 EFF 翻转它的符号。
您可以通过筛选行和重命名列来分别创建 "normal" 数据框和 "alt" 数据框,然后用 bind_rows() 将两个数据框合并在一起。
我们在开头添加一个 rowname 列,这样就可以在最后恢复原来的行顺序。
library('tidyverse')
# Example data: one row per SNP with the reference frequency (FRQ), the
# per-population frequencies (ACB, ASW, BEB, CDX) and their "_alt" counterparts.
# NOTE(review): read_table2() is deprecated as of readr 2.0.0 in favour of
# read_table(); it is kept here so the snippet also parses on older readr.
df <- read_table2("SNP FRQ ACB ASW BEB CDX ACB_alt ASW_alt BEB_alt CDX_alt
rs10007883 0.3588 0.53645 0.54918 0.19186 0.10752 0.46354 0.45081 0.80813 0.89247
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516 0.39583 0.52459 0.83720 0.85483
rs10010325 0.5277 0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397
rs10010809 0.3958 0.375 0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408
rs10015151 0.6939 0.57291 0.44262 0.54651 0.36559 0.42708 0.55737 0.45348 0.63440
rs10016978 0.5633 0.5625 0.56557 0.42441 0.16666 0.43750 0.43442 0.57558 0.83333"
)
# Record each row's original position in a `rowname` column so the input order
# can be restored after the rows are split and re-combined. add_rownames() is
# deprecated; rownames_to_column() is its replacement and produces the same
# character column ("1", "2", ...).
df <- rownames_to_column(df, var = "rowname")
# "Regular" subset: rows whose FRQ is at least as close to ACB as to ACB_alt
# (ties count as regular). Every "_alt" column is dropped for these rows.
df_non_alt <- select(
  filter(df, abs(FRQ - ACB) <= abs(FRQ - ACB_alt)),
  -ends_with("_alt")
)
df_non_alt
#> # A tibble: 4 x 7
#> rowname SNP FRQ ACB ASW BEB CDX
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2 rs10009522 0.8654 0.60416 0.475400 0.16279 0.14516
#> 2 4 rs10010809 0.3958 0.37500 0.401639 0.29069 0.15591
#> 3 5 rs10015151 0.6939 0.57291 0.442620 0.54651 0.36559
#> 4 6 rs10016978 0.5633 0.56250 0.565570 0.42441 0.16666
# "Alt" subset: rows whose FRQ is strictly closer to ACB_alt than to ACB.
# Keep the identifier columns plus the "_alt" frequencies, then strip the
# suffix so the column names line up with df_non_alt.
# The pattern is anchored ("_alt$") and applied once per name, so only the
# trailing suffix is removed and an "_alt" elsewhere in a name is left intact.
df_alt <- df %>%
  filter(abs(FRQ - ACB) > abs(FRQ - ACB_alt)) %>%
  select(rowname, SNP, FRQ, ends_with('_alt')) %>%
  rename_all(~ sub('_alt$', '', .))
df_alt
#> # A tibble: 2 x 7
#> rowname SNP FRQ ACB ASW BEB CDX
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 rs10007883 0.3588 0.46354 0.45081 0.80813 0.89247
#> 2 3 rs10010325 0.5277 0.54166 0.53278 0.54651 0.41397
# Stack the two subsets and restore the original row order.
# `rowname` holds the row positions as text, so sorting it directly would be
# lexicographic ("10" before "2") once there are ten or more rows; convert to
# integer for the sort, then drop the helper column.
bind_rows(df_non_alt, df_alt) %>%
  arrange(as.integer(rowname)) %>%
  select(-rowname)
#> # A tibble: 6 x 6
#> SNP FRQ ACB ASW BEB CDX
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 rs10007883 0.3588 0.46354 0.450810 0.80813 0.89247
#> 2 rs10009522 0.8654 0.60416 0.475400 0.16279 0.14516
#> 3 rs10010325 0.5277 0.54166 0.532780 0.54651 0.41397
#> 4 rs10010809 0.3958 0.37500 0.401639 0.29069 0.15591
#> 5 rs10015151 0.6939 0.57291 0.442620 0.54651 0.36559
#> 6 rs10016978 0.5633 0.56250 0.565570 0.42441 0.16666
Paul 的代码使用了比我拼凑出来的更好的方法(直到现在我才听说过 filter :/),但我想还是把我的方法贴出来。我的做法很朴素:基本上是遍历每一行,并将结果追加到一个空数据框中。
编辑:我已经按照要求包含了 "EFF" 列,该函数现在翻转了替代案例的符号。
library('tidyverse')
# Example data for the row-by-row approach: the same SNP frequency table as in
# the question, plus an EFF column (set to 1 for every row) so that the sign
# flip applied to "alt" rows is easy to see in the result.
# NOTE(review): read_table2() is deprecated as of readr 2.0.0 in favour of
# read_table() -- consider switching once readr >= 2.0 can be assumed.
df <- read_table2("SNP FRQ ACB ASW BEB CDX ACB_alt ASW_alt BEB_alt CDX_alt EFF
rs10007883 0.3588 0.53645 0.54918 0.19186 0.10752 0.46354 0.45081 0.80813 0.89247 1
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516 0.39583 0.52459 0.83720 0.85483 1
rs10010325 0.5277 0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397 1
rs10010809 0.3958 0.375 0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408 1
rs10015151 0.6939 0.57291 0.44262 0.54651 0.36559 0.42708 0.55737 0.45348 0.63440 1
rs10016978 0.5633 0.5625 0.56557 0.42441 0.16666 0.43750 0.43442 0.57558 0.83333 1")
# Empty accumulator for the result. Its column names define the output layout
# that compareValues() selects and renames to before appending each row.
new_df <- data.frame(
  SNP = character(0),
  FRQ = numeric(0),
  ACB = numeric(0),
  ASW = numeric(0),
  BEB = numeric(0),
  CDX = numeric(0),
  EFF = numeric(0)
)
# Append one input row to the global `new_df`, choosing between the regular
# and the "_alt" frequency columns.
#
# row: a one-row data frame with columns SNP, FRQ, ACB, ASW, BEB, CDX, their
#      "_alt" counterparts and EFF.
#
# If FRQ is at least as close to ACB as to ACB_alt, the regular columns are
# kept unchanged. Otherwise the "_alt" columns are used, renamed to the
# regular names, and the sign of EFF is flipped.
# Side effect: `new_df` in the enclosing environment is replaced by itself
# with the selected row bound to the end.
compareValues <- function(row) {
  out_cols <- colnames(new_df)
  dist_regular <- abs(row$FRQ[1] - row$ACB[1])
  dist_alt <- abs(row$FRQ[1] - row$ACB_alt[1])

  if (dist_regular > dist_alt) {
    # FRQ is strictly closer to ACB_alt: take the alt values and negate EFF.
    picked <- row[, c("SNP", "FRQ", "ACB_alt", "ASW_alt", "BEB_alt", "CDX_alt", "EFF")]
    picked$EFF <- -1 * picked$EFF
    colnames(picked) <- out_cols
  } else {
    picked <- row[, out_cols]
  }

  new_df <<- rbind(new_df, picked)
}
# Call compareValues() once per row of df (each row is its own group); the
# selected rows accumulate in new_df as a side effect of compareValues().
# seq_len(nrow(df)) is used instead of 1:nrow(df), which yields c(1, 0) for a
# zero-row data frame. invisible() stops the `by` object returned by by() from
# being auto-printed at top level; only new_df is of interest.
invisible(by(df, seq_len(nrow(df)), compareValues))
print(new_df)
# A tibble: 6 x 7
# SNP FRQ ACB ASW BEB CDX EFF
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 rs10007883 0.359 0.464 0.451 0.808 0.892 -1.00
# 2 rs10009522 0.865 0.604 0.475 0.163 0.145 1.00
# 3 rs10010325 0.528 0.542 0.533 0.547 0.414 -1.00
# 4 rs10010809 0.396 0.375 0.402 0.291 0.156 1.00
# 5 rs10015151 0.694 0.573 0.443 0.547 0.366 1.00
# 6 rs10016978 0.563 0.562 0.566 0.424 0.167 1.00
解决此问题的另一种方法是使用 data.table 包。以最简单的形式使用 data.table 的写法如下:
library(data.table)
# Example data parsed from inline text; the first line supplies the column
# names and SNP is kept as character rather than factor.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "SNP FRQ ACB ASW BEB CDX ACB_alt ASW_alt BEB_alt CDX_alt
rs10007883 0.3588 0.53645 0.54918 0.19186 0.10752 0.46354 0.45081 0.80813 0.89247
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516 0.39583 0.52459 0.83720 0.85483
rs10010325 0.5277 0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397
rs10010809 0.3958 0.375 0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408
rs10016978 0.5633 0.5625 0.56557 0.42441 0.16666 0.43750 0.43442 0.57558 0.83333"
)
dt <- as.data.table(df)
# For every row, take the regular frequencies when FRQ is at least as close to
# ACB as to ACB_alt, otherwise the "_alt" ones. The row-wise mask is computed
# once and reused for all four populations.
dt[, {
  use_regular <- abs(FRQ - ACB) <= abs(FRQ - ACB_alt)
  list(
    SNP = SNP,
    FRQ = FRQ,
    ACB = ifelse(use_regular, ACB, ACB_alt),
    ASW = ifelse(use_regular, ASW, ASW_alt),
    BEB = ifelse(use_regular, BEB, BEB_alt),
    CDX = ifelse(use_regular, CDX, CDX_alt)
  )
}]
#Result:
# SNP FRQ ACB ASW BEB CDX
#1: rs10007883 0.3588 0.46354 0.450810 0.80813 0.89247
#2: rs10009522 0.8654 0.60416 0.475400 0.16279 0.14516
#3: rs10010325 0.5277 0.54166 0.532780 0.54651 0.41397
#4: rs10010809 0.3958 0.37500 0.401639 0.29069 0.15591
#5: rs10016978 0.5633 0.56250 0.565570 0.42441 0.16666