根据 "closest value" 测试有选择地包括列

Selectively including columns based on a "closest value" test

我有一个 df 看起来像:

SNP        FRQ     ACB     ASW      BEB     CDX     ACB_alt ASW_alt BEB_alt CDX_alt P         EFF
rs10007883 0.3588  0.53645 0.54918  0.19186 0.10752 0.46354 0.45081 0.80813 0.89247 0.013510  -0.000152
rs10009522 0.8654  0.60416 0.47540  0.16279 0.14516 0.39583 0.52459 0.83720 0.85483 0.019823  0.009342
rs10010325 0.5277  0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397 0.9182352 -0.09135
rs10010809 0.3958  0.375   0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408 0.0001923 -0.09402
rs10015151 0.6939  0.57291 0.44262  0.54651 0.36559 0.42708 0.55737 0.45348 0.63440 0.0192341 0.00012
rs10016978 0.5633  0.5625  0.56557  0.42441 0.16666 0.43750 0.43442 0.57558 0.83333 0.0153223 -0.00543

我想制作一个基于 "closer to" 测试有条件构建的新数据框。我想测试一下 FRQ 列是否更接近 ACB 列或 ACB_alt 列。如果 FRQ 更接近 ACB 列,我希望列 "SNP, FRQ, ACB, ASW, BEB, and CDX" 保留在新的 data.frame 中。如果 FRQ 列更接近 ACB_alt,我希望将列 "SNP, FRQ, ACB_alt,ASW_alt, BEB_alt, and CDX_alt" 保留在新数据框中。但是,我希望新数据框中新列的名称保持 "ACB, ASW,BEB,and CDX" 即使我使用的是 alt 值。

让我们以前两行为例。第 1 行中的 FRQ 是 0.3588。由于 0.3588 更接近 ACB_alt (0.46354) 而不是 ACB (0.53645),我希望新数据框的第一行由替代值组成:

SNP        FRQ    ACB     ASW     BEB     CDX
rs10007883 0.3588 0.46354 0.45081 0.80813 0.89247

第 2 行中的 FRQ 是 0.8654。由于 0.8654 比 ACB_alt (0.39583) 更接近 ACB (0.60416),我希望新数据框的第二行由常规值组成:

SNP        FRQ    ACB     ASW     BEB     CDX
rs10007883 0.3588 0.46354 0.45081 0.80813 0.89247
rs10009522 0.8654 0.60416 0.47540 0.16279 0.14516

谁能帮我解决这个问题?

EDIT:: 我添加了一个名为 "EFF" 的列。如果我使用正常值,我希望 EFF 保持不变。如果我使用替代值,我希望 EFF 翻转它的符号。

您可以通过筛选行和重命名列来分别创建 "normal" 数据框和 "alt" 数据框,然后用 bind_rows 将两个数据框按行合并在一起。

我们在开头添加一个 rownames 列,这样我们就可以在末尾保持相同的行顺序。

library('tidyverse')

# Example data parsed from a whitespace-separated literal: one row per SNP
# with the columns FRQ, ACB, ASW, BEB, CDX and their "_alt" counterparts.
df <- read_table2("SNP        FRQ     ACB     ASW      BEB     CDX     ACB_alt ASW_alt BEB_alt CDX_alt
rs10007883 0.3588  0.53645 0.54918  0.19186 0.10752 0.46354 0.45081 0.80813 0.89247
rs10009522 0.8654  0.60416 0.47540  0.16279 0.14516 0.39583 0.52459 0.83720 0.85483
rs10010325 0.5277  0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397
rs10010809 0.3958  0.375   0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408
rs10015151 0.6939  0.57291 0.44262  0.54651 0.36559 0.42708 0.55737 0.45348 0.63440
rs10016978 0.5633  0.5625  0.56557  0.42441 0.16666 0.43750 0.43442 0.57558 0.83333"
)

# Record each row's original position in a `rowname` column so the input order
# can be restored after the rows are split and recombined.
# rownames_to_column() replaces the deprecated add_rownames(); like it, the
# new column is of type character.
df <- rownames_to_column(df, var = "rowname")

# Rows whose FRQ is at least as close to ACB as to ACB_alt (ties count as
# "regular"); every "_alt" column is dropped from the result.
df_non_alt <- select(
  filter(df, abs(FRQ - ACB_alt) >= abs(FRQ - ACB)),
  -ends_with('_alt')
)
print(df_non_alt)
#> # A tibble: 4 x 7
#>   rowname        SNP    FRQ     ACB      ASW     BEB     CDX
#>     <chr>      <chr>  <dbl>   <dbl>    <dbl>   <dbl>   <dbl>
#> 1       2 rs10009522 0.8654 0.60416 0.475400 0.16279 0.14516
#> 2       4 rs10010809 0.3958 0.37500 0.401639 0.29069 0.15591
#> 3       5 rs10015151 0.6939 0.57291 0.442620 0.54651 0.36559
#> 4       6 rs10016978 0.5633 0.56250 0.565570 0.42441 0.16666

# Rows whose FRQ is strictly closer to ACB_alt than to ACB. Only the
# identifying columns and the "_alt" columns are kept, and the "_alt" suffix
# is stripped so the names line up with the regular columns.
df_alt <- df %>%
  filter(abs(FRQ - ACB) > abs(FRQ - ACB_alt)) %>%
  select(rowname, SNP, FRQ, ends_with('_alt')) %>%
  # Anchor the pattern to the end of the name: an unanchored gsub() would
  # remove every "_alt" occurrence, not just the suffix.
  rename_all(~ sub('_alt$', '', .))
df_alt
#> # A tibble: 2 x 7
#>   rowname        SNP    FRQ     ACB     ASW     BEB     CDX
#>     <chr>      <chr>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#> 1       1 rs10007883 0.3588 0.46354 0.45081 0.80813 0.89247
#> 2       3 rs10010325 0.5277 0.54166 0.53278 0.54651 0.41397

# Stack the two subsets and restore the original row order.
# `rowname` is a character column, so it is converted to integer before
# sorting; sorting the strings directly would give "1", "10", "2", ... as soon
# as there are ten or more rows.
bind_rows(df_non_alt, df_alt) %>%
  arrange(as.integer(rowname)) %>%
  select(-rowname)
#> # A tibble: 6 x 6
#>          SNP    FRQ     ACB      ASW     BEB     CDX
#>        <chr>  <dbl>   <dbl>    <dbl>   <dbl>   <dbl>
#> 1 rs10007883 0.3588 0.46354 0.450810 0.80813 0.89247
#> 2 rs10009522 0.8654 0.60416 0.475400 0.16279 0.14516
#> 3 rs10010325 0.5277 0.54166 0.532780 0.54651 0.41397
#> 4 rs10010809 0.3958 0.37500 0.401639 0.29069 0.15591
#> 5 rs10015151 0.6939 0.57291 0.442620 0.54651 0.36559
#> 6 rs10016978 0.5633 0.56250 0.565570 0.42441 0.16666

Paul 的代码使用了比我的更好的方法(直到现在我才听说过 filter :/),但我想还是把我的做法贴出来。我的做法很朴素:基本上是遍历每一行,并将结果追加到一个空数据框中。

编辑:我已经按照要求加入了 "EFF" 列;现在当某一行使用替代值时,该函数会翻转该行 EFF 的符号。

library('tidyverse')

# Example data as a whitespace-separated literal, including an EFF column
# that is 1 on every row.
df <- "SNP        FRQ     ACB     ASW      BEB     CDX     ACB_alt ASW_alt BEB_alt CDX_alt EFF
                  rs10007883 0.3588  0.53645 0.54918  0.19186 0.10752 0.46354 0.45081 0.80813 0.89247 1
                  rs10009522 0.8654  0.60416 0.47540  0.16279 0.14516 0.39583 0.52459 0.83720 0.85483 1
                  rs10010325 0.5277  0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397 1
                  rs10010809 0.3958  0.375   0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408 1
                  rs10015151 0.6939  0.57291 0.44262  0.54651 0.36559 0.42708 0.55737 0.45348 0.63440 1
                  rs10016978 0.5633  0.5625  0.56557  0.42441 0.16666 0.43750 0.43442 0.57558 0.83333 1" %>%
  read_table2()

# Empty template with the columns of the result. It is placed first when the
# rows are combined so that an input with zero rows still yields a data frame
# with the expected columns.
new_df <- data.frame(SNP = character(), FRQ = numeric(), ACB = numeric(),
                     ASW = numeric(), BEB = numeric(), CDX = numeric(),
                     EFF = numeric())

# Choose the regular or the alternative values for a single row.
#
# row: a one-row data frame with the columns SNP, FRQ, ACB, ASW, BEB, CDX,
#      ACB_alt, ASW_alt, BEB_alt, CDX_alt and EFF.
#
# Returns a one-row data frame with the columns SNP, FRQ, ACB, ASW, BEB, CDX
# and EFF. When FRQ is at least as close to ACB as to ACB_alt, the regular
# columns are returned unchanged. Otherwise the "_alt" columns are returned
# under the regular names and the sign of EFF is flipped.
#
# The function has no side effects; the caller collects the returned rows.
compareValues <- function(row) {
  out_cols <- c("SNP", "FRQ", "ACB", "ASW", "BEB", "CDX", "EFF")
  if (abs(row$FRQ[1] - row$ACB[1]) <= abs(row$FRQ[1] - row$ACB_alt[1])) {
    row[, out_cols]
  } else {
    alt <- row[, c("SNP", "FRQ", "ACB_alt", "ASW_alt", "BEB_alt", "CDX_alt",
                   "EFF")]
    alt$EFF <- -1 * alt$EFF  # Negate EFF
    colnames(alt) <- out_cols
    alt
  }
}

# Iterate over the rows of df, then combine all selected rows in one rbind()
# call instead of growing new_df row by row. seq_len() keeps the loop empty
# when df has no rows.
picked_rows <- lapply(seq_len(nrow(df)), function(i) compareValues(df[i, ]))
new_df <- do.call(rbind, c(list(new_df), picked_rows))

print(new_df)
# A tibble: 6 x 7
# SNP          FRQ   ACB   ASW   BEB   CDX   EFF
# <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 rs10007883 0.359 0.464 0.451 0.808 0.892 -1.00
# 2 rs10009522 0.865 0.604 0.475 0.163 0.145  1.00
# 3 rs10010325 0.528 0.542 0.533 0.547 0.414 -1.00
# 4 rs10010809 0.396 0.375 0.402 0.291 0.156  1.00
# 5 rs10015151 0.694 0.573 0.443 0.547 0.366  1.00
# 6 rs10016978 0.563 0.562 0.566 0.424 0.167  1.00

解决此问题的另一种方法是以最简单的形式使用 data.table 包:

library(data.table)
# Parse the example table (header plus five SNP rows) with base R, keeping
# the SNP identifiers as character strings rather than factors.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "SNP        FRQ     ACB     ASW      BEB     CDX     ACB_alt ASW_alt BEB_alt CDX_alt
                  rs10007883 0.3588  0.53645 0.54918  0.19186 0.10752 0.46354 0.45081 0.80813 0.89247
                  rs10009522 0.8654  0.60416 0.47540  0.16279 0.14516 0.39583 0.52459 0.83720 0.85483
                  rs10010325 0.5277  0.45833 0.467213 0.45348 0.58602 0.54166 0.53278 0.54651 0.41397
                  rs10010809 0.3958  0.375   0.401639 0.29069 0.15591 0.62500 0.59836 0.70930 0.84408
                  rs10016978 0.5633  0.5625  0.56557  0.42441 0.16666 0.43750 0.43442 0.57558 0.83333"
)


# Work on a data.table copy of the data frame.
dt <- data.table(df)

# For every row, decide once whether FRQ is at least as close to ACB as to
# ACB_alt, then take either the regular or the "_alt" value of each column
# accordingly. The result keeps the regular column names.
dt[, {
  use_regular <- abs(FRQ - ACB) <= abs(FRQ - ACB_alt)
  list(
    SNP = SNP,
    FRQ = FRQ,
    ACB = ifelse(use_regular, ACB, ACB_alt),
    ASW = ifelse(use_regular, ASW, ASW_alt),
    BEB = ifelse(use_regular, BEB, BEB_alt),
    CDX = ifelse(use_regular, CDX, CDX_alt)
  )
}]

#Result:
#          SNP    FRQ     ACB      ASW     BEB     CDX
#1: rs10007883 0.3588 0.46354 0.450810 0.80813 0.89247
#2: rs10009522 0.8654 0.60416 0.475400 0.16279 0.14516
#3: rs10010325 0.5277 0.54166 0.532780 0.54651 0.41397
#4: rs10010809 0.3958 0.37500 0.401639 0.29069 0.15591
#5: rs10016978 0.5633 0.56250 0.565570 0.42441 0.16666