从多个单独的变量创建一个合并变量
Creating one merged variable from multiple separate ones
任何帮助将不胜感激
我有一个从 PCR 板软件导出的文件。我已经编码了所有等位基因的调用,现在将它们合并到一个数据框中。
我需要创建一个新变量来合并 3 个等位基因(G1-1、G1-2 和 G2)以获得最终基因型。
然后我需要统计各等位基因的出现次数,以生成另外 3 个 APOL1 风险变量。
Allele logic for final genotype:
+/G2 = (G1-1-1(+) & G1-1-2(+)) & (G1-2-1(+) & G1-2-2(+)) & (occurrence of (G2) at either G2-1 or G2-2)
+/+ = (G1-1-1(+) & G1-1-2(+)) & (G1-2-1(+) & G1-2-2(+)) & (G2-1(+) & G2-2(+))
G2/G2 = (G1-1-1(+) & G1-1-2(+)) & (G1-2-1(+) & G1-2-2(+)) & (G2-1(G2) & G2-2(G2))
G1^GM/+ = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (occurrence of (G1^I384M) at either G1-2-1 or G1-2-2) & (G2-1(+) & G2-2(+))
G1^G+/+ = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (G1-2-1(+) & G1-2-2(+)) & (G2-1(+) & G2-2(+))
G1^GM/G1^GM = (occurrence of (G1^S342G) at both G1-1-1 and G1-1-2) & (occurrence of (G1^I384M) at both G1-2-1 and G1-2-2) & (G2-1(+) & G2-2(+))
G1^GM/G2 = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (occurrence of (G1^I384M) at either G1-2-1 or G1-2-2) & (occurrence of (G2) at either G2-1 or G2-2)
G1^G+/G2 = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (G1-2-1(+) & G1-2-2(+)) & (occurrence of (G2) at either G2-1 or G2-2)
Original dataframe
Final dataframe needed
原始数据帧结构
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 28 obs. of 6 variables:
$ G1-1-1 : chr "+" "+" "+" "+" ...
$ G1-1-2 : chr "+" "+" "+" "+" ...
$ G1-2-1 : chr "+" "+" "+" "+" ...
$ G1-2-2 : chr "+" "+" "+" "+" ...
$ G2-1 : chr "+" "+" "+" "+" ...
$ G2-2 : chr "G2" "+" "G2" "G2" ...
The APOL1 Risk variables logic is below:
If (+/+) categorize as 1 in "no APOL1 Risk Alleles"
If (+/G2) or (G1^GM/+) or (G1^G+/+) categorize as 1 in "1 APOL1 Risk Alleles"
If (G1^GM/G1^GM) or (G1^GM/G2) or (G2/G2) categorize as 1 in "2 APOL1 Risk Alleles"
您可以使用 dplyr 函数轻松实现逻辑。
下面的代码应该可以实现上述逻辑:
library(dplyr)
# Sample input: 28 rows of allele calls in six character columns.
# Every column starts out as wild type ("+"); only the row positions listed
# for each column carry a variant call. `local()` keeps the scratch vector
# out of the global environment.
data <- local({
  wild_type <- rep("+", 28)
  data.frame(
    G1_1_1 = replace(
      wild_type,
      c(8, 10, 15, 16, 18, 22, 25, 27, 28),
      "G1S342G"
    ),
    G1_1_2 = replace(wild_type, 8, "G1S342G"),
    G1_2_1 = replace(
      wild_type,
      c(8, 10, 15, 16, 18, 25, 27, 28),
      "G1I384M"
    ),
    G1_2_2 = replace(wild_type, 8, "G1I384M"),
    G2_1 = wild_type,
    G2_2 = replace(
      wild_type,
      c(1, 3, 4, 5, 9, 10, 12, 14, 15, 19, 21),
      "G2"
    ),
    stringsAsFactors = FALSE
  )
})
# Derive the final APOL1 genotype and the three risk-allele indicator columns
# from the six allele-call columns, then print a compact preview.
result <- data %>%
  mutate(
    # Temporary per-site summaries; they are dropped again by select() below.
    # G1 site 1 (S342G): no call, at least one call, both calls.
    s342g_none = G1_1_1 == "+" & G1_1_2 == "+",
    s342g_any = G1_1_1 == "G1S342G" | G1_1_2 == "G1S342G",
    s342g_both = G1_1_1 == "G1S342G" & G1_1_2 == "G1S342G",
    # G1 site 2 (I384M): no call, at least one call, both calls.
    i384m_none = G1_2_1 == "+" & G1_2_2 == "+",
    i384m_any = G1_2_1 == "G1I384M" | G1_2_2 == "G1I384M",
    i384m_both = G1_2_1 == "G1I384M" & G1_2_2 == "G1I384M",
    # G2 site: no call, at least one call, both calls.
    g2_none = G2_1 == "+" & G2_2 == "+",
    g2_any = G2_1 == "G2" | G2_2 == "G2",
    g2_both = G2_1 == "G2" & G2_2 == "G2",
    # case_when() uses the first matching rule, so the stricter "both" rules
    # are listed before the "any" rules they overlap with.
    `Final genotype of APOL1` = case_when(
      s342g_none & i384m_none & g2_none ~ "+/+",
      s342g_none & i384m_none & g2_both ~ "G2/G2",
      s342g_both & i384m_both & g2_none ~ "G1GM/G1GM",
      s342g_none & i384m_none & g2_any ~ "+/G2",
      s342g_any & i384m_none & g2_none ~ "G1G+/+",
      s342g_any & i384m_none & g2_any ~ "G1G+/G2",
      s342g_any & i384m_any & g2_none ~ "G1GM/+",
      s342g_any & i384m_any & g2_any ~ "G1GM/G2",
      # Any other combination of calls is left unclassified.
      TRUE ~ NA_character_
    ),
    # Indicator columns: 1 when the genotype belongs to the category, NA
    # otherwise. "G1G+/G2" is in none of the lists, so such rows are NA in
    # all three columns.
    `no APOL1 Risk Alleles` = ifelse(
      `Final genotype of APOL1` == "+/+",
      1,
      NA
    ),
    `1 APOL1 Risk Alleles` = ifelse(
      `Final genotype of APOL1` %in% c("+/G2", "G1GM/+", "G1G+/+"),
      1,
      NA
    ),
    `2 APOL1 Risk Alleles` = ifelse(
      `Final genotype of APOL1` %in% c("G1GM/G1GM", "G1GM/G2", "G2/G2"),
      1,
      NA
    )
  ) %>%
  select(
    -s342g_none, -s342g_any, -s342g_both,
    -i384m_none, -i384m_any, -i384m_both,
    -g2_none, -g2_any, -g2_both
  )

glimpse(result)
# Observations: 28
# Variables: 10
# $ G1_1_1 <chr> "+", "+", "+", "+", "+", "+", "+", "G1S342G", "+", "G1S342G", ...
# $ G1_1_2 <chr> "+", "+", "+", "+", "+", "+", "+", "G1S342G", "+", "+", "+", "...
# $ G1_2_1 <chr> "+", "+", "+", "+", "+", "+", "+", "G1I384M", "+", "G1I384M", ...
# $ G1_2_2 <chr> "+", "+", "+", "+", "+", "+", "+", "G1I384M", "+", "+", "+", "...
# $ G2_1 <chr> "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+...
# $ G2_2 <chr> "G2", "+", "G2", "G2", "G2", "+", "+", "+", "G2", "G2", "+", "...
# $ `Final genotype of APOL1` <chr> "+/G2", "+/+", "+/G2", "+/G2", "+/G2", "+/+", "+/+", "G1GM/G1G...
# $ `no APOL1 Risk Alleles` <dbl> NA, 1, NA, NA, NA, 1, 1, NA, NA, NA, 1, NA, 1, NA, NA, NA, 1, ...
# $ `1 APOL1 Risk Alleles` <dbl> 1, NA, 1, 1, 1, NA, NA, NA, 1, NA, NA, 1, NA, 1, NA, 1, NA, 1,...
# $ `2 APOL1 Risk Alleles` <dbl> NA, NA, NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, NA, NA, 1, NA, N...
任何帮助将不胜感激
我有一个从 PCR 板软件导出的文件。我已经编码了所有等位基因的调用,现在将它们合并到一个数据框中。
我需要创建一个新变量来合并 3 个等位基因(G1-1、G1-2 和 G2)以获得最终基因型。
然后我需要统计各等位基因的出现次数,以生成另外 3 个 APOL1 风险变量。
Allele logic for final genotype:
+/G2 = (G1-1-1(+) & G1-1-2(+)) & (G1-2-1(+) & G1-2-2(+)) & (occurrence of (G2) at either G2-1 or G2-2)
+/+ = (G1-1-1(+) & G1-1-2(+)) & (G1-2-1(+) & G1-2-2(+)) & (G2-1(+) & G2-2(+))
G2/G2 = (G1-1-1(+) & G1-1-2(+)) & (G1-2-1(+) & G1-2-2(+)) & (G2-1(G2) & G2-2(G2))
G1^GM/+ = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (occurrence of (G1^I384M) at either G1-2-1 or G1-2-2) & (G2-1(+) & G2-2(+))
G1^G+/+ = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (G1-2-1(+) & G1-2-2(+)) & (G2-1(+) & G2-2(+))
G1^GM/G1^GM = (occurrence of (G1^S342G) at both G1-1-1 and G1-1-2) & (occurrence of (G1^I384M) at both G1-2-1 and G1-2-2) & (G2-1(+) & G2-2(+))
G1^GM/G2 = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (occurrence of (G1^I384M) at either G1-2-1 or G1-2-2) & (occurrence of (G2) at either G2-1 or G2-2)
G1^G+/G2 = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (G1-2-1(+) & G1-2-2(+)) & (occurrence of (G2) at either G2-1 or G2-2)
Original dataframe
Final dataframe needed
原始数据帧结构
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 28 obs. of 6 variables:
$ G1-1-1 : chr "+" "+" "+" "+" ...
$ G1-1-2 : chr "+" "+" "+" "+" ...
$ G1-2-1 : chr "+" "+" "+" "+" ...
$ G1-2-2 : chr "+" "+" "+" "+" ...
$ G2-1 : chr "+" "+" "+" "+" ...
$ G2-2 : chr "G2" "+" "G2" "G2" ...
The APOL1 Risk variables logic is below:
If (+/+) categorize as 1 in "no APOL1 Risk Alleles"
If (+/G2) or (G1^GM/+) or (G1^G+/+) categorize as 1 in "1 APOL1 Risk Alleles"
If (G1^GM/G1^GM) or (G1^GM/G2) or (G2/G2) categorize as 1 in "2 APOL1 Risk Alleles"
您可以使用 dplyr 函数轻松实现逻辑。
下面的代码应该可以实现上述逻辑:
library(dplyr)
# Sample input: 28 rows of allele calls in six character columns.
# Every column starts out as wild type ("+"); only the row positions listed
# for each column carry a variant call. `local()` keeps the scratch vector
# out of the global environment.
data <- local({
  wild_type <- rep("+", 28)
  data.frame(
    G1_1_1 = replace(
      wild_type,
      c(8, 10, 15, 16, 18, 22, 25, 27, 28),
      "G1S342G"
    ),
    G1_1_2 = replace(wild_type, 8, "G1S342G"),
    G1_2_1 = replace(
      wild_type,
      c(8, 10, 15, 16, 18, 25, 27, 28),
      "G1I384M"
    ),
    G1_2_2 = replace(wild_type, 8, "G1I384M"),
    G2_1 = wild_type,
    G2_2 = replace(
      wild_type,
      c(1, 3, 4, 5, 9, 10, 12, 14, 15, 19, 21),
      "G2"
    ),
    stringsAsFactors = FALSE
  )
})
# Derive the final APOL1 genotype and the three risk-allele indicator columns
# from the six allele-call columns, then print a compact preview.
result <- data %>%
  mutate(
    # Temporary per-site summaries; they are dropped again by select() below.
    # G1 site 1 (S342G): no call, at least one call, both calls.
    s342g_none = G1_1_1 == "+" & G1_1_2 == "+",
    s342g_any = G1_1_1 == "G1S342G" | G1_1_2 == "G1S342G",
    s342g_both = G1_1_1 == "G1S342G" & G1_1_2 == "G1S342G",
    # G1 site 2 (I384M): no call, at least one call, both calls.
    i384m_none = G1_2_1 == "+" & G1_2_2 == "+",
    i384m_any = G1_2_1 == "G1I384M" | G1_2_2 == "G1I384M",
    i384m_both = G1_2_1 == "G1I384M" & G1_2_2 == "G1I384M",
    # G2 site: no call, at least one call, both calls.
    g2_none = G2_1 == "+" & G2_2 == "+",
    g2_any = G2_1 == "G2" | G2_2 == "G2",
    g2_both = G2_1 == "G2" & G2_2 == "G2",
    # case_when() uses the first matching rule, so the stricter "both" rules
    # are listed before the "any" rules they overlap with.
    `Final genotype of APOL1` = case_when(
      s342g_none & i384m_none & g2_none ~ "+/+",
      s342g_none & i384m_none & g2_both ~ "G2/G2",
      s342g_both & i384m_both & g2_none ~ "G1GM/G1GM",
      s342g_none & i384m_none & g2_any ~ "+/G2",
      s342g_any & i384m_none & g2_none ~ "G1G+/+",
      s342g_any & i384m_none & g2_any ~ "G1G+/G2",
      s342g_any & i384m_any & g2_none ~ "G1GM/+",
      s342g_any & i384m_any & g2_any ~ "G1GM/G2",
      # Any other combination of calls is left unclassified.
      TRUE ~ NA_character_
    ),
    # Indicator columns: 1 when the genotype belongs to the category, NA
    # otherwise. "G1G+/G2" is in none of the lists, so such rows are NA in
    # all three columns.
    `no APOL1 Risk Alleles` = ifelse(
      `Final genotype of APOL1` == "+/+",
      1,
      NA
    ),
    `1 APOL1 Risk Alleles` = ifelse(
      `Final genotype of APOL1` %in% c("+/G2", "G1GM/+", "G1G+/+"),
      1,
      NA
    ),
    `2 APOL1 Risk Alleles` = ifelse(
      `Final genotype of APOL1` %in% c("G1GM/G1GM", "G1GM/G2", "G2/G2"),
      1,
      NA
    )
  ) %>%
  select(
    -s342g_none, -s342g_any, -s342g_both,
    -i384m_none, -i384m_any, -i384m_both,
    -g2_none, -g2_any, -g2_both
  )

glimpse(result)
# Observations: 28
# Variables: 10
# $ G1_1_1 <chr> "+", "+", "+", "+", "+", "+", "+", "G1S342G", "+", "G1S342G", ...
# $ G1_1_2 <chr> "+", "+", "+", "+", "+", "+", "+", "G1S342G", "+", "+", "+", "...
# $ G1_2_1 <chr> "+", "+", "+", "+", "+", "+", "+", "G1I384M", "+", "G1I384M", ...
# $ G1_2_2 <chr> "+", "+", "+", "+", "+", "+", "+", "G1I384M", "+", "+", "+", "...
# $ G2_1 <chr> "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+...
# $ G2_2 <chr> "G2", "+", "G2", "G2", "G2", "+", "+", "+", "G2", "G2", "+", "...
# $ `Final genotype of APOL1` <chr> "+/G2", "+/+", "+/G2", "+/G2", "+/G2", "+/+", "+/+", "G1GM/G1G...
# $ `no APOL1 Risk Alleles` <dbl> NA, 1, NA, NA, NA, 1, 1, NA, NA, NA, 1, NA, 1, NA, NA, NA, 1, ...
# $ `1 APOL1 Risk Alleles` <dbl> 1, NA, 1, 1, 1, NA, NA, NA, 1, NA, NA, 1, NA, 1, NA, 1, NA, 1,...
# $ `2 APOL1 Risk Alleles` <dbl> NA, NA, NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, NA, NA, 1, NA, N...