如何在 R 中的函数中计算两个数据集之间的差异时保留数据集的 ID
How to Keep the id of a dataset when calculating the difference between two datasets within a function in R
我有一个函数,可以计算两个数据集的行之间的差异(基于相同的列)。我想在计算后保留 id,因为之后与另一张表合并时需要用到它。我不知道这一步该如何实现。下面是数据和函数。
# data frame for recipients
# Recipient identifiers (integers 1..4) and their blood groups.
IDr <- seq_len(4)
Blood_type_r <- c("A", "B", "AB", "O")
# One row per recipient; columns A-E hold 0/1 values written out literally.
data_R <- data.frame(
  IDr,
  Blood_type_r,
  A = c(0, 0, 0, 0),
  B = c(0, 0, 0, 1),
  C = c(1, 1, 1, 0),
  D = c(1, 1, 1, 1),
  E = c(0, 0, 1, 0),
  stringsAsFactors = FALSE
)
data_R
IDr Blood_type_r A B C D E
1 1 A 0 0 1 1 0
2 2 B 0 0 1 1 0
3 3 AB 0 0 1 1 1
4 4 O 0 1 0 1 0
# data frame for donors
# Donor identifiers (integers 1..8); each blood group and weight covers two donors.
IDd <- seq_len(8)
Blood_type_d <- rep(c("A", "B", "AB", "O"), each = 2)
WD <- rep(c(0.25, 0.125, 0.125, 0.5), each = 2)
# One row per donor; columns A-E hold 0/1 values, WD is the per-donor weight.
data_D <- data.frame(
  IDd,
  Blood_type_d,
  A = rep(c(0, 1), times = c(6, 2)),
  B = rep(c(0, 1), times = c(6, 2)),
  C = rep(c(1, 0), times = c(7, 1)),
  D = rep(1, 8),
  E = c(0, 0, 0, 0, 0, 0, 1, 0),
  WD,
  stringsAsFactors = FALSE
)
data_D
IDd Blood_type_d A B C D E WD
1 1 A 0 0 1 1 0 0.250
2 2 A 0 0 1 1 0 0.250
3 3 B 0 0 1 1 0 0.125
4 4 B 0 0 1 1 0 0.125
5 5 AB 0 0 1 1 0 0.125
6 6 AB 0 0 1 1 0 0.125
7 7 O 1 1 1 1 1 0.500
8 8 O 1 1 0 1 0 0.500
# function
# Column-wise difference between every row of D and row i of R.
#
# D, R      : tables with matching columns (coerced with as.data.frame).
# i         : index of the row of R that is subtracted from each row of D.
# threshold : rows whose mismatch exceeds this value are dropped.
#
# Negative differences are replaced by 0, the per-row sum is stored in a
# `mismatch` column, and only rows with mismatch <= threshold are returned.
soustraction.i <- function(D, R, i, threshold) {
  donors <- as.data.frame(D)
  recipient_row <- as.data.frame(R)[i, ]
  gaps <- map2_df(donors, recipient_row, `-`)
  gaps[gaps < 0] <- 0
  gaps$mismatch <- rowSums(gaps)
  within_limit <- which(gaps$mismatch <= threshold)
  gaps[within_limit, ]
}
soustraction.i(data_D[,3:7],data_R[,3:7],1,3)
# A tibble: 8 x 6
A B C D E mismatch
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0
2 0 0 0 0 0 0
3 0 0 0 0 0 0
4 0 0 0 0 0 0
5 0 0 0 0 0 0
6 0 0 0 0 0 0
7 1 1 0 0 1 3
8 1 1 0 0 0 2
我想要这样的输出(保留捐赠者的 IDd),但我不知道该怎么做,因为作为参数传入时,我的两个数据集必须具有相同的列数。例如,如果我将阈值设置为 3,结果中应该包含捐赠者表中的所有 IDd。
IDd A B C D E mismatch
1 1 0 0 0 0 0 0
2 2 0 0 0 0 0 0
3 3 0 0 0 0 0 0
4 4 0 0 0 0 0 0
5 5 0 0 0 0 0 0
6 6 0 0 0 0 0 0
7 7 1 1 0 0 1 3
8 8 1 1 0 0 0 2
感谢任何帮助,谢谢。
您可以将 ID 作为参数传入:
# Recipients: integer ids 1..4, blood groups, and 0/1 columns A-E.
IDr <- seq_len(4)
Blood_type_r <- c("A", "B", "AB", "O")
data_R <- data.frame(
  IDr,
  Blood_type_r,
  A = c(0, 0, 0, 0),
  B = c(0, 0, 0, 1),
  C = c(1, 1, 1, 0),
  D = c(1, 1, 1, 1),
  E = c(0, 0, 1, 0),
  stringsAsFactors = FALSE
)
# Donors: integer ids 1..8; each blood group and weight covers two donors.
IDd <- seq_len(8)
Blood_type_d <- rep(c("A", "B", "AB", "O"), each = 2)
WD <- rep(c(0.25, 0.125, 0.125, 0.5), each = 2)
data_D <- data.frame(
  IDd,
  Blood_type_d,
  A = rep(c(0, 1), times = c(6, 2)),
  B = rep(c(0, 1), times = c(6, 2)),
  C = rep(c(1, 0), times = c(7, 1)),
  D = rep(1, 8),
  E = c(0, 0, 0, 0, 0, 0, 1, 0),
  WD,
  stringsAsFactors = FALSE
)
# Column-wise difference between every row of D and row i of R, with the
# caller-supplied ids attached as a leading `IDd` column.
#
# D, R      : tables with matching columns (coerced with as.data.frame).
# i         : index of the row of R that is subtracted from each row of D.
# threshold : rows whose mismatch exceeds this value are dropped.
# id        : one identifier per row of D; stops with an error otherwise.
#
# Negative differences are replaced by 0 and the per-row sum is stored in a
# `mismatch` column. Returns the rows with mismatch <= threshold.
soustraction.i <- function(D, R, i, threshold, id) {
  if (nrow(D) != length(id)) {
    stop("Length of id has to be same as number of rows of D\n")
  }
  D <- as.data.frame(D)
  R <- as.data.frame(R)
  dif <- purrr::map2_df(D, R[i, ], `-`)
  dif[dif < 0] <- 0
  dif$mismatch <- rowSums(dif)
  # Filter the rows and the ids with the same index. Attaching the full-length
  # `id` after filtering fails as soon as the threshold removes any row.
  keep <- which(dif$mismatch <= threshold)
  dif <- dif[keep, ]
  tibble::add_column(dif, IDd = id[keep], .before = 1)
}
soustraction.i(data_D[,3:7],data_R[,3:7],1,3, id=IDd)
# # A tibble: 8 x 7
# IDd A B C D E mismatch
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 0 0 0 0 0 0
# 2 2 0 0 0 0 0 0
# 3 3 0 0 0 0 0 0
# 4 4 0 0 0 0 0 0
# 5 5 0 0 0 0 0 0
# 6 6 0 0 0 0 0 0
# 7 7 1 1 0 0 1 3
# 8 8 1 1 0 0 0 2
要在输出中包含 Id 列,您应该首先把它包含在输入中。试试这个函数:
# Column-wise difference between every row of D and row i of R, keeping the
# first column of D (the id) in the result.
#
# D         : table whose first column is the id; the remaining columns are
#             compared against R (coerced with as.data.frame).
# R         : table with the columns to subtract (coerced with as.data.frame).
# i         : index of the row of R that is subtracted from each row of D.
# threshold : rows whose mismatch exceeds this value are dropped.
#
# Negative differences are replaced by 0 and the per-row sum is stored in a
# `mismatch` column. The id column is bound on before filtering, so ids stay
# aligned with their rows.
soustraction.i <- function(D, R, i, threshold) {
  donors <- as.data.frame(D)
  recipient_row <- as.data.frame(R)[i, ]
  gaps <- purrr::map2_df(donors[-1], recipient_row, `-`)
  gaps[gaps < 0] <- 0
  gaps$mismatch <- rowSums(gaps)
  with_id <- cbind(ID = donors[1], gaps)
  within_limit <- which(with_id$mismatch <= threshold)
  with_id[within_limit, ]
}
soustraction.i(data_D[,c(1, 3:7)],data_R[,3:7],1,3)
# IDd A B C D E mismatch
#1 1 0 0 0 0 0 0
#2 2 0 0 0 0 0 0
#3 3 0 0 0 0 0 0
#4 4 0 0 0 0 0 0
#5 5 0 0 0 0 0 0
#6 6 0 0 0 0 0 0
#7 7 1 1 0 0 1 3
#8 8 1 1 0 0 0 2
soustraction.i(data_D[,c(1, 3:7)],data_R[,3:7],1,2)
# IDd A B C D E mismatch
#1 1 0 0 0 0 0 0
#2 2 0 0 0 0 0 0
#3 3 0 0 0 0 0 0
#4 4 0 0 0 0 0 0
#5 5 0 0 0 0 0 0
#6 6 0 0 0 0 0 0
#8 8 1 1 0 0 0 2
请注意,我假设 Id 列是 data_D 中的第一列。
我有一个函数,可以计算两个数据集的行之间的差异(基于相同的列)。我想在计算后保留 id,因为之后与另一张表合并时需要用到它。我不知道这一步该如何实现。下面是数据和函数。
# data frame for recipients
# Recipient identifiers (integers 1..4) and their blood groups.
IDr <- seq_len(4)
Blood_type_r <- c("A", "B", "AB", "O")
# One row per recipient; columns A-E hold 0/1 values written out literally.
data_R <- data.frame(
  IDr,
  Blood_type_r,
  A = c(0, 0, 0, 0),
  B = c(0, 0, 0, 1),
  C = c(1, 1, 1, 0),
  D = c(1, 1, 1, 1),
  E = c(0, 0, 1, 0),
  stringsAsFactors = FALSE
)
data_R
IDr Blood_type_r A B C D E
1 1 A 0 0 1 1 0
2 2 B 0 0 1 1 0
3 3 AB 0 0 1 1 1
4 4 O 0 1 0 1 0
# data frame for donors
# Donor identifiers (integers 1..8); each blood group and weight covers two donors.
IDd <- seq_len(8)
Blood_type_d <- rep(c("A", "B", "AB", "O"), each = 2)
WD <- rep(c(0.25, 0.125, 0.125, 0.5), each = 2)
# One row per donor; columns A-E hold 0/1 values, WD is the per-donor weight.
data_D <- data.frame(
  IDd,
  Blood_type_d,
  A = rep(c(0, 1), times = c(6, 2)),
  B = rep(c(0, 1), times = c(6, 2)),
  C = rep(c(1, 0), times = c(7, 1)),
  D = rep(1, 8),
  E = c(0, 0, 0, 0, 0, 0, 1, 0),
  WD,
  stringsAsFactors = FALSE
)
data_D
IDd Blood_type_d A B C D E WD
1 1 A 0 0 1 1 0 0.250
2 2 A 0 0 1 1 0 0.250
3 3 B 0 0 1 1 0 0.125
4 4 B 0 0 1 1 0 0.125
5 5 AB 0 0 1 1 0 0.125
6 6 AB 0 0 1 1 0 0.125
7 7 O 1 1 1 1 1 0.500
8 8 O 1 1 0 1 0 0.500
# function
# Column-wise difference between every row of D and row i of R.
#
# D, R      : tables with matching columns (coerced with as.data.frame).
# i         : index of the row of R that is subtracted from each row of D.
# threshold : rows whose mismatch exceeds this value are dropped.
#
# Negative differences are replaced by 0, the per-row sum is stored in a
# `mismatch` column, and only rows with mismatch <= threshold are returned.
soustraction.i <- function(D, R, i, threshold) {
  donors <- as.data.frame(D)
  recipient_row <- as.data.frame(R)[i, ]
  gaps <- map2_df(donors, recipient_row, `-`)
  gaps[gaps < 0] <- 0
  gaps$mismatch <- rowSums(gaps)
  within_limit <- which(gaps$mismatch <= threshold)
  gaps[within_limit, ]
}
soustraction.i(data_D[,3:7],data_R[,3:7],1,3)
# A tibble: 8 x 6
A B C D E mismatch
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0
2 0 0 0 0 0 0
3 0 0 0 0 0 0
4 0 0 0 0 0 0
5 0 0 0 0 0 0
6 0 0 0 0 0 0
7 1 1 0 0 1 3
8 1 1 0 0 0 2
我想要这样的输出(保留捐赠者的 IDd),但我不知道该怎么做,因为作为参数传入时,我的两个数据集必须具有相同的列数。例如,如果我将阈值设置为 3,结果中应该包含捐赠者表中的所有 IDd。
IDd A B C D E mismatch
1 1 0 0 0 0 0 0
2 2 0 0 0 0 0 0
3 3 0 0 0 0 0 0
4 4 0 0 0 0 0 0
5 5 0 0 0 0 0 0
6 6 0 0 0 0 0 0
7 7 1 1 0 0 1 3
8 8 1 1 0 0 0 2
感谢任何帮助,谢谢。
您可以将 ID 作为参数传入:
# Recipients: integer ids 1..4, blood groups, and 0/1 columns A-E.
IDr <- seq_len(4)
Blood_type_r <- c("A", "B", "AB", "O")
data_R <- data.frame(
  IDr,
  Blood_type_r,
  A = c(0, 0, 0, 0),
  B = c(0, 0, 0, 1),
  C = c(1, 1, 1, 0),
  D = c(1, 1, 1, 1),
  E = c(0, 0, 1, 0),
  stringsAsFactors = FALSE
)
# Donors: integer ids 1..8; each blood group and weight covers two donors.
IDd <- seq_len(8)
Blood_type_d <- rep(c("A", "B", "AB", "O"), each = 2)
WD <- rep(c(0.25, 0.125, 0.125, 0.5), each = 2)
data_D <- data.frame(
  IDd,
  Blood_type_d,
  A = rep(c(0, 1), times = c(6, 2)),
  B = rep(c(0, 1), times = c(6, 2)),
  C = rep(c(1, 0), times = c(7, 1)),
  D = rep(1, 8),
  E = c(0, 0, 0, 0, 0, 0, 1, 0),
  WD,
  stringsAsFactors = FALSE
)
# Column-wise difference between every row of D and row i of R, with the
# caller-supplied ids attached as a leading `IDd` column.
#
# D, R      : tables with matching columns (coerced with as.data.frame).
# i         : index of the row of R that is subtracted from each row of D.
# threshold : rows whose mismatch exceeds this value are dropped.
# id        : one identifier per row of D; stops with an error otherwise.
#
# Negative differences are replaced by 0 and the per-row sum is stored in a
# `mismatch` column. Returns the rows with mismatch <= threshold.
soustraction.i <- function(D, R, i, threshold, id) {
  if (nrow(D) != length(id)) {
    stop("Length of id has to be same as number of rows of D\n")
  }
  D <- as.data.frame(D)
  R <- as.data.frame(R)
  dif <- purrr::map2_df(D, R[i, ], `-`)
  dif[dif < 0] <- 0
  dif$mismatch <- rowSums(dif)
  # Filter the rows and the ids with the same index. Attaching the full-length
  # `id` after filtering fails as soon as the threshold removes any row.
  keep <- which(dif$mismatch <= threshold)
  dif <- dif[keep, ]
  tibble::add_column(dif, IDd = id[keep], .before = 1)
}
soustraction.i(data_D[,3:7],data_R[,3:7],1,3, id=IDd)
# # A tibble: 8 x 7
# IDd A B C D E mismatch
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 0 0 0 0 0 0
# 2 2 0 0 0 0 0 0
# 3 3 0 0 0 0 0 0
# 4 4 0 0 0 0 0 0
# 5 5 0 0 0 0 0 0
# 6 6 0 0 0 0 0 0
# 7 7 1 1 0 0 1 3
# 8 8 1 1 0 0 0 2
要在输出中包含 Id 列,您应该首先把它包含在输入中。试试这个函数:
# Column-wise difference between every row of D and row i of R, keeping the
# first column of D (the id) in the result.
#
# D         : table whose first column is the id; the remaining columns are
#             compared against R (coerced with as.data.frame).
# R         : table with the columns to subtract (coerced with as.data.frame).
# i         : index of the row of R that is subtracted from each row of D.
# threshold : rows whose mismatch exceeds this value are dropped.
#
# Negative differences are replaced by 0 and the per-row sum is stored in a
# `mismatch` column. The id column is bound on before filtering, so ids stay
# aligned with their rows.
soustraction.i <- function(D, R, i, threshold) {
  donors <- as.data.frame(D)
  recipient_row <- as.data.frame(R)[i, ]
  gaps <- purrr::map2_df(donors[-1], recipient_row, `-`)
  gaps[gaps < 0] <- 0
  gaps$mismatch <- rowSums(gaps)
  with_id <- cbind(ID = donors[1], gaps)
  within_limit <- which(with_id$mismatch <= threshold)
  with_id[within_limit, ]
}
soustraction.i(data_D[,c(1, 3:7)],data_R[,3:7],1,3)
# IDd A B C D E mismatch
#1 1 0 0 0 0 0 0
#2 2 0 0 0 0 0 0
#3 3 0 0 0 0 0 0
#4 4 0 0 0 0 0 0
#5 5 0 0 0 0 0 0
#6 6 0 0 0 0 0 0
#7 7 1 1 0 0 1 3
#8 8 1 1 0 0 0 2
soustraction.i(data_D[,c(1, 3:7)],data_R[,3:7],1,2)
# IDd A B C D E mismatch
#1 1 0 0 0 0 0 0
#2 2 0 0 0 0 0 0
#3 3 0 0 0 0 0 0
#4 4 0 0 0 0 0 0
#5 5 0 0 0 0 0 0
#6 6 0 0 0 0 0 0
#8 8 1 1 0 0 0 2
请注意,我假设 Id 列是 data_D 中的第一列。