在 R 中使用 data.table,以对照组(列中特定范围的数据)为基准进行归一化
Normalization using a control (specific range of data in the column) using data.table in R
我正在尝试以参照或对照组为基准,对一个 data.table(CRAN 上的 R 包)中包含的多个特征进行数据归一化。
我的 table 可能看起来像这样(有更多行和更多 features/grouping 列):
# Toy data: two groups (P1, P2), each with four "samp" rows followed by two
# "CRTL" rows, an ID running 1..6 within each group, and two feature columns
# drawn with rnorm() (no seed is set, so values differ between runs).
myDF <- data.table(
  Grouping = rep(c("P1", "P2"), each = 6),
  type = rep(rep(c("samp", "CRTL"), times = c(4, 2)), times = 2),
  ID = rep(seq_len(6), times = 2),
  feat1 = rnorm(12),
  feat2 = rnorm(12)
)
这会生成如下的 data.table(以下是本次运行的输出):
Grouping type ID feat1 feat2
1: P1 samp 1 0.9852 0.24133
2: P1 samp 2 0.2358 1.26750
3: P1 samp 3 1.2034 2.19410
4: P1 samp 4 0.5468 -0.42462
5: P1 CRTL 5 0.3997 0.95686
6: P1 CRTL 6 0.9915 -1.41417
7: P2 samp 1 0.6461 -1.19252
8: P2 samp 2 0.7926 -0.68735
9: P2 samp 3 0.9408 0.07738
10: P2 samp 4 0.2759 1.37948
11: P2 CRTL 5 1.0898 -0.07205
12: P2 CRTL 6 0.5325 1.21850
我想进行归一化:对于每个 Grouping 中的每个 type,将 feat1 和 feat2 的中位数(在我的实际情况中,特征列表会很长)除以同一 Grouping 中 type 为 'CRTL' 的中位数(即以 'CRTL' 为基准进行归一化)。
我能够用下面的代码实现这一点,但我希望有更优雅(也更快)的方法来完成。以下是我使用的代码:
# Columns that define a group, and the feature columns to normalize.
cols_grouping <- c("Grouping", "type")
cols_features <- c("feat1", "feat2")

# Median of every feature column per (Grouping, type).
# The NA-removal argument of median() is spelled `na.rm`. The previous
# spelling `rm.NA` is not an argument of median(), so NAs were never removed
# and a single NA in a group made that group's median NA.
setkeyv(myDF, "Grouping")
myDF_norm <- myDF[, lapply(.SD, median, na.rm = TRUE),
                  by = cols_grouping, .SDcols = cols_features]
# Key the summary table on Grouping for the keyed self-join that follows.
setkeyv(myDF_norm, "Grouping")
# Divide each feature column by its control counterpart and drop the latter.
#
# For every name in `cols_features` (read from the enclosing environment, not
# passed as an argument) the column "<name>" is replaced by
# "<name>" / "i.<name>", and the "i.<name>" column is then removed.
#
# sub_table: a table holding both the feature columns and their "i."-prefixed
#            control columns.
# Returns the modified table.
crt_normalization <- function(sub_table) {
  for (feature in cols_features) {
    control_col <- paste0("i.", feature)
    ratio <- sub_table[[feature]] / sub_table[[control_col]]
    sub_table[[feature]] <- ratio
    sub_table[[control_col]] <- NULL
  }
  sub_table
}
# Self-join myDF_norm against its own CRTL rows (Grouping plus the feature
# columns only), then normalize within each Grouping.
# No `on=` is given, so the match relies on the "Grouping" key set on
# myDF_norm by setkeyv() beforehand.
# crt_normalization() expects the control values in "i."-prefixed columns,
# divides each feature by them and drops those columns.
myDF_norm=myDF_norm[
myDF_norm[type == "CRTL",
c("Grouping",cols_features),
with=FALSE]
][,crt_normalization(.SD),by='Grouping']
这会返回正确归一化后的表:
Grouping type feat1 feat2
1: P1 samp 2.0629 -3.2994
2: P1 CRTL 1.0000 1.0000
3: P2 samp 0.2282 -0.5321
4: P2 CRTL 1.0000 1.0000
希望您有一种依赖于 data.table 的方法,它可能更优雅、更高效
为了比较结果,最好定义一个种子。我几乎只使用一些基本的合并功能。不知道这个版本是不是比你的快
# Fix the RNG seed so the rnorm() draws, and therefore every result derived
# from them, are reproducible.
set.seed(123)

# Toy data: two groups (P1, P2), each with four "samp" rows followed by two
# "CRTL" rows, an ID running 1..6 within each group, and two random features.
myDF <- data.table(
  Grouping = rep(c("P1", "P2"), each = 6),
  type = rep(rep(c("samp", "CRTL"), times = c(4, 2)), times = 2),
  ID = rep(seq_len(6), times = 2),
  feat1 = rnorm(12),
  feat2 = rnorm(12)
)
# Merge the per-(Grouping, type) medians (x) with the medians of the CRTL rows
# only (y) on Grouping, then divide each x median by the matching y median.
# Columns present on both sides come back with ".x" / ".y" suffixes.
merge(
  x = myDF[
    ,
    list(Median.feat1 = median(feat1), Median.feat2 = median(feat2)),
    by = list(Grouping, type)
  ],
  y = myDF[
    type %like% "CRTL",
    list(Median.feat1 = median(feat1), Median.feat2 = median(feat2)),
    by = list(Grouping, type)
  ],
  by.x = "Grouping",
  by.y = "Grouping"
)[
  ,
  list(
    Grouping,
    type = type.x,
    Median.feat1 = Median.feat1.x / Median.feat1.y,
    Median.feat2 = Median.feat2.x / Median.feat2.y
  )
]
merge 的结果如下表所示:
Grouping type.x Median.feat1.x Median.feat2.x type.y Median.feat1.y Median.feat2.y
1: P1 samp 0.1510875 0.1236681 CRTL 0.7733013 0.4074872
2: P1 CRTL 0.7733013 0.4074872 CRTL 0.7733013 0.4074872
3: P2 samp 0.2108585 -0.1386234 CRTL -0.3634114 -1.1849065
4: P2 CRTL -0.3634114 -1.1849065 CRTL -0.3634114 -1.1849065
最终结果如下所示:
Grouping type Median.feat1 Median.feat2
1: P1 samp 0.1953798 0.3034897
2: P1 CRTL 1.0000000 1.0000000
3: P2 samp -0.5802199 0.1169910
4: P2 CRTL 1.0000000 1.0000000
这似乎有效
# Median of every feature per (Grouping, type), followed by a self-join
# against the CRTL rows on Grouping. Inside the join, the CRTL-side values are
# addressed with the "i." prefix and used as divisors.
DT <- setDT(myDF)
DT1 <- DT[, lapply(.SD, median), by = cols_grouping, .SDcols = cols_features]
DT1[
  DT1[type == "CRTL"],
  list(
    Grouping,
    type,
    med_feat1 = feat1 / i.feat1,
    med_feat2 = feat2 / i.feat2
  ),
  on = "Grouping"
]
Grouping type med_feat1 med_feat2
1: P1 samp 1.10121 -3.29936
2: P1 CRTL 1.00000 1.00000
3: P2 samp 0.88683 -0.53205
4: P2 CRTL 1.00000 1.00000
结果与您发布的不同,但是当我运行您的代码时,我得到了相同的结果
根据您处理大量变量的要求更新了选项。仍然没有避免在进行自连接时创建多个 'i.' 列的开销。
# Variant for an arbitrary number of feature columns named feat1..featN.
DT <- setDT(myDF)
cols_grouping <- c("Grouping", "type")
N <- 2 # number of 'feat' variables
cols_features <- paste0("feat", seq_len(N))
# Names the CRTL-side feature columns receive in the self-join below.
cols_features_i <- paste0("i.", cols_features)

# Median of every feature per (Grouping, type).
DT1 <- DT[, lapply(.SD, median), by = cols_grouping, .SDcols = cols_features]

# Self-join against the CRTL rows on Grouping; CRTL-side duplicates of the
# non-join columns come back with an "i." prefix.
DT2 <- DT1[DT1[type == "CRTL"], on = "Grouping"]

# Add med_<feature> = <feature> / i.<feature> for every feature, by reference.
DT2[, paste0("med_", cols_features) :=
      Map(`/`, mget(cols_features), mget(cols_features_i))]

# Drop the join-generated helper columns. The pattern is anchored to the
# literal "i." prefix: the previous "^i" matched every column whose name merely
# starts with "i" and would silently delete real columns with such names.
DT2[, grep("^i\\.", colnames(DT2), value = TRUE) := NULL]
> DT2
Grouping type feat1 feat2 med_feat1 med_feat2
1: P1 samp 0.19585 0.562563 0.31809 -0.38011
2: P1 CRTL 0.61570 -1.479994 1.00000 1.00000
3: P2 samp 0.19385 0.087063 -2.66163 -0.99856
4: P2 CRTL -0.07283 -0.087189 1.00000 1.00000
必须有一个更优雅的答案...
我正在尝试以参照或对照组为基准,对一个 data.table(CRAN 上的 R 包)中包含的多个特征进行数据归一化。
我的 table 可能看起来像这样(有更多行和更多 features/grouping 列):
# Toy data: two groups (P1, P2), each with four "samp" rows followed by two
# "CRTL" rows, an ID running 1..6 within each group, and two feature columns
# drawn with rnorm() (no seed is set, so values differ between runs).
myDF <- data.table(
  Grouping = rep(c("P1", "P2"), each = 6),
  type = rep(rep(c("samp", "CRTL"), times = c(4, 2)), times = 2),
  ID = rep(seq_len(6), times = 2),
  feat1 = rnorm(12),
  feat2 = rnorm(12)
)
这会生成如下的 data.table(以下是本次运行的输出):
Grouping type ID feat1 feat2
1: P1 samp 1 0.9852 0.24133
2: P1 samp 2 0.2358 1.26750
3: P1 samp 3 1.2034 2.19410
4: P1 samp 4 0.5468 -0.42462
5: P1 CRTL 5 0.3997 0.95686
6: P1 CRTL 6 0.9915 -1.41417
7: P2 samp 1 0.6461 -1.19252
8: P2 samp 2 0.7926 -0.68735
9: P2 samp 3 0.9408 0.07738
10: P2 samp 4 0.2759 1.37948
11: P2 CRTL 5 1.0898 -0.07205
12: P2 CRTL 6 0.5325 1.21850
我想进行归一化:对于每个 Grouping 中的每个 type,将 feat1 和 feat2 的中位数(在我的实际情况中,特征列表会很长)除以同一 Grouping 中 type 为 'CRTL' 的中位数(即以 'CRTL' 为基准进行归一化)。
我能够用下面的代码实现这一点,但我希望有更优雅(也更快)的方法来完成。以下是我使用的代码:
# Columns that define a group, and the feature columns to normalize.
cols_grouping <- c("Grouping", "type")
cols_features <- c("feat1", "feat2")

# Median of every feature column per (Grouping, type).
# The NA-removal argument of median() is spelled `na.rm`. The previous
# spelling `rm.NA` is not an argument of median(), so NAs were never removed
# and a single NA in a group made that group's median NA.
setkeyv(myDF, "Grouping")
myDF_norm <- myDF[, lapply(.SD, median, na.rm = TRUE),
                  by = cols_grouping, .SDcols = cols_features]
# Key the summary table on Grouping for the keyed self-join that follows.
setkeyv(myDF_norm, "Grouping")
# Divide each feature column by its control counterpart and drop the latter.
#
# For every name in `cols_features` (read from the enclosing environment, not
# passed as an argument) the column "<name>" is replaced by
# "<name>" / "i.<name>", and the "i.<name>" column is then removed.
#
# sub_table: a table holding both the feature columns and their "i."-prefixed
#            control columns.
# Returns the modified table.
crt_normalization <- function(sub_table) {
  for (feature in cols_features) {
    control_col <- paste0("i.", feature)
    ratio <- sub_table[[feature]] / sub_table[[control_col]]
    sub_table[[feature]] <- ratio
    sub_table[[control_col]] <- NULL
  }
  sub_table
}
# Self-join myDF_norm against its own CRTL rows (Grouping plus the feature
# columns only), then normalize within each Grouping.
# No `on=` is given, so the match relies on the "Grouping" key set on
# myDF_norm by setkeyv() beforehand.
# crt_normalization() expects the control values in "i."-prefixed columns,
# divides each feature by them and drops those columns.
myDF_norm=myDF_norm[
myDF_norm[type == "CRTL",
c("Grouping",cols_features),
with=FALSE]
][,crt_normalization(.SD),by='Grouping']
这会返回正确归一化后的表:
Grouping type feat1 feat2
1: P1 samp 2.0629 -3.2994
2: P1 CRTL 1.0000 1.0000
3: P2 samp 0.2282 -0.5321
4: P2 CRTL 1.0000 1.0000
希望您有一种依赖于 data.table 的方法,它可能更优雅、更高效
为了比较结果,最好定义一个种子。我几乎只使用一些基本的合并功能。不知道这个版本是不是比你的快
# Fix the RNG seed so the rnorm() draws, and therefore every result derived
# from them, are reproducible.
set.seed(123)

# Toy data: two groups (P1, P2), each with four "samp" rows followed by two
# "CRTL" rows, an ID running 1..6 within each group, and two random features.
myDF <- data.table(
  Grouping = rep(c("P1", "P2"), each = 6),
  type = rep(rep(c("samp", "CRTL"), times = c(4, 2)), times = 2),
  ID = rep(seq_len(6), times = 2),
  feat1 = rnorm(12),
  feat2 = rnorm(12)
)
# Merge the per-(Grouping, type) medians (x) with the medians of the CRTL rows
# only (y) on Grouping, then divide each x median by the matching y median.
# Columns present on both sides come back with ".x" / ".y" suffixes.
merge(
  x = myDF[
    ,
    list(Median.feat1 = median(feat1), Median.feat2 = median(feat2)),
    by = list(Grouping, type)
  ],
  y = myDF[
    type %like% "CRTL",
    list(Median.feat1 = median(feat1), Median.feat2 = median(feat2)),
    by = list(Grouping, type)
  ],
  by.x = "Grouping",
  by.y = "Grouping"
)[
  ,
  list(
    Grouping,
    type = type.x,
    Median.feat1 = Median.feat1.x / Median.feat1.y,
    Median.feat2 = Median.feat2.x / Median.feat2.y
  )
]
merge 的结果如下表所示:
Grouping type.x Median.feat1.x Median.feat2.x type.y Median.feat1.y Median.feat2.y
1: P1 samp 0.1510875 0.1236681 CRTL 0.7733013 0.4074872
2: P1 CRTL 0.7733013 0.4074872 CRTL 0.7733013 0.4074872
3: P2 samp 0.2108585 -0.1386234 CRTL -0.3634114 -1.1849065
4: P2 CRTL -0.3634114 -1.1849065 CRTL -0.3634114 -1.1849065
最终结果如下所示:
Grouping type Median.feat1 Median.feat2
1: P1 samp 0.1953798 0.3034897
2: P1 CRTL 1.0000000 1.0000000
3: P2 samp -0.5802199 0.1169910
4: P2 CRTL 1.0000000 1.0000000
这似乎有效
# Median of every feature per (Grouping, type), followed by a self-join
# against the CRTL rows on Grouping. Inside the join, the CRTL-side values are
# addressed with the "i." prefix and used as divisors.
DT <- setDT(myDF)
DT1 <- DT[, lapply(.SD, median), by = cols_grouping, .SDcols = cols_features]
DT1[
  DT1[type == "CRTL"],
  list(
    Grouping,
    type,
    med_feat1 = feat1 / i.feat1,
    med_feat2 = feat2 / i.feat2
  ),
  on = "Grouping"
]
Grouping type med_feat1 med_feat2
1: P1 samp 1.10121 -3.29936
2: P1 CRTL 1.00000 1.00000
3: P2 samp 0.88683 -0.53205
4: P2 CRTL 1.00000 1.00000
结果与您发布的不同,但是当我运行您的代码时,我得到了相同的结果
根据您处理大量变量的要求更新了选项。仍然没有避免在进行自连接时创建多个 'i.' 列的开销。
# Variant for an arbitrary number of feature columns named feat1..featN.
DT <- setDT(myDF)
cols_grouping <- c("Grouping", "type")
N <- 2 # number of 'feat' variables
cols_features <- paste0("feat", seq_len(N))
# Names the CRTL-side feature columns receive in the self-join below.
cols_features_i <- paste0("i.", cols_features)

# Median of every feature per (Grouping, type).
DT1 <- DT[, lapply(.SD, median), by = cols_grouping, .SDcols = cols_features]

# Self-join against the CRTL rows on Grouping; CRTL-side duplicates of the
# non-join columns come back with an "i." prefix.
DT2 <- DT1[DT1[type == "CRTL"], on = "Grouping"]

# Add med_<feature> = <feature> / i.<feature> for every feature, by reference.
DT2[, paste0("med_", cols_features) :=
      Map(`/`, mget(cols_features), mget(cols_features_i))]

# Drop the join-generated helper columns. The pattern is anchored to the
# literal "i." prefix: the previous "^i" matched every column whose name merely
# starts with "i" and would silently delete real columns with such names.
DT2[, grep("^i\\.", colnames(DT2), value = TRUE) := NULL]
> DT2
Grouping type feat1 feat2 med_feat1 med_feat2
1: P1 samp 0.19585 0.562563 0.31809 -0.38011
2: P1 CRTL 0.61570 -1.479994 1.00000 1.00000
3: P2 samp 0.19385 0.087063 -2.66163 -0.99856
4: P2 CRTL -0.07283 -0.087189 1.00000 1.00000
必须有一个更优雅的答案...