根据特定阈值筛选数据框的行
Subset rows based on a specific threshold value
我想根据阈值获取我的数据框的列观察值的子集。我会更详细地向您解释这个问题。
我有一个数据框,其中包含 35 名肺腺癌患者的甲基化水平。这是我的数据的一个子集:
> df.met[1:5,1:5]
A2BP1 A2M A2ML1 A4GALT AAAS
paciente6 0.36184475 0.4555788 0.6422624 0.08051388 0.15013343
paciente7 0.47566878 0.7329827 0.4938048 0.45487573 0.10827520
paciente8 0.17455497 0.7528387 0.5686839 0.37018038 0.12423923
paciente9 0.04830471 0.5166676 0.8878207 0.08881092 0.11779075
paciente10 0.16757806 0.7896194 0.5408747 0.35315243 0.09234602
现在,我需要得到另一个对象(列数相同,但行数更少,且每列保留的行各不相同),其中只包含初始数据框所有列中大于 0.1 的值。
我的意图是得到一个这样的对象(不知道可不可以...):
                A2BP1       A2M     A2ML1     A4GALT       AAAS
paciente6  0.36184475 0.4555788 0.6422624            0.15013343
paciente7  0.47566878 0.7329827 0.4938048 0.45487573 0.10827520
paciente8  0.17455497 0.7528387 0.5686839 0.37018038 0.12423923
paciente9             0.5166676 0.8878207            0.11779075
paciente10 0.16757806 0.7896194 0.5408747 0.35315243
换句话说,我想去掉数据框中小于 0.1 的值。
非常感谢!
您可能需要
# Keep only the rows in which no column holds a value at or below 0.1:
# a row survives when its count of "<= 0.1" cells is exactly zero.
df.met[rowSums(df.met <= 0.1) == 0, , drop = FALSE]
# A2BP1 A2M A2ML1 A4GALT AAAS
#paciente7 0.4756688 0.7329827 0.4938048 0.4548757 0.1082752
#paciente8 0.1745550 0.7528387 0.5686839 0.3701804 0.1242392
更新
基于编辑
# Overwrite every cell at or below the 0.1 threshold with NA; rows and
# columns are all kept, only the individual low values are blanked out.
df.met[df.met <= 0.1] <- NA
df.met
# A2BP1 A2M A2ML1 A4GALT AAAS
#paciente6 0.3618447 0.4555788 0.6422624 NA 0.1501334
#paciente7 0.4756688 0.7329827 0.4938048 0.4548757 0.1082752
#paciente8 0.1745550 0.7528387 0.5686839 0.3701804 0.1242392
#paciente9 NA 0.5166676 0.8878207 NA 0.1177907
#paciente10 0.1675781 0.7896194 0.5408747 0.3531524 NA
使用data.table
library(data.table)#v1.9.5+
# Convert df.met to a data.table in place; the row names are kept as the
# first column ("rn" in the output below). The trailing [] prints the result.
setDT(df.met, keep.rownames = TRUE)[]
# Visit every column except the leading row-name column.
# seq_len(...)[-1L] is empty when there are no value columns, whereas
# 2:ncol(df.met) would count backwards (2, 1) and index a missing column.
for (j in seq_len(ncol(df.met))[-1L]) {
  # which() drops NA comparisons, so only cells known to be <= 0.1 are set.
  set(df.met, i = which(df.met[[j]] <= 0.1), j = j, value = NA)
}
df.met
# rn A2BP1 A2M A2ML1 A4GALT AAAS
#1: paciente6 0.3618447 0.4555788 0.6422624 NA 0.1501334
#2: paciente7 0.4756688 0.7329827 0.4938048 0.4548757 0.1082752
#3: paciente8 0.1745550 0.7528387 0.5686839 0.3701804 0.1242392
#4: paciente9 NA 0.5166676 0.8878207 NA 0.1177907
#5: paciente10 0.1675781 0.7896194 0.5408747 0.3531524 NA
数据
# Example data: five patients (rows) by five genes (columns), with the
# patient identifiers stored as row names.
df.met <- data.frame(
  A2BP1 = c(0.36184475, 0.47566878, 0.17455497, 0.04830471, 0.16757806),
  A2M = c(0.4555788, 0.7329827, 0.7528387, 0.5166676, 0.7896194),
  A2ML1 = c(0.6422624, 0.4938048, 0.5686839, 0.8878207, 0.5408747),
  A4GALT = c(0.08051388, 0.45487573, 0.37018038, 0.08881092, 0.35315243),
  AAAS = c(0.15013343, 0.1082752, 0.12423923, 0.11779075, 0.09234602),
  row.names = c(
    "paciente6", "paciente7", "paciente8", "paciente9", "paciente10"
  )
)
要匹配您想要的输出(值 <= 0.1 替换为空字段),您可以这样做:
library(dplyr)
# Move the row names into a "pacientes" column, then replace every numeric
# cell at or below 0.1 with an empty string (this turns those columns into
# character). as_tibble(rownames = ) and across() stand in for the deprecated
# add_rownames() and mutate_each(funs()); where(is.numeric) keeps the
# threshold comparison away from the character id column.
df.met %>%
  as_tibble(rownames = "pacientes") %>%
  mutate(across(where(is.numeric), function(x) replace(x, x <= 0.1, "")))
给出:
# Source: local data frame [5 x 6]
#
#    pacientes      A2BP1       A2M     A2ML1     A4GALT       AAAS
# 1  paciente6 0.36184475 0.4555788 0.6422624            0.15013343
# 2  paciente7 0.47566878 0.7329827 0.4938048 0.45487573  0.1082752
# 3  paciente8 0.17455497 0.7528387 0.5686839 0.37018038 0.12423923
# 4  paciente9            0.5166676 0.8878207            0.11779075
# 5 paciente10 0.16757806 0.7896194 0.5408747 0.35315243
注意:这会将所有列转换为字符。你应该改为:
# Move the row names into a "pacientes" column, then replace every numeric
# cell at or below 0.1 with NA, which leaves the value columns numeric.
# as_tibble(rownames = ) and across() stand in for the deprecated
# add_rownames() and mutate_each(funs()); where(is.numeric) keeps the
# threshold comparison away from the character id column.
df.met %>%
  as_tibble(rownames = "pacientes") %>%
  mutate(across(where(is.numeric), function(x) replace(x, x <= 0.1, NA)))
这将保留您的初始数据结构(所有列都是数字)
我想根据阈值获取我的数据框的列观察值的子集。我会更详细地向您解释这个问题。
我有一个数据框,其中包含 35 名肺腺癌患者的甲基化水平。这是我的数据的一个子集:
> df.met[1:5,1:5]
A2BP1 A2M A2ML1 A4GALT AAAS
paciente6 0.36184475 0.4555788 0.6422624 0.08051388 0.15013343
paciente7 0.47566878 0.7329827 0.4938048 0.45487573 0.10827520
paciente8 0.17455497 0.7528387 0.5686839 0.37018038 0.12423923
paciente9 0.04830471 0.5166676 0.8878207 0.08881092 0.11779075
paciente10 0.16757806 0.7896194 0.5408747 0.35315243 0.09234602
现在,我需要得到另一个对象(列数相同,但行数更少,且每列保留的行各不相同),其中只包含初始数据框所有列中大于 0.1 的值。
我的意图是得到一个这样的对象(不知道可不可以...):
                A2BP1       A2M     A2ML1     A4GALT       AAAS
paciente6  0.36184475 0.4555788 0.6422624            0.15013343
paciente7  0.47566878 0.7329827 0.4938048 0.45487573 0.10827520
paciente8  0.17455497 0.7528387 0.5686839 0.37018038 0.12423923
paciente9             0.5166676 0.8878207            0.11779075
paciente10 0.16757806 0.7896194 0.5408747 0.35315243
换句话说,我想去掉数据框中小于 0.1 的值。
非常感谢!
您可能需要
# Keep only the rows in which no column holds a value at or below 0.1:
# a row survives when its count of "<= 0.1" cells is exactly zero.
df.met[rowSums(df.met <= 0.1) == 0, , drop = FALSE]
# A2BP1 A2M A2ML1 A4GALT AAAS
#paciente7 0.4756688 0.7329827 0.4938048 0.4548757 0.1082752
#paciente8 0.1745550 0.7528387 0.5686839 0.3701804 0.1242392
更新
基于编辑
# Overwrite every cell at or below the 0.1 threshold with NA; rows and
# columns are all kept, only the individual low values are blanked out.
df.met[df.met <= 0.1] <- NA
df.met
# A2BP1 A2M A2ML1 A4GALT AAAS
#paciente6 0.3618447 0.4555788 0.6422624 NA 0.1501334
#paciente7 0.4756688 0.7329827 0.4938048 0.4548757 0.1082752
#paciente8 0.1745550 0.7528387 0.5686839 0.3701804 0.1242392
#paciente9 NA 0.5166676 0.8878207 NA 0.1177907
#paciente10 0.1675781 0.7896194 0.5408747 0.3531524 NA
使用data.table
library(data.table)#v1.9.5+
# Convert df.met to a data.table in place; the row names are kept as the
# first column ("rn" in the output below). The trailing [] prints the result.
setDT(df.met, keep.rownames = TRUE)[]
# Visit every column except the leading row-name column.
# seq_len(...)[-1L] is empty when there are no value columns, whereas
# 2:ncol(df.met) would count backwards (2, 1) and index a missing column.
for (j in seq_len(ncol(df.met))[-1L]) {
  # which() drops NA comparisons, so only cells known to be <= 0.1 are set.
  set(df.met, i = which(df.met[[j]] <= 0.1), j = j, value = NA)
}
df.met
# rn A2BP1 A2M A2ML1 A4GALT AAAS
#1: paciente6 0.3618447 0.4555788 0.6422624 NA 0.1501334
#2: paciente7 0.4756688 0.7329827 0.4938048 0.4548757 0.1082752
#3: paciente8 0.1745550 0.7528387 0.5686839 0.3701804 0.1242392
#4: paciente9 NA 0.5166676 0.8878207 NA 0.1177907
#5: paciente10 0.1675781 0.7896194 0.5408747 0.3531524 NA
数据
# Example data: five patients (rows) by five genes (columns), with the
# patient identifiers stored as row names.
df.met <- data.frame(
  A2BP1 = c(0.36184475, 0.47566878, 0.17455497, 0.04830471, 0.16757806),
  A2M = c(0.4555788, 0.7329827, 0.7528387, 0.5166676, 0.7896194),
  A2ML1 = c(0.6422624, 0.4938048, 0.5686839, 0.8878207, 0.5408747),
  A4GALT = c(0.08051388, 0.45487573, 0.37018038, 0.08881092, 0.35315243),
  AAAS = c(0.15013343, 0.1082752, 0.12423923, 0.11779075, 0.09234602),
  row.names = c(
    "paciente6", "paciente7", "paciente8", "paciente9", "paciente10"
  )
)
要匹配您想要的输出(值 <= 0.1 替换为空字段),您可以这样做:
library(dplyr)
# Move the row names into a "pacientes" column, then replace every numeric
# cell at or below 0.1 with an empty string (this turns those columns into
# character). as_tibble(rownames = ) and across() stand in for the deprecated
# add_rownames() and mutate_each(funs()); where(is.numeric) keeps the
# threshold comparison away from the character id column.
df.met %>%
  as_tibble(rownames = "pacientes") %>%
  mutate(across(where(is.numeric), function(x) replace(x, x <= 0.1, "")))
给出:
# Source: local data frame [5 x 6]
#
#    pacientes      A2BP1       A2M     A2ML1     A4GALT       AAAS
# 1  paciente6 0.36184475 0.4555788 0.6422624            0.15013343
# 2  paciente7 0.47566878 0.7329827 0.4938048 0.45487573  0.1082752
# 3  paciente8 0.17455497 0.7528387 0.5686839 0.37018038 0.12423923
# 4  paciente9            0.5166676 0.8878207            0.11779075
# 5 paciente10 0.16757806 0.7896194 0.5408747 0.35315243
注意:这会将所有列转换为字符。你应该改为:
# Move the row names into a "pacientes" column, then replace every numeric
# cell at or below 0.1 with NA, which leaves the value columns numeric.
# as_tibble(rownames = ) and across() stand in for the deprecated
# add_rownames() and mutate_each(funs()); where(is.numeric) keeps the
# threshold comparison away from the character id column.
df.met %>%
  as_tibble(rownames = "pacientes") %>%
  mutate(across(where(is.numeric), function(x) replace(x, x <= 0.1, NA)))
这将保留您的初始数据结构(所有列都是数字)