R保留所有值都小于阈值的行
R keep rows where all values are < a threshold
我有一个包含 19 列、2000 多行的 data.frame。第 1 列是因变量,V1:V17 列是自变量。我只想保留每个自变量的值都在 0 到 0.30 之间的行。但是,每一行中自变量值的个数并不相同。缺失的位置用 NA 填充,而我希望在筛选时基本上“忽略”这些 NA 值。
示例数据布局:
可重现样本:
# Reproducible sample (dput() output): a 12-row, 19-column data.frame with
# row names 9:20. Column 1 is the factor `formula_vec1`; the remaining columns
# (`(Intercept)`, SlopeMIN, V3..V18) are numeric, and V9..V18 are entirely NA.
dfY <- structure(list(formula_vec1 = structure(c(9L, 3L, 12L, 6L, 11L,
5L, 8L, 2L, 7L, 1L, 10L, 4L), .Label = c("BQbinary10BQBAPA ~ BedrockGrouped",
"BQbinary10BQBAPA ~ FlowCategory2", "BQbinary10BQBAPA ~ MixedForBinary",
"BQbinary10BQBAPA ~ SurfGeoCode", "BQbinary10BQBAPA ~ WetBinary",
"BQbinary10BQBAPA ~ WetForBinary", "BQbinary10TPA ~ BedrockGrouped",
"BQbinary10TPA ~ FlowCategory2", "BQbinary10TPA ~ MixedForBinary",
"BQbinary10TPA ~ SurfGeoCode", "BQbinary10TPA ~ WetBinary", "BQbinary10TPA ~ WetForBinary"
), class = "factor"), `(Intercept)` = c(0.273438838044101, 0.706417333314839,
0.827323681408328, 0.827323681408328, 0.670245265976437, 1, 0.293077765083469,
0.0729824051982314, 1, 0.178457442476982, 0.998548313507269,
0.998548313507378), SlopeMIN = c(0.189866676669654, 0.654211112307992,
0.544287209863837, 0.544287209863837, 0.994937849813948, 0.999999999999999,
0.191210657045005, 0.03858791129829, 0.994491578112418, 0.994856875070902,
0.997947006402924, 0.997947006406413), V3 = c(NA, NA, NA, NA,
NA, NA, NA, NA, 1, 0.132226649521136, 0.997947006402968, 0.999999999999613
), V4 = c(NA, NA, NA, NA, NA, NA, NA, NA, 0.818915198441539,
0.0971466407595114, 1, 0.997629408064154), V5 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0.999999999999979, 0.997947006406465
), V6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.999999999999999,
0.997947006406414), V7 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.998510413314225, 0.998586213707042), V8 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0.999999999999999, 0.997947006406413
), V9 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
V10 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), V11 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), V12 = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), V13 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), V14 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), V15 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), V16 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), V17 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), V18 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), .Names = c("formula_vec1",
"(Intercept)", "SlopeMIN", "V3", "V4", "V5", "V6", "V7", "V8",
"V9", "V10", "V11", "V12", "V13", "V14", "V15", "V16", "V17",
"V18"), row.names = 9:20, class = "data.frame")
预期输出(仅保留所有自变量均小于 0.300 的行):
在这个小子集中,只有 V1 中的值 <0.300,但完整 data.frame 中还有其他行具有多个符合条件的变量。
我试过使用最小值来提取我想要的行,但是那只能得到每行中的最小值,我需要所有的值都小于 0.300。
我也尝试过对数据进行子集化,但我对 NA 值感到困惑:
# NOTE: this attempt does not filter as intended. With `|`, every non-NA value
# satisfies the condition (any number is either > 1e-8 or < 0.3), and the
# logical matrix is passed as the row index as-is instead of being reduced to
# a single TRUE/FALSE per row.
dfOutput <- dfInput[dfInput[,3:19]>0.00000001 | dfInput[,3:19]<0.300, ]
如果有人能指点一下接下来该怎么做,我将不胜感激。
这应该有效:
# Keep the rows of dfInput whose non-NA values in columns 3:19 all lie strictly
# between 1e-8 and 0.3; NAs are dropped by all(..., na.rm = TRUE).
dfOutput <- dfInput[
  apply(
    dfInput[, 3:19] > 0.00000001 & dfInput[, 3:19] < 0.300,
    MARGIN = 1,
    FUN = all,
    na.rm = TRUE
  ),
]
下面用一个可重现的例子来解释其中的原理:
# Toy data: two columns whose NAs fall in different rows
df <- data.frame(
  x = c(1:3, NA, 3:1),
  y = c(NA, NA, NA, 3, 3, 2, 3)
)
# Comparing the data.frame columns with a scalar yields a logical matrix
df[, 1:2] > 2
# Reduce each row (MARGIN = 1) with all(); a row containing an NA gives NA
# unless it also contains a FALSE
apply(df[, 1:2] > 2, MARGIN = 1, FUN = all)
# Forward na.rm = TRUE to all() so that NAs are "ignored"
apply(df[, 1:2] > 2, MARGIN = 1, FUN = all, na.rm = TRUE)
# Use the per-row logical vector as the row index to keep matching rows
df[apply(df[, 1:2] > 2, MARGIN = 1, FUN = all, na.rm = TRUE), ]
我有一个包含 19 列、2000 多行的 data.frame。第 1 列是因变量,V1:V17 列是自变量。我只想保留每个自变量的值都在 0 到 0.30 之间的行。但是,每一行中自变量值的个数并不相同。缺失的位置用 NA 填充,而我希望在筛选时基本上“忽略”这些 NA 值。
示例数据布局:
可重现样本:
# Reproducible sample (dput() output): a 12-row, 19-column data.frame with
# row names 9:20. Column 1 is the factor `formula_vec1`; the remaining columns
# (`(Intercept)`, SlopeMIN, V3..V18) are numeric, and V9..V18 are entirely NA.
dfY <- structure(list(formula_vec1 = structure(c(9L, 3L, 12L, 6L, 11L,
5L, 8L, 2L, 7L, 1L, 10L, 4L), .Label = c("BQbinary10BQBAPA ~ BedrockGrouped",
"BQbinary10BQBAPA ~ FlowCategory2", "BQbinary10BQBAPA ~ MixedForBinary",
"BQbinary10BQBAPA ~ SurfGeoCode", "BQbinary10BQBAPA ~ WetBinary",
"BQbinary10BQBAPA ~ WetForBinary", "BQbinary10TPA ~ BedrockGrouped",
"BQbinary10TPA ~ FlowCategory2", "BQbinary10TPA ~ MixedForBinary",
"BQbinary10TPA ~ SurfGeoCode", "BQbinary10TPA ~ WetBinary", "BQbinary10TPA ~ WetForBinary"
), class = "factor"), `(Intercept)` = c(0.273438838044101, 0.706417333314839,
0.827323681408328, 0.827323681408328, 0.670245265976437, 1, 0.293077765083469,
0.0729824051982314, 1, 0.178457442476982, 0.998548313507269,
0.998548313507378), SlopeMIN = c(0.189866676669654, 0.654211112307992,
0.544287209863837, 0.544287209863837, 0.994937849813948, 0.999999999999999,
0.191210657045005, 0.03858791129829, 0.994491578112418, 0.994856875070902,
0.997947006402924, 0.997947006406413), V3 = c(NA, NA, NA, NA,
NA, NA, NA, NA, 1, 0.132226649521136, 0.997947006402968, 0.999999999999613
), V4 = c(NA, NA, NA, NA, NA, NA, NA, NA, 0.818915198441539,
0.0971466407595114, 1, 0.997629408064154), V5 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0.999999999999979, 0.997947006406465
), V6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.999999999999999,
0.997947006406414), V7 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.998510413314225, 0.998586213707042), V8 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0.999999999999999, 0.997947006406413
), V9 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
V10 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), V11 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), V12 = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), V13 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), V14 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), V15 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), V16 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), V17 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), V18 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), .Names = c("formula_vec1",
"(Intercept)", "SlopeMIN", "V3", "V4", "V5", "V6", "V7", "V8",
"V9", "V10", "V11", "V12", "V13", "V14", "V15", "V16", "V17",
"V18"), row.names = 9:20, class = "data.frame")
预期输出(仅保留所有自变量均小于 0.300 的行):
在这个小子集中,只有 V1 中的值 <0.300,但完整 data.frame 中还有其他行具有多个符合条件的变量。
我试过使用最小值来提取我想要的行,但是那只能得到每行中的最小值,我需要所有的值都小于 0.300。
我也尝试过对数据进行子集化,但我对 NA 值感到困惑:
# NOTE: this attempt does not filter as intended. With `|`, every non-NA value
# satisfies the condition (any number is either > 1e-8 or < 0.3), and the
# logical matrix is passed as the row index as-is instead of being reduced to
# a single TRUE/FALSE per row.
dfOutput <- dfInput[dfInput[,3:19]>0.00000001 | dfInput[,3:19]<0.300, ]
如果有人能指点一下接下来该怎么做,我将不胜感激。
这应该有效:
# Keep the rows of dfInput whose non-NA values in columns 3:19 all lie strictly
# between 1e-8 and 0.3; NAs are dropped by all(..., na.rm = TRUE).
dfOutput <- dfInput[
  apply(
    dfInput[, 3:19] > 0.00000001 & dfInput[, 3:19] < 0.300,
    MARGIN = 1,
    FUN = all,
    na.rm = TRUE
  ),
]
下面用一个可重现的例子来解释其中的原理:
# Toy data: two columns whose NAs fall in different rows
df <- data.frame(
  x = c(1:3, NA, 3:1),
  y = c(NA, NA, NA, 3, 3, 2, 3)
)
# Comparing the data.frame columns with a scalar yields a logical matrix
df[, 1:2] > 2
# Reduce each row (MARGIN = 1) with all(); a row containing an NA gives NA
# unless it also contains a FALSE
apply(df[, 1:2] > 2, MARGIN = 1, FUN = all)
# Forward na.rm = TRUE to all() so that NAs are "ignored"
apply(df[, 1:2] > 2, MARGIN = 1, FUN = all, na.rm = TRUE)
# Use the per-row logical vector as the row index to keep matching rows
df[apply(df[, 1:2] > 2, MARGIN = 1, FUN = all, na.rm = TRUE), ]