具有公差的 match() 值
match() values with tolerance
我在绘图之前对数据集进行子集化,但关键是数字我不能使用 match()
或 %in%
的严格相等测试(它遗漏了一些值)。
我写了以下替代方案,但我认为这个问题非常普遍,以至于某处有更好的内置替代方案? all.equal
似乎不是为多个测试值设计的。
select_in <- function(x, ref, tol=1e-10){
testone <- function(value) abs(x - value) < tol
as.logical(rowSums(sapply(ref, testone)) )
}
x = c(1.0, 1+1e-13, 1.01, 2, 2+1e-9, 2-1e-11)
x %in% c(1,2,3)
#[1] TRUE FALSE FALSE TRUE FALSE FALSE
select_in(x, c(1, 2, 3))
#[1] TRUE TRUE FALSE TRUE FALSE TRUE
不确定它有多好,但 all.equal
有一个有效的公差参数:
`%~%` <- function(x,y) sapply(x, function(.x) {
any(sapply(y, function(.y) isTRUE(all.equal(.x, .y, tolerance=tol))))
})
x %~% c(1,2,3)
[1] TRUE TRUE FALSE TRUE FALSE TRUE
我不喜欢那里有两个应用函数。我会尽量缩短。
更新
另一种不使用 all.equal
可能更快的方法。事实证明比第一个解决方案快得多:
`%~%` <- function(x,y) {
out <- logical(length(x))
for(i in 1:length(x)) out[i] <- any(abs(x[i] - y) <= tol)
out
}
x %~% c(1,2,3)
[1] TRUE TRUE FALSE TRUE FALSE TRUE
基准
big.x <- rep(x, 1e3)
big.y <- rep(y, 100)
all.equal(select_in(big.x, big.y), big.x %~% big.y)
[1] TRUE
library(microbenchmark)
microbenchmark(
baptiste = select_in(big.x, big.y),
plafort2 = big.x %~% big.y,
times=50L)
Unit: milliseconds
expr min lq mean median uq max
baptiste 185.86828 199.57517 231.28246 244.81980 261.7451 271.3426
plafort2 49.03265 54.30729 84.88076 66.10971 118.3270 123.1074
neval cld
50 b
50 a
这似乎达到了目标(虽然不是很宽容):
fselect_in <- function(x, ref, d = 10){
round(x, digits=d) %in% round(ref, digits=d)
}
fselect_in(x, c(1,2,3))
# TRUE TRUE FALSE TRUE FALSE TRUE
另一个避免length(x) * length(ref)
搜索的想法:
ff = function(x, ref, tol = 1e-10)
{
sref = sort(ref)
i = findInterval(x, sref, all.inside = TRUE)
dif1 = abs(x - sref[i])
dif2 = abs(x - sref[i + 1])
dif = dif1 > dif2
dif1[dif] = dif2[dif]
dif1 <= tol
}
ff(c(1.0, 1+1e-13, 1.01, 2, 2+1e-9, 2-1e-11), c(1, 2, 3))
#[1] TRUE TRUE FALSE TRUE FALSE TRUE
并进行比较:
set.seed(911)
X = sample(1e2, 5e5, TRUE) + (sample(c(1e-8, 1e-9, 1e-10, 1e-12, 1e-13), 5e5, TRUE) * sample(c(-1, 1), 5e5, TRUE))
REF = as.double(1:1e2)
all.equal(ff(X, REF), select_in(X, REF))
#[1] TRUE
tol = 1e-10 #set this for Pierre's function
microbenchmark::microbenchmark(select_in(X, REF), fselect_in(X, REF), X %~% REF, ff(X, REF), { round(X, 10); round(REF, 10) }, times = 35)
#Unit: milliseconds
# expr min lq median uq max neval
# select_in(X, REF) 1259.95876 1324.52371 1380.10492 1428.78677 1495.61810 35
# fselect_in(X, REF) 121.47241 123.72678 125.28932 128.56770 142.15676 35
# X %~% REF 2023.78159 2088.97226 2161.66973 2219.46164 2547.89849 35
# ff(X, REF) 67.35003 69.39804 71.20871 73.22626 94.04477 35
# { round(X, 10) round(REF, 10) } 96.20344 96.88344 99.10093 102.66328 117.75189 35
Frank 的 match
应该比 findInterval
快,而且确实如此,大部分时间花在 round
.
我在绘图之前对数据集进行子集化,但关键是数字我不能使用 match()
或 %in%
的严格相等测试(它遗漏了一些值)。
我写了以下替代方案,但我认为这个问题非常普遍,以至于某处有更好的内置替代方案? all.equal
似乎不是为多个测试值设计的。
select_in <- function(x, ref, tol=1e-10){
testone <- function(value) abs(x - value) < tol
as.logical(rowSums(sapply(ref, testone)) )
}
x = c(1.0, 1+1e-13, 1.01, 2, 2+1e-9, 2-1e-11)
x %in% c(1,2,3)
#[1] TRUE FALSE FALSE TRUE FALSE FALSE
select_in(x, c(1, 2, 3))
#[1] TRUE TRUE FALSE TRUE FALSE TRUE
不确定它有多好,但 all.equal
有一个有效的公差参数:
`%~%` <- function(x,y) sapply(x, function(.x) {
any(sapply(y, function(.y) isTRUE(all.equal(.x, .y, tolerance=tol))))
})
x %~% c(1,2,3)
[1] TRUE TRUE FALSE TRUE FALSE TRUE
我不喜欢那里有两个应用函数。我会尽量缩短。
更新
另一种不使用 all.equal
可能更快的方法。事实证明比第一个解决方案快得多:
`%~%` <- function(x,y) {
out <- logical(length(x))
for(i in 1:length(x)) out[i] <- any(abs(x[i] - y) <= tol)
out
}
x %~% c(1,2,3)
[1] TRUE TRUE FALSE TRUE FALSE TRUE
基准
big.x <- rep(x, 1e3)
big.y <- rep(y, 100)
all.equal(select_in(big.x, big.y), big.x %~% big.y)
[1] TRUE
library(microbenchmark)
microbenchmark(
baptiste = select_in(big.x, big.y),
plafort2 = big.x %~% big.y,
times=50L)
Unit: milliseconds
expr min lq mean median uq max
baptiste 185.86828 199.57517 231.28246 244.81980 261.7451 271.3426
plafort2 49.03265 54.30729 84.88076 66.10971 118.3270 123.1074
neval cld
50 b
50 a
这似乎达到了目标(虽然不是很宽容):
fselect_in <- function(x, ref, d = 10){
round(x, digits=d) %in% round(ref, digits=d)
}
fselect_in(x, c(1,2,3))
# TRUE TRUE FALSE TRUE FALSE TRUE
另一个避免length(x) * length(ref)
搜索的想法:
ff = function(x, ref, tol = 1e-10)
{
sref = sort(ref)
i = findInterval(x, sref, all.inside = TRUE)
dif1 = abs(x - sref[i])
dif2 = abs(x - sref[i + 1])
dif = dif1 > dif2
dif1[dif] = dif2[dif]
dif1 <= tol
}
ff(c(1.0, 1+1e-13, 1.01, 2, 2+1e-9, 2-1e-11), c(1, 2, 3))
#[1] TRUE TRUE FALSE TRUE FALSE TRUE
并进行比较:
set.seed(911)
X = sample(1e2, 5e5, TRUE) + (sample(c(1e-8, 1e-9, 1e-10, 1e-12, 1e-13), 5e5, TRUE) * sample(c(-1, 1), 5e5, TRUE))
REF = as.double(1:1e2)
all.equal(ff(X, REF), select_in(X, REF))
#[1] TRUE
tol = 1e-10 #set this for Pierre's function
microbenchmark::microbenchmark(select_in(X, REF), fselect_in(X, REF), X %~% REF, ff(X, REF), { round(X, 10); round(REF, 10) }, times = 35)
#Unit: milliseconds
# expr min lq median uq max neval
# select_in(X, REF) 1259.95876 1324.52371 1380.10492 1428.78677 1495.61810 35
# fselect_in(X, REF) 121.47241 123.72678 125.28932 128.56770 142.15676 35
# X %~% REF 2023.78159 2088.97226 2161.66973 2219.46164 2547.89849 35
# ff(X, REF) 67.35003 69.39804 71.20871 73.22626 94.04477 35
# { round(X, 10) round(REF, 10) } 96.20344 96.88344 99.10093 102.66328 117.75189 35
Frank 的 match
应该比 findInterval
快,而且确实如此,大部分时间花在 round
.