在 data.table 中二进制搜索 integer64
Binary search for integer64 in data.table
我有一个 integer64
索引的 data.table
对象:
library(data.table)
library(bit64)
some_data = as.integer64(c(1514772184120000026, 1514772184120000068, 1514772184120000042, 1514772184120000078,1514772184120000011, 1514772184120000043, 1514772184120000094, 1514772184120000085,
1514772184120000083, 1514772184120000017, 1514772184120000013, 1514772184120000060, 1514772184120000032, 1514772184120000059, 1514772184120000029))
#
n <- 10
x <- setDT(data.frame(a = runif(n)))
x[, new_col := some_data[1:n]]
setorder(x, new_col)
然后我有一堆其他 integer64
需要在我的原始 data.table
对象 (x
):
的索引中进行二进制搜索
search_values <- some_data[(n+1):length(some_data)]
如果这些是本地整数,我可以使用 findInterval()
来解决问题:
values_index <- findInterval(search_values, x$new_col)
但是当 findInterval
的参数是 integer64
时,我得到:
Warning messages:
1: In as.double.integer64(vec) :
integer precision lost while converting to double
2: In as.double.integer64(x) :
integer precision lost while converting to double
错误的索引:
> values_index
[1] 10 10 10 10 10
例如search_values
的条目都大于 x$new_col
的所有条目是不正确的。
编辑:
期望的输出:
print(values_index)
9 10 6 10 1
为什么?:
value_index
的条目数与 search_values
一样多。对于 search_values
的每个条目,value_index
中的相应条目给出了 search_values
的条目在插入 x$new_col
中时的排名。所以 value_index
的第一个条目是 9
因为 search_values
(1514772184120000045
) 的第一个条目在 x$new_col
的条目中排名 9
.
如果我得到了你想要的,那么一个快速的解决方法可能是:
toadd <- search_values[!(search_values %in% x$new_col)] # search_values that is not in data
x[, i := .I] # mark the original data set
x <- rbindlist(list(x, data.table(new_col = toadd)),
use.names = T, fill = T) # add missing search_values
setkey(x, new_col) # order
x[, index := new_col %in% search_values] # mark where the values are
x[, index := cumsum(index)] # get indexes
x <- x[!is.na(i)] # remove added rows
x$index # should contain your desired output
也许你想要这样的东西:
findInterval2 <- function(y, x) {
toadd <- y[!(y %in% x$new_col)] # search_values that is not in data
x2 <- copy(x)
x2[, i := .I] # mark the original data set
x2 <- rbindlist(list(x2, data.table(new_col = toadd)),
use.names = T, fill = T) # add missing search_values
setkey(x2, new_col) # order
x2[, index := cumsum(!is.na(i))]
x2[match(y, new_col), index]
}
# x2 is:
# a new_col i index
# 1: 0.56602278 1514772184120000011 1 1
# 2: NA 1514772184120000013 NA 1
# 3: 0.29408237 1514772184120000017 2 2
# 4: 0.28532378 1514772184120000026 3 3
# 5: NA 1514772184120000029 NA 3
# 6: NA 1514772184120000032 NA 3
# 7: 0.66844754 1514772184120000042 4 4
# 8: 0.83008829 1514772184120000043 5 5
# 9: NA 1514772184120000059 NA 5
# 10: NA 1514772184120000060 NA 5
# 11: 0.76992760 1514772184120000068 6 6
# 12: 0.57049677 1514772184120000078 7 7
# 13: 0.14406169 1514772184120000083 8 8
# 14: 0.02044602 1514772184120000085 9 9
# 15: 0.68016024 1514772184120000094 10 10
findInterval2(search_values, x)
# [1] 1 5 3 5 3
如果没有,那么也许您可以根据需要更改代码。
更新
查看此整数示例,了解此函数给出与基数相同的结果 findInterval
now <- 10
n <- 10
n2 <- 10
some_data = as.integer(now + sample.int(n + n2, n + n2))
x <- setDT(data.frame(a = runif(n)))
x[, new_col := some_data[1:n]]
setorder(x, new_col)
search_values <- some_data[(n + 1):length(some_data)]
r1 <- findInterval2(search_values, x)
r2 <- findInterval(search_values, x$new_col)
all.equal(r1, r2)
我有一个 integer64
索引的 data.table
对象:
library(data.table)
library(bit64)
some_data = as.integer64(c(1514772184120000026, 1514772184120000068, 1514772184120000042, 1514772184120000078,1514772184120000011, 1514772184120000043, 1514772184120000094, 1514772184120000085,
1514772184120000083, 1514772184120000017, 1514772184120000013, 1514772184120000060, 1514772184120000032, 1514772184120000059, 1514772184120000029))
#
n <- 10
x <- setDT(data.frame(a = runif(n)))
x[, new_col := some_data[1:n]]
setorder(x, new_col)
然后我有一堆其他 integer64
需要在我的原始 data.table
对象 (x
):
search_values <- some_data[(n+1):length(some_data)]
如果这些是本地整数,我可以使用 findInterval()
来解决问题:
values_index <- findInterval(search_values, x$new_col)
但是当 findInterval
的参数是 integer64
时,我得到:
Warning messages:
1: In as.double.integer64(vec) :
integer precision lost while converting to double
2: In as.double.integer64(x) :
integer precision lost while converting to double
错误的索引:
> values_index
[1] 10 10 10 10 10
例如search_values
的条目都大于 x$new_col
的所有条目是不正确的。
编辑:
期望的输出:
print(values_index)
9 10 6 10 1
为什么?:
value_index
的条目数与 search_values
一样多。对于 search_values
的每个条目,value_index
中的相应条目给出了 search_values
的条目在插入 x$new_col
中时的排名。所以 value_index
的第一个条目是 9
因为 search_values
(1514772184120000045
) 的第一个条目在 x$new_col
的条目中排名 9
.
如果我得到了你想要的,那么一个快速的解决方法可能是:
toadd <- search_values[!(search_values %in% x$new_col)] # search_values that is not in data
x[, i := .I] # mark the original data set
x <- rbindlist(list(x, data.table(new_col = toadd)),
use.names = T, fill = T) # add missing search_values
setkey(x, new_col) # order
x[, index := new_col %in% search_values] # mark where the values are
x[, index := cumsum(index)] # get indexes
x <- x[!is.na(i)] # remove added rows
x$index # should contain your desired output
也许你想要这样的东西:
findInterval2 <- function(y, x) {
toadd <- y[!(y %in% x$new_col)] # search_values that is not in data
x2 <- copy(x)
x2[, i := .I] # mark the original data set
x2 <- rbindlist(list(x2, data.table(new_col = toadd)),
use.names = T, fill = T) # add missing search_values
setkey(x2, new_col) # order
x2[, index := cumsum(!is.na(i))]
x2[match(y, new_col), index]
}
# x2 is:
# a new_col i index
# 1: 0.56602278 1514772184120000011 1 1
# 2: NA 1514772184120000013 NA 1
# 3: 0.29408237 1514772184120000017 2 2
# 4: 0.28532378 1514772184120000026 3 3
# 5: NA 1514772184120000029 NA 3
# 6: NA 1514772184120000032 NA 3
# 7: 0.66844754 1514772184120000042 4 4
# 8: 0.83008829 1514772184120000043 5 5
# 9: NA 1514772184120000059 NA 5
# 10: NA 1514772184120000060 NA 5
# 11: 0.76992760 1514772184120000068 6 6
# 12: 0.57049677 1514772184120000078 7 7
# 13: 0.14406169 1514772184120000083 8 8
# 14: 0.02044602 1514772184120000085 9 9
# 15: 0.68016024 1514772184120000094 10 10
findInterval2(search_values, x)
# [1] 1 5 3 5 3
如果没有,那么也许您可以根据需要更改代码。
更新
查看此整数示例,了解此函数给出与基数相同的结果 findInterval
now <- 10
n <- 10
n2 <- 10
some_data = as.integer(now + sample.int(n + n2, n + n2))
x <- setDT(data.frame(a = runif(n)))
x[, new_col := some_data[1:n]]
setorder(x, new_col)
search_values <- some_data[(n + 1):length(some_data)]
r1 <- findInterval2(search_values, x)
r2 <- findInterval(search_values, x$new_col)
all.equal(r1, r2)