根据索引向量从列表中提取元素
extract elements from a list based on a vector of indices
我想根据存储在单独向量中的索引从列表中提取元素。
这是我的尝试:
# One position per list element: element i of my_list is indexed by
# list_positions[i].
list_positions <- c(2, 3, 4)
my_list <- list(
  c(1, 3, 4),
  c(2, 3, 4, 5, 6),
  c(1, 2, 3, 4, 6)
)
# Subset the vector `x` by the index `y`.
my_fun <- function(x, y) x[y]
# Walk the list and the positions in parallel, extracting one value each.
mapply(my_fun, x = my_list, y = list_positions)
也许有人可以提出更快的解决方案。我的列表有大约 1400 万个元素。我尝试了并行解决方案,我使用 clusterMap 而不是 mapply,但我仍然希望获得更好的性能。
我们可以先用 `unlist` 把 `list` 展平成一个向量,再基于 'my_list' 中各元素的 `lengths` 计算出每个目标元素在展平后向量中的索引,最后按该索引从 `vector` 中提取元素:
# Requested position inside each list element.
p1 <- list_positions
# Flatten the list into a single vector.
v1 <- unlist(my_list)
# cumsum(lengths()) is the end offset of each element in the flat vector;
# stepping back by (length - position) lands on the requested entry.
# This arithmetic only points into the right element when each position is
# within that element's length.
v1[cumsum(lengths(my_list)) - (lengths(my_list) - p1)]
#[1] 3 4 4
基准
# Benchmark: build a list of one million short vectors plus one position per
# vector, then time each extraction approach on the same data.
set.seed(24)
# NOTE(review): sample(2:5) is passed as `size`, but it is a 4-element
# permutation; presumably only its first value takes effect. Writing
# sample(2:5, 1) would state the intent -- confirm before changing, since it
# alters the random stream.
lst <- lapply(1:1e6, function(i) sample(1:10, sample(2:5), replace=FALSE))
# One position per element, drawn from 1..length(x).
p2 <- sapply(lst, function(x) sample(length(x), 1))
# Baseline: mapply with the `[` operator applied directly.
system.time({
r1 <- mapply(`[`, lst, p2)
})
#user system elapsed
# 1.84 0.02 1.86
# Same extraction through the my_fun wrapper, without braces ...
system.time( r4 <- mapply(my_fun, lst, p2) )
# user system elapsed
# 1.88 0.01 1.89
# ... and with the expression wrapped in braces.
system.time({ r4 <- mapply(my_fun, lst, p2) }) #placing inside the {}
# user system elapsed
# 2.31 0.00 2.31
# NOTE(review): my_func1 (defined elsewhere in this document) loops over a
# whole list and a whole position vector. Here mapply invokes it once per
# (element, position) pair instead of once as my_func1(lst, p2), which likely
# explains the much larger timing -- confirm.
system.time({ ##cccmir's function
r3 <- mapply(my_func1, lst, p2)
})
# user system elapsed
# 12.10 0.03 12.13
# Vectorised approach: flatten once, then index with
# (end offset of each element) - (length - position).
system.time({
v2 <- unlist(lst)
r2 <- v2[cumsum(lengths(lst))- (lengths(lst)-p2)]
})
# user system elapsed
# 0.14 0.00 0.14
# Check that the vectorised result matches the mapply baseline.
identical(r1, r2)
#[1] TRUE
在这种情况下您应该使用 for 循环,例如:
library(microbenchmark)
# One position per list element: element i of my_list is indexed by
# list_positions[i].
list_positions <- c(2, 3, 4)
my_list <- list(
  c(1, 3, 4),
  c(2, 3, 4, 5, 6),
  c(1, 2, 3, 4, 6)
)
# Subset the vector `x` by the index `y`.
my_fun <- function(x, y) x[y]
# Walk the list and the positions in parallel, extracting one value each.
mapply(my_fun, x = my_list, y = list_positions)
#' Extract one value from each element of a list using an explicit loop.
#'
#' @param aList A list of vectors.
#' @param positions Vector of indices; `positions[i]` is applied to
#'   `aList[[i]]`.
#' @return A vector with one extracted value per element of `aList`; the
#'   result buffer is preallocated as numeric.
my_func1 <- function(aList, positions) {
  n_items <- length(aList)
  # Preallocate the result so the loop never grows the vector.
  out <- vector("numeric", n_items)
  for (idx in seq_len(n_items)) {
    current <- aList[[idx]]
    out[idx] <- current[positions[idx]]
  }
  out
}
#' Extract one value from each element of a list without looping.
#'
#' The list is flattened once and every requested value is addressed by its
#' offset in the flat vector.
#'
#' @param aList A list of vectors.
#' @param positions Vector of indices; `positions[i]` is the position wanted
#'   inside `aList[[i]]`. Each position must lie within the length of its
#'   element, otherwise the computed offset points into a neighbouring
#'   element.
#' @return A vector with one extracted value per element of `aList`.
my_func2 <- function(aList, positions) {
  v1 <- unlist(aList)
  # Lengths are taken from `aList` itself, so the function does not depend on
  # any variable outside its arguments.
  lens <- lengths(aList)
  # cumsum(lens) is the end offset of each element in the flat vector;
  # stepping back by (length - position) lands on the requested entry.
  v1[cumsum(lens) - (lens - positions)]
}
# Time the three approaches on the small example, 1000 runs each.
microbenchmark(
  mapply(my_fun, x = my_list, y = list_positions),
  my_func1(my_list, list_positions),
  my_func2(my_list, list_positions),
  times = 1000
)
#Unit: microseconds
# expr min lq mean median uq max neval
#mapply(my_fun, x = my_list, y = list_positions) 12.764 13.858 17.453172 14.588 16.775 119.613 1000
# my_func1(my_list, list_positions) 5.106 5.835 7.328412 6.200 6.929 38.292 1000
# my_func2(my_list, list_positions) 2.553 3.282 4.337367 3.283 3.648 52.514 1000
@akrun 的解决方案是最快的。
我想根据存储在单独向量中的索引从列表中提取元素。
这是我的尝试:
# One position per list element: element i of my_list is indexed by
# list_positions[i].
list_positions <- c(2, 3, 4)
my_list <- list(
  c(1, 3, 4),
  c(2, 3, 4, 5, 6),
  c(1, 2, 3, 4, 6)
)
# Subset the vector `x` by the index `y`.
my_fun <- function(x, y) x[y]
# Walk the list and the positions in parallel, extracting one value each.
mapply(my_fun, x = my_list, y = list_positions)
也许有人可以提出更快的解决方案。我的列表有大约 1400 万个元素。我尝试了并行解决方案,我使用 clusterMap 而不是 mapply,但我仍然希望获得更好的性能。
我们可以先用 `unlist` 把 `list` 展平成一个向量,再基于 'my_list' 中各元素的 `lengths` 计算出每个目标元素在展平后向量中的索引,最后按该索引从 `vector` 中提取元素:
# Requested position inside each list element.
p1 <- list_positions
# Flatten the list into a single vector.
v1 <- unlist(my_list)
# cumsum(lengths()) is the end offset of each element in the flat vector;
# stepping back by (length - position) lands on the requested entry.
# This arithmetic only points into the right element when each position is
# within that element's length.
v1[cumsum(lengths(my_list)) - (lengths(my_list) - p1)]
#[1] 3 4 4
基准
# Benchmark: build a list of one million short vectors plus one position per
# vector, then time each extraction approach on the same data.
set.seed(24)
# NOTE(review): sample(2:5) is passed as `size`, but it is a 4-element
# permutation; presumably only its first value takes effect. Writing
# sample(2:5, 1) would state the intent -- confirm before changing, since it
# alters the random stream.
lst <- lapply(1:1e6, function(i) sample(1:10, sample(2:5), replace=FALSE))
# One position per element, drawn from 1..length(x).
p2 <- sapply(lst, function(x) sample(length(x), 1))
# Baseline: mapply with the `[` operator applied directly.
system.time({
r1 <- mapply(`[`, lst, p2)
})
#user system elapsed
# 1.84 0.02 1.86
# Same extraction through the my_fun wrapper, without braces ...
system.time( r4 <- mapply(my_fun, lst, p2) )
# user system elapsed
# 1.88 0.01 1.89
# ... and with the expression wrapped in braces.
system.time({ r4 <- mapply(my_fun, lst, p2) }) #placing inside the {}
# user system elapsed
# 2.31 0.00 2.31
# NOTE(review): my_func1 (defined elsewhere in this document) loops over a
# whole list and a whole position vector. Here mapply invokes it once per
# (element, position) pair instead of once as my_func1(lst, p2), which likely
# explains the much larger timing -- confirm.
system.time({ ##cccmir's function
r3 <- mapply(my_func1, lst, p2)
})
# user system elapsed
# 12.10 0.03 12.13
# Vectorised approach: flatten once, then index with
# (end offset of each element) - (length - position).
system.time({
v2 <- unlist(lst)
r2 <- v2[cumsum(lengths(lst))- (lengths(lst)-p2)]
})
# user system elapsed
# 0.14 0.00 0.14
# Check that the vectorised result matches the mapply baseline.
identical(r1, r2)
#[1] TRUE
在这种情况下您应该使用 for 循环,例如:
library(microbenchmark)
# One position per list element: element i of my_list is indexed by
# list_positions[i].
list_positions <- c(2, 3, 4)
my_list <- list(
  c(1, 3, 4),
  c(2, 3, 4, 5, 6),
  c(1, 2, 3, 4, 6)
)
# Subset the vector `x` by the index `y`.
my_fun <- function(x, y) x[y]
# Walk the list and the positions in parallel, extracting one value each.
mapply(my_fun, x = my_list, y = list_positions)
#' Extract one value from each element of a list using an explicit loop.
#'
#' @param aList A list of vectors.
#' @param positions Vector of indices; `positions[i]` is applied to
#'   `aList[[i]]`.
#' @return A vector with one extracted value per element of `aList`; the
#'   result buffer is preallocated as numeric.
my_func1 <- function(aList, positions) {
  n_items <- length(aList)
  # Preallocate the result so the loop never grows the vector.
  out <- vector("numeric", n_items)
  for (idx in seq_len(n_items)) {
    current <- aList[[idx]]
    out[idx] <- current[positions[idx]]
  }
  out
}
#' Extract one value from each element of a list without looping.
#'
#' The list is flattened once and every requested value is addressed by its
#' offset in the flat vector.
#'
#' @param aList A list of vectors.
#' @param positions Vector of indices; `positions[i]` is the position wanted
#'   inside `aList[[i]]`. Each position must lie within the length of its
#'   element, otherwise the computed offset points into a neighbouring
#'   element.
#' @return A vector with one extracted value per element of `aList`.
my_func2 <- function(aList, positions) {
  v1 <- unlist(aList)
  # Lengths are taken from `aList` itself, so the function does not depend on
  # any variable outside its arguments.
  lens <- lengths(aList)
  # cumsum(lens) is the end offset of each element in the flat vector;
  # stepping back by (length - position) lands on the requested entry.
  v1[cumsum(lens) - (lens - positions)]
}
# Time the three approaches on the small example, 1000 runs each.
microbenchmark(
  mapply(my_fun, x = my_list, y = list_positions),
  my_func1(my_list, list_positions),
  my_func2(my_list, list_positions),
  times = 1000
)
#Unit: microseconds
# expr min lq mean median uq max neval
#mapply(my_fun, x = my_list, y = list_positions) 12.764 13.858 17.453172 14.588 16.775 119.613 1000
# my_func1(my_list, list_positions) 5.106 5.835 7.328412 6.200 6.929 38.292 1000
# my_func2(my_list, list_positions) 2.553 3.282 4.337367 3.283 3.648 52.514 1000
@akrun 的解决方案是最快的。