从嵌套列表深处提取命名元素的通用方法
generalized approach to pulling named elements from deep within a nested list
我昨天问了一个问题,询问如何按名称访问嵌套列表的所有元素。
是否有通用函数可以让我获得列表中较深的命名元素?特别是,我正在尝试获取对 Slack 消息的回复的用户和时间戳,并且消息模式名称在消息的 replies
级别下成对重用 user
和 ts
指定回复者和回复时间。我无法通过上面的 purrr 解决方案 link 找到他们。不确定这是因为 user
和 ts
也是主要消息的元素名称,还是因为我无法通过使用 ~ purrr::map()
作为映射函数来访问那么低的级别map()
.
里面
l <- list(folder_1 = list(
`msg_1-1` = list(type = "message",
subtype = "channel_join",
ts = "1585771048.000200",
user = "UFUNNF8MA",
text = "<@UFUNNF8MA> has joined the channel"),
`msg_1-2` = list(type = "message",
subtype = "channel_purpose",
ts = "1585771049.000300",
user = "UNFUNQ8MA",
text = "<@UNFUNQ8MA> set the channel purpose: Talk about xyz")),
folder_2 = list(
`msg_2-1` = list(type = "message",
subtype = "channel_join",
ts = "1585771120.000200",
user = "UQKUNF8MA",
text = "<@UQKUNF8MA> has joined the channel",
replies = list(list(user = "UABCDEFG8", ts = "1585771220.002200"),
list(user = "UGFEDCBA8", ts = "1585771220.022000")))))
谢谢!
你可以这样做:
unlist(l)[grep("\.user", names(unlist(l)))]
#> folder_1.msg_1-1.user folder_1.msg_1-2.user
#> "UFUNNF8MA" "UNFUNQ8MA"
#> folder_2.msg_2-1.user folder_2.msg_2-1.replies.user
#> "UQKUNF8MA" "UABCDEFG8"
#> folder_2.msg_2-1.replies.user
#> "UGFEDCBA8"
对于更通用的解决方案,您可以:
get_elements <- function(list, field) {
as.character(unlist(list)[grep(paste0(field, "$"), names(unlist(list)))])
}
允许:
get_elements(l, "user")
#> [1] "UFUNNF8MA" "UNFUNQ8MA" "UQKUNF8MA" "UABCDEFG8" "UGFEDCBA8"
get_elements(l, "ts")
#> [1] "1585771048.000200" "1585771049.000300" "1585771120.000200"
你可以使用这个递归函数:
# assign the function
get_ele <- function(x, what){
is_list <- vapply(x, is.list, FALSE)
c(sapply(x[is_list], get_ele, what), x[[what]])
}
# use the function
unname(unlist(get_ele(l, "user")))
#R> [1] "UFUNNF8MA" "UNFUNQ8MA" "UABCDEFG8" "UGFEDCBA8" "UQKUNF8MA"
unname(unlist(get_ele(l, "ts")))
#R> [1] "1585771048.000200" "1585771049.000300" "1585771120.000200"
如果您需要多次执行此操作,那么此 C++ 版本可能会更快:
#include <Rcpp.h>
#include <vector>
using namespace Rcpp;
void get_ele_inner(Rcpp::List x, Rcpp::CharacterVector what,
std::vector<SEXP> &out_list){
SEXP names = x.attr("names");
if(!Rf_isNull(names)){
CharacterVector names_str(names);
for(R_len_t i = 0; i < x.size(); ++i){
if(names_str[i] == what[0])
out_list.emplace_back(x[i]);
else if(Rf_isNewList(x[i]))
get_ele_inner(x[i], what, out_list);
}
} else
for(auto xi : x)
if(Rf_isNewList(xi))
get_ele_inner(xi, what, out_list);
}
// [[Rcpp::export(rng = false)]]
SEXP get_ele(SEXP x, Rcpp::CharacterVector what) {
std::vector<SEXP> out_list;
get_ele_inner(x, what, out_list);
R_len_t const n_ele = out_list.size();
Rcpp::List out(n_ele);
for(R_len_t i = 0; i < n_ele; ++i)
out[i] = out_list[i];
return out;
}
您可以使用Rcpp::sourceCpp
编译函数。使用您提供的列表在我的笔记本电脑上似乎快了 10 倍(尽管这个列表很小)。
一个“整洁”的解决方案:
# Fixed the reprex list
l <- list(
folder_1 = list(
`msg_1-1` = list(
type = "message",
subtype = "channel_join",
ts = "1585771048.000200",
user = "UFUNNF8MA",
text = "<@UFUNNF8MA> has joined the channel"
),
`msg_1-2` = list(
type = "message",
subtype = "channel_purpose",
ts = "1585771049.000300",
user = "UNFUNQ8MA",
text = "<@UNFUNQ8MA> set the channel purpose: Talk about xyz"
)
),
folder_2 = list(
`msg_2-1` = list(
type = "message",
subtype = "channel_join",
ts = "1585771120.000200",
user = "UQKUNF8MA",
text = "<@UQKUNF8MA> has joined the channel",
replies = list(
list(user = "UABCDEFG8", ts = "1585771220.002200"),
list(user = "UGFEDCBA8", ts = "1585771220.022000")
)
)
)
)
# use the pipe
`%>%` <- magrittr::`%>%`
# map at 2nd level depth
messages <- purrr::map_depth(l, 2, ~{
if (!is.null(.x$replies)) {
#Map over replies if present
.replies <- purrr::map_dfr(.x$replies, ~{
tibble::tibble(ts = .x$ts, user = .x$user, text = rlang::`%||%`(.x$text, NA))
})
} else {
.replies <- NULL
}
#output as tibble
tibble::tibble(
ts = .x$ts, user = .x$user, text = .x$text, replies = list(
.replies
)
)
}) %>%
#remove one layer of nesting
unlist(recursive = FALSE) %>%
#bind together and keep folder name
dplyr::bind_rows(.id = "folder") %>%
# separate message name into it's own column
tidyr::separate(folder, into = c("folder", "message"), sep = "\.")
在 rrapply
包中使用 rrapply()
(基础 rapply()
的扩展版本):
library(rrapply)
## unlist 'user' nodes as vector
rrapply(l, condition = function(x, .xname) .xname == "user", how = "unlist")
#> folder_1.msg_1-1.user folder_1.msg_1-2.user
#> "UFUNNF8MA" "UNFUNQ8MA"
#> folder_2.msg_2-1.user folder_2.msg_2-1.replies.user
#> "UQKUNF8MA" "UABCDEFG8"
#> folder_2.msg_2-1.replies.user
#> "UGFEDCBA8"
## unlist 'ts' nodes as vector
rrapply(l, condition = function(x, .xname) .xname == "ts", how = "unlist")
#> folder_1.msg_1-1.ts folder_1.msg_1-2.ts
#> "1585771048.000200" "1585771049.000300"
#> folder_2.msg_2-1.ts folder_2.msg_2-1.replies.ts
#> "1585771120.000200" "1585771220.002200"
#> folder_2.msg_2-1.replies.ts
#> "1585771220.022000"
或返回融化的 data.frames 而不是未列出的载体:
## 'user' nodes as melted data.frame
rrapply(l, condition = function(x, .xname) .xname == "user", how = "melt")
#> L1 L2 L3 L4 L5 value
#> 1 folder_1 msg_1-1 user <NA> <NA> UFUNNF8MA
#> 2 folder_1 msg_1-2 user <NA> <NA> UNFUNQ8MA
#> 3 folder_2 msg_2-1 user <NA> <NA> UQKUNF8MA
#> 4 folder_2 msg_2-1 replies ..1 user UABCDEFG8
#> 5 folder_2 msg_2-1 replies ..2 user UGFEDCBA8
## 'ts' nodes as melted data.frame
rrapply(l, condition = function(x, .xname) .xname == "ts", how = "melt")
#> L1 L2 L3 L4 L5 value
#> 1 folder_1 msg_1-1 ts <NA> <NA> 1585771048.000200
#> 2 folder_1 msg_1-2 ts <NA> <NA> 1585771049.000300
#> 3 folder_2 msg_2-1 ts <NA> <NA> 1585771120.000200
#> 4 folder_2 msg_2-1 replies ..1 ts 1585771220.002200
#> 5 folder_2 msg_2-1 replies ..2 ts 1585771220.022000
我昨天问了一个问题,询问如何按名称访问嵌套列表的所有元素。
是否有通用函数可以让我获得列表中较深的命名元素?特别是,我正在尝试获取对 Slack 消息的回复的用户和时间戳,并且消息模式名称在消息的 replies
级别下成对重用 user
和 ts
指定回复者和回复时间。我无法通过上面的 purrr 解决方案 link 找到他们。不确定这是因为 user
和 ts
也是主要消息的元素名称,还是因为我无法通过使用 ~ purrr::map()
作为映射函数来访问那么低的级别map()
.
l <- list(folder_1 = list(
`msg_1-1` = list(type = "message",
subtype = "channel_join",
ts = "1585771048.000200",
user = "UFUNNF8MA",
text = "<@UFUNNF8MA> has joined the channel"),
`msg_1-2` = list(type = "message",
subtype = "channel_purpose",
ts = "1585771049.000300",
user = "UNFUNQ8MA",
text = "<@UNFUNQ8MA> set the channel purpose: Talk about xyz")),
folder_2 = list(
`msg_2-1` = list(type = "message",
subtype = "channel_join",
ts = "1585771120.000200",
user = "UQKUNF8MA",
text = "<@UQKUNF8MA> has joined the channel",
replies = list(list(user = "UABCDEFG8", ts = "1585771220.002200"),
list(user = "UGFEDCBA8", ts = "1585771220.022000")))))
谢谢!
你可以这样做:
unlist(l)[grep("\.user", names(unlist(l)))]
#> folder_1.msg_1-1.user folder_1.msg_1-2.user
#> "UFUNNF8MA" "UNFUNQ8MA"
#> folder_2.msg_2-1.user folder_2.msg_2-1.replies.user
#> "UQKUNF8MA" "UABCDEFG8"
#> folder_2.msg_2-1.replies.user
#> "UGFEDCBA8"
对于更通用的解决方案,您可以:
get_elements <- function(list, field) {
as.character(unlist(list)[grep(paste0(field, "$"), names(unlist(list)))])
}
允许:
get_elements(l, "user")
#> [1] "UFUNNF8MA" "UNFUNQ8MA" "UQKUNF8MA" "UABCDEFG8" "UGFEDCBA8"
get_elements(l, "ts")
#> [1] "1585771048.000200" "1585771049.000300" "1585771120.000200"
你可以使用这个递归函数:
# assign the function
get_ele <- function(x, what){
is_list <- vapply(x, is.list, FALSE)
c(sapply(x[is_list], get_ele, what), x[[what]])
}
# use the function
unname(unlist(get_ele(l, "user")))
#R> [1] "UFUNNF8MA" "UNFUNQ8MA" "UABCDEFG8" "UGFEDCBA8" "UQKUNF8MA"
unname(unlist(get_ele(l, "ts")))
#R> [1] "1585771048.000200" "1585771049.000300" "1585771120.000200"
如果您需要多次执行此操作,那么此 C++ 版本可能会更快:
#include <Rcpp.h>
#include <vector>
using namespace Rcpp;
void get_ele_inner(Rcpp::List x, Rcpp::CharacterVector what,
std::vector<SEXP> &out_list){
SEXP names = x.attr("names");
if(!Rf_isNull(names)){
CharacterVector names_str(names);
for(R_len_t i = 0; i < x.size(); ++i){
if(names_str[i] == what[0])
out_list.emplace_back(x[i]);
else if(Rf_isNewList(x[i]))
get_ele_inner(x[i], what, out_list);
}
} else
for(auto xi : x)
if(Rf_isNewList(xi))
get_ele_inner(xi, what, out_list);
}
// [[Rcpp::export(rng = false)]]
SEXP get_ele(SEXP x, Rcpp::CharacterVector what) {
std::vector<SEXP> out_list;
get_ele_inner(x, what, out_list);
R_len_t const n_ele = out_list.size();
Rcpp::List out(n_ele);
for(R_len_t i = 0; i < n_ele; ++i)
out[i] = out_list[i];
return out;
}
您可以使用Rcpp::sourceCpp
编译函数。使用您提供的列表在我的笔记本电脑上似乎快了 10 倍(尽管这个列表很小)。
一个“整洁”的解决方案:
# Fixed the reprex list
l <- list(
folder_1 = list(
`msg_1-1` = list(
type = "message",
subtype = "channel_join",
ts = "1585771048.000200",
user = "UFUNNF8MA",
text = "<@UFUNNF8MA> has joined the channel"
),
`msg_1-2` = list(
type = "message",
subtype = "channel_purpose",
ts = "1585771049.000300",
user = "UNFUNQ8MA",
text = "<@UNFUNQ8MA> set the channel purpose: Talk about xyz"
)
),
folder_2 = list(
`msg_2-1` = list(
type = "message",
subtype = "channel_join",
ts = "1585771120.000200",
user = "UQKUNF8MA",
text = "<@UQKUNF8MA> has joined the channel",
replies = list(
list(user = "UABCDEFG8", ts = "1585771220.002200"),
list(user = "UGFEDCBA8", ts = "1585771220.022000")
)
)
)
)
# use the pipe
`%>%` <- magrittr::`%>%`
# map at 2nd level depth
messages <- purrr::map_depth(l, 2, ~{
if (!is.null(.x$replies)) {
#Map over replies if present
.replies <- purrr::map_dfr(.x$replies, ~{
tibble::tibble(ts = .x$ts, user = .x$user, text = rlang::`%||%`(.x$text, NA))
})
} else {
.replies <- NULL
}
#output as tibble
tibble::tibble(
ts = .x$ts, user = .x$user, text = .x$text, replies = list(
.replies
)
)
}) %>%
#remove one layer of nesting
unlist(recursive = FALSE) %>%
#bind together and keep folder name
dplyr::bind_rows(.id = "folder") %>%
# separate message name into it's own column
tidyr::separate(folder, into = c("folder", "message"), sep = "\.")
在 rrapply
包中使用 rrapply()
(基础 rapply()
的扩展版本):
library(rrapply)
## unlist 'user' nodes as vector
rrapply(l, condition = function(x, .xname) .xname == "user", how = "unlist")
#> folder_1.msg_1-1.user folder_1.msg_1-2.user
#> "UFUNNF8MA" "UNFUNQ8MA"
#> folder_2.msg_2-1.user folder_2.msg_2-1.replies.user
#> "UQKUNF8MA" "UABCDEFG8"
#> folder_2.msg_2-1.replies.user
#> "UGFEDCBA8"
## unlist 'ts' nodes as vector
rrapply(l, condition = function(x, .xname) .xname == "ts", how = "unlist")
#> folder_1.msg_1-1.ts folder_1.msg_1-2.ts
#> "1585771048.000200" "1585771049.000300"
#> folder_2.msg_2-1.ts folder_2.msg_2-1.replies.ts
#> "1585771120.000200" "1585771220.002200"
#> folder_2.msg_2-1.replies.ts
#> "1585771220.022000"
或返回融化的 data.frames 而不是未列出的载体:
## 'user' nodes as melted data.frame
rrapply(l, condition = function(x, .xname) .xname == "user", how = "melt")
#> L1 L2 L3 L4 L5 value
#> 1 folder_1 msg_1-1 user <NA> <NA> UFUNNF8MA
#> 2 folder_1 msg_1-2 user <NA> <NA> UNFUNQ8MA
#> 3 folder_2 msg_2-1 user <NA> <NA> UQKUNF8MA
#> 4 folder_2 msg_2-1 replies ..1 user UABCDEFG8
#> 5 folder_2 msg_2-1 replies ..2 user UGFEDCBA8
## 'ts' nodes as melted data.frame
rrapply(l, condition = function(x, .xname) .xname == "ts", how = "melt")
#> L1 L2 L3 L4 L5 value
#> 1 folder_1 msg_1-1 ts <NA> <NA> 1585771048.000200
#> 2 folder_1 msg_1-2 ts <NA> <NA> 1585771049.000300
#> 3 folder_2 msg_2-1 ts <NA> <NA> 1585771120.000200
#> 4 folder_2 msg_2-1 replies ..1 ts 1585771220.002200
#> 5 folder_2 msg_2-1 replies ..2 ts 1585771220.022000