从嵌套列表深处提取命名元素的通用方法

generalized approach to pulling named elements from deep within a nested list

我昨天问了一个问题,询问如何按名称访问嵌套列表的所有元素。

是否有通用函数可以让我获得列表中较深的命名元素?特别是,我正在尝试获取对 Slack 消息的回复的用户和时间戳,并且消息模式名称在消息的 replies 级别下成对重用 userts指定回复者和回复时间。我无法通过上面的 purrr 解决方案 link 找到他们。不确定这是因为 userts 也是主要消息的元素名称,还是因为我无法通过使用 ~ purrr::map() 作为映射函数来访问那么低的级别map().

里面
l <- list(folder_1 = list(
      `msg_1-1` = list(type = "message",
                   subtype = "channel_join",
                   ts = "1585771048.000200",
                   user = "UFUNNF8MA",
                   text = "<@UFUNNF8MA> has joined the channel"),
      `msg_1-2` = list(type = "message",
                       subtype = "channel_purpose",
                       ts = "1585771049.000300",
                       user = "UNFUNQ8MA",
                       text = "<@UNFUNQ8MA> set the channel purpose: Talk about xyz")),
      folder_2 = list(
        `msg_2-1` = list(type = "message",
                      subtype = "channel_join",
                      ts = "1585771120.000200",
                      user = "UQKUNF8MA",
                      text = "<@UQKUNF8MA> has joined the channel",
                      replies = list(list(user = "UABCDEFG8", ts = "1585771220.002200"),
                          list(user = "UGFEDCBA8", ts = "1585771220.022000")))))

谢谢!

你可以这样做:

unlist(l)[grep("\.user", names(unlist(l)))]
#>         folder_1.msg_1-1.user         folder_1.msg_1-2.user 
#>                   "UFUNNF8MA"                   "UNFUNQ8MA" 
#>         folder_2.msg_2-1.user folder_2.msg_2-1.replies.user 
#>                   "UQKUNF8MA"                   "UABCDEFG8" 
#> folder_2.msg_2-1.replies.user 
#>                   "UGFEDCBA8" 

对于更通用的解决方案,您可以:

 get_elements <- function(list, field) {
   as.character(unlist(list)[grep(paste0(field, "$"), names(unlist(list)))])
 }

允许:

get_elements(l, "user")
#> [1] "UFUNNF8MA" "UNFUNQ8MA" "UQKUNF8MA" "UABCDEFG8" "UGFEDCBA8"

get_elements(l, "ts")
#> [1] "1585771048.000200" "1585771049.000300" "1585771120.000200"

你可以使用这个递归函数:

# assign the function
get_ele <- function(x, what){
  is_list <- vapply(x, is.list, FALSE)
  c(sapply(x[is_list], get_ele, what), x[[what]])
}

# use the function
unname(unlist(get_ele(l, "user")))
#R> [1] "UFUNNF8MA" "UNFUNQ8MA" "UABCDEFG8" "UGFEDCBA8" "UQKUNF8MA"

unname(unlist(get_ele(l, "ts")))
#R> [1] "1585771048.000200" "1585771049.000300" "1585771120.000200"

如果您需要多次执行此操作,那么此 C++ 版本可能会更快:

#include <Rcpp.h>
#include <vector>
using namespace Rcpp;

void get_ele_inner(Rcpp::List x, Rcpp::CharacterVector what, 
                   std::vector<SEXP> &out_list){
  SEXP names = x.attr("names");
  if(!Rf_isNull(names)){
    CharacterVector names_str(names);
    for(R_len_t i = 0; i < x.size(); ++i){
      if(names_str[i] == what[0])
        out_list.emplace_back(x[i]);
      else if(Rf_isNewList(x[i]))
        get_ele_inner(x[i], what, out_list);
      
    }
  } else 
    for(auto xi : x)
      if(Rf_isNewList(xi))
        get_ele_inner(xi, what, out_list);
}

// [[Rcpp::export(rng = false)]]
SEXP get_ele(SEXP x, Rcpp::CharacterVector what) {
  std::vector<SEXP> out_list;
  get_ele_inner(x, what, out_list);
  
  R_len_t const n_ele = out_list.size();
  Rcpp::List out(n_ele);
  for(R_len_t i = 0; i < n_ele; ++i)
    out[i] = out_list[i];
  
  return out;
}

您可以使用Rcpp::sourceCpp编译函数。使用您提供的列表在我的笔记本电脑上似乎快了 10 倍(尽管这个列表很小)。

一个“整洁”的解决方案:

# Fixed the reprex list
l <- list(
  folder_1 = list(
    `msg_1-1` = list(
      type = "message",
      subtype = "channel_join",
      ts = "1585771048.000200",
      user = "UFUNNF8MA",
      text = "<@UFUNNF8MA> has joined the channel"
    ),
    `msg_1-2` = list(
      type = "message",
      subtype = "channel_purpose",
      ts = "1585771049.000300",
      user = "UNFUNQ8MA",
      text = "<@UNFUNQ8MA> set the channel purpose: Talk about xyz"
    )
  ),
  folder_2 = list(
    `msg_2-1` = list(
      type = "message",
      subtype = "channel_join",
      ts = "1585771120.000200",
      user = "UQKUNF8MA",
      text = "<@UQKUNF8MA> has joined the channel",
      replies = list(
        list(user = "UABCDEFG8", ts = "1585771220.002200"),
        list(user = "UGFEDCBA8", ts = "1585771220.022000")
      )
    )
  )
)
# use the pipe
`%>%` <- magrittr::`%>%`
# map at 2nd level depth
messages <- purrr::map_depth(l, 2, ~{
  if (!is.null(.x$replies)) {
#Map over replies if present
    .replies <- purrr::map_dfr(.x$replies, ~{
      tibble::tibble(ts = .x$ts, user = .x$user, text = rlang::`%||%`(.x$text, NA))
    })
  } else {
    .replies <- NULL
  }
#output as tibble
  tibble::tibble(
    ts = .x$ts, user = .x$user, text = .x$text, replies =  list(
      .replies
    )
  )
  
}) %>%
#remove one layer of nesting
  unlist(recursive = FALSE) %>%
#bind together and keep folder name
  dplyr::bind_rows(.id = "folder") %>% 
# separate message name into it's own column
  tidyr::separate(folder, into = c("folder", "message"), sep = "\.")

rrapply 包中使用 rrapply()(基础 rapply() 的扩展版本):

library(rrapply)

## unlist 'user' nodes as vector
rrapply(l, condition = function(x, .xname) .xname == "user", how = "unlist")
#>         folder_1.msg_1-1.user         folder_1.msg_1-2.user 
#>                   "UFUNNF8MA"                   "UNFUNQ8MA" 
#>         folder_2.msg_2-1.user folder_2.msg_2-1.replies.user 
#>                   "UQKUNF8MA"                   "UABCDEFG8" 
#> folder_2.msg_2-1.replies.user 
#>                   "UGFEDCBA8"

## unlist 'ts' nodes as vector
rrapply(l, condition = function(x, .xname) .xname == "ts", how = "unlist")
#>         folder_1.msg_1-1.ts         folder_1.msg_1-2.ts 
#>         "1585771048.000200"         "1585771049.000300" 
#>         folder_2.msg_2-1.ts folder_2.msg_2-1.replies.ts 
#>         "1585771120.000200"         "1585771220.002200" 
#> folder_2.msg_2-1.replies.ts 
#>         "1585771220.022000"

或返回融化的 data.frames 而不是未列出的载体:

## 'user' nodes as melted data.frame
rrapply(l, condition = function(x, .xname) .xname == "user", how = "melt")
#>         L1      L2      L3   L4   L5     value
#> 1 folder_1 msg_1-1    user <NA> <NA> UFUNNF8MA
#> 2 folder_1 msg_1-2    user <NA> <NA> UNFUNQ8MA
#> 3 folder_2 msg_2-1    user <NA> <NA> UQKUNF8MA
#> 4 folder_2 msg_2-1 replies  ..1 user UABCDEFG8
#> 5 folder_2 msg_2-1 replies  ..2 user UGFEDCBA8

## 'ts' nodes as melted data.frame
rrapply(l, condition = function(x, .xname) .xname == "ts", how = "melt")
#>         L1      L2      L3   L4   L5             value
#> 1 folder_1 msg_1-1      ts <NA> <NA> 1585771048.000200
#> 2 folder_1 msg_1-2      ts <NA> <NA> 1585771049.000300
#> 3 folder_2 msg_2-1      ts <NA> <NA> 1585771120.000200
#> 4 folder_2 msg_2-1 replies  ..1   ts 1585771220.002200
#> 5 folder_2 msg_2-1 replies  ..2   ts 1585771220.022000