在 R 中将多级列表转换为数据框

Multi-level list to data frame in R

我想将下面的列表转换为数据框,但我没有成功。

该列表取自 Microsoft Azure 的 API,其中列出了所有资源类型及其技术信息(link:https://docs.microsoft.com/en-us/rest/api/compute/resource-skus/list)。该列表与此类似:

library(tidyverse)

# Mock of the API response: a top-level `value` list whose items carry scalar
# fields plus a nested list of name/value `capabilities`. The last item has a
# different resourceType and lacks tier/size/family.
input <- list(
  value = list(
    list(
      resourceType = "rt1", name = "name1", tier = "tier1",
      size = "size1", family = "family1",
      capabilities = list(
        list(name = "cap_name1", value = "value1_1"),
        list(name = "cap_name2", value = "value1_2")
      )
    ),
    list(
      resourceType = "rt1", name = "name2", tier = "tier2",
      size = "size2", family = "family2",
      capabilities = list(
        list(name = "cap_name2", value = "value2_2"),
        list(name = "cap_name3", value = "value2_3")
      )
    ),
    list(
      resourceType = "rt1", name = "name3", tier = "tier3",
      size = "size3", family = "family3",
      capabilities = list(
        list(name = "cap_name1", value = "value3_1"),
        list(name = "cap_name3", value = "value3_3")
      )
    ),
    list(
      resourceType = "rt1", name = "name4", tier = "tier4",
      size = "size4", family = "family4",
      capabilities = list(
        list(name = "cap_name1", value = "value4_1"),
        list(name = "cap_name2", value = "value4_2"),
        list(name = "cap_name3", value = "value4_3")
      )
    ),
    list(
      resourceType = "rt2", name = "name5",
      capabilities = list(
        list(name = "cap_name4", value = "value5_5")
      )
    )
  )
)

# Target shape: one row per "rt1" item, one column per capability name, with
# NA where an item does not list that capability.
expected_output <- tibble(
  resourceType = rep("rt1", 4),
  name = paste0("name", 1:4),
  tier = paste0("tier", 1:4),
  size = paste0("size", 1:4),
  family = paste0("family", 1:4),
  cap_name1 = c("value1_1", NA_character_, "value3_1", "value4_1"),
  cap_name2 = c("value1_2", "value2_2", NA_character_, "value4_2"),
  cap_name3 = c(NA_character_, "value2_3", "value3_3", "value4_3")
)
expected_output
#> # A tibble: 4 × 8
#>   resourceType name  tier  size  family  cap_name1 cap_name2 cap_name3
#>   <chr>        <chr> <chr> <chr> <chr>   <chr>     <chr>     <chr>    
#> 1 rt1          name1 tier1 size1 family1 value1_1  value1_2  <NA>     
#> 2 rt1          name2 tier2 size2 family2 <NA>      value2_2  value2_3 
#> 3 rt1          name3 tier3 size3 family3 value3_1  <NA>      value3_3 
#> 4 rt1          name4 tier4 size4 family4 value4_1  value4_2  value4_3

由 reprex 包 (v2.0.1) 创建于 2022-05-12

我这里有两个问题:

  1. 我不知道如何只筛选出 resourceType == "rt1" 的元素。我知道可以用下面这种方式筛选:
# Pull out the `value` list, then keep only the items whose type is "rt1".
input %>%
  pluck("value") %>%
  keep(function(item) item$resourceType == "rt1")

但我想以某种方式在没有 pluck 步骤的情况下完成。

  2. 主要问题是如何把 input 转换为 expected_output。我找到了下面这种列出所有 capabilities 名称的复杂方法:
# Collect the `capabilities` list of every "rt1" item.
capabilities <- input %>%
  pluck("value") %>%
  keep(function(item) item$resourceType == "rt1") %>%
  transpose() %>%
  as_tibble() %>%
  pull(capabilities)

# Gather the name of each capability, then drop duplicates across items.
all_capabilities_names <- capabilities %>%
  map(function(caps) map_chr(caps, "name")) %>%
  purrr::flatten_chr() %>%
  unique()
all_capabilities_names
#> [1] "cap_name1" "cap_name2" "cap_name3"

由 reprex 包 (v2.0.1) 创建于 2022-05-12

我被困在那里,因为我不知道如何将 value 映射到正确的列。

处理列表对我来说一直是一场噩梦。非常感谢任何帮助 :)

借助一点 purrr 魔法,您可以逐层深入,在各个层级创建 tibble,然后通过 unnest/reduce 一路向上合并。可以像下面这样尝试:

library(tidyverse)

input <- list(value = list(
  list(resourceType = "rt1", name = "name1", tier = "tier1", size = "size1", family = "family1", capabilities = list(list(name = "cap_name1", value = "value1_1"), list(name = "cap_name2", value = "value1_2"))),
  list(resourceType = "rt1", name = "name2", tier = "tier2", size = "size2", family = "family2", capabilities = list(list(name = "cap_name2", value = "value2_2"), list(name = "cap_name3", value = "value2_3"))),
  list(resourceType = "rt1", name = "name3", tier = "tier3", size = "size3", family = "family3", capabilities = list(list(name = "cap_name1", value = "value3_1"), list(name = "cap_name3", value = "value3_3"))),
  list(resourceType = "rt1", name = "name4", tier = "tier4", size = "size4", family = "family4", capabilities = list(list(name = "cap_name1", value = "value4_1"), list(name = "cap_name2", value = "value4_2"), list(name = "cap_name3", value = "value4_3"))),
  list(resourceType = "rt2", name = "name5", capabilities = list(list(name = "cap_name4", value = "value5_4")))
))

# Flatten the nested response into one row per "rt1" item, with one column per
# capability name.
output_test <- input[[1]] %>%
  # as_tibble() recycles the scalar fields against the `capabilities` list,
  # giving one row per (item, capability) pair.
  map(as_tibble) %>%
  # Bind every per-item tibble in a single call instead of pairwise via
  # reduce(); fields an item lacks are filled with NA.
  bind_rows() %>%
  filter(resourceType == "rt1") %>%
  mutate(capabilities = map(capabilities, as_tibble)) %>%
  # Prefix the nested columns so the capability `name` cannot collide with the
  # item `name`. This yields stable column names rather than position-dependent
  # auto-repaired ones, so no rename step is needed afterwards.
  unnest(capabilities, names_sep = "_") %>%
  pivot_wider(
    names_from = capabilities_name,
    values_from = capabilities_value
  )

output_test
#> # A tibble: 4 × 8
#>   resourceType name  tier  size  family  cap_name1 cap_name2 cap_name3
#>   <chr>        <chr> <chr> <chr> <chr>   <chr>     <chr>     <chr>    
#> 1 rt1          name1 tier1 size1 family1 value1_1  value1_2  <NA>     
#> 2 rt1          name2 tier2 size2 family2 <NA>      value2_2  value2_3 
#> 3 rt1          name3 tier3 size3 family3 value3_1  <NA>      value3_3 
#> 4 rt1          name4 tier4 size4 family4 value4_1  value4_2  value4_3

看看它是否有效:

# Reference result: the four "rt1" items with one column per capability name,
# NA where an item does not list that capability.
expected_output <- tibble(
  resourceType = rep("rt1", 4),
  name = paste0("name", 1:4),
  tier = paste0("tier", 1:4),
  size = paste0("size", 1:4),
  family = paste0("family", 1:4),
  cap_name1 = c("value1_1", NA_character_, "value3_1", "value4_1"),
  cap_name2 = c("value1_2", "value2_2", NA_character_, "value4_2"),
  cap_name3 = c(NA_character_, "value2_3", "value3_3", "value4_3")
)

# Compare the pipeline result against the reference.
assertthat::are_equal(expected_output, output_test)
#> [1] TRUE

编辑 - 遇到另一个错误

如果其中某个列表包含空向量,则转换为 tibble 会失败。您可以先丢弃该空向量;将所有行绑定在一起时,缺失的值会被填充为 NA。

library(tidyverse)

input <- list(value = list(
  list(resourceType = "rt1", name = "name1", tier = vector("character"), size = "size1", family = "family1", capabilities = list(list(name = "cap_name1", value = "value1_1"), list(name = "cap_name2", value = "value1_2"))),
  list(resourceType = "rt1", name = "name2", tier = "tier2", size = "size2", family = "family2", capabilities = list(list(name = "cap_name2", value = "value2_2"), list(name = "cap_name3", value = "value2_3"))),
  list(resourceType = "rt1", name = "name3", tier = "tier3", size = "size3", family = "family3", capabilities = list(list(name = "cap_name1", value = "value3_1"), list(name = "cap_name3", value = "value3_3"))),
  list(resourceType = "rt1", name = "name4", tier = "tier4", size = "size4", family = "family4", capabilities = list(list(name = "cap_name1", value = "value4_1"), list(name = "cap_name2", value = "value4_2"), list(name = "cap_name3", value = "value4_3"))),
  list(resourceType = "rt2", name = "name5", capabilities = list(list(name = "cap_name4", value = "value5_4")))
))

# Same flattening as before, but tolerant of zero-length fields.
input$value %>%
  # Drop empty fields before building each tibble; a dropped field shows up
  # as NA once the rows are bound together.
  map(~ as_tibble(discard(.x, is_empty))) %>%
  # Bind every per-item tibble in a single call instead of pairwise via
  # reduce().
  bind_rows() %>%
  filter(resourceType == "rt1") %>%
  mutate(capabilities = map(capabilities, as_tibble)) %>%
  # Rename the item-level `name` first so it cannot clash with the capability
  # `name` that unnest() introduces; no name repair is needed afterwards.
  rename(value_name = name) %>%
  unnest(capabilities) %>%
  pivot_wider(names_from = name, values_from = value)
#> # A tibble: 4 × 8
#>   resourceType value_name size  family  tier  cap_name1 cap_name2 cap_name3
#>   <chr>        <chr>      <chr> <chr>   <chr> <chr>     <chr>     <chr>    
#> 1 rt1          name1      size1 family1 <NA>  value1_1  value1_2  <NA>     
#> 2 rt1          name2      size2 family2 tier2 <NA>      value2_2  value2_3 
#> 3 rt1          name3      size3 family3 tier3 value3_1  <NA>      value3_3 
#> 4 rt1          name4      size4 family4 tier4 value4_1  value4_2  value4_3

由 reprex 包 (v2.0.1) 创建于 2022-05-14