解析 json 的一列并与另一列绑定以制作数据框

Parse one column of json and bind with other column to make dataframe

我有以下格式的数据:

have <- structure(list(V1 = c(4L, 28L, 2L), 
                       V2 = c("[{\"group\":1,\"topic\":\"A\"},{\"group\":1,\"topic\":\"B\"},{\"group\":2,\"topic\":\"C\"},{\"group\":2,\"topic\":\"T\"},{\"group\":2,\"topic\":\"U\"},{\"group\":3,\"topic\":\"V\"},{\"group\":3,\"topic\":\"D\"},{\"group\":3,\"topic\":\"R\"},{\"group\":4,\"topic\":\"A\"},{\"group\":4,\"topic\":\"Q\"},{\"group\":4,\"topic\":\"S\"},{\"group\":4,\"topic\":\"W\"},{\"group\":6,\"topic\":\"O\"},{\"group\":6,\"topic\":\"P\"},{\"group\":6,\"topic\":\"E\"},{\"group\":6,\"topic\":\"F\"},{\"group\":6,\"topic\":\"G\"},{\"group\":6,\"topic\":\"H\"},{\"group\":6,\"topic\":\"I\"},{\"group\":6,\"topic\":\"J\"},{\"group\":6,\"topic\":\"K\"},{\"group\":6,\"topic\":\"L\"},{\"group\":6,\"topic\":\"M\"},{\"group\":6,\"topic\":\"N\"}]",
                              "[]", 
                              "[{\"group\":2,\"topic\":\"C\"},{\"group\":3,\"topic\":\"D\"},{\"group\":6,\"topic\":\"O\"},{\"group\":6,\"topic\":\"P\"},{\"group\":6,\"topic\":\"E\"},{\"group\":6,\"topic\":\"G\"},{\"group\":6,\"topic\":\"M\"}]")
), 
row.names = c(NA, 3L), 
class = "data.frame")

V2 的内容是每行的嵌套分组,如 [{"group":1,"topic":"A"},{"group":1,"topic":"B"}...]

我想获得一个宽数据框,其中每行的组+主题(请参阅 also_have)的每个组合都有一个指示符 (1/0)。像这样:

# A tibble: 3 x 4
     id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
  <dbl>      <dbl>      <dbl>      <dbl>
1     4          1          1          0
2    28          0          0          0
3     2          0          0          0

第一步是解析json。

我可以使用 purrr::map(have$V2, jsonlite::fromJSON) 取消嵌套到列表中,但我不确定如何将 V1 列(我们可能会重命名为 id)绑定到每个元素结果列表的(注意列表元素二是空的,因为 V1==28 是空的)。这是第一个元素的片段 添加了 id (V1).

[[1]]
   group topic  id
1      1     A   4
2      1     B   4
3      2     C   4
4      2     T   4
...

或者,我认为 purrr::map_df(have$V2, jsonlite::fromJSON) 会让我更接近我最终需要的东西,但在这里我也不确定如何添加行 id (V1)。

df <- purrr::map_df(have$V2, jsonlite::fromJSON)
df

What I get:

   group topic
1      1     A
2      1     B
3      2     C
4      2     T
...

What I want (notice `V1==28` does not appear): 

   group topic id
1      1     A  4
2      1     B  4
3      2     C  4
4      2     T  4
5      2     U  4
6      3     V  4
7      3     D  4
8      3     R  4
9      4     A  4
10     4     Q  4
11     4     S  4
12     4     W  4
13     6     O  4
14     6     P  4
15     6     E  4
16     6     F  4
17     6     G  4
18     6     H  4
19     6     I  4
20     6     J  4
21     6     K  4
22     6     L  4
23     6     M  4
24     6     N  4
25     2     C  2
26     3     D  2
27     6     O  2
28     6     P  2
29     6     E  2
30     6     G  2
31     6     M  2

停止。

我想如果我能用 id 得到上面的数据框,我就能得到剩下的方法。最终目标是将此信息与 also_have 结合起来,然后转向广泛。

# join
also_have <- expand_grid(c(1:6), c(LETTERS)) %>%
  mutate(topic_id = 1:n()) %>%
  magrittr::set_colnames(c("group", "topic", "topic_id")) %>%
  select(topic_id, group, topic)

# pivot wide

# A tibble: 3 x 4
     id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
  <dbl>      <dbl>      <dbl>      <dbl>
1     4          1          1          0
2    28          0          0          0
3     2          0          0          0

更新:

应用@akrun 的解决方案:

purrr::map_dfr(setNames(have$V2, have$V1), 
                     jsonlite::fromJSON, 
                     .id = 'V1') %>%
  rename(id = V1) %>%
  left_join(also_have, by=c("group", "topic")) %>%
  select(-group, -topic) %>%
  mutate(value = 1) %>%
  pivot_wider(id_cols = id, 
              names_from = topic_id,
              names_prefix = "topic_id",
              values_from = value,
              values_fill = 0
  ) %>% 
  full_join(tibble(id = as.character(have$V1))) %>%
  replace(is.na(.), 0)

# A tibble: 3 x 25
  id    topic_id1 topic_id2 topic_id29 topic_id46 topic_id47 topic_id74 topic_id56
  <chr>     <dbl>     <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
1 4             1         1          1          1          1          1          1
2 2             0         0          1          0          0          0          1
3 28            0         0          0          0          0          0          0
# … with 17 more variables: topic_id70 <dbl>, topic_id79 <dbl>, topic_id95 <dbl>,
#   topic_id97 <dbl>, topic_id101 <dbl>, topic_id145 <dbl>, topic_id146 <dbl>,
#   topic_id135 <dbl>, topic_id136 <dbl>, topic_id137 <dbl>, topic_id138 <dbl>,
#   topic_id139 <dbl>, topic_id140 <dbl>, topic_id141 <dbl>, topic_id142 <dbl>,
#   topic_id143 <dbl>, topic_id144 <dbl>

我们可以传递一个命名向量,然后在 map_dfr

中使用 .id
 purrr::map_dfr(setNames(have$V2, have$V1), jsonlite::fromJSON, .id = 'id')

-输出

id group topic
1   4     1     A
2   4     1     B
3   4     2     C
4   4     2     T
5   4     2     U
6   4     3     V
7   4     3     D
8   4     3     R
9   4     4     A
10  4     4     Q
11  4     4     S
12  4     4     W
...

或者这可以在使用 rowwise

后在 dplyr 框架中完成
library(tidyr)
have %>%
    rowwise %>%
    transmute(ID = V1, V2 = list(fromJSON(V2))) %>% 
    ungroup %>%
    unnest(c(V2), keep_empty = TRUE) %>% 
    select(-V2)
# A tibble: 32 x 3
      ID group topic
   <int> <int> <chr>
 1     4     1 A    
 2     4     1 B    
 3     4     2 C    
 4     4     2 T    
 5     4     2 U    
 6     4     3 V    
 7     4     3 D    
 8     4     3 R    
 9     4     4 A    
10     4     4 Q    
# … with 22 more rows

第二步做一个连接

out <-  have %>%
     rowwise %>%
     transmute(ID = V1, V2 = list(fromJSON(V2))) %>% 
     ungroup %>%
     unnest(c(V2), keep_empty = TRUE) %>% 
     select(-V2) %>% right_join(also_have)
out
Joining, by = c("group", "topic")
# A tibble: 163 x 4
      ID group topic topic_id
   <int> <int> <chr>    <int>
 1     4     1 A            1
 2     4     1 B            2
 3     4     2 C           29
 4     4     2 T           46
 5     4     2 U           47
 6     4     3 V           74
 7     4     3 D           56
 8     4     3 R           70
 9     4     4 A           79
10     4     4 Q           95
# … with 153 more rows