无法使用 R 将多级 JSON 加载到数据框中
Unable to load multilevel JSON into data frame using R
我无法将这个嵌套的 JSON 加载到数据框中。
require(RJSONIO)
sample_json2 <- '[{ "_id" : { "$oid" : "50b59cd75bed76f46522c34e" }, "student_id" : 0, "class_id" : 2, "scores" : [ { "type" : "exam", "score" : 57.92947112575566 }, { "type" : "quiz", "score" : 21.24542588206755 }, { "type" : "homework", "score" : 68.19567810587429 }, { "type" : "homework", "score" : 67.95019716560351 }, { "type" : "homework", "score" : 18.81037253352722 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c34f" }, "student_id" : 0, "class_id" : 28, "scores" : [ { "type" : "exam", "score" : 39.17749400402234 }, { "type" : "quiz", "score" : 78.44172815491468 }, { "type" : "homework", "score" : 20.81782269075502 }, { "type" : "homework", "score" : 70.44520452408949 }, { "type" : "homework", "score" : 50.66616327819226 }, { "type" : "homework", "score" : 53.84983118363991 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c350" }, "student_id" : 0, "class_id" : 5, "scores" : [ { "type" : "exam", "score" : 88.22950674232497 }, { "type" : "quiz", "score" : 79.28962650427184 }, { "type" : "homework", "score" : 18.66254946562674 }, { "type" : "homework", "score" : 40.28154176513361 }, { "type" : "homework", "score" : 1.23735944117882 }, { "type" : "homework", "score" : 88.96101200683958 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c351" }, "student_id" : 0, "class_id" : 16, "scores" : [ { "type" : "exam", "score" : 59.1805667559299 }, { "type" : "quiz", "score" : 47.58960202938239 }, { "type" : "homework", "score" : 6.48470951607214 }, { "type" : "homework", "score" : 68.33519637418685 }, { "type" : "homework", "score" : 78.53068038180965 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c352" }, "student_id" : 0, "class_id" : 24, "scores" : [ { "type" : "exam", "score" : 4.444435759027499 }, { "type" : "quiz", "score" : 28.63057857803885 }, { "type" : "homework", "score" : 86.79352850434199 }, { "type" : "homework", "score" : 83.9164548767836 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c353" }, "student_id" : 0, "class_id" : 30, "scores" : [ { "type" : "exam", "score" : 14.34345947841966 }, { "type" : "quiz", "score" : 47.65945482174327 }, { "type" : "homework", "score" : 83.42772189120254 }, { "type" : "homework", "score" : 49.86812935368258 }, { "type" : "homework", "score" : 39.85525554437086 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c354" }, "student_id" : 0, "class_id" : 7, "scores" : [ { "type" : "exam", "score" : 18.20492211025179 }, { "type" : "quiz", "score" : 60.4769945611789 }, { "type" : "homework", "score" : 75.62999921143397 }, { "type" : "homework", "score" : 72.41228797373115 }, { "type" : "homework", "score" : 74.06744381708968 } ] }]'
我试过使用
json_file2 <- fromJSON(sample_json2, flatten = TRUE)
对于 jsonlite 包,但它仍然有分数作为串联列表。
我后来尝试 lapply 但数据框中的输出不是所需的格式。它是一个对任何 reader 都没有意义的列表。请注意,我是 R 的新手,所以只是关注在线博客并尝试推导输出。下面是我使用 lapply.
的代码
json_file2 <- fromJSON(sample_json2)
df <- lapply(json_file2, function(play){
data.frame(matrix(unlist(play, recursive = TRUE), byrow = FALSE))
})
我想要的输出格式是 5 行,一行中每种类型的分数。如果我可以保留列名称中的所有列标记,我也更愿意。例如:_id.$oid、student_id、class_id、类型、Scores.type、scores.score。但是下面的格式也可以正常工作。
$oid, student_id class_id type score
50b59cd75bed76f46522c35 0 7 exam 75.62
50b59cd75bed76f46522c354 0 7 quiz 59.62
50b59cd75bed76f46522c354 0 7 homework 59.62
50b59cd75bed76f46522c354 0 7 Homework 59.62
有人可以帮我解决这个问题吗?非常感谢。
您好,欢迎来到 SO!
我同意@SymbolixAU 的观点,即格式化您的 mongo 查询更容易,但是,如果不可能,我最终尝试了它。
请注意,这段代码只是一个非常丑陋和肮脏的解决方案,我也是 R 的初学者,但它应该能让你入门。我在这里使用了 tidyverse
,因为你没有指定它是否必须使用 base R。如果你不熟悉它,请转到 here。
library(tidyverse)
json_file <- RJSONIO::fromJSON(sample_json2, flatten = TRUE, simplify = TRUE)
df <- json_file %>%
unlist(recursive = F) %>%
matrix(ncol = 4, byrow = T) %>% # Converts the data to a matrix
as_data_frame %>% # Converts to tibble, for easier handling
mutate(V4 = map(V4, ~ data.frame(
matrix(unlist(.), ncol = 2, byrow = T), stringsAsFactors = FALSE)
)) %>%
unnest(V1, V2, V3) %>%
unnest(V4) %>% # Can't do both on a single unnest
mutate(X2 = as.numeric(X2), V2 = as.integer(V2), V3 = as.integer(V3)) %>%
rename(oid = V1, student_id = V2, class_id = V3, type = X1, score = X2)
你会得到这样的结果:
> df
# A tibble: 36 x 5
oid student_id class_id type score
<chr> <int> <int> <chr> <dbl>
1 50b59cd75bed76f46522c34e 0 2 exam 57.9
2 50b59cd75bed76f46522c34e 0 2 quiz 21.2
3 50b59cd75bed76f46522c34e 0 2 homework 68.2
4 50b59cd75bed76f46522c34e 0 2 homework 68.0
5 50b59cd75bed76f46522c34e 0 2 homework 18.8
6 50b59cd75bed76f46522c34f 0 28 exam 39.2
7 50b59cd75bed76f46522c34f 0 28 quiz 78.4
8 50b59cd75bed76f46522c34f 0 28 homework 20.8
9 50b59cd75bed76f46522c34f 0 28 homework 70.4
10 50b59cd75bed76f46522c34f 0 28 homework 50.7
# ... with 26 more rows
注意:我有一个非常相似的问题,这就是我共享代码的原因,但请注意,SO 标准不鼓励这样做...
我无法将这个嵌套的 JSON 加载到数据框中。
require(RJSONIO)
sample_json2 <- '[{ "_id" : { "$oid" : "50b59cd75bed76f46522c34e" }, "student_id" : 0, "class_id" : 2, "scores" : [ { "type" : "exam", "score" : 57.92947112575566 }, { "type" : "quiz", "score" : 21.24542588206755 }, { "type" : "homework", "score" : 68.19567810587429 }, { "type" : "homework", "score" : 67.95019716560351 }, { "type" : "homework", "score" : 18.81037253352722 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c34f" }, "student_id" : 0, "class_id" : 28, "scores" : [ { "type" : "exam", "score" : 39.17749400402234 }, { "type" : "quiz", "score" : 78.44172815491468 }, { "type" : "homework", "score" : 20.81782269075502 }, { "type" : "homework", "score" : 70.44520452408949 }, { "type" : "homework", "score" : 50.66616327819226 }, { "type" : "homework", "score" : 53.84983118363991 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c350" }, "student_id" : 0, "class_id" : 5, "scores" : [ { "type" : "exam", "score" : 88.22950674232497 }, { "type" : "quiz", "score" : 79.28962650427184 }, { "type" : "homework", "score" : 18.66254946562674 }, { "type" : "homework", "score" : 40.28154176513361 }, { "type" : "homework", "score" : 1.23735944117882 }, { "type" : "homework", "score" : 88.96101200683958 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c351" }, "student_id" : 0, "class_id" : 16, "scores" : [ { "type" : "exam", "score" : 59.1805667559299 }, { "type" : "quiz", "score" : 47.58960202938239 }, { "type" : "homework", "score" : 6.48470951607214 }, { "type" : "homework", "score" : 68.33519637418685 }, { "type" : "homework", "score" : 78.53068038180965 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c352" }, "student_id" : 0, "class_id" : 24, "scores" : [ { "type" : "exam", "score" : 4.444435759027499 }, { "type" : "quiz", "score" : 28.63057857803885 }, { "type" : "homework", "score" : 86.79352850434199 }, { "type" : "homework", "score" : 83.9164548767836 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c353" }, "student_id" : 0, "class_id" : 30, "scores" : [ { "type" : "exam", "score" : 14.34345947841966 }, { "type" : "quiz", "score" : 47.65945482174327 }, { "type" : "homework", "score" : 83.42772189120254 }, { "type" : "homework", "score" : 49.86812935368258 }, { "type" : "homework", "score" : 39.85525554437086 } ] }
,{ "_id" : { "$oid" : "50b59cd75bed76f46522c354" }, "student_id" : 0, "class_id" : 7, "scores" : [ { "type" : "exam", "score" : 18.20492211025179 }, { "type" : "quiz", "score" : 60.4769945611789 }, { "type" : "homework", "score" : 75.62999921143397 }, { "type" : "homework", "score" : 72.41228797373115 }, { "type" : "homework", "score" : 74.06744381708968 } ] }]'
我试过使用
json_file2 <- fromJSON(sample_json2, flatten = TRUE)
对于 jsonlite 包,但它仍然有分数作为串联列表。
我后来尝试 lapply 但数据框中的输出不是所需的格式。它是一个对任何 reader 都没有意义的列表。请注意,我是 R 的新手,所以只是关注在线博客并尝试推导输出。下面是我使用 lapply.
的代码json_file2 <- fromJSON(sample_json2)
df <- lapply(json_file2, function(play){
data.frame(matrix(unlist(play, recursive = TRUE), byrow = FALSE))
})
我想要的输出格式是 5 行,一行中每种类型的分数。如果我可以保留列名称中的所有列标记,我也更愿意。例如:_id.$oid、student_id、class_id、类型、Scores.type、scores.score。但是下面的格式也可以正常工作。
$oid, student_id class_id type score
50b59cd75bed76f46522c35 0 7 exam 75.62
50b59cd75bed76f46522c354 0 7 quiz 59.62
50b59cd75bed76f46522c354 0 7 homework 59.62
50b59cd75bed76f46522c354 0 7 Homework 59.62
有人可以帮我解决这个问题吗?非常感谢。
您好,欢迎来到 SO!
我同意@SymbolixAU 的观点,即格式化您的 mongo 查询更容易,但是,如果不可能,我最终尝试了它。
请注意,这段代码只是一个非常丑陋和肮脏的解决方案,我也是 R 的初学者,但它应该能让你入门。我在这里使用了 tidyverse
,因为你没有指定它是否必须使用 base R。如果你不熟悉它,请转到 here。
library(tidyverse)
json_file <- RJSONIO::fromJSON(sample_json2, flatten = TRUE, simplify = TRUE)
df <- json_file %>%
unlist(recursive = F) %>%
matrix(ncol = 4, byrow = T) %>% # Converts the data to a matrix
as_data_frame %>% # Converts to tibble, for easier handling
mutate(V4 = map(V4, ~ data.frame(
matrix(unlist(.), ncol = 2, byrow = T), stringsAsFactors = FALSE)
)) %>%
unnest(V1, V2, V3) %>%
unnest(V4) %>% # Can't do both on a single unnest
mutate(X2 = as.numeric(X2), V2 = as.integer(V2), V3 = as.integer(V3)) %>%
rename(oid = V1, student_id = V2, class_id = V3, type = X1, score = X2)
你会得到这样的结果:
> df
# A tibble: 36 x 5
oid student_id class_id type score
<chr> <int> <int> <chr> <dbl>
1 50b59cd75bed76f46522c34e 0 2 exam 57.9
2 50b59cd75bed76f46522c34e 0 2 quiz 21.2
3 50b59cd75bed76f46522c34e 0 2 homework 68.2
4 50b59cd75bed76f46522c34e 0 2 homework 68.0
5 50b59cd75bed76f46522c34e 0 2 homework 18.8
6 50b59cd75bed76f46522c34f 0 28 exam 39.2
7 50b59cd75bed76f46522c34f 0 28 quiz 78.4
8 50b59cd75bed76f46522c34f 0 28 homework 20.8
9 50b59cd75bed76f46522c34f 0 28 homework 70.4
10 50b59cd75bed76f46522c34f 0 28 homework 50.7
# ... with 26 more rows
注意:我有一个非常相似的问题,这就是我共享代码的原因,但请注意,SO 标准不鼓励这样做...