在 R 中,通过在嵌套列表中用 NA 替换 NULL 来防止 unlist 删除 NULL 值
In R, prevent unlist from removing NULL values, by replacing NULL with NA in nested list
这是我们从体育 API 获取的一行数据,该数据以嵌套列表的形式返回。我们的 fetch_results$data 是一个列表,其中每场比赛对应一个嵌套列表,因为这份数据涵盖许多场足球比赛。列表的嵌套深度可达 3-4 层,内部列表包括 scores、time、visitorTeam 等。
> dput(fetch_results$data[1])
list(list(id = 11984409L, league_id = 1326L, season_id = 15733L,
stage_id = 77442469L, round_id = 186274L, group_id = 225400L,
aggregate_id = NULL, venue_id = 7189L, referee_id = NULL,
localteam_id = 18716L, visitorteam_id = 18658L, winner_team_id = NULL,
weather_report = NULL, commentaries = FALSE, attendance = NULL,
pitch = NULL, details = "Match 1", neutral_venue = FALSE,
winning_odds_calculated = FALSE, formations = list(localteam_formation = NULL,
visitorteam_formation = NULL), scores = list(localteam_score = 0L,
visitorteam_score = 0L, localteam_pen_score = NULL, visitorteam_pen_score = NULL,
ht_score = NULL, ft_score = NULL, et_score = NULL, ps_score = NULL),
time = list(status = "NS", starting_at = list(date_time = "2021-06-11 19:00:00",
date = "2021-06-11", time = "19:00:00", timestamp = 1623438000L,
timezone = "UTC"), minute = NULL, second = NULL, added_time = NULL,
extra_minute = NULL, injury_time = NULL), coaches = list(
localteam_coach_id = 455836L, visitorteam_coach_id = 784486L),
standings = list(localteam_position = 3L, visitorteam_position = 1L),
assistants = list(first_assistant_id = NULL, second_assistant_id = NULL,
fourth_official_id = NULL), leg = "1/1", colors = NULL,
deleted = FALSE, is_placeholder = FALSE, localTeam = list(
data = list(id = 18716L, legacy_id = 213L, name = "Turkey",
short_code = "TUR", twitter = NULL, country_id = 404L,
national_team = TRUE, founded = 1923L, logo_path = "https://cdn.sportmonks.com/images//soccer/teams/28/18716.png",
venue_id = 9634L, current_season_id = 15733L, is_placeholder = NULL)),
visitorTeam = list(data = list(id = 18658L, legacy_id = 205L,
name = "Italy", short_code = "ITA", twitter = NULL, country_id = 251L,
national_team = TRUE, founded = 1898L, logo_path = "https://cdn.sportmonks.com/images//soccer/teams/2/18658.png",
venue_id = 7189L, current_season_id = 15733L, is_placeholder = NULL))))
要展平成数据框,我们使用:
# Flatten every match (one nested list per game) into a one-row tibble,
# stack the rows, then let readr re-parse the resulting columns.
# NOTE: unlist() drops NULL leaves, so a field whose value is NULL
# produces no column at all in the result.
zed <- readr::type_convert(
  dplyr::bind_rows(
    purrr::map(
      fetch_results$data,
      function(match) as_tibble(t(unlist(match)))
    )
  )
)
我们的数据帧输出的一行如下所示:
如果您仔细查看上面的嵌套列表,会发现其中有许多值为 NULL 的对象,而它们在最终的数据框中缺失了。整个 score 列表及其所有键都被删除。根据这篇 Stack Overflow 帖子,罪魁祸首似乎是 unlist() 会丢弃 NULL 值……
该帖子中给出的解决方案只能处理 1 层嵌套深度的 NULL 值,而上面的列表包含许多嵌套列表(在上面的输出中搜索 list( 即可看到)。
在不删除任何值为 NULL 的列的前提下,展平这个嵌套列表的最佳方法是什么?如果最好的做法是先用 NA 替换 NULL,那么应该如何实现?我们现有的代码可以完成展平,结果已经很接近,但没有保留值为 NULL 的列。
一种做法是在进行任何其他处理之前,先把 NULL 转换为 NA。这可以通过 rrapply 以递归方式完成:
library(rrapply)
library(purrr)
library(dplyr)
# Recursively swap every NULL leaf for NA before flattening, so that
# unlist() has a value to keep for each field. Each match is then turned
# into a one-row tibble, the rows are stacked, and the columns are
# converted back from character to their natural types.
zed <- fetch_results$data %>%
  rrapply(f = function(x) if (is.null(x)) NA else x) %>%
  map(function(match) as_tibble(t(unlist(match)))) %>%
  bind_rows() %>%
  type.convert(as.is = TRUE)
-输出
# A tibble: 1 x 75
id league_id season_id stage_id round_id group_id aggregate_id venue_id referee_id localteam_id visitorteam_id winner_team_id weather_report
<int> <int> <int> <int> <int> <int> <lgl> <int> <lgl> <int> <int> <lgl> <lgl>
1 11984409 1326 15733 77442469 186274 225400 NA 7189 NA 18716 18658 NA NA
# … with 62 more variables: commentaries <lgl>, attendance <lgl>, pitch <lgl>, details <chr>, neutral_venue <lgl>, winning_odds_calculated <lgl>,
# formations.localteam_formation <lgl>, formations.visitorteam_formation <lgl>, scores.localteam_score <int>, scores.visitorteam_score <int>,
# scores.localteam_pen_score <lgl>, scores.visitorteam_pen_score <lgl>, scores.ht_score <lgl>, scores.ft_score <lgl>, scores.et_score <lgl>,
# scores.ps_score <lgl>, time.status <chr>, time.starting_at.date_time <chr>, time.starting_at.date <chr>, time.starting_at.time <chr>,
# time.starting_at.timestamp <int>, time.starting_at.timezone <chr>, time.minute <lgl>, time.second <lgl>, time.added_time <lgl>, time.extra_minute <lgl>,
# time.injury_time <lgl>, coaches.localteam_coach_id <int>, coaches.visitorteam_coach_id <int>, standings.localteam_position <int>,
# standings.visitorteam_position <int>, assistants.first_assistant_id <lgl>, assistants.second_assistant_id <lgl>, assistants.fourth_official_id <lgl>,
# leg <chr>, colors <lgl>, deleted <lgl>, is_placeholder <lgl>, localTeam.data.id <int>, localTeam.data.legacy_id <int>, localTeam.data.name <chr>,
# localTeam.data.short_code <chr>, localTeam.data.twitter <lgl>, localTeam.data.country_id <int>, localTeam.data.national_team <lgl>,
# localTeam.data.founded <int>, localTeam.data.logo_path <chr>, localTeam.data.venue_id <int>, localTeam.data.current_season_id <int>,
# localTeam.data.is_placeholder <lgl>, visitorTeam.data.id <int>, visitorTeam.data.legacy_id <int>, visitorTeam.data.name <chr>,
# visitorTeam.data.short_code <chr>, visitorTeam.data.twitter <lgl>, visitorTeam.data.country_id <int>, visitorTeam.data.national_team <lgl>,
# visitorTeam.data.founded <int>, visitorTeam.data.logo_path <chr>, visitorTeam.data.venue_id <int>, visitorTeam.data.current_season_id <int>,
# visitorTeam.data.is_placeholder <lgl>
这是我们从体育 API 获取的一行数据,该数据以嵌套列表的形式返回。我们的 fetch_results$data 是一个列表,其中每场比赛对应一个嵌套列表,因为这份数据涵盖许多场足球比赛。列表的嵌套深度可达 3-4 层,内部列表包括 scores、time、visitorTeam 等。
> dput(fetch_results$data[1])
list(list(id = 11984409L, league_id = 1326L, season_id = 15733L,
stage_id = 77442469L, round_id = 186274L, group_id = 225400L,
aggregate_id = NULL, venue_id = 7189L, referee_id = NULL,
localteam_id = 18716L, visitorteam_id = 18658L, winner_team_id = NULL,
weather_report = NULL, commentaries = FALSE, attendance = NULL,
pitch = NULL, details = "Match 1", neutral_venue = FALSE,
winning_odds_calculated = FALSE, formations = list(localteam_formation = NULL,
visitorteam_formation = NULL), scores = list(localteam_score = 0L,
visitorteam_score = 0L, localteam_pen_score = NULL, visitorteam_pen_score = NULL,
ht_score = NULL, ft_score = NULL, et_score = NULL, ps_score = NULL),
time = list(status = "NS", starting_at = list(date_time = "2021-06-11 19:00:00",
date = "2021-06-11", time = "19:00:00", timestamp = 1623438000L,
timezone = "UTC"), minute = NULL, second = NULL, added_time = NULL,
extra_minute = NULL, injury_time = NULL), coaches = list(
localteam_coach_id = 455836L, visitorteam_coach_id = 784486L),
standings = list(localteam_position = 3L, visitorteam_position = 1L),
assistants = list(first_assistant_id = NULL, second_assistant_id = NULL,
fourth_official_id = NULL), leg = "1/1", colors = NULL,
deleted = FALSE, is_placeholder = FALSE, localTeam = list(
data = list(id = 18716L, legacy_id = 213L, name = "Turkey",
short_code = "TUR", twitter = NULL, country_id = 404L,
national_team = TRUE, founded = 1923L, logo_path = "https://cdn.sportmonks.com/images//soccer/teams/28/18716.png",
venue_id = 9634L, current_season_id = 15733L, is_placeholder = NULL)),
visitorTeam = list(data = list(id = 18658L, legacy_id = 205L,
name = "Italy", short_code = "ITA", twitter = NULL, country_id = 251L,
national_team = TRUE, founded = 1898L, logo_path = "https://cdn.sportmonks.com/images//soccer/teams/2/18658.png",
venue_id = 7189L, current_season_id = 15733L, is_placeholder = NULL))))
要展平成数据框,我们使用:
# Flatten every match (one nested list per game) into a one-row tibble,
# stack the rows, then let readr re-parse the resulting columns.
# NOTE: unlist() drops NULL leaves, so a field whose value is NULL
# produces no column at all in the result.
zed <- readr::type_convert(
  dplyr::bind_rows(
    purrr::map(
      fetch_results$data,
      function(match) as_tibble(t(unlist(match)))
    )
  )
)
我们的数据帧输出的一行如下所示:
如果您仔细查看上面的嵌套列表,会发现其中有许多值为 NULL 的对象,而它们在最终的数据框中缺失了。整个 score 列表及其所有键都被删除。根据这篇 Stack Overflow 帖子,罪魁祸首似乎是 unlist() 会丢弃 NULL 值……
该帖子中给出的解决方案只能处理 1 层嵌套深度的 NULL 值,而上面的列表包含许多嵌套列表(在上面的输出中搜索 list( 即可看到)。
在不删除任何值为 NULL 的列的前提下,展平这个嵌套列表的最佳方法是什么?如果最好的做法是先用 NA 替换 NULL,那么应该如何实现?我们现有的代码可以完成展平,结果已经很接近,但没有保留值为 NULL 的列。
一种做法是在进行任何其他处理之前,先把 NULL 转换为 NA。这可以通过 rrapply 以递归方式完成:
library(rrapply)
library(purrr)
library(dplyr)
# Recursively swap every NULL leaf for NA before flattening, so that
# unlist() has a value to keep for each field. Each match is then turned
# into a one-row tibble, the rows are stacked, and the columns are
# converted back from character to their natural types.
zed <- fetch_results$data %>%
  rrapply(f = function(x) if (is.null(x)) NA else x) %>%
  map(function(match) as_tibble(t(unlist(match)))) %>%
  bind_rows() %>%
  type.convert(as.is = TRUE)
-输出
# A tibble: 1 x 75
id league_id season_id stage_id round_id group_id aggregate_id venue_id referee_id localteam_id visitorteam_id winner_team_id weather_report
<int> <int> <int> <int> <int> <int> <lgl> <int> <lgl> <int> <int> <lgl> <lgl>
1 11984409 1326 15733 77442469 186274 225400 NA 7189 NA 18716 18658 NA NA
# … with 62 more variables: commentaries <lgl>, attendance <lgl>, pitch <lgl>, details <chr>, neutral_venue <lgl>, winning_odds_calculated <lgl>,
# formations.localteam_formation <lgl>, formations.visitorteam_formation <lgl>, scores.localteam_score <int>, scores.visitorteam_score <int>,
# scores.localteam_pen_score <lgl>, scores.visitorteam_pen_score <lgl>, scores.ht_score <lgl>, scores.ft_score <lgl>, scores.et_score <lgl>,
# scores.ps_score <lgl>, time.status <chr>, time.starting_at.date_time <chr>, time.starting_at.date <chr>, time.starting_at.time <chr>,
# time.starting_at.timestamp <int>, time.starting_at.timezone <chr>, time.minute <lgl>, time.second <lgl>, time.added_time <lgl>, time.extra_minute <lgl>,
# time.injury_time <lgl>, coaches.localteam_coach_id <int>, coaches.visitorteam_coach_id <int>, standings.localteam_position <int>,
# standings.visitorteam_position <int>, assistants.first_assistant_id <lgl>, assistants.second_assistant_id <lgl>, assistants.fourth_official_id <lgl>,
# leg <chr>, colors <lgl>, deleted <lgl>, is_placeholder <lgl>, localTeam.data.id <int>, localTeam.data.legacy_id <int>, localTeam.data.name <chr>,
# localTeam.data.short_code <chr>, localTeam.data.twitter <lgl>, localTeam.data.country_id <int>, localTeam.data.national_team <lgl>,
# localTeam.data.founded <int>, localTeam.data.logo_path <chr>, localTeam.data.venue_id <int>, localTeam.data.current_season_id <int>,
# localTeam.data.is_placeholder <lgl>, visitorTeam.data.id <int>, visitorTeam.data.legacy_id <int>, visitorTeam.data.name <chr>,
# visitorTeam.data.short_code <chr>, visitorTeam.data.twitter <lgl>, visitorTeam.data.country_id <int>, visitorTeam.data.national_team <lgl>,
# visitorTeam.data.founded <int>, visitorTeam.data.logo_path <chr>, visitorTeam.data.venue_id <int>, visitorTeam.data.current_season_id <int>,
# visitorTeam.data.is_placeholder <lgl>