tidyverse 中的双重嵌套
Double nesting in the tidyverse
使用 examples Wickhams 对数据科学中 R 中 purrr 的介绍,我正在尝试创建一个双嵌套列表。
library(gapminder)
library(purrr)
library(tidyr)
gapminder
nest_data <- gapminder %>% group_by(continent) %>% nest(.key = by_continent)
我如何进一步嵌套国家以便 nest_data 包含 by_continent 和最终包含小标题 by_year 的新嵌套级别 by_contry?
此外,在为 gapminder 数据创建此数据结构后 - 您如何 运行 每个国家/地区 bookchapter 的回归模型示例?
我的解决方案,下面有一些解释。
library(gapminder)
library(purrr)
library(tidyr)
library(broom)
nest_data <- gapminder %>% group_by(continent) %>% nest(.key = by_continent)
第一个问题是:如何在嵌套的 by_continent
中嵌套 by_country
@aosmith 在评论中提出了很好的解决方案
nested_again<-
nest_data %>% mutate(by_continent = map(by_continent, ~.x %>%
group_by(country) %>%
nest(.key = by_country)))
# Level 1
nested_again
# # A tibble: 5 × 2
# continent by_continent
# <fctr> <list>
# 1 Asia <tibble [33 × 2]>
# 2 Europe <tibble [30 × 2]>
# 3 Africa <tibble [52 × 2]>
# 4 Americas <tibble [25 × 2]>
# 5 Oceania <tibble [2 × 2]>
# Level 2
nested_again %>% unnest %>% slice(1:2)
# # A tibble: 2 × 3
# continent country by_country
# <fctr> <fctr> <list>
# 1 Asia Afghanistan <tibble [12 × 4]>
# 2 Asia Bahrain <tibble [12 × 4]>
第二个问题:如何在更深层次上应用回归模型(我想把模型保存在小标题上)
@aosmith 的解决方案(我称之为 sol1)
sol1<-mutate(nested_again, models = map(by_continent, "by_country") %>%
at_depth(2, ~lm(lifeExp ~ year, data = .x)))
sol1
# # A tibble: 5 × 3
# continent by_continent models
# <fctr> <list> <list>
# 1 Asia <tibble [33 × 2]> <list [33]>
# 2 Europe <tibble [30 × 2]> <list [30]>
# 3 Africa <tibble [52 × 2]> <list [52]>
# 4 Americas <tibble [25 × 2]> <list [25]>
# 5 Oceania <tibble [2 × 2]> <list [2]>
sol1 %>% unnest(models)
# Error: Each column must either be a list of vectors or a list of data frames [models]
sol1 %>% unnest(by_continent) %>% slice(1:2)
# # A tibble: 2 × 3
# continent country by_country
# <fctr> <fctr> <list>
# 1 Asia Afghanistan <tibble [12 × 4]>
# 2 Asia Bahrain <tibble [12 × 4]>
解决方案是做它应该做的,但是没有简单的方法来按国家/地区过滤,因为该信息嵌套在级别 2 中。
我根据@aosmith对第一个问题的解决方案提出解决方案2:
sol2<-nested_again %>% mutate(by_continent = map(by_continent, ~.x %>%
mutate(models = map(by_country, ~lm(lifeExp ~ year, data = .x) )) ))
sol2
# # A tibble: 5 × 2
# continent by_continent
# <fctr> <list>
# 1 Asia <tibble [33 × 4]>
# 2 Europe <tibble [30 × 4]>
# 3 Africa <tibble [52 × 4]>
# 4 Americas <tibble [25 × 4]>
# 5 Oceania <tibble [2 × 4]>
sol2 %>% unnest %>% slice(1:2)
# # A tibble: 2 × 4
# continent country by_country models
# <fctr> <fctr> <list> <list>
# 1 Asia Afghanistan <tibble [12 × 4]> <S3: lm>
# 2 Asia Bahrain <tibble [12 × 4]> <S3: lm>
sol2 %>% unnest %>% unnest(by_country) %>% colnames
# [1] "continent" "country" "year" "lifeExp" "pop"
# [6] "gdpPercap"
# get model by specific country
sol2 %>% unnest %>% filter(country == "Brazil") %$% models %>% extract2(1)
# Call:
# lm(formula = lifeExp ~ year, data = .x)
#
# Coefficients:
# (Intercept) year
# -709.9427 0.3901
# summary with broom::tidy
sol2 %>% unnest %>% filter(country == "Brazil") %$% models %>%
extract2(1) %>% tidy
# term estimate std.error statistic p.value
# 1 (Intercept) -709.9426860 10.801042821 -65.72909 1.617791e-14
# 2 year 0.3900895 0.005456243 71.49417 6.990433e-15
我们可以整理所有模型并保存数据以用于绘图或过滤
sol2 %<>% mutate(by_continent = map(by_continent, ~.x %>%
mutate(tidymodels = map(models, tidy )) ))
sol2 %>% unnest %>% unnest(tidymodels) %>%
ggplot(aes(country,p.value,colour=continent))+geom_point()+
facet_wrap(~continent)+
theme(axis.text.x = element_blank())
selc <- sol2 %>% unnest %>% unnest(tidymodels) %>% filter(p.value > 0.05) %>%
select(country) %>% unique %>% extract2(1)
gapminder %>% filter(country %in% selc ) %>%
ggplot(aes(year,lifeExp,colour=continent))+geom_line(aes(group=country))+
facet_wrap(~continent)
aaaa,我们可以使用模型
m1 <- sol2 %>% unnest %>% slice(1) %$% models %>% extract2(1)
x <- sol2 %>% unnest %>% slice(1) %>% unnest(by_country) %>% select(year)
pred1 <- data.frame(year = x, lifeExp = predict.lm(m1,x))
sol2 %>% unnest %>% slice(1) %>% unnest(by_country) %>%
ggplot(aes(year, lifeExp )) + geom_point() +
geom_line(data=pred1)
在这种情况下,确实没有充分的理由使用这种双重嵌套(当然,除了学习如何使用),但我在我的工作中发现了一个非常有价值的案例,特别是当你需要一个函数时在第 3 层工作,按第 1 层和第 2 层分组,并保存在第 2 层 - 当然为此我们也可以在第 1 层使用 for
循环,但那有什么好玩的;)我是不太确定 "nested" map
与 for
循环 + map
相比如何执行,但接下来我将对其进行测试。
基准
看起来差别不大
# comparison map_map with for_map
map_map<-function(nested_again){
nested_again %>% mutate(by_continent = map(by_continent, ~.x %>%
mutate(models = map(by_country, ~lm(lifeExp ~ year, data = .x) )) )) }
for_map<-function(nested_again){ for(i in 1:length(nested_again[[1]])){
nested_again$by_continent[[i]] %<>%
mutate(models = map(by_country, ~lm(lifeExp ~ year, data = .x) )) }}
res<-microbenchmark::microbenchmark(
mm<-map_map(nested_again), fm<-for_map(nested_again) )
res
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# mm <- map_map(nested_again) 121.0033 144.5530 160.6785 155.2389 174.2915 240.2012 100 a
# fm <- for_map(nested_again) 131.4312 148.3329 164.7097 157.6589 173.6480 455.7862 100 a
autoplot(res)
使用 examples Wickhams 对数据科学中 R 中 purrr 的介绍,我正在尝试创建一个双嵌套列表。
library(gapminder)
library(purrr)
library(tidyr)
gapminder
nest_data <- gapminder %>% group_by(continent) %>% nest(.key = by_continent)
我如何进一步嵌套国家以便 nest_data 包含 by_continent 和最终包含小标题 by_year 的新嵌套级别 by_contry?
此外,在为 gapminder 数据创建此数据结构后 - 您如何 运行 每个国家/地区 bookchapter 的回归模型示例?
我的解决方案,下面有一些解释。
library(gapminder)
library(purrr)
library(tidyr)
library(broom)
nest_data <- gapminder %>% group_by(continent) %>% nest(.key = by_continent)
第一个问题是:如何在嵌套的 by_continent
中嵌套 by_country@aosmith 在评论中提出了很好的解决方案
nested_again<-
nest_data %>% mutate(by_continent = map(by_continent, ~.x %>%
group_by(country) %>%
nest(.key = by_country)))
# Level 1
nested_again
# # A tibble: 5 × 2
# continent by_continent
# <fctr> <list>
# 1 Asia <tibble [33 × 2]>
# 2 Europe <tibble [30 × 2]>
# 3 Africa <tibble [52 × 2]>
# 4 Americas <tibble [25 × 2]>
# 5 Oceania <tibble [2 × 2]>
# Level 2
nested_again %>% unnest %>% slice(1:2)
# # A tibble: 2 × 3
# continent country by_country
# <fctr> <fctr> <list>
# 1 Asia Afghanistan <tibble [12 × 4]>
# 2 Asia Bahrain <tibble [12 × 4]>
第二个问题:如何在更深层次上应用回归模型(我想把模型保存在小标题上)
@aosmith 的解决方案(我称之为 sol1)
sol1<-mutate(nested_again, models = map(by_continent, "by_country") %>%
at_depth(2, ~lm(lifeExp ~ year, data = .x)))
sol1
# # A tibble: 5 × 3
# continent by_continent models
# <fctr> <list> <list>
# 1 Asia <tibble [33 × 2]> <list [33]>
# 2 Europe <tibble [30 × 2]> <list [30]>
# 3 Africa <tibble [52 × 2]> <list [52]>
# 4 Americas <tibble [25 × 2]> <list [25]>
# 5 Oceania <tibble [2 × 2]> <list [2]>
sol1 %>% unnest(models)
# Error: Each column must either be a list of vectors or a list of data frames [models]
sol1 %>% unnest(by_continent) %>% slice(1:2)
# # A tibble: 2 × 3
# continent country by_country
# <fctr> <fctr> <list>
# 1 Asia Afghanistan <tibble [12 × 4]>
# 2 Asia Bahrain <tibble [12 × 4]>
解决方案是做它应该做的,但是没有简单的方法来按国家/地区过滤,因为该信息嵌套在级别 2 中。
我根据@aosmith对第一个问题的解决方案提出解决方案2:
sol2<-nested_again %>% mutate(by_continent = map(by_continent, ~.x %>%
mutate(models = map(by_country, ~lm(lifeExp ~ year, data = .x) )) ))
sol2
# # A tibble: 5 × 2
# continent by_continent
# <fctr> <list>
# 1 Asia <tibble [33 × 4]>
# 2 Europe <tibble [30 × 4]>
# 3 Africa <tibble [52 × 4]>
# 4 Americas <tibble [25 × 4]>
# 5 Oceania <tibble [2 × 4]>
sol2 %>% unnest %>% slice(1:2)
# # A tibble: 2 × 4
# continent country by_country models
# <fctr> <fctr> <list> <list>
# 1 Asia Afghanistan <tibble [12 × 4]> <S3: lm>
# 2 Asia Bahrain <tibble [12 × 4]> <S3: lm>
sol2 %>% unnest %>% unnest(by_country) %>% colnames
# [1] "continent" "country" "year" "lifeExp" "pop"
# [6] "gdpPercap"
# get model by specific country
sol2 %>% unnest %>% filter(country == "Brazil") %$% models %>% extract2(1)
# Call:
# lm(formula = lifeExp ~ year, data = .x)
#
# Coefficients:
# (Intercept) year
# -709.9427 0.3901
# summary with broom::tidy
sol2 %>% unnest %>% filter(country == "Brazil") %$% models %>%
extract2(1) %>% tidy
# term estimate std.error statistic p.value
# 1 (Intercept) -709.9426860 10.801042821 -65.72909 1.617791e-14
# 2 year 0.3900895 0.005456243 71.49417 6.990433e-15
我们可以整理所有模型并保存数据以用于绘图或过滤
sol2 %<>% mutate(by_continent = map(by_continent, ~.x %>%
mutate(tidymodels = map(models, tidy )) ))
sol2 %>% unnest %>% unnest(tidymodels) %>%
ggplot(aes(country,p.value,colour=continent))+geom_point()+
facet_wrap(~continent)+
theme(axis.text.x = element_blank())
selc <- sol2 %>% unnest %>% unnest(tidymodels) %>% filter(p.value > 0.05) %>%
select(country) %>% unique %>% extract2(1)
gapminder %>% filter(country %in% selc ) %>%
ggplot(aes(year,lifeExp,colour=continent))+geom_line(aes(group=country))+
facet_wrap(~continent)
aaaa,我们可以使用模型
m1 <- sol2 %>% unnest %>% slice(1) %$% models %>% extract2(1)
x <- sol2 %>% unnest %>% slice(1) %>% unnest(by_country) %>% select(year)
pred1 <- data.frame(year = x, lifeExp = predict.lm(m1,x))
sol2 %>% unnest %>% slice(1) %>% unnest(by_country) %>%
ggplot(aes(year, lifeExp )) + geom_point() +
geom_line(data=pred1)
在这种情况下,确实没有充分的理由使用这种双重嵌套(当然,除了学习如何使用),但我在我的工作中发现了一个非常有价值的案例,特别是当你需要一个函数时在第 3 层工作,按第 1 层和第 2 层分组,并保存在第 2 层 - 当然为此我们也可以在第 1 层使用 for
循环,但那有什么好玩的;)我是不太确定 "nested" map
与 for
循环 + map
相比如何执行,但接下来我将对其进行测试。
基准
看起来差别不大
# comparison map_map with for_map
map_map<-function(nested_again){
nested_again %>% mutate(by_continent = map(by_continent, ~.x %>%
mutate(models = map(by_country, ~lm(lifeExp ~ year, data = .x) )) )) }
for_map<-function(nested_again){ for(i in 1:length(nested_again[[1]])){
nested_again$by_continent[[i]] %<>%
mutate(models = map(by_country, ~lm(lifeExp ~ year, data = .x) )) }}
res<-microbenchmark::microbenchmark(
mm<-map_map(nested_again), fm<-for_map(nested_again) )
res
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# mm <- map_map(nested_again) 121.0033 144.5530 160.6785 155.2389 174.2915 240.2012 100 a
# fm <- for_map(nested_again) 131.4312 148.3329 164.7097 157.6589 173.6480 455.7862 100 a
autoplot(res)