访问 R 中的嵌套列表
accessing nested lists in R
我为一些数据创建了一个双层嵌套结构。我如何访问第 2 级(或第 n 级)的数据?
library(gapminder)
library(dplyr) # provides group_by() and mutate(), which the pipelines below call
library(purrr)
library(tidyr)
gapminder
# Level 1: group by continent and pack each continent's rows into the
# list-column `by_continent`.
# NOTE(review): `.key = <bare name>` is the older tidyr nest() interface --
# confirm against the installed tidyr version.
nest_data <- gapminder %>%
  group_by(continent) %>%
  nest(.key = by_continent)
# Level 2: inside every continent's nested data, group by country and nest
# again, producing the list-column `by_country`.
nest_2 <- nest_data %>%
  mutate(
    by_continent = map(
      by_continent,
      ~ .x %>%
        group_by(country) %>%
        nest(.key = by_country)
    )
  )
我现在如何将中国的数据从 nest_2 获取到数据框或 tibble 中?
我可以获得整个亚洲的数据,但无法单独提取出中国的数据。
a<-nest_2[nest_2$continent=="Asia",]$by_continent ##Any better way of isolating Asia from nest_2?
我以为我可以做到
b<-a[a$country=="China",]$by_country
但是我得到以下错误
Error in a[a$country == "China", ] : incorrect number of dimensions
> glimpse(a)
List of 1
$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 33 obs. of 2 variables:
..$ country : Factor w/ 142 levels "Afghanistan",..: 1 8 9 19 25 56 59 60 61 62 ...
..$ by_country:List of 33
所以我最大的错误是没有意识到得到的结果是一个列表,这可以通过在末尾添加 [[1]] 来补救。不过,我非常喜欢 @Floo0 的解决方案。我冒昧地提供了一个按列名取值的函数版本,以防列的顺序与给定的不同。
# Pull a single element out of a list-column by matching a key column.
#
# df:      data frame (or tibble) holding both columns.
# listcol: name of the list-column to extract from, given as a string.
# var:     name of the key column to match on, given as a string.
# var_val: value to look for in `var`.
#
# Returns the `listcol` element of the first row where `var` equals
# `var_val`; stops with an error when no row matches.
select_unnest <- function(df, listcol, var, var_val) {
  # which() drops NA comparisons; a raw logical index would turn every NA key
  # into a NULL placeholder that `[[1]]` could then return by mistake.
  hits <- which(df[[var]] == var_val)
  if (length(hits) == 0L) {
    stop("no row where `", var, "` equals ", var_val, call. = FALSE)
  }
  df[[listcol]][[hits[1L]]]
}
nest_2 %>% select_unnest(listcol = "by_continent", var = "continent", var_val = "Asia") %>%
select_unnest(listcol = "by_country", var = "country", var_val = "China")
您可能需要的是 [[]] 运算符,而不是简单的单个 []。
我不使用 purrr,所以不太明白你是如何得到这个奇怪的深层嵌套结构的(看起来你采用了与另一个问题类似的方法;针对该问题的评论提出了一些替代方法)。我可以通过下面的方式提取中国的 tibble,但要实现你想做的事情,应该还有更好的方法……
# Take the `by_continent` element of the row whose continent is "Asia".
n1 <- nest_2$by_continent[nest_2$continent=="Asia"][[1]]
# Keep only the row for China.
n2 <- n1 %>% filter(country=="China")
# Extract the first element of the `by_country` list-column.
n2$by_country[[1]]
您的 a 仍然是一个列表,需要先将其展平才能做进一步处理。您可以使用 flatten_df、dplyr::filter 和 unnest:
library(dplyr)
# Flatten the list `a` into a data frame, keep China's row, then expand its
# nested list-column.
a %>%
  flatten_df() %>%
  filter(country == "China") %>%
  unnest()
# A tibble: 12 x 5
country year lifeExp pop gdpPercap
<fctr> <int> <dbl> <int> <dbl>
1 China 1952 44.00000 556263527 400.4486
2 China 1957 50.54896 637408000 575.9870
3 China 1962 44.50136 665770000 487.6740
4 China 1967 58.38112 754550000 612.7057
5 China 1972 63.11888 862030000 676.9001
6 China 1977 63.96736 943455000 741.2375
7 China 1982 65.52500 1000281000 962.4214
8 China 1987 67.27400 1084035000 1378.9040
9 China 1992 68.69000 1164970000 1655.7842
10 China 1997 70.42600 1230075000 2289.2341
11 China 2002 72.02800 1280400000 3119.2809
12 China 2007 72.96100 1318683096 4959.1149
另一种提取亚洲数据的方法,其结果不再是一个列表,从而避免了之后再调用 flatten 的需要。
# Filter to Asia's row, keep only `by_continent`, and unnest it so the result
# is a data frame rather than a list.
asia <- nest_2 %>%
  filter(continent == "Asia") %>%
  select(by_continent) %>%
  unnest()
这是一种可以与管道 (%>%) 配合使用的基础 R 方法:
# Positional variant: match on the first column of `x` and return the
# corresponding element of its second column.
#
# x:          data frame (or tibble) whose first column holds the keys and
#             whose second column is the list-column to extract from.
# select_val: key value to look for in the first column.
#
# Returns the second-column element of the first matching row; stops with an
# error when no row matches.
select_unnest <- function(x, select_val) {
  # which() drops NA comparisons; a raw logical index would turn every NA key
  # into a NULL placeholder that `[[1]]` could then return by mistake.
  hits <- which(x[[1]] == select_val)
  if (length(hits) == 0L) {
    stop("no row whose first column equals ", select_val, call. = FALSE)
  }
  x[[2]][[hits[1L]]]
}
nest_2 %>% select_unnest("Asia") %>% select_unnest("China")
比较时间:
Unit: microseconds
min lq mean median uq max neval
aosmith1 3202.105 3354.0055 4045.9602 3612.126 4179.9610 17119.495 100
aosmith2 5797.744 6191.9380 7327.6619 6716.445 7662.6415 24245.779 100
Floo0 227.169 303.3280 414.3779 346.135 400.6735 4804.500 100
Ben Bolker 622.267 720.6015 852.9727 775.172 875.5985 1942.495 100
代码:
# Time the four approaches. Each expression is named so that the rows of the
# result can be matched to the labels used in the timing table.
microbenchmark::microbenchmark(
  aosmith1 = {
    a <- nest_2[nest_2$continent == "Asia", ]$by_continent
    flatten_df(a) %>%
      filter(country == "China") %>%
      unnest()
  },
  aosmith2 = {
    nest_2 %>%
      filter(continent == "Asia") %>%
      select(by_continent) %>%
      unnest() %>%
      filter(country == "China") %>%
      unnest()
  },
  # Relies on the two-argument select_unnest(x, select_val) helper.
  Floo0 = {
    nest_2 %>% select_unnest("Asia") %>% select_unnest("China")
  },
  `Ben Bolker` = {
    n1 <- nest_2$by_continent[nest_2$continent == "Asia"][[1]]
    n2 <- n1 %>% filter(country == "China")
    n2$by_country[[1]]
  }
)
一个data.table解决方案:
library(data.table) # as.data.table(), .SD and .() all come from data.table
DT <- as.data.table(gapminder)
# Nest data (starting smallest and working up):
# first collapse each (continent, country) group into one nested table ...
nest_DT <- DT[, list(by_country = list(.SD)), by = .(continent, country)]
# ... then collapse each continent's rows into one nested table.
# Note: this reassigns `nest_2`.
nest_2 <- nest_DT[, list(by_continent = list(.SD)), by = .(continent)]
我们现在可以将 [filter, column][[1]] 形式的调用链接在一起,以获得嵌套的值:
nest_2[continent == "Asia", by_continent][[1]]
country by_country
1: Afghanistan <data.table>
2: Bahrain <data.table>
3: Bangladesh <data.table>
4: Cambodia <data.table>
5: China <data.table>
6: Hong Kong, China <data.table>
7: India <data.table>
8: Indonesia <data.table>
9: Iran <data.table>
10: Iraq <data.table>
11: Israel <data.table>
12: Japan <data.table>
... ... ...
nest_2[continent == "Asia", by_continent][[1]][country == "China", by_country][[1]]
year lifeExp pop gdpPercap
1: 1952 44.00000 556263527 400.4486
2: 1957 50.54896 637408000 575.9870
3: 1962 44.50136 665770000 487.6740
4: 1967 58.38112 754550000 612.7057
5: 1972 63.11888 862030000 676.9001
6: 1977 63.96736 943455000 741.2375
7: 1982 65.52500 1000281000 962.4214
8: 1987 67.27400 1084035000 1378.9040
9: 1992 68.69000 1164970000 1655.7842
10: 1997 70.42600 1230075000 2289.2341
11: 2002 72.02800 1280400000 3119.2809
12: 2007 72.96100 1318683096 4959.1149
我为一些数据创建了一个双层嵌套结构。我如何访问第 2 级(或第 n 级)的数据?
library(gapminder)
library(dplyr) # provides group_by() and mutate(), which the pipelines below call
library(purrr)
library(tidyr)
gapminder
# Level 1: group by continent and pack each continent's rows into the
# list-column `by_continent`.
# NOTE(review): `.key = <bare name>` is the older tidyr nest() interface --
# confirm against the installed tidyr version.
nest_data <- gapminder %>%
  group_by(continent) %>%
  nest(.key = by_continent)
# Level 2: inside every continent's nested data, group by country and nest
# again, producing the list-column `by_country`.
nest_2 <- nest_data %>%
  mutate(
    by_continent = map(
      by_continent,
      ~ .x %>%
        group_by(country) %>%
        nest(.key = by_country)
    )
  )
我现在如何将中国的数据从 nest_2 获取到数据框或 tibble 中?
我可以获得整个亚洲的数据,但无法单独提取出中国的数据。
a<-nest_2[nest_2$continent=="Asia",]$by_continent ##Any better way of isolating Asia from nest_2?
我以为我可以做到
b<-a[a$country=="China",]$by_country
但是我得到以下错误
Error in a[a$country == "China", ] : incorrect number of dimensions
> glimpse(a)
List of 1
$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 33 obs. of 2 variables:
..$ country : Factor w/ 142 levels "Afghanistan",..: 1 8 9 19 25 56 59 60 61 62 ...
..$ by_country:List of 33
所以我最大的错误是没有意识到得到的结果是一个列表,这可以通过在末尾添加 [[1]] 来补救。不过,我非常喜欢 @Floo0 的解决方案。我冒昧地提供了一个按列名取值的函数版本,以防列的顺序与给定的不同。
# Pull a single element out of a list-column by matching a key column.
#
# df:      data frame (or tibble) holding both columns.
# listcol: name of the list-column to extract from, given as a string.
# var:     name of the key column to match on, given as a string.
# var_val: value to look for in `var`.
#
# Returns the `listcol` element of the first row where `var` equals
# `var_val`; stops with an error when no row matches.
select_unnest <- function(df, listcol, var, var_val) {
  # which() drops NA comparisons; a raw logical index would turn every NA key
  # into a NULL placeholder that `[[1]]` could then return by mistake.
  hits <- which(df[[var]] == var_val)
  if (length(hits) == 0L) {
    stop("no row where `", var, "` equals ", var_val, call. = FALSE)
  }
  df[[listcol]][[hits[1L]]]
}
nest_2 %>% select_unnest(listcol = "by_continent", var = "continent", var_val = "Asia") %>%
select_unnest(listcol = "by_country", var = "country", var_val = "China")
您可能需要的是 [[]] 运算符,而不是简单的单个 []。
我不使用 purrr,所以不太明白你是如何得到这个奇怪的深层嵌套结构的(看起来你采用了与另一个问题类似的方法;针对该问题的评论提出了一些替代方法)。我可以通过下面的方式提取中国的 tibble,但要实现你想做的事情,应该还有更好的方法……
# Take the `by_continent` element of the row whose continent is "Asia".
n1 <- nest_2$by_continent[nest_2$continent=="Asia"][[1]]
# Keep only the row for China.
n2 <- n1 %>% filter(country=="China")
# Extract the first element of the `by_country` list-column.
n2$by_country[[1]]
您的 a 仍然是一个列表,需要先将其展平才能做进一步处理。您可以使用 flatten_df、dplyr::filter 和 unnest:
library(dplyr)
# Flatten the list `a` into a data frame, keep China's row, then expand its
# nested list-column.
a %>%
  flatten_df() %>%
  filter(country == "China") %>%
  unnest()
# A tibble: 12 x 5
country year lifeExp pop gdpPercap
<fctr> <int> <dbl> <int> <dbl>
1 China 1952 44.00000 556263527 400.4486
2 China 1957 50.54896 637408000 575.9870
3 China 1962 44.50136 665770000 487.6740
4 China 1967 58.38112 754550000 612.7057
5 China 1972 63.11888 862030000 676.9001
6 China 1977 63.96736 943455000 741.2375
7 China 1982 65.52500 1000281000 962.4214
8 China 1987 67.27400 1084035000 1378.9040
9 China 1992 68.69000 1164970000 1655.7842
10 China 1997 70.42600 1230075000 2289.2341
11 China 2002 72.02800 1280400000 3119.2809
12 China 2007 72.96100 1318683096 4959.1149
另一种提取亚洲数据的方法,其结果不再是一个列表,从而避免了之后再调用 flatten 的需要。
# Filter to Asia's row, keep only `by_continent`, and unnest it so the result
# is a data frame rather than a list.
asia <- nest_2 %>%
  filter(continent == "Asia") %>%
  select(by_continent) %>%
  unnest()
这是一种可以与管道 (%>%) 配合使用的基础 R 方法:
# Positional variant: match on the first column of `x` and return the
# corresponding element of its second column.
#
# x:          data frame (or tibble) whose first column holds the keys and
#             whose second column is the list-column to extract from.
# select_val: key value to look for in the first column.
#
# Returns the second-column element of the first matching row; stops with an
# error when no row matches.
select_unnest <- function(x, select_val) {
  # which() drops NA comparisons; a raw logical index would turn every NA key
  # into a NULL placeholder that `[[1]]` could then return by mistake.
  hits <- which(x[[1]] == select_val)
  if (length(hits) == 0L) {
    stop("no row whose first column equals ", select_val, call. = FALSE)
  }
  x[[2]][[hits[1L]]]
}
nest_2 %>% select_unnest("Asia") %>% select_unnest("China")
比较时间:
Unit: microseconds
min lq mean median uq max neval
aosmith1 3202.105 3354.0055 4045.9602 3612.126 4179.9610 17119.495 100
aosmith2 5797.744 6191.9380 7327.6619 6716.445 7662.6415 24245.779 100
Floo0 227.169 303.3280 414.3779 346.135 400.6735 4804.500 100
Ben Bolker 622.267 720.6015 852.9727 775.172 875.5985 1942.495 100
代码:
# Time the four approaches. Each expression is named so that the rows of the
# result can be matched to the labels used in the timing table.
microbenchmark::microbenchmark(
  aosmith1 = {
    a <- nest_2[nest_2$continent == "Asia", ]$by_continent
    flatten_df(a) %>%
      filter(country == "China") %>%
      unnest()
  },
  aosmith2 = {
    nest_2 %>%
      filter(continent == "Asia") %>%
      select(by_continent) %>%
      unnest() %>%
      filter(country == "China") %>%
      unnest()
  },
  # Relies on the two-argument select_unnest(x, select_val) helper.
  Floo0 = {
    nest_2 %>% select_unnest("Asia") %>% select_unnest("China")
  },
  `Ben Bolker` = {
    n1 <- nest_2$by_continent[nest_2$continent == "Asia"][[1]]
    n2 <- n1 %>% filter(country == "China")
    n2$by_country[[1]]
  }
)
一个data.table解决方案:
library(data.table) # as.data.table(), .SD and .() all come from data.table
DT <- as.data.table(gapminder)
# Nest data (starting smallest and working up):
# first collapse each (continent, country) group into one nested table ...
nest_DT <- DT[, list(by_country = list(.SD)), by = .(continent, country)]
# ... then collapse each continent's rows into one nested table.
# Note: this reassigns `nest_2`.
nest_2 <- nest_DT[, list(by_continent = list(.SD)), by = .(continent)]
我们现在可以将 [filter, column][[1]] 形式的调用链接在一起,以获得嵌套的值:
nest_2[continent == "Asia", by_continent][[1]]
country by_country
1: Afghanistan <data.table>
2: Bahrain <data.table>
3: Bangladesh <data.table>
4: Cambodia <data.table>
5: China <data.table>
6: Hong Kong, China <data.table>
7: India <data.table>
8: Indonesia <data.table>
9: Iran <data.table>
10: Iraq <data.table>
11: Israel <data.table>
12: Japan <data.table>
... ... ...
nest_2[continent == "Asia", by_continent][[1]][country == "China", by_country][[1]]
year lifeExp pop gdpPercap
1: 1952 44.00000 556263527 400.4486
2: 1957 50.54896 637408000 575.9870
3: 1962 44.50136 665770000 487.6740
4: 1967 58.38112 754550000 612.7057
5: 1972 63.11888 862030000 676.9001
6: 1977 63.96736 943455000 741.2375
7: 1982 65.52500 1000281000 962.4214
8: 1987 67.27400 1084035000 1378.9040
9: 1992 68.69000 1164970000 1655.7842
10: 1997 70.42600 1230075000 2289.2341
11: 2002 72.02800 1280400000 3119.2809
12: 2007 72.96100 1318683096 4959.1149