如何使用 plyr/dplyr 更快地从现有变量创建上限和下限年龄变量
How to make creating upper and lower age variables from existing variable much faster using plyr/dplyr
我有一些 'untidy' 格式的数据 - 变量名称中嵌入了 'age'。
使用 dplyr,我想创建一个 'tidy' 格式的数据集,其中的键是数据区、年份和年龄组,并且年龄组中的低年龄和高年龄是单独的变量。
一切都很好,除了最后一步花费的时间比我希望的要长得多。有没有既更快、又仍然 'readable' 的方法?
完整的可复现示例(使用 repmis 提取文件):
require(repmis)
require(stringr)
require(tidyr)
require(plyr)
require(dplyr)
# Fetch persons.csv from Dropbox and keep datazone, year and every column
# whose name contains "hspeop".
persons <- select(
  tbl_df(source_DropboxData(file = "persons.csv", key = "vcz7qngb44vbynq")),
  datazone, year, contains("hspeop")
)

# Swap the "GR.hspeop" prefix of the count columns for "count_both_".
names(persons) <- str_replace_all(names(persons), "GR.hspeop", "count_both_")

# Go from wide to long (one row per datazone, year and age column), then
# drop the "count_both_" prefix so age_group holds only the age code.
persons <- mutate(
  gather(persons, age_group, count, -datazone, -year),
  gender = "both",
  age_group = str_replace_all(age_group, "count_both_", "")
)

# Rewrite the banded age codes as "lower_upper"; the open-ended top band
# ("85over") is given an upper bound of 100.
persons$age_group <- revalue(
  persons$age_group,
  c(
    "1619" = "16_19",
    "2024" = "20_24",
    "2529" = "25_29",
    "3034" = "30_34",
    "3539" = "35_39",
    "4044" = "40_44",
    "4549" = "45_49",
    "5054" = "50_54",
    "5559" = "55_59",
    "6064" = "60_64",
    "6569" = "65_69",
    "7074" = "70_74",
    "7579" = "75_79",
    "8084" = "80_84",
    "85over" = "85_100"
  )
)

# The empty code is relabelled here rather than inside revalue(), which
# (per the original author's note) cannot cope with "".
persons$age_group[nchar(persons$age_group) == 0] <- "all"

# Keep only the rows whose age_group contains an underscore.
persons_by_age <- filter(persons, grepl("_", age_group))
# Add numeric lower_age / upper_age columns derived from age_group.
# Within each age_group group, age_group is split on "_" and only the
# pieces of the first element ([[1]]) are used: piece 1 becomes lower_age,
# piece 2 becomes upper_age. Note that str_split() is called twice, once
# per new column.
# NOTE(review): no ungroup() follows, so the result presumably stays
# grouped by age_group -- confirm before chaining further verbs.
persons_by_age <- persons_by_age %>%
group_by(age_group) %>%
mutate(
lower_age = str_split(age_group, "_")[[1]][1] %>% as.numeric(),
upper_age = str_split(age_group, "_")[[1]][2] %>% as.numeric()
)
显然,我在 mutate 中把同一个对象创建了两次,因此去掉重复后速度可能会提高一倍。我还以为 group_by 意味着每个年龄组只需执行一次操作,但它似乎对每一行都执行了一次。例如,先按年龄组 summarise,再 mutate,然后 join 回原数据,会不会是一种更快的方法?
编辑
上面的代码已经创建了输出,但比我希望的要慢得多。
最终输出的几个例子:
> persons_by_age
Source: local data frame [5,854,500 x 7]
datazone year age_group count gender lower_age upper_age
1 S01000001 1996 0 8 both 0 0
2 S01000002 1996 0 4 both 0 0
3 S01000003 1996 0 18 both 0 0
4 S01000004 1996 0 4 both 0 0
5 S01000005 1996 0 17 both 0 0
6 S01000006 1996 0 1 both 0 0
7 S01000007 1996 0 9 both 0 0
8 S01000008 1996 0 10 both 0 0
9 S01000009 1996 0 8 both 0 0
10 S01000010 1996 0 9 both 0 0
.. ... ... ... ... ... ... ...
> persons_by_age %>% filter(year==2000 & gender=="male" & lower_age > 30)
Source: local data frame [71,555 x 7]
datazone year age_group count gender lower_age upper_age
1 S01000001 2000 35_39 34 male 35 39
2 S01000002 2000 35_39 41 male 35 39
3 S01000003 2000 35_39 61 male 35 39
4 S01000004 2000 35_39 43 male 35 39
5 S01000005 2000 35_39 43 male 35 39
6 S01000006 2000 35_39 24 male 35 39
7 S01000007 2000 35_39 34 male 35 39
8 S01000008 2000 35_39 23 male 35 39
9 S01000009 2000 35_39 30 male 35 39
10 S01000010 2000 35_39 37 male 35 39
.. ... ... ... ... ... ... ...
> persons_by_age %>% filter(year==2000 & gender=="female" & lower_age > 30)
Source: local data frame [71,555 x 7]
datazone year age_group count gender lower_age upper_age
1 S01000001 2000 35_39 37 female 35 39
2 S01000002 2000 35_39 30 female 35 39
3 S01000003 2000 35_39 58 female 35 39
4 S01000004 2000 35_39 46 female 35 39
5 S01000005 2000 35_39 28 female 35 39
6 S01000006 2000 35_39 29 female 35 39
7 S01000007 2000 35_39 33 female 35 39
8 S01000008 2000 35_39 25 female 35 39
9 S01000009 2000 35_39 36 female 35 39
10 S01000010 2000 35_39 38 female 35 39
.. ... ... ... ... ... ... ...
你可以试试这个:
# For each age_group group: take the cell at row 1, column 3 of the group's
# data, split it on "_", repeat the resulting pieces nrow(.) times, lay them
# out as a two-column matrix filled by row (one identical row per group
# row), and cbind that matrix onto the group.
# NOTE(review): column 3 is assumed to be age_group -- confirm the column
# order of persons_by_age.
# NOTE(review): the two added columns come from a character matrix and are
# not given explicit names here; presumably they need renaming/converting
# afterwards if numeric lower/upper ages are required -- verify.
persons_by_age<-persons_by_age %>%
group_by(age_group) %>%
do(cbind(.,matrix(rep(unlist(strsplit(as.character(.[1,3]), "_")),nrow(.)),ncol=2,byrow=TRUE)))
`.` 允许您访问 group_by 中的各个组。
对于每个组,age_group 列的第一行(`.[1,3]`)被拆分,lower 和 upper 组成一个向量,然后按该组的行数重复。
然后将得到的矩阵绑定到组中。
只需几秒钟即可运行完毕。
正如 @jazzurro 所建议的那样,使用 separate 要容易得多:
# Split age_group on "_" into "lower" and "upper" in one vectorised call.
# remove = FALSE keeps the original age_group column alongside the new ones.
# convert = TRUE turns the new columns into numbers; without it they stay
# character, so a comparison such as lower > 30 would compare strings
# rather than ages.
separate(
  persons_by_age, age_group, c("lower", "upper"),
  sep = "_", remove = FALSE, convert = TRUE
)
我有一些 'untidy' 格式的数据 - 变量名称中嵌入了 'age'。 使用 dplyr,我想创建一个 'tidy' 格式的数据集,其中的键是数据区、年份和年龄组,并且年龄组中的低年龄和高年龄是单独的变量。
一切都很好,除了最后一步花费的时间比我希望的要长得多。有没有既更快、又仍然 'readable' 的方法?
完整的可复现示例(使用 repmis 提取文件):
require(repmis)
require(stringr)
require(tidyr)
require(plyr)
require(dplyr)
# Fetch persons.csv from Dropbox and keep datazone, year and every column
# whose name contains "hspeop".
persons <- select(
  tbl_df(source_DropboxData(file = "persons.csv", key = "vcz7qngb44vbynq")),
  datazone, year, contains("hspeop")
)

# Swap the "GR.hspeop" prefix of the count columns for "count_both_".
names(persons) <- str_replace_all(names(persons), "GR.hspeop", "count_both_")

# Go from wide to long (one row per datazone, year and age column), then
# drop the "count_both_" prefix so age_group holds only the age code.
persons <- mutate(
  gather(persons, age_group, count, -datazone, -year),
  gender = "both",
  age_group = str_replace_all(age_group, "count_both_", "")
)

# Rewrite the banded age codes as "lower_upper"; the open-ended top band
# ("85over") is given an upper bound of 100.
persons$age_group <- revalue(
  persons$age_group,
  c(
    "1619" = "16_19",
    "2024" = "20_24",
    "2529" = "25_29",
    "3034" = "30_34",
    "3539" = "35_39",
    "4044" = "40_44",
    "4549" = "45_49",
    "5054" = "50_54",
    "5559" = "55_59",
    "6064" = "60_64",
    "6569" = "65_69",
    "7074" = "70_74",
    "7579" = "75_79",
    "8084" = "80_84",
    "85over" = "85_100"
  )
)

# The empty code is relabelled here rather than inside revalue(), which
# (per the original author's note) cannot cope with "".
persons$age_group[nchar(persons$age_group) == 0] <- "all"

# Keep only the rows whose age_group contains an underscore.
persons_by_age <- filter(persons, grepl("_", age_group))
# Add numeric lower_age / upper_age columns derived from age_group.
# Within each age_group group, age_group is split on "_" and only the
# pieces of the first element ([[1]]) are used: piece 1 becomes lower_age,
# piece 2 becomes upper_age. Note that str_split() is called twice, once
# per new column.
# NOTE(review): no ungroup() follows, so the result presumably stays
# grouped by age_group -- confirm before chaining further verbs.
persons_by_age <- persons_by_age %>%
group_by(age_group) %>%
mutate(
lower_age = str_split(age_group, "_")[[1]][1] %>% as.numeric(),
upper_age = str_split(age_group, "_")[[1]][2] %>% as.numeric()
)
显然,我在 mutate 中把同一个对象创建了两次,因此去掉重复后速度可能会提高一倍。我还以为 group_by 意味着每个年龄组只需执行一次操作,但它似乎对每一行都执行了一次。例如,先按年龄组 summarise,再 mutate,然后 join 回原数据,会不会是一种更快的方法?
编辑
上面的代码已经创建了输出,但比我希望的要慢得多。
最终输出的几个例子:
> persons_by_age
Source: local data frame [5,854,500 x 7]
datazone year age_group count gender lower_age upper_age
1 S01000001 1996 0 8 both 0 0
2 S01000002 1996 0 4 both 0 0
3 S01000003 1996 0 18 both 0 0
4 S01000004 1996 0 4 both 0 0
5 S01000005 1996 0 17 both 0 0
6 S01000006 1996 0 1 both 0 0
7 S01000007 1996 0 9 both 0 0
8 S01000008 1996 0 10 both 0 0
9 S01000009 1996 0 8 both 0 0
10 S01000010 1996 0 9 both 0 0
.. ... ... ... ... ... ... ...
> persons_by_age %>% filter(year==2000 & gender=="male" & lower_age > 30)
Source: local data frame [71,555 x 7]
datazone year age_group count gender lower_age upper_age
1 S01000001 2000 35_39 34 male 35 39
2 S01000002 2000 35_39 41 male 35 39
3 S01000003 2000 35_39 61 male 35 39
4 S01000004 2000 35_39 43 male 35 39
5 S01000005 2000 35_39 43 male 35 39
6 S01000006 2000 35_39 24 male 35 39
7 S01000007 2000 35_39 34 male 35 39
8 S01000008 2000 35_39 23 male 35 39
9 S01000009 2000 35_39 30 male 35 39
10 S01000010 2000 35_39 37 male 35 39
.. ... ... ... ... ... ... ...
> persons_by_age %>% filter(year==2000 & gender=="female" & lower_age > 30)
Source: local data frame [71,555 x 7]
datazone year age_group count gender lower_age upper_age
1 S01000001 2000 35_39 37 female 35 39
2 S01000002 2000 35_39 30 female 35 39
3 S01000003 2000 35_39 58 female 35 39
4 S01000004 2000 35_39 46 female 35 39
5 S01000005 2000 35_39 28 female 35 39
6 S01000006 2000 35_39 29 female 35 39
7 S01000007 2000 35_39 33 female 35 39
8 S01000008 2000 35_39 25 female 35 39
9 S01000009 2000 35_39 36 female 35 39
10 S01000010 2000 35_39 38 female 35 39
.. ... ... ... ... ... ... ...
你可以试试这个:
# For each age_group group: take the cell at row 1, column 3 of the group's
# data, split it on "_", repeat the resulting pieces nrow(.) times, lay them
# out as a two-column matrix filled by row (one identical row per group
# row), and cbind that matrix onto the group.
# NOTE(review): column 3 is assumed to be age_group -- confirm the column
# order of persons_by_age.
# NOTE(review): the two added columns come from a character matrix and are
# not given explicit names here; presumably they need renaming/converting
# afterwards if numeric lower/upper ages are required -- verify.
persons_by_age<-persons_by_age %>%
group_by(age_group) %>%
do(cbind(.,matrix(rep(unlist(strsplit(as.character(.[1,3]), "_")),nrow(.)),ncol=2,byrow=TRUE)))
`.` 允许您访问 group_by 中的各个组。
对于每个组,age_group 列的第一行(`.[1,3]`)被拆分,lower 和 upper 组成一个向量,然后按该组的行数重复。
然后将得到的矩阵绑定到组中。只需几秒钟即可运行完毕。
正如 @jazzurro 所建议的那样,使用 separate 要容易得多:
# Split age_group on "_" into "lower" and "upper" in one vectorised call.
# remove = FALSE keeps the original age_group column alongside the new ones.
# convert = TRUE turns the new columns into numbers; without it they stay
# character, so a comparison such as lower > 30 would compare strings
# rather than ages.
separate(
  persons_by_age, age_group, c("lower", "upper"),
  sep = "_", remove = FALSE, convert = TRUE
)