有没有更简洁的方法来整理数据框列表?
is there a cleaner way to tidy a list of data frames?
我使用的 API(Rblpapi 的 bdh() 函数)会返回一个数据框列表。
我想把列表的 names() 作为合并后单个数据框中的新列,从而把数据整理成整洁(tidy)格式。
我已经有一个解决方案,但我怀疑它容易出错,而且比必要的更慢。
# Create an example data set: two tibbles with identical columns (a date
# column plus three columns of random normal draws), collected in a named
# list whose names (obsA, obsB) identify each source.
# tibble() is used instead of data_frame(): data_frame() is deprecated and is
# not exported by tidyr, whereas tidyr re-exports tibble().
library(tidyr)
obsA <- tibble(
  date = as.Date("2009-01-01") + 0:2,
  X = rnorm(3, 0, 1),
  Y = rnorm(3, 0, 2),
  Z = rnorm(3, 0, 4)
)
obsB <- tibble(
  date = as.Date("2009-01-01") + 0:2,
  X = rnorm(3, 10, 1),
  Y = rnorm(3, 10, 2),
  Z = rnorm(3, 10, 4)
)
obs <- list(obsA = obsA, obsB = obsB)
我可以轻松创建单个数据框,但它会将各个列表名称放入唯一的行名称中。
# Stack every data frame in the list into one data frame.
obs_long <- do.call(what = "rbind", args = obs)
# Drawback: the list names only survive as prefixes of the row names.
rownames(obs_long)
#[1] "obsA.1" "obsA.2" "obsA.3" "obsB.1" "obsB.2" "obsB.3"
# The columns themselves carry no identifier for the source data frame.
names(obs_long)
#[1] "date" "X" "Y" "Z"
我可以取出行名,用正则表达式去掉附加的行序号,然后用 mutate()
把结果写入一个新列。
# Full solution, but ungainly.
# Needs dplyr (mutate, select, group_by, arrange, %>%) and stringr
# (str_extract) to be attached.
# Extra step to convert row names to a column. Risk of parsing error if
# a period is in an item name: the regex stops at the first character that
# is not a letter, digit or space.
# The pipe after do.call() must be active; with it commented out, tidy_obs
# held only the rbind() result and the mutate() line ran on its own.
tidy_obs <- do.call("rbind", obs) %>%
  mutate(item = str_extract(rownames(.), "[A-Za-z0-9 ]+")) %>%
  select(date, item, everything()) %>%
  group_by(item) %>%
  arrange(date)
# > tidy_obs
# # A tibble: 6 x 5
# # Groups: item [2]
# date item X Y Z
# <date> <chr> <dbl> <dbl> <dbl>
# 1 2009-01-01 obsA -0.1030362 2.274885 -4.134265
# 2 2009-01-01 obsB 8.4210832 7.604203 13.449731
# 3 2009-01-02 obsA -0.2279141 -2.748717 4.372599
# 4 2009-01-02 obsB 12.8940563 10.594164 8.108275
# 5 2009-01-03 obsA 0.5749725 -4.041280 -0.524420
# 6 2009-01-03 obsB 10.1158769 12.684331 8.248651
这行得通,但我想知道是否有更直接的方法,可以避免 mutate()
这一额外步骤,以及(或)str_extract()
带来的解析出错风险。谢谢!
您可以将 dplyr::bind_rows
与 .id
参数一起使用:
.id Data frame identifier.
When .id is supplied, a new column of
identifiers is created to link each row to its original data frame.
The labels are taken from the named arguments to bind_rows(). When a
list of data frames is supplied, the labels are taken from the names
of the list. If no names are found a numeric sequence is used instead.
# Stack the list of data frames; the "item" column created via .id records
# which list element each row came from.
dplyr::bind_rows(obs, .id = "item")
# A tibble: 6 x 5
# item date X Y Z
# <chr> <date> <dbl> <dbl> <dbl>
#1 obsA 2009-01-01 -1.73508885 -0.4402811 7.342978
#2 obsA 2009-01-02 1.17149983 -0.5429690 8.167079
#3 obsA 2009-01-03 0.08631895 -0.1430551 5.925108
#4 obsB 2009-01-01 9.66203430 7.1094147 15.577023
#5 obsB 2009-01-02 10.43062660 9.6160614 15.077929
#6 obsB 2009-01-03 8.80792988 8.9604396 7.413831
也可以使用 data.table
的 rbindlist
:
# data.table alternative: idcol adds an "item" column holding the list names.
library(data.table)
rbindlist(l = obs, idcol = "item")
# item date X Y Z
#1: obsA 2009-01-01 -0.2900620 2.694434 2.555925
#2: obsA 2009-01-02 -1.0221531 -4.172495 -4.357794
#3: obsA 2009-01-03 0.2389569 -1.088882 -4.944420
#4: obsB 2009-01-01 9.4713142 10.433553 9.319284
#5: obsB 2009-01-02 10.0967994 11.941084 13.589136
#6: obsB 2009-01-03 9.9403227 11.727769 17.792899
使用基础 R 的话,也可以结合 Map
和 Reduce
:
# Base R: append each list name as an "item" column, then stack the
# resulting data frames pairwise with rbind.
Reduce(
  f = rbind,
  x = Map(
    function(df, label) cbind(df, item = label),
    obs,
    names(obs)
  )
)
我使用的 API(Rblpapi 的 bdh() 函数)会返回一个数据框列表。
我想把列表的 names() 作为合并后单个数据框中的新列,从而把数据整理成整洁(tidy)格式。
我已经有一个解决方案,但我怀疑它容易出错,而且比必要的更慢。
# Create an example data set: two tibbles with identical columns (a date
# column plus three columns of random normal draws), collected in a named
# list whose names (obsA, obsB) identify each source.
# tibble() is used instead of data_frame(): data_frame() is deprecated and is
# not exported by tidyr, whereas tidyr re-exports tibble().
library(tidyr)
obsA <- tibble(
  date = as.Date("2009-01-01") + 0:2,
  X = rnorm(3, 0, 1),
  Y = rnorm(3, 0, 2),
  Z = rnorm(3, 0, 4)
)
obsB <- tibble(
  date = as.Date("2009-01-01") + 0:2,
  X = rnorm(3, 10, 1),
  Y = rnorm(3, 10, 2),
  Z = rnorm(3, 10, 4)
)
obs <- list(obsA = obsA, obsB = obsB)
我可以轻松创建单个数据框,但它会将各个列表名称放入唯一的行名称中。
# Stack every data frame in the list into one data frame.
obs_long <- do.call(what = "rbind", args = obs)
# Drawback: the list names only survive as prefixes of the row names.
rownames(obs_long)
#[1] "obsA.1" "obsA.2" "obsA.3" "obsB.1" "obsB.2" "obsB.3"
# The columns themselves carry no identifier for the source data frame.
names(obs_long)
#[1] "date" "X" "Y" "Z"
我可以取出行名,用正则表达式去掉附加的行序号,然后用 mutate()
把结果写入一个新列。
# Full solution, but ungainly.
# Needs dplyr (mutate, select, group_by, arrange, %>%) and stringr
# (str_extract) to be attached.
# Extra step to convert row names to a column. Risk of parsing error if
# a period is in an item name: the regex stops at the first character that
# is not a letter, digit or space.
# The pipe after do.call() must be active; with it commented out, tidy_obs
# held only the rbind() result and the mutate() line ran on its own.
tidy_obs <- do.call("rbind", obs) %>%
  mutate(item = str_extract(rownames(.), "[A-Za-z0-9 ]+")) %>%
  select(date, item, everything()) %>%
  group_by(item) %>%
  arrange(date)
# > tidy_obs
# # A tibble: 6 x 5
# # Groups: item [2]
# date item X Y Z
# <date> <chr> <dbl> <dbl> <dbl>
# 1 2009-01-01 obsA -0.1030362 2.274885 -4.134265
# 2 2009-01-01 obsB 8.4210832 7.604203 13.449731
# 3 2009-01-02 obsA -0.2279141 -2.748717 4.372599
# 4 2009-01-02 obsB 12.8940563 10.594164 8.108275
# 5 2009-01-03 obsA 0.5749725 -4.041280 -0.524420
# 6 2009-01-03 obsB 10.1158769 12.684331 8.248651
这行得通,但我想知道是否有更直接的方法,可以避免 mutate()
这一额外步骤,以及(或)str_extract()
带来的解析出错风险。谢谢!
您可以将 dplyr::bind_rows
与 .id
参数一起使用:
.id Data frame identifier.
When .id is supplied, a new column of identifiers is created to link each row to its original data frame. The labels are taken from the named arguments to bind_rows(). When a list of data frames is supplied, the labels are taken from the names of the list. If no names are found a numeric sequence is used instead.
# Stack the list of data frames; the "item" column created via .id records
# which list element each row came from.
dplyr::bind_rows(obs, .id = "item")
# A tibble: 6 x 5
# item date X Y Z
# <chr> <date> <dbl> <dbl> <dbl>
#1 obsA 2009-01-01 -1.73508885 -0.4402811 7.342978
#2 obsA 2009-01-02 1.17149983 -0.5429690 8.167079
#3 obsA 2009-01-03 0.08631895 -0.1430551 5.925108
#4 obsB 2009-01-01 9.66203430 7.1094147 15.577023
#5 obsB 2009-01-02 10.43062660 9.6160614 15.077929
#6 obsB 2009-01-03 8.80792988 8.9604396 7.413831
也可以使用 data.table
的 rbindlist
:
# data.table alternative: idcol adds an "item" column holding the list names.
library(data.table)
rbindlist(l = obs, idcol = "item")
# item date X Y Z
#1: obsA 2009-01-01 -0.2900620 2.694434 2.555925
#2: obsA 2009-01-02 -1.0221531 -4.172495 -4.357794
#3: obsA 2009-01-03 0.2389569 -1.088882 -4.944420
#4: obsB 2009-01-01 9.4713142 10.433553 9.319284
#5: obsB 2009-01-02 10.0967994 11.941084 13.589136
#6: obsB 2009-01-03 9.9403227 11.727769 17.792899
使用基础 R 的话,也可以结合 Map
和 Reduce
:
# Base R: append each list name as an "item" column, then stack the
# resulting data frames pairwise with rbind.
Reduce(
  f = rbind,
  x = Map(
    function(df, label) cbind(df, item = label),
    obs,
    names(obs)
  )
)