如何从 R 数据框中的列表列中提取变量?

How to extract variables from list columns in R data frames?

我正在使用 purrrsegmented 库来拟合多个模型。我的目标是首先创建列表列,然后从中提取有用的变量。以下是示例数据框。

数据

请注意,以下数据框只有 1 个 Vehicle.ID2。在原始数据集中,我有多个Vehicle.ID2s.

> dput(fedf)
structure(list(Time = c(18.9, 19, 19.1, 19.2, 19.3, 19.4, 19.5, 
19.6, 19.7, 19.8, 19.9, 20, 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 
20.7, 20.8, 20.9, 21, 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 
21.8, 21.9, 22, 22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 
22.9, 23, 23.1, 23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.8, 23.9, 
24, 24.1, 24.2, 24.3, 24.4, 24.5, 24.6, 24.7, 24.8, 24.9, 25, 
25.1, 25.2, 25.3, 25.4, 25.5, 25.6, 25.7, 25.8, 25.9, 26, 26.1, 
26.2, 26.3, 26.4, 26.5, 26.6, 26.7, 26.8, 26.9, 27, 27.1, 27.2, 
27.3, 27.4, 27.5, 27.6, 27.7, 27.8, 27.9, 28, 28.1, 28.2, 28.3, 
28.4, 28.5, 28.6, 28.7, 28.8, 28.9, 29, 29.1, 29.2, 29.3, 29.4, 
29.5, 29.6, 29.7, 29.8, 29.9, 30, 30.1, 30.2, 30.3, 30.4, 30.5, 
30.6, 30.7, 30.8, 30.9), Vehicle.ID2 = c("11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1"), svel_mean = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 4.76472709090909, 4.77719854545454, 
4.78953218181818, 4.80158254545454, 4.81301290909091, 4.82320690909091, 
4.83122454545454, 4.83587545454545, 4.83594327272727, 4.83050254545455, 
4.81921854545454, 4.80256563636364, 4.78185709090909, 4.758948, 
4.73566690909091, 4.71328727272727, 4.69232381818182, 4.67267072727273, 
4.65389381818182, 4.63546509090909, 4.61684309090909, 4.59742581818182, 
4.57649418181818, 4.55325309090909, 4.52699290909091, 4.49726818181818, 
4.46394654545455, 4.42708672727273, 4.38678090909091, 4.34312763636364, 
4.29634472727273, 4.24687272727273, 4.19533527272727, 4.14238290909091, 
4.08854727272727, 4.03419072727273, 3.97953490909091, 3.92470963636364, 
3.86979, 3.81481781818182, 3.75981527272727, 3.704794, 3.64976254545455, 
3.59472727272727, 3.53969090909091, 3.48465327272727, 3.42961290909091, 
3.37456781818182, 3.31951545454545, 3.26445218181818, 3.20936963636364, 
3.15425254545455, 3.09907545454545, 3.04379436363636, 2.98833436363636, 
2.93256963636364, 2.87630054545454, 2.81926036363636, 2.76120945454545, 
2.702132, 2.64245218181818, 2.58312109090909, 2.52547036363636, 
2.47087418181818, 2.42039836363636, 2.37458763636364, 2.33338690909091, 
2.29618309090909, 2.262014, 2.22988945454545, 2.19903090909091, 
2.16892327272727, 2.13925218181818, 2.10983054545455, 2.08054945454545, 
2.05134563636364, 2.02218490909091, 1.99305654545455, 1.96398472727273, 
1.935058, 1.90647618181818, 1.87860145454545, 1.85197418181818, 
1.82724109090909, 1.80498454545455, 1.785546, 1.76897709090909, 
1.75517927272727, 1.74412, 1.73593218181818, 1.73079563636364, 
1.72870563636364, 1.72933218181818, 1.73208145454545), dssvel = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 
0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 2, 0)), .Names = c("Time", "Vehicle.ID2", "svel_mean", 
"dssvel"), row.names = c(NA, -121L), class = c("tbl_df", "tbl", 
"data.frame"))  

拟合分段回归模型的函数:

segf2_1 <- function(df){
  try(segmented(lm(svel_mean ~ Time, data=df), seg.Z = ~Time,
                psi = list(Time = df$Time[which(df$dssvel != 0)])#,
                #control = seg.control(seed=2)
  ),
  silent=TRUE)
}  

应用函数:

library(dplyr)
library(segmented)
library(purrr)
library(tidyr)
## Nesting the data frame --------------------
by_veh <- fedf %>% 
  group_by(Vehicle.ID2) %>% 
  nest()


## Applying the function ---------------------
modelz <- by_veh %>% 
  mutate(segs = map(data, segf2_1))

##  Getting time, acceeration and speed ---------
m <- modelz %>% 
  mutate(time_at_action = segs %>% map(function(x) round(x$psi[,2],0)),

         action_accel = map(segs, function(x) slope(x)[[1]][,1]),

         fitted_speed = map(segs, function(x) fitted(x))
         )

数据框 m 有 4 个包含有用数据的列表列。

我想做的事情:

我的数据集中基本上需要两个新列:每段之间的拟合速度和恒定加速度。因此,我的预期输出如下:

期望的输出

> dput(data_set)
structure(list(Vehicle.ID2 = c("11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", "11.1", 
"11.1", "11.1", "11.1", "11.1", "11.1"), Time = c(18.9, 19, 19.1, 
19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20, 20.1, 20.2, 
20.3, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21, 21.1, 21.2, 21.3, 
21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22, 22.1, 22.2, 22.3, 22.4, 
22.5, 22.6, 22.7, 22.8, 22.9, 23, 23.1, 23.2, 23.3, 23.4, 23.5, 
23.6, 23.7, 23.8, 23.9, 24, 24.1, 24.2, 24.3, 24.4, 24.5, 24.6, 
24.7, 24.8, 24.9, 25, 25.1, 25.2, 25.3, 25.4, 25.5, 25.6, 25.7, 
25.8, 25.9, 26, 26.1, 26.2, 26.3, 26.4, 26.5, 26.6, 26.7, 26.8, 
26.9, 27, 27.1, 27.2, 27.3, 27.4, 27.5, 27.6, 27.7, 27.8, 27.9, 
28, 28.1, 28.2, 28.3, 28.4, 28.5, 28.6, 28.7, 28.8, 28.9, 29, 
29.1, 29.2, 29.3, 29.4, 29.5, 29.6, 29.7, 29.8, 29.9, 30, 30.1, 
30.2, 30.3, 30.4, 30.5, 30.6, 30.7, 30.8, 30.9), svel_mean = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 4.76472709090909, 4.77719854545454, 
4.78953218181818, 4.80158254545454, 4.81301290909091, 4.82320690909091, 
4.83122454545454, 4.83587545454545, 4.83594327272727, 4.83050254545455, 
4.81921854545454, 4.80256563636364, 4.78185709090909, 4.758948, 
4.73566690909091, 4.71328727272727, 4.69232381818182, 4.67267072727273, 
4.65389381818182, 4.63546509090909, 4.61684309090909, 4.59742581818182, 
4.57649418181818, 4.55325309090909, 4.52699290909091, 4.49726818181818, 
4.46394654545455, 4.42708672727273, 4.38678090909091, 4.34312763636364, 
4.29634472727273, 4.24687272727273, 4.19533527272727, 4.14238290909091, 
4.08854727272727, 4.03419072727273, 3.97953490909091, 3.92470963636364, 
3.86979, 3.81481781818182, 3.75981527272727, 3.704794, 3.64976254545455, 
3.59472727272727, 3.53969090909091, 3.48465327272727, 3.42961290909091, 
3.37456781818182, 3.31951545454545, 3.26445218181818, 3.20936963636364, 
3.15425254545455, 3.09907545454545, 3.04379436363636, 2.98833436363636, 
2.93256963636364, 2.87630054545454, 2.81926036363636, 2.76120945454545, 
2.702132, 2.64245218181818, 2.58312109090909, 2.52547036363636, 
2.47087418181818, 2.42039836363636, 2.37458763636364, 2.33338690909091, 
2.29618309090909, 2.262014, 2.22988945454545, 2.19903090909091, 
2.16892327272727, 2.13925218181818, 2.10983054545455, 2.08054945454545, 
2.05134563636364, 2.02218490909091, 1.99305654545455, 1.96398472727273, 
1.935058, 1.90647618181818, 1.87860145454545, 1.85197418181818, 
1.82724109090909, 1.80498454545455, 1.785546, 1.76897709090909, 
1.75517927272727, 1.74412, 1.73593218181818, 1.73079563636364, 
1.72870563636364, 1.72933218181818, 1.73208145454545), dssvel = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 
0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 2, 0), fitted_speed = c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, 4.86921514029415, 4.85760307362944, 4.84599100696473, 
4.83437894030001, 4.82276687363529, 4.81115480697058, 4.79954274030586, 
4.78793067364114, 4.77631860697643, 4.76470654031171, 4.75309447364699, 
4.74148240698228, 4.72987034031756, 4.71825827365285, 4.70664620698813, 
4.69503414032341, 4.6834220736587, 4.67181000699398, 4.66019794032926, 
4.64858587366455, 4.63697380699983, 4.62536174033511, 4.6137496736704, 
4.60213760700568, 4.59052554034097, 4.56141666682718, 4.50709477451488, 
4.45277288220258, 4.39845098989029, 4.34412909757799, 4.28980720526569, 
4.23548531295339, 4.1811634206411, 4.1268415283288, 4.0725196360165, 
4.0181977437042, 3.96387585139191, 3.90955395907961, 3.85523206676731, 
3.80091017445502, 3.74658828214272, 3.69226638983042, 3.63794449751812, 
3.58362260520583, 3.52930071289353, 3.47497882058123, 3.42065692826894, 
3.36633503595664, 3.31201314364434, 3.25769125133204, 3.20336935901975, 
3.14904746670745, 3.09472557439515, 3.04040368208286, 2.98608178977056, 
2.93175989745826, 2.87743800514596, 2.82311611283367, 2.76879422052137, 
2.71447232820907, 2.66015043589677, 2.60582854358448, 2.55150665127218, 
2.49718475895988, 2.44286286664759, 2.38854097433529, 2.33421908202299, 
2.27989718971069, 2.2255752973984, 2.18528965396358, 2.16277983827573, 
2.14027002258788, 2.11776020690003, 2.09525039121218, 2.07274057552433, 
2.05023075983648, 2.02772094414863, 2.00521112846078, 1.98270131277293, 
1.96019149708508, 1.93768168139723, 1.91517186570939, 1.89266205002154, 
1.87015223433369, 1.84764241864584, 1.82513260295799, 1.80262278727014, 
1.78011297158229, 1.75760315589444, 1.73509334020659, 1.71258352451874, 
1.69007370883089, 1.66756389314304, 1.64505407745519), action_accel = c(-0.1161, 
-0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, 
-0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, 
-0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, 
-0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, 
-0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, 
-0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, 
-0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, -0.1161, 
-0.1161, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, 
-0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, 
-0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, 
-0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, 
-0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, 
-0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.5432, -0.2251, 
-0.2251, -0.2251, -0.2251, -0.2251, -0.2251, -0.2251, -0.2251, 
-0.2251, -0.2251, -0.2251, -0.2251, -0.2251, -0.2251, -0.2251, 
-0.2251, -0.2251, -0.2251, -0.2251, -0.2251, -0.2251, -0.2251, 
-0.2251, -0.2251, -0.2251, -0.2251, -0.2251, -0.2251, -0.2251, 
-0.2251)), row.names = c(NA, -121L), .Names = c("Vehicle.ID2", 
"Time", "svel_mean", "dssvel", "fitted_speed", "action_accel"
), class = "data.frame")

实现所需输出的一种方法:

我使用以下代码获得我想要的输出。但它很长。我确信必须有更直接的方法(类似于 broom::glancebroom::tidy)。请帮我改进这段代码。

library(tidyr)
## Unnesting -----------
time_at_action_df <- unnest(m, time_at_action)
action_accel_df <- unnest(m, action_accel)
fitted_speed_df <- unnest(m, fitted_speed)
data_set <- unnest(m, data)


## First point in time for each pair ----
du <- data_set %>% 
  group_by(Vehicle.ID2) %>% 
  summarise(psi_first = head(Time, 1)) %>% 
  ungroup()  

## Action Points ------
time_at_action_df <- time_at_action_df %>% 
  left_join(x = ., y = du) %>% 
  group_by(Vehicle.ID2) %>% 
  do(data.frame(time_at_action = c(unique(.$psi_first), .$time_at_action ))) %>% 
  ungroup()


## Accelerations/Decelerations and action points -----
action_accel_df <- action_accel_df %>% 
  arrange(Vehicle.ID2) 
action_accel_df <- cbind(action_accel_df, time_at_action = time_at_action_df$time_at_action)


## Combining fitted speeds with original data ------
fitted_speed_df <- fitted_speed_df %>% 
  group_by(Vehicle.ID2) %>% 
  do(data.frame(fitted_speed = c(rep(NA,27), .$fitted_speed))) %>% 
  ungroup()

## Arranging ----
data_set <- data_set %>% 
  arrange(Vehicle.ID2)

data_set <- cbind(data_set, fitted_speed_df)
data_set <- data_set[,-5]

## Combining slope with original data ---
data_set <- action_accel_df %>% 
  left_join(x=data_set, y = ., 
            by = c('Vehicle.ID2' = 'Vehicle.ID2', 
                   'Time' = 'time_at_action')) %>% 
  fill(action_accel, .direction = "down")

这个解决方案对您来说太过分了吗?'hard coding'?在有人对我使用 splitpurrr 大喊大叫之前。如果 length(time_at_action) != 2,代码就会中断,但我确信这不是无法克服的修复挑战。应该推广到多个 Vehicle.ID2.

library(dplyr)
library(segmented)
library(purrr)
library(tidyr)

data = fedf %>% split(., .$Vehicle.ID2)
model = data %>% map(segf2_1)
param = model %>% map( ~ {list(time_at_action = round(.$psi[, 2], 0),
                               action_accel = slope(.)[[1]][, 1],
                               fitted_speed = fitted(.))})

dfexp2 = pmap(.l = list(data, model, param),
     .f = function(x, y, z) {
       x %>% mutate(fitted_speed = c(rep(NA, times = sum(is.na(x$svel_mean))), z$fitted),
                    time = ifelse(
                      test = Time < z$time_at_action[1],
                      yes = z$action_accel[[1]],
                      no = ifelse(test = (Time > z$time_at_action[1]) & (Time < z$time_at_action[2]),
                                  yes = z$action_accel[[2]],
                                  no = z$action_accel[[3]])))})

dfexp2 %>% bind_rows()

编辑:没有硬编码的新代码:

dfexp2 = pmap(.l = list(data, model, param),
              .f = function(x, y, z) {
                x %>% mutate(fitted_speed = c(rep(NA, times = sum(is.na(x$svel_mean))), z$fitted),
                             time = z$action_accel[cut(Time ,c(min(Time), z$time_at_action, max(Time)), right = F)])})