以编程方式为变量的每个可能值创建一个虚拟对象,并将这些虚拟对象传递给公式
programmatically create a dummy for each possible value of a variable, and pass these dummies to a formula
我发现 NHL shift data 并想评估一个模型,在该模型中,进球数遵循泊松分布,具体取决于两队谁在场上。
我的观点是,我们已经很清楚谁能得分(进球和助攻),但也许有人真的很擅长帮助他的球队得分而无需得分 sheet(也许是产生失误?)或者只是非常擅长阻止对方得分。
我可以创建如下所示 "data" 的数据集。每支球队通常有 5 名球员在冰上,但我只放了 2 名球员以使示例易于理解。
基本上,每个轮班我都有一行,我知道轮班的结果 (goal_for),shift_duration 并且我有一个为球队效力的球员的 ID 列表(for_players) 和对方 (against_players)。
我想做的 是获取 "data" 数据集并创建 "model_data",其中包含一个虚拟变量,指示一名球员是否在冰上对于给定的班次。然后我会为我的泊松模型创建一个公式,其中将包含所有的假人并将其传递给模型。我也可以放弃一个虚拟人,一个虚拟人反对,但我也可以让 mgcv:gam 为我做。
我怀疑这会涉及一些!!和 quos(),但我不知道该怎么做。
data <- tibble(
shift_id = c(1, 2, 3, 4, 5, 6, 7, 8,9,10),
shift_duration = c(12, 7, 30, 11, 14, 16, 19, 32,11,12),
goal_for = c(1, 1, 0, 0, 1, 1, 0, 0,0,0),
for_players = list(
c("A", "B"),
c("A", "C"),
c("B", "C"),
c("A", "C"),
c("B", "C"),
c("A", "B"),
c("B", "C"),
c("A", "B"),
c("B", "C"),
c("A", "B")
),
against_players = list(
c("X", "Z"),
c("Y", "Z"),
c("X", "Y"),
c("X", "Y"),
c("X", "Z"),
c("Y", "Z"),
c("X", "Y"),
c("Y", "Z"),
c("X", "Y"),
c("Y", "Z")
)
)
(black magic goes here)
model_data <- tibble(
shift_id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
shift_duration = c(12, 7, 30, 11, 14, 16, 19, 32, 11, 12),
goal_for = c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0),
for_player_A = c(1, 1, 0, 1, 0, 1, 0, 1, 0, 1),
for_player_B = c(1, 0, 1, 0, 1, 1, 1, 1, 1, 1),
for_player_C = c(0, 1, 1, 1, 1, 0, 1, 0, 1, 0),
against_player_X = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 0),
against_player_Y = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1),
against_player_Z = c(1, 1, 0, 0, 1, 1, 0, 1, 0, 1)
)
mod.gam <- mgcv::gam(
data = model_data,
formula = goal_for ~ offset(log(shift_duration)) + for_player_A + for_player_B + for_player_C +
against_player_X + against_player_Y + against_player_Z,
family = poisson(link = log)
)
数据 看起来像这样:
> data
# A tibble: 10 x 5
shift_id shift_duration goal_for for_players against_players
<dbl> <dbl> <dbl> <list> <list>
1 1.00 12.0 1.00 <chr [2]> <chr [2]>
2 2.00 7.00 1.00 <chr [2]> <chr [2]>
3 3.00 30.0 0 <chr [2]> <chr [2]>
4 4.00 11.0 0 <chr [2]> <chr [2]>
5 5.00 14.0 1.00 <chr [2]> <chr [2]>
6 6.00 16.0 1.00 <chr [2]> <chr [2]>
7 7.00 19.0 0 <chr [2]> <chr [2]>
8 8.00 32.0 0 <chr [2]> <chr [2]>
9 9.00 11.0 0 <chr [2]> <chr [2]>
10 10.0 12.0 0 <chr [2]> <chr [2]>
模型数据 看起来像这样:
> model_data
# A tibble: 10 x 9
shift_id shift_duration goal_for for_player_A for_player_B for_player_C against_player_X against_player_Y against_player_Z
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1.00 12.0 1.00 1.00 1.00 0 1.00 0 1.00
2 2.00 7.00 1.00 1.00 0 1.00 0 1.00 1.00
3 3.00 30.0 0 0 1.00 1.00 1.00 1.00 0
4 4.00 11.0 0 1.00 0 1.00 1.00 1.00 0
5 5.00 14.0 1.00 0 1.00 1.00 1.00 0 1.00
6 6.00 16.0 1.00 1.00 1.00 0 0 1.00 1.00
7 7.00 19.0 0 0 1.00 1.00 1.00 1.00 0
8 8.00 32.0 0 1.00 1.00 0 0 1.00 1.00
9 9.00 11.0 0 0 1.00 1.00 1.00 1.00 0
10 10.0 12.0 0 1.00 1.00 0 0 1.00 1.00
模型的结果:
Family: poisson
Link function: log
Formula:
goal_for ~ offset(log(shift_duration)) + for_player_A + for_player_B +
for_player_C + against_player_X + against_player_Y + against_player_Z
Parametric coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -22.0296 4317.9341 -0.005 0.996
for_player_A 0.0000 0.0000 NA NA
for_player_B -2.3026 2.0000 -1.151 0.250
for_player_C -0.1542 1.4142 -0.109 0.913
against_player_X 1.6094 1.4142 1.138 0.255
against_player_Y 0.0000 0.0000 NA NA
against_player_Z 20.2378 4317.9339 0.005 0.996
Rank: 5/7
R-sq.(adj) = 0.353 Deviance explained = 73.6%
UBRE = 0.26435 Scale est. = 1 n = 10
您可以使用 tidyr
...
中的函数将 data
数据框转换为 model_data
数据框
library(dplyr)
library(tidyr)
model_data <-
data %>%
unnest(for_players, .drop = F) %>%
spread(for_players, for_players, sep = '_') %>%
unnest(against_players, .drop = F) %>%
spread(against_players, against_players, sep = '_') %>%
mutate_at(vars(-(1:3)), funs(as.numeric(!is.na(.))))
我发现 NHL shift data 并想评估一个模型,在该模型中,进球数遵循泊松分布,具体取决于两队谁在场上。
我的观点是,我们已经很清楚谁能得分(进球和助攻),但也许有人真的很擅长帮助他的球队得分而无需得分 sheet(也许是产生失误?)或者只是非常擅长阻止对方得分。
我可以创建如下所示 "data" 的数据集。每支球队通常有 5 名球员在冰上,但我只放了 2 名球员以使示例易于理解。
基本上,每个轮班我都有一行,我知道轮班的结果 (goal_for),shift_duration 并且我有一个为球队效力的球员的 ID 列表(for_players) 和对方 (against_players)。
我想做的 是获取 "data" 数据集并创建 "model_data",其中包含一个虚拟变量,指示一名球员是否在冰上对于给定的班次。然后我会为我的泊松模型创建一个公式,其中将包含所有的假人并将其传递给模型。我也可以放弃一个虚拟人,一个虚拟人反对,但我也可以让 mgcv:gam 为我做。
我怀疑这会涉及一些!!和 quos(),但我不知道该怎么做。
data <- tibble(
shift_id = c(1, 2, 3, 4, 5, 6, 7, 8,9,10),
shift_duration = c(12, 7, 30, 11, 14, 16, 19, 32,11,12),
goal_for = c(1, 1, 0, 0, 1, 1, 0, 0,0,0),
for_players = list(
c("A", "B"),
c("A", "C"),
c("B", "C"),
c("A", "C"),
c("B", "C"),
c("A", "B"),
c("B", "C"),
c("A", "B"),
c("B", "C"),
c("A", "B")
),
against_players = list(
c("X", "Z"),
c("Y", "Z"),
c("X", "Y"),
c("X", "Y"),
c("X", "Z"),
c("Y", "Z"),
c("X", "Y"),
c("Y", "Z"),
c("X", "Y"),
c("Y", "Z")
)
)
(black magic goes here)
model_data <- tibble(
shift_id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
shift_duration = c(12, 7, 30, 11, 14, 16, 19, 32, 11, 12),
goal_for = c(1, 1, 0, 0, 1, 1, 0, 0, 0, 0),
for_player_A = c(1, 1, 0, 1, 0, 1, 0, 1, 0, 1),
for_player_B = c(1, 0, 1, 0, 1, 1, 1, 1, 1, 1),
for_player_C = c(0, 1, 1, 1, 1, 0, 1, 0, 1, 0),
against_player_X = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 0),
against_player_Y = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1),
against_player_Z = c(1, 1, 0, 0, 1, 1, 0, 1, 0, 1)
)
mod.gam <- mgcv::gam(
data = model_data,
formula = goal_for ~ offset(log(shift_duration)) + for_player_A + for_player_B + for_player_C +
against_player_X + against_player_Y + against_player_Z,
family = poisson(link = log)
)
数据 看起来像这样:
> data
# A tibble: 10 x 5
shift_id shift_duration goal_for for_players against_players
<dbl> <dbl> <dbl> <list> <list>
1 1.00 12.0 1.00 <chr [2]> <chr [2]>
2 2.00 7.00 1.00 <chr [2]> <chr [2]>
3 3.00 30.0 0 <chr [2]> <chr [2]>
4 4.00 11.0 0 <chr [2]> <chr [2]>
5 5.00 14.0 1.00 <chr [2]> <chr [2]>
6 6.00 16.0 1.00 <chr [2]> <chr [2]>
7 7.00 19.0 0 <chr [2]> <chr [2]>
8 8.00 32.0 0 <chr [2]> <chr [2]>
9 9.00 11.0 0 <chr [2]> <chr [2]>
10 10.0 12.0 0 <chr [2]> <chr [2]>
模型数据 看起来像这样:
> model_data
# A tibble: 10 x 9
shift_id shift_duration goal_for for_player_A for_player_B for_player_C against_player_X against_player_Y against_player_Z
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1.00 12.0 1.00 1.00 1.00 0 1.00 0 1.00
2 2.00 7.00 1.00 1.00 0 1.00 0 1.00 1.00
3 3.00 30.0 0 0 1.00 1.00 1.00 1.00 0
4 4.00 11.0 0 1.00 0 1.00 1.00 1.00 0
5 5.00 14.0 1.00 0 1.00 1.00 1.00 0 1.00
6 6.00 16.0 1.00 1.00 1.00 0 0 1.00 1.00
7 7.00 19.0 0 0 1.00 1.00 1.00 1.00 0
8 8.00 32.0 0 1.00 1.00 0 0 1.00 1.00
9 9.00 11.0 0 0 1.00 1.00 1.00 1.00 0
10 10.0 12.0 0 1.00 1.00 0 0 1.00 1.00
模型的结果:
Family: poisson
Link function: log
Formula:
goal_for ~ offset(log(shift_duration)) + for_player_A + for_player_B +
for_player_C + against_player_X + against_player_Y + against_player_Z
Parametric coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -22.0296 4317.9341 -0.005 0.996
for_player_A 0.0000 0.0000 NA NA
for_player_B -2.3026 2.0000 -1.151 0.250
for_player_C -0.1542 1.4142 -0.109 0.913
against_player_X 1.6094 1.4142 1.138 0.255
against_player_Y 0.0000 0.0000 NA NA
against_player_Z 20.2378 4317.9339 0.005 0.996
Rank: 5/7
R-sq.(adj) = 0.353 Deviance explained = 73.6%
UBRE = 0.26435 Scale est. = 1 n = 10
您可以使用 tidyr
...
data
数据框转换为 model_data
数据框
library(dplyr)
library(tidyr)
model_data <-
data %>%
unnest(for_players, .drop = F) %>%
spread(for_players, for_players, sep = '_') %>%
unnest(against_players, .drop = F) %>%
spread(against_players, against_players, sep = '_') %>%
mutate_at(vars(-(1:3)), funs(as.numeric(!is.na(.))))