根据R中不同行的其他变量值的组合计算不同行的变量值的差异
Calculate difference of variable values from different rows based on combination of other variables' values from different rows in R
我在 data.table
中有 id
和 period
识别的不平衡面板数据。有 8,278 个观察值和 230 个变量。
我想知道我的数据中的公司(由 id
标识)从计划进入市场(plan_entry == "yes"
,包含 NA
s)到他们实际进入市场(enter_market == "yes"
,不包含 NA
)。
因此,我想生成例如time_to_entry == 5
如果一家公司计划在 period == 4
进入市场并在 period == 9
最终进入市场。数据的大致结构如下,并且已经包含了所需的输出变量。请注意,公司可能会在未说明任何先前计划的情况下进入市场。也可能是他们有进入新市场的计划,同期进入新市场。在这两种情况下,我都想要 time_to_entry == 0
。如果一家公司从未进入任何市场,它也应该为 0。
示例数据和预期结果
library(data.table)
desired_output <-
data.table(id = as.factor(c(rep("C001", 3), "C002", rep("C003", 5), rep("C004", 2), rep("C005", 7))),
period = as.factor(c(1, 2, 3, 2, 1, 4, 5, 6, 10, 3, 4, 2, 3, 4, 7, 8, 9, 10)),
plan_entry = as.factor(c(rep(NA, 2), "yes", "no", NA, rep("no", 2), rep("yes", 4), rep(NA, 2), rep("yes", 4), "no")),
enter_market = as.factor(c(rep("no", 3), "yes", rep("no", 5), rep("yes", 2), rep("no", 5), rep("yes", 2))),
time_to_entry = c(rep(0, 10), 1, rep(0, 5), 5, 1))
desired_output
# id period plan_entry enter_market time_to_entry
# 1: C001 1 <NA> no 0
# 2: C001 2 <NA> no 0
# 3: C001 3 yes no 0
# 4: C002 2 no yes 0
# 5: C003 1 <NA> no 0
# 6: C003 4 no no 0
# 7: C003 5 no no 0
# 8: C003 6 yes no 0
# 9: C003 10 yes no 0
#10: C004 3 yes yes 0 ! there might be cases
# where companies enter a market without stating any plans to do so in previous periods
#11: C004 4 yes yes 1
#12: C005 2 <NA> no 0
#13: C005 3 <NA> no 0
#14: C005 4 yes no 0
#15: C005 7 yes no 0
#16: C005 8 yes no 0
#17: C005 9 yes yes 5
#18: C005 10 no yes 1
问题描述
因此,我需要一个命令来查找第一个 period
,其中 plan_entry == "yes"
用于特定的 id
,然后在以下 period
中搜索 enter_market == "yes"
并计算各自 period
之间的差异并将其存储在 time_to_entry
中。然后,它应该从这个 period
开始,寻找下一个 plan_entry == "yes"
特定的 id
(这可能是公司进入市场的同一时期。但是,这种情况应该不被考虑,而只被认为是下一个时期 enter_market == "yes"
.) 等等
有人知道怎么做吗?
不完整的分步方法
下面我尝试了另一种方法,但它没有考虑所有的要求,因为它只考虑公司第一次进入市场。我也很乐意了解 data.table
方法。
- 搜索每个公司的 dt$enter_market == "yes" 的最短期限
library(data.table)
library(dplyr)
# generate almost same dataset but without desired variable time_to_entry
dt <-
data.table(id = as.factor(c(rep("C001", 3), "C002", rep("C003", 5), rep("C004", 2), rep("C005", 7))),
period = as.factor(c(1, 2, 3, 2, 1, 4, 5, 6, 10, 3, 4, 2, 3, 4, 7, 8, 9, 10)),
plan_entry = as.factor(c(rep(NA, 2), "yes", "no", NA, rep("no", 2), rep("yes", 4), rep(NA, 2), rep("yes", 4), "no")),
enter_market = as.factor(c(rep("no", 3), "yes", rep("no", 5), rep("yes", 2), rep("no", 5), rep("yes", 2))))
# generate minimum period by company
dt[, min_period := min(as.numeric(period)), by = id]
# make data.table a data.frame
dt <- as.data.frame(dt)
# use dplyr
dt <-
dt %>%
group_by(id, enter_market) %>% # group data by id and market entry
mutate(min_entry_period = min(as.numeric(period))) # generate minimum period for grouped data
# minimum period for companies where plan_entry == "yes"
dt$entry_period <-
ifelse(
dt$min_period != dt$min_entry_period & dt$plan_entry == "yes",
dt$min_entry_period,
NA)
dt
# A tibble: 18 x 8
# Groups: id, enter_market [6]
# id period plan_entry enter_market time_to_entry min_period min_entry_period entry_period
# <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
# 1 C001 1 NA no 0 1 1 NA
# 2 C001 2 NA no 0 1 1 NA
# 3 C001 3 yes no 0 1 1 NA
# 4 C002 2 no yes 0 2 2 NA
# 5 C003 1 NA no 0 1 1 NA
# 6 C003 4 no no 0 1 1 NA
# 7 C003 5 no no 0 1 1 NA
# 8 C003 6 yes no 0 1 1 NA
# 9 C003 10 yes no 0 1 1 NA
#10 C004 3 yes yes 0 3 3 NA
#11 C004 4 yes yes 1 3 3 NA
#12 C005 2 NA no 0 2 2 NA
#13 C005 3 NA no 0 2 2 NA
#14 C005 4 yes no 0 2 2 NA
#15 C005 7 yes no 0 2 2 NA
#16 C005 8 yes no 0 2 2 NA
#17 C005 9 yes yes 5 2 9 9
#18 C005 10 no yes 0 2 9 NA
- dt$plan_entry ==“是”的所有时期
dt$plan_entry_period <-
ifelse(dt$plan_entry == "yes", dt$period, NA)
- 对于每个 dt$plan_entry == "yes" 的公司计算 entry_period – plan_entry_period
# fill in first entry_period for each observation by company
library(zoo) # for na.locf()
dt <- as.data.table(dt)
dt[, entry_period := na.locf(entry_period, na.rm = FALSE, fromLast = FALSE), by = id]
dt[, entry_period := na.locf(entry_period, na.rm = FALSE, fromLast = TRUE), by = id]
dt$time_to_entry <-
ifelse(
dt$plan_entry == "yes",
dt$entry_period - dt$plan_entry_period,
NA)
# check variable
summary(dt$time_to_entry)
dt
# id period plan_entry enter_market min_period min_entry_period entry_period plan_entry_period time_to_entry
# 1: C001 1 <NA> no 1 1 NA NA NA
# 2: C001 2 <NA> no 1 1 NA NA NA
# 3: C001 3 yes no 1 1 NA 3 NA
# 4: C002 2 no yes 2 2 NA NA 0
# 5: C003 1 <NA> no 1 1 NA NA NA
# 6: C003 4 no no 1 1 NA NA 0
# 7: C003 5 no no 1 1 NA NA 0
# 8: C003 6 yes no 1 1 NA 6 NA
# 9: C003 10 yes no 1 1 NA 10 NA
#10: C004 3 yes yes 3 3 NA 3 NA
#11: C004 4 yes yes 3 3 NA 4 NA
#12: C005 2 <NA> no 2 2 9 NA NA
#13: C005 3 <NA> no 2 2 9 NA NA
#14: C005 4 yes no 2 2 9 4 5
#15: C005 7 yes no 2 2 9 7 2
#16: C005 8 yes no 2 2 9 8 1
#17: C005 9 yes yes 2 9 9 9 0
#18: C005 10 no yes 2 9 9 NA 0
显然,time_to_entry
与 desired_result
数据集相比看起来非常不同。
我希望我能够很好地描述问题。我真的很感激任何建议!提前致谢!
这是一个可以重现您的 desired_result
的解决方案
dt <-
data.table(id = as.factor(c(rep("C001", 3), "C002", rep("C003", 5), rep("C004", 2), rep("C005", 7))),
period = as.numeric(c(1, 2, 3, 2, 1, 4, 5, 6, 10, 3, 4, 2, 3, 4, 7, 8, 9, 10)),
plan_entry = as.factor(c(rep(NA, 2), "yes", "no", NA, rep("no", 2), rep("yes", 4), rep(NA, 2), rep("yes", 4), "no")),
enter_market = as.factor(c(rep("no", 3), "yes", rep("no", 5), rep("yes", 2), rep("no", 5), rep("yes", 2))))
dt[, time_to_entry_with_plan := period - min(period),
by = .(id,plan_entry)]
dt[, time_to_entry_without_plan := period - min(period),
by = .(id,enter_market)]
dt[, time_to_entry:=fcase(enter_market == "yes" & plan_entry == "yes", time_to_entry_with_plan,
enter_market == "yes" & plan_entry == "no", time_to_entry_without_plan,
default = 0)]
结果:
id period plan_entry enter_market time_to_entry_with_plan time_to_entry_without_plan time_to_entry
1: C001 1 <NA> no 0 0 0
2: C001 2 <NA> no 1 1 0
3: C001 3 yes no 0 2 0
4: C002 2 no yes 0 0 0
5: C003 1 <NA> no 0 0 0
6: C003 4 no no 0 3 0
7: C003 5 no no 1 4 0
8: C003 6 yes no 0 5 0
9: C003 10 yes no 4 9 0
10: C004 3 yes yes 0 0 0
11: C004 4 yes yes 1 1 1
12: C005 2 <NA> no 0 0 0
13: C005 3 <NA> no 1 1 0
14: C005 4 yes no 0 2 0
15: C005 7 yes no 3 5 0
16: C005 8 yes no 4 6 0
17: C005 9 yes yes 5 0 5
18: C005 10 no yes 0 1 1
这是一个使用非等值连接的选项:
#find the previous latest enter_market before current row
DT[enter_market=="yes", prev_entry :=
fcoalesce(.SD[.SD, on=.(id, period<period), mult="last", x.period], 0L)
]
#non-equi join to find the first plan_entry before current enter_market but after previous latest enter_market
DT[enter_market=="yes", plan_period :=
DT[plan_entry=="yes"][.SD, on=.(id, period>=prev_entry, period<period), mult="first", x.period]
]
#calculate time_to_entry and set NAs to 0
DT[, time_to_entry := fcoalesce(period - plan_period, 0L)]
DT
数据(不含因素):
DT <-
data.table(id = c(rep("C001", 3), "C002", rep("C003", 5), rep("C004", 2), rep("C005", 7)),
period = as.integer(c(1, 2, 3, 2, 1, 4, 5, 6, 10, 3, 4, 2, 3, 4, 7, 8, 9, 10)),
plan_entry = c(rep(NA, 2), "yes", "no", NA, rep("no", 2), rep("yes", 4), rep(NA, 2), rep("yes", 4), "no"),
enter_market = c(rep("no", 3), "yes", rep("no", 5), rep("yes", 2), rep("no", 5), rep("yes", 2)))
我在 data.table
中有 id
和 period
识别的不平衡面板数据。有 8,278 个观察值和 230 个变量。
我想知道我的数据中的公司(由 id
标识)从计划进入市场(plan_entry == "yes"
,包含 NA
s)到他们实际进入市场(enter_market == "yes"
,不包含 NA
)。
因此,我想生成例如time_to_entry == 5
如果一家公司计划在 period == 4
进入市场并在 period == 9
最终进入市场。数据的大致结构如下,并且已经包含了所需的输出变量。请注意,公司可能会在未说明任何先前计划的情况下进入市场。也可能是他们有进入新市场的计划,同期进入新市场。在这两种情况下,我都想要 time_to_entry == 0
。如果一家公司从未进入任何市场,它也应该为 0。
示例数据和预期结果
library(data.table)
desired_output <-
data.table(id = as.factor(c(rep("C001", 3), "C002", rep("C003", 5), rep("C004", 2), rep("C005", 7))),
period = as.factor(c(1, 2, 3, 2, 1, 4, 5, 6, 10, 3, 4, 2, 3, 4, 7, 8, 9, 10)),
plan_entry = as.factor(c(rep(NA, 2), "yes", "no", NA, rep("no", 2), rep("yes", 4), rep(NA, 2), rep("yes", 4), "no")),
enter_market = as.factor(c(rep("no", 3), "yes", rep("no", 5), rep("yes", 2), rep("no", 5), rep("yes", 2))),
time_to_entry = c(rep(0, 10), 1, rep(0, 5), 5, 1))
desired_output
# id period plan_entry enter_market time_to_entry
# 1: C001 1 <NA> no 0
# 2: C001 2 <NA> no 0
# 3: C001 3 yes no 0
# 4: C002 2 no yes 0
# 5: C003 1 <NA> no 0
# 6: C003 4 no no 0
# 7: C003 5 no no 0
# 8: C003 6 yes no 0
# 9: C003 10 yes no 0
#10: C004 3 yes yes 0 ! there might be cases
# where companies enter a market without stating any plans to do so in previous periods
#11: C004 4 yes yes 1
#12: C005 2 <NA> no 0
#13: C005 3 <NA> no 0
#14: C005 4 yes no 0
#15: C005 7 yes no 0
#16: C005 8 yes no 0
#17: C005 9 yes yes 5
#18: C005 10 no yes 1
问题描述
因此,我需要一个命令来查找第一个 period
,其中 plan_entry == "yes"
用于特定的 id
,然后在以下 period
中搜索 enter_market == "yes"
并计算各自 period
之间的差异并将其存储在 time_to_entry
中。然后,它应该从这个 period
开始,寻找下一个 plan_entry == "yes"
特定的 id
(这可能是公司进入市场的同一时期。但是,这种情况应该不被考虑,而只被认为是下一个时期 enter_market == "yes"
.) 等等
有人知道怎么做吗?
不完整的分步方法
下面我尝试了另一种方法,但它没有考虑所有的要求,因为它只考虑公司第一次进入市场。我也很乐意了解 data.table
方法。
- 搜索每个公司的 dt$enter_market == "yes" 的最短期限
library(data.table)
library(dplyr)
# generate almost same dataset but without desired variable time_to_entry
dt <-
data.table(id = as.factor(c(rep("C001", 3), "C002", rep("C003", 5), rep("C004", 2), rep("C005", 7))),
period = as.factor(c(1, 2, 3, 2, 1, 4, 5, 6, 10, 3, 4, 2, 3, 4, 7, 8, 9, 10)),
plan_entry = as.factor(c(rep(NA, 2), "yes", "no", NA, rep("no", 2), rep("yes", 4), rep(NA, 2), rep("yes", 4), "no")),
enter_market = as.factor(c(rep("no", 3), "yes", rep("no", 5), rep("yes", 2), rep("no", 5), rep("yes", 2))))
# generate minimum period by company
dt[, min_period := min(as.numeric(period)), by = id]
# make data.table a data.frame
dt <- as.data.frame(dt)
# use dplyr
dt <-
dt %>%
group_by(id, enter_market) %>% # group data by id and market entry
mutate(min_entry_period = min(as.numeric(period))) # generate minimum period for grouped data
# minimum period for companies where plan_entry == "yes"
dt$entry_period <-
ifelse(
dt$min_period != dt$min_entry_period & dt$plan_entry == "yes",
dt$min_entry_period,
NA)
dt
# A tibble: 18 x 8
# Groups: id, enter_market [6]
# id period plan_entry enter_market time_to_entry min_period min_entry_period entry_period
# <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
# 1 C001 1 NA no 0 1 1 NA
# 2 C001 2 NA no 0 1 1 NA
# 3 C001 3 yes no 0 1 1 NA
# 4 C002 2 no yes 0 2 2 NA
# 5 C003 1 NA no 0 1 1 NA
# 6 C003 4 no no 0 1 1 NA
# 7 C003 5 no no 0 1 1 NA
# 8 C003 6 yes no 0 1 1 NA
# 9 C003 10 yes no 0 1 1 NA
#10 C004 3 yes yes 0 3 3 NA
#11 C004 4 yes yes 1 3 3 NA
#12 C005 2 NA no 0 2 2 NA
#13 C005 3 NA no 0 2 2 NA
#14 C005 4 yes no 0 2 2 NA
#15 C005 7 yes no 0 2 2 NA
#16 C005 8 yes no 0 2 2 NA
#17 C005 9 yes yes 5 2 9 9
#18 C005 10 no yes 0 2 9 NA
- dt$plan_entry ==“是”的所有时期
dt$plan_entry_period <-
ifelse(dt$plan_entry == "yes", dt$period, NA)
- 对于每个 dt$plan_entry == "yes" 的公司计算 entry_period – plan_entry_period
# fill in first entry_period for each observation by company
library(zoo) # for na.locf()
dt <- as.data.table(dt)
dt[, entry_period := na.locf(entry_period, na.rm = FALSE, fromLast = FALSE), by = id]
dt[, entry_period := na.locf(entry_period, na.rm = FALSE, fromLast = TRUE), by = id]
dt$time_to_entry <-
ifelse(
dt$plan_entry == "yes",
dt$entry_period - dt$plan_entry_period,
NA)
# check variable
summary(dt$time_to_entry)
dt
# id period plan_entry enter_market min_period min_entry_period entry_period plan_entry_period time_to_entry
# 1: C001 1 <NA> no 1 1 NA NA NA
# 2: C001 2 <NA> no 1 1 NA NA NA
# 3: C001 3 yes no 1 1 NA 3 NA
# 4: C002 2 no yes 2 2 NA NA 0
# 5: C003 1 <NA> no 1 1 NA NA NA
# 6: C003 4 no no 1 1 NA NA 0
# 7: C003 5 no no 1 1 NA NA 0
# 8: C003 6 yes no 1 1 NA 6 NA
# 9: C003 10 yes no 1 1 NA 10 NA
#10: C004 3 yes yes 3 3 NA 3 NA
#11: C004 4 yes yes 3 3 NA 4 NA
#12: C005 2 <NA> no 2 2 9 NA NA
#13: C005 3 <NA> no 2 2 9 NA NA
#14: C005 4 yes no 2 2 9 4 5
#15: C005 7 yes no 2 2 9 7 2
#16: C005 8 yes no 2 2 9 8 1
#17: C005 9 yes yes 2 9 9 9 0
#18: C005 10 no yes 2 9 9 NA 0
显然,time_to_entry
与 desired_result
数据集相比看起来非常不同。
我希望我能够很好地描述问题。我真的很感激任何建议!提前致谢!
这是一个可以重现您的 desired_result
的解决方案dt <-
data.table(id = as.factor(c(rep("C001", 3), "C002", rep("C003", 5), rep("C004", 2), rep("C005", 7))),
period = as.numeric(c(1, 2, 3, 2, 1, 4, 5, 6, 10, 3, 4, 2, 3, 4, 7, 8, 9, 10)),
plan_entry = as.factor(c(rep(NA, 2), "yes", "no", NA, rep("no", 2), rep("yes", 4), rep(NA, 2), rep("yes", 4), "no")),
enter_market = as.factor(c(rep("no", 3), "yes", rep("no", 5), rep("yes", 2), rep("no", 5), rep("yes", 2))))
dt[, time_to_entry_with_plan := period - min(period),
by = .(id,plan_entry)]
dt[, time_to_entry_without_plan := period - min(period),
by = .(id,enter_market)]
dt[, time_to_entry:=fcase(enter_market == "yes" & plan_entry == "yes", time_to_entry_with_plan,
enter_market == "yes" & plan_entry == "no", time_to_entry_without_plan,
default = 0)]
结果:
id period plan_entry enter_market time_to_entry_with_plan time_to_entry_without_plan time_to_entry
1: C001 1 <NA> no 0 0 0
2: C001 2 <NA> no 1 1 0
3: C001 3 yes no 0 2 0
4: C002 2 no yes 0 0 0
5: C003 1 <NA> no 0 0 0
6: C003 4 no no 0 3 0
7: C003 5 no no 1 4 0
8: C003 6 yes no 0 5 0
9: C003 10 yes no 4 9 0
10: C004 3 yes yes 0 0 0
11: C004 4 yes yes 1 1 1
12: C005 2 <NA> no 0 0 0
13: C005 3 <NA> no 1 1 0
14: C005 4 yes no 0 2 0
15: C005 7 yes no 3 5 0
16: C005 8 yes no 4 6 0
17: C005 9 yes yes 5 0 5
18: C005 10 no yes 0 1 1
这是一个使用非等值连接的选项:
#find the previous latest enter_market before current row
DT[enter_market=="yes", prev_entry :=
fcoalesce(.SD[.SD, on=.(id, period<period), mult="last", x.period], 0L)
]
#non-equi join to find the first plan_entry before current enter_market but after previous latest enter_market
DT[enter_market=="yes", plan_period :=
DT[plan_entry=="yes"][.SD, on=.(id, period>=prev_entry, period<period), mult="first", x.period]
]
#calculate time_to_entry and set NAs to 0
DT[, time_to_entry := fcoalesce(period - plan_period, 0L)]
DT
数据(不含因素):
DT <-
data.table(id = c(rep("C001", 3), "C002", rep("C003", 5), rep("C004", 2), rep("C005", 7)),
period = as.integer(c(1, 2, 3, 2, 1, 4, 5, 6, 10, 3, 4, 2, 3, 4, 7, 8, 9, 10)),
plan_entry = c(rep(NA, 2), "yes", "no", NA, rep("no", 2), rep("yes", 4), rep(NA, 2), rep("yes", 4), "no"),
enter_market = c(rep("no", 3), "yes", rep("no", 5), rep("yes", 2), rep("no", 5), rep("yes", 2)))