如果值为 NA dplyr,则输入变量
imputing variable if value is NA dplyr
我正在尝试估算一个变量,dt_drug_end
如果 dt_drug_end
缺失(NA)。
逻辑是:
1.If 它是 line_number
组的一部分,然后是 line_number
组,其中 dt_drug_end = max(dt_drug_end)
。
2. 否则,如果开始日期可用,则将结束日期设为 dt_drug_start
。
当 max(dt_drug_end) 为 NA 时,我的代码没有将结束日期作为开始日期。即使我使用逻辑 case_when( is.na(dt_drug_end) & is.na(max_dt_drug_end) ~ dt_drug_start) dt_end_date 是 NA 而不是匹配 dt_drug_end
patient_id line_number_raw drug_name dt_drug_start dt_drug_end reason_discount trt_diag flag_after_diag_adv
<chr> <chr> <chr> <date> <date> <chr> <chr> <dbl>
1 0Q97V6R3GI 1 Carboplatin 2013-04-12 2013-04-18 Yes, Comment field C34.3 1
2 0Q97V6R3GI 1 Cisplatin 2013-04-12 2013-06-16 Completed Treatment C34.3 1
3 0Q97V6R3GI 3 Denosumab 2016-12-06 NA No C34.3 1
4 0Q97V6R3GI 4 Osimertinib 2018-06-14 NA No C34.3 1
5 0Q97V6R3GI 1 Pemetrexed Disodium 2013-04-12 2013-06-16 Completed Treatment C34.3 1
6 0Q97V6R3GI 3 Erlotinib 2015-06-02 2018-05-30 Yes, due to progression C34.9 1
数据
df <- structure(
list(
patient_id = c("0Q97V6R3GI","0Q97V6R3GI","0Q97V6R3GI","0Q97V6R3GI","0Q97V6R3GI","0Q97V6R3GI"),
line_number_raw = c("1","1", "3", "4", "1", "3"),
drug_name = c(
"Carboplatin",
"Cisplatin",
"Denosumab",
"Osimertinib",
"Pemetrexed Disodium",
"Erlotinib"
),
dt_drug_start = structure(c(15807, 15807, 17141, 17696, 15807,16588), class = "Date"),
dt_drug_end = structure(c(15813, 15872,NA, NA, 15872, 17681), class = "Date"),
reason_discount = c(
"Yes, Comment field",
"Completed Treatment",
"No",
"No",
"Completed Treatment",
"Yes, due to progression"
),
trt_diag = c("C34.3", "C34.3", "C34.3", "C34.3", "C34.3", "C34.9"),
flag_after_diag_adv = c(1, 1, 1, 1, 1, 1)
),
row.names = c(NA,-6L),
class = c("tbl_df", "tbl", "data.frame"))
我的代码:
tb_episode %>%
group_by(patient_id, line_number_raw) %>%
mutate(max_dt_drug_end = case_when(
line_number_raw == "unknown" ~ NA_Date_,
TRUE ~ max(dt_drug_end, na.rm = T)
)) %>%
ungroup() %>%
mutate(dt_drug_end = case_when(
is.na(dt_drug_end) & !is.na(max_dt_drug_end) ~ max_dt_drug_end,
is.na(max_dt_drug_end) ~ dt_drug_start,
TRUE ~ dt_drug_end
))
我认为您的代码在技术上没问题,但问题是 运行 max(dt_drug_end, na.rm = T)
returns 是一个无穷大的值。这会触发您的 is.na
检查。
df %>%
# Create Helper column with dates based on your rules
group_by(patient_id, line_number_raw) %>%
mutate(impute_date = max(dt_drug_end, na.rm = T)) %>%
ungroup() %>%
mutate(impute_date = if_else(is.infinite(impute_date), lubridate::NA_Date_, impute_date)) %>%
mutate(impute_date = if_else(is.na(impute_date), dt_drug_start, impute_date) ) %>%
# impute
mutate(dt_drug_end =
if_else(is.na(dt_drug_end),
impute_date,
dt_drug_end)
)
使用 dplyr::rowwise()
的替代方法。
library(dplyr)
df %>%
group_by(line_number_raw) %>%
mutate(several = n() > 1,
gMax = max(dt_drug_end, na.rm = T)) %>%
rowwise() %>%
mutate(dt_drug_end_X = case_when(is.na(dt_drug_end) & several == T ~ gMax,
is.na(dt_drug_end) & several == F ~ dt_drug_start,
T ~ dt_drug_end)) %>%
select(starts_with(c('dt', 'line')))
# # A tibble: 6 x 4
# # Rowwise:
# dt_drug_start dt_drug_end dt_drug_end_X line_number_raw
# <date> <date> <date> <chr>
# 1 2013-04-12 2013-04-18 2013-04-18 1
# 2 2013-04-12 2013-06-16 2013-06-16 1
# 3 2016-12-06 NA 2018-05-30 3
# 4 2018-06-14 NA 2018-06-14 4
# 5 2013-04-12 2013-06-16 2013-06-16 1
# 6 2015-06-02 2018-05-30 2018-05-30 3
我正在尝试估算一个变量,dt_drug_end
如果 dt_drug_end
缺失(NA)。
逻辑是:
1.If 它是 line_number
组的一部分,然后是 line_number
组,其中 dt_drug_end = max(dt_drug_end)
。
2. 否则,如果开始日期可用,则将结束日期设为 dt_drug_start
。
当 max(dt_drug_end) 为 NA 时,我的代码没有将结束日期作为开始日期。即使我使用逻辑 case_when( is.na(dt_drug_end) & is.na(max_dt_drug_end) ~ dt_drug_start) dt_end_date 是 NA 而不是匹配 dt_drug_end
patient_id line_number_raw drug_name dt_drug_start dt_drug_end reason_discount trt_diag flag_after_diag_adv
<chr> <chr> <chr> <date> <date> <chr> <chr> <dbl>
1 0Q97V6R3GI 1 Carboplatin 2013-04-12 2013-04-18 Yes, Comment field C34.3 1
2 0Q97V6R3GI 1 Cisplatin 2013-04-12 2013-06-16 Completed Treatment C34.3 1
3 0Q97V6R3GI 3 Denosumab 2016-12-06 NA No C34.3 1
4 0Q97V6R3GI 4 Osimertinib 2018-06-14 NA No C34.3 1
5 0Q97V6R3GI 1 Pemetrexed Disodium 2013-04-12 2013-06-16 Completed Treatment C34.3 1
6 0Q97V6R3GI 3 Erlotinib 2015-06-02 2018-05-30 Yes, due to progression C34.9 1
数据
df <- structure(
list(
patient_id = c("0Q97V6R3GI","0Q97V6R3GI","0Q97V6R3GI","0Q97V6R3GI","0Q97V6R3GI","0Q97V6R3GI"),
line_number_raw = c("1","1", "3", "4", "1", "3"),
drug_name = c(
"Carboplatin",
"Cisplatin",
"Denosumab",
"Osimertinib",
"Pemetrexed Disodium",
"Erlotinib"
),
dt_drug_start = structure(c(15807, 15807, 17141, 17696, 15807,16588), class = "Date"),
dt_drug_end = structure(c(15813, 15872,NA, NA, 15872, 17681), class = "Date"),
reason_discount = c(
"Yes, Comment field",
"Completed Treatment",
"No",
"No",
"Completed Treatment",
"Yes, due to progression"
),
trt_diag = c("C34.3", "C34.3", "C34.3", "C34.3", "C34.3", "C34.9"),
flag_after_diag_adv = c(1, 1, 1, 1, 1, 1)
),
row.names = c(NA,-6L),
class = c("tbl_df", "tbl", "data.frame"))
我的代码:
tb_episode %>%
group_by(patient_id, line_number_raw) %>%
mutate(max_dt_drug_end = case_when(
line_number_raw == "unknown" ~ NA_Date_,
TRUE ~ max(dt_drug_end, na.rm = T)
)) %>%
ungroup() %>%
mutate(dt_drug_end = case_when(
is.na(dt_drug_end) & !is.na(max_dt_drug_end) ~ max_dt_drug_end,
is.na(max_dt_drug_end) ~ dt_drug_start,
TRUE ~ dt_drug_end
))
我认为您的代码在技术上没问题,但问题是 运行 max(dt_drug_end, na.rm = T)
returns 是一个无穷大的值。这会触发您的 is.na
检查。
df %>%
# Create Helper column with dates based on your rules
group_by(patient_id, line_number_raw) %>%
mutate(impute_date = max(dt_drug_end, na.rm = T)) %>%
ungroup() %>%
mutate(impute_date = if_else(is.infinite(impute_date), lubridate::NA_Date_, impute_date)) %>%
mutate(impute_date = if_else(is.na(impute_date), dt_drug_start, impute_date) ) %>%
# impute
mutate(dt_drug_end =
if_else(is.na(dt_drug_end),
impute_date,
dt_drug_end)
)
使用 dplyr::rowwise()
的替代方法。
library(dplyr)
df %>%
group_by(line_number_raw) %>%
mutate(several = n() > 1,
gMax = max(dt_drug_end, na.rm = T)) %>%
rowwise() %>%
mutate(dt_drug_end_X = case_when(is.na(dt_drug_end) & several == T ~ gMax,
is.na(dt_drug_end) & several == F ~ dt_drug_start,
T ~ dt_drug_end)) %>%
select(starts_with(c('dt', 'line')))
# # A tibble: 6 x 4
# # Rowwise:
# dt_drug_start dt_drug_end dt_drug_end_X line_number_raw
# <date> <date> <date> <chr>
# 1 2013-04-12 2013-04-18 2013-04-18 1
# 2 2013-04-12 2013-06-16 2013-06-16 1
# 3 2016-12-06 NA 2018-05-30 3
# 4 2018-06-14 NA 2018-06-14 4
# 5 2013-04-12 2013-06-16 2013-06-16 1
# 6 2015-06-02 2018-05-30 2018-05-30 3