如何基于现有数据集创建新数据集
How to create new dataset based on existing dataset
我有以下数据集:
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
我想在 R 中创建以下数据集:
Individual sequence_special_drug special_drug prior_special_drug prior_traditional_drug during_special_drug
A 1 C none A, B none
A 2 D C none B, Z, Z
A 3 E C, D A, B, B, Z, Z, A none
有什么快速的方法吗?我有很多个人,但这些都是可能的情况。 A special_drug
由序号标识; 'NA' 是 traditional_drug
.
prior_special_drug
将包含之前识别的任何 special_drug,因此对于第一个 special_drug C 没有之前的 special_drug,对于第二个 special_drug D ,有一个前special_drug是C,第三个special_drug有两个前special_drugC和D.
prior_traditional_drug
相同,但将包含在 sequence_special_drug 中标识为 NA 的所有内容。所以对于第一个special_drug(C),两个prior_traditional_drugs是A和B。对于第三个special_drug,prior_traditional_drugs是A,B,B,Z,Z , A.
during_special_drug
将包含在 special_drug 管理期间引用的每个 traditional_drug。这可以通过重复 sequence_special_drug(例如 2 -> NA NA -> 2 -> NA -> 2)在数据集中识别,因此 B、Z、Z.
编辑 - 对于 2 个人:
dat <- read.table(
text = "
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
B 1 D
B NA B
B NA Z
B 1 D
B NA Z
B 1 D
B NA A
B 2 E",
header = TRUE)
我预计:
- 错误的“none”第 3 行在 prior_traditional_drug -
Individual sequence_special_drug special_drug prior_special_drug prior_traditional_drug during_special_drug
A 1 C none A, B none
A 2 D C none B, Z, Z
A 3 E C, D A, B, B, Z, Z, A none
B 1 D none none B, Z, Z
B 2 E D B, Z, Z, A none
- prior_traditional_drug下的 RIGHT "A, B" 第 3 行 -
Individual sequence_special_drug special_drug prior_special_drug prior_traditional_drug during_special_drug
A 1 C none A, B none
A 2 D C A, B B, Z, Z
A 3 E C, D A, B, B, Z, Z, A none
B 1 D none none B, Z, Z
B 2 E D B, Z, Z, A none
但是我得到了:
我自己的数据集的错误信息
> special_drug <- example_data %>%
+ nest_by(individual) %>%
+ mutate(
+ spec_drug = list(get_all_drugs(data))
+ ) %>%
+ unnest(spec_drug) %>%
+ select(-data) %>%
+ ungroup()
`summarise()` has grouped output by 'sequence_special_drug'. You can override using the `.groups` argument.
Error: Problem with `mutate()` input `spec_drug`.
x Problem with `mutate()` input `flag3`.
x `false` must be a list, not a character vector.
ℹ Input `flag3` is `if_else(flag1 == 1, list(character(0)), flag3)`.
ℹ Input `spec_drug` is `list(get_all_drugs(data))`.
Run `rlang::last_error()` to see where the error occurred.
> rlang::last_error()
Error in is_rlang_error(parent) :
argument "parent" is missing, with no default
我自己的数据集比较像这样:
example_data <- read.table(
text = "
individual sequence_special_drug all_drugs
77779 NA Name1
77779 1 Name2
77779 1 Name2
77779 1 Name2
77779 2 Name3
4444 NA Name1
4444 1 Name4
4444 2 Name3
4444 3 Name7",
header = TRUE)
但下面的数据集也生成相同的错误消息:
example_data <- read.table(
text = "
individual sequence_special_drug all_drugs
A NA A
A 1 C
A 2 D
A 2 D
A 2 D
A 3 E
B NA B
B 1 D
B 2 E
B 3 F",
header = TRUE)
这是我针对这个特定问题的不优雅的解决方案,但给你一个提示可能有用。
library(data.table)
dt <- fread(
"
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
"
)
df <- unique(na.omit(dt))
setnames(df,"all_drugs","special_drug")
df
#> individual sequence_special_drug special_drug
#> 1: A 1 C
#> 2: A 2 D
#> 3: A 3 E
## add row ideantifier in dt
dt[,rd:=rowid(individual)]
## create prior_special_drug
df[,prior_special_drug:=shift(special_drug)]
df[3,4] <- df[special_drug < "E", paste(special_drug,collapse = ", ")]
df
#> individual sequence_special_drug special_drug prior_special_drug
#> 1: A 1 C <NA>
#> 2: A 2 D C
#> 3: A 3 E C, D
special.drug = df$special_drug
special.drug
#> [1] "C" "D" "E"
posi <- c(
dt[,first(.I[all_drugs==special.drug[1]])], #first position of C
dt[,first(.I[all_drugs==special.drug[2]])], #first position of D
dt[,last(.I[all_drugs==special.drug[2]])], #last position of D
dt[,last(.I[all_drugs==special.drug[3]])] #last position of E
)
posi
#> [1] 3 4 9 11
# dt[is.na(sequence_special_drug) & rd < posi[1], all_drugs]
# dt[is.na(sequence_special_drug) & rd %between% posi[2:3], all_drugs]
# dt[is.na(sequence_special_drug) & rd < posi[4], all_drugs]
drug <- c(
paste(dt[is.na(sequence_special_drug) & rd < posi[1], all_drugs],collapse = ", "),
paste(dt[is.na(sequence_special_drug) & rd %between% posi[2:3], all_drugs],collapse = ", "),
paste(dt[is.na(sequence_special_drug) & rd < posi[4], all_drugs],collapse = ", ")
)
drug
#> [1] "A, B" "B, Z, Z" "A, B, B, Z, Z, A"
## create prior_traditional_drug and during_special_drug
df[,prior_traditional_drug := drug]
df[,prior_traditional_drug := ifelse(special_drug == "D",NA,prior_traditional_drug)]
df[,during_special_drug := drug]
df[,during_special_drug := ifelse(special_drug %in% c("C","E"),NA,during_special_drug)]
## replace NA with "none" in df
for (jj in 1:ncol(df))
set(df,
i = which(is.na(df[[jj]])),
j = jj,
v = "none"
)
df
#> individual sequence_special_drug special_drug prior_special_drug
#> 1: A 1 C none
#> 2: A 2 D C
#> 3: A 3 E C, D
#> prior_traditional_drug during_special_drug
#> 1: A, B none
#> 2: none B, Z, Z
#> 3: A, B, B, Z, Z, A none
由 reprex package (v2.0.0)
于 2021-06-06 创建
这是我使用 {tidyverse}
的建议。我写了一个函数来获取每一列,然后将它们放在一起 get_all_drugs()
。然后,我运行函数通过单独的嵌套数据,如下例所示。
library(tidyverse)
example_data <- read.table(
text = "
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
B 1 D
B NA B
B NA Z
B 1 D
B NA Z
B 1 D
B NA A
B 2 E",
header = TRUE)
get_special_drugs <- function(.data) {
.data %>%
filter(sequence_special_drug != 0) %>%
distinct() %>%
select(sequence_special_drug, special_drug = all_drugs) %>%
mutate(prior_special_drug = as.list(accumulate(special_drug, c))) %>%
rowwise() %>%
mutate(prior_special_drug = list(
prior_special_drug[prior_special_drug != special_drug]
)) %>%
ungroup()
}
fix_drug_sequence <- function(.data) {
.data %>%
mutate(
seq_drug = replace_na(sequence_special_drug, 0),
flag = if_else(seq_drug == 0 & seq_drug != lead(seq_drug),
lead(seq_drug),
seq_drug),
flag = if_else(flag == 0 & flag != lead(flag),
lead(flag),
flag)
) %>%
select(-sequence_special_drug) %>%
rename(sequence_special_drug = flag)
}
get_prior_traditional_drug <- function(...) {
fix_drug_sequence(...) %>%
group_by(sequence_special_drug) %>%
mutate(
flag1 = max(seq_drug == sequence_special_drug & row_number() == 1),
) %>%
group_by(sequence_special_drug, flag1) %>%
summarise(
flag2 = list(all_drugs[seq_drug == 0])
) %>%
ungroup() %>%
mutate(
flag3 = as.list(accumulate(flag2, append)),
flag3 = if_else(flag1 == 1, lag(flag3), flag3)
) %>%
select(sequence_special_drug, prior_traditional_drug = flag3)
}
get_during_special_drugs <- function(...) {
fix_drug_sequence(...) %>%
group_by(sequence_special_drug) %>%
mutate(
flag = cumsum(seq_drug == sequence_special_drug)
) %>%
filter(flag > 0) %>%
summarise(
during_special_drug = list(all_drugs[seq_drug == 0])
)
}
get_all_drugs <- function(.data) {
spec_drug <- get_special_drugs(.data)
prior_traditional <- get_prior_traditional_drug(.data)
during_spec <- get_during_special_drugs(.data)
list(spec_drug, prior_traditional, during_spec) %>%
reduce(left_join, by = "sequence_special_drug")
}
special_drug <- example_data %>%
nest_by(individual) %>%
mutate(
spec_drug = list(get_all_drugs(data))
) %>%
unnest(spec_drug) %>%
select(-data) %>%
ungroup()
special_drug
我有以下数据集:
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
我想在 R 中创建以下数据集:
Individual sequence_special_drug special_drug prior_special_drug prior_traditional_drug during_special_drug
A 1 C none A, B none
A 2 D C none B, Z, Z
A 3 E C, D A, B, B, Z, Z, A none
有什么快速的方法吗?我有很多个人,但这些都是可能的情况。 A special_drug
由序号标识; 'NA' 是 traditional_drug
.
prior_special_drug
将包含之前识别的任何 special_drug,因此对于第一个 special_drug C 没有之前的 special_drug,对于第二个 special_drug D ,有一个前special_drug是C,第三个special_drug有两个前special_drugC和D.
prior_traditional_drug
相同,但将包含在 sequence_special_drug 中标识为 NA 的所有内容。所以对于第一个special_drug(C),两个prior_traditional_drugs是A和B。对于第三个special_drug,prior_traditional_drugs是A,B,B,Z,Z , A.
during_special_drug
将包含在 special_drug 管理期间引用的每个 traditional_drug。这可以通过重复 sequence_special_drug(例如 2 -> NA NA -> 2 -> NA -> 2)在数据集中识别,因此 B、Z、Z.
编辑 - 对于 2 个人:
dat <- read.table(
text = "
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
B 1 D
B NA B
B NA Z
B 1 D
B NA Z
B 1 D
B NA A
B 2 E",
header = TRUE)
我预计:
- 错误的“none”第 3 行在 prior_traditional_drug -
Individual sequence_special_drug special_drug prior_special_drug prior_traditional_drug during_special_drug
A 1 C none A, B none
A 2 D C none B, Z, Z
A 3 E C, D A, B, B, Z, Z, A none
B 1 D none none B, Z, Z
B 2 E D B, Z, Z, A none
- prior_traditional_drug下的 RIGHT "A, B" 第 3 行 -
Individual sequence_special_drug special_drug prior_special_drug prior_traditional_drug during_special_drug
A 1 C none A, B none
A 2 D C A, B B, Z, Z
A 3 E C, D A, B, B, Z, Z, A none
B 1 D none none B, Z, Z
B 2 E D B, Z, Z, A none
但是我得到了:
我自己的数据集的错误信息
> special_drug <- example_data %>%
+ nest_by(individual) %>%
+ mutate(
+ spec_drug = list(get_all_drugs(data))
+ ) %>%
+ unnest(spec_drug) %>%
+ select(-data) %>%
+ ungroup()
`summarise()` has grouped output by 'sequence_special_drug'. You can override using the `.groups` argument.
Error: Problem with `mutate()` input `spec_drug`.
x Problem with `mutate()` input `flag3`.
x `false` must be a list, not a character vector.
ℹ Input `flag3` is `if_else(flag1 == 1, list(character(0)), flag3)`.
ℹ Input `spec_drug` is `list(get_all_drugs(data))`.
Run `rlang::last_error()` to see where the error occurred.
> rlang::last_error()
Error in is_rlang_error(parent) :
argument "parent" is missing, with no default
我自己的数据集比较像这样:
example_data <- read.table(
text = "
individual sequence_special_drug all_drugs
77779 NA Name1
77779 1 Name2
77779 1 Name2
77779 1 Name2
77779 2 Name3
4444 NA Name1
4444 1 Name4
4444 2 Name3
4444 3 Name7",
header = TRUE)
但下面的数据集也生成相同的错误消息:
example_data <- read.table(
text = "
individual sequence_special_drug all_drugs
A NA A
A 1 C
A 2 D
A 2 D
A 2 D
A 3 E
B NA B
B 1 D
B 2 E
B 3 F",
header = TRUE)
这是我针对这个特定问题的不优雅的解决方案,但给你一个提示可能有用。
library(data.table)
dt <- fread(
"
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
"
)
df <- unique(na.omit(dt))
setnames(df,"all_drugs","special_drug")
df
#> individual sequence_special_drug special_drug
#> 1: A 1 C
#> 2: A 2 D
#> 3: A 3 E
## add row ideantifier in dt
dt[,rd:=rowid(individual)]
## create prior_special_drug
df[,prior_special_drug:=shift(special_drug)]
df[3,4] <- df[special_drug < "E", paste(special_drug,collapse = ", ")]
df
#> individual sequence_special_drug special_drug prior_special_drug
#> 1: A 1 C <NA>
#> 2: A 2 D C
#> 3: A 3 E C, D
special.drug = df$special_drug
special.drug
#> [1] "C" "D" "E"
posi <- c(
dt[,first(.I[all_drugs==special.drug[1]])], #first position of C
dt[,first(.I[all_drugs==special.drug[2]])], #first position of D
dt[,last(.I[all_drugs==special.drug[2]])], #last position of D
dt[,last(.I[all_drugs==special.drug[3]])] #last position of E
)
posi
#> [1] 3 4 9 11
# dt[is.na(sequence_special_drug) & rd < posi[1], all_drugs]
# dt[is.na(sequence_special_drug) & rd %between% posi[2:3], all_drugs]
# dt[is.na(sequence_special_drug) & rd < posi[4], all_drugs]
drug <- c(
paste(dt[is.na(sequence_special_drug) & rd < posi[1], all_drugs],collapse = ", "),
paste(dt[is.na(sequence_special_drug) & rd %between% posi[2:3], all_drugs],collapse = ", "),
paste(dt[is.na(sequence_special_drug) & rd < posi[4], all_drugs],collapse = ", ")
)
drug
#> [1] "A, B" "B, Z, Z" "A, B, B, Z, Z, A"
## create prior_traditional_drug and during_special_drug
df[,prior_traditional_drug := drug]
df[,prior_traditional_drug := ifelse(special_drug == "D",NA,prior_traditional_drug)]
df[,during_special_drug := drug]
df[,during_special_drug := ifelse(special_drug %in% c("C","E"),NA,during_special_drug)]
## replace NA with "none" in df
for (jj in 1:ncol(df))
set(df,
i = which(is.na(df[[jj]])),
j = jj,
v = "none"
)
df
#> individual sequence_special_drug special_drug prior_special_drug
#> 1: A 1 C none
#> 2: A 2 D C
#> 3: A 3 E C, D
#> prior_traditional_drug during_special_drug
#> 1: A, B none
#> 2: none B, Z, Z
#> 3: A, B, B, Z, Z, A none
由 reprex package (v2.0.0)
于 2021-06-06 创建这是我使用 {tidyverse}
的建议。我写了一个函数来获取每一列,然后将它们放在一起 get_all_drugs()
。然后,我运行函数通过单独的嵌套数据,如下例所示。
library(tidyverse)
example_data <- read.table(
text = "
individual sequence_special_drug all_drugs
A NA A
A NA B
A 1 C
A 2 D
A NA B
A NA Z
A 2 D
A NA Z
A 2 D
A NA A
A 3 E
B 1 D
B NA B
B NA Z
B 1 D
B NA Z
B 1 D
B NA A
B 2 E",
header = TRUE)
get_special_drugs <- function(.data) {
.data %>%
filter(sequence_special_drug != 0) %>%
distinct() %>%
select(sequence_special_drug, special_drug = all_drugs) %>%
mutate(prior_special_drug = as.list(accumulate(special_drug, c))) %>%
rowwise() %>%
mutate(prior_special_drug = list(
prior_special_drug[prior_special_drug != special_drug]
)) %>%
ungroup()
}
fix_drug_sequence <- function(.data) {
.data %>%
mutate(
seq_drug = replace_na(sequence_special_drug, 0),
flag = if_else(seq_drug == 0 & seq_drug != lead(seq_drug),
lead(seq_drug),
seq_drug),
flag = if_else(flag == 0 & flag != lead(flag),
lead(flag),
flag)
) %>%
select(-sequence_special_drug) %>%
rename(sequence_special_drug = flag)
}
get_prior_traditional_drug <- function(...) {
fix_drug_sequence(...) %>%
group_by(sequence_special_drug) %>%
mutate(
flag1 = max(seq_drug == sequence_special_drug & row_number() == 1),
) %>%
group_by(sequence_special_drug, flag1) %>%
summarise(
flag2 = list(all_drugs[seq_drug == 0])
) %>%
ungroup() %>%
mutate(
flag3 = as.list(accumulate(flag2, append)),
flag3 = if_else(flag1 == 1, lag(flag3), flag3)
) %>%
select(sequence_special_drug, prior_traditional_drug = flag3)
}
get_during_special_drugs <- function(...) {
fix_drug_sequence(...) %>%
group_by(sequence_special_drug) %>%
mutate(
flag = cumsum(seq_drug == sequence_special_drug)
) %>%
filter(flag > 0) %>%
summarise(
during_special_drug = list(all_drugs[seq_drug == 0])
)
}
get_all_drugs <- function(.data) {
spec_drug <- get_special_drugs(.data)
prior_traditional <- get_prior_traditional_drug(.data)
during_spec <- get_during_special_drugs(.data)
list(spec_drug, prior_traditional, during_spec) %>%
reduce(left_join, by = "sequence_special_drug")
}
special_drug <- example_data %>%
nest_by(individual) %>%
mutate(
spec_drug = list(get_all_drugs(data))
) %>%
unnest(spec_drug) %>%
select(-data) %>%
ungroup()
special_drug