使用 base R 根据另一个数据集中的值索引替换数据集中的值
Replace values in a dataset based off an index of values in another using base R
# Sample data written as a structure() call: a data frame with six rows,
# an ID column, diagnosis_1 ... diagnosis_13 (character codes in the first
# four, NA wherever no code is recorded) and an age column.
# NOTE(review): the value is not assigned to a name here; the snippets that
# refer to `df` presumably expect this data frame -- confirm.
structure(list(ID = c(123, 5345, 234, 453, 3656, 345), diagnosis_1 = c("B657",
"B658", "B659", "B660", "B661", "B662"), diagnosis_2 = c("F8827",
"G432", NA, "B657", NA, "H8940"), diagnosis_3 = c(NA, "B657",
NA, NA, NA, "G432"), diagnosis_4 = c(NA, NA, NA, NA, NA, "B657"
), diagnosis_5 = c(NA, NA, NA, NA, NA, NA), diagnosis_6 = c(NA,
NA, NA, NA, NA, NA), diagnosis_7 = c(NA, NA, NA, NA, NA, NA),
diagnosis_8 = c(NA, NA, NA, NA, NA, NA), diagnosis_9 = c(NA,
NA, NA, NA, NA, NA), diagnosis_10 = c(NA, NA, NA, NA, NA,
NA), diagnosis_11 = c(NA, NA, NA, NA, NA, NA), diagnosis_12 = c(NA,
NA, NA, NA, NA, NA), diagnosis_13 = c(NA, NA, NA, NA, NA,
NA), age = c(54, 65, 23, 22, 33, 77)), row.names = c(NA,
-6L), class = "data.frame")
我想用此索引中的值替换诊断列中的值:
B657 1
B658 2
B659 3
B660 4
B661 5
B662 1
F8827 3
G432 3
H8940 4
实际上,这个表有数千行,而且我处理的其他表中诊断列的数量并不固定,因此最好是与列数无关的解决方案。索引也有几百条之多。
如果索引表是这样组织的:
1 B657, B662
2 B658
3 B659, F8827, G432
4 B660 H8940
5 B661
这会对它的编码方式产生影响吗?
所需的输出如下所示:
非常感谢
你可以试试这个
# Lookup table: column V1 holds the diagnosis code, V2 the value that
# replaces it.
df_replace <- read.table(text = "B657 1
B658 2
B659 3
B660 4
B661 5
B662 1
F8827 3
G432 3
H8940 4", stringsAsFactors = FALSE)

# Named character vector: names are the old codes, values the replacements.
vec_repl <- as.character(df_replace$V2)
names(vec_repl) <- df_replace$V1

# str_replace_all() treats the names as regular expressions and applies them
# one after another, so an unanchored code would also rewrite any longer code
# that merely contains it. Anchoring each pattern with ^...$ restricts every
# replacement to whole-value matches.
vec_repl_exact <- vec_repl
names(vec_repl_exact) <- paste0("^", names(vec_repl), "$")

library(tidyverse)

# Recode every column whose name starts with "diag", however many there are.
df %>%
  mutate(across(starts_with("diag"), ~ str_replace_all(.x, vec_repl_exact)))
ID diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4 diagnosis_5 diagnosis_6 diagnosis_7 diagnosis_8 diagnosis_9 diagnosis_10 diagnosis_11 diagnosis_12 diagnosis_13
1 123 1 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2 5345 2 3 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
3 234 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
4 453 4 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
5 3656 5 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
6 345 1 4 3 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
age
1 54
2 65
3 23
4 22
5 33
6 77
在基础 R 中,借助附加包 stringr,您可以这样尝试:
# Work on a copy so the original `df` is left untouched.
df2 <- df
# Pick the columns to recode by name rather than by position: this does not
# depend on ID being the first column and age the last, and list-style
# indexing (`df[cols]`) keeps a data frame even when only one diagnosis
# column is present.
diag_cols <- startsWith(names(df), "diagnosis_")
# `vec_repl` is the named replacement vector (names = old codes, values =
# new values) built beforehand. Its names are used as regular expressions,
# so anchor them with ^...$ to replace whole values only and never a code
# that merely contains another code.
exact_repl <- vec_repl
names(exact_repl) <- paste0("^", names(vec_repl), "$")
df2[diag_cols] <- lapply(
  df[diag_cols],
  function(x) stringr::str_replace_all(x, exact_repl)
)
head(df2)
ID diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4 diagnosis_5 diagnosis_6 diagnosis_7 diagnosis_8 diagnosis_9 diagnosis_10 diagnosis_11 diagnosis_12 diagnosis_13
1 123 1 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2 5345 2 3 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
3 234 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
4 453 4 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
5 3656 5 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
6 345 1 4 3 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
age
1 54
2 65
3 23
4 22
5 33
6 77
一个可能的解决方案是首先构建一个向量 tab_vec,其中旧值作为名称,新值作为向量的实际取值。之后,我们可以使用 dplyr 包(版本 >= 1.0.0)中的 recode 函数,并通过 across 将其应用于名称以字符串 "diagnosis" 开头的变量。
# Lookup table: V1 is the diagnosis code, V2 the value that replaces it.
tab <- read.table(text = "B657 1
B658 2
B659 3
B660 4
B661 5
B662 1
F8827 3
G432 3
H8940 4", header = FALSE)

# create vector of replacements: names are the old codes, values the new
# (numeric) values
tab_vec <- setNames(as.numeric(tab$V2), tab$V1)
tab_vec

# substitute the replacement values in the dataframe df
# Every dplyr function is namespaced, not only mutate(), so the snippet does
# not rely on dplyr having been attached elsewhere. `!!!tab_vec` splices the
# named vector into recode() as old = new pairs.
dplyr::mutate(
  df,
  dplyr::across(
    dplyr::starts_with("diagnosis"),
    ~ dplyr::recode(as.character(.x), !!!tab_vec)
  )
)
输出
ID diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4 diagnosis_5 diagnosis_6 diagnosis_7 diagnosis_8 diagnosis_9 diagnosis_10 diagnosis_11 diagnosis_12 diagnosis_13 age
1 123 1 3 NA NA NA NA NA NA NA NA NA NA NA 54
2 5345 2 3 1 NA NA NA NA NA NA NA NA NA NA 65
3 234 3 NA NA NA NA NA NA NA NA NA NA NA NA 23
4 453 4 1 NA NA NA NA NA NA NA NA NA NA NA 22
5 3656 5 NA NA NA NA NA NA NA NA NA NA NA NA 33
6 345 1 4 3 1 NA NA NA NA NA NA NA NA NA 77
您可以使用 match 通过查找表来更改内容。
# Flag the diagnosis columns by name, so any number of them is handled.
i <- startsWith(names(x), "diagnosis_")
# For each flagged column, find the position of every code in the first
# column of the lookup table `y` and take the value at that position from
# its second column; codes without a match (including NA) become NA.
x[i] <- lapply(x[i], function(codes) y[[2]][match(codes, y[[1]])])
x
# ID diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4 diagnosis_5 diagnosis_6 diagnosis_7 diagnosis_8 diagnosis_9 diagnosis_10 diagnosis_11 diagnosis_12 diagnosis_13 age
#1 123 1 3 NA NA NA NA NA NA NA NA NA NA NA 54
#2 5345 2 3 1 NA NA NA NA NA NA NA NA NA NA 65
#3 234 3 NA NA NA NA NA NA NA NA NA NA NA NA 23
#4 453 4 1 NA NA NA NA NA NA NA NA NA NA NA 22
#5 3656 5 NA NA NA NA NA NA NA NA NA NA NA NA 33
#6 345 1 4 3 1 NA NA NA NA NA NA NA NA NA 77
如果查找表采用问题中给出的另一种结构:
# Build a named integer vector from the grouped lines in `z`: each line is
# split on commas and/or spaces, and every code on line n is mapped to n.
zz <- local({
  parts <- strsplit(z, "[, ]+")
  group <- rep(seq_along(parts), lengths(parts))
  setNames(group, unlist(parts))
})
# Flag the diagnosis columns by name.
i <- startsWith(names(x), "diagnosis_")
# Look every code up by name; as.character() makes all-NA columns index by
# name as well, so NA and unknown codes both yield NA.
x[i] <- lapply(x[i], function(codes) unname(zz[as.character(codes)]))
如果找不到代码并且您不想将它们设置为 NA。
# Variant that leaves codes missing from the lookup table `y` unchanged
# instead of turning them into NA.
i <- startsWith(names(x), "diagnosis_")
# All values of the diagnosis columns flattened into a single vector.
tt <- unlist(x[, i])
# Position of each value in the first column of `y` (NA when not found).
j <- match(tt, y[, 1])
# Indices of the values that do have an entry in the lookup table.
k <- which(!is.na(j))
# Overwrite only those values; everything else keeps its original content.
tt[k] <- y[, 2][j[k]]
x[, i] <- tt
# Drop the temporaries again.
rm(list = c("i", "j", "k", "tt"))
数据:
# Example data frame: six rows with an ID column, diagnosis_1 ...
# diagnosis_13 (character codes, NA where nothing is recorded) and age.
x <- structure(list(ID = c(123, 5345, 234, 453, 3656, 345), diagnosis_1 = c("B657",
"B658", "B659", "B660", "B661", "B662"), diagnosis_2 = c("F8827",
"G432", NA, "B657", NA, "H8940"), diagnosis_3 = c(NA, "B657",
NA, NA, NA, "G432"), diagnosis_4 = c(NA, NA, NA, NA, NA, "B657"
), diagnosis_5 = c(NA, NA, NA, NA, NA, NA), diagnosis_6 = c(NA,
NA, NA, NA, NA, NA), diagnosis_7 = c(NA, NA, NA, NA, NA, NA),
diagnosis_8 = c(NA, NA, NA, NA, NA, NA), diagnosis_9 = c(NA,
NA, NA, NA, NA, NA), diagnosis_10 = c(NA, NA, NA, NA, NA,
NA), diagnosis_11 = c(NA, NA, NA, NA, NA, NA), diagnosis_12 = c(NA,
NA, NA, NA, NA, NA), diagnosis_13 = c(NA, NA, NA, NA, NA,
NA), age = c(54, 65, 23, 22, 33, 77)), row.names = c(NA,
-6L), class = "data.frame")

# Lookup table in two-column form: V1 = diagnosis code, V2 = replacement.
y <- read.table(text = "B657 1
B658 2
B659 3
B660 4
B661 5
B662 1
F8827 3
G432 3
H8940 4")

# Lookup table in grouped form: one element per group, holding the codes of
# that group separated by commas and/or spaces. Written as a plain character
# vector; the earlier readLines(textConnection(...)) form produced the same
# five strings but left the text connection open.
z <- c(
  "B657, B662",
  "B658",
  "B659, F8827, G432",
  "B660 H8940",
  "B661"
)
# Sample data written as a structure() call: a data frame with six rows,
# an ID column, diagnosis_1 ... diagnosis_13 (character codes in the first
# four, NA wherever no code is recorded) and an age column.
# NOTE(review): the value is not assigned to a name here; the snippets that
# refer to `df` presumably expect this data frame -- confirm.
structure(list(ID = c(123, 5345, 234, 453, 3656, 345), diagnosis_1 = c("B657",
"B658", "B659", "B660", "B661", "B662"), diagnosis_2 = c("F8827",
"G432", NA, "B657", NA, "H8940"), diagnosis_3 = c(NA, "B657",
NA, NA, NA, "G432"), diagnosis_4 = c(NA, NA, NA, NA, NA, "B657"
), diagnosis_5 = c(NA, NA, NA, NA, NA, NA), diagnosis_6 = c(NA,
NA, NA, NA, NA, NA), diagnosis_7 = c(NA, NA, NA, NA, NA, NA),
diagnosis_8 = c(NA, NA, NA, NA, NA, NA), diagnosis_9 = c(NA,
NA, NA, NA, NA, NA), diagnosis_10 = c(NA, NA, NA, NA, NA,
NA), diagnosis_11 = c(NA, NA, NA, NA, NA, NA), diagnosis_12 = c(NA,
NA, NA, NA, NA, NA), diagnosis_13 = c(NA, NA, NA, NA, NA,
NA), age = c(54, 65, 23, 22, 33, 77)), row.names = c(NA,
-6L), class = "data.frame")
我想用此索引中的值替换诊断列中的值:
B657 1
B658 2
B659 3
B660 4
B661 5
B662 1
F8827 3
G432 3
H8940 4
实际上,这个表有数千行,而且我处理的其他表中诊断列的数量并不固定,因此最好是与列数无关的解决方案。索引也有几百条之多。
如果索引表是这样组织的:
1 B657, B662
2 B658
3 B659, F8827, G432
4 B660 H8940
5 B661
这会对它的编码方式产生影响吗?
所需的输出如下所示:
非常感谢
你可以试试这个
# Lookup table: column V1 holds the diagnosis code, V2 the value that
# replaces it.
df_replace <- read.table(text = "B657 1
B658 2
B659 3
B660 4
B661 5
B662 1
F8827 3
G432 3
H8940 4", stringsAsFactors = FALSE)

# Named character vector: names are the old codes, values the replacements.
vec_repl <- as.character(df_replace$V2)
names(vec_repl) <- df_replace$V1

# str_replace_all() treats the names as regular expressions and applies them
# one after another, so an unanchored code would also rewrite any longer code
# that merely contains it. Anchoring each pattern with ^...$ restricts every
# replacement to whole-value matches.
vec_repl_exact <- vec_repl
names(vec_repl_exact) <- paste0("^", names(vec_repl), "$")

library(tidyverse)

# Recode every column whose name starts with "diag", however many there are.
df %>%
  mutate(across(starts_with("diag"), ~ str_replace_all(.x, vec_repl_exact)))
ID diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4 diagnosis_5 diagnosis_6 diagnosis_7 diagnosis_8 diagnosis_9 diagnosis_10 diagnosis_11 diagnosis_12 diagnosis_13
1 123 1 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2 5345 2 3 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
3 234 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
4 453 4 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
5 3656 5 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
6 345 1 4 3 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
age
1 54
2 65
3 23
4 22
5 33
6 77
在基础 R 中,借助附加包 stringr,您可以这样尝试:
# Work on a copy so the original `df` is left untouched.
df2 <- df
# Pick the columns to recode by name rather than by position: this does not
# depend on ID being the first column and age the last, and list-style
# indexing (`df[cols]`) keeps a data frame even when only one diagnosis
# column is present.
diag_cols <- startsWith(names(df), "diagnosis_")
# `vec_repl` is the named replacement vector (names = old codes, values =
# new values) built beforehand. Its names are used as regular expressions,
# so anchor them with ^...$ to replace whole values only and never a code
# that merely contains another code.
exact_repl <- vec_repl
names(exact_repl) <- paste0("^", names(vec_repl), "$")
df2[diag_cols] <- lapply(
  df[diag_cols],
  function(x) stringr::str_replace_all(x, exact_repl)
)
head(df2)
ID diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4 diagnosis_5 diagnosis_6 diagnosis_7 diagnosis_8 diagnosis_9 diagnosis_10 diagnosis_11 diagnosis_12 diagnosis_13
1 123 1 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2 5345 2 3 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
3 234 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
4 453 4 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
5 3656 5 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
6 345 1 4 3 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
age
1 54
2 65
3 23
4 22
5 33
6 77
一个可能的解决方案是首先构建一个向量 tab_vec,其中旧值作为名称,新值作为向量的实际取值。之后,我们可以使用 dplyr 包(版本 >= 1.0.0)中的 recode 函数,并通过 across 将其应用于名称以字符串 "diagnosis" 开头的变量。
# Lookup table: V1 is the diagnosis code, V2 the value that replaces it.
tab <- read.table(text = "B657 1
B658 2
B659 3
B660 4
B661 5
B662 1
F8827 3
G432 3
H8940 4", header = FALSE)

# create vector of replacements: names are the old codes, values the new
# (numeric) values
tab_vec <- setNames(as.numeric(tab$V2), tab$V1)
tab_vec

# substitute the replacement values in the dataframe df
# Every dplyr function is namespaced, not only mutate(), so the snippet does
# not rely on dplyr having been attached elsewhere. `!!!tab_vec` splices the
# named vector into recode() as old = new pairs.
dplyr::mutate(
  df,
  dplyr::across(
    dplyr::starts_with("diagnosis"),
    ~ dplyr::recode(as.character(.x), !!!tab_vec)
  )
)
输出
ID diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4 diagnosis_5 diagnosis_6 diagnosis_7 diagnosis_8 diagnosis_9 diagnosis_10 diagnosis_11 diagnosis_12 diagnosis_13 age
1 123 1 3 NA NA NA NA NA NA NA NA NA NA NA 54
2 5345 2 3 1 NA NA NA NA NA NA NA NA NA NA 65
3 234 3 NA NA NA NA NA NA NA NA NA NA NA NA 23
4 453 4 1 NA NA NA NA NA NA NA NA NA NA NA 22
5 3656 5 NA NA NA NA NA NA NA NA NA NA NA NA 33
6 345 1 4 3 1 NA NA NA NA NA NA NA NA NA 77
您可以使用 match 通过查找表来更改内容。
# Flag the diagnosis columns by name, so any number of them is handled.
i <- startsWith(names(x), "diagnosis_")
# For each flagged column, find the position of every code in the first
# column of the lookup table `y` and take the value at that position from
# its second column; codes without a match (including NA) become NA.
x[i] <- lapply(x[i], function(codes) y[[2]][match(codes, y[[1]])])
x
# ID diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4 diagnosis_5 diagnosis_6 diagnosis_7 diagnosis_8 diagnosis_9 diagnosis_10 diagnosis_11 diagnosis_12 diagnosis_13 age
#1 123 1 3 NA NA NA NA NA NA NA NA NA NA NA 54
#2 5345 2 3 1 NA NA NA NA NA NA NA NA NA NA 65
#3 234 3 NA NA NA NA NA NA NA NA NA NA NA NA 23
#4 453 4 1 NA NA NA NA NA NA NA NA NA NA NA 22
#5 3656 5 NA NA NA NA NA NA NA NA NA NA NA NA 33
#6 345 1 4 3 1 NA NA NA NA NA NA NA NA NA 77
如果查找表采用问题中给出的另一种结构:
# Build a named integer vector from the grouped lines in `z`: each line is
# split on commas and/or spaces, and every code on line n is mapped to n.
zz <- local({
  parts <- strsplit(z, "[, ]+")
  group <- rep(seq_along(parts), lengths(parts))
  setNames(group, unlist(parts))
})
# Flag the diagnosis columns by name.
i <- startsWith(names(x), "diagnosis_")
# Look every code up by name; as.character() makes all-NA columns index by
# name as well, so NA and unknown codes both yield NA.
x[i] <- lapply(x[i], function(codes) unname(zz[as.character(codes)]))
如果找不到代码并且您不想将它们设置为 NA。
# Variant that leaves codes missing from the lookup table `y` unchanged
# instead of turning them into NA.
i <- startsWith(names(x), "diagnosis_")
# All values of the diagnosis columns flattened into a single vector.
tt <- unlist(x[, i])
# Position of each value in the first column of `y` (NA when not found).
j <- match(tt, y[, 1])
# Indices of the values that do have an entry in the lookup table.
k <- which(!is.na(j))
# Overwrite only those values; everything else keeps its original content.
tt[k] <- y[, 2][j[k]]
x[, i] <- tt
# Drop the temporaries again.
rm(list = c("i", "j", "k", "tt"))
数据:
# Example data frame: six rows with an ID column, diagnosis_1 ...
# diagnosis_13 (character codes, NA where nothing is recorded) and age.
x <- structure(list(ID = c(123, 5345, 234, 453, 3656, 345), diagnosis_1 = c("B657",
"B658", "B659", "B660", "B661", "B662"), diagnosis_2 = c("F8827",
"G432", NA, "B657", NA, "H8940"), diagnosis_3 = c(NA, "B657",
NA, NA, NA, "G432"), diagnosis_4 = c(NA, NA, NA, NA, NA, "B657"
), diagnosis_5 = c(NA, NA, NA, NA, NA, NA), diagnosis_6 = c(NA,
NA, NA, NA, NA, NA), diagnosis_7 = c(NA, NA, NA, NA, NA, NA),
diagnosis_8 = c(NA, NA, NA, NA, NA, NA), diagnosis_9 = c(NA,
NA, NA, NA, NA, NA), diagnosis_10 = c(NA, NA, NA, NA, NA,
NA), diagnosis_11 = c(NA, NA, NA, NA, NA, NA), diagnosis_12 = c(NA,
NA, NA, NA, NA, NA), diagnosis_13 = c(NA, NA, NA, NA, NA,
NA), age = c(54, 65, 23, 22, 33, 77)), row.names = c(NA,
-6L), class = "data.frame")

# Lookup table in two-column form: V1 = diagnosis code, V2 = replacement.
y <- read.table(text = "B657 1
B658 2
B659 3
B660 4
B661 5
B662 1
F8827 3
G432 3
H8940 4")

# Lookup table in grouped form: one element per group, holding the codes of
# that group separated by commas and/or spaces. Written as a plain character
# vector; the earlier readLines(textConnection(...)) form produced the same
# five strings but left the text connection open.
z <- c(
  "B657, B662",
  "B658",
  "B659, F8827, G432",
  "B660 H8940",
  "B661"
)