R中数据框中的匹配和替换操作
Match and replace operation in data frame in R
假设我的数据集如下所示:
John NA kaira
carry John NA
maya Sam maya
leo paty leo
tinker NA tinker
fabo leo maya
我有另一个数据集:
John 1
carry 2
maya 3
leo 4
tinker 5
fabo 6
sam 7
paty 8
kaira 9
我想用上面第二个表 (df2) 中的数值替换第一个表 (df1) 中对应的名字,最终得到的表 (df) 如下所示:
1 NA 9
2 1 NA
3 7 3
4 8 4
5 NA 5
6 4 3
您可以使用 match 来完成查找:
# For every column of df1, locate each name in df2's first column and
# return the value stored at that position in df2's second column.
# Names with no match in df2 come back as NA.
apply(df1, MARGIN = 2, FUN = function(names_col) {
  row_idx <- match(names_col, df2[, 1])
  df2[, 2][row_idx]
})
V1 V2 V3
[1,] 1 NA 9
[2,] 2 1 NA
[3,] 3 NA 3
[4,] 4 8 4
[5,] 5 NA 5
[6,] 6 4 3
您会注意到第二列中多出了一个 NA 值,这是因为匹配区分大小写:第一个数据框中的 "Sam" 与第二个数据框中的 "sam" 不匹配。如果您不需要区分大小写,可以尝试:
# Same column-wise lookup, but both the names in df1 and the keys in df2's
# first column are lower-cased first, so "Sam" and "sam" are treated as equal.
apply(df1, MARGIN = 2, FUN = function(names_col) {
  row_idx <- match(tolower(names_col), tolower(df2[, 1]))
  df2[, 2][row_idx]
})
# V1 V2 V3
# [1,] 1 NA 9
# [2,] 2 1 NA
# [3,] 3 7 3
# [4,] 4 8 4
# [5,] 5 NA 5
# [6,] 6 4 3
尝试:
library(dplyr)
# Replace every column of df1 with the value from df2's second column whose
# key in df2's first column matches; names without a match become NA.
# across() replaces the deprecated mutate_each()/funs() pair.
df1 %>%
  mutate(across(everything(), ~ df2[, 2][match(.x, df2[, 1])]))
您可以直接使用 plyr 包中的 mapvalues:
library(plyr)
# Lower-case both the cell values of df and the lookup keys in df1$V1 so the
# comparison ignores case, then map each key to the number in df1$V2.
mapvalues(
  tolower(as.matrix(df)),
  from = tolower(df1$V1),
  to = df1$V2
)
# V1 V2 V3
#[1,] "1" NA "9"
#[2,] "2" "1" NA
#[3,] "3" "7" "3"
#[4,] "4" "8" "4"
#[5,] "5" NA "5"
#[6,] "6" "4" "3"
数据:
# Sample data used by the mapvalues() example: `df` holds the names to be
# translated and `df1` is the name -> number lookup table. Every name column
# is a factor whose levels are listed explicitly.
df <- data.frame(
  V1 = factor(
    c("John", "carry", "maya", "leo", "tinker", "fabo"),
    levels = c("carry", "fabo", "John", "leo", "maya", "tinker")
  ),
  V2 = factor(
    c(NA, "John", "Sam", "paty", NA, "leo"),
    levels = c("John", "leo", "paty", "Sam")
  ),
  V3 = factor(
    c("kaira", NA, "maya", "leo", "tinker", "maya"),
    levels = c("kaira", "leo", "maya", "tinker")
  )
)
df1 <- data.frame(
  V1 = factor(
    c("John", "carry", "maya", "leo", "tinker", "fabo", "sam", "paty", "kaira"),
    levels = c(
      "carry", "fabo", "John", "kaira", "leo", "maya", "paty", "sam", "tinker"
    )
  ),
  V2 = 1:9
)
也可以
# Overwrite every cell of df1 with the position of its name within df2$V1
# (NA when the name is absent). Assigning through `df1[]` keeps df1 a data
# frame with its existing columns.
df1[] <- match(x = unlist(df1), table = df2$V1)
# V1 V2 V3
# 1 1 NA 9
# 2 2 1 NA
# 3 3 NA 3
# 4 4 8 4
# 5 5 NA 5
# 6 6 4 3
如果 df2 中的数字并不总是按顺序排列,稍作调整后的代码如下:
# Find each name's row in df2$V1, then take the value from the second column
# of that row, so the result does not rely on the numbers being 1, 2, 3, ...
df1[] <- df2[match(x = unlist(df1), table = df2$V1), 2]
如果我们可以先去掉因子 (factor):
# Build df3 from df with every column converted to character, so later
# steps work on plain strings rather than factors.
df3 <- data.frame(
  lapply(X = df, FUN = as.character),
  stringsAsFactors = FALSE
)
然后
# Replace each non-missing name in df3 with its position in df1's first
# column; cells that are NA are not touched.
df3[!is.na(df3)] <- match(
  x = df3[!is.na(df3)],
  table = as.character(df1[, 1])
)
假设我的数据集如下所示:
John NA kaira
carry John NA
maya Sam maya
leo paty leo
tinker NA tinker
fabo leo maya
我有另一个数据集:
John 1
carry 2
maya 3
leo 4
tinker 5
fabo 6
sam 7
paty 8
kaira 9
我想用上面第二个表 (df2) 中的数值替换第一个表 (df1) 中对应的名字,最终得到的表 (df) 如下所示:
1 NA 9
2 1 NA
3 7 3
4 8 4
5 NA 5
6 4 3
您可以使用 match 来完成查找:
# For every column of df1, locate each name in df2's first column and
# return the value stored at that position in df2's second column.
# Names with no match in df2 come back as NA.
apply(df1, MARGIN = 2, FUN = function(names_col) {
  row_idx <- match(names_col, df2[, 1])
  df2[, 2][row_idx]
})
V1 V2 V3
[1,] 1 NA 9
[2,] 2 1 NA
[3,] 3 NA 3
[4,] 4 8 4
[5,] 5 NA 5
[6,] 6 4 3
您会注意到第二列中多出了一个 NA 值,这是因为匹配区分大小写:第一个数据框中的 "Sam" 与第二个数据框中的 "sam" 不匹配。如果您不需要区分大小写,可以尝试:
# Same column-wise lookup, but both the names in df1 and the keys in df2's
# first column are lower-cased first, so "Sam" and "sam" are treated as equal.
apply(df1, MARGIN = 2, FUN = function(names_col) {
  row_idx <- match(tolower(names_col), tolower(df2[, 1]))
  df2[, 2][row_idx]
})
# V1 V2 V3
# [1,] 1 NA 9
# [2,] 2 1 NA
# [3,] 3 7 3
# [4,] 4 8 4
# [5,] 5 NA 5
# [6,] 6 4 3
尝试:
library(dplyr)
# Replace every column of df1 with the value from df2's second column whose
# key in df2's first column matches; names without a match become NA.
# across() replaces the deprecated mutate_each()/funs() pair.
df1 %>%
  mutate(across(everything(), ~ df2[, 2][match(.x, df2[, 1])]))
您可以直接使用 plyr 包中的 mapvalues:
library(plyr)
# Lower-case both the cell values of df and the lookup keys in df1$V1 so the
# comparison ignores case, then map each key to the number in df1$V2.
mapvalues(
  tolower(as.matrix(df)),
  from = tolower(df1$V1),
  to = df1$V2
)
# V1 V2 V3
#[1,] "1" NA "9"
#[2,] "2" "1" NA
#[3,] "3" "7" "3"
#[4,] "4" "8" "4"
#[5,] "5" NA "5"
#[6,] "6" "4" "3"
数据:
# Sample data used by the mapvalues() example: `df` holds the names to be
# translated and `df1` is the name -> number lookup table. Every name column
# is a factor whose levels are listed explicitly.
df <- data.frame(
  V1 = factor(
    c("John", "carry", "maya", "leo", "tinker", "fabo"),
    levels = c("carry", "fabo", "John", "leo", "maya", "tinker")
  ),
  V2 = factor(
    c(NA, "John", "Sam", "paty", NA, "leo"),
    levels = c("John", "leo", "paty", "Sam")
  ),
  V3 = factor(
    c("kaira", NA, "maya", "leo", "tinker", "maya"),
    levels = c("kaira", "leo", "maya", "tinker")
  )
)
df1 <- data.frame(
  V1 = factor(
    c("John", "carry", "maya", "leo", "tinker", "fabo", "sam", "paty", "kaira"),
    levels = c(
      "carry", "fabo", "John", "kaira", "leo", "maya", "paty", "sam", "tinker"
    )
  ),
  V2 = 1:9
)
也可以
# Overwrite every cell of df1 with the position of its name within df2$V1
# (NA when the name is absent). Assigning through `df1[]` keeps df1 a data
# frame with its existing columns.
df1[] <- match(x = unlist(df1), table = df2$V1)
# V1 V2 V3
# 1 1 NA 9
# 2 2 1 NA
# 3 3 NA 3
# 4 4 8 4
# 5 5 NA 5
# 6 6 4 3
如果 df2 中的数字并不总是按顺序排列,稍作调整后的代码如下:
# Find each name's row in df2$V1, then take the value from the second column
# of that row, so the result does not rely on the numbers being 1, 2, 3, ...
df1[] <- df2[match(x = unlist(df1), table = df2$V1), 2]
如果我们可以先去掉因子 (factor):
# Build df3 from df with every column converted to character, so later
# steps work on plain strings rather than factors.
df3 <- data.frame(
  lapply(X = df, FUN = as.character),
  stringsAsFactors = FALSE
)
然后
# Replace each non-missing name in df3 with its position in df1's first
# column; cells that are NA are not touched.
df3[!is.na(df3)] <- match(
  x = df3[!is.na(df3)],
  table = as.character(df1[, 1])
)