在 R 中使用 setequal 检查两个字符变量的唯一值是否匹配
Check that the unique values of 2 character variables match using setequal in R
我有 2 个数据表(data.table),各有约 50 万条观测和约 50 个变量。其中一些变量是字符型,我想找到一种简单快速的方法,检查一个数据表中每个字符变量的唯一值是否与第二个数据表中相应变量的唯一值匹配。
我原以为使用 sapply 和 setequal 就能得到答案,但它对每个变量总是返回 FALSE,即使我知道答案应该是 TRUE。单独测试一对变量时,会返回正确答案。我可以用循环逐对比较变量,这样是可行的,但我想弄清楚为什么我的 sapply/setequal 方法不起作用,以便修正它(或者找到一种不用循环的替代方法)。
> # Extract & sort unique values from character vars in cube1 for matching character vars
> cube1c <- sapply(cube1m[, ..commonCharCols], unique)
> cube1c <- sapply(cube1c, sort)
> # Extract & sort unique values from character vars in cube2 for matching character vars
> cube2c <- sapply(cube2m[, ..commonCharCols], unique)
> cube2c <- sapply(cube2c, sort)
> # Test if values in each pair of variables are the same
> sapply(cube1c, function(x) setequal(x, cube2c))
cVar1 cVar2 cVar3 cVar4 cVar5 cVar6
FALSE FALSE FALSE FALSE FALSE FALSE
> setequal(cube1c$cVar2, cube2c$cVar2)
[1] TRUE
> for (icol in seq_len(length(commonCharCols))) {
+ print(commonCharCols[[icol]])
+ print(setequal(cube1c[[icol]], cube2c[[icol]]))
+ }
[1] "cVar1"
[1] FALSE
[1] "cVar2"
[1] TRUE
[1] "cVar3"
[1] FALSE
[1] "cVar4"
[1] TRUE
[1] "cVar5"
[1] TRUE
[1] "cVar6"
[1] FALSE
可以考虑使用 mapply,对等长对象按元素逐个循环调用 setequal:
# Walk both collections in parallel and test each pair of elements for set
# equality; the result carries the names of the first collection.
output <- mapply(
  FUN = setequal,
  x = cube1c,
  y = cube2c
)
使用设定了随机种子的随机数据进行演示
数据
# Build two small demo tables of random character data.
library(data.table)
# Fix the RNG seed so the sampled values below are reproducible.
set.seed(872019)
# Pool of values every cell is drawn from.
data_tools <- c("sas", "stata", "spss", "python", "r", "julia")
# 50 independent draws of 10 values (with replacement); replicate() stacks
# them into a 10 x 50 character matrix that data.table() turns into the
# columns V1..V50.
cube1m <- data.table(replicate(50, sample(data_tools, 10, replace=TRUE)))
head(cube1m)
# V1 V2 V3 V4 V5
# 1: r python spss julia sas
# 2: julia spss python julia python
# 3: stata r sas stata r
# 4: r r julia julia sas
# 5: julia r sas spss r
# 6: stata r r r sas
# Second table built the same way; the RNG is not re-seeded, so it continues
# the same random stream and its values differ from cube1m.
cube2m <- data.table(replicate(50, sample(data_tools, 10, replace=TRUE)))
head(cube2m)
# V1 V2 V3 V4 V5
# 1: sas spss python r stata
# 2: r spss julia sas r
# 3: julia julia stata python julia
# 4: r r spss stata julia
# 5: julia r stata python sas
# 6: python r stata sas stata
代码+输出
# Names of the character columns present in both tables.
commonCharCols <- paste0("V", 1:50)
# Sorted unique values of every shared column, one character vector per
# column. lapply() always returns a list; sapply() would collapse the result
# into a character matrix whenever all columns happen to have the same number
# of unique values, and the pairwise comparison below would then run over
# single cells instead of whole columns.
cube1c <- lapply(cube1m[, ..commonCharCols], function(x) sort(unique(x)))
cube2c <- lapply(cube2m[, ..commonCharCols], function(x) sort(unique(x)))
# Compare the two tables column by column, pairing the columns by name.
# vapply() guarantees a logical vector (named after the columns) even when
# there are no columns to compare.
output <- vapply(
  commonCharCols,
  function(col) setequal(cube1c[[col]], cube2c[[col]]),
  logical(1)
)
output
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
# FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
# V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30
# FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# V31 V32 V33 V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45
# TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
# V46 V47 V48 V49 V50
# FALSE FALSE FALSE FALSE FALSE
# Keep only the columns whose unique values agree in both tables; `output` is
# already logical, so it can index itself.
output[output]
# V6 V13 V20 V21 V23 V31 V33 V39
# TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
检查
# Names of the columns flagged TRUE in `output`.
sharedCols <- names(output)[output]
# Print those columns of the first table for a visual check.
cube1m[, sharedCols, with = FALSE]
# V6 V13 V20 V21 V23 V31 V33 V39
# 1: spss julia r spss python python python spss
# 2: sas sas sas stata spss spss stata r
# 3: sas julia r r python r sas julia
# 4: spss r julia spss stata r stata r
# 5: python spss stata spss r stata stata julia
# 6: sas python r julia sas julia python python
# 7: r python sas python stata julia spss spss
# 8: sas sas python stata r python sas julia
# 9: julia stata r spss julia r python sas
# 10: stata julia spss sas sas spss julia sas
# Print the same columns of the second table for comparison.
cube2m[, sharedCols, with = FALSE]
# V6 V13 V20 V21 V23 V31 V33 V39
# 1: julia julia r stata sas spss stata julia
# 2: python julia julia sas python julia python spss
# 3: sas stata sas julia spss python stata julia
# 4: python stata sas r r python sas r
# 5: spss spss sas spss spss r julia python
# 6: stata python spss julia r julia python spss
# 7: python sas python python julia julia stata sas
# 8: sas r r sas sas stata spss spss
# 9: julia sas stata sas stata python python julia
# 10: r stata julia r spss r stata r
我有 2 个数据表(data.table),各有约 50 万条观测和约 50 个变量。其中一些变量是字符型,我想找到一种简单快速的方法,检查一个数据表中每个字符变量的唯一值是否与第二个数据表中相应变量的唯一值匹配。
我原以为使用 sapply 和 setequal 就能得到答案,但它对每个变量总是返回 FALSE,即使我知道答案应该是 TRUE。单独测试一对变量时,会返回正确答案。我可以用循环逐对比较变量,这样是可行的,但我想弄清楚为什么我的 sapply/setequal 方法不起作用,以便修正它(或者找到一种不用循环的替代方法)。
> # Extract & sort unique values from character vars in cube1 for matching character vars
> cube1c <- sapply(cube1m[, ..commonCharCols], unique)
> cube1c <- sapply(cube1c, sort)
> # Extract & sort unique values from character vars in cube2 for matching character vars
> cube2c <- sapply(cube2m[, ..commonCharCols], unique)
> cube2c <- sapply(cube2c, sort)
> # Test if values in each pair of variables are the same
> sapply(cube1c, function(x) setequal(x, cube2c))
cVar1 cVar2 cVar3 cVar4 cVar5 cVar6
FALSE FALSE FALSE FALSE FALSE FALSE
> setequal(cube1c$cVar2, cube2c$cVar2)
[1] TRUE
> for (icol in seq_len(length(commonCharCols))) {
+ print(commonCharCols[[icol]])
+ print(setequal(cube1c[[icol]], cube2c[[icol]]))
+ }
[1] "cVar1"
[1] FALSE
[1] "cVar2"
[1] TRUE
[1] "cVar3"
[1] FALSE
[1] "cVar4"
[1] TRUE
[1] "cVar5"
[1] TRUE
[1] "cVar6"
[1] FALSE
可以考虑使用 mapply,对等长对象按元素逐个循环调用 setequal:
# Walk both collections in parallel and test each pair of elements for set
# equality; the result carries the names of the first collection.
output <- mapply(
  FUN = setequal,
  x = cube1c,
  y = cube2c
)
使用设定了随机种子的随机数据进行演示
数据
# Build two small demo tables of random character data.
library(data.table)
# Fix the RNG seed so the sampled values below are reproducible.
set.seed(872019)
# Pool of values every cell is drawn from.
data_tools <- c("sas", "stata", "spss", "python", "r", "julia")
# 50 independent draws of 10 values (with replacement); replicate() stacks
# them into a 10 x 50 character matrix that data.table() turns into the
# columns V1..V50.
cube1m <- data.table(replicate(50, sample(data_tools, 10, replace=TRUE)))
head(cube1m)
# V1 V2 V3 V4 V5
# 1: r python spss julia sas
# 2: julia spss python julia python
# 3: stata r sas stata r
# 4: r r julia julia sas
# 5: julia r sas spss r
# 6: stata r r r sas
# Second table built the same way; the RNG is not re-seeded, so it continues
# the same random stream and its values differ from cube1m.
cube2m <- data.table(replicate(50, sample(data_tools, 10, replace=TRUE)))
head(cube2m)
# V1 V2 V3 V4 V5
# 1: sas spss python r stata
# 2: r spss julia sas r
# 3: julia julia stata python julia
# 4: r r spss stata julia
# 5: julia r stata python sas
# 6: python r stata sas stata
代码+输出
# Names of the character columns present in both tables.
commonCharCols <- paste0("V", 1:50)
# Sorted unique values of every shared column, one character vector per
# column. lapply() always returns a list; sapply() would collapse the result
# into a character matrix whenever all columns happen to have the same number
# of unique values, and the pairwise comparison below would then run over
# single cells instead of whole columns.
cube1c <- lapply(cube1m[, ..commonCharCols], function(x) sort(unique(x)))
cube2c <- lapply(cube2m[, ..commonCharCols], function(x) sort(unique(x)))
# Compare the two tables column by column, pairing the columns by name.
# vapply() guarantees a logical vector (named after the columns) even when
# there are no columns to compare.
output <- vapply(
  commonCharCols,
  function(col) setequal(cube1c[[col]], cube2c[[col]]),
  logical(1)
)
output
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
# FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
# V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30
# FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# V31 V32 V33 V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45
# TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
# V46 V47 V48 V49 V50
# FALSE FALSE FALSE FALSE FALSE
# Keep only the columns whose unique values agree in both tables; `output` is
# already logical, so it can index itself.
output[output]
# V6 V13 V20 V21 V23 V31 V33 V39
# TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
检查
# Names of the columns flagged TRUE in `output`.
sharedCols <- names(output)[output]
# Print those columns of the first table for a visual check.
cube1m[, sharedCols, with = FALSE]
# V6 V13 V20 V21 V23 V31 V33 V39
# 1: spss julia r spss python python python spss
# 2: sas sas sas stata spss spss stata r
# 3: sas julia r r python r sas julia
# 4: spss r julia spss stata r stata r
# 5: python spss stata spss r stata stata julia
# 6: sas python r julia sas julia python python
# 7: r python sas python stata julia spss spss
# 8: sas sas python stata r python sas julia
# 9: julia stata r spss julia r python sas
# 10: stata julia spss sas sas spss julia sas
# Print the same columns of the second table for comparison.
cube2m[, sharedCols, with = FALSE]
# V6 V13 V20 V21 V23 V31 V33 V39
# 1: julia julia r stata sas spss stata julia
# 2: python julia julia sas python julia python spss
# 3: sas stata sas julia spss python stata julia
# 4: python stata sas r r python sas r
# 5: spss spss sas spss spss r julia python
# 6: stata python spss julia r julia python spss
# 7: python sas python python julia julia stata sas
# 8: sas r r sas sas stata spss spss
# 9: julia sas stata sas stata python python julia
# 10: r stata julia r spss r stata r