在 R 中使用 set_equal 检查 2 个字符变量匹配的唯一值

Check the unique values of 2 character variables match using set_equal in R

我有 2 个数据表(data table),每个大约有 50 万条观测(obs)和约 50 个变量。其中一些变量是字符型的,我想找到一种简单快速的方法,检查一个数据表中每个字符变量的唯一值,是否与第二个数据表中相应变量的唯一值匹配(或不匹配)。

我原以为结合使用 sapply 和 setequal 就能得到答案,但它对每个变量总是返回 FALSE,即使我知道答案应该是 TRUE。单独测试某一对变量时,会返回正确的结果。我可以用循环遍历每一对变量,这样是有效的,但我想弄明白为什么我的 sapply/setequal 方法不起作用,从而把它修正过来(或者找到另一种不用循环的方法)。

> # Extract & sort unique values from character vars in cube1 for matching character vars
> cube1c <- sapply(cube1m[, ..commonCharCols], unique)
> cube1c <- sapply(cube1c, sort)

> # Extract & sort unique values from character vars in cube2 for matching character vars
> cube2c <- sapply(cube2m[, ..commonCharCols], unique)
> cube2c <- sapply(cube2c, sort)

> # Test if values in each pair of variables are the same
> sapply(cube1c, function(x) setequal(x, cube2c)) 
cVar1 cVar2 cVar3 cVar4 cVar5 cVar6
FALSE FALSE FALSE FALSE FALSE FALSE

> setequal(cube1c$cVar2, cube2c$cVar2)
[1] TRUE

> for (icol in seq_len(length(commonCharCols))) {
+     print(commonCharCols[[icol]])
+     print(setequal(cube1c[[icol]], cube2c[[icol]]))
+ }
[1] "cVar1"
[1] FALSE
[1] "cVar2"
[1] TRUE
[1] "cVar3"
[1] FALSE
[1] "cVar4"
[1] TRUE
[1] "cVar5"
[1] TRUE
[1] "cVar6"
[1] FALSE

原来的写法之所以总是返回 FALSE,是因为 sapply 只遍历 cube1c,匿名函数把其中的每个元素与整个 cube2c(而不是与之对应的那一个元素)进行比较。可以考虑使用 mapply,它会对等长对象的元素逐对调用 setequal:

# Walk cube1c and cube2c in parallel, calling setequal() on each pair of
# corresponding elements.
output <- mapply(setequal, cube1c, cube2c)

下面使用设定了随机种子的随机数据进行演示

数据

library(data.table)

# Fix the RNG seed so the sampled demo tables are reproducible.
set.seed(872019)
# Pool of values every column is sampled from.
data_tools <- c("sas", "stata", "spss", "python", "r", "julia")
# replicate() draws 50 samples of 10 values (with replacement); each draw
# becomes one column (V1..V50) of the resulting data.table.
cube1m <- data.table(replicate(50, sample(data_tools, 10, replace=TRUE)))

# Preview the first rows.
head(cube1m)    
#       V1     V2     V3    V4     V5
# 1:     r python   spss julia    sas
# 2: julia   spss python julia python
# 3: stata      r    sas stata      r
# 4:     r      r  julia julia    sas
# 5: julia      r    sas  spss      r
# 6: stata      r      r     r    sas

# Second table built the same way; no new seed is set here, so the RNG
# continues from its current state and the values differ from cube1m.
cube2m <- data.table(replicate(50, sample(data_tools, 10, replace=TRUE)))

# Preview the first rows.
head(cube2m)
#        V1    V2     V3     V4    V5
# 1:    sas  spss python      r stata
# 2:      r  spss  julia    sas     r
# 3:  julia julia  stata python julia
# 4:      r     r   spss  stata julia
# 5:  julia     r  stata python   sas
# 6: python     r  stata    sas stata

代码+输出

# Columns present in both tables that should be compared.
commonCharCols <- paste0("V", 1:50)

# Sorted unique values of every shared column, one list element per column.
# lapply() is used instead of sapply() on purpose: sapply() collapses its
# result into a character matrix whenever every column happens to have the
# same number of unique values, and mapply() below would then compare single
# matrix cells rather than whole columns.
cube1c <- lapply(cube1m[, ..commonCharCols], function(x) sort(unique(x)))
cube2c <- lapply(cube2m[, ..commonCharCols], function(x) sort(unique(x)))

# Pairwise comparison: element i of cube1c against element i of cube2c.
# The result is a logical vector named after the columns.
output <- mapply(setequal, cube1c, cube2c)

output
#    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10   V11   V12   V13   V14   V15 
# FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE 
#   V16   V17   V18   V19   V20   V21   V22   V23   V24   V25   V26   V27   V28   V29   V30 
# FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
#   V31   V32   V33   V34   V35   V36   V37   V38   V39   V40   V41   V42   V43   V44   V45 
#  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE 
#   V46   V47   V48   V49   V50 
# FALSE FALSE FALSE FALSE FALSE

 # Keep only the columns whose unique values match in both tables.
 output[output]
 #   V6  V13  V20  V21  V23  V31  V33  V39 
 # TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

检查

# Names of the columns flagged TRUE by the pairwise comparison.
sharedCols <- names(output)[output]

# Show those columns of the first table for a visual check.
cube1m[, sharedCols, with = FALSE]
#         V6    V13    V20    V21    V23    V31    V33    V39
#  1:   spss  julia      r   spss python python python   spss
#  2:    sas    sas    sas  stata   spss   spss  stata      r
#  3:    sas  julia      r      r python      r    sas  julia
#  4:   spss      r  julia   spss  stata      r  stata      r
#  5: python   spss  stata   spss      r  stata  stata  julia
#  6:    sas python      r  julia    sas  julia python python
#  7:      r python    sas python  stata  julia   spss   spss
#  8:    sas    sas python  stata      r python    sas  julia
#  9:  julia  stata      r   spss  julia      r python    sas
# 10:  stata  julia   spss    sas    sas   spss  julia    sas

# Show the same columns of the second table for a visual check.
cube2m[, sharedCols, with = FALSE]
#         V6    V13    V20    V21    V23    V31    V33    V39
#  1:  julia  julia      r  stata    sas   spss  stata  julia
#  2: python  julia  julia    sas python  julia python   spss
#  3:    sas  stata    sas  julia   spss python  stata  julia
#  4: python  stata    sas      r      r python    sas      r
#  5:   spss   spss    sas   spss   spss      r  julia python
#  6:  stata python   spss  julia      r  julia python   spss
#  7: python    sas python python  julia  julia  stata    sas
#  8:    sas      r      r    sas    sas  stata   spss   spss
#  9:  julia    sas  stata    sas  stata python python  julia
# 10:      r  stata  julia      r   spss      r  stata      r