R 中的条件交集

Conditional Intersections in R

我希望筛选出同时出现在两个表中的动物(交集条件 1),并且这些动物在不同表的同一类型中还具有相同的大小(交集条件 2)。有没有一种高效的实现方法——例如使用 dplyr?

library(dplyr)
# Example table 1: one row per animal, tagged with the table it came from
# so the origin is still known after the tables are stacked.
animal1 <- data.frame(
  type = c("cat", "dog", "dog", "bird", "elephant"),
  size = c("small", "large", "small", "medium", "large"),
  tableName = rep("animal1", 5),
  # Spelled out as FALSE: `F` is an ordinary variable and can be reassigned.
  stringsAsFactors = FALSE
)
      #       type   size tableName
      # 1      cat  small   animal1
      # 2      dog  large   animal1
      # 3      dog  small   animal1
      # 4     bird medium   animal1
      # 5 elephant  large   animal1

# Example table 2: same columns as animal1, different animals.
animal2 <- data.frame(
  type = c("elephant", "dog", "dog", "elephant", "elephant"),
  size = c("medium", "large", "large", "small", "large"),
  tableName = rep("animal2", 5),
  stringsAsFactors = FALSE
)
      #      type   size tableName
      # 1 elephant medium   animal2
      # 2      dog  large   animal2
      # 3      dog  large   animal2
      # 4 elephant  small   animal2
      # 5 elephant  large   animal2


# Stack the two tables: rows 1-5 come from animal1, rows 6-10 from animal2.
rbindAnimal <- rbind(animal1, animal2)
      #        type   size tableName
      # 1       cat  small   animal1
      # 2       dog  large   animal1
      # 3       dog  small   animal1
      # 4      bird medium   animal1
      # 5  elephant  large   animal1
      # 6  elephant medium   animal2
      # 7       dog  large   animal2
      # 8       dog  large   animal2
      # 9  elephant  small   animal2
      # 10 elephant  large   animal2

# Condition 1: animal types that occur in both tables.
# Both arguments are one-column data frames (`type`), one per source table.
intersectType <- intersect(
  rbindAnimal %>% filter(tableName == "animal1") %>% select(type),
  rbindAnimal %>% filter(tableName == "animal2") %>% select(type)
)
      #     type
      # 1 elephant
      # 2      dog

# Keep only the rows whose type is in that intersection. Base indexing is
# used so the original row names (row numbers) are preserved.
isSharedType <- rbindAnimal$type %in% intersectType$type
rbindAnimal <- rbindAnimal[isSharedType, ]

      #        type   size tableName
      # 2       dog  large   animal1
      # 3       dog  small   animal1
      # 5  elephant  large   animal1
      # 6  elephant medium   animal2
      # 7       dog  large   animal2
      # 8       dog  large   animal2
      # 9  elephant  small   animal2
      # 10 elephant  large   animal2

# Still missing: condition 2 (same size within the same type across tables).
# The result needs to be identified by row number! Here: 2, 5, 7, 8 and 10
#        type   size tableName
# 2       dog  large   animal1
# 5  elephant  large   animal1
# 7       dog  large   animal2
# 8       dog  large   animal2
# 10 elephant  large   animal2

解决方案:使用 merge/semi_join/anti_join(感谢 @Imo 关于 merge 的提示!)

library(dplyr)
# Example table 1: one row per animal, tagged with its source table name.
animal1 <- data.frame(
  type = c("cat", "dog", "dog", "bird", "elephant"),
  size = c("small", "large", "small", "medium", "large"),
  tableName = rep("animal1", 5),
  # Spelled out as FALSE: `F` is an ordinary variable and can be reassigned.
  stringsAsFactors = FALSE
)
      #       type   size tableName
      # 1      cat  small   animal1
      # 2      dog  large   animal1
      # 3      dog  small   animal1
      # 4     bird medium   animal1
      # 5 elephant  large   animal1

# Example table 2: same columns as animal1, different animals.
animal2 <- data.frame(
  type = c("elephant", "dog", "dog", "elephant", "elephant"),
  size = c("medium", "large", "large", "small", "large"),
  tableName = rep("animal2", 5),
  stringsAsFactors = FALSE
)
      #      type   size tableName
      # 1 elephant medium   animal2
      # 2      dog  large   animal2
      # 3      dog  large   animal2
      # 4 elephant  small   animal2
      # 5 elephant  large   animal2

# Stack both tables so every row keeps its source-table label.
rbindAnimal <- rbind(animal1, animal2)

# Full outer join on (type, size): a combination that exists in only one
# table gets NA in the other table's tableName column.
mergedAnimals <- merge(animal1, animal2, by = c("type", "size"), all = TRUE)

# Rows without any NA are the combinations present in both tables; reduce
# them to the distinct (type, size) pairs.
sharedTypeSize <- mergedAnimals[complete.cases(mergedAnimals), ] %>%
  select(type, size) %>%
  unique()

# Look the shared pairs up in the stacked table; sharedTypeSize now also
# carries the tableName column of the matching rows.
sharedTypeSize <- merge(rbindAnimal, sharedTypeSize)

# Rows whose (type, size) combination occurs in both tables. The join keys
# are stated explicitly instead of relying on all common columns.
semi_join(rbindAnimal, sharedTypeSize, by = c("type", "size"))
      #        type  size tableName
      # 1      dog large   animal1
      # 2      dog large   animal2
      # 3      dog large   animal2
      # 4 elephant large   animal1
      # 5 elephant large   animal2

# Rows whose (type, size) combination occurs in one table only.
anti_join(rbindAnimal, sharedTypeSize, by = c("type", "size"))

      #       type   size tableName
      # 1      cat  small   animal1
      # 2      dog  small   animal1
      # 3     bird medium   animal1
      # 4 elephant medium   animal2
      # 5 elephant  small   animal2

"Needs to return row numbers!"

使用 data.table 中的 .I 非常简单,它存储行号:

library(data.table)
# Convert rbindAnimal to a data.table so the [i, j, by] syntax is available.
setDT(rbindAnimal)

# Row numbers (.I) of every (type, size) group whose rows come from more
# than one table; groups that fail the test contribute nothing.
w <- rbindAnimal[, if (uniqueN(tableName) > 1L) .I, by = .(type, size)]$V1
# [1]  2  7  8  5 10

# Keep every row number that is not in w. The complement is built
# explicitly rather than with -w: when no group qualifies, w is empty, and
# an empty negative index selects zero rows in base R instead of all rows.
rbindAnimal[setdiff(seq_len(nrow(rbindAnimal)), w)]
#        type   size tableName
# 1:      cat  small   animal1
# 2:      dog  small   animal1
# 3:     bird medium   animal1
# 4: elephant medium   animal2
# 5: elephant  small   animal2

这里我们直接按行号排除行,而不是像 OP 的回答那样使用 anti-join。

工作原理

  • uniqueN 统计唯一值的个数。 OP的条件是(释义):

    The type-size combo appears in both tables.

    转换为

    uniqueN(tableName) > 1L in the by=.(type, size) group of rows.

  • 如果条件成立,.I 给出该组各行的行号。
  • if (cond) x 在条件为真时给出 x,否则给出 NULL,从而丢弃该组。


dplyr 变体

它在 dplyr 中也能正常工作(尽管我不确定如何获取行号):

# Keep the rows whose (type, size) combination is seen in exactly one
# table, then drop the grouping so it does not leak into later steps.
rbindAnimal %>%
  group_by(type, size) %>%
  filter(n_distinct(tableName) == 1L) %>%
  ungroup()
#       type   size tableName
#      <chr>  <chr>     <chr>
# 1      cat  small   animal1
# 2      dog  small   animal1
# 3     bird medium   animal1
# 4 elephant medium   animal2
# 5 elephant  small   animal2