R 中的条件交集
Conditional Intersections in R
我想筛选出同时出现在两个表中的动物(交集条件 1),并且这些动物在不同表的同一类别下具有相同的大小(交集条件 2)。有没有高效的实现方法——例如使用 dplyr?
library(dplyr)
# Sample table 1: one row per animal with its type, size and source table.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
animal1 <- data.frame(
  type = c("cat", "dog", "dog", "bird", "elephant"),
  size = c("small", "large", "small", "medium", "large"),
  tableName = rep("animal1", 5),
  stringsAsFactors = FALSE
)
#       type   size tableName
# 1      cat  small   animal1
# 2      dog  large   animal1
# 3      dog  small   animal1
# 4     bird medium   animal1
# 5 elephant  large   animal1
# Sample table 2: same columns as animal1, tagged with its own table name.
animal2 <- data.frame(
  type = c("elephant", "dog", "dog", "elephant", "elephant"),
  size = c("medium", "large", "large", "small", "large"),
  tableName = rep("animal2", 5),
  stringsAsFactors = FALSE
)
#       type   size tableName
# 1 elephant medium   animal2
# 2      dog  large   animal2
# 3      dog  large   animal2
# 4 elephant  small   animal2
# 5 elephant  large   animal2
# Stack both tables; tableName records which table each row came from.
rbindAnimal <- rbind(animal1, animal2)
# type size tableName
# 1 cat small animal1
# 2 dog large animal1
# 3 dog small animal1
# 4 bird medium animal1
# 5 elephant large animal1
# 6 elephant medium animal2
# 7 dog large animal2
# 8 dog large animal2
# 9 elephant small animal2
# 10 elephant large animal2
# Intersection across both tables
# One-column data frame of the types that occur in both source tables.
intersectType <- intersect(
  select(filter(rbindAnimal, tableName == "animal1"), type),
  select(filter(rbindAnimal, tableName == "animal2"), type)
)
#       type
# 1 elephant
# 2      dog
# Keep only the rows whose type is present in both tables.
rbindAnimal <- rbindAnimal[rbindAnimal$type %in% intersectType$type, ]
# type size tableName
# 2 dog large animal1
# 3 dog small animal1
# 5 elephant large animal1
# 6 elephant medium animal2
# 7 dog large animal2
# 8 dog large animal2
# 9 elephant small animal2
# 10 elephant large animal2
# Needs to return row numbers! Here: 2,5,7,8, and 10
# type size tableName
# 2 dog large animal1
# 5 elephant large animal1
# 7 dog large animal2
# 8 dog large animal2
# 10 elephant large animal2
解决方案:使用 merge / semi_join / anti_join(感谢 @Imo 关于 merge 的提示!)
library(dplyr)
# Sample table 1: one row per animal with its type, size and source table.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
animal1 <- data.frame(
  type = c("cat", "dog", "dog", "bird", "elephant"),
  size = c("small", "large", "small", "medium", "large"),
  tableName = rep("animal1", 5),
  stringsAsFactors = FALSE
)
#       type   size tableName
# 1      cat  small   animal1
# 2      dog  large   animal1
# 3      dog  small   animal1
# 4     bird medium   animal1
# 5 elephant  large   animal1
# Sample table 2: same columns as animal1, tagged with its own table name.
animal2 <- data.frame(
  type = c("elephant", "dog", "dog", "elephant", "elephant"),
  size = c("medium", "large", "large", "small", "large"),
  tableName = rep("animal2", 5),
  stringsAsFactors = FALSE
)
#       type   size tableName
# 1 elephant medium   animal2
# 2      dog  large   animal2
# 3      dog  large   animal2
# 4 elephant  small   animal2
# 5 elephant  large   animal2
# Stack both tables; tableName records which table each row came from.
rbindAnimal <- rbind(animal1, animal2)
# Full outer join on (type, size): tableName.x / tableName.y is NA wherever a
# combination occurs in only one of the two tables.
mergedAnimals <- merge(animal1, animal2, by = c("type", "size"), all = TRUE)
# Combinations found in both tables, one row each.
sharedTypeSize <- mergedAnimals[complete.cases(mergedAnimals), ] %>%
  select(type, size) %>%
  unique()
# Pull the matching rows (with their tableName) back out of the stacked table.
sharedTypeSize <- merge(rbindAnimal, sharedTypeSize, by = c("type", "size"))
# Rows whose (type, size) combination is shared between the tables.
# Join keys are named explicitly instead of relying on a natural join.
semi_join(rbindAnimal, sharedTypeSize, by = c("type", "size"))
#       type  size tableName
# 1      dog large   animal1
# 2      dog large   animal2
# 3      dog large   animal2
# 4 elephant large   animal1
# 5 elephant large   animal2
# Rows whose (type, size) combination is NOT shared between the tables.
anti_join(rbindAnimal, sharedTypeSize, by = c("type", "size"))
# type size tableName
# 1 cat small animal1
# 2 dog small animal1
# 3 bird medium animal1
# 4 elephant medium animal2
# 5 elephant small animal2
"Needs to return row numbers!"
使用 data.table 中的 .I
非常简单,它存储行号:
library(data.table)
# Convert the stacked table to a data.table (no reassignment needed).
setDT(rbindAnimal)
# Row numbers (.I) of every (type, size) group that spans more than one table;
# groups for which the `if` yields NULL contribute nothing.
w <- rbindAnimal[, if (uniqueN(tableName) > 1L) .I, by = .(type, size)]$V1
# [1] 2 7 8 5 10
# Exclude those rows with a logical mask rather than `-w`: when no group is
# shared, `w` is empty and negative indexing with an empty vector would
# select zero rows instead of keeping every row.
isShared <- seq_len(nrow(rbindAnimal)) %in% w
rbindAnimal[!isShared]
# type size tableName
# 1: cat small animal1
# 2: dog small animal1
# 3: bird medium animal1
# 4: elephant medium animal2
# 5: elephant small animal2
这里我们直接按行号排除行,而不是使用 anti-join(如 OP 的回答那样)。
工作原理
`uniqueN` 用于统计唯一值的个数。OP 的条件(转述)是:某个 type-size 组合同时出现在两个表中。这可以转换为:在按 `by=.(type, size)` 分组得到的每一组行中,`uniqueN(tableName) > 1L`。
如果条件成立,`if (cond) x` 返回 `x`;否则返回 `NULL`,该组随之被丢弃。
dplyr 变体
它在 dplyr 中也能正常工作(尽管我不确定如何获取行号):
# Keep only the (type, size) groups that occur in a single table, then drop
# the grouping so later verbs operate on the whole table again.
rbindAnimal %>%
  group_by(type, size) %>%
  filter(n_distinct(tableName) == 1L) %>%
  ungroup()
# type size tableName
# <chr> <chr> <chr>
# 1 cat small animal1
# 2 dog small animal1
# 3 bird medium animal1
# 4 elephant medium animal2
# 5 elephant small animal2
我想筛选出同时出现在两个表中的动物(交集条件 1),并且这些动物在不同表的同一类别下具有相同的大小(交集条件 2)。有没有高效的实现方法——例如使用 dplyr?
library(dplyr)
# Sample table 1: one row per animal with its type, size and source table.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
animal1 <- data.frame(
  type = c("cat", "dog", "dog", "bird", "elephant"),
  size = c("small", "large", "small", "medium", "large"),
  tableName = rep("animal1", 5),
  stringsAsFactors = FALSE
)
#       type   size tableName
# 1      cat  small   animal1
# 2      dog  large   animal1
# 3      dog  small   animal1
# 4     bird medium   animal1
# 5 elephant  large   animal1
# Sample table 2: same columns as animal1, tagged with its own table name.
animal2 <- data.frame(
  type = c("elephant", "dog", "dog", "elephant", "elephant"),
  size = c("medium", "large", "large", "small", "large"),
  tableName = rep("animal2", 5),
  stringsAsFactors = FALSE
)
#       type   size tableName
# 1 elephant medium   animal2
# 2      dog  large   animal2
# 3      dog  large   animal2
# 4 elephant  small   animal2
# 5 elephant  large   animal2
# Stack both tables; tableName records which table each row came from.
rbindAnimal <- rbind(animal1, animal2)
# type size tableName
# 1 cat small animal1
# 2 dog large animal1
# 3 dog small animal1
# 4 bird medium animal1
# 5 elephant large animal1
# 6 elephant medium animal2
# 7 dog large animal2
# 8 dog large animal2
# 9 elephant small animal2
# 10 elephant large animal2
# Intersection across both tables
# One-column data frame of the types that occur in both source tables.
intersectType <- intersect(
  select(filter(rbindAnimal, tableName == "animal1"), type),
  select(filter(rbindAnimal, tableName == "animal2"), type)
)
#       type
# 1 elephant
# 2      dog
# Keep only the rows whose type is present in both tables.
rbindAnimal <- rbindAnimal[rbindAnimal$type %in% intersectType$type, ]
# type size tableName
# 2 dog large animal1
# 3 dog small animal1
# 5 elephant large animal1
# 6 elephant medium animal2
# 7 dog large animal2
# 8 dog large animal2
# 9 elephant small animal2
# 10 elephant large animal2
# Needs to return row numbers! Here: 2,5,7,8, and 10
# type size tableName
# 2 dog large animal1
# 5 elephant large animal1
# 7 dog large animal2
# 8 dog large animal2
# 10 elephant large animal2
解决方案:使用 merge / semi_join / anti_join(感谢 @Imo 关于 merge 的提示!)
library(dplyr)
# Sample table 1: one row per animal with its type, size and source table.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
animal1 <- data.frame(
  type = c("cat", "dog", "dog", "bird", "elephant"),
  size = c("small", "large", "small", "medium", "large"),
  tableName = rep("animal1", 5),
  stringsAsFactors = FALSE
)
#       type   size tableName
# 1      cat  small   animal1
# 2      dog  large   animal1
# 3      dog  small   animal1
# 4     bird medium   animal1
# 5 elephant  large   animal1
# Sample table 2: same columns as animal1, tagged with its own table name.
animal2 <- data.frame(
  type = c("elephant", "dog", "dog", "elephant", "elephant"),
  size = c("medium", "large", "large", "small", "large"),
  tableName = rep("animal2", 5),
  stringsAsFactors = FALSE
)
#       type   size tableName
# 1 elephant medium   animal2
# 2      dog  large   animal2
# 3      dog  large   animal2
# 4 elephant  small   animal2
# 5 elephant  large   animal2
# Stack both tables; tableName records which table each row came from.
rbindAnimal <- rbind(animal1, animal2)
# Full outer join on (type, size): tableName.x / tableName.y is NA wherever a
# combination occurs in only one of the two tables.
mergedAnimals <- merge(animal1, animal2, by = c("type", "size"), all = TRUE)
# Combinations found in both tables, one row each.
sharedTypeSize <- mergedAnimals[complete.cases(mergedAnimals), ] %>%
  select(type, size) %>%
  unique()
# Pull the matching rows (with their tableName) back out of the stacked table.
sharedTypeSize <- merge(rbindAnimal, sharedTypeSize, by = c("type", "size"))
# Rows whose (type, size) combination is shared between the tables.
# Join keys are named explicitly instead of relying on a natural join.
semi_join(rbindAnimal, sharedTypeSize, by = c("type", "size"))
#       type  size tableName
# 1      dog large   animal1
# 2      dog large   animal2
# 3      dog large   animal2
# 4 elephant large   animal1
# 5 elephant large   animal2
# Rows whose (type, size) combination is NOT shared between the tables.
anti_join(rbindAnimal, sharedTypeSize, by = c("type", "size"))
# type size tableName
# 1 cat small animal1
# 2 dog small animal1
# 3 bird medium animal1
# 4 elephant medium animal2
# 5 elephant small animal2
"Needs to return row numbers!"
使用 data.table 中的 .I
非常简单,它存储行号:
library(data.table)
# Convert the stacked table to a data.table (no reassignment needed).
setDT(rbindAnimal)
# Row numbers (.I) of every (type, size) group that spans more than one table;
# groups for which the `if` yields NULL contribute nothing.
w <- rbindAnimal[, if (uniqueN(tableName) > 1L) .I, by = .(type, size)]$V1
# [1] 2 7 8 5 10
# Exclude those rows with a logical mask rather than `-w`: when no group is
# shared, `w` is empty and negative indexing with an empty vector would
# select zero rows instead of keeping every row.
isShared <- seq_len(nrow(rbindAnimal)) %in% w
rbindAnimal[!isShared]
# type size tableName
# 1: cat small animal1
# 2: dog small animal1
# 3: bird medium animal1
# 4: elephant medium animal2
# 5: elephant small animal2
这里我们直接按行号排除行,而不是使用 anti-join(如 OP 的回答那样)。
工作原理
`uniqueN` 用于统计唯一值的个数。OP 的条件(转述)是:某个 type-size 组合同时出现在两个表中。这可以转换为:在按 `by=.(type, size)` 分组得到的每一组行中,`uniqueN(tableName) > 1L`。
如果条件成立,`if (cond) x` 返回 `x`;否则返回 `NULL`,该组随之被丢弃。
dplyr 变体
它在 dplyr 中也能正常工作(尽管我不确定如何获取行号):
# Keep only the (type, size) groups that occur in a single table, then drop
# the grouping so later verbs operate on the whole table again.
rbindAnimal %>%
  group_by(type, size) %>%
  filter(n_distinct(tableName) == 1L) %>%
  ungroup()
# type size tableName
# <chr> <chr> <chr>
# 1 cat small animal1
# 2 dog small animal1
# 3 bird medium animal1
# 4 elephant medium animal2
# 5 elephant small animal2