根据另一个 ID 代码有条件地删除一行

Conditionally remove a row based on another id code

在包含许多 ID 的数据集中,我只尝试操作 ID 为 7 或 9 的行,并保持其他所有内容不变。

我试图在所有没有对应变量的情况下有条件地从 7 或 9 中删除一行。因此,如果在下面的 dput 示例中,我想从 id=9 中删除第九行,因为 id=7 没有 itemcode=2。反之亦然,对于 id=7,我试图删除它的 itemcode=9,因为 id=9 没有它。

id  client    item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
...
7   7     Bob  eighth        8  100    13    18    15    NA    NA    NA    NA
8   7     Bob   ninth        9  100    11    21    10    NA    NA    NA    NA
9   9 Bob_new   first        1  100    NA    NA    NA    23    18    25    18

代码:

structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob", 
"Bob_new", "Mark"), class = "factor"), item = structure(c(3L, 
9L, 4L, 2L, 8L, 7L, 1L, 5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L
), .Label = c("eighth", "fifth", "first", "fourth", "ninth", 
"second", "seventh", "sixth", "third"), class = "factor"), itemcode = c(1L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L
), unit = c(100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = structure(c(5L, 
6L, 1L, 4L, 2L, 5L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L
), .Label = c("11", "12", "13", "22", "24", "25", "NA"), class = "factor"), 
X2002 = structure(c(4L, 8L, 1L, 3L, 7L, 2L, 5L, 6L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("13", "14", "15", 
"17", "18", "21", "22", "24", "NA"), class = "factor"), X2003 = structure(c(5L, 
1L, 4L, 2L, 6L, 1L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L), .Label = c("10", "11", "15", "19", "23", "24", "NA"), class = "factor"), 
X2004 = structure(c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 5L, 4L, 
2L, 6L, 1L, 3L, 4L, 3L, 4L), .Label = c("11", "14", "15", 
"20", "23", "25", "NA"), class = "factor"), X2005 = structure(c(6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L, 2L, 4L, 3L, 5L, 3L, 1L, 4L, 
3L), .Label = c("11", "13", "18", "19", "25", "NA"), class = "factor"), 
X2006 = structure(c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 8L, 6L, 
1L, 2L, 5L, 3L, 7L, 8L, 4L), .Label = c("10", "15", "18", 
"19", "20", "22", "23", "25", "NA"), class = "factor"), X2007 =     structure(c(8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 4L, 7L, 6L, 2L, 4L, 1L, 5L, 5L, 
3L), .Label = c("12", "13", "16", "18", "19", "21", "24", 
"NA"), class = "factor")), .Names = c("id", "client", "item", 
"itemcode", "unit", "X2001", "X2002", "X2003", "X2004", "X2005", 
"X2006", "X2007"), class = "data.frame", row.names = c(NA, -17L
))

—————————————————————————————————————— 另一个场景:

之前:

structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"), 
item = structure(c(3L, 9L, 10L, 9L, 4L, 2L, 8L, 7L, 7L, 1L, 
5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L), .Label = c("eighth", 
"fifth", "first", "fourth", "ninth", "second", "seventh", 
"sixth", "third", "third "), class = "factor"), itemcode = c(1L, 
3L, 3L, 3L, 4L, 5L, 6L, 7L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 
6L, 7L, 8L, 1L), type = structure(c(1L, 1L, 2L, 3L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", 
"B", "C"), class = "factor"), unit = c(100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L, 
25L, 30L, 26L, 11L, 22L, 12L, 25L, 24L, 13L, 11L, NA, NA, 
NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L, 24L, 12L, 96L, 
13L, 15L, 22L, 21L, 14L, 18L, 21L, NA, NA, NA, NA, NA, NA, 
NA, NA, NA), X2003 = c(23L, 10L, 46L, 94L, 19L, 11L, 24L, 
19L, 10L, 15L, 10L, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
X2004 = c(NA, NA, 43L, 83L, NA, NA, NA, 6L, NA, NA, NA, 23L, 
20L, 14L, 25L, 11L, 15L, 20L, 15L, 20L), X2005 = c(NA, NA, 
97L, 86L, NA, NA, NA, 17L, NA, NA, NA, 18L, 13L, 19L, 18L, 
25L, 18L, 11L, 19L, 18L), X2006 = c(NA, NA, 11L, 91L, NA, 
NA, NA, 11L, NA, NA, NA, 25L, 22L, 10L, 15L, 20L, 18L, 23L, 
25L, 19L), X2007 = c(NA, NA, 19L, 27L, NA, NA, NA, 15L, NA, 
NA, NA, 18L, 24L, 21L, 13L, 18L, 12L, 19L, 19L, 16L)), .Names = c("id", 
"client", "item", "itemcode", "type", "unit", "X2001", "X2002", 
"X2003", "X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA, 
-20L))

之后:

structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 
9L, 9L, 10L), client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"), 
item = structure(c(2L, 7L, 3L, 1L, 5L, 4L, 2L, 6L, 3L, 1L, 
5L, 4L, 2L), .Label = c("fifth", "first", "fourth", "seventh", 
"sixth", "third", "third "), class = "factor"), itemcode = c(1L, 
3L, 4L, 5L, 6L, 7L, 1L, 3L, 4L, 5L, 6L, 7L, 1L), type = structure(c(1L, 
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", 
"B"), class = "factor"), unit = c(100L, 100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L, 
10L, 11L, 22L, 12L, 17L, NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L, 
87L, 13L, 15L, 22L, 19L, NA, NA, NA, NA, NA, NA, NA), X2003 = c(23L, 
47L, 19L, 11L, 24L, 17L, NA, NA, NA, NA, NA, NA, NA), X2004 = c(NA, 
28L, NA, NA, NA, 28L, 23L, 14L, 25L, 11L, 15L, 20L, 20L), 
X2005 = c(NA, 43L, NA, NA, NA, 16L, 18L, 19L, 18L, 25L, 18L, 
11L, 18L), X2006 = c(NA, 69L, NA, NA, NA, 5L, 25L, 10L, 15L, 
20L, 18L, 23L, 19L), X2007 = c(NA, 72L, NA, NA, NA, 20L, 
18L, 21L, 13L, 18L, 12L, 19L, 16L)), .Names = c("id", "client", 
"item", "itemcode", "type", "unit", "X2001", "X2002", "X2003", 
"X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA, 
-13L))

我可以实现上述过滤器代码以删除其相应位置(id 7 和 9)中不存在的项目。

但是如果项目有子级别,比如项目类型。如果项目在相应字段中没有类似类型,我也会尝试删除它们。

如果我理解这个问题:id=9 子集中的每个 itemcode 必须在 id=7 子集中具有相同的 itemcode (和反向)。如果不是这种情况,那么我们过滤掉带有非配对项目代码的行,但保留所有 ID 不在 7 或 9 中的内容。这是一种方法:

先获取常用物品代码:

items_9 <- df_all$itemcode[ df_all$id==9 ] 
items_7 <- df_all$itemcode[ df_all$id==7 ] 
items_common <- items_9[ items_9 %in% items_7 ]

select 7 和 9 的所有项目代码以及其余的:

df_new <- df_all[ 
  which( 
    ( df_all$id %in% c(7, 9) & 
      df_all$itemcode %in% items_common
    ) |
    !df_all$id %in% c(7,9)
  )
,]

你可以做这样的事情,它在两个方向上运行 setdiffcl() 函数并不是必需的,但我真的不喜欢一遍又一遍地写相同的表达式。

f <- function(x, y) setdiff(union(x, y), x)
cl <- function(var) substitute(df$itemcode[df$id == x], list(x = var))

所以现在您可以在 c(id7, id9) 上调用 f() 然后反转它并得到 c(id9, id7) 结果。

do.call(f, x <- list(cl(7), cl(9)))
# [1] 2
do.call(f, rev(x))
# [1] 9

您可以使用 dplyr

中的 filter
library(dplyr)
filter(df_all, itemcode %in% intersect(itemcode[id==7], 
              itemcode[id==9])|!id %in% c(7,9) )
#    id  client    item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
#1   7     Bob   first        1  100    24    17    23    NA    NA    NA    NA
#2   7     Bob   third        3  100    25    24    10    NA    NA    NA    NA
#3   7     Bob  fourth        4  100    11    13    19    NA    NA    NA    NA
#4   7     Bob   fifth        5  100    22    15    11    NA    NA    NA    NA
#5   7     Bob   sixth        6  100    12    22    24    NA    NA    NA    NA
#6   7     Bob seventh        7  100    24    14    10    NA    NA    NA    NA
#7   7     Bob  eighth        8  100    13    18    15    NA    NA    NA    NA
#8   9 Bob_new   first        1  100    NA    NA    NA    23    18    25    18
#9   9 Bob_new   third        3  100    NA    NA    NA    14    19    10    21
#10  9 Bob_new  fourth        4  100    NA    NA    NA    25    18    15    13
#11  9 Bob_new   fifth        5  100    NA    NA    NA    11    25    20    18
#12  9 Bob_new   sixth        6  100    NA    NA    NA    15    18    18    12
#13  9 Bob_new seventh        7  100    NA    NA    NA    20    11    23    19
#14  9 Bob_new  eighth        8  100    NA    NA    NA    15    19    25    19
#15 10    Mark   first        1  100    NA    NA    NA    20    18    19    16

更新

基于新的数据集,也许这有帮助

library(dplyr)
library(tidyr)
dfnew %>%
      unite(itemtype, itemcode,type) %>% 
      filter(itemtype %in% intersect(itemtype[id==7], 
             itemtype[id==9])|!id %in% c(7,9)) %>% 
      separate(itemtype, c('itemcode', 'type'))
#    id  client    item itemcode type unit X2001 X2002 X2003 X2004 X2005 X2006
# 1   7     Bob   first        1    A  100    24    17    23    NA    NA    NA
# 2   7     Bob  third         3    B  100    30    12    46    43    97    11
# 3   7     Bob  fourth        4    A  100    11    13    19    NA    NA    NA
# 4   7     Bob   fifth        5    A  100    22    15    11    NA    NA    NA
# 5   7     Bob   sixth        6    A  100    12    22    24    NA    NA    NA
# 6   7     Bob seventh        7    A  100    25    21    19     6    17    11
# 7   9 Bob_new   first        1    A  100    NA    NA    NA    23    18    25
# 8   9 Bob_new   third        3    B  100    NA    NA    NA    14    19    10
# 9   9 Bob_new  fourth        4    A  100    NA    NA    NA    25    18    15
# 10  9 Bob_new   fifth        5    A  100    NA    NA    NA    11    25    20
# 11  9 Bob_new   sixth        6    A  100    NA    NA    NA    15    18    18
# 12  9 Bob_new seventh        7    A  100    NA    NA    NA    20    11    23
# 13 10    Mark   first        1    A  100    NA    NA    NA    20    18    19
 #   X2007
 #1     NA
 #2     19
 #3     NA
 #4     NA
 #5     NA
 #6     15
 #7     18
 #8     21
 #9     13
 #10    18
 #11    12
 #12    19
 #13    16
library(dplyr)
df$remove <- paste(df$itemcode, df$type)
df<-invisible(filter(df,
                          remove %in% intersect(remove[type==7], 
                                                remove[type==9])|!type %in%     c(7,9) ))
#Remove the additional column after filter
df$remove <- NULL