比较 R 中的多列
Comparing multiple columns in R
我想比较 R
中不同大小的列的数据。
这是我的数据集。
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 5 29 53 AA2 11 56 34 AA2
129 516 34 BB5 44 217 42 BB1 36 190 39 BB1
107 436 44 AA3 29 147 53 CC7 30 155 31 CC1
57 244 53 BB6 21 108 53 BB2 14 77 61 BB4
57 227 29 AA1 21 104 39 AA6 9 48 44 BB6
80 318 47 AA2 18 89 47 CC3 37 200 44 DD3
128 529 56 BB4 43 222 54 CC1 36 202 50 CC3
31 127 53 CC1 7 38 53 DD4
18 73 47 CC2
我使用了 duplicated
函数,在列后附加了 TRUE
和 FALSE
结果,然后对结果进行了 grep,但它没有用。这是我的代码:
v=duplicated(data$pp_filename, data$filename, data$filename)
b=cbind(data, dup=v)
dupl=(b[grep("FALSE", b$dup),])
这就是我想要得到的(再次以适当的格式):
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 18 89 47 CC3 11 56 34 AA2
80 318 47 AA2 43 222 54 CC1 30 155 31 CC1
31 127 53 CC1 5 29 53 AA2 36 202 50 CC3
对我来说,简单的下标操作似乎比grep更合适。但是我还是找借口用grep了。
b <- read.table(header = TRUE, text = "
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 5 29 53 AA2 11 56 34 AA2
129 516 34 BB5 44 217 42 BB1 36 190 39 BB1
107 436 44 AA3 29 147 53 CC7 30 155 31 CC1
57 244 53 BB6 21 108 53 BB2 14 77 61 BB4
57 227 29 AA1 21 104 39 AA6 9 48 44 BB6
80 318 47 AA2 18 89 47 CC3 37 200 44 DD3
128 529 56 BB4 43 222 54 CC1 36 202 50 CC3
31 127 53 CC1 7 38 53 DD4 18 73 47 CC2
")
# Any duplicated values among the "xx_filename" columns?
filenameCols <- grep("_filename", names(b))
v <- apply(b[,filenameCols], 1, FUN = anyDuplicated)
# Save rows with colliding xx_filename columns
dupl <- b[v != 0,]
这更多是关于数据格式的建议,以便于操作。
我想,更好的数据格式是:
spl = split.default(DF, substring(names(DF), 1, 2))
lDF = do.call(rbind,
lapply(seq_along(spl),
function(i)
setNames(cbind(names(spl)[i],
spl[[i]][complete.cases(spl[[i]]), ]),
c("type", gsub("^(.*?)_", "", names(spl[[i]]))))))
lDF
# type value_1 value_2 value_3 filename
#1 mm 11 56 34 AA2
#2 mm 36 190 39 BB1
#3 mm 30 155 31 CC1
#4 mm 14 77 61 BB4
#5 mm 9 48 44 BB6
#6 mm 37 200 44 DD3
#7 mm 36 202 50 CC3
#8 nn 5 29 53 AA2
#9 nn 44 217 42 BB1
#....
然后,您可以继续(至少根据我从问题中理解的内容):
commons = Reduce(intersect, split(lDF$filename, lDF$type))
lDF[lDF$filename %in% commons, ]
# type value_1 value_2 value_3 filename
#1 mm 11 56 34 AA2
#3 mm 30 155 31 CC1
#7 mm 36 202 50 CC3
#8 nn 5 29 53 AA2
#13 nn 18 89 47 CC3
#14 nn 43 222 54 CC1
#16 pp 17 73 53 CC3
#21 pp 80 318 47 AA2
#23 pp 31 127 53 CC1
如果您想要显示的格式,可以使用一些解决方法。例如:
res = lDF[lDF$filename %in% commons, ]
tmp = split(res[-1], res[[1]])
do.call(cbind,
lapply(seq_along(tmp),
function(i)
setNames(tmp[[i]],
paste(names(tmp)[i], names(tmp[[i]]), sep = "_"))))
"DF" 是:
DF = structure(list(pp_value_1 = c(17L, 129L, 107L, 57L, 57L, 80L,
128L, 31L, 18L), pp_value_2 = c(73L, 516L, 436L, 244L, 227L,
318L, 529L, 127L, 73L), pp_value_3 = c(53L, 34L, 44L, 53L, 29L,
47L, 56L, 53L, 47L), pp_filename = structure(c(9L, 5L, 3L, 6L,
1L, 2L, 4L, 7L, 8L), .Label = c("AA1", "AA2", "AA3", "BB4", "BB5",
"BB6", "CC1", "CC2", "CC3"), class = "factor"), nn_value_1 = c(5L,
44L, 29L, 21L, 21L, 18L, 43L, 7L, NA), nn_value_2 = c(29L, 217L,
147L, 108L, 104L, 89L, 222L, 38L, NA), nn_value_3 = c(53L, 42L,
53L, 53L, 39L, 47L, 54L, 53L, NA), nn_filename = structure(c(1L,
3L, 7L, 4L, 2L, 6L, 5L, 8L, NA), .Label = c("AA2", "AA6", "BB1",
"BB2", "CC1", "CC3", "CC7", "DD4"), class = "factor"), mm_value_1 = c(11L,
36L, 30L, 14L, 9L, 37L, 36L, NA, NA), mm_value_2 = c(56L, 190L,
155L, 77L, 48L, 200L, 202L, NA, NA), mm_value_3 = c(34L, 39L,
31L, 61L, 44L, 44L, 50L, NA, NA), mm_filename = structure(c(1L,
2L, 5L, 3L, 4L, 7L, 6L, NA, NA), .Label = c("AA2", "BB1", "BB4",
"BB6", "CC1", "CC3", "DD3"), class = "factor")), .Names = c("pp_value_1",
"pp_value_2", "pp_value_3", "pp_filename", "nn_value_1", "nn_value_2",
"nn_value_3", "nn_filename", "mm_value_1", "mm_value_2", "mm_value_3",
"mm_filename"), class = "data.frame", row.names = c(NA, -9L))
我想比较 R
中不同大小的列的数据。
这是我的数据集。
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 5 29 53 AA2 11 56 34 AA2
129 516 34 BB5 44 217 42 BB1 36 190 39 BB1
107 436 44 AA3 29 147 53 CC7 30 155 31 CC1
57 244 53 BB6 21 108 53 BB2 14 77 61 BB4
57 227 29 AA1 21 104 39 AA6 9 48 44 BB6
80 318 47 AA2 18 89 47 CC3 37 200 44 DD3
128 529 56 BB4 43 222 54 CC1 36 202 50 CC3
31 127 53 CC1 7 38 53 DD4
18 73 47 CC2
我使用了 duplicated
函数,在列后附加了 TRUE
和 FALSE
结果,然后对结果进行了 grep,但它没有用。这是我的代码:
v=duplicated(data$pp_filename, data$filename, data$filename)
b=cbind(data, dup=v)
dupl=(b[grep("FALSE", b$dup),])
这就是我想要得到的(再次以适当的格式):
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 18 89 47 CC3 11 56 34 AA2
80 318 47 AA2 43 222 54 CC1 30 155 31 CC1
31 127 53 CC1 5 29 53 AA2 36 202 50 CC3
对我来说,简单的下标操作似乎比grep更合适。但是我还是找借口用grep了。
b <- read.table(header = TRUE, text = "
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 5 29 53 AA2 11 56 34 AA2
129 516 34 BB5 44 217 42 BB1 36 190 39 BB1
107 436 44 AA3 29 147 53 CC7 30 155 31 CC1
57 244 53 BB6 21 108 53 BB2 14 77 61 BB4
57 227 29 AA1 21 104 39 AA6 9 48 44 BB6
80 318 47 AA2 18 89 47 CC3 37 200 44 DD3
128 529 56 BB4 43 222 54 CC1 36 202 50 CC3
31 127 53 CC1 7 38 53 DD4 18 73 47 CC2
")
# Any duplicated values among the "xx_filename" columns?
filenameCols <- grep("_filename", names(b))
v <- apply(b[,filenameCols], 1, FUN = anyDuplicated)
# Save rows with colliding xx_filename columns
dupl <- b[v != 0,]
这更多是关于数据格式的建议,以便于操作。 我想,更好的数据格式是:
spl = split.default(DF, substring(names(DF), 1, 2))
lDF = do.call(rbind,
lapply(seq_along(spl),
function(i)
setNames(cbind(names(spl)[i],
spl[[i]][complete.cases(spl[[i]]), ]),
c("type", gsub("^(.*?)_", "", names(spl[[i]]))))))
lDF
# type value_1 value_2 value_3 filename
#1 mm 11 56 34 AA2
#2 mm 36 190 39 BB1
#3 mm 30 155 31 CC1
#4 mm 14 77 61 BB4
#5 mm 9 48 44 BB6
#6 mm 37 200 44 DD3
#7 mm 36 202 50 CC3
#8 nn 5 29 53 AA2
#9 nn 44 217 42 BB1
#....
然后,您可以继续(至少根据我从问题中理解的内容):
commons = Reduce(intersect, split(lDF$filename, lDF$type))
lDF[lDF$filename %in% commons, ]
# type value_1 value_2 value_3 filename
#1 mm 11 56 34 AA2
#3 mm 30 155 31 CC1
#7 mm 36 202 50 CC3
#8 nn 5 29 53 AA2
#13 nn 18 89 47 CC3
#14 nn 43 222 54 CC1
#16 pp 17 73 53 CC3
#21 pp 80 318 47 AA2
#23 pp 31 127 53 CC1
如果您想要显示的格式,可以使用一些解决方法。例如:
res = lDF[lDF$filename %in% commons, ]
tmp = split(res[-1], res[[1]])
do.call(cbind,
lapply(seq_along(tmp),
function(i)
setNames(tmp[[i]],
paste(names(tmp)[i], names(tmp[[i]]), sep = "_"))))
"DF" 是:
DF = structure(list(pp_value_1 = c(17L, 129L, 107L, 57L, 57L, 80L,
128L, 31L, 18L), pp_value_2 = c(73L, 516L, 436L, 244L, 227L,
318L, 529L, 127L, 73L), pp_value_3 = c(53L, 34L, 44L, 53L, 29L,
47L, 56L, 53L, 47L), pp_filename = structure(c(9L, 5L, 3L, 6L,
1L, 2L, 4L, 7L, 8L), .Label = c("AA1", "AA2", "AA3", "BB4", "BB5",
"BB6", "CC1", "CC2", "CC3"), class = "factor"), nn_value_1 = c(5L,
44L, 29L, 21L, 21L, 18L, 43L, 7L, NA), nn_value_2 = c(29L, 217L,
147L, 108L, 104L, 89L, 222L, 38L, NA), nn_value_3 = c(53L, 42L,
53L, 53L, 39L, 47L, 54L, 53L, NA), nn_filename = structure(c(1L,
3L, 7L, 4L, 2L, 6L, 5L, 8L, NA), .Label = c("AA2", "AA6", "BB1",
"BB2", "CC1", "CC3", "CC7", "DD4"), class = "factor"), mm_value_1 = c(11L,
36L, 30L, 14L, 9L, 37L, 36L, NA, NA), mm_value_2 = c(56L, 190L,
155L, 77L, 48L, 200L, 202L, NA, NA), mm_value_3 = c(34L, 39L,
31L, 61L, 44L, 44L, 50L, NA, NA), mm_filename = structure(c(1L,
2L, 5L, 3L, 4L, 7L, 6L, NA, NA), .Label = c("AA2", "BB1", "BB4",
"BB6", "CC1", "CC3", "DD3"), class = "factor")), .Names = c("pp_value_1",
"pp_value_2", "pp_value_3", "pp_filename", "nn_value_1", "nn_value_2",
"nn_value_3", "nn_filename", "mm_value_1", "mm_value_2", "mm_value_3",
"mm_filename"), class = "data.frame", row.names = c(NA, -9L))