对 data.table 取子集:在所有列上仅保留大于特定值的元素
subset data.table keeping only elements greater than certain value applied to all columns
我想对 news(见下方)取子集来创建 news2(见更下方),使其只保留 news 中含有 abs(value) > 0.01 的元素的行和列。
下面是我试过的代码:
# Attempt: collect the (row, col) index pairs of every cell whose absolute
# value exceeds 0.01.
# NOTE(review): 1:ncol(news) includes column 1 (ID), so the ID values
# themselves take part in the comparison.
gr <- data.frame(which(abs(news[, 1:ncol(news), with = FALSE]) > 0.01,
arr.ind = TRUE))
# Keep the matching rows, plus column 1 and the matching columns.
# NOTE(review): gr$col is already counted from column 1 of news, so the + 1L
# shifts every selected column one to the right and can point past the last
# column.
news2a <- news[gr$row, c(1, gr$col + 1L), with = FALSE]
# Drop, by position, every column whose name repeats an earlier column name.
news2a[, which(duplicated(names(news2a))) := NULL]
上面的代码并不总能正常工作。注:真实数据集的行数和列数都更多。
# news
ID diff.jan diff.feb diff.mar diff.apr
1: 7 -2.998852570e-13 2.764079712e-13 -3.291735832e-13 0.000000000e+00
2: 8 1.010000000e-01 -3.717073578e-13 -6.575639966e-13 -2.100269646e-13
3: 10 0.000000000e+00 -3.973537519e-13 0.000000000e+00 0.000000000e+00
4: 47 0.000000000e+00 0.000000000e+00 0.000000000e+00 -2.371100404e-13
5: 50 0.000000000e+00 -2.281689276e-13 2.192820401e-13 -1.857449127e-13
6: 79 0.000000000e+00 4.031985405e-13 -3.981825179e-13 0.000000000e+00
7: 202 6.409906781e-13 0.000000000e+00 NA 1.000000000e+01
8: 203 6.359592723e-13 0.000000000e+00 0.000000000e+00 1.100000000e+01
9: 468 2.545310002e-13 -2.426929277e-13 -2.612280890e-13 0.000000000e+00
diff.may diff.jun diff.jul diff.aug
1: 0.000000000e+00 0.000000000e+00 1.583933835e-13 1.182802403e-13
2: 0.000000000e+00 1.298306616e-13 -8.222315538e-13 9.721908246e-13
3: 0.000000000e+00 0.000000000e+00 0.000000000e+00 4.697083567e-13
4: -1.315189580e-13 6.926635309e-13 1.243841313e-13 0.000000000e+00
5: 0.000000000e+00 0.000000000e+00 0.000000000e+00 2.210000000e-01
6: 0.000000000e+00 0.000000000e+00 5.015727533e-13 0.000000000e+00
7: 0.000000000e+00 -1.073174486e-13 0.000000000e+00 0.000000000e+00
8: 0.000000000e+00 5.697594583e-13 0.000000000e+00 8.891748412e-13
9: -6.365151884e-13 1.595531286e-13 0.000000000e+00 -1.574081330e-13
# Reproducible definition of `news` (dput-style): 9 rows, an integer ID column
# and eight numeric diff.* columns (jan to aug); diff.mar contains one NA.
news <- structure(list(ID = c(7L, 8L, 10L, 47L, 50L, 79L, 202L, 203L,
468L), diff.jan = c(-2.99885257e-13, 0.101, 0, 0, 0, 0, 6.409906781e-13,
6.359592723e-13, 2.545310002e-13), diff.feb = c(2.764079712e-13,
-3.717073578e-13, -3.973537519e-13, 0, -2.281689276e-13, 4.031985405e-13,
0, 0, -2.426929277e-13), diff.mar = c(-3.291735832e-13, -6.575639966e-13,
0, 0, 2.192820401e-13, -3.981825179e-13, NA, 0, -2.61228089e-13
), diff.apr = c(0, -2.100269646e-13, 0, -2.371100404e-13, -1.857449127e-13,
0, 10, 11, 0), diff.may = c(0, 0, 0, -1.31518958e-13, 0, 0, 0,
0, -6.365151884e-13), diff.jun = c(0, 1.298306616e-13, 0, 6.926635309e-13,
0, 0, -1.073174486e-13, 5.697594583e-13, 1.595531286e-13),
diff.jul = c(1.583933835e-13,
-8.222315538e-13, 0, 1.243841313e-13, 0, 5.015727533e-13, 0,
0, 0), diff.aug = c(1.182802403e-13, 9.721908246e-13, 4.697083567e-13,
0, 0.221, 0, 0, 8.891748412e-13, -1.57408133e-13)), .Names = c("ID",
"diff.jan", "diff.feb", "diff.mar", "diff.apr", "diff.may", "diff.jun",
"diff.jul", "diff.aug"), class = c("data.table", "data.frame"
), row.names = c(NA, -9L))
news2 是我希望由上面的 news 得到的结果。
#news2
ID diff.jan diff.apr diff.aug
1: 8 0.101 NA NA
2: 50 NA NA 0.221
3: 202 NA 10 NA
4: 203 NA 11 NA
dput(news2)
# Expected result `news2` (dput-style): 4 rows with ID plus the columns
# diff.jan, diff.apr and diff.aug; each row holds one non-NA value, the
# remaining cells are NA.
news2 <- structure(list(ID = c(8L, 50L, 202L, 203L), diff.jan = c(0.101,
NA, NA, NA), diff.apr = c(NA, NA, 10L, 11L), diff.aug = c(NA,
0.221, NA, NA)), .Names = c("ID", "diff.jan", "diff.apr", "diff.aug"
), class = c("data.table", "data.frame"), row.names = c(NA, -4L
))
您能否提供可实现预期结果的代码建议?
如果您将 data.table 融化为长格式,这很容易:
# Reshape with data.table's own melt()/dcast(). The filter and the cast below
# use data.table syntax, so data.table (not the retired reshape2) is the
# package that has to be attached.
library(data.table)
# Long format: one row per ID/variable pair.
news1 <- melt(news, id.vars = "ID")
# Keep the entries whose absolute value exceeds 0.01. data.table treats an NA
# in `i` as FALSE, so the NA in diff.mar is dropped here.
news2 <- news1[abs(value) > 0.01]
# ID variable value
#1: 8 diff.jan 0.101
#2: 202 diff.apr 10.000
#3: 203 diff.apr 11.000
#4: 50 diff.aug 0.221
# Optional: cast back to wide format; ID/variable pairs removed by the filter
# show up as NA. value.var is given explicitly so dcast() does not have to
# guess the value column.
dcast(news2, ID ~ variable, value.var = "value")
# ID diff.jan diff.apr diff.aug
#1: 8 0.101 NA NA
#2: 50 NA NA 0.221
#3: 202 NA 10 NA
#4: 203 NA 11 NA
就我个人而言,我不会执行最后一步。
我想对 news(见下方)取子集来创建 news2(见更下方),使其只保留 news 中含有 abs(value) > 0.01 的元素的行和列。
下面是我试过的代码:
# Attempt: collect the (row, col) index pairs of every cell whose absolute
# value exceeds 0.01.
# NOTE(review): 1:ncol(news) includes column 1 (ID), so the ID values
# themselves take part in the comparison.
gr <- data.frame(which(abs(news[, 1:ncol(news), with = FALSE]) > 0.01,
arr.ind = TRUE))
# Keep the matching rows, plus column 1 and the matching columns.
# NOTE(review): gr$col is already counted from column 1 of news, so the + 1L
# shifts every selected column one to the right and can point past the last
# column.
news2a <- news[gr$row, c(1, gr$col + 1L), with = FALSE]
# Drop, by position, every column whose name repeats an earlier column name.
news2a[, which(duplicated(names(news2a))) := NULL]
上面的代码并不总能正常工作。注:真实数据集的行数和列数都更多。
# news
ID diff.jan diff.feb diff.mar diff.apr
1: 7 -2.998852570e-13 2.764079712e-13 -3.291735832e-13 0.000000000e+00
2: 8 1.010000000e-01 -3.717073578e-13 -6.575639966e-13 -2.100269646e-13
3: 10 0.000000000e+00 -3.973537519e-13 0.000000000e+00 0.000000000e+00
4: 47 0.000000000e+00 0.000000000e+00 0.000000000e+00 -2.371100404e-13
5: 50 0.000000000e+00 -2.281689276e-13 2.192820401e-13 -1.857449127e-13
6: 79 0.000000000e+00 4.031985405e-13 -3.981825179e-13 0.000000000e+00
7: 202 6.409906781e-13 0.000000000e+00 NA 1.000000000e+01
8: 203 6.359592723e-13 0.000000000e+00 0.000000000e+00 1.100000000e+01
9: 468 2.545310002e-13 -2.426929277e-13 -2.612280890e-13 0.000000000e+00
diff.may diff.jun diff.jul diff.aug
1: 0.000000000e+00 0.000000000e+00 1.583933835e-13 1.182802403e-13
2: 0.000000000e+00 1.298306616e-13 -8.222315538e-13 9.721908246e-13
3: 0.000000000e+00 0.000000000e+00 0.000000000e+00 4.697083567e-13
4: -1.315189580e-13 6.926635309e-13 1.243841313e-13 0.000000000e+00
5: 0.000000000e+00 0.000000000e+00 0.000000000e+00 2.210000000e-01
6: 0.000000000e+00 0.000000000e+00 5.015727533e-13 0.000000000e+00
7: 0.000000000e+00 -1.073174486e-13 0.000000000e+00 0.000000000e+00
8: 0.000000000e+00 5.697594583e-13 0.000000000e+00 8.891748412e-13
9: -6.365151884e-13 1.595531286e-13 0.000000000e+00 -1.574081330e-13
# Reproducible definition of `news` (dput-style): 9 rows, an integer ID column
# and eight numeric diff.* columns (jan to aug); diff.mar contains one NA.
news <- structure(list(ID = c(7L, 8L, 10L, 47L, 50L, 79L, 202L, 203L,
468L), diff.jan = c(-2.99885257e-13, 0.101, 0, 0, 0, 0, 6.409906781e-13,
6.359592723e-13, 2.545310002e-13), diff.feb = c(2.764079712e-13,
-3.717073578e-13, -3.973537519e-13, 0, -2.281689276e-13, 4.031985405e-13,
0, 0, -2.426929277e-13), diff.mar = c(-3.291735832e-13, -6.575639966e-13,
0, 0, 2.192820401e-13, -3.981825179e-13, NA, 0, -2.61228089e-13
), diff.apr = c(0, -2.100269646e-13, 0, -2.371100404e-13, -1.857449127e-13,
0, 10, 11, 0), diff.may = c(0, 0, 0, -1.31518958e-13, 0, 0, 0,
0, -6.365151884e-13), diff.jun = c(0, 1.298306616e-13, 0, 6.926635309e-13,
0, 0, -1.073174486e-13, 5.697594583e-13, 1.595531286e-13),
diff.jul = c(1.583933835e-13,
-8.222315538e-13, 0, 1.243841313e-13, 0, 5.015727533e-13, 0,
0, 0), diff.aug = c(1.182802403e-13, 9.721908246e-13, 4.697083567e-13,
0, 0.221, 0, 0, 8.891748412e-13, -1.57408133e-13)), .Names = c("ID",
"diff.jan", "diff.feb", "diff.mar", "diff.apr", "diff.may", "diff.jun",
"diff.jul", "diff.aug"), class = c("data.table", "data.frame"
), row.names = c(NA, -9L))
news2 是我希望由上面的 news 得到的结果。
#news2
ID diff.jan diff.apr diff.aug
1: 8 0.101 NA NA
2: 50 NA NA 0.221
3: 202 NA 10 NA
4: 203 NA 11 NA
dput(news2)
# Expected result `news2` (dput-style): 4 rows with ID plus the columns
# diff.jan, diff.apr and diff.aug; each row holds one non-NA value, the
# remaining cells are NA.
news2 <- structure(list(ID = c(8L, 50L, 202L, 203L), diff.jan = c(0.101,
NA, NA, NA), diff.apr = c(NA, NA, 10L, 11L), diff.aug = c(NA,
0.221, NA, NA)), .Names = c("ID", "diff.jan", "diff.apr", "diff.aug"
), class = c("data.table", "data.frame"), row.names = c(NA, -4L
))
您能否提供可实现预期结果的代码建议?
如果您将 data.table 融化为长格式,这很容易:
# Reshape with data.table's own melt()/dcast(). The filter and the cast below
# use data.table syntax, so data.table (not the retired reshape2) is the
# package that has to be attached.
library(data.table)
# Long format: one row per ID/variable pair.
news1 <- melt(news, id.vars = "ID")
# Keep the entries whose absolute value exceeds 0.01. data.table treats an NA
# in `i` as FALSE, so the NA in diff.mar is dropped here.
news2 <- news1[abs(value) > 0.01]
# ID variable value
#1: 8 diff.jan 0.101
#2: 202 diff.apr 10.000
#3: 203 diff.apr 11.000
#4: 50 diff.aug 0.221
# Optional: cast back to wide format; ID/variable pairs removed by the filter
# show up as NA. value.var is given explicitly so dcast() does not have to
# guess the value column.
dcast(news2, ID ~ variable, value.var = "value")
# ID diff.jan diff.apr diff.aug
#1: 8 0.101 NA NA
#2: 50 NA NA 0.221
#3: 203 NA 11 NA
#4: 203 NA 11 NA
就我个人而言,我不会执行最后一步。