在 R 中使用另一个阈值表替换数据框中的数据
Replace data from dataframe using other threshold table in R
我有 2 个数据框,第一个有很多个月的数据,第二个有阈值(最小值和最大值,每个月都不同)。现在我想用 NA 替换阈值之外的任何值。
数据帧的结构如下:
数据包含名称为 "month"、"a"、"b" 和 "c" 的列。阈值有 "month"、"a.min"、"a.max"、"b.min" 和 "b.max".
# Threshold table (looks like dput() output): one row per month (1-3) holding
# the bounds for column a (a.min, a.max) and for column b (b.min, b.max).
thresholds <- structure(list(month = 1:3, a.min = c(1L, 2L, 0L), a.max = c(9L, 8L, 3L), b.min = c(50L, 60L, 30L), b.max = c(70L, 75L, 90L)), .Names = c("month", "a.min", "a.max", "b.min", "b.max"), row.names = c(NA, -3L), class = "data.frame")
df <- structure(list(a = c(3.693, 0.534, 3.068, 2.633, 3.047, 3.072, 3.278, 3.533, 3.406, 2.893, 2.722, 0.513, 1.994, 1.743, 1.958, 2.03, 2.222, 2.207, 2.393, 2.731, 15.464, 4.065, 3.458, 3.142, 2.705, 17.285, 1.794, 2.139, 2.455, 2.83, 3.008, 3.358, 3.663, 2.936, 2.636, 2.42, 3.403, 2.83, 2.74, 3.119, 2.376, 3.285, 3.267, 2.966, 3.675, 2.803, 3.097, 3.381, 2.774, 3.335, 3.857, 2.854, 3.093, 2.368, 2.8, 2.643, 3.047, 2.559, 2.119, 1.712, 1.614, 1.474, 1.82, 2.147, 2.405, 2.543, 2.374, 2.962, 3.375, 3.002, 2.785, 2.643, 2.304, 2.052, 2.116, 2.203, 2.574, 2.537, 2.306, 1.316, 2.164, 1.855, 1.501, 1.331, 1.417, 1.158, 0.792, 0.183, 0.567, 1.406, 0.975, 1.48, 0.473, 0.689, 0.046, 0.498, 1.847, 2.079, 2.454, 3.372), b = c(72.26, 77.25, 72.3, 75.79, 72.98, 83.6, 79.16, 80.9, 80.2, 80.2, 73.33, 72, 63.7, 47.14, 30.86, 47.2, 56.69, 46.94, 56.74, 50.95, 65.32, 71.82, 67.36, 65.04, 60, 53.26, 39.08, 46.73, 57.16, 80.9, 63.45, 52.17, 56.59, 54.27, 54.87, 43.51, 59.04, 50.24, 40.62, 46.33, 43.49, 55.31, 55.21, 55.76, 60.77, 49.29, 45.27, 34.23, 51.32, 81.9, 82.6, 79.03, 69.54, 70.3, 77.78, 96.4, 95.9, 93.2, 101.9, 93.2, 93, 93.8, 79.67, 63.16, 59.23, 61.44, 48.7, 60.45, 69.92, 69.54, 67.86, 73.45, 95.6, 87.8, 78.91, 71.7, 84.1, 93.4, 89.5, 88.5, 88.2, 88.2, 98.7, 117.9, 141, 157.2, 155.8, 149.6, 95.2, 91.1, 113.4, 66.98, 39.31, 41.21, 255.8, 247.5, 248.2, 251, 255.1, 250.4),c = c(384.399, 388.0435, 391.158, 394.1089, 396.2393, 397.7653,405.9039, 413.3497, 413.8737, 412.4252,401.0619, 395.5369,393.344, 390.2218, 380.8314, 370.9777, 365.3473, 365.9187,362.2083, 368.0958, 369.2954, 369.1633, 367.9333, 364.1945,359.7283, 357.4523,357.9721, 356.7934, 355.4262, 358.4297,357.7325, 362.7329, 365.4261, 363.8837,362.5658, 363.5668,369.6555, 366.5757, 360.5511, 360.7731, 360.5672, 363.6154,367.0974, 363.4489, 373.0476, 379.0865, 382.3346, 386.7982,394.0651, 398.8354,398.6193, 401.3643, 401.9453, 405.3331,417.1013, 425.4676, 423.6085, 421.9701,410.8265, 404.4327,401.7433, 397.9707, 
389.2195, 379.0507, 371.2411, 370.1493,365.7072, 367.7261, 370.8189, 368.1045, 365.2104, 366.9838,370.7158, 371.3767,370.1482, 367.5164, 365.9738, 367.5455,368.9097, 366.8438, 361.4221, 363.1824,364.9451, 362.9793,364.1421, 360.9064, 359.4199, 358.8081, 354.5116, 352.878,351.8854, 354.0268, 364.0585, 368.6769, 382.3471, 385.0213,385.3837, 390.994, 388.8896, 386.261), month = c(1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L)), .Names = c("a", "b", "c","month"), row.names = c(NA, -100L), class = "data.frame")
我手动创建了一些预期的输出。基本上,对于第 1 个月,限制 a.min 和 a.max 应用于列 a。然后,对于第 2 个月,将应用下一个限制。对于 b 列,应用限制 b.min 和 b.max:
outcome <- structure(list(month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), a = c(3.693, NA,3.68, 2.633, 3.47, 3.72, 3.278, 3.533, 3.46, 2.893, 2.722, NA,1.994, 1.743, 1.958, 2.3, 2.222, 2.27, 2.393, 2.731, NA, 4.65,3.458, 3.142, 2.75, NA,1.794, 2.139, 2.455, 2.83, 3.8, 3.358,3.663, 2.936, 2.636, 2.42, 3.43, 2.83, 2.74, 3.119, 2.376, 3.285,3.267, 2.966, 3.675, 2.83, 3.97, 3.381, 2.774, 3.335, 3.857,2.854, 3.93, 2.368, 2.8, 2.643, 3.47, 2.559, 2.119, NA, NA, NA,NA, 2.147, 2.45, 2.543, 2.374, 2.962, 3.375, 3.2, 2.785, 2.643,2.34, 2.52, 2.116, 2.23, 2.574, 2.537, 2.36, NA, 2.164, NA, NA,NA, NA, NA, NA, NA, NA, 1.46, 0.975, 1.48, 0.473, 0.689, 0.46,0.498, 1.847, 2.79, 2.454, NA), b = c(NA, NA, NA, NA, NA, NA,NA, NA, NA, NA, NA, NA, 63.7, NA, NA, NA, 56.69, NA, 56.74, 5.95,65.32, NA, 67.36, 65.4, 6, 53.26, NA, NA, 57.16, NA, 63.45, 52.17,56.59, 54.27, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 6.77, NA,NA,NA, NA,NA, NA, NA, 69.54, 7.3, NA, NA, NA, NA, NA, NA, NA,NA, NA, 63.16, NA, 61.44, NA, 6.45, 69.92, 69.54, 67.86, 73.45, NA, NA, NA, 71.7, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 66.98, 39.31, 41.21, NA, NA, NA, NA, NA, NA), c = c(384.399, 388.435, 391.158, 394.189, 396.2393, 397.7653,45.939,413.3497, 413.8737, 412.4252, 41.619, 395.5369, 393.344,39.2218, 38.8314, 37.9777, 365.3473, 365.9187, 362.283, 368.958,369.2954, 369.1633, 367.9333, 364.1945, 359.7283,357.4523,357.9721,356.7934,355.4262,358.4297,357.7325,362.7329,365.4261, 363.8837, 362.5658, 363.5668, 369.6555, 366.5757,36.5511, 36.7731, 36.5672, 363.6154, 367.974, 363.4489, 373.476,379.865, 382.3346, 386.7982, 394.651, 398.8354, 
398.6193,41.3643, 41.9453, 45.3331, 417.113, 425.4676, 423.685, 421.971,41.8265, 44.4327, 41.7433, 397.977, 389.2195, 379.57, 371.2411,37.1493, 365.772, 367.7261, 7.8189,368.145,365.214,366.9838,37.7158, 371.3767, 37.1482, 367.5164, 365.9738, 367.5455,368.997, 366.8438, 361.4221, 363.1824, 364.9451, 362.9793,364.1421, 36.964, 359.4199, 358.881, 354.5116, 352.878, 351.8854, 354.268, 364.585, 368.6769, 382.3471, 385.213, 385.3837, 39.994, 388.8896, 386.261)), .Names = c("month", "a", "b","c"), row.names = c(NA, -100L), class = "data.frame")
现在我的问题是:如何在 R 中执行此操作???我怎样才能在有几十列的数据帧上做到这一点?
Base R(基础 R):
# Base R: set every value that lies outside its month-specific [min, max]
# range to NA.
#
# Reads `df` (the data, including a `month` column) and `thresholds` (one row
# per month with "<col>.min" / "<col>.max" columns) and writes the cleaned
# copy to `outcome`.
#
# The thresholds are looked up with match() rather than merge(): merge() sorts
# its result by the key by default and appends the helper columns, whereas
# match() keeps the rows of `df` in their original order and adds no columns.

# columns to check; each needs a "<col>.min" and a "<col>.max" threshold column
threshold_cols <- c("a", "b")
bound_cols <- c(paste0(threshold_cols, ".min"), paste0(threshold_cols, ".max"))

# fail loudly instead of silently skipping a column whose bounds are missing
stopifnot(
  all(c("month", threshold_cols) %in% names(df)),
  all(c("month", bound_cols) %in% names(thresholds))
)

# row of `thresholds` that applies to each row of `df` (NA for unknown months)
threshold_row <- match(df[["month"]], thresholds[["month"]])

# start from the data itself, with `month` first as in the expected output;
# as.data.frame() keeps the character column subsetting valid even if `df`
# has already been converted to a data.table
outcome <- as.data.frame(df)[c("month", setdiff(names(df), "month"))]

for (col in threshold_cols) {
  lower <- thresholds[[sprintf("%s.min", col)]][threshold_row]
  upper <- thresholds[[sprintf("%s.max", col)]][threshold_row]
  # which() drops NA comparisons, so values that are already NA or that belong
  # to a month without thresholds are left untouched
  out_of_range <- which(outcome[[col]] < lower | outcome[[col]] > upper)
  outcome[[col]][out_of_range] <- NA
}
这样可以吗?强尼
使用 dplyr
你可以做这样的事情
library(dplyr)

# Columns to check; each needs a "<col>.min" and a "<col>.max" column in
# `thresholds`. Extend this vector to handle more columns.
threshold_cols <- c("a", "b")

# Attach the month-specific bounds to every row, blank out the values that lie
# outside [min, max], then drop the helper columns again.
df2 <- df %>%
  # explicit key: no "Joining with ..." message and no accidental join on
  # other column names the two tables might share
  left_join(thresholds, by = "month") %>%
  mutate(across(
    all_of(threshold_cols),
    # Only values strictly outside the bounds become NA; values equal to a
    # bound are kept. which() ignores NA comparisons, so values of months
    # without thresholds are left as they are.
    ~ replace(
      .x,
      which(
        .x < .data[[paste0(cur_column(), ".min")]] |
          .x > .data[[paste0(cur_column(), ".max")]]
      ),
      NA
    )
  )) %>%
  # keep only the original data columns, with `month` first
  select(month, all_of(setdiff(names(df), "month")))
df2
month a b c
1 1 3.693 NA 384.3990
2 1 NA NA 388.0435
3 1 3.068 NA 391.1580
4 1 2.633 NA 394.1089
5 1 3.047 NA 396.2393
6 1 3.072 NA 397.7653
7 1 3.278 NA 405.9039
...
或者,也可以通过一系列非等值更新连接(non-equi update join)来实现:
library(data.table)
# Convert both tables to data.table by reference so that `:=` can update `df`
# in place.
setDT(df)
setDT(thresholds)
# One non-equi update join per bound: rows of `df` that match a threshold row
# on `month` and fall below the minimum (or above the maximum) get NA.
df[thresholds, on = .(month, a < a.min), a := NA]
df[thresholds, on = .(month, a > a.max), a := NA]
df[thresholds, on = .(month, b < b.min), b := NA]
df[thresholds, on = .(month, b > b.max), b := NA]
# `:=` returns invisibly; the trailing [] prints the updated table
df[]
a b c month
1: 3.693 NA 384.3990 1
2: NA NA 388.0435 1
3: 3.068 NA 391.1580 1
4: 2.633 NA 394.1089 1
5: 3.047 NA 396.2393 1
6: 3.072 NA 397.7653 1
7: 3.278 NA 405.9039 1
8: 3.533 NA 413.3497 1
9: 3.406 NA 413.8737 1
10: 2.893 NA 412.4252 1
11: 2.722 NA 401.0619 1
12: NA NA 395.5369 1
13: 1.994 63.70 393.3440 1
14: 1.743 NA 390.2218 1
15: 1.958 NA 380.8314 1
16: 2.030 NA 370.9777 1
17: 2.222 56.69 365.3473 1
18: 2.207 NA 365.9187 1
19: 2.393 56.74 362.2083 1
20: 2.731 50.95 368.0958 1
21: NA 65.32 369.2954 1
22: 4.065 NA 369.1633 1
23: 3.458 67.36 367.9333 1
24: 3.142 65.04 364.1945 1
25: 2.705 60.00 359.7283 1
26: NA 53.26 357.4523 1
27: 1.794 NA 357.9721 1
28: 2.139 NA 356.7934 1
29: 2.455 57.16 355.4262 1
30: 2.830 NA 358.4297 1
31: 3.008 63.45 357.7325 1
32: 3.358 52.17 362.7329 1
33: 3.663 56.59 365.4261 1
34: 2.936 54.27 363.8837 1
35: 2.636 NA 362.5658 2
36: 2.420 NA 363.5668 2
37: 3.403 NA 369.6555 2
38: 2.830 NA 366.5757 2
39: 2.740 NA 360.5511 2
40: 3.119 NA 360.7731 2
41: 2.376 NA 360.5672 2
42: 3.285 NA 363.6154 2
43: 3.267 NA 367.0974 2
44: 2.966 NA 363.4489 2
45: 3.675 60.77 373.0476 2
46: 2.803 NA 379.0865 2
47: 3.097 NA 382.3346 2
48: 3.381 NA 386.7982 2
49: 2.774 NA 394.0651 2
50: 3.335 NA 398.8354 2
51: 3.857 NA 398.6193 2
52: 2.854 NA 401.3643 2
53: 3.093 69.54 401.9453 2
54: 2.368 70.30 405.3331 2
55: 2.800 NA 417.1013 2
56: 2.643 NA 425.4676 2
57: 3.047 NA 423.6085 2
58: 2.559 NA 421.9701 2
59: 2.119 NA 410.8265 2
60: NA NA 404.4327 2
61: NA NA 401.7433 2
62: NA NA 397.9707 2
63: NA NA 389.2195 2
64: 2.147 63.16 379.0507 2
65: 2.405 NA 371.2411 2
66: 2.543 61.44 370.1493 2
67: 2.374 NA 365.7072 2
68: 2.962 60.45 367.7261 2
69: 3.375 69.92 370.8189 2
70: 3.002 69.54 368.1045 2
71: 2.785 67.86 365.2104 2
72: 2.643 73.45 366.9838 2
73: 2.304 NA 370.7158 2
74: 2.052 NA 371.3767 2
75: 2.116 NA 370.1482 2
76: 2.203 71.70 367.5164 2
77: 2.574 NA 365.9738 2
78: 2.537 NA 367.5455 2
79: 2.306 NA 368.9097 2
80: NA NA 366.8438 2
81: 2.164 NA 361.4221 2
82: NA NA 363.1824 2
83: NA NA 364.9451 2
84: NA NA 362.9793 2
85: NA NA 364.1421 2
86: NA NA 360.9064 2
87: NA NA 359.4199 2
88: NA NA 358.8081 2
89: NA NA 354.5116 2
90: 1.406 NA 352.8780 3
91: 0.975 NA 351.8854 3
92: 1.480 66.98 354.0268 3
93: 0.473 39.31 364.0585 3
94: 0.689 41.21 368.6769 3
95: 0.046 NA 382.3471 3
96: 0.498 NA 385.0213 3
97: 1.847 NA 385.3837 3
98: 2.079 NA 390.9940 3
99: 2.454 NA 388.8896 3
100: NA NA 386.2610 3
a b c month
编辑
OP 在评论中透露,他希望在具有许多列的巨大数据框上运行这个解决方案。
非等值更新连接的序列也可以在循环中执行:
# Columns to check; each needs "<col>.min" / "<col>.max" in `thresholds`.
threshold_cols <- c("a", "b")
# Convert `df` to a data.table by reference so `:=` can modify it in place.
setDT(df)
for (col in threshold_cols) {
  # join conditions as strings: same month AND value below min / above max
  below_min <- c("month", sprintf("%s<%s.min", col, col))
  above_max <- c("month", sprintf("%s>%s.max", col, col))
  # (col) on the left of `:=` means "the column whose name is stored in col"
  df[thresholds, on = below_min, (col) := NA]
  df[thresholds, on = above_max, (col) := NA]
}
df
如上所示,df 被原地修改,没有添加任何额外的列。这与先用 merge 合并阈值表的做法不同:合并得到的 outcome 会包含从 thresholds 合并进来的所有 a.min、a.max、b.min、b.max 列。
我有 2 个数据框,第一个有很多个月的数据,第二个有阈值(最小值和最大值,每个月都不同)。现在我想用 NA 替换阈值之外的任何值。
数据帧的结构如下: 数据包含名称为 "month"、"a"、"b" 和 "c" 的列。阈值有 "month"、"a.min"、"a.max"、"b.min" 和 "b.max".
# Threshold table (looks like dput() output): one row per month (1-3) holding
# the bounds for column a (a.min, a.max) and for column b (b.min, b.max).
thresholds <- structure(list(month = 1:3, a.min = c(1L, 2L, 0L), a.max = c(9L, 8L, 3L), b.min = c(50L, 60L, 30L), b.max = c(70L, 75L, 90L)), .Names = c("month", "a.min", "a.max", "b.min", "b.max"), row.names = c(NA, -3L), class = "data.frame")
df <- structure(list(a = c(3.693, 0.534, 3.068, 2.633, 3.047, 3.072, 3.278, 3.533, 3.406, 2.893, 2.722, 0.513, 1.994, 1.743, 1.958, 2.03, 2.222, 2.207, 2.393, 2.731, 15.464, 4.065, 3.458, 3.142, 2.705, 17.285, 1.794, 2.139, 2.455, 2.83, 3.008, 3.358, 3.663, 2.936, 2.636, 2.42, 3.403, 2.83, 2.74, 3.119, 2.376, 3.285, 3.267, 2.966, 3.675, 2.803, 3.097, 3.381, 2.774, 3.335, 3.857, 2.854, 3.093, 2.368, 2.8, 2.643, 3.047, 2.559, 2.119, 1.712, 1.614, 1.474, 1.82, 2.147, 2.405, 2.543, 2.374, 2.962, 3.375, 3.002, 2.785, 2.643, 2.304, 2.052, 2.116, 2.203, 2.574, 2.537, 2.306, 1.316, 2.164, 1.855, 1.501, 1.331, 1.417, 1.158, 0.792, 0.183, 0.567, 1.406, 0.975, 1.48, 0.473, 0.689, 0.046, 0.498, 1.847, 2.079, 2.454, 3.372), b = c(72.26, 77.25, 72.3, 75.79, 72.98, 83.6, 79.16, 80.9, 80.2, 80.2, 73.33, 72, 63.7, 47.14, 30.86, 47.2, 56.69, 46.94, 56.74, 50.95, 65.32, 71.82, 67.36, 65.04, 60, 53.26, 39.08, 46.73, 57.16, 80.9, 63.45, 52.17, 56.59, 54.27, 54.87, 43.51, 59.04, 50.24, 40.62, 46.33, 43.49, 55.31, 55.21, 55.76, 60.77, 49.29, 45.27, 34.23, 51.32, 81.9, 82.6, 79.03, 69.54, 70.3, 77.78, 96.4, 95.9, 93.2, 101.9, 93.2, 93, 93.8, 79.67, 63.16, 59.23, 61.44, 48.7, 60.45, 69.92, 69.54, 67.86, 73.45, 95.6, 87.8, 78.91, 71.7, 84.1, 93.4, 89.5, 88.5, 88.2, 88.2, 98.7, 117.9, 141, 157.2, 155.8, 149.6, 95.2, 91.1, 113.4, 66.98, 39.31, 41.21, 255.8, 247.5, 248.2, 251, 255.1, 250.4),c = c(384.399, 388.0435, 391.158, 394.1089, 396.2393, 397.7653,405.9039, 413.3497, 413.8737, 412.4252,401.0619, 395.5369,393.344, 390.2218, 380.8314, 370.9777, 365.3473, 365.9187,362.2083, 368.0958, 369.2954, 369.1633, 367.9333, 364.1945,359.7283, 357.4523,357.9721, 356.7934, 355.4262, 358.4297,357.7325, 362.7329, 365.4261, 363.8837,362.5658, 363.5668,369.6555, 366.5757, 360.5511, 360.7731, 360.5672, 363.6154,367.0974, 363.4489, 373.0476, 379.0865, 382.3346, 386.7982,394.0651, 398.8354,398.6193, 401.3643, 401.9453, 405.3331,417.1013, 425.4676, 423.6085, 421.9701,410.8265, 404.4327,401.7433, 397.9707, 
389.2195, 379.0507, 371.2411, 370.1493,365.7072, 367.7261, 370.8189, 368.1045, 365.2104, 366.9838,370.7158, 371.3767,370.1482, 367.5164, 365.9738, 367.5455,368.9097, 366.8438, 361.4221, 363.1824,364.9451, 362.9793,364.1421, 360.9064, 359.4199, 358.8081, 354.5116, 352.878,351.8854, 354.0268, 364.0585, 368.6769, 382.3471, 385.0213,385.3837, 390.994, 388.8896, 386.261), month = c(1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L)), .Names = c("a", "b", "c","month"), row.names = c(NA, -100L), class = "data.frame")
我手动创建了一些预期的输出。基本上,对于第 1 个月,限制 a.min 和 a.max 应用于列 a。然后,对于第 2 个月,将应用下一个限制。对于 b 列,应用限制 b.min 和 b.max:
outcome <- structure(list(month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), a = c(3.693, NA,3.68, 2.633, 3.47, 3.72, 3.278, 3.533, 3.46, 2.893, 2.722, NA,1.994, 1.743, 1.958, 2.3, 2.222, 2.27, 2.393, 2.731, NA, 4.65,3.458, 3.142, 2.75, NA,1.794, 2.139, 2.455, 2.83, 3.8, 3.358,3.663, 2.936, 2.636, 2.42, 3.43, 2.83, 2.74, 3.119, 2.376, 3.285,3.267, 2.966, 3.675, 2.83, 3.97, 3.381, 2.774, 3.335, 3.857,2.854, 3.93, 2.368, 2.8, 2.643, 3.47, 2.559, 2.119, NA, NA, NA,NA, 2.147, 2.45, 2.543, 2.374, 2.962, 3.375, 3.2, 2.785, 2.643,2.34, 2.52, 2.116, 2.23, 2.574, 2.537, 2.36, NA, 2.164, NA, NA,NA, NA, NA, NA, NA, NA, 1.46, 0.975, 1.48, 0.473, 0.689, 0.46,0.498, 1.847, 2.79, 2.454, NA), b = c(NA, NA, NA, NA, NA, NA,NA, NA, NA, NA, NA, NA, 63.7, NA, NA, NA, 56.69, NA, 56.74, 5.95,65.32, NA, 67.36, 65.4, 6, 53.26, NA, NA, 57.16, NA, 63.45, 52.17,56.59, 54.27, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 6.77, NA,NA,NA, NA,NA, NA, NA, 69.54, 7.3, NA, NA, NA, NA, NA, NA, NA,NA, NA, 63.16, NA, 61.44, NA, 6.45, 69.92, 69.54, 67.86, 73.45, NA, NA, NA, 71.7, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 66.98, 39.31, 41.21, NA, NA, NA, NA, NA, NA), c = c(384.399, 388.435, 391.158, 394.189, 396.2393, 397.7653,45.939,413.3497, 413.8737, 412.4252, 41.619, 395.5369, 393.344,39.2218, 38.8314, 37.9777, 365.3473, 365.9187, 362.283, 368.958,369.2954, 369.1633, 367.9333, 364.1945, 359.7283,357.4523,357.9721,356.7934,355.4262,358.4297,357.7325,362.7329,365.4261, 363.8837, 362.5658, 363.5668, 369.6555, 366.5757,36.5511, 36.7731, 36.5672, 363.6154, 367.974, 363.4489, 373.476,379.865, 382.3346, 386.7982, 394.651, 398.8354, 
398.6193,41.3643, 41.9453, 45.3331, 417.113, 425.4676, 423.685, 421.971,41.8265, 44.4327, 41.7433, 397.977, 389.2195, 379.57, 371.2411,37.1493, 365.772, 367.7261, 7.8189,368.145,365.214,366.9838,37.7158, 371.3767, 37.1482, 367.5164, 365.9738, 367.5455,368.997, 366.8438, 361.4221, 363.1824, 364.9451, 362.9793,364.1421, 36.964, 359.4199, 358.881, 354.5116, 352.878, 351.8854, 354.268, 364.585, 368.6769, 382.3471, 385.213, 385.3837, 39.994, 388.8896, 386.261)), .Names = c("month", "a", "b","c"), row.names = c(NA, -100L), class = "data.frame")
现在我的问题是:如何在 R 中执行此操作???我怎样才能在有几十列的数据帧上做到这一点?
Base R(基础 R):
# Base R: set every value that lies outside its month-specific [min, max]
# range to NA.
#
# Reads `df` (the data, including a `month` column) and `thresholds` (one row
# per month with "<col>.min" / "<col>.max" columns) and writes the cleaned
# copy to `outcome`.
#
# The thresholds are looked up with match() rather than merge(): merge() sorts
# its result by the key by default and appends the helper columns, whereas
# match() keeps the rows of `df` in their original order and adds no columns.

# columns to check; each needs a "<col>.min" and a "<col>.max" threshold column
threshold_cols <- c("a", "b")
bound_cols <- c(paste0(threshold_cols, ".min"), paste0(threshold_cols, ".max"))

# fail loudly instead of silently skipping a column whose bounds are missing
stopifnot(
  all(c("month", threshold_cols) %in% names(df)),
  all(c("month", bound_cols) %in% names(thresholds))
)

# row of `thresholds` that applies to each row of `df` (NA for unknown months)
threshold_row <- match(df[["month"]], thresholds[["month"]])

# start from the data itself, with `month` first as in the expected output;
# as.data.frame() keeps the character column subsetting valid even if `df`
# has already been converted to a data.table
outcome <- as.data.frame(df)[c("month", setdiff(names(df), "month"))]

for (col in threshold_cols) {
  lower <- thresholds[[sprintf("%s.min", col)]][threshold_row]
  upper <- thresholds[[sprintf("%s.max", col)]][threshold_row]
  # which() drops NA comparisons, so values that are already NA or that belong
  # to a month without thresholds are left untouched
  out_of_range <- which(outcome[[col]] < lower | outcome[[col]] > upper)
  outcome[[col]][out_of_range] <- NA
}
这样可以吗?强尼
使用 dplyr
你可以做这样的事情
library(dplyr)

# Columns to check; each needs a "<col>.min" and a "<col>.max" column in
# `thresholds`. Extend this vector to handle more columns.
threshold_cols <- c("a", "b")

# Attach the month-specific bounds to every row, blank out the values that lie
# outside [min, max], then drop the helper columns again.
df2 <- df %>%
  # explicit key: no "Joining with ..." message and no accidental join on
  # other column names the two tables might share
  left_join(thresholds, by = "month") %>%
  mutate(across(
    all_of(threshold_cols),
    # Only values strictly outside the bounds become NA; values equal to a
    # bound are kept. which() ignores NA comparisons, so values of months
    # without thresholds are left as they are.
    ~ replace(
      .x,
      which(
        .x < .data[[paste0(cur_column(), ".min")]] |
          .x > .data[[paste0(cur_column(), ".max")]]
      ),
      NA
    )
  )) %>%
  # keep only the original data columns, with `month` first
  select(month, all_of(setdiff(names(df), "month")))
df2
month a b c
1 1 3.693 NA 384.3990
2 1 NA NA 388.0435
3 1 3.068 NA 391.1580
4 1 2.633 NA 394.1089
5 1 3.047 NA 396.2393
6 1 3.072 NA 397.7653
7 1 3.278 NA 405.9039
...
或者,也可以通过一系列非等值更新连接(non-equi update join)来实现:
library(data.table)
# Convert both tables to data.table by reference so that `:=` can update `df`
# in place.
setDT(df)
setDT(thresholds)
# One non-equi update join per bound: rows of `df` that match a threshold row
# on `month` and fall below the minimum (or above the maximum) get NA.
df[thresholds, on = .(month, a < a.min), a := NA]
df[thresholds, on = .(month, a > a.max), a := NA]
df[thresholds, on = .(month, b < b.min), b := NA]
df[thresholds, on = .(month, b > b.max), b := NA]
# `:=` returns invisibly; the trailing [] prints the updated table
df[]
a b c month 1: 3.693 NA 384.3990 1 2: NA NA 388.0435 1 3: 3.068 NA 391.1580 1 4: 2.633 NA 394.1089 1 5: 3.047 NA 396.2393 1 6: 3.072 NA 397.7653 1 7: 3.278 NA 405.9039 1 8: 3.533 NA 413.3497 1 9: 3.406 NA 413.8737 1 10: 2.893 NA 412.4252 1 11: 2.722 NA 401.0619 1 12: NA NA 395.5369 1 13: 1.994 63.70 393.3440 1 14: 1.743 NA 390.2218 1 15: 1.958 NA 380.8314 1 16: 2.030 NA 370.9777 1 17: 2.222 56.69 365.3473 1 18: 2.207 NA 365.9187 1 19: 2.393 56.74 362.2083 1 20: 2.731 50.95 368.0958 1 21: NA 65.32 369.2954 1 22: 4.065 NA 369.1633 1 23: 3.458 67.36 367.9333 1 24: 3.142 65.04 364.1945 1 25: 2.705 60.00 359.7283 1 26: NA 53.26 357.4523 1 27: 1.794 NA 357.9721 1 28: 2.139 NA 356.7934 1 29: 2.455 57.16 355.4262 1 30: 2.830 NA 358.4297 1 31: 3.008 63.45 357.7325 1 32: 3.358 52.17 362.7329 1 33: 3.663 56.59 365.4261 1 34: 2.936 54.27 363.8837 1 35: 2.636 NA 362.5658 2 36: 2.420 NA 363.5668 2 37: 3.403 NA 369.6555 2 38: 2.830 NA 366.5757 2 39: 2.740 NA 360.5511 2 40: 3.119 NA 360.7731 2 41: 2.376 NA 360.5672 2 42: 3.285 NA 363.6154 2 43: 3.267 NA 367.0974 2 44: 2.966 NA 363.4489 2 45: 3.675 60.77 373.0476 2 46: 2.803 NA 379.0865 2 47: 3.097 NA 382.3346 2 48: 3.381 NA 386.7982 2 49: 2.774 NA 394.0651 2 50: 3.335 NA 398.8354 2 51: 3.857 NA 398.6193 2 52: 2.854 NA 401.3643 2 53: 3.093 69.54 401.9453 2 54: 2.368 70.30 405.3331 2 55: 2.800 NA 417.1013 2 56: 2.643 NA 425.4676 2 57: 3.047 NA 423.6085 2 58: 2.559 NA 421.9701 2 59: 2.119 NA 410.8265 2 60: NA NA 404.4327 2 61: NA NA 401.7433 2 62: NA NA 397.9707 2 63: NA NA 389.2195 2 64: 2.147 63.16 379.0507 2 65: 2.405 NA 371.2411 2 66: 2.543 61.44 370.1493 2 67: 2.374 NA 365.7072 2 68: 2.962 60.45 367.7261 2 69: 3.375 69.92 370.8189 2 70: 3.002 69.54 368.1045 2 71: 2.785 67.86 365.2104 2 72: 2.643 73.45 366.9838 2 73: 2.304 NA 370.7158 2 74: 2.052 NA 371.3767 2 75: 2.116 NA 370.1482 2 76: 2.203 71.70 367.5164 2 77: 2.574 NA 365.9738 2 78: 2.537 NA 367.5455 2 79: 2.306 NA 368.9097 2 80: NA NA 366.8438 2 81: 2.164 NA 361.4221 2 82: 
NA NA 363.1824 2 83: NA NA 364.9451 2 84: NA NA 362.9793 2 85: NA NA 364.1421 2 86: NA NA 360.9064 2 87: NA NA 359.4199 2 88: NA NA 358.8081 2 89: NA NA 354.5116 2 90: 1.406 NA 352.8780 3 91: 0.975 NA 351.8854 3 92: 1.480 66.98 354.0268 3 93: 0.473 39.31 364.0585 3 94: 0.689 41.21 368.6769 3 95: 0.046 NA 382.3471 3 96: 0.498 NA 385.0213 3 97: 1.847 NA 385.3837 3 98: 2.079 NA 390.9940 3 99: 2.454 NA 388.8896 3 100: NA NA 386.2610 3 a b c month
编辑
OP 在评论中透露,他希望在具有许多列的巨大数据框上运行这个解决方案。
非等值更新连接的序列也可以在循环中执行:
# Columns to check; each needs "<col>.min" / "<col>.max" in `thresholds`.
threshold_cols <- c("a", "b")
# Convert `df` to a data.table by reference so `:=` can modify it in place.
setDT(df)
for (col in threshold_cols) {
  # join conditions as strings: same month AND value below min / above max
  below_min <- c("month", sprintf("%s<%s.min", col, col))
  above_max <- c("month", sprintf("%s>%s.max", col, col))
  # (col) on the left of `:=` means "the column whose name is stored in col"
  df[thresholds, on = below_min, (col) := NA]
  df[thresholds, on = above_max, (col) := NA]
}
df
如上所示,df 被原地修改,没有添加任何额外的列。这与先用 merge 合并阈值表的做法不同:合并得到的 outcome 会包含从 thresholds 合并进来的所有 a.min、a.max、b.min、b.max 列。