在 R 中使用 scale 函数和标准差进行温度检查
Temperature checks using scale and standard deviation in R
我有这样的数据,除了有更多不同名称的单位。
structure(list(Date = structure(c(1585551600, 1585555200, 1585558800,
1585562400, 1585566000, 1585569600, 1585573200, 1585576800, 1585580400,
1585584000, 1585587600, 1585591200, 1585594800, 1585598400, 1585602000,
1585605600, 1585609200, 1585612800, 1585616400, 1585620000), class = c("POSIXct",
"POSIXt"), tzone = ""), Name = c("Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5"), Temp = c(23.7, 23.6, 23.6,
23.6, 23.6, 23.5, 23.5, 23.5, 23.4, 23.4, 23.3, 23.3, 23.3, 23.4,
33.8, 37, 40.6, 31.4, 27.8, 30.2), Data.scaled = c(2.0065971204521,
1.96308734902769, 1.96308734902769, 1.96308734902769, 1.96308734902769,
1.91957757760328, 1.91957757760328, 1.91957757760328, 1.87606780617886,
1.87606780617886, 1.83255803475445, 1.83255803475445, 1.83255803475445,
1.87606780617886, 6.40108403431786, 7.79339671989909, 9.35974849117797,
5.35684952013194, 3.79049774885305, 4.83473226303898), deviation_greater_than_2sd = c(FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
)), row.names = 1401:1420, class = "data.frame")
Date Name Temp Data.scaled deviation_greater_than_2sd
1401 2020-03-30 11:00:00 Suwannee 11.5 23.4 1.876 FALSE
1402 2020-03-30 12:00:00 Suwannee 11.5 23.4 1.876 FALSE
1403 2020-03-30 13:00:00 Suwannee 11.5 23.3 1.833 FALSE
1404 2020-03-30 14:00:00 Suwannee 11.5 23.3 1.833 FALSE
1405 2020-03-30 15:00:00 Suwannee 11.5 23.3 1.833 FALSE
1406 2020-03-30 16:00:00 Suwannee 11.5 23.4 1.876 FALSE
1407 2020-03-28 23:00:00 Suwannee 11.5 23.8 2.050 FALSE
1408 2020-03-29 20:00:00 Suwannee 11.5 23.8 2.050 FALSE
1409 2020-03-29 21:00:00 Suwannee 11.5 23.9 2.094 FALSE
1410 2020-03-29 22:00:00 Suwannee 11.5 23.9 2.094 FALSE
1411 2020-03-30 00:00:00 Suwannee 11.5 23.9 2.094 FALSE
1412 2020-03-30 01:00:00 Suwannee 11.5 23.9 2.094 FALSE
1413 2020-03-30 02:00:00 Suwannee 11.5 23.8 2.050 FALSE
1414 2020-03-29 23:00:00 Suwannee 11.5 24.0 2.137 FALSE
1415 2020-03-30 17:00:00 Suwannee 11.5 33.8 6.401 TRUE
1416 2020-03-30 18:00:00 Suwannee 11.5 37.0 7.793 TRUE
1417 2020-03-30 19:00:00 Suwannee 11.5 40.6 9.360 TRUE
1418 2020-03-30 20:00:00 Suwannee 11.5 31.4 5.357 TRUE
1419 2020-03-30 21:00:00 Suwannee 11.5 27.8 3.790 TRUE
1420 2020-03-30 22:00:00 Suwannee 11.5 30.2 4.835 TRUE
我想要确定的是何时将传感器从水中拉出导致温度峰值。
# Keep only the rows recorded under the name "Suwannee 11.5".
temp.test <- subset(temp, Name == "Suwannee 11.5")
# Restrict to the three columns used below.
temp.test <- temp.test[,c("Date", "Name", "Temp")]
# Standardise Temp with scale() (as.numeric() flattens the matrix it returns)
# and flag rows whose scaled value is at least 2.05.
# NOTE(review): %>% and mutate() presumably come from dplyr/magrittr; no
# library() call is visible in this snippet - confirm it is attached.
temp.test <- temp.test %>%
mutate(Data.scaled = as.numeric(scale(temp.test$Temp)),
deviation_greater_than_2sd = Data.scaled >= 2.05)
我不确定如何把 temp.test <- temp.test %>% mutate(Data.scaled = as.numeric(scale(temp.test$Temp)), deviation_greater_than_2sd = Data.scaled >= 2.05)
分别应用到数据中的每个名称,同时又能一次性对全部数据运行,这样就不必先按每个名称逐一取子集。
如果不按名称分开处理,代码在整个数据集上也能正常运行,但它计算的是相对于全部数据合并后的偏差;而各站点的温度并不相同,所以我担心会漏掉异常值。
请注意,当我把其他名称的数据一起运行时,它会漏掉许多“Suwannee 11.5”的异常值。
Date Name Temp Data.scaled deviation_greater_than_2sd
37275 2020-11-23 01:00:00 Clammers Cut 21.3 -0.4578 FALSE
37276 2020-11-23 02:00:00 Clammers Cut 21.2 -0.4752 FALSE
37277 2020-11-23 03:00:00 Clammers Cut 21.3 -0.4578 FALSE
37278 2020-11-23 04:00:00 Clammers Cut 21.7 -0.3882 FALSE
37279 2020-11-23 05:00:00 Clammers Cut 21.6 -0.4056 FALSE
37280 2020-11-23 06:00:00 Clammers Cut 21.4 -0.4404 FALSE
37281 2020-11-23 07:00:00 Clammers Cut 21.1 -0.4925 FALSE
37282 2020-11-23 08:00:00 Clammers Cut 21.0 -0.5099 FALSE
37283 2020-11-23 09:00:00 Clammers Cut 21.0 -0.5099 FALSE
37284 2020-11-23 10:00:00 Clammers Cut 21.0 -0.5099 FALSE
37285 2020-11-23 11:00:00 Clammers Cut 20.7 -0.5621 FALSE
37286 2020-11-23 12:00:00 Clammers Cut 20.6 -0.5795 FALSE
37287 2020-11-23 13:00:00 Clammers Cut 20.5 -0.5969 FALSE
37288 2020-11-23 14:00:00 Clammers Cut 20.5 -0.5969 FALSE
37289 2020-11-23 15:00:00 Clammers Cut 20.6 -0.5795 FALSE
37290 2020-11-23 16:00:00 Clammers Cut 21.0 -0.5099 FALSE
37291 2020-11-23 17:00:00 Clammers Cut 21.5 -0.4230 FALSE
37292 2020-11-23 18:00:00 Clammers Cut 22.2 -0.3013 FALSE
37293 2020-03-30 18:00:00 Suwannee 11.5 37.0 2.2723 TRUE
37294 2020-03-30 19:00:00 Suwannee 11.5 40.6 2.8983 TRUE
我在想也许是某种 apply
函数?但我对使用 apply
函数还很陌生。
由于您的测试数据只包含一个唯一的名称,我随机改写了名称,以便更清楚地演示我的函数。
temp <- structure(list(Date = structure(c(1585551600, 1585555200, 1585558800,
1585562400, 1585566000, 1585569600, 1585573200, 1585576800, 1585580400,
1585584000, 1585587600, 1585591200, 1585594800, 1585598400, 1585602000,
1585605600, 1585609200, 1585612800, 1585616400, 1585620000), class = c("POSIXct",
"POSIXt"), tzone = ""), Name = c("Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5"), Temp = c(23.7, 23.6, 23.6,
23.6, 23.6, 23.5, 23.5, 23.5, 23.4, 23.4, 23.3, 23.3, 23.3, 23.4,
33.8, 37, 40.6, 31.4, 27.8, 30.2), Data.scaled = c(2.0065971204521,
1.96308734902769, 1.96308734902769, 1.96308734902769, 1.96308734902769,
1.91957757760328, 1.91957757760328, 1.91957757760328, 1.87606780617886,
1.87606780617886, 1.83255803475445, 1.83255803475445, 1.83255803475445,
1.87606780617886, 6.40108403431786, 7.79339671989909, 9.35974849117797,
5.35684952013194, 3.79049774885305, 4.83473226303898), deviation_greater_than_2sd = c(FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
)), row.names = 1401:1420, class = "data.frame")
set.seed(5555) # for reproducibility
# Overwrite Name with a random draw from "A", "B", "C" (one value per row) so
# the example data contains more than one group.
temp$Name <- sample(c("A","B","C"),NROW(temp),replace = TRUE)
您希望对 Name 相同的数据分别进行尖峰检测,因此可以使用 split 函数:
# Split temp into a list of data frames, one per distinct value of Name.
nameDataSplits <- split(temp,temp$Name)
现在 nameDataSplits 是一个由数据框组成的列表;在每个数据框内,Name 都相同。
要对列表中的每个数据框应用尖峰检测,请把检测代码放进一个函数,例如:
# Flag temperature spikes within one group of readings.
#
# subdf: data frame with at least the columns Date, Name and Temp.
#
# Returns a data frame holding Date, Name and Temp plus two new columns:
#   Data.scaled                - Temp centred and divided by its standard
#                                deviation, computed from the rows of subdf only;
#   deviation_greater_than_2sd - TRUE where Data.scaled >= 2.05.
#
# Written in base R so that it does not depend on dplyr/magrittr being attached.
addSpikes <- function(subdf) {
  out <- subdf[, c("Date", "Name", "Temp"), drop = FALSE]
  # scale() returns a one-column matrix; as.numeric() flattens it to a vector.
  out$Data.scaled <- as.numeric(scale(out$Temp))
  out$deviation_greater_than_2sd <- out$Data.scaled >= 2.05
  out
}
这个函数直接复制自您的代码,欢迎自行优化。
现在您可以通过 lapply 把尖峰标记添加到数据中:
# Run addSpikes on each per-name data frame; the result is a list with one
# augmented data frame per name.
spikesAdded <- lapply(nameDataSplits, addSpikes)
要将列表转换回数据框,请使用
# Stack the per-name data frames back into a single data frame.
spikesAddedDF <- do.call("rbind",spikesAdded)
更新:为每个名称使用不同的阈值
您可以把所需的阈值放入一个命名向量中:
# Example thresholds only: three random values named after the groups.
# Replace rnorm(3) with the real threshold for each name.
yourThreshs <- setNames(rnorm(3),c("A","B","C"))
请确保 yourThreshs 的名称与 temp$Name 中的名称完全一致。
然后给 addSpikes 函数增加第二个参数(例如 thresh):
# Flag temperature spikes within one group of readings, using a caller-supplied
# threshold.
#
# subdf:  data frame with at least the columns Date, Name and Temp.
# thresh: numeric cut-off applied to the scaled temperatures.
#
# Returns a data frame holding Date, Name and Temp plus two new columns:
#   Data.scaled                - Temp centred and divided by its standard
#                                deviation, computed from the rows of subdf only;
#   deviation_greater_than_2sd - TRUE where Data.scaled >= thresh.
#
# Written in base R so that it does not depend on dplyr/magrittr being attached.
addSpikes <- function(subdf, thresh) {
  out <- subdf[, c("Date", "Name", "Temp"), drop = FALSE]
  # scale() returns a one-column matrix; as.numeric() flattens it to a vector.
  out$Data.scaled <- as.numeric(scale(out$Temp))
  out$deviation_greater_than_2sd <- out$Data.scaled >= thresh
  out
}
然后做
# Iterate over the group names so that each group's data frame can be paired
# with the threshold stored under the same name in yourThreshs.
spikesAdded <- lapply(names(nameDataSplits), function(nam) {
addSpikes(subdf = nameDataSplits[[nam]],
thresh = yourThreshs[[nam]])
})
# Stack the per-name results back into a single data frame.
spikesAddedDF <- do.call("rbind",spikesAdded)
更新:基于分位数的阈值
如果您想根据数据本身计算阈值(例如使用分位数),可以这样定义函数:
# Flag temperature spikes within one group, using a quantile of the group's own
# scaled temperatures as the threshold.
#
# subdf:              data frame with at least the columns Date, Name and Temp.
# quantilePercentage: probability passed to quantile(); defaults to 0.9.
#
# Returns a data frame holding Date, Name and Temp plus two new columns:
#   Data.scaled                - Temp centred and divided by its standard
#                                deviation, computed from the rows of subdf only;
#   deviation_greater_than_2sd - TRUE where Data.scaled is at or above the
#                                requested quantile (NA where Temp is missing).
addSpikes <- function(subdf, quantilePercentage = 0.9) {
  out <- subdf[, c("Date", "Name", "Temp"), drop = FALSE]
  # scale() returns a one-column matrix; as.numeric() flattens it to a vector.
  out$Data.scaled <- as.numeric(scale(out$Temp))
  # na.rm = TRUE: quantile() stops with an error on missing values otherwise,
  # while scale() silently carries them through.
  # names = FALSE: return a bare number instead of a vector named e.g. "90%".
  quantileThreshold <- quantile(
    out$Data.scaled,
    quantilePercentage,
    na.rm = TRUE,
    names = FALSE
  )
  out$deviation_greater_than_2sd <- out$Data.scaled >= quantileThreshold
  out
}
现在阈值按分位数计算,您可以尝试不同的 quantilePercentage 来选出最合适的阈值。
之后可以像上面那样继续:
# Apply addSpikes to every per-name data frame using its default arguments,
# then stack the results back into a single data frame.
spikesAdded <- lapply(nameDataSplits, addSpikes)
spikesAddedDF <- do.call("rbind",spikesAdded)
此外,如果您想实现更灵活的做法,可以把一个函数作为参数传给 addSpikes 函数:
# Default threshold generator: maps a numeric vector to a single cut-off value.
#
# x: numeric vector (in this example, the scaled temperatures of one group).
#
# Returns the 90% quantile of x, ignoring missing values. Swap the body for any
# other summary that yields one number (e.g. mean(x, na.rm = TRUE)) to change
# the rule.
#
# The original stub had a comment-only body and therefore returned NULL; a NULL
# threshold makes the comparison in addSpikes() yield a zero-length result.
myThreholdGeneratingFunction <- function(x) {
  quantile(x, 0.9, na.rm = TRUE, names = FALSE)
}
# Flag temperature spikes within one group, deriving the threshold with a
# caller-supplied function.
#
# subdf:                       data frame with at least the columns Date, Name
#                              and Temp.
# thresholdGeneratingFunction: function that receives the numeric vector of
#                              scaled temperatures and returns a single number;
#                              defaults to myThreholdGeneratingFunction.
#
# Returns a data frame holding Date, Name and Temp plus two new columns:
#   Data.scaled                - Temp centred and divided by its standard
#                                deviation, computed from the rows of subdf only;
#   deviation_greater_than_2sd - TRUE where Data.scaled is at or above the
#                                generated threshold.
addSpikes <- function(subdf, thresholdGeneratingFunction = myThreholdGeneratingFunction) {
  out <- subdf[, c("Date", "Name", "Temp"), drop = FALSE]
  # scale() returns a one-column matrix; as.numeric() flattens it to a vector.
  out$Data.scaled <- as.numeric(scale(out$Temp))
  # Use the function that was passed in. The earlier version ignored the
  # argument and always called the global myThreholdGeneratingFunction.
  threshold <- thresholdGeneratingFunction(out$Data.scaled)
  out$deviation_greater_than_2sd <- out$Data.scaled >= threshold
  out
}
我有这样的数据,除了有更多不同名称的单位。
structure(list(Date = structure(c(1585551600, 1585555200, 1585558800,
1585562400, 1585566000, 1585569600, 1585573200, 1585576800, 1585580400,
1585584000, 1585587600, 1585591200, 1585594800, 1585598400, 1585602000,
1585605600, 1585609200, 1585612800, 1585616400, 1585620000), class = c("POSIXct",
"POSIXt"), tzone = ""), Name = c("Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5"), Temp = c(23.7, 23.6, 23.6,
23.6, 23.6, 23.5, 23.5, 23.5, 23.4, 23.4, 23.3, 23.3, 23.3, 23.4,
33.8, 37, 40.6, 31.4, 27.8, 30.2), Data.scaled = c(2.0065971204521,
1.96308734902769, 1.96308734902769, 1.96308734902769, 1.96308734902769,
1.91957757760328, 1.91957757760328, 1.91957757760328, 1.87606780617886,
1.87606780617886, 1.83255803475445, 1.83255803475445, 1.83255803475445,
1.87606780617886, 6.40108403431786, 7.79339671989909, 9.35974849117797,
5.35684952013194, 3.79049774885305, 4.83473226303898), deviation_greater_than_2sd = c(FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
)), row.names = 1401:1420, class = "data.frame")
Date Name Temp Data.scaled deviation_greater_than_2sd
1401 2020-03-30 11:00:00 Suwannee 11.5 23.4 1.876 FALSE
1402 2020-03-30 12:00:00 Suwannee 11.5 23.4 1.876 FALSE
1403 2020-03-30 13:00:00 Suwannee 11.5 23.3 1.833 FALSE
1404 2020-03-30 14:00:00 Suwannee 11.5 23.3 1.833 FALSE
1405 2020-03-30 15:00:00 Suwannee 11.5 23.3 1.833 FALSE
1406 2020-03-30 16:00:00 Suwannee 11.5 23.4 1.876 FALSE
1407 2020-03-28 23:00:00 Suwannee 11.5 23.8 2.050 FALSE
1408 2020-03-29 20:00:00 Suwannee 11.5 23.8 2.050 FALSE
1409 2020-03-29 21:00:00 Suwannee 11.5 23.9 2.094 FALSE
1410 2020-03-29 22:00:00 Suwannee 11.5 23.9 2.094 FALSE
1411 2020-03-30 00:00:00 Suwannee 11.5 23.9 2.094 FALSE
1412 2020-03-30 01:00:00 Suwannee 11.5 23.9 2.094 FALSE
1413 2020-03-30 02:00:00 Suwannee 11.5 23.8 2.050 FALSE
1414 2020-03-29 23:00:00 Suwannee 11.5 24.0 2.137 FALSE
1415 2020-03-30 17:00:00 Suwannee 11.5 33.8 6.401 TRUE
1416 2020-03-30 18:00:00 Suwannee 11.5 37.0 7.793 TRUE
1417 2020-03-30 19:00:00 Suwannee 11.5 40.6 9.360 TRUE
1418 2020-03-30 20:00:00 Suwannee 11.5 31.4 5.357 TRUE
1419 2020-03-30 21:00:00 Suwannee 11.5 27.8 3.790 TRUE
1420 2020-03-30 22:00:00 Suwannee 11.5 30.2 4.835 TRUE
我想要确定的是何时将传感器从水中拉出导致温度峰值。
# Keep only the rows recorded under the name "Suwannee 11.5".
temp.test <- subset(temp, Name == "Suwannee 11.5")
# Restrict to the three columns used below.
temp.test <- temp.test[,c("Date", "Name", "Temp")]
# Standardise Temp with scale() (as.numeric() flattens the matrix it returns)
# and flag rows whose scaled value is at least 2.05.
# NOTE(review): %>% and mutate() presumably come from dplyr/magrittr; no
# library() call is visible in this snippet - confirm it is attached.
temp.test <- temp.test %>%
mutate(Data.scaled = as.numeric(scale(temp.test$Temp)),
deviation_greater_than_2sd = Data.scaled >= 2.05)
我不确定如何把 temp.test <- temp.test %>% mutate(Data.scaled = as.numeric(scale(temp.test$Temp)), deviation_greater_than_2sd = Data.scaled >= 2.05)
分别应用到数据中的每个名称,同时又能一次性对全部数据运行,这样就不必先按每个名称逐一取子集。
如果不按名称分开处理,代码在整个数据集上也能正常运行,但它计算的是相对于全部数据合并后的偏差;而各站点的温度并不相同,所以我担心会漏掉异常值。
请注意,当我把其他名称的数据一起运行时,它会漏掉许多“Suwannee 11.5”的异常值。
Date Name Temp Data.scaled deviation_greater_than_2sd
37275 2020-11-23 01:00:00 Clammers Cut 21.3 -0.4578 FALSE
37276 2020-11-23 02:00:00 Clammers Cut 21.2 -0.4752 FALSE
37277 2020-11-23 03:00:00 Clammers Cut 21.3 -0.4578 FALSE
37278 2020-11-23 04:00:00 Clammers Cut 21.7 -0.3882 FALSE
37279 2020-11-23 05:00:00 Clammers Cut 21.6 -0.4056 FALSE
37280 2020-11-23 06:00:00 Clammers Cut 21.4 -0.4404 FALSE
37281 2020-11-23 07:00:00 Clammers Cut 21.1 -0.4925 FALSE
37282 2020-11-23 08:00:00 Clammers Cut 21.0 -0.5099 FALSE
37283 2020-11-23 09:00:00 Clammers Cut 21.0 -0.5099 FALSE
37284 2020-11-23 10:00:00 Clammers Cut 21.0 -0.5099 FALSE
37285 2020-11-23 11:00:00 Clammers Cut 20.7 -0.5621 FALSE
37286 2020-11-23 12:00:00 Clammers Cut 20.6 -0.5795 FALSE
37287 2020-11-23 13:00:00 Clammers Cut 20.5 -0.5969 FALSE
37288 2020-11-23 14:00:00 Clammers Cut 20.5 -0.5969 FALSE
37289 2020-11-23 15:00:00 Clammers Cut 20.6 -0.5795 FALSE
37290 2020-11-23 16:00:00 Clammers Cut 21.0 -0.5099 FALSE
37291 2020-11-23 17:00:00 Clammers Cut 21.5 -0.4230 FALSE
37292 2020-11-23 18:00:00 Clammers Cut 22.2 -0.3013 FALSE
37293 2020-03-30 18:00:00 Suwannee 11.5 37.0 2.2723 TRUE
37294 2020-03-30 19:00:00 Suwannee 11.5 40.6 2.8983 TRUE
我在想也许是某种 apply
函数?但我对使用 apply
函数还很陌生。
由于您的测试数据只包含一个唯一的名称,我随机改写了名称,以便更清楚地演示我的函数。
temp <- structure(list(Date = structure(c(1585551600, 1585555200, 1585558800,
1585562400, 1585566000, 1585569600, 1585573200, 1585576800, 1585580400,
1585584000, 1585587600, 1585591200, 1585594800, 1585598400, 1585602000,
1585605600, 1585609200, 1585612800, 1585616400, 1585620000), class = c("POSIXct",
"POSIXt"), tzone = ""), Name = c("Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5", "Suwannee 11.5",
"Suwannee 11.5", "Suwannee 11.5"), Temp = c(23.7, 23.6, 23.6,
23.6, 23.6, 23.5, 23.5, 23.5, 23.4, 23.4, 23.3, 23.3, 23.3, 23.4,
33.8, 37, 40.6, 31.4, 27.8, 30.2), Data.scaled = c(2.0065971204521,
1.96308734902769, 1.96308734902769, 1.96308734902769, 1.96308734902769,
1.91957757760328, 1.91957757760328, 1.91957757760328, 1.87606780617886,
1.87606780617886, 1.83255803475445, 1.83255803475445, 1.83255803475445,
1.87606780617886, 6.40108403431786, 7.79339671989909, 9.35974849117797,
5.35684952013194, 3.79049774885305, 4.83473226303898), deviation_greater_than_2sd = c(FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
)), row.names = 1401:1420, class = "data.frame")
set.seed(5555) # for reproducibility
# Overwrite Name with a random draw from "A", "B", "C" (one value per row) so
# the example data contains more than one group.
temp$Name <- sample(c("A","B","C"),NROW(temp),replace = TRUE)
您希望对 Name 相同的数据分别进行尖峰检测,因此可以使用 split 函数:
# Split temp into a list of data frames, one per distinct value of Name.
nameDataSplits <- split(temp,temp$Name)
现在 nameDataSplits 是一个由数据框组成的列表;在每个数据框内,Name 都相同。
要对列表中的每个数据框应用尖峰检测,请把检测代码放进一个函数,例如:
# Flag temperature spikes within one group of readings.
#
# subdf: data frame with at least the columns Date, Name and Temp.
#
# Returns a data frame holding Date, Name and Temp plus two new columns:
#   Data.scaled                - Temp centred and divided by its standard
#                                deviation, computed from the rows of subdf only;
#   deviation_greater_than_2sd - TRUE where Data.scaled >= 2.05.
#
# Written in base R so that it does not depend on dplyr/magrittr being attached.
addSpikes <- function(subdf) {
  out <- subdf[, c("Date", "Name", "Temp"), drop = FALSE]
  # scale() returns a one-column matrix; as.numeric() flattens it to a vector.
  out$Data.scaled <- as.numeric(scale(out$Temp))
  out$deviation_greater_than_2sd <- out$Data.scaled >= 2.05
  out
}
这个函数直接复制自您的代码,欢迎自行优化。
现在您可以通过 lapply 把尖峰标记添加到数据中:
# Run addSpikes on each per-name data frame; the result is a list with one
# augmented data frame per name.
spikesAdded <- lapply(nameDataSplits, addSpikes)
要将列表转换回数据框,请使用
# Stack the per-name data frames back into a single data frame.
spikesAddedDF <- do.call("rbind",spikesAdded)
更新:为每个名称使用不同的阈值
您可以把所需的阈值放入一个命名向量中:
# Example thresholds only: three random values named after the groups.
# Replace rnorm(3) with the real threshold for each name.
yourThreshs <- setNames(rnorm(3),c("A","B","C"))
请确保 yourThreshs 的名称与 temp$Name 中的名称完全一致。
然后给 addSpikes 函数增加第二个参数(例如 thresh):
# Flag temperature spikes within one group of readings, using a caller-supplied
# threshold.
#
# subdf:  data frame with at least the columns Date, Name and Temp.
# thresh: numeric cut-off applied to the scaled temperatures.
#
# Returns a data frame holding Date, Name and Temp plus two new columns:
#   Data.scaled                - Temp centred and divided by its standard
#                                deviation, computed from the rows of subdf only;
#   deviation_greater_than_2sd - TRUE where Data.scaled >= thresh.
#
# Written in base R so that it does not depend on dplyr/magrittr being attached.
addSpikes <- function(subdf, thresh) {
  out <- subdf[, c("Date", "Name", "Temp"), drop = FALSE]
  # scale() returns a one-column matrix; as.numeric() flattens it to a vector.
  out$Data.scaled <- as.numeric(scale(out$Temp))
  out$deviation_greater_than_2sd <- out$Data.scaled >= thresh
  out
}
然后做
# Iterate over the group names so that each group's data frame can be paired
# with the threshold stored under the same name in yourThreshs.
spikesAdded <- lapply(names(nameDataSplits), function(nam) {
addSpikes(subdf = nameDataSplits[[nam]],
thresh = yourThreshs[[nam]])
})
# Stack the per-name results back into a single data frame.
spikesAddedDF <- do.call("rbind",spikesAdded)
更新:基于分位数的阈值
如果您想根据数据本身计算阈值(例如使用分位数),可以这样定义函数:
# Flag temperature spikes within one group, using a quantile of the group's own
# scaled temperatures as the threshold.
#
# subdf:              data frame with at least the columns Date, Name and Temp.
# quantilePercentage: probability passed to quantile(); defaults to 0.9.
#
# Returns a data frame holding Date, Name and Temp plus two new columns:
#   Data.scaled                - Temp centred and divided by its standard
#                                deviation, computed from the rows of subdf only;
#   deviation_greater_than_2sd - TRUE where Data.scaled is at or above the
#                                requested quantile (NA where Temp is missing).
addSpikes <- function(subdf, quantilePercentage = 0.9) {
  out <- subdf[, c("Date", "Name", "Temp"), drop = FALSE]
  # scale() returns a one-column matrix; as.numeric() flattens it to a vector.
  out$Data.scaled <- as.numeric(scale(out$Temp))
  # na.rm = TRUE: quantile() stops with an error on missing values otherwise,
  # while scale() silently carries them through.
  # names = FALSE: return a bare number instead of a vector named e.g. "90%".
  quantileThreshold <- quantile(
    out$Data.scaled,
    quantilePercentage,
    na.rm = TRUE,
    names = FALSE
  )
  out$deviation_greater_than_2sd <- out$Data.scaled >= quantileThreshold
  out
}
现在阈值按分位数计算,您可以尝试不同的 quantilePercentage 来选出最合适的阈值。之后可以像上面那样继续:
# Apply addSpikes to every per-name data frame using its default arguments,
# then stack the results back into a single data frame.
spikesAdded <- lapply(nameDataSplits, addSpikes)
spikesAddedDF <- do.call("rbind",spikesAdded)
此外,如果您想实现更灵活的做法,可以把一个函数作为参数传给 addSpikes 函数:
# Default threshold generator: maps a numeric vector to a single cut-off value.
#
# x: numeric vector (in this example, the scaled temperatures of one group).
#
# Returns the 90% quantile of x, ignoring missing values. Swap the body for any
# other summary that yields one number (e.g. mean(x, na.rm = TRUE)) to change
# the rule.
#
# The original stub had a comment-only body and therefore returned NULL; a NULL
# threshold makes the comparison in addSpikes() yield a zero-length result.
myThreholdGeneratingFunction <- function(x) {
  quantile(x, 0.9, na.rm = TRUE, names = FALSE)
}
# Flag temperature spikes within one group, deriving the threshold with a
# caller-supplied function.
#
# subdf:                       data frame with at least the columns Date, Name
#                              and Temp.
# thresholdGeneratingFunction: function that receives the numeric vector of
#                              scaled temperatures and returns a single number;
#                              defaults to myThreholdGeneratingFunction.
#
# Returns a data frame holding Date, Name and Temp plus two new columns:
#   Data.scaled                - Temp centred and divided by its standard
#                                deviation, computed from the rows of subdf only;
#   deviation_greater_than_2sd - TRUE where Data.scaled is at or above the
#                                generated threshold.
addSpikes <- function(subdf, thresholdGeneratingFunction = myThreholdGeneratingFunction) {
  out <- subdf[, c("Date", "Name", "Temp"), drop = FALSE]
  # scale() returns a one-column matrix; as.numeric() flattens it to a vector.
  out$Data.scaled <- as.numeric(scale(out$Temp))
  # Use the function that was passed in. The earlier version ignored the
  # argument and always called the global myThreholdGeneratingFunction.
  threshold <- thresholdGeneratingFunction(out$Data.scaled)
  out$deviation_greater_than_2sd <- out$Data.scaled >= threshold
  out
}