数据框中多个组的平滑预测
Smooth prediction of several groups within a dataframe
我正在尝试使用非线性回归 (NLR) 函数来预测值 (y) 随时间 (x) 的变化,然后计算预测值达到最大值(最佳值)的时间).我得到了关于实际测量值 (y) 的预测,这很好,但是这些预测被锚定到 x 值,这意味着我只能以特定的增量获得预测值。这可以在下面的图片中看到。
Predicted values (Line) over actual values (Points).
这意味着计算出的最优值将始终为 x 值之一,但我正在使用此 NLR 函数对 y 处于最优值的时间进行数学合理估计。
我不知道问题是否出在我获取这些值的方法中,但这里有一个示例:
dat <- structure(list(measure = structure(c(1L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 2L, 3L, 1L, 12L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 1L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 2L, 3L,
4L, 5L, 1L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 2L, 3L, 4L,
5L), .Label = c("L1", "L10", "L11", "L12", "L13", "L14", "L15",
"L16", "L17", "L18", "L19", "L2", "L3", "L4", "L5", "L6", "L7",
"L8", "L9"), class = "factor"), sample = structure(c(64L, 64L,
64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L, 65L, 65L, 65L, 65L,
65L, 65L, 65L, 65L, 65L, 66L, 66L, 66L, 66L, 66L, 66L, 66L, 66L,
66L, 66L, 66L, 66L, 66L, 67L, 67L, 67L, 67L, 67L, 67L, 67L, 67L,
67L, 67L, 67L, 67L, 67L), .Label = c("010719A", "010719B", "010719C",
"020419A", "020419B", "020419C", "040219A", "040219B", "040219C",
"040319A", "040319B", "040319C", "050219A", "050219B", "050219C",
"060519B", "070519A", "070519B", "070519C", "080419A", "080419B",
"080419C", "080719A", "080719B", "080719C", "090419A", "090419B",
"090419C", "100419A", "100419B", "100419C", "110219A", "110219B",
"110219C", "110319A", "110319B", "110319C", "110619A", "110619B",
"110619C", "120609A", "120609B", "120609C", "130519A", "130519B",
"130519C", "140519A", "140519B", "140519C", "150419A", "150419B",
"150419C", "170619A", "170619B", "170619C", "180219B", "180219C",
"180319A", "180319B", "180319C", "180619A", "180619B", "180619C",
"220119A", "220119C", "230119A", "230119B", "230119C", "250219A",
"250219B", "250219C", "250319A", "250319B", "250319C", "260319A",
"260319B", "260319C", "280119A", "280119B", "280119C", "290119A",
"290119B", "290119C", "300119A", "300119B", "300119C"), class = "factor"),
y = c(0, 10, 10, 13.33, 16.67, 16.67, 26.67, 13.33, 30, 36.67,
26.67, 0, 3.33, 3.33, 10, 16.67, 16.67, 3.33, 3.33, 0, 0,
0, 11.43, 20, 14.29, 14.29, 20, 14.29, 2.86, 17.14, 28.57,
34.29, 11.43, 0, 2.94, 2.94, 11.76, 20.59, 20.59, 23.53,
20.59, 14.71, 17.65, 32.35, 20.59, 8.82), x = c(0, 5.833,
8.667, 12, 14.667, 16.833, 23.667, 29.833, 32.833, 35.833,
38.583, 0, 5.833, 8.667, 12, 14.667, 16.833, 23.667, 29.833,
32.833, 0, 5.833, 8.833, 11.917, 14.667, 16.917, 23.667,
29.833, 32.833, 35.833, 38.833, 41.583, 47.833, 0, 5.833,
8.833, 11.917, 14.667, 16.917, 23.667, 29.833, 32.833, 35.833,
38.833, 41.583, 47.833)), row.names = c(NA, -46L), class = c("tbl_df",
"tbl", "data.frame"))
这是我正在使用的代码片段。这是我如何得到每个 x 和 y 值的预测。
library(tidyverse)
library(modelr)
samples <- dat$sample[dat$measure == "L1"]
output <- tibble(predictions = c(0))
for (i in seq_along(samples)) {
df <- tibble(ex = dat$x[dat$sample == samples[i]],
why = dat$y[dat$sample == samples[i]])
nlm <- nls(df$why ~ alpha * df$ex^beta * exp((-gamma) * df$ex),
data = df,
start = list(alpha = 1.5, beta = 1.85, gamma = 0.095),
control = list(maxiter = 10000))
output <- add_row(output, predictions = predict(nlm, newdata = df$ex))
output <- output %>%
mutate(predictions = round(predictions, digits = 2))
}
output <- output[-1,]
dat <- dat %>%
mutate(pred = output$predictions)
从中制作一个 ggplot 会产生与上图相同的结果。
简而言之,我不知道如何在图形的两个或多个点之间平滑地推断(插值?),然后计算该图形(线)何时处于最佳状态。有什么方法可以预测这些点之间的关系吗?是否可以迭代完成?我需要执行此操作的完整数据中有近 100 个样本。
很快:
您可以在使用 predict
时定义一个新的数据框:
df <- dat[dat$sample == dat$sample[1],]
nlm <- nls(y ~ alpha * x^beta * exp((-gamma) * x),
data = df,
start = list(alpha = 1.5, beta = 1.85, gamma = 0.095),
control = list(maxiter = 10000))
predicted <- data.frame(x = seq(min(df$x),max(df$x),0.01),
y = predict(nlm,newdata = data.frame(x = seq(min(df$x),max(df$x),0.01))))
在这里它给了你很多分数,这应该能让你找到你的最大值。但是:
- 当你使用一个模型时,你可以做一些数学运算来从估计的系数中得到最大值,我认为这会更好。在这里你可以计算导数并找到函数maxima
- 如果你想要一些局部最大值而不能计算导数,那么你可以尝试从你的估计中估计导数,并找到导数的零点
我正在尝试使用非线性回归 (NLR) 函数来预测值 (y) 随时间 (x) 的变化,然后计算预测值达到最大值(最佳值)的时间).我得到了关于实际测量值 (y) 的预测,这很好,但是这些预测被锚定到 x 值,这意味着我只能以特定的增量获得预测值。这可以在下面的图片中看到。
Predicted values (Line) over actual values (Points).
这意味着计算出的最优值将始终为 x 值之一,但我正在使用此 NLR 函数对 y 处于最优值的时间进行数学合理估计。
我不知道问题是否出在我获取这些值的方法中,但这里有一个示例:
dat <- structure(list(measure = structure(c(1L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 2L, 3L, 1L, 12L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 1L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 2L, 3L,
4L, 5L, 1L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 2L, 3L, 4L,
5L), .Label = c("L1", "L10", "L11", "L12", "L13", "L14", "L15",
"L16", "L17", "L18", "L19", "L2", "L3", "L4", "L5", "L6", "L7",
"L8", "L9"), class = "factor"), sample = structure(c(64L, 64L,
64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L, 65L, 65L, 65L, 65L,
65L, 65L, 65L, 65L, 65L, 66L, 66L, 66L, 66L, 66L, 66L, 66L, 66L,
66L, 66L, 66L, 66L, 66L, 67L, 67L, 67L, 67L, 67L, 67L, 67L, 67L,
67L, 67L, 67L, 67L, 67L), .Label = c("010719A", "010719B", "010719C",
"020419A", "020419B", "020419C", "040219A", "040219B", "040219C",
"040319A", "040319B", "040319C", "050219A", "050219B", "050219C",
"060519B", "070519A", "070519B", "070519C", "080419A", "080419B",
"080419C", "080719A", "080719B", "080719C", "090419A", "090419B",
"090419C", "100419A", "100419B", "100419C", "110219A", "110219B",
"110219C", "110319A", "110319B", "110319C", "110619A", "110619B",
"110619C", "120609A", "120609B", "120609C", "130519A", "130519B",
"130519C", "140519A", "140519B", "140519C", "150419A", "150419B",
"150419C", "170619A", "170619B", "170619C", "180219B", "180219C",
"180319A", "180319B", "180319C", "180619A", "180619B", "180619C",
"220119A", "220119C", "230119A", "230119B", "230119C", "250219A",
"250219B", "250219C", "250319A", "250319B", "250319C", "260319A",
"260319B", "260319C", "280119A", "280119B", "280119C", "290119A",
"290119B", "290119C", "300119A", "300119B", "300119C"), class = "factor"),
y = c(0, 10, 10, 13.33, 16.67, 16.67, 26.67, 13.33, 30, 36.67,
26.67, 0, 3.33, 3.33, 10, 16.67, 16.67, 3.33, 3.33, 0, 0,
0, 11.43, 20, 14.29, 14.29, 20, 14.29, 2.86, 17.14, 28.57,
34.29, 11.43, 0, 2.94, 2.94, 11.76, 20.59, 20.59, 23.53,
20.59, 14.71, 17.65, 32.35, 20.59, 8.82), x = c(0, 5.833,
8.667, 12, 14.667, 16.833, 23.667, 29.833, 32.833, 35.833,
38.583, 0, 5.833, 8.667, 12, 14.667, 16.833, 23.667, 29.833,
32.833, 0, 5.833, 8.833, 11.917, 14.667, 16.917, 23.667,
29.833, 32.833, 35.833, 38.833, 41.583, 47.833, 0, 5.833,
8.833, 11.917, 14.667, 16.917, 23.667, 29.833, 32.833, 35.833,
38.833, 41.583, 47.833)), row.names = c(NA, -46L), class = c("tbl_df",
"tbl", "data.frame"))
这是我正在使用的代码片段。这是我如何得到每个 x 和 y 值的预测。
library(tidyverse)
library(modelr)
samples <- dat$sample[dat$measure == "L1"]
output <- tibble(predictions = c(0))
for (i in seq_along(samples)) {
df <- tibble(ex = dat$x[dat$sample == samples[i]],
why = dat$y[dat$sample == samples[i]])
nlm <- nls(df$why ~ alpha * df$ex^beta * exp((-gamma) * df$ex),
data = df,
start = list(alpha = 1.5, beta = 1.85, gamma = 0.095),
control = list(maxiter = 10000))
output <- add_row(output, predictions = predict(nlm, newdata = df$ex))
output <- output %>%
mutate(predictions = round(predictions, digits = 2))
}
output <- output[-1,]
dat <- dat %>%
mutate(pred = output$predictions)
从中制作一个 ggplot 会产生与上图相同的结果。 简而言之,我不知道如何在图形的两个或多个点之间平滑地推断(插值?),然后计算该图形(线)何时处于最佳状态。有什么方法可以预测这些点之间的关系吗?是否可以迭代完成?我需要执行此操作的完整数据中有近 100 个样本。
很快:
您可以在使用 predict
时定义一个新的数据框:
df <- dat[dat$sample == dat$sample[1],]
nlm <- nls(y ~ alpha * x^beta * exp((-gamma) * x),
data = df,
start = list(alpha = 1.5, beta = 1.85, gamma = 0.095),
control = list(maxiter = 10000))
predicted <- data.frame(x = seq(min(df$x),max(df$x),0.01),
y = predict(nlm,newdata = data.frame(x = seq(min(df$x),max(df$x),0.01))))
在这里它给了你很多分数,这应该能让你找到你的最大值。但是:
- 当你使用一个模型时,你可以做一些数学运算来从估计的系数中得到最大值,我认为这会更好。在这里你可以计算导数并找到函数maxima
- 如果你想要一些局部最大值而不能计算导数,那么你可以尝试从你的估计中估计导数,并找到导数的零点