数据框中多个组的平滑预测

Smooth prediction of several groups within a dataframe

我正在尝试使用非线性回归 (NLR) 函数来预测值 (y) 随时间 (x) 的变化,然后计算预测值达到最大值(最佳值)的时间).我得到了关于实际测量值 (y) 的预测,这很好,但是这些预测被锚定到 x 值,这意味着我只能以特定的增量获得预测值。这可以在下面的图片中看到。

Predicted values (Line) over actual values (Points).

这意味着计算出的最优值将始终为 x 值之一,但我正在使用此 NLR 函数对 y 处于最优值的时间进行数学合理估计。

我不知道问题是否出在我获取这些值的方法中,但这里有一个示例:

dat <- structure(list(measure = structure(c(1L, 12L, 13L, 14L, 15L, 
16L, 17L, 18L, 19L, 2L, 3L, 1L, 12L, 13L, 14L, 15L, 16L, 17L, 
18L, 19L, 1L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 2L, 3L, 
4L, 5L, 1L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 2L, 3L, 4L, 
5L), .Label = c("L1", "L10", "L11", "L12", "L13", "L14", "L15", 
"L16", "L17", "L18", "L19", "L2", "L3", "L4", "L5", "L6", "L7", 
"L8", "L9"), class = "factor"), sample = structure(c(64L, 64L, 
64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L, 65L, 65L, 65L, 65L, 
65L, 65L, 65L, 65L, 65L, 66L, 66L, 66L, 66L, 66L, 66L, 66L, 66L, 
66L, 66L, 66L, 66L, 66L, 67L, 67L, 67L, 67L, 67L, 67L, 67L, 67L, 
67L, 67L, 67L, 67L, 67L), .Label = c("010719A", "010719B", "010719C", 
"020419A", "020419B", "020419C", "040219A", "040219B", "040219C", 
"040319A", "040319B", "040319C", "050219A", "050219B", "050219C", 
"060519B", "070519A", "070519B", "070519C", "080419A", "080419B", 
"080419C", "080719A", "080719B", "080719C", "090419A", "090419B", 
"090419C", "100419A", "100419B", "100419C", "110219A", "110219B", 
"110219C", "110319A", "110319B", "110319C", "110619A", "110619B", 
"110619C", "120609A", "120609B", "120609C", "130519A", "130519B", 
"130519C", "140519A", "140519B", "140519C", "150419A", "150419B", 
"150419C", "170619A", "170619B", "170619C", "180219B", "180219C", 
"180319A", "180319B", "180319C", "180619A", "180619B", "180619C", 
"220119A", "220119C", "230119A", "230119B", "230119C", "250219A", 
"250219B", "250219C", "250319A", "250319B", "250319C", "260319A", 
"260319B", "260319C", "280119A", "280119B", "280119C", "290119A", 
"290119B", "290119C", "300119A", "300119B", "300119C"), class = "factor"), 
y = c(0, 10, 10, 13.33, 16.67, 16.67, 26.67, 13.33, 30, 36.67, 
26.67, 0, 3.33, 3.33, 10, 16.67, 16.67, 3.33, 3.33, 0, 0, 
0, 11.43, 20, 14.29, 14.29, 20, 14.29, 2.86, 17.14, 28.57, 
34.29, 11.43, 0, 2.94, 2.94, 11.76, 20.59, 20.59, 23.53, 
20.59, 14.71, 17.65, 32.35, 20.59, 8.82), x = c(0, 5.833, 
8.667, 12, 14.667, 16.833, 23.667, 29.833, 32.833, 35.833, 
38.583, 0, 5.833, 8.667, 12, 14.667, 16.833, 23.667, 29.833, 
32.833, 0, 5.833, 8.833, 11.917, 14.667, 16.917, 23.667, 
29.833, 32.833, 35.833, 38.833, 41.583, 47.833, 0, 5.833, 
8.833, 11.917, 14.667, 16.917, 23.667, 29.833, 32.833, 35.833, 
38.833, 41.583, 47.833)), row.names = c(NA, -46L), class = c("tbl_df", 
"tbl", "data.frame"))

这是我正在使用的代码片段。这是我如何得到每个 x 和 y 值的预测。

library(tidyverse)
library(modelr)

samples <- dat$sample[dat$measure == "L1"]
output <- tibble(predictions = c(0))

for (i in seq_along(samples)) {
  df <- tibble(ex = dat$x[dat$sample == samples[i]],
               why = dat$y[dat$sample == samples[i]])

  nlm <- nls(df$why ~ alpha * df$ex^beta * exp((-gamma) * df$ex),
             data = df,
             start = list(alpha = 1.5, beta = 1.85, gamma = 0.095),
             control = list(maxiter = 10000))

  output <- add_row(output, predictions = predict(nlm, newdata = df$ex))

  output <- output %>% 
    mutate(predictions = round(predictions, digits = 2))
}

output <- output[-1,]

dat <- dat %>% 
  mutate(pred = output$predictions)

从中制作一个 ggplot 会产生与上图相同的结果。 简而言之,我不知道如何在图形的两个或多个点之间平滑地推断(插值?),然后计算该图形(线)何时处于最佳状态。有什么方法可以预测这些点之间的关系吗?是否可以迭代完成?我需要执行此操作的完整数据中有近 100 个样本。

很快:

您可以在使用 predict 时定义一个新的数据框:

df <- dat[dat$sample == dat$sample[1],]
nlm <- nls(y ~ alpha * x^beta * exp((-gamma) * x),
           data = df,
           start = list(alpha = 1.5, beta = 1.85, gamma = 0.095),
           control = list(maxiter = 10000))
predicted <- data.frame(x = seq(min(df$x),max(df$x),0.01),
                        y = predict(nlm,newdata = data.frame(x = seq(min(df$x),max(df$x),0.01))))

在这里它给了你很多分数,这应该能让你找到你的最大值。但是:

  • 当你使用一个模型时,你可以做一些数学运算来从估计的系数中得到最大值,我认为这会更好。在这里你可以计算导数并找到函数maxima
  • 如果你想要一些局部最大值而不能计算导数,那么你可以尝试从你的估计中估计导数,并找到导数的零点