计算分布拟合函数 R 的 y 值

Calculate y value for distribution fitting functions R

我正在为不同的分布函数绘制曲线,我需要知道每条曲线的最高 y 值。稍后我将只绘制一条曲线,它被选为最佳拟合。

这是函数(它有点硬编码,我正在研究它):

library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
        
        
fdistr <- function(d) {
  
  #  Uncomment to try  run line by line
  # d <- data_to_plot
  
  TLT <- d$TLT
  if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
  gev <- fgev(TLT, std.err=FALSE)
  distr <- c('norm', 'lnorm', 'weibull', 'gamma')
  fit <- lapply(X=distr, FUN=fitdist, data=TLT)
  fit[[5]] <- gev
  distr[5] <- 'gev'
  names(fit) <- distr
  Loglike <- sapply(X=fit, FUN=logLik)
  Loglike_Best <- which(Loglike == max(Loglike))
  
  #  Uncomment to try  run line by line
  # max <- which.max(density(d$TLT)$y)
  # max_density <- stats::density(d$TLT)$y[max]
  # max_y <- max_density
  
  x_data <- max(d$TLT)
  
  hist(TLT, prob=TRUE, breaks= x_data,
       main=paste(d$DLT_Code[1], 
                  '- best :',
                  names(Loglike[Loglike_Best])),
       sub = 'Total Lead Times',
       col='lightgrey',
       border='white'
       # ylim=  c(0,max_y)
  )
  
  lines(density(TLT),
        col='darkgrey',
        lty=2,
        lwd=2)
  
  grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
       lwd = .5, equilogs = TRUE)
  
  curve(dnorm(x, 
              mean=fit[['norm']]$estimate[1], 
              sd=fit[['norm']]$estimate[2]), 
        add=TRUE, col='blue', lwd=2)
  
  curve(dlnorm(x, 
               meanlog=fit[['lnorm']]$estimate[1], 
               sdlog=fit[['lnorm']]$estimate[2]), 
        add=TRUE, col='darkgreen', lwd=2)
  
  curve(dweibull(x, 
                 shape=fit[['weibull']]$estimate[1], 
                 scale=fit[['weibull']]$estimate[2]), 
        add=TRUE, col='purple', lwd=2)
  
  curve(dgamma(x, 
               shape=fit[['gamma']]$estimate[1], 
               rate=fit[['gamma']]$estimate[2]), 
        add=TRUE, col='Gold', lwd=2)
  
  
  curve(dgev(x, 
             loc=fit[['gev']]$estimate[1],
             scale=fit[['gev']]$estimate[2], 
             shape=fit[['gev']]$estimate[3]), 
        add=TRUE, col='red', lwd=2)
  
  
  legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
                         round(Loglike, digits=2))
  
  legend("topright", legend=legend_loglik, 
         col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
         lty=1, lwd=2,
         bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')  
  
  return(data.frame(DLT_Code = d$DLT_Code[1],
                    n = length(d$TLT),
                    Best = names(Loglike[Loglike_Best]),
                    lnorm = Loglike[1],
                    norm = Loglike[2],
                    weibul = Loglike[3],
                    gamma = Loglike[4],
                    GEV = Loglike[5]))
  
}



#  Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14),  rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))

data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))


DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))

我试着玩 max_y,然后在 ylim 中使用它。我只能对法线密度做到这一点,但不能对其余曲线做到这一点。

目前的情节是这样的(一些曲线被剪掉了):

如果设置 ylim = c(0,2)我们可以看到,lognormal和gamma分布超出了1:

我需要知道每条曲线的最大值,所以,当我选择打印哪条曲线时,要设置正确的 ylim

您可以使用 purrr::map_dbl 将函数 optimize 映射到您的密度上,前提是您稍微重新排列代码并且您知道要找到哪些输入值 maxima/the密度存在。

您可以提前使用任何参数设置密度,这样您就可以使用 optimize 找到它们的峰值,并将它们传递给 curve 函数。

作为一个可重现的小例子:

library(purrr)

# parameterize your densities
mynorm <- function(x) dnorm(x, mean = 0, sd = 1) 
mygamma <- function(x) dgamma(x, rate = .5, shape = 1) 

# get largest maximum over interval
ymax <- max(purrr::map_dbl(c(mynorm, mygamma), ~ optimize(., interval = c(0, 3), maximum = T)$objective))

# 0.4999811

# plot data
curve(mynorm, col = "blue", lwd = 2, xlim = c(0, 3), ylim = c(0, ymax * 1.1))
curve(mygamma, col = "red", lwd = 2, add = T)

使用您的代码,我已经实现了上述解决方案并调整了 curve 函数的 x 网格,以便在我们在评论中讨论后向您展示我的意思,使事情更加清晰和向您展示您实际应该绘制的内容:

library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
library(purrr) # <- add this library


fdistr <- function(d) {
  
  #  Uncomment to try  run line by line
  # d <- data_to_plot
  
  TLT <- d$TLT
  if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
  gev <- fgev(TLT, std.err=FALSE)
  distr <- c('norm', 'lnorm', 'weibull', 'gamma')
  fit <- lapply(X=distr, FUN=fitdist, data=TLT)
  fit[[5]] <- gev
  distr[5] <- 'gev'
  names(fit) <- distr
  Loglike <- sapply(X=fit, FUN=logLik)
  Loglike_Best <- which(Loglike == max(Loglike))
  
  #  Uncomment to try  run line by line
  # max <- which.max(density(d$TLT)$y)
  # max_density <- stats::density(d$TLT)$y[max]
  # max_y <- max_density
  
  x_data <- max(d$TLT)
  
  # parameterize your densities before plotting
  mynorm <- function(x) {
    dnorm(x, 
          mean=fit[['norm']]$estimate[1], 
          sd=fit[['norm']]$estimate[2])
  }
  
  mylnorm <- function(x){
    dlnorm(x, 
           meanlog=fit[['lnorm']]$estimate[1], 
           sdlog=fit[['lnorm']]$estimate[2])
  }
  
  myweibull <- function(x) {
    dweibull(x, 
             shape=fit[['weibull']]$estimate[1], 
             scale=fit[['weibull']]$estimate[2])
  }
  
  mygamma <- function(x) {
    dgamma(x, 
           shape=fit[['gamma']]$estimate[1], 
           rate=fit[['gamma']]$estimate[2])
  }
  
  mygev <- function(x){
    dgev(x, 
         loc=fit[['gev']]$estimate[1],
         scale=fit[['gev']]$estimate[2], 
         shape=fit[['gev']]$estimate[3])
  }
  
  distributions <- c(mynorm, mylnorm, myweibull, mygamma, mygev)
  
  # get the max of each density
  y <- purrr::map_dbl(distributions, ~ optimize(., interval = c(0, x_data), maximum = T)$objective)

  # find the max (excluding infinity)
  ymax <- max(y[abs(y) < Inf])
  
  
  hist(TLT, prob=TRUE, breaks= x_data,
       main=paste(d$DLT_Code[1], 
                  '- best :',
                  names(Loglike[Loglike_Best])),
       sub = 'Total Lead Times',
       col='lightgrey',
       border='white',
       ylim=  c(0, ymax)
  )
  
  lines(density(TLT),
        col='darkgrey',
        lty=2,
        lwd=2)
  
  grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
       lwd = .5, equilogs = TRUE)
  
  curve(mynorm, 
        add=TRUE, col='blue', lwd=2, n = 1E5) # <- increase x grid
  
  curve(mylnorm, 
        add=TRUE, col='darkgreen', lwd=2, n = 1E5) # <- increase x grid
  
  curve(myweibull, 
        add=TRUE, col='purple', lwd=2, n = 1E5) # <- increase x grid
  
  curve(mygamma, 
        add=TRUE, col='Gold', lwd=2, n = 1E5) # <- increase x grid
  
  
  curve(mygev, 
        add=TRUE, col='red', lwd=2, n = 1E5) # <- increase x grid
  
  
  legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
                         round(Loglike, digits=2))
  
  legend("topright", legend=legend_loglik, 
         col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
         lty=1, lwd=2,
         bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')  
  
  return(data.frame(DLT_Code = d$DLT_Code[1],
                    n = length(d$TLT),
                    Best = names(Loglike[Loglike_Best]),
                    lnorm = Loglike[1],
                    norm = Loglike[2],
                    weibul = Loglike[3],
                    gamma = Loglike[4],
                    GEV = Loglike[5]))
  
}



#  Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14),  rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))

data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))


DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))


为什么你的绘图高度与解决方案输出不匹配

为了进一步说明您的绘图发生了什么以及您可能遇到的一些困惑,您需要了解 curve 函数如何绘制您的数据。默认情况下 curve 取 101 个 x 值并对函数求值以获得它们的 y 值,然后将这些点绘制成一条线。由于某些密度的峰值非常尖锐,因此 curve 函数未评估足够的 x 值来绘制密度峰值。为了表明你想要我的意思是我将关注你的伽马密度。不要像输出那样担心代码。下面我有 n.

不同值的前几个 (x,y) 坐标
library(purrr)

mygamma <- function(x) {
  dgamma(x, 
         shape=fit[['gamma']]$estimate[1], # 0.6225622
         rate=fit[['gamma']]$estimate[2]) # 0.3568242
}

number_of_x <- c(5, 10, 101, 75000)
purrr::imap_dfr(number_of_x, ~ curve(mygamma, xlim = c(0, 6), n = .), .id = "n") %>% 
  dplyr::mutate_at(1, ~ sprintf("n = %i", number_of_x[as.numeric(.)])) %>% 
  dplyr::mutate(n = factor(n, unique(n))) %>% 
  dplyr::filter(x > 0) %>% 
  dplyr::group_by(n) %>% 
  dplyr::slice_min(order_by = x, n = 5)

 n                 x       y
   <fct>         <dbl>   <dbl>
 1 n = 5     1.5        0.184 
 2 n = 5     3          0.0828
 3 n = 5     4.5        0.0416
 4 n = 5     6          0.0219
 5 n = 10    0.667      0.336 
 6 n = 10    1.33       0.204 
 7 n = 10    2          0.138 
 8 n = 10    2.67       0.0975
 9 n = 10    3.33       0.0707
10 n = 101   0.06       1.04  
11 n = 101   0.12       0.780 
12 n = 101   0.18       0.655 
13 n = 101   0.24       0.575 
14 n = 101   0.3        0.518 
15 n = 75000 0.0000800 12.9   
16 n = 75000 0.000160   9.90  
17 n = 75000 0.000240   8.50  
18 n = 75000 0.000320   7.62  
19 n = 75000 0.000400   7.01  

请注意,当 n = 5 时,您绘制的值非常少。随着 n 的增加,x 值之间的距离变小。由于这些函数是连续的,因此可以绘制无限数量的点,但这无法通过计算完成,因此绘制了 x 值的子集以进行近似。 x 值越多,近似值越好。通常,默认值 n = 101 工作正常,但由于伽马和对数正态密度具有如此尖锐的峰值,绘图函数正在跨过最大值。下面是 n = 5, 10, 101, 75000 的完整数据图,添加了点。

最后我用了这个解决方案,发现:

mygamma <- function(x) dgamma(x, shape=fit[['gamma']]$estimate[1], 
                                  rate=fit[['gamma']]$estimate[2]) 
get_curve_values <- function(fn, x_data){
res <- curve(fn, from=0, to=x_data)
dev.off()
res
}
curve_val <- get_curve_values(mygamma, x_data)
ylim <- max(curve_val$y,na.rm = TRUE)