计算分布拟合函数 R 的 y 值
Calculate y value for distribution fitting functions R
我正在为不同的分布函数绘制曲线,我需要知道每条曲线的最高 y 值。稍后我将只绘制一条曲线,它被选为最佳拟合。
这是函数(它有点硬编码,我正在研究它):
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
gev <- fgev(TLT, std.err=FALSE)
distr <- c('norm', 'lnorm', 'weibull', 'gamma')
fit <- lapply(X=distr, FUN=fitdist, data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit, FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
hist(TLT, prob=TRUE, breaks= x_data,
main=paste(d$DLT_Code[1],
'- best :',
names(Loglike[Loglike_Best])),
sub = 'Total Lead Times',
col='lightgrey',
border='white'
# ylim= c(0,max_y)
)
lines(density(TLT),
col='darkgrey',
lty=2,
lwd=2)
grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
lwd = .5, equilogs = TRUE)
curve(dnorm(x,
mean=fit[['norm']]$estimate[1],
sd=fit[['norm']]$estimate[2]),
add=TRUE, col='blue', lwd=2)
curve(dlnorm(x,
meanlog=fit[['lnorm']]$estimate[1],
sdlog=fit[['lnorm']]$estimate[2]),
add=TRUE, col='darkgreen', lwd=2)
curve(dweibull(x,
shape=fit[['weibull']]$estimate[1],
scale=fit[['weibull']]$estimate[2]),
add=TRUE, col='purple', lwd=2)
curve(dgamma(x,
shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2]),
add=TRUE, col='Gold', lwd=2)
curve(dgev(x,
loc=fit[['gev']]$estimate[1],
scale=fit[['gev']]$estimate[2],
shape=fit[['gev']]$estimate[3]),
add=TRUE, col='red', lwd=2)
legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
round(Loglike, digits=2))
legend("topright", legend=legend_loglik,
col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
lty=1, lwd=2,
bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],
n = length(d$TLT),
Best = names(Loglike[Loglike_Best]),
lnorm = Loglike[1],
norm = Loglike[2],
weibul = Loglike[3],
gamma = Loglike[4],
GEV = Loglike[5]))
}
# Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14), rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
我试着玩 max_y
,然后在 ylim
中使用它。我只能对法线密度做到这一点,但不能对其余曲线做到这一点。
目前的情节是这样的(一些曲线被剪掉了):
如果设置 ylim = c(0,2)
我们可以看到,lognormal和gamma分布超出了1:
我需要知道每条曲线的最大值,所以,当我选择打印哪条曲线时,要设置正确的 ylim
。
您可以使用 purrr::map_dbl
将函数 optimize
映射到您的密度上,前提是您稍微重新排列代码并且您知道要找到哪些输入值 maxima/the密度存在。
您可以提前使用任何参数设置密度,这样您就可以使用 optimize
找到它们的峰值,并将它们传递给 curve
函数。
作为一个可重现的小例子:
library(purrr)
# parameterize your densities
mynorm <- function(x) dnorm(x, mean = 0, sd = 1)
mygamma <- function(x) dgamma(x, rate = .5, shape = 1)
# get largest maximum over interval
ymax <- max(purrr::map_dbl(c(mynorm, mygamma), ~ optimize(., interval = c(0, 3), maximum = T)$objective))
# 0.4999811
# plot data
curve(mynorm, col = "blue", lwd = 2, xlim = c(0, 3), ylim = c(0, ymax * 1.1))
curve(mygamma, col = "red", lwd = 2, add = T)
使用您的代码,我已经实现了上述解决方案并调整了 curve
函数的 x
网格,以便在我们在评论中讨论后向您展示我的意思,使事情更加清晰和向您展示您实际应该绘制的内容:
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
library(purrr) # <- add this library
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
gev <- fgev(TLT, std.err=FALSE)
distr <- c('norm', 'lnorm', 'weibull', 'gamma')
fit <- lapply(X=distr, FUN=fitdist, data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit, FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
# parameterize your densities before plotting
mynorm <- function(x) {
dnorm(x,
mean=fit[['norm']]$estimate[1],
sd=fit[['norm']]$estimate[2])
}
mylnorm <- function(x){
dlnorm(x,
meanlog=fit[['lnorm']]$estimate[1],
sdlog=fit[['lnorm']]$estimate[2])
}
myweibull <- function(x) {
dweibull(x,
shape=fit[['weibull']]$estimate[1],
scale=fit[['weibull']]$estimate[2])
}
mygamma <- function(x) {
dgamma(x,
shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2])
}
mygev <- function(x){
dgev(x,
loc=fit[['gev']]$estimate[1],
scale=fit[['gev']]$estimate[2],
shape=fit[['gev']]$estimate[3])
}
distributions <- c(mynorm, mylnorm, myweibull, mygamma, mygev)
# get the max of each density
y <- purrr::map_dbl(distributions, ~ optimize(., interval = c(0, x_data), maximum = T)$objective)
# find the max (excluding infinity)
ymax <- max(y[abs(y) < Inf])
hist(TLT, prob=TRUE, breaks= x_data,
main=paste(d$DLT_Code[1],
'- best :',
names(Loglike[Loglike_Best])),
sub = 'Total Lead Times',
col='lightgrey',
border='white',
ylim= c(0, ymax)
)
lines(density(TLT),
col='darkgrey',
lty=2,
lwd=2)
grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
lwd = .5, equilogs = TRUE)
curve(mynorm,
add=TRUE, col='blue', lwd=2, n = 1E5) # <- increase x grid
curve(mylnorm,
add=TRUE, col='darkgreen', lwd=2, n = 1E5) # <- increase x grid
curve(myweibull,
add=TRUE, col='purple', lwd=2, n = 1E5) # <- increase x grid
curve(mygamma,
add=TRUE, col='Gold', lwd=2, n = 1E5) # <- increase x grid
curve(mygev,
add=TRUE, col='red', lwd=2, n = 1E5) # <- increase x grid
legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
round(Loglike, digits=2))
legend("topright", legend=legend_loglik,
col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
lty=1, lwd=2,
bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],
n = length(d$TLT),
Best = names(Loglike[Loglike_Best]),
lnorm = Loglike[1],
norm = Loglike[2],
weibul = Loglike[3],
gamma = Loglike[4],
GEV = Loglike[5]))
}
# Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14), rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
为什么你的绘图高度与解决方案输出不匹配
为了进一步说明您的绘图发生了什么以及您可能遇到的一些困惑,您需要了解 curve
函数如何绘制您的数据。默认情况下 curve
取 101 个 x 值并对函数求值以获得它们的 y 值,然后将这些点绘制成一条线。由于某些密度的峰值非常尖锐,因此 curve
函数未评估足够的 x 值来绘制密度峰值。为了表明你想要我的意思是我将关注你的伽马密度。不要像输出那样担心代码。下面我有 n
.
不同值的前几个 (x,y) 坐标
library(purrr)
mygamma <- function(x) {
dgamma(x,
shape=fit[['gamma']]$estimate[1], # 0.6225622
rate=fit[['gamma']]$estimate[2]) # 0.3568242
}
number_of_x <- c(5, 10, 101, 75000)
purrr::imap_dfr(number_of_x, ~ curve(mygamma, xlim = c(0, 6), n = .), .id = "n") %>%
dplyr::mutate_at(1, ~ sprintf("n = %i", number_of_x[as.numeric(.)])) %>%
dplyr::mutate(n = factor(n, unique(n))) %>%
dplyr::filter(x > 0) %>%
dplyr::group_by(n) %>%
dplyr::slice_min(order_by = x, n = 5)
n x y
<fct> <dbl> <dbl>
1 n = 5 1.5 0.184
2 n = 5 3 0.0828
3 n = 5 4.5 0.0416
4 n = 5 6 0.0219
5 n = 10 0.667 0.336
6 n = 10 1.33 0.204
7 n = 10 2 0.138
8 n = 10 2.67 0.0975
9 n = 10 3.33 0.0707
10 n = 101 0.06 1.04
11 n = 101 0.12 0.780
12 n = 101 0.18 0.655
13 n = 101 0.24 0.575
14 n = 101 0.3 0.518
15 n = 75000 0.0000800 12.9
16 n = 75000 0.000160 9.90
17 n = 75000 0.000240 8.50
18 n = 75000 0.000320 7.62
19 n = 75000 0.000400 7.01
请注意,当 n = 5
时,您绘制的值非常少。随着 n
的增加,x 值之间的距离变小。由于这些函数是连续的,因此可以绘制无限数量的点,但这无法通过计算完成,因此绘制了 x 值的子集以进行近似。 x 值越多,近似值越好。通常,默认值 n = 101
工作正常,但由于伽马和对数正态密度具有如此尖锐的峰值,绘图函数正在跨过最大值。下面是 n = 5, 10, 101, 75000
的完整数据图,添加了点。
最后我用了这个解决方案,发现:
mygamma <- function(x) dgamma(x, shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2])
get_curve_values <- function(fn, x_data){
res <- curve(fn, from=0, to=x_data)
dev.off()
res
}
curve_val <- get_curve_values(mygamma, x_data)
ylim <- max(curve_val$y,na.rm = TRUE)
我正在为不同的分布函数绘制曲线,我需要知道每条曲线的最高 y 值。稍后我将只绘制一条曲线,它被选为最佳拟合。
这是函数(它有点硬编码,我正在研究它):
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
gev <- fgev(TLT, std.err=FALSE)
distr <- c('norm', 'lnorm', 'weibull', 'gamma')
fit <- lapply(X=distr, FUN=fitdist, data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit, FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
hist(TLT, prob=TRUE, breaks= x_data,
main=paste(d$DLT_Code[1],
'- best :',
names(Loglike[Loglike_Best])),
sub = 'Total Lead Times',
col='lightgrey',
border='white'
# ylim= c(0,max_y)
)
lines(density(TLT),
col='darkgrey',
lty=2,
lwd=2)
grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
lwd = .5, equilogs = TRUE)
curve(dnorm(x,
mean=fit[['norm']]$estimate[1],
sd=fit[['norm']]$estimate[2]),
add=TRUE, col='blue', lwd=2)
curve(dlnorm(x,
meanlog=fit[['lnorm']]$estimate[1],
sdlog=fit[['lnorm']]$estimate[2]),
add=TRUE, col='darkgreen', lwd=2)
curve(dweibull(x,
shape=fit[['weibull']]$estimate[1],
scale=fit[['weibull']]$estimate[2]),
add=TRUE, col='purple', lwd=2)
curve(dgamma(x,
shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2]),
add=TRUE, col='Gold', lwd=2)
curve(dgev(x,
loc=fit[['gev']]$estimate[1],
scale=fit[['gev']]$estimate[2],
shape=fit[['gev']]$estimate[3]),
add=TRUE, col='red', lwd=2)
legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
round(Loglike, digits=2))
legend("topright", legend=legend_loglik,
col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
lty=1, lwd=2,
bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],
n = length(d$TLT),
Best = names(Loglike[Loglike_Best]),
lnorm = Loglike[1],
norm = Loglike[2],
weibul = Loglike[3],
gamma = Loglike[4],
GEV = Loglike[5]))
}
# Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14), rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
我试着玩 max_y
,然后在 ylim
中使用它。我只能对法线密度做到这一点,但不能对其余曲线做到这一点。
目前的情节是这样的(一些曲线被剪掉了):
如果设置 ylim = c(0,2)
我们可以看到,lognormal和gamma分布超出了1:
我需要知道每条曲线的最大值,所以,当我选择打印哪条曲线时,要设置正确的 ylim
。
您可以使用 purrr::map_dbl
将函数 optimize
映射到您的密度上,前提是您稍微重新排列代码并且您知道要找到哪些输入值 maxima/the密度存在。
您可以提前使用任何参数设置密度,这样您就可以使用 optimize
找到它们的峰值,并将它们传递给 curve
函数。
作为一个可重现的小例子:
library(purrr)
# parameterize your densities
mynorm <- function(x) dnorm(x, mean = 0, sd = 1)
mygamma <- function(x) dgamma(x, rate = .5, shape = 1)
# get largest maximum over interval
ymax <- max(purrr::map_dbl(c(mynorm, mygamma), ~ optimize(., interval = c(0, 3), maximum = T)$objective))
# 0.4999811
# plot data
curve(mynorm, col = "blue", lwd = 2, xlim = c(0, 3), ylim = c(0, ymax * 1.1))
curve(mygamma, col = "red", lwd = 2, add = T)
使用您的代码,我已经实现了上述解决方案并调整了 curve
函数的 x
网格,以便在我们在评论中讨论后向您展示我的意思,使事情更加清晰和向您展示您实际应该绘制的内容:
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
library(purrr) # <- add this library
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
gev <- fgev(TLT, std.err=FALSE)
distr <- c('norm', 'lnorm', 'weibull', 'gamma')
fit <- lapply(X=distr, FUN=fitdist, data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit, FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
# parameterize your densities before plotting
mynorm <- function(x) {
dnorm(x,
mean=fit[['norm']]$estimate[1],
sd=fit[['norm']]$estimate[2])
}
mylnorm <- function(x){
dlnorm(x,
meanlog=fit[['lnorm']]$estimate[1],
sdlog=fit[['lnorm']]$estimate[2])
}
myweibull <- function(x) {
dweibull(x,
shape=fit[['weibull']]$estimate[1],
scale=fit[['weibull']]$estimate[2])
}
mygamma <- function(x) {
dgamma(x,
shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2])
}
mygev <- function(x){
dgev(x,
loc=fit[['gev']]$estimate[1],
scale=fit[['gev']]$estimate[2],
shape=fit[['gev']]$estimate[3])
}
distributions <- c(mynorm, mylnorm, myweibull, mygamma, mygev)
# get the max of each density
y <- purrr::map_dbl(distributions, ~ optimize(., interval = c(0, x_data), maximum = T)$objective)
# find the max (excluding infinity)
ymax <- max(y[abs(y) < Inf])
hist(TLT, prob=TRUE, breaks= x_data,
main=paste(d$DLT_Code[1],
'- best :',
names(Loglike[Loglike_Best])),
sub = 'Total Lead Times',
col='lightgrey',
border='white',
ylim= c(0, ymax)
)
lines(density(TLT),
col='darkgrey',
lty=2,
lwd=2)
grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
lwd = .5, equilogs = TRUE)
curve(mynorm,
add=TRUE, col='blue', lwd=2, n = 1E5) # <- increase x grid
curve(mylnorm,
add=TRUE, col='darkgreen', lwd=2, n = 1E5) # <- increase x grid
curve(myweibull,
add=TRUE, col='purple', lwd=2, n = 1E5) # <- increase x grid
curve(mygamma,
add=TRUE, col='Gold', lwd=2, n = 1E5) # <- increase x grid
curve(mygev,
add=TRUE, col='red', lwd=2, n = 1E5) # <- increase x grid
legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
round(Loglike, digits=2))
legend("topright", legend=legend_loglik,
col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
lty=1, lwd=2,
bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],
n = length(d$TLT),
Best = names(Loglike[Loglike_Best]),
lnorm = Loglike[1],
norm = Loglike[2],
weibul = Loglike[3],
gamma = Loglike[4],
GEV = Loglike[5]))
}
# Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14), rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
为什么你的绘图高度与解决方案输出不匹配
为了进一步说明您的绘图发生了什么以及您可能遇到的一些困惑,您需要了解 curve
函数如何绘制您的数据。默认情况下 curve
取 101 个 x 值并对函数求值以获得它们的 y 值,然后将这些点绘制成一条线。由于某些密度的峰值非常尖锐,因此 curve
函数未评估足够的 x 值来绘制密度峰值。为了表明你想要我的意思是我将关注你的伽马密度。不要像输出那样担心代码。下面我有 n
.
library(purrr)
mygamma <- function(x) {
dgamma(x,
shape=fit[['gamma']]$estimate[1], # 0.6225622
rate=fit[['gamma']]$estimate[2]) # 0.3568242
}
number_of_x <- c(5, 10, 101, 75000)
purrr::imap_dfr(number_of_x, ~ curve(mygamma, xlim = c(0, 6), n = .), .id = "n") %>%
dplyr::mutate_at(1, ~ sprintf("n = %i", number_of_x[as.numeric(.)])) %>%
dplyr::mutate(n = factor(n, unique(n))) %>%
dplyr::filter(x > 0) %>%
dplyr::group_by(n) %>%
dplyr::slice_min(order_by = x, n = 5)
n x y
<fct> <dbl> <dbl>
1 n = 5 1.5 0.184
2 n = 5 3 0.0828
3 n = 5 4.5 0.0416
4 n = 5 6 0.0219
5 n = 10 0.667 0.336
6 n = 10 1.33 0.204
7 n = 10 2 0.138
8 n = 10 2.67 0.0975
9 n = 10 3.33 0.0707
10 n = 101 0.06 1.04
11 n = 101 0.12 0.780
12 n = 101 0.18 0.655
13 n = 101 0.24 0.575
14 n = 101 0.3 0.518
15 n = 75000 0.0000800 12.9
16 n = 75000 0.000160 9.90
17 n = 75000 0.000240 8.50
18 n = 75000 0.000320 7.62
19 n = 75000 0.000400 7.01
请注意,当 n = 5
时,您绘制的值非常少。随着 n
的增加,x 值之间的距离变小。由于这些函数是连续的,因此可以绘制无限数量的点,但这无法通过计算完成,因此绘制了 x 值的子集以进行近似。 x 值越多,近似值越好。通常,默认值 n = 101
工作正常,但由于伽马和对数正态密度具有如此尖锐的峰值,绘图函数正在跨过最大值。下面是 n = 5, 10, 101, 75000
的完整数据图,添加了点。
最后我用了这个解决方案,发现
mygamma <- function(x) dgamma(x, shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2])
get_curve_values <- function(fn, x_data){
res <- curve(fn, from=0, to=x_data)
dev.off()
res
}
curve_val <- get_curve_values(mygamma, x_data)
ylim <- max(curve_val$y,na.rm = TRUE)