为 R 中的 GLM 物种分布模型创建响应曲线的最佳方法?
best way to create response curves for a GLM species distribution model in R?
我是 运行 一个预测物种出现概率的二项式 GLM,我在一个数据集上训练并在另一个数据集上测试模型:
TrainingData<-read.csv("TrainingData.csv")[,-1]
TrainingData[,1]<-as.factor(TrainingData[,1])
TrainingData[,4]<-as.factor(TrainingData[,4])
TestData<-read.csv("TestData.csv")[,-1]
TestData[,1]<-as.factor(TestData[,1])
TestData[,4]<-as.factor(TestData[,4])
mod<-glm(presence~var1+var2+var3, family=binomial, data=TrainingData)
probs=predict(mod, TestData, type="response")
创建响应曲线以绘制存在概率与每个预测变量之间关系的最佳方法(或函数)是什么?
谢谢!
边际概率可以从 predict.glm 计算,类型 = "terms",
因为每一项都是在其余变量设置为平均值的情况下计算的。
使用 plogis(term + intercept) 将其转换回概率尺度。
其次,因为您的数据集包含连续值和因子的组合
对于您的预测变量,为每种类型制作了单独的图并合并
grid.arrange.
尽管这直接根据您提供的 glm 模型回答了您的问题,
我仍然建议检查您的预测变量的空间自相关
和响应变量,因为这可能会对您的最终模型产生影响。
library(reshape2)
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
TrainingData <- read.csv("~/Downloads/TrainingData.csv", header = TRUE)
TrainingData[['presence']] <- as.factor(TrainingData[['presence']])
TrainingData[['var3']] <- as.factor(TrainingData[['var3']])
TrainingData[['X']] <- NULL # Not used in the model
TestData <- read.csv("~/Downloads/TestData.csv", header = TRUE)
TestData[['presence']] <- as.factor(TestData[['presence']])
TestData[['var3']] <- as.factor(TestData[['var3']])
TestData[['X']] <- NULL
Presence/Absence 型号
mod <- glm(presence ~ var1 + var2 + var3, family = binomial, data = TrainingData)
获取每个居中变量的预测概率(即剩余变量设置为它们的平均值)。
mod_terms <- predict(mod, newdata = TestData, type = "terms")
mod_prob <- data.frame(idx = 1:nrow(TestData), plogis(mod_terms +
attr(mod_terms, "constant")))
mod_probg <- mod_prob %>% gather(variable, probability, -idx)
将测试数据融为长格式
TestData['idx'] <- 1:nrow(TestData) # Add index to data
TestData[['X']] <- NULL # Drop the X variable since it was not used in the model
data_long <- melt(TestData, id = c("presence","idx"))
data_long[['value']] <- as.numeric(data_df[['value']])
将测试数据与预测合并,并分离包含连续(var1 和 var2)和因子(var3)的数据。
# Merge Testdata with predictions
data_df <- merge(data_long, mod_probg, by = c("idx", "variable"))
data_df <- data_df %>% arrange(variable, value)
data_continuous <- data_df %>% filter(., variable != "var3") %>%
transform(value = as.numeric(value)) %>% arrange(variable, value)
data_factor <- data_df %>% filter(., variable == "var3") %>%
transform(value = as.factor(value))%>%
arrange(idx)
ggplot 输出
g_continuous <- ggplot(data_continuous, aes(x = value, y = probability)) + geom_point()+
facet_wrap(~variable, scales = "free_x")
g_factor <- ggplot(data = data_factor, aes(x = value, y = probability)) + geom_boxplot() +
facet_wrap(~variable)
grid.arrange(g_continuous, g_factor, nrow = 1)
我是 运行 一个预测物种出现概率的二项式 GLM,我在一个数据集上训练并在另一个数据集上测试模型:
TrainingData<-read.csv("TrainingData.csv")[,-1]
TrainingData[,1]<-as.factor(TrainingData[,1])
TrainingData[,4]<-as.factor(TrainingData[,4])
TestData<-read.csv("TestData.csv")[,-1]
TestData[,1]<-as.factor(TestData[,1])
TestData[,4]<-as.factor(TestData[,4])
mod<-glm(presence~var1+var2+var3, family=binomial, data=TrainingData)
probs=predict(mod, TestData, type="response")
创建响应曲线以绘制存在概率与每个预测变量之间关系的最佳方法(或函数)是什么?
谢谢!
边际概率可以从 predict.glm 计算,类型 = "terms",
因为每一项都是在其余变量设置为平均值的情况下计算的。
使用 plogis(term + intercept) 将其转换回概率尺度。
其次,因为您的数据集包含连续值和因子的组合 对于您的预测变量,为每种类型制作了单独的图并合并 grid.arrange.
尽管这直接根据您提供的 glm 模型回答了您的问题, 我仍然建议检查您的预测变量的空间自相关 和响应变量,因为这可能会对您的最终模型产生影响。
library(reshape2)
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
TrainingData <- read.csv("~/Downloads/TrainingData.csv", header = TRUE)
TrainingData[['presence']] <- as.factor(TrainingData[['presence']])
TrainingData[['var3']] <- as.factor(TrainingData[['var3']])
TrainingData[['X']] <- NULL # Not used in the model
TestData <- read.csv("~/Downloads/TestData.csv", header = TRUE)
TestData[['presence']] <- as.factor(TestData[['presence']])
TestData[['var3']] <- as.factor(TestData[['var3']])
TestData[['X']] <- NULL
Presence/Absence 型号
mod <- glm(presence ~ var1 + var2 + var3, family = binomial, data = TrainingData)
获取每个居中变量的预测概率(即剩余变量设置为它们的平均值)。
mod_terms <- predict(mod, newdata = TestData, type = "terms")
mod_prob <- data.frame(idx = 1:nrow(TestData), plogis(mod_terms +
attr(mod_terms, "constant")))
mod_probg <- mod_prob %>% gather(variable, probability, -idx)
将测试数据融为长格式
TestData['idx'] <- 1:nrow(TestData) # Add index to data
TestData[['X']] <- NULL # Drop the X variable since it was not used in the model
data_long <- melt(TestData, id = c("presence","idx"))
data_long[['value']] <- as.numeric(data_df[['value']])
将测试数据与预测合并,并分离包含连续(var1 和 var2)和因子(var3)的数据。
# Merge Testdata with predictions
data_df <- merge(data_long, mod_probg, by = c("idx", "variable"))
data_df <- data_df %>% arrange(variable, value)
data_continuous <- data_df %>% filter(., variable != "var3") %>%
transform(value = as.numeric(value)) %>% arrange(variable, value)
data_factor <- data_df %>% filter(., variable == "var3") %>%
transform(value = as.factor(value))%>%
arrange(idx)
ggplot 输出
g_continuous <- ggplot(data_continuous, aes(x = value, y = probability)) + geom_point()+
facet_wrap(~variable, scales = "free_x")
g_factor <- ggplot(data = data_factor, aes(x = value, y = probability)) + geom_boxplot() +
facet_wrap(~variable)
grid.arrange(g_continuous, g_factor, nrow = 1)