如何在模型输出中列出每个预测的概率
How to list probability for each prediction in model output
使用修改后的 Iris 数据集测试一些预测物种的模型。现在仅限于 SVM 和随机森林。 运行 这在 R-studio 中。
简化设置:
library(caret)
#data
data(iris)
#rename
dataset <- iris
#smaller sample
sample_data <- dataset[sample(nrow(dataset), 60), ]
#create some noise so model is less-than-perfect
noise_df <- data.frame(
Sepal.Length = c(5.7, 5.7, 5.7, 5.7, 5.7, 5.7, 5.7, 5.7, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0),
Sepal.Width = c(3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 3.1, 3.1, 3.1, 3.1, 3.1, 3.1),
Petal.Length = c(5.2, 5.2, 5.3, 5.3, 5.4, 5.4, 5.4, 5.4, 5.5, 5.5, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3),
Petal.Width = c(1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2),
Species = c("setosa","setosa", "setosa","setosa", "setosa","setosa","setosa","setosa", "setosa","setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica")
)
#combine sample with noise
dataset2 <- rbind(sample_data, noise_df)
#split data into train/test
set.seed(7)
validation_index <- createDataPartition(dataset2$Species, p=0.70, list=FALSE)
test_set <- dataset2[-validation_index,]
train_set <- dataset2[validation_index,]
#====================
#build models
#====================
control <- trainControl(method="cv", number=10)
metric <- "Accuracy"
#random forest model
set.seed(3)
fit.rf <- train(Species~., data=train_set, method="rf", metric=metric, trControl=control)
#svm model
set.seed(3)
fit.svm <- train(Species~., data=train_set, method="svmRadial", metric=metric, trControl=control)
#====================
#run model on test
#====================
predictions <- predict(fit.svm, test_set)
confusionMatrix(predictions, test_set$Species)
混淆矩阵输出:
Reference
Prediction setosa versicolor virginica
setosa 11 0 3
versicolor 0 3 0
virginica 0 1 5
我想知道是否可以列出每个预测的概率。例如:
setosa versicolor virginica predicted
1 0.9 0.0 0.1 setosa
2 0.1 0.8 0.1 versicolor
3 0.33 0.33 0.33 virginica
我猜想随机森林可能只列出了 0 和 1,但想知道 SVM 是否可以选择像上面的例子那样分解概率。如果是这样,我不确定如何塑造我的数据或要使用的功能。它是 decision_function 还是 predict_proba 函数,但我不清楚如何在 r.
中正确执行它
对于随机森林,概率是预测每个标签的决策树的比例,你可以使用 predict(.. , type="prob")
:
data.frame(predict(fit.rf,type="prob", newdata=test_set),
predicted=predict(fit.rf, newdata=test_set))
setosa versicolor virginica predicted
147 0.016 0.002 0.982 virginica
15 0.908 0.068 0.024 setosa
103 0.486 0.000 0.514 virginica
118 0.416 0.056 0.528 virginica
129 0.344 0.000 0.656 virginica
39 0.388 0.080 0.532 virginica
对于 kernlab svm,您需要设置 prob.model = TRUE
:
set.seed(3)
fit.svm <- train(Species~., data=train_set, method="svmRadial", metric=metric, trControl=control, prob.model = TRUE)
data.frame(predict(fit.svm,newdata=test_set,type="prob"),
predicted=predict(fit.svm,newdata=test_set))
setosa versicolor virginica predicted
1 0.129916071 0.051873046 0.81821088 virginica
2 0.884025291 0.030853736 0.08512097 setosa
3 0.129054108 0.006256384 0.86468951 virginica
4 0.104952659 0.124066424 0.77098092 virginica
使用修改后的 Iris 数据集测试一些预测物种的模型。现在仅限于 SVM 和随机森林。 运行 这在 R-studio 中。
简化设置:
library(caret)
#data
data(iris)
#rename
dataset <- iris
#smaller sample
sample_data <- dataset[sample(nrow(dataset), 60), ]
#create some noise so model is less-than-perfect
noise_df <- data.frame(
Sepal.Length = c(5.7, 5.7, 5.7, 5.7, 5.7, 5.7, 5.7, 5.7, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0),
Sepal.Width = c(3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 2.7, 2.7, 2.7, 2.8, 2.8, 2.8, 2.8, 2.8, 3.1, 3.1, 3.1, 3.1, 3.1, 3.1),
Petal.Length = c(5.2, 5.2, 5.3, 5.3, 5.4, 5.4, 5.4, 5.4, 5.5, 5.5, 5.5, 5.6, 5.6, 5.7, 5.7, 5.8, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3),
Petal.Width = c(1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2),
Species = c("setosa","setosa", "setosa","setosa", "setosa","setosa","setosa","setosa", "setosa","setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica")
)
#combine sample with noise
dataset2 <- rbind(sample_data, noise_df)
#split data into train/test
set.seed(7)
validation_index <- createDataPartition(dataset2$Species, p=0.70, list=FALSE)
test_set <- dataset2[-validation_index,]
train_set <- dataset2[validation_index,]
#====================
#build models
#====================
control <- trainControl(method="cv", number=10)
metric <- "Accuracy"
#random forest model
set.seed(3)
fit.rf <- train(Species~., data=train_set, method="rf", metric=metric, trControl=control)
#svm model
set.seed(3)
fit.svm <- train(Species~., data=train_set, method="svmRadial", metric=metric, trControl=control)
#====================
#run model on test
#====================
predictions <- predict(fit.svm, test_set)
confusionMatrix(predictions, test_set$Species)
混淆矩阵输出:
Reference
Prediction setosa versicolor virginica
setosa 11 0 3
versicolor 0 3 0
virginica 0 1 5
我想知道是否可以列出每个预测的概率。例如:
setosa versicolor virginica predicted
1 0.9 0.0 0.1 setosa
2 0.1 0.8 0.1 versicolor
3 0.33 0.33 0.33 virginica
我猜想随机森林可能只列出了 0 和 1,但想知道 SVM 是否可以选择像上面的例子那样分解概率。如果是这样,我不确定如何塑造我的数据或要使用的功能。它是 decision_function 还是 predict_proba 函数,但我不清楚如何在 r.
中正确执行它对于随机森林,概率是预测每个标签的决策树的比例,你可以使用 predict(.. , type="prob")
:
data.frame(predict(fit.rf,type="prob", newdata=test_set),
predicted=predict(fit.rf, newdata=test_set))
setosa versicolor virginica predicted
147 0.016 0.002 0.982 virginica
15 0.908 0.068 0.024 setosa
103 0.486 0.000 0.514 virginica
118 0.416 0.056 0.528 virginica
129 0.344 0.000 0.656 virginica
39 0.388 0.080 0.532 virginica
对于 kernlab svm,您需要设置 prob.model = TRUE
:
set.seed(3)
fit.svm <- train(Species~., data=train_set, method="svmRadial", metric=metric, trControl=control, prob.model = TRUE)
data.frame(predict(fit.svm,newdata=test_set,type="prob"),
predicted=predict(fit.svm,newdata=test_set))
setosa versicolor virginica predicted
1 0.129916071 0.051873046 0.81821088 virginica
2 0.884025291 0.030853736 0.08512097 setosa
3 0.129054108 0.006256384 0.86468951 virginica
4 0.104952659 0.124066424 0.77098092 virginica