How to keep parameter constant when tuning model in caret in R?

The following code:

require(caret)
require(plyr)

# Load the Portuguese student performance data and drop columns not used as predictors
portuguese_scores = read.table("https://raw.githubusercontent.com/JimGorman17/Datasets/master/student-por.csv",sep=";",header=TRUE, stringsAsFactors = FALSE)
portuguese_scores <- portuguese_scores[,!names(portuguese_scores) %in% c("school", "age", "G1", "G2")]
# Binary target: is the final grade (G3) at or above the median? Drop G3 afterwards
median_score <- summary(portuguese_scores$G3)['Median']
portuguese_scores$score_gte_than_median <- as.factor(median_score<=portuguese_scores$G3)
portuguese_scores <- portuguese_scores[,!names(portuguese_scores) %in% c("G3")]

# Recode the categorical columns as numeric codes
portuguese_scores$sex <- as.numeric(mapvalues(portuguese_scores$sex, from = c("M", "F"), to = c(0, 1)))
portuguese_scores$address <- as.numeric(mapvalues(portuguese_scores$address, from = c("U", "R"), to = c(0, 1)))
portuguese_scores$famsize <- as.numeric(mapvalues(portuguese_scores$famsize, from = c("LE3", "GT3"), to = c(0, 1)))
portuguese_scores$Pstatus <- as.numeric(mapvalues(portuguese_scores$Pstatus, from = c("T", "A"), to = c(0, 1)))
portuguese_scores$Mjob <- as.numeric(mapvalues(portuguese_scores$Mjob, from = c("at_home","health","other","services","teacher"), to = c(0, 1,2,3,4)))
portuguese_scores$Fjob <- as.numeric(mapvalues(portuguese_scores$Fjob, from = c("at_home","health","other","services","teacher"), to = c(0, 1,2,3,4)))
portuguese_scores$reason <- as.numeric(mapvalues(portuguese_scores$reason, from = c("course","home","other","reputation"), to = c(0, 1,2,3)))
portuguese_scores$guardian <- as.numeric(mapvalues(portuguese_scores$guardian, from = c("father","mother","other"), to = c(0, 1,2)))
portuguese_scores$schoolsup <- as.numeric(mapvalues(portuguese_scores$schoolsup, from = c("no","yes"), to = c(0, 1)))
portuguese_scores$famsup <- as.numeric(mapvalues(portuguese_scores$famsup, from = c("no","yes"), to = c(0, 1)))
portuguese_scores$paid <- as.numeric(mapvalues(portuguese_scores$paid, from = c("no","yes"), to = c(0, 1)))
portuguese_scores$activities <- as.numeric(mapvalues(portuguese_scores$activities, from = c("no","yes"), to = c(0, 1)))
portuguese_scores$nursery <- as.numeric(mapvalues(portuguese_scores$nursery, from = c("no","yes"), to = c(0, 1)))
portuguese_scores$higher <- as.numeric(mapvalues(portuguese_scores$higher, from = c("no","yes"), to = c(0, 1)))
portuguese_scores$internet <- as.numeric(mapvalues(portuguese_scores$internet, from = c("no","yes"), to = c(0, 1)))
portuguese_scores$romantic <- as.numeric(mapvalues(portuguese_scores$romantic, from = c("no","yes"), to = c(0, 1)))

# Min-max normalize the 28 predictors; keep the factor target as column 29
normalize <- function(x){ return( (x - min(x) )/( max(x) - min(x) ) )}
port_n <- data.frame(lapply(portuguese_scores[1:28], normalize), portuguese_scores[29])

set.seed(123)

# 90/10 train/test split
train_sample <- sample(nrow(port_n), .9 * nrow(port_n))
port_train <- port_n[train_sample,]
port_test <- port_n[-train_sample,]

# Fit an RBF-kernel SVM; by default caret estimates sigma once and tunes only C
out1 <- train(port_train[,1:28], port_train[,29], method = "svmRadial")
out1

produces the following output:

Support Vector Machines with Radial Basis Function Kernel 

584 samples
 28 predictor
  2 classes: 'FALSE', 'TRUE' 

No pre-processing
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 584, 584, 584, 584, 584, 584, ... 
Resampling results across tuning parameters:

  C     Accuracy   Kappa      Accuracy SD  Kappa SD  
  0.25  0.7383930  0.4633478  0.02782725   0.05484469
  0.50  0.7382364  0.4637857  0.02883617   0.05763094
  1.00  0.7290191  0.4456935  0.02570423   0.05180727

Tuning parameter 'sigma' was held constant at a value of 0.02166535
Accuracy was used to select the optimal model using  the largest value.
The final values used for the model were sigma = 0.02166535 and C = 0.25. 

My question: how can I keep C constant (at 0.25) across the whole tuning run and tune only sigma?

Update (to all the voters):

To do this you need to use the tuneGrid argument. You create your own parameter pairs, and those are what get tested.

For example, since you want to test C = 0.25 in every case, you need to create a data.frame like this:

svmGrid <- data.frame(C=rep(0.25,10), sigma=1:10/100)

This has the same C value (0.25) throughout and different sigma values to optimize over. You have to supply the sigma values yourself (this is just an example - use as many as you like).
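
As a side note, the same grid is often written with expand.grid, which reads a little more clearly when one parameter is held fixed (this is just an equivalent way of building the data.frame above):

# Equivalent grid: C fixed at 0.25, sigma swept from 0.01 to 0.10 in steps of 0.01
svmGrid <- expand.grid(C = 0.25, sigma = seq(0.01, 0.10, by = 0.01))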

In other words, given the data.frame above, your SVM will be fitted 10 times. Each time C stays constant at 0.25 while sigma takes a value between 0.01 and 0.1 in steps of 0.01. The 10 candidates are evaluated, and the best combination is selected.
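
Each of those 10 candidates is evaluated with caret's default resampling (25 bootstrap reps, as the output shows). If you also want to change that, you can build a trainControl object and pass it to the train() call shown below via trControl; a minimal sketch, assuming 10-fold cross-validation is what you are after:

# Optional: 10-fold cross-validation instead of the default 25 bootstrap reps
# (pass this to train() below with trControl = ctrl)
ctrl <- trainControl(method = "cv", number = 10)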

Then you run the model like this:

#adding the tuneGrid argument
out1 <- train(port_train[,1:28], port_train[,29], method = "svmRadial", tuneGrid=svmGrid)

Output:

> out1
Support Vector Machines with Radial Basis Function Kernel 

584 samples
 28 predictor
  2 classes: 'FALSE', 'TRUE' 

No pre-processing
Resampling: Bootstrapped (25 reps) 

Summary of sample sizes: 584, 584, 584, 584, 584, 584, ... 

Resampling results across tuning parameters:

  sigma  Accuracy   Kappa      Accuracy SD  Kappa SD  
  0.01   0.7297315  0.4417768  0.03082764   0.06044173
  0.02   0.7312643  0.4474754  0.03289345   0.06567919
  0.03   0.7301472  0.4468033  0.03618417   0.07187019
  0.04   0.7288286  0.4463212  0.03609275   0.07200966
  0.05   0.7281374  0.4466735  0.03569426   0.07055105
  0.06   0.7238098  0.4400315  0.03348371   0.06666725
  0.07   0.7213752  0.4364012  0.03467845   0.06849882
  0.08   0.7175949  0.4286502  0.04013475   0.08014780
  0.09   0.7042396  0.3981745  0.04346037   0.08864786
  0.10   0.6651296  0.3061489  0.06450228   0.14079631

Tuning parameter 'C' was held constant at a value of 0.25
Accuracy was used to select the optimal model using  the largest value.
The final values used for the model were sigma = 0.02 and C = 0.25. 

And there you have your optimized sigma!
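
If you want to confirm the winning combination and score the 10% test split that was set aside earlier, a short follow-up sketch using standard caret calls (out1 here is the tuneGrid fit above):

# Best parameter pair chosen during tuning (should be sigma = 0.02, C = 0.25 here)
out1$bestTune

# Predict on the held-out test set and summarize accuracy
preds <- predict(out1, port_test[,1:28])
confusionMatrix(preds, port_test[,29])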