如何在 R 中对 SVM 进行预测?
How to do the prediction for SVM in R?
我有 2 个数据集,train_val
和 test
。我想构建 3 个模型并使用这些模型来预测结果。这是我的 3 个模型:
# Model 1: random forest on all predictors.
# `type` is an argument of predict(), not of randomForest(); passing it at
# training time is silently ignored, so it is omitted here.
rf.model <- randomForest(Survived ~ ., data = train_val)
# Model 2: SVM with a linear kernel.
# gamma is not used by the linear kernel, so only cost is set.
svm.model.linear <- svm(Survived ~ ., data = train_val, kernel = "linear", cost = 2)
# Model 3: SVM with a radial kernel (uses both cost and gamma).
svm.model.radial <- svm(
  Survived ~ .,
  data = train_val, kernel = "radial", cost = 10, gamma = 0.1
)
用上面的代码训练完3个模型后,我用下面的代码做预测:
# Score the rows of `test` with the fitted random forest.
prediction <- predict(object = rf.model, newdata = test)
prediction
的输出是:
然后我将结果放入数据框中:
# One row per test passenger: IDs 892..1309 next to the predicted value.
df <- data.frame(
  PassengerId = 892:1309,
  Survived = prediction
)
df
的输出是
到目前为止一切正常,但是,当我将 prediction
中的 rf.model
替换为 svm.model.linear
时,输出发生了变化:
因此,数据框也出错了:
我可以知道这是什么原因吗?我应该如何获得与之前使用 rf.model
时相同的 df
输出?任何帮助将不胜感激!
以下是帖子中的完整代码:
library(dplyr)
library(tidyr)
library(ggplot2)
library(Amelia)
library(corrgram)
library(caret)
library(randomForest)
library(e1071)
# Load the Titanic training and test CSV files.
train <- read.csv(file = "C:/Users/User/Desktop/Titanic/train.csv")
test <- read.csv(file = "C:/Users/User/Desktop/Titanic/test.csv")

# Peek at the first rows of each table to confirm the import worked.
head(train)
head(test)

# Tag every row with its origin so the two tables can be separated again
# after stacking. The test table gets an all-NA Survived column so that its
# columns match those of train for rbind().
train[["set"]] <- "train"
test[["set"]] <- "test"
test[["Survived"]] <- NA
full <- rbind(train, test)

# Inspect the combined table.
summary(full)
str(full)
# Remove the PassengerId, Ticket and Cabin columns from the combined table.
full <- full %>%
  dplyr::select(-PassengerId, -Ticket, -Cabin)

# Dependents is the sum of the SibSp and Parch columns.
full$Dependents <- full$SibSp + full$Parch
head(full)

# Fraction of NA values in every column ...
missing_values <- full %>%
  summarise(across(everything(), ~ sum(is.na(.)) / length(.)))
# ... reshaped to one row per feature.
missing_values <- missing_values %>%
  pivot_longer(
    cols = everything(),
    names_to = "feature",
    values_to = "missing_pct"
  )

# Horizontal bar chart of the missing fraction per feature, labelled in
# percent and ordered by decreasing missing fraction.
ggplot(
  missing_values,
  aes(x = reorder(feature, -missing_pct), y = missing_pct, label = missing_pct)
) +
  geom_text(hjust = 0, aes(label = scales::percent(missing_pct))) +
  geom_col(fill = "red") +
  coord_flip() +
  scale_y_continuous(labels = scales::percent)
# Encode Sex as a factor and keep a numeric copy of its level codes so it can
# take part in the correlogram below.
full[["Sex"]] <- factor(full[["Sex"]])
full[["Sex.factor"]] <- as.numeric(full[["Sex"]])
str(full)

# Correlogram over the training rows, leaving out Name, Embarked, set, SibSp
# and Parch.
full %>%
  filter(set == "train") %>%
  select(-Name, -Embarked, -set, -SibSp, -Parch) %>%
  corrgram(lower.panel = panel.shade, upper.panel = panel.pie)

# From here on the outcome and the passenger class are categorical.
full[["Survived"]] <- as.factor(full[["Survived"]])
full[["Pclass"]] <- as.factor(full[["Pclass"]])

# Training rows only: counts per Sex and per Pclass, coloured by Survived.
full %>%
  filter(set == "train") %>%
  ggplot(aes(Sex)) +
  geom_bar(aes(fill = Survived))
full %>%
  filter(set == "train") %>%
  ggplot(aes(Pclass)) +
  geom_bar(aes(fill = Survived))

# Training rows only: 10-bin histograms of Age and Dependents by Survived.
full %>%
  filter(set == "train") %>%
  ggplot(aes(Age)) +
  geom_histogram(bins = 10, aes(fill = Survived))
full %>%
  filter(set == "train") %>%
  ggplot(aes(Dependents)) +
  geom_histogram(bins = 10, aes(fill = Survived))

# All rows: Age distribution by passenger class and by sex.
full %>%
  ggplot(aes(x = Pclass, y = Age)) +
  geom_boxplot(aes(fill = Pclass))
full %>%
  ggplot(aes(x = Sex, y = Age)) +
  geom_boxplot(aes(fill = Sex))
# Median age per passenger class, computed over rows with a known Age.
# Missing values are excluded with is.na() rather than by comparing against
# the string "NA".
p1MedAge <- full %>%
  filter(Pclass == 1, !is.na(Age)) %>%
  summarise(median(Age)) %>%
  as.numeric()
p2MedAge <- full %>%
  filter(Pclass == 2, !is.na(Age)) %>%
  summarise(median(Age)) %>%
  as.numeric()
p3MedAge <- full %>%
  filter(Pclass == 3, !is.na(Age)) %>%
  summarise(median(Age)) %>%
  as.numeric()

# Fill every missing Age in the combined table (train and test rows alike)
# with the median age of that passenger's class.
full$Age[is.na(full$Age) & full$Pclass == 1] <- p1MedAge
full$Age[is.na(full$Age) & full$Pclass == 2] <- p2MedAge
full$Age[is.na(full$Age) & full$Pclass == 3] <- p3MedAge

# Should print FALSE: no missing ages remain.
anyNA(full$Age)
# Show the rows whose Fare is missing.
full[is.na(full$Fare), ]

# Replace every missing Fare with the rounded mean fare of Pclass 3.
# NOTE(review): this uses the class-3 mean regardless of the row's own class;
# presumably the rows printed above are all class 3 -- confirm.
full$Fare[is.na(full$Fare)] <- round(
  mean(subset(full$Fare, full$Pclass == 3), na.rm = TRUE),
  0
)

# Final check: any NA left outside the Survived column?
any(is.na(subset(full, select = -c(Survived))))

# Drop the columns that are not used as predictors. select() is qualified
# with dplyr:: to match the earlier column drop and to stay robust against
# masking by other attached packages.
full <- full %>% dplyr::select(-Name, -SibSp, -Parch, -Sex.factor)
str(full)

# Treat Dependents and Embarked as categorical.
full$Dependents <- factor(full$Dependents)
full$Embarked <- factor(full$Embarked)
str(full)

# Split the combined table back into its two origins and drop the marker.
train <- full %>%
  filter(set == "train") %>%
  dplyr::select(-set)
test <- full %>%
  filter(set == "test") %>%
  dplyr::select(-set)
# Split the labelled data 80/20 into a fitting set (train_val) and a hold-out
# set (test_val), partitioned on Survived.
# NOTE(review): no seed is set here, so the split differs between runs.
ind <- createDataPartition(train$Survived, times = 1, p = 0.8, list = FALSE)
train_val <- train[ind, ]
test_val <- train[-ind, ]

# Class distribution in percent for the full, fitting and hold-out sets.
# The scaling by 100 is applied after prop.table(); scaling the counts before
# normalising has no effect on the proportions.
round(prop.table(table(train$Survived)) * 100, digits = 3)
round(prop.table(table(train_val$Survived)) * 100, digits = 3)
round(prop.table(table(test_val$Survived)) * 100, digits = 3)
# Logistic regression on every predictor except Fare and Embarked.
log.model <- glm(
  formula = Survived ~ . - Fare - Embarked,
  family = binomial(link = "logit"),
  data = train_val
)
summary(log.model)

# Predicted probabilities on the hold-out rows, cut at 0.5 into 0/1 labels.
glm.prediction <- predict(log.model, newdata = test_val, type = "response")
glm.prediction <- ifelse(glm.prediction >= 0.5, 1, 0)

# Confusion table (rows: observed, columns: predicted) and hold-out accuracy.
table(test_val$Survived, glm.prediction)
sum(test_val$Survived == glm.prediction) / nrow(test_val)
# Random forest on all predictors, fitted on the fitting set.
rf.model <- randomForest(formula = Survived ~ ., data = train_val)

# Confusion matrix stored on the fitted model, then variable importance.
print(rf.model[["confusion"]])
importance(rf.model)

# Hold-out predictions, confusion table (rows: observed) and accuracy.
rf.prediction <- predict(rf.model, newdata = test_val)
table(test_val$Survived, rf.prediction)
sum(test_val$Survived == rf.prediction) / nrow(test_val)
# Grid search over SVM hyper-parameters on the labelled data.
# The linear kernel does not use gamma, so only cost is searched there;
# searching a gamma grid as well would refit identical models five times.
tuned.svm.linear <- tune.svm(
  Survived ~ .,
  data = train,
  kernel = "linear",
  cost = c(0.01, 0.1, 0.2, 0.5, 0.8, 1, 2, 3, 5, 10)
)
summary(tuned.svm.linear)

# The radial kernel depends on both cost and gamma, so both are searched.
tuned.svm.radial <- tune.svm(
  Survived ~ .,
  data = train,
  kernel = "radial",
  cost = c(0.01, 0.1, 0.2, 0.5, 0.8, 1, 2, 3, 5, 10),
  gamma = c(0.1, 0.5, 1, 2, 5)
)
summary(tuned.svm.radial)
# Fit both SVM variants on the fitting set.
# gamma is omitted for the linear kernel because that kernel does not use it.
svm.model.linear <- svm(
  Survived ~ .,
  data = train_val, kernel = "linear", cost = 2
)
svm.model.radial <- svm(
  Survived ~ .,
  data = train_val, kernel = "radial", cost = 10, gamma = 0.1
)

# Linear kernel: hold-out predictions, confusion table and accuracy.
svm.prediction.linear <- predict(svm.model.linear, test_val)
table(test_val$Survived, svm.prediction.linear)
sum(test_val$Survived == svm.prediction.linear) / nrow(test_val)

# Radial kernel: hold-out predictions, confusion table and accuracy.
svm.prediction.radial <- predict(svm.model.radial, test_val)
table(test_val$Survived, svm.prediction.radial)
sum(test_val$Survived == svm.prediction.radial) / nrow(test_val)
# Predicting Survival on the test data set.
# Refit the models on all labelled rows before scoring the unlabelled ones.
# `type` is a predict() argument and is ignored by randomForest(), so it is
# not passed at training time.
rf.model <- randomForest(Survived ~ ., data = train)
svm.model.linear <- svm(Survived ~ ., data = train, kernel = "linear", cost = 2)

# test$Survived is entirely NA. predict() on an svm model fitted through the
# formula interface removes every newdata row that contains an NA, which
# would leave zero rows to predict, so the empty outcome column is dropped.
test_features <- test[setdiff(names(test), "Survived")]
prediction <- predict(svm.model.linear, newdata = test_features)

# Fail early with a clear message if any test row was dropped.
stopifnot(length(prediction) == nrow(test))

submission <- data.frame(PassengerId = 892:1309, Survived = prediction)
write.csv(submission, file = "submission.csv", row.names = FALSE)
paste("Your submission was successfully saved!")
这是我运行代码时出错的地方
如你所见,当我运行到 submission <- data.frame(PassengerId = 892:1309, Survived = prediction) 这一行时,出现了错误。
您把 test 表用作了 newdata。
您应该使用 test_val,它经过了与 train_val 相同的处理。而您现在是用 train_val 进行训练,却用 test 作为 newdata。
如果您对 test_val 表进行预测,svm 和随机森林模型都可以正常工作,并且会给出 177 个预测值。
您还需要把 submission 这个 data.frame 改为 177 行,而不是 418 行。
编辑
正如评论中所讨论的(尽管它们现在已被删除?),您想使用基于 train
数据构建的模型来预测 test
数据。
试试这个:
# Refit the linear SVM on the full labelled training set.
svm.model.linear <- svm(
  Survived ~ .,
  data = train, kernel = "linear", cost = 2, gamma = 0.1
)
# Predict on test without its first column (presumably the all-NA Survived
# column -- confirm the column order before relying on the position).
svm.prediction.linear <- predict(svm.model.linear, newdata = test[, -1])
predict 函数对 R 中不同模型的行为略有不同,这可能会造成混淆。当您把它用于 svm 模型时,实际调用的是 predict.svm()。这个函数无法处理带有空(全为 NA 的)Survived 列的 newdata。如果您通过指定 newdata=test[,-1] 删除该列,预测就会按预期工作。
我有 2 个数据集,train_val
和 test
。我想构建 3 个模型并使用这些模型来预测结果。这是我的 3 个模型:
# Model 1: random forest on all predictors.
# `type` is an argument of predict(), not of randomForest(); passing it at
# training time is silently ignored, so it is omitted here.
rf.model <- randomForest(Survived ~ ., data = train_val)
# Model 2: SVM with a linear kernel.
# gamma is not used by the linear kernel, so only cost is set.
svm.model.linear <- svm(Survived ~ ., data = train_val, kernel = "linear", cost = 2)
# Model 3: SVM with a radial kernel (uses both cost and gamma).
svm.model.radial <- svm(
  Survived ~ .,
  data = train_val, kernel = "radial", cost = 10, gamma = 0.1
)
用上面的代码训练完3个模型后,我用下面的代码做预测:
# Score the rows of `test` with the fitted random forest.
prediction <- predict(object = rf.model, newdata = test)
prediction
的输出是:
然后我将结果放入数据框中:
# One row per test passenger: IDs 892..1309 next to the predicted value.
df <- data.frame(
  PassengerId = 892:1309,
  Survived = prediction
)
df
的输出是
到目前为止一切正常,但是,当我将 prediction
中的 rf.model
替换为 svm.model.linear
时,输出发生了变化:
因此,数据框也出错了:
我可以知道这是什么原因吗?我应该如何获得与之前使用 rf.model
时相同的 df
输出?任何帮助将不胜感激!
以下是帖子中的完整代码:
library(dplyr)
library(tidyr)
library(ggplot2)
library(Amelia)
library(corrgram)
library(caret)
library(randomForest)
library(e1071)
# Load the Titanic training and test CSV files.
train <- read.csv(file = "C:/Users/User/Desktop/Titanic/train.csv")
test <- read.csv(file = "C:/Users/User/Desktop/Titanic/test.csv")

# Peek at the first rows of each table to confirm the import worked.
head(train)
head(test)

# Tag every row with its origin so the two tables can be separated again
# after stacking. The test table gets an all-NA Survived column so that its
# columns match those of train for rbind().
train[["set"]] <- "train"
test[["set"]] <- "test"
test[["Survived"]] <- NA
full <- rbind(train, test)

# Inspect the combined table.
summary(full)
str(full)
# Remove the PassengerId, Ticket and Cabin columns from the combined table.
full <- full %>%
  dplyr::select(-PassengerId, -Ticket, -Cabin)

# Dependents is the sum of the SibSp and Parch columns.
full$Dependents <- full$SibSp + full$Parch
head(full)

# Fraction of NA values in every column ...
missing_values <- full %>%
  summarise(across(everything(), ~ sum(is.na(.)) / length(.)))
# ... reshaped to one row per feature.
missing_values <- missing_values %>%
  pivot_longer(
    cols = everything(),
    names_to = "feature",
    values_to = "missing_pct"
  )

# Horizontal bar chart of the missing fraction per feature, labelled in
# percent and ordered by decreasing missing fraction.
ggplot(
  missing_values,
  aes(x = reorder(feature, -missing_pct), y = missing_pct, label = missing_pct)
) +
  geom_text(hjust = 0, aes(label = scales::percent(missing_pct))) +
  geom_col(fill = "red") +
  coord_flip() +
  scale_y_continuous(labels = scales::percent)
# Encode Sex as a factor and keep a numeric copy of its level codes so it can
# take part in the correlogram below.
full[["Sex"]] <- factor(full[["Sex"]])
full[["Sex.factor"]] <- as.numeric(full[["Sex"]])
str(full)

# Correlogram over the training rows, leaving out Name, Embarked, set, SibSp
# and Parch.
full %>%
  filter(set == "train") %>%
  select(-Name, -Embarked, -set, -SibSp, -Parch) %>%
  corrgram(lower.panel = panel.shade, upper.panel = panel.pie)

# From here on the outcome and the passenger class are categorical.
full[["Survived"]] <- as.factor(full[["Survived"]])
full[["Pclass"]] <- as.factor(full[["Pclass"]])

# Training rows only: counts per Sex and per Pclass, coloured by Survived.
full %>%
  filter(set == "train") %>%
  ggplot(aes(Sex)) +
  geom_bar(aes(fill = Survived))
full %>%
  filter(set == "train") %>%
  ggplot(aes(Pclass)) +
  geom_bar(aes(fill = Survived))

# Training rows only: 10-bin histograms of Age and Dependents by Survived.
full %>%
  filter(set == "train") %>%
  ggplot(aes(Age)) +
  geom_histogram(bins = 10, aes(fill = Survived))
full %>%
  filter(set == "train") %>%
  ggplot(aes(Dependents)) +
  geom_histogram(bins = 10, aes(fill = Survived))

# All rows: Age distribution by passenger class and by sex.
full %>%
  ggplot(aes(x = Pclass, y = Age)) +
  geom_boxplot(aes(fill = Pclass))
full %>%
  ggplot(aes(x = Sex, y = Age)) +
  geom_boxplot(aes(fill = Sex))
# Median age per passenger class, computed over rows with a known Age.
# Missing values are excluded with is.na() rather than by comparing against
# the string "NA".
p1MedAge <- full %>%
  filter(Pclass == 1, !is.na(Age)) %>%
  summarise(median(Age)) %>%
  as.numeric()
p2MedAge <- full %>%
  filter(Pclass == 2, !is.na(Age)) %>%
  summarise(median(Age)) %>%
  as.numeric()
p3MedAge <- full %>%
  filter(Pclass == 3, !is.na(Age)) %>%
  summarise(median(Age)) %>%
  as.numeric()

# Fill every missing Age in the combined table (train and test rows alike)
# with the median age of that passenger's class.
full$Age[is.na(full$Age) & full$Pclass == 1] <- p1MedAge
full$Age[is.na(full$Age) & full$Pclass == 2] <- p2MedAge
full$Age[is.na(full$Age) & full$Pclass == 3] <- p3MedAge

# Should print FALSE: no missing ages remain.
anyNA(full$Age)
# Show the rows whose Fare is missing.
full[is.na(full$Fare), ]

# Replace every missing Fare with the rounded mean fare of Pclass 3.
# NOTE(review): this uses the class-3 mean regardless of the row's own class;
# presumably the rows printed above are all class 3 -- confirm.
full$Fare[is.na(full$Fare)] <- round(
  mean(subset(full$Fare, full$Pclass == 3), na.rm = TRUE),
  0
)

# Final check: any NA left outside the Survived column?
any(is.na(subset(full, select = -c(Survived))))

# Drop the columns that are not used as predictors. select() is qualified
# with dplyr:: to match the earlier column drop and to stay robust against
# masking by other attached packages.
full <- full %>% dplyr::select(-Name, -SibSp, -Parch, -Sex.factor)
str(full)

# Treat Dependents and Embarked as categorical.
full$Dependents <- factor(full$Dependents)
full$Embarked <- factor(full$Embarked)
str(full)

# Split the combined table back into its two origins and drop the marker.
train <- full %>%
  filter(set == "train") %>%
  dplyr::select(-set)
test <- full %>%
  filter(set == "test") %>%
  dplyr::select(-set)
# Split the labelled data 80/20 into a fitting set (train_val) and a hold-out
# set (test_val), partitioned on Survived.
# NOTE(review): no seed is set here, so the split differs between runs.
ind <- createDataPartition(train$Survived, times = 1, p = 0.8, list = FALSE)
train_val <- train[ind, ]
test_val <- train[-ind, ]

# Class distribution in percent for the full, fitting and hold-out sets.
# The scaling by 100 is applied after prop.table(); scaling the counts before
# normalising has no effect on the proportions.
round(prop.table(table(train$Survived)) * 100, digits = 3)
round(prop.table(table(train_val$Survived)) * 100, digits = 3)
round(prop.table(table(test_val$Survived)) * 100, digits = 3)
# Logistic regression on every predictor except Fare and Embarked.
log.model <- glm(
  formula = Survived ~ . - Fare - Embarked,
  family = binomial(link = "logit"),
  data = train_val
)
summary(log.model)

# Predicted probabilities on the hold-out rows, cut at 0.5 into 0/1 labels.
glm.prediction <- predict(log.model, newdata = test_val, type = "response")
glm.prediction <- ifelse(glm.prediction >= 0.5, 1, 0)

# Confusion table (rows: observed, columns: predicted) and hold-out accuracy.
table(test_val$Survived, glm.prediction)
sum(test_val$Survived == glm.prediction) / nrow(test_val)
# Random forest on all predictors, fitted on the fitting set.
rf.model <- randomForest(formula = Survived ~ ., data = train_val)

# Confusion matrix stored on the fitted model, then variable importance.
print(rf.model[["confusion"]])
importance(rf.model)

# Hold-out predictions, confusion table (rows: observed) and accuracy.
rf.prediction <- predict(rf.model, newdata = test_val)
table(test_val$Survived, rf.prediction)
sum(test_val$Survived == rf.prediction) / nrow(test_val)
# Grid search over SVM hyper-parameters on the labelled data.
# The linear kernel does not use gamma, so only cost is searched there;
# searching a gamma grid as well would refit identical models five times.
tuned.svm.linear <- tune.svm(
  Survived ~ .,
  data = train,
  kernel = "linear",
  cost = c(0.01, 0.1, 0.2, 0.5, 0.8, 1, 2, 3, 5, 10)
)
summary(tuned.svm.linear)

# The radial kernel depends on both cost and gamma, so both are searched.
tuned.svm.radial <- tune.svm(
  Survived ~ .,
  data = train,
  kernel = "radial",
  cost = c(0.01, 0.1, 0.2, 0.5, 0.8, 1, 2, 3, 5, 10),
  gamma = c(0.1, 0.5, 1, 2, 5)
)
summary(tuned.svm.radial)
# Fit both SVM variants on the fitting set.
# gamma is omitted for the linear kernel because that kernel does not use it.
svm.model.linear <- svm(
  Survived ~ .,
  data = train_val, kernel = "linear", cost = 2
)
svm.model.radial <- svm(
  Survived ~ .,
  data = train_val, kernel = "radial", cost = 10, gamma = 0.1
)

# Linear kernel: hold-out predictions, confusion table and accuracy.
svm.prediction.linear <- predict(svm.model.linear, test_val)
table(test_val$Survived, svm.prediction.linear)
sum(test_val$Survived == svm.prediction.linear) / nrow(test_val)

# Radial kernel: hold-out predictions, confusion table and accuracy.
svm.prediction.radial <- predict(svm.model.radial, test_val)
table(test_val$Survived, svm.prediction.radial)
sum(test_val$Survived == svm.prediction.radial) / nrow(test_val)
# Predicting Survival on the test data set.
# Refit the models on all labelled rows before scoring the unlabelled ones.
# `type` is a predict() argument and is ignored by randomForest(), so it is
# not passed at training time.
rf.model <- randomForest(Survived ~ ., data = train)
svm.model.linear <- svm(Survived ~ ., data = train, kernel = "linear", cost = 2)

# test$Survived is entirely NA. predict() on an svm model fitted through the
# formula interface removes every newdata row that contains an NA, which
# would leave zero rows to predict, so the empty outcome column is dropped.
test_features <- test[setdiff(names(test), "Survived")]
prediction <- predict(svm.model.linear, newdata = test_features)

# Fail early with a clear message if any test row was dropped.
stopifnot(length(prediction) == nrow(test))

submission <- data.frame(PassengerId = 892:1309, Survived = prediction)
write.csv(submission, file = "submission.csv", row.names = FALSE)
paste("Your submission was successfully saved!")
这是我运行代码时出错的地方
如你所见,当我运行到 submission <- data.frame(PassengerId = 892:1309, Survived = prediction) 这一行时,出现了错误。
您把 test 表用作了 newdata。
您应该使用 test_val,它经过了与 train_val 相同的处理。而您现在是用 train_val 进行训练,却用 test 作为 newdata。
如果您对 test_val 表进行预测,svm 和随机森林模型都可以正常工作,并且会给出 177 个预测值。
您还需要把 submission 这个 data.frame 改为 177 行,而不是 418 行。
编辑
正如评论中所讨论的(尽管它们现在已被删除?),您想使用基于 train
数据构建的模型来预测 test
数据。
试试这个:
# Refit the linear SVM on the full labelled training set.
svm.model.linear <- svm(
  Survived ~ .,
  data = train, kernel = "linear", cost = 2, gamma = 0.1
)
# Predict on test without its first column (presumably the all-NA Survived
# column -- confirm the column order before relying on the position).
svm.prediction.linear <- predict(svm.model.linear, newdata = test[, -1])
predict 函数对 R 中不同模型的行为略有不同,这可能会造成混淆。当您把它用于 svm 模型时,实际调用的是 predict.svm()。这个函数无法处理带有空(全为 NA 的)Survived 列的 newdata。如果您通过指定 newdata=test[,-1] 删除该列,预测就会按预期工作。