R 中用于文本分类的 SVM
SVM for text classification in R
我正在使用 SVM 对我的文本进行分类,但我实际上并没有得到结果,而是通过数值概率得到的。
Dataframe(1:20 训练集,21:50 测试集)
更新:
ou <- structure(list(text = structure(c(1L, 6L, 1L, 1L, 8L, 13L, 24L,
5L, 11L, 12L, 33L, 36L, 20L, 25L, 4L, 19L, 9L, 29L, 22L, 3L,
8L, 8L, 8L, 2L, 8L, 27L, 30L, 3L, 14L, 35L, 3L, 34L, 23L, 31L,
22L, 6L, 6L, 7L, 17L, 3L, 8L, 32L, 18L, 15L, 21L, 26L, 3L, 16L,
10L, 28L), .Label = c("access, access, access, access", "character(0)",
"report", "report, access", "report, access, access", "report, access, access, access",
"report, access, access, access, access, access, access", "report, access, access, access, access, access, access, access",
"report, access, access, access, access, access, access, report",
"report, access, access, access, access, access, report", "report, access, access, access, report",
"report, access, access, access, report, access", "report, access, access, report, access, access, access, access, access, access",
"report, data", "report, data, data", "report, data, data, data",
"report, data, data, data, data", "report, data, data, data, data, data",
"report, data, data, data, report, report, data, access,access",
"report, data, data, report", "report, data, report", "report, report",
"report, report, access, access, access", "report, report, access, access, report, report, report, report, report, report, data, data, report, access, report, report",
"report, report, access, report, report, report, report, report, data, data, report, access, report, report",
"report, report, access, report, report, report, report, report, report, data, data, report, access, report, report",
"report, report, data", "report, report, data, report", "report, report, report, data, report, report, data, data, report, data, data",
"report, report, report, report", "report, report, report, report, data, report, report, data, report, data, report",
"report, report, report, report, report, data, report, data, data",
"report, report, report, report, report, report, report", "report, report, report, report, report, report, report, access, access, access",
"report, report, report, report, report, report, report, report, data, data, report, access, report, report",
"report, report, report, report, report, report, report, report, report, report, data, report, report, report, report, report, report, report,report"
), class = "factor"), value = structure(c(2L, 2L, 2L, 2L, 2L,
2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 3L, 3L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"Access", "Report/Data"), class = "factor")), .Names = c("text",
"value"), class = "data.frame", row.names = c(NA, -50L))
使用代码:
library(RTextTools)
doc_matrix <- create_matrix(ou$text, language="english", removeNumbers=TRUE, stemWords=TRUE, removeSparseTerms=.998)
#container <- create_container(doc_matrix, ou$text, trainSize=1:20, testSize=21:50, virgin=FALSE)
container <- create_container(doc_matrix, as.numeric(factor(ou$text)), trainSize=1:20, testSize=21:50, virgin=FALSE)
#Training models
SVM <- train_model(container,"SVM")
MAXENT <- train_model(container,"MAXENT")
BAGGING <- train_model(container,"BAGGING")
TREE <- train_model(container,"TREE")
#Classify data using trained models
SVM_CLASSIFY <- classify_model(container, SVM)
MAXENT_CLASSIFY <- classify_model(container, MAXENT)
BAGGING_CLASSIFY <- classify_model(container, BAGGING)
#Analytics
analytics <- create_analytics(container,SVM_CLASSIFY)
models <- train_models(container, algorithms=c("MAXENT","SVM"))
results <- classify_models(container, models)
analytics <- create_analytics(container, results)
summary(analytics)
SVM <- cross_validate(container, 5, "SVM")
write.csv(analytics@document_summary, "DocumentSummary.csv")
预期结果:
text value
21 report, access, access, access, access, access, access, access Access
22 report, access, access, access, access, access, access, access Access
23 report, access, access, access, access, access, access, access Access
24 character(0) NA
25 report, access, access, access, access, access, access, access Access
26 report, report, data Report/Data
27 report, report, report, report Report/Data
28 report Report/Data
29 report, data Report/Data
30 report, report, report, report, report, report, report, report,
data, data, report, access, report, report Report/Data
结果概率为:
> MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL SVM_PROB MANUAL_CODE CONSENSUS_CODE CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE PROBABILITY_INCORRECT
> 1 8 0.999999066 22 0.070090645 8 8 1 0 8 0
> 2 8 0.999999066 22 0.070090645 8 8 1 0 8 0
> 3 8 0.999999066 22 0.070090645 8 8 1 0 8 0
> 4 1 0.055555556 12 0.071384112 2 12 1 1 12 1
> 5 8 0.999999066 22 0.070090645 8 8 1 0 8 0
> 6 25 1 12 0.074126949 27 25 1 1 25 1
> 7 33 0.627904676 13 0.068572857 30 33 1 1 33 1
> 8 33 0.406792176 12 0.074592181 3 33 1 1 33 1
> 9 20 1 12 0.074507793 14 20 1 1 20 1
编辑 1:
我怎样才能实现 标签名称 而不是 SVM 标签编号。
我平时做的是
ou <- cbind(ou$text, results)
并打印标签:
ou$value <- "NONE"
ou$value[results$SVM_LABEL=="1"] <- "Access"
ou$value[results$SVM_LABEL=="-1"] <- "Report/Data"
ou
(假设你在训练模型时使用了1和-1)
我知道它有点原始,但它很清晰并且工作正常
我正在使用 SVM 对我的文本进行分类,但我实际上并没有得到结果,而是通过数值概率得到的。
Dataframe(1:20 训练集,21:50 测试集)
更新:
ou <- structure(list(text = structure(c(1L, 6L, 1L, 1L, 8L, 13L, 24L,
5L, 11L, 12L, 33L, 36L, 20L, 25L, 4L, 19L, 9L, 29L, 22L, 3L,
8L, 8L, 8L, 2L, 8L, 27L, 30L, 3L, 14L, 35L, 3L, 34L, 23L, 31L,
22L, 6L, 6L, 7L, 17L, 3L, 8L, 32L, 18L, 15L, 21L, 26L, 3L, 16L,
10L, 28L), .Label = c("access, access, access, access", "character(0)",
"report", "report, access", "report, access, access", "report, access, access, access",
"report, access, access, access, access, access, access", "report, access, access, access, access, access, access, access",
"report, access, access, access, access, access, access, report",
"report, access, access, access, access, access, report", "report, access, access, access, report",
"report, access, access, access, report, access", "report, access, access, report, access, access, access, access, access, access",
"report, data", "report, data, data", "report, data, data, data",
"report, data, data, data, data", "report, data, data, data, data, data",
"report, data, data, data, report, report, data, access,access",
"report, data, data, report", "report, data, report", "report, report",
"report, report, access, access, access", "report, report, access, access, report, report, report, report, report, report, data, data, report, access, report, report",
"report, report, access, report, report, report, report, report, data, data, report, access, report, report",
"report, report, access, report, report, report, report, report, report, data, data, report, access, report, report",
"report, report, data", "report, report, data, report", "report, report, report, data, report, report, data, data, report, data, data",
"report, report, report, report", "report, report, report, report, data, report, report, data, report, data, report",
"report, report, report, report, report, data, report, data, data",
"report, report, report, report, report, report, report", "report, report, report, report, report, report, report, access, access, access",
"report, report, report, report, report, report, report, report, data, data, report, access, report, report",
"report, report, report, report, report, report, report, report, report, report, data, report, report, report, report, report, report, report,report"
), class = "factor"), value = structure(c(2L, 2L, 2L, 2L, 2L,
2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 3L, 3L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"Access", "Report/Data"), class = "factor")), .Names = c("text",
"value"), class = "data.frame", row.names = c(NA, -50L))
使用代码:
library(RTextTools)
doc_matrix <- create_matrix(ou$text, language="english", removeNumbers=TRUE, stemWords=TRUE, removeSparseTerms=.998)
#container <- create_container(doc_matrix, ou$text, trainSize=1:20, testSize=21:50, virgin=FALSE)
container <- create_container(doc_matrix, as.numeric(factor(ou$text)), trainSize=1:20, testSize=21:50, virgin=FALSE)
#Training models
SVM <- train_model(container,"SVM")
MAXENT <- train_model(container,"MAXENT")
BAGGING <- train_model(container,"BAGGING")
TREE <- train_model(container,"TREE")
#Classify data using trained models
SVM_CLASSIFY <- classify_model(container, SVM)
MAXENT_CLASSIFY <- classify_model(container, MAXENT)
BAGGING_CLASSIFY <- classify_model(container, BAGGING)
#Analytics
analytics <- create_analytics(container,SVM_CLASSIFY)
models <- train_models(container, algorithms=c("MAXENT","SVM"))
results <- classify_models(container, models)
analytics <- create_analytics(container, results)
summary(analytics)
SVM <- cross_validate(container, 5, "SVM")
write.csv(analytics@document_summary, "DocumentSummary.csv")
预期结果:
text value
21 report, access, access, access, access, access, access, access Access
22 report, access, access, access, access, access, access, access Access
23 report, access, access, access, access, access, access, access Access
24 character(0) NA
25 report, access, access, access, access, access, access, access Access
26 report, report, data Report/Data
27 report, report, report, report Report/Data
28 report Report/Data
29 report, data Report/Data
30 report, report, report, report, report, report, report, report,
data, data, report, access, report, report Report/Data
结果概率为:
> MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL SVM_PROB MANUAL_CODE CONSENSUS_CODE CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE PROBABILITY_INCORRECT
> 1 8 0.999999066 22 0.070090645 8 8 1 0 8 0
> 2 8 0.999999066 22 0.070090645 8 8 1 0 8 0
> 3 8 0.999999066 22 0.070090645 8 8 1 0 8 0
> 4 1 0.055555556 12 0.071384112 2 12 1 1 12 1
> 5 8 0.999999066 22 0.070090645 8 8 1 0 8 0
> 6 25 1 12 0.074126949 27 25 1 1 25 1
> 7 33 0.627904676 13 0.068572857 30 33 1 1 33 1
> 8 33 0.406792176 12 0.074592181 3 33 1 1 33 1
> 9 20 1 12 0.074507793 14 20 1 1 20 1
编辑 1: 我怎样才能实现 标签名称 而不是 SVM 标签编号。
我平时做的是
ou <- cbind(ou$text, results)
并打印标签:
ou$value <- "NONE"
ou$value[results$SVM_LABEL=="1"] <- "Access"
ou$value[results$SVM_LABEL=="-1"] <- "Report/Data"
ou
(假设你在训练模型时使用了1和-1)
我知道它有点原始,但它很清晰并且工作正常