使用 scikit-learn 区分相似类别
use scikit-learn to distinguish between similar categories
我想将文档中的文本分类为不同的类别。每份文件只能属于以下类别之一:PR、AR、KID、SAR。
我找到了一个使用 scikit-learn 的示例并且我可以使用它:
import numpy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from pandas import DataFrame
def build_data_frame(path, classification):
    """Read one UTF-8 text file and wrap it in a single-row DataFrame.

    Args:
        path: Path of the text file to read; also used as the row index,
            so each training document gets a unique index entry.
        classification: Category label stored in the 'class' column.

    Returns:
        A one-row DataFrame with columns 'text' and 'class', indexed by path.
    """
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(path, mode='r', encoding='utf8') as f:
        txt = f.read()
    return DataFrame({'text': [txt], 'class': [classification]}, index=[path])
# Category labels; each document belongs to exactly one.
PR = 'PR'
AR = 'AR'
KID = 'KID'
SAR = 'SAR'

# Training documents: (path, label) pairs.
SOURCES = [
    (r'C:/temp_training/PR/PR1.txt', PR),
    (r'C:/temp_training/PR/PR2.txt', PR),
    (r'C:/temp_training/PR/PR3.txt', PR),
    (r'C:/temp_training/PR/PR4.txt', PR),
    (r'C:/temp_training/PR/PR5.txt', PR),
    (r'C:/temp_training/AR/AR1.txt', AR),
    (r'C:/temp_training/AR/AR2.txt', AR),
    (r'C:/temp_training/AR/AR3.txt', AR),
    (r'C:/temp_training/AR/AR4.txt', AR),
    (r'C:/temp_training/AR/AR5.txt', AR),
    (r'C:/temp_training/KID/KID1.txt', KID),
    (r'C:/temp_training/KID/KID2.txt', KID),
    (r'C:/temp_training/KID/KID3.txt', KID),
    (r'C:/temp_training/KID/KID4.txt', KID),
    (r'C:/temp_training/KID/KID5.txt', KID),
    (r'C:/temp_training/SAR/SAR1.txt', SAR),
    (r'C:/temp_training/SAR/SAR2.txt', SAR),
    (r'C:/temp_training/SAR/SAR3.txt', SAR),
    (r'C:/temp_training/SAR/SAR4.txt', SAR),
    (r'C:/temp_training/SAR/SAR5.txt', SAR)
]

# Unlabeled documents to classify.
TESTS = [
    (r'C:/temp_testing/PR/PR1.txt'),
    (r'C:/temp_testing/PR/PR2.txt'),
    (r'C:/temp_testing/PR/PR3.txt'),
    (r'C:/temp_testing/PR/PR4.txt'),
    (r'C:/temp_testing/PR/PR5.txt'),
    (r'C:/temp_testing/AR/AR1.txt'),
    (r'C:/temp_testing/AR/AR2.txt'),
    (r'C:/temp_testing/AR/AR3.txt'),
    (r'C:/temp_testing/AR/AR4.txt'),
    (r'C:/temp_testing/AR/AR5.txt'),
    (r'C:/temp_testing/KID/KID1.txt'),
    (r'C:/temp_testing/KID/KID2.txt'),
    (r'C:/temp_testing/KID/KID3.txt'),
    (r'C:/temp_testing/KID/KID4.txt'),
    (r'C:/temp_testing/KID/KID5.txt'),
    (r'C:/temp_testing/SAR/SAR1.txt'),
    (r'C:/temp_testing/SAR/SAR2.txt'),
    (r'C:/temp_testing/SAR/SAR3.txt'),
    (r'C:/temp_testing/SAR/SAR4.txt'),
    (r'C:/temp_testing/SAR/SAR5.txt')
]

# DataFrame.append was removed in pandas 2.0; build one frame per file and
# concatenate them in a single pass instead of appending in a loop.
from pandas import concat  # local import so this edit is self-contained
data_train = concat(
    [build_data_frame(path, classification) for path, classification in SOURCES])
# Shuffle the rows so training order does not encode the category.
data_train = data_train.reindex(numpy.random.permutation(data_train.index))

# Read the test documents; 'with' closes each handle (the original leaked them).
examples = []
for path in TESTS:
    with open(path, mode='r', encoding='utf8') as f:
        examples.append(f.read())

target_names = [PR, AR, KID, SAR]

# Pipeline: bag-of-words (uni- and bi-grams) -> TF-IDF -> one-vs-rest linear SVM.
classifier = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2), analyzer='word',
                                   strip_accents='unicode', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])
classifier.fit(data_train['text'], data_train['class'])
predicted = classifier.predict(examples)
print(predicted)
输出:
['PR' 'PR' 'PR' 'PR' 'PR' 'AR' 'AR' 'AR' 'AR' 'AR' 'KID' 'KID' 'KID' 'KID'
'KID' 'AR' 'AR' 'AR' 'SAR' 'AR']
PR、AR、KID完美识别
但是,SAR 文档(最后 5 个)除了其中一个之外没有被正确分类。 SAR 和 AR 非常相似,这可以解释为什么算法会混淆。
我尝试使用 n-gram 值,但 1(最小)和 2(最大)似乎给出了最好的结果。
知道如何提高区分 AR 和 SAR 类别的精度吗?
有没有办法显示特定文档的识别百分比?即 PR (70%),这意味着该算法对预测有 70% 的信心
如果您需要文档,这里是数据集:http://1drv.ms/21dnL6j
这不是严格意义上的编程问题,因此我建议您尝试将其发布到更与数据科学相关的堆栈中。
无论如何你可以尝试一些事情:
- 使用其他分类器。
- 使用网格搜索调整分类器超参数。
- 使用 OneVsOne 而不是 OneVsAll 作为策略。这可能会帮助您区分 SAR 和 AR。
- 对于 "display the percentage of recognition for a particular document",您可以使用来自某些模型的概率输出,即调用
classifier.predict_proba
而不是 classifier.predict。
注意:LinearSVC 本身不提供 predict_proba;您需要改用 SVC(probability=True),或者用 CalibratedClassifierCV 包装 LinearSVC 来获得校准后的概率(也可以用 decision_function 查看未校准的置信度分数)。
祝你好运!
我想将文档中的文本分类为不同的类别。每份文件只能属于以下类别之一:PR、AR、KID、SAR。
我找到了一个使用 scikit-learn 的示例并且我可以使用它:
import numpy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from pandas import DataFrame
def build_data_frame(path, classification):
    """Read one UTF-8 text file and wrap it in a single-row DataFrame.

    Args:
        path: Path of the text file to read; also used as the row index,
            so each training document gets a unique index entry.
        classification: Category label stored in the 'class' column.

    Returns:
        A one-row DataFrame with columns 'text' and 'class', indexed by path.
    """
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(path, mode='r', encoding='utf8') as f:
        txt = f.read()
    return DataFrame({'text': [txt], 'class': [classification]}, index=[path])
# Category labels; each document belongs to exactly one.
PR = 'PR'
AR = 'AR'
KID = 'KID'
SAR = 'SAR'

# Training documents: (path, label) pairs.
SOURCES = [
    (r'C:/temp_training/PR/PR1.txt', PR),
    (r'C:/temp_training/PR/PR2.txt', PR),
    (r'C:/temp_training/PR/PR3.txt', PR),
    (r'C:/temp_training/PR/PR4.txt', PR),
    (r'C:/temp_training/PR/PR5.txt', PR),
    (r'C:/temp_training/AR/AR1.txt', AR),
    (r'C:/temp_training/AR/AR2.txt', AR),
    (r'C:/temp_training/AR/AR3.txt', AR),
    (r'C:/temp_training/AR/AR4.txt', AR),
    (r'C:/temp_training/AR/AR5.txt', AR),
    (r'C:/temp_training/KID/KID1.txt', KID),
    (r'C:/temp_training/KID/KID2.txt', KID),
    (r'C:/temp_training/KID/KID3.txt', KID),
    (r'C:/temp_training/KID/KID4.txt', KID),
    (r'C:/temp_training/KID/KID5.txt', KID),
    (r'C:/temp_training/SAR/SAR1.txt', SAR),
    (r'C:/temp_training/SAR/SAR2.txt', SAR),
    (r'C:/temp_training/SAR/SAR3.txt', SAR),
    (r'C:/temp_training/SAR/SAR4.txt', SAR),
    (r'C:/temp_training/SAR/SAR5.txt', SAR)
]

# Unlabeled documents to classify.
TESTS = [
    (r'C:/temp_testing/PR/PR1.txt'),
    (r'C:/temp_testing/PR/PR2.txt'),
    (r'C:/temp_testing/PR/PR3.txt'),
    (r'C:/temp_testing/PR/PR4.txt'),
    (r'C:/temp_testing/PR/PR5.txt'),
    (r'C:/temp_testing/AR/AR1.txt'),
    (r'C:/temp_testing/AR/AR2.txt'),
    (r'C:/temp_testing/AR/AR3.txt'),
    (r'C:/temp_testing/AR/AR4.txt'),
    (r'C:/temp_testing/AR/AR5.txt'),
    (r'C:/temp_testing/KID/KID1.txt'),
    (r'C:/temp_testing/KID/KID2.txt'),
    (r'C:/temp_testing/KID/KID3.txt'),
    (r'C:/temp_testing/KID/KID4.txt'),
    (r'C:/temp_testing/KID/KID5.txt'),
    (r'C:/temp_testing/SAR/SAR1.txt'),
    (r'C:/temp_testing/SAR/SAR2.txt'),
    (r'C:/temp_testing/SAR/SAR3.txt'),
    (r'C:/temp_testing/SAR/SAR4.txt'),
    (r'C:/temp_testing/SAR/SAR5.txt')
]

# DataFrame.append was removed in pandas 2.0; build one frame per file and
# concatenate them in a single pass instead of appending in a loop.
from pandas import concat  # local import so this edit is self-contained
data_train = concat(
    [build_data_frame(path, classification) for path, classification in SOURCES])
# Shuffle the rows so training order does not encode the category.
data_train = data_train.reindex(numpy.random.permutation(data_train.index))

# Read the test documents; 'with' closes each handle (the original leaked them).
examples = []
for path in TESTS:
    with open(path, mode='r', encoding='utf8') as f:
        examples.append(f.read())

target_names = [PR, AR, KID, SAR]

# Pipeline: bag-of-words (uni- and bi-grams) -> TF-IDF -> one-vs-rest linear SVM.
classifier = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2), analyzer='word',
                                   strip_accents='unicode', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])
classifier.fit(data_train['text'], data_train['class'])
predicted = classifier.predict(examples)
print(predicted)
输出:
['PR' 'PR' 'PR' 'PR' 'PR' 'AR' 'AR' 'AR' 'AR' 'AR' 'KID' 'KID' 'KID' 'KID'
'KID' 'AR' 'AR' 'AR' 'SAR' 'AR']
PR、AR、KID完美识别
但是,SAR 文档(最后 5 个)除了其中一个之外没有被正确分类。 SAR 和 AR 非常相似,这可以解释为什么算法会混淆。
我尝试使用 n-gram 值,但 1(最小)和 2(最大)似乎给出了最好的结果。
知道如何提高区分 AR 和 SAR 类别的精度吗?
有没有办法显示特定文档的识别百分比?即 PR (70%),这意味着该算法对预测有 70% 的信心
如果您需要文档,这里是数据集:http://1drv.ms/21dnL6j
这不是严格意义上的编程问题,因此我建议您尝试将其发布到更与数据科学相关的堆栈中。
无论如何你可以尝试一些事情:
- 使用其他分类器。
- 使用网格搜索调整分类器超参数。
- 使用 OneVsOne 而不是 OneVsAll 作为策略。这可能会帮助您区分 SAR 和 AR。
- 对于 "display the percentage of recognition for a particular document",您可以使用来自某些模型的概率输出,即调用
classifier.predict_proba
而不是 classifier.predict。
注意:LinearSVC 本身不提供 predict_proba;您需要改用 SVC(probability=True),或者用 CalibratedClassifierCV 包装 LinearSVC 来获得校准后的概率(也可以用 decision_function 查看未校准的置信度分数)。
祝你好运!