My function to investigate the impact of sample size on text classifier performance is not working correctly

I defined the following function, which returns the AUC and PRC scores for the training and test data sets. You can find the data sets at the following links: training set https://drive.google.com/file/d/1466SDm1nOpeDb_3UnW8Qjc1VEY_Be0R5/view?usp=sharing and test set https://drive.google.com/file/d/1vphjb3xbrklhLHNMYUexN6X_axepm0Xy/view?usp=sharing

Both data sets contain samples in the following format: the text column holds the documents, and the label column gives the sentiment of each document.

label  text
1      I must admit I am addicted to "Version 2.0...
0      I think this is a huge shame...
1      The Sunsout No Room at The Inn Puzzle has odd...
...    ...
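
For reference, the two files can be loaded and checked like this (a small sketch using the same file names as in the code below; the value_counts() call is only there to confirm the labels are binary):

import pandas as pd

# same file names as in the code below
train = pd.read_csv("train5-1.csv")
test = pd.read_csv("test5.csv")

# sanity check of the format: a 'text' column with the document and a
# binary 'label' column (0 or 1) with its sentiment
print(train[["label", "text"]].head(3))
print(train["label"].value_counts())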

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc,precision_recall_curve
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

train = pd.read_csv("train5-1.csv")
test = pd.read_csv("test5.csv")

def create_model(train_docs, train_y, test_docs, test_y, \
              model_type='svm', stop_words=None, min_df=1, print_result = True, algorithm_para=1.0):
  
  tfidf_vect = TfidfVectorizer(stop_words=stop_words,min_df=min_df)
  tfidf_vect.fit_transform(train["text"])
  y_test=test['label'].values
  y_train=train["label"].values
  X_train=tfidf_vect.fit_transform(train['text'].values)
  X_test=tfidf_vect.transform(test['text'].values)

  if 'svm' in model_type:
      clf = svm.SVC(kernel='linear',probability=True)
      clf=svm.LinearSVC(C=algorithm_para).fit(X_train, y_train)
      predicted=clf.predict(X_test)
      
      labels=sorted(train['label'].unique())
      precision, recall, fscore, support=\
          precision_recall_fscore_support(\
          y_test, predicted, labels=labels)
      
      
      if print_result==True:
 
        print("labels: ", labels)
        print("precision: ", precision)
        print("recall: ", recall)
        print("f-score: ", fscore)
        print("support: ", support)
        
      predict_p=clf._predict_proba_lr(X_test)
      labels
      predict_p[0:3]
      
      y_test[0:3]
      y_pred = predict_p[:,1]
      
      fpr, tpr, thresholds = roc_curve(y_test,y_pred, pos_label=1)
      precision, recall, thresholds = precision_recall_curve(y_test, y_pred, pos_label=1)
      
      auc_score= auc(fpr, tpr)
      prc_score=auc(recall, precision)


      if print_result==True:
        print("AUC: {:.2%}".format(auc_score), "PRC: {:.2%}".format(prc_score))
        plt.figure();
        plt.plot(fpr, tpr, color='darkorange', lw=2);
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--');
        plt.xlim([0.0, 1.0]);
        plt.ylim([0.0, 1.05]);
        plt.xlabel('False Positive Rate');
        plt.ylabel('True Positive Rate');
        plt.title('AUC of SVM Model');
        plt.show()
        
        plt.figure();
        plt.plot(recall, precision, color='darkorange', lw=2);
        plt.xlim([0.0, 1.0]);
        plt.ylim([0.0, 1.05]);
        plt.xlabel('Recall');
        plt.ylabel('Precision');
        plt.title('Precision_Recall_Curve of SVM Model');
        plt.show();

  else:
    clf=MultinomialNB(alpha=algorithm_para).fit(X_train, y_train)
    predicted=clf.predict(X_test)
    
    labels=sorted(train['label'].unique())
    precision, recall, fscore, support=\
          precision_recall_fscore_support(\
          y_test, predicted, labels=labels)  
      
    if print_result==True:

          
        print("labels: ", labels)
        print("precision: ", precision)
        print("recall: ", recall)
        print("f-score: ", fscore)
        print("support: ", support)
        
    predict_p=clf.predict_proba(X_test)
    labels
    predict_p[0:3]
      
    y_test[0:3]
    y_pred = predict_p[:,1]
      
    fpr, tpr, thresholds = roc_curve(y_test,y_pred, pos_label=1)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred, pos_label=1)
      
    auc_score= auc(fpr, tpr)
    prc_score=auc(recall, precision)


    if print_result==True:
        print("AUC: {:.2%}".format(auc_score), "PRC: {:.2%}".format(prc_score))
        plt.figure();
        plt.plot(fpr, tpr, color='darkorange', lw=2);
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--');
        plt.xlim([0.0, 1.0]);
        plt.ylim([0.0, 1.05]);
        plt.xlabel('False Positive Rate');
        plt.ylabel('True Positive Rate');
        plt.title('AUC of Naive Bayes Model');
        plt.show()
        
        plt.figure();
        plt.plot(recall, precision, color='darkorange', lw=2);
        plt.xlim([0.0, 1.0]);
        plt.ylim([0.0, 1.05]);
        plt.xlabel('Recall');
        plt.ylabel('Precision');
        plt.title('Precision_Recall_Curve of Naive Bayes Model');
        plt.show();

  return auc_score, prc_score
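
A single call on the full data can be sketched as follows (assuming the train and test data frames loaded above; the parameter values are the same ones I use further down):

# one SVM run on the full training set; returns the two scores
auc_score, prc_score = create_model(train, train["label"], test, test["label"],
                                    model_type='svm', stop_words='english',
                                    min_df=1, print_result=True, algorithm_para=1.0)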

Then, to investigate the impact of sample size on the performance of the classifier above, I defined another function as follows:

def sample_size_impact(train_docs, train_y, test_docs, test_y):

  auc_list_svm=[]
  
  t_size = np.linspace(500,12000, 24)

  for i in range (int(len(train_docs)/500)):
    auc_score_svm= create_model(train_docs[:(i+1)*500], train_y[:(i+1)*500], test_docs, test_y, \
          model_type='svm', stop_words = 'english', min_df = 1, print_result=False, algorithm_para=1.0)
    auc_list_svm.append(auc_score_svm)

  plt.figure();
  plt.plot(auc_list_svm, color='darkorange');
  plt.xlabel('Sample Size');
  plt.ylabel('AUC');
  plt.title('sample size impact comparison');
  plt.show()

But the sample_size_impact function does not work correctly. Could you look over my code and tell me where I went wrong?

The error is in create_model: you always use the global (full) data frames train and test instead of the parameters train_docs and test_docs. It should be:

  tfidf_vect.fit_transform(train_docs["text"])
  y_test=test_docs['label'].values
  y_train=train_docs["label"].values
  X_train=tfidf_vect.fit_transform(train_docs['text'].values)
  X_test=tfidf_vect.transform(test_docs['text'].values)
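
With that change, create_model really trains on the slice it is given. As a side note, create_model returns a tuple (auc_score, prc_score), while sample_size_impact appends the whole tuple and plots it against the list index. A minimal sketch of how the loop can be written so that only the AUC is kept and the x-axis shows the actual sample size (using the same parameters and imports you already have):

def sample_size_impact(train_docs, train_y, test_docs, test_y):
  sizes = [(i + 1) * 500 for i in range(len(train_docs) // 500)]
  auc_list_svm = []

  for n in sizes:
    # create_model returns (auc_score, prc_score); keep only the AUC
    auc_score, _ = create_model(train_docs[:n], train_y[:n], test_docs, test_y,
                                model_type='svm', stop_words='english',
                                min_df=1, print_result=False, algorithm_para=1.0)
    auc_list_svm.append(auc_score)

  plt.figure()
  plt.plot(sizes, auc_list_svm, color='darkorange')
  plt.xlabel('Sample Size')
  plt.ylabel('AUC')
  plt.title('sample size impact comparison')
  plt.show()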