使用 sklearn 向量化文件
Vectorizing Files using sklearn
我正在尝试读取 100 个训练文件,并使用 sklearn 对它们进行向量化。这些文件的内容是代表系统调用的单词。向量化之后,我想把得到的向量打印出来。
我的第一次尝试如下:
import re
import os
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
import numpy.linalg as LA
# Directory that holds the training files. Written as a raw string so the
# backslashes of the Windows path can never be interpreted as escape sequences.
trainingdataDir = r'C:\data\Training data'
def readfile():
    """Read the files in ``trainingdataDir`` and return the content of one.

    Every regular file in the directory is read in binary mode, but each
    read overwrites the result of the previous one, so only the content of
    the last file listed is returned.
    """
    for entry in os.listdir(trainingdataDir):
        candidate = os.path.join(trainingdataDir, entry)
        if not os.path.isfile(candidate):
            # Skip anything that is not a regular file (e.g. sub-directories).
            continue
        data = open(candidate, "rb").read()
    return data
# Create the estimators first; they do not depend on the data being read.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
# readfile() yields a single value, so the corpus holds exactly one document.
train_set = [readfile()]
trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
# Single-argument print: same output as `print a, b`, valid with or without
# the print statement.
print("%s %s" % ('Fit Vectorizer to train set', trainVectorizerArray))
但是,这样只得到了最后一个文件的向量。
我的结论是 print 函数应该放在 for 循环中。所以第二次尝试:
def readfile():
    """Vectorize and print every regular file in ``trainingdataDir``.

    Each file is read in binary mode and its content is passed directly to
    ``vectorizer.fit_transform`` (the content itself, not a list of
    documents). The function has no return statement.
    """
    # NOTE(review): the snippet as shown never calls readfile(); if that is
    # the whole script, nothing below would ever run or print - confirm.
    for entry in os.listdir(trainingdataDir):
        candidate = os.path.join(trainingdataDir, entry)
        if not os.path.isfile(candidate):
            continue
        data = open(candidate, "rb").read()
        trainVectorizerArray = vectorizer.fit_transform(data).toarray()
        print("%s %s" % ('Fit Vectorizer to train set', trainVectorizerArray))
然而,这次什么也没有输出。
你能帮我解决这个问题吗?为什么我看不到正在打印的矢量?
问题在于用于向量化的数据集列表是空的。我成功地对一组 100 个文件进行了向量化:先打开文件,再读取每个文件的内容,最后把它们添加到一个列表中。然后将这个数据集列表交给 `tfidf_vectorizer` 使用:
import re
import os
import sys
import numpy as np
import numpy.linalg as LA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# TfidfVectorizer is instantiated below, but the import lines of this snippet
# only bring in CountVectorizer and TfidfTransformer, so import it here.
from sklearn.feature_extraction.text import TfidfVectorizer

# Directory that holds the training files. Written as a raw string so the
# backslashes of the Windows path can never be interpreted as escape sequences.
trainingdataDir = r'C:\data\Training data'
# Vectorizer that is later fitted on the list of file contents.
tfidf_vectorizer = TfidfVectorizer()
# Not referenced by the rest of the visible snippet; kept so the name stays defined.
transformer = TfidfTransformer()
def readfile(trainingdataDir):
    """Return the text of every regular file found in *trainingdataDir*.

    Args:
        trainingdataDir: Path of the directory that holds the training files.

    Returns:
        A list with one unicode string per regular file, ordered by file
        name. Sub-directories are skipped; a directory without regular
        files yields an empty list.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Local import: io.open decodes while reading on both Python 2 and 3,
    # replacing the Python-2-only ``str.decode`` call.
    import io

    train_set = []
    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order of the resulting matrix reproducible between runs.
    for name in sorted(os.listdir(trainingdataDir)):
        path = os.path.join(trainingdataDir, name)
        if not os.path.isfile(path):
            continue
        # The context manager closes each handle as soon as it has been read.
        with io.open(path, 'r', encoding='utf-8') as handle:
            train_set.append(handle.read())
    return train_set
# cosine_similarity is called below but is not imported by the import lines
# of this snippet, so import it here.
from sklearn.metrics.pairwise import cosine_similarity

# Fit the vectorizer on the file contents; one matrix row per document.
tfidf_matrix_train = tfidf_vectorizer.fit_transform(readfile(trainingdataDir))
# Single-argument print: same output as `print a, b`, valid with or without
# the print statement.
print("%s %s" % ('Fit Vectorizer to train set', tfidf_matrix_train))
# Compare the first document (row 0) with every document, itself included.
print("%s %s" % (
    "cosine scores ==> ",
    cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train),
))
我正在尝试读取 100 个训练文件,并使用 sklearn 对它们进行向量化。这些文件的内容是代表系统调用的单词。向量化之后,我想把得到的向量打印出来。我的第一次尝试如下:
import re
import os
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
import numpy.linalg as LA
# Directory that holds the training files. Written as a raw string so the
# backslashes of the Windows path can never be interpreted as escape sequences.
trainingdataDir = r'C:\data\Training data'
def readfile():
    """Read the files in ``trainingdataDir`` and return the content of one.

    Every regular file in the directory is read in binary mode, but each
    read overwrites the result of the previous one, so only the content of
    the last file listed is returned.
    """
    for entry in os.listdir(trainingdataDir):
        candidate = os.path.join(trainingdataDir, entry)
        if not os.path.isfile(candidate):
            # Skip anything that is not a regular file (e.g. sub-directories).
            continue
        data = open(candidate, "rb").read()
    return data
# Create the estimators first; they do not depend on the data being read.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
# readfile() yields a single value, so the corpus holds exactly one document.
train_set = [readfile()]
trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
# Single-argument print: same output as `print a, b`, valid with or without
# the print statement.
print("%s %s" % ('Fit Vectorizer to train set', trainVectorizerArray))
但是,这样只得到了最后一个文件的向量。我的结论是 print 语句应该放在 for 循环中。所以第二次尝试:
def readfile():
    """Vectorize and print every regular file in ``trainingdataDir``.

    Each file is read in binary mode and its content is passed directly to
    ``vectorizer.fit_transform`` (the content itself, not a list of
    documents). The function has no return statement.
    """
    # NOTE(review): the snippet as shown never calls readfile(); if that is
    # the whole script, nothing below would ever run or print - confirm.
    for entry in os.listdir(trainingdataDir):
        candidate = os.path.join(trainingdataDir, entry)
        if not os.path.isfile(candidate):
            continue
        data = open(candidate, "rb").read()
        trainVectorizerArray = vectorizer.fit_transform(data).toarray()
        print("%s %s" % ('Fit Vectorizer to train set', trainVectorizerArray))
然而,这次什么也没有输出。你能帮我解决这个问题吗?为什么我看不到打印出来的向量?
问题在于用于向量化的数据集列表是空的。我成功地对一组 100 个文件进行了向量化:先打开文件,再读取每个文件的内容,最后把它们添加到一个列表中。然后将这个数据集列表交给 `tfidf_vectorizer` 使用:
import re
import os
import sys
import numpy as np
import numpy.linalg as LA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# TfidfVectorizer is instantiated below, but the import lines of this snippet
# only bring in CountVectorizer and TfidfTransformer, so import it here.
from sklearn.feature_extraction.text import TfidfVectorizer

# Directory that holds the training files. Written as a raw string so the
# backslashes of the Windows path can never be interpreted as escape sequences.
trainingdataDir = r'C:\data\Training data'
# Vectorizer that is later fitted on the list of file contents.
tfidf_vectorizer = TfidfVectorizer()
# Not referenced by the rest of the visible snippet; kept so the name stays defined.
transformer = TfidfTransformer()
def readfile(trainingdataDir):
    """Return the text of every regular file found in *trainingdataDir*.

    Args:
        trainingdataDir: Path of the directory that holds the training files.

    Returns:
        A list with one unicode string per regular file, ordered by file
        name. Sub-directories are skipped; a directory without regular
        files yields an empty list.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Local import: io.open decodes while reading on both Python 2 and 3,
    # replacing the Python-2-only ``str.decode`` call.
    import io

    train_set = []
    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order of the resulting matrix reproducible between runs.
    for name in sorted(os.listdir(trainingdataDir)):
        path = os.path.join(trainingdataDir, name)
        if not os.path.isfile(path):
            continue
        # The context manager closes each handle as soon as it has been read.
        with io.open(path, 'r', encoding='utf-8') as handle:
            train_set.append(handle.read())
    return train_set
# cosine_similarity is called below but is not imported by the import lines
# of this snippet, so import it here.
from sklearn.metrics.pairwise import cosine_similarity

# Fit the vectorizer on the file contents; one matrix row per document.
tfidf_matrix_train = tfidf_vectorizer.fit_transform(readfile(trainingdataDir))
# Single-argument print: same output as `print a, b`, valid with or without
# the print statement.
print("%s %s" % ('Fit Vectorizer to train set', tfidf_matrix_train))
# Compare the first document (row 0) with every document, itself included.
print("%s %s" % (
    "cosine scores ==> ",
    cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train),
))