Stop Word Removal with NLTK
I have been working with NLTK and data classification. I am having trouble removing stop words. When I print the stop word list, every word is prefixed with "u'". For example:
[u'all', u'just', u'being', u'over', u'both', u'through']
I am not sure whether this is normal or part of the problem.
When I print (1_feats) I get a list of words, and some of them are stop words listed in the corpus.
import os
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
stopset = list(set(stopwords.words('english')))
morewords = 'delivery', 'shipment', 'only', 'copy', 'attach', 'material'
stopset.append(morewords)
def word_feats(words):
    return dict([(word, True) for word in words.split() if word not in stopset])
ids_1 = {}
ids_2 = {}
ids_3 = {}
ids_4 = {}
ids_5 = {}
ids_6 = {}
ids_7 = {}
ids_8 = {}
ids_9 = {}
path1 = "/Users/myname/Documents/Data Classifier Files/1/"
for name in os.listdir(path1):
    if name[-4:] == '.txt':
        f = open(path1 + "/" + name, "r")
        ids_1[name] = f.read()
        f.close()
path2 = "/Users/myname/Documents/Data Classifier Files/2/"
for name in os.listdir(path2):
    if name[-4:] == '.txt':
        f = open(path2 + "/" + name, "r")
        ids_2[name] = f.read()
        f.close()
path3 = "/Users/myname/Documents/Data Classifier Files/3/"
for name in os.listdir(path3):
    if name[-4:] == '.txt':
        f = open(path3 + "/" + name, "r")
        ids_3[name] = f.read()
        f.close()
path4 = "/Users/myname/Documents/Data Classifier Files/4/"
for name in os.listdir(path4):
    if name[-4:] == '.txt':
        f = open(path4 + "/" + name, "r")
        ids_4[name] = f.read()
        f.close()
path5 = "/Users/myname/Documents/Data Classifier Files/5/"
for name in os.listdir(path5):
    if name[-4:] == '.txt':
        f = open(path5 + "/" + name, "r")
        ids_5[name] = f.read()
        f.close()
path6 = "/Users/myname/Documents/Data Classifier Files/6/"
for name in os.listdir(path6):
    if name[-4:] == '.txt':
        f = open(path6 + "/" + name, "r")
        ids_6[name] = f.read()
        f.close()
path7 = "/Users/myname/Documents/Data Classifier Files/7/"
for name in os.listdir(path7):
    if name[-4:] == '.txt':
        f = open(path7 + "/" + name, "r")
        ids_7[name] = f.read()
        f.close()
path8 = "/Users/myname/Documents/Data Classifier Files/8/"
for name in os.listdir(path8):
    if name[-4:] == '.txt':
        f = open(path8 + "/" + name, "r")
        ids_8[name] = f.read()
        f.close()
path9 = "/Users/myname/Documents/Data Classifier Files/9/"
for name in os.listdir(path9):
    if name[-4:] == '.txt':
        f = open(path9 + "/" + name, "r")
        ids_9[name] = f.read()
        f.close()
feats_1 = [(word_feats(ids_1[f]), '1') for f in ids_1 ]
feats_2 = [(word_feats(ids_2[f]), "2") for f in ids_2 ]
feats_3 = [(word_feats(ids_3[f]), '3') for f in ids_3 ]
feats_4 = [(word_feats(ids_4[f]), '4') for f in ids_4 ]
feats_5 = [(word_feats(ids_5[f]), '5') for f in ids_5 ]
feats_6 = [(word_feats(ids_6[f]), '6') for f in ids_6 ]
feats_7 = [(word_feats(ids_7[f]), '7') for f in ids_7 ]
feats_8 = [(word_feats(ids_8[f]), '8') for f in ids_8 ]
feats_9 = [(word_feats(ids_9[f]), '9') for f in ids_9 ]
trainfeats = feats_1 + feats_2 + feats_3 + feats_4 + feats_5 + feats_6 + feats_7 + feats_8 + feats_9
classifier = NaiveBayesClassifier.train(trainfeats)
After executing these three lines,
stopset = list(set(stopwords.words('english')))
morewords = 'delivery', 'shipment', 'only', 'copy', 'attach', 'material'
stopset.append(morewords)
take a look at stopset (output shortened):
>>> stopset
[u'all',
u'just',
u'being',
...
u'having',
u'once',
('delivery', 'shipment', 'only', 'copy', 'attach', 'material')]
The extra entry coming from morewords is not at the same level as the words before it: instead, the whole tuple of words is treated as a single stop word, which makes no sense.
The reason is simple: list.append() adds a single element, while list.extend() adds multiple elements. So change stopset.append(morewords) to stopset.extend(morewords).
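A minimal sketch of the difference (the lists here are made up for illustration):
words = ['all', 'just', 'being']
extra = ('delivery', 'shipment')

# append() adds its argument as one element, so the whole tuple
# becomes a single item at the end of the list.
appended = list(words)
appended.append(extra)
print(appended)    # ['all', 'just', 'being', ('delivery', 'shipment')]

# extend() adds the elements of the iterable one by one.
extended = list(words)
extended.extend(extra)
print(extended)    # ['all', 'just', 'being', 'delivery', 'shipment']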
Or better yet, keep the stop words as a set so that lookups are faster. The correct way to add multiple elements to a set is set.update():
stopset = set(stopwords.words('english'))
morewords = ['delivery', 'shipment', 'only', 'copy', 'attach', 'material']
stopset.update(morewords)
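With stopset kept as a set, the original word_feats function can stay exactly the same, since the in test works on sets (and is faster there than on a list). A quick sanity check, assuming the standard NLTK English stop word list and a made-up sentence:
def word_feats(words):
    # words already in stopset (NLTK stop words plus morewords) are dropped
    return dict([(word, True) for word in words.split() if word not in stopset])

print(word_feats("attach the material copy to this delivery invoice"))
# expected: {'invoice': True} -- every other word is filtered out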