How to pass the Reuters-21578 dataset as an input parameter to a tokenize function in Python
I am trying to pass the Reuters-21578 dataset as an input parameter to the tokenize function `def tokenize(text):`, which should remove stop words, tokenize, stem, and lowercase the text.
#!/usr/bin/python3
import nltk
import pandas as pd
import numpy as np
import string
from nltk.corpus import reuters
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re

cachedStopWords = stopwords.words("english")

for index, i in enumerate(reuters.fileids()):
    text = reuters.raw(fileids=[i])
    # output in a txt file
    # print(text, file=open("output.txt", "a"))

def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words
             if word not in cachedStopWords]
    tokens = list(map(lambda token: PorterStemmer().stem(token),
                      words))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token:
                                  p.match(token) and len(token) >= min_length,
                                  tokens))
    return filtered_tokens

result = tokenize(text)
print(result)
As a result I only get the following:
['a.h.a', 'automot', 'technolog', 'corp', 'year', 'net', 'shr', 'shr', 'dilut', 'net', 'rev', 'mln', 'mln']
How can I pass the whole dataset to the tokenize function instead?
You are overwriting `text` on every iteration of the for loop, which is why the output you get belongs only to the last record in the Reuters dataset. Just make a small change to your code:
text = ''
for index, i in enumerate(reuters.fileids()):
    text += reuters.raw(fileids=[i])
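If building one very large string is a concern, a minimal alternative sketch (reusing the tokenize function defined above; the variable names here are illustrative) is to tokenize each document separately and collect the tokens:

# Sketch: tokenize each Reuters document individually and gather the tokens,
# instead of concatenating all raw text into a single large string first.
all_tokens = []
for fileid in reuters.fileids():
    all_tokens.extend(tokenize(reuters.raw(fileids=[fileid])))
print(len(all_tokens))

For bag-of-words style processing this yields the same set of tokens as tokenizing the concatenated text, while avoiding the memory cost of holding the entire corpus as one string.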