如何使用 NLTK 将标记列表转换为 WordNet 词元(lemma)列表?

How to convert token list into wordnet lemma list using nltk?

我有一个从 PDF 源文件中提取的标记(token)列表。我已经能够对文本进行预处理并将其标记化,但我想遍历这些标记,并把列表中的每个标记转换为它在 WordNet 语料库中的词元(lemma)。我的标记列表如下所示:

['0000', 'Everyone', 'age', 'remembers', 'Þ', 'rst', 'heard', 'contest', 'I', 'sitting', 'hideout', 'watching', ...]

像 'Everyone'、'0000'、'Þ' 这样没有词元的词需要删除。而对于 'age'、'remembers'、'heard' 等词,转换后的标记列表应该如下所示:

['age', 'remember', 'hear', ...]


# Look up the synsets for the single word "heard".
# NOTE(review): `wn` presumably refers to nltk.corpus.wordnet in this snippet — confirm;
# elsewhere in the question `wn` is bound to a WordNetLemmatizer instead.
syns = wn.synsets("heard")


def clean_text(text):
    """Strip punctuation, tokenize, lemmatize, then attempt a synset lookup.

    NOTE(review): this is the failing version from the question. The
    ``wn.synsets(text)`` call is the line reported in the error, and the
    function returns the lemmatized token list rather than ``syns``.
    """
    # Eliminating punctuations (assumes `text` is a str, so this iterates characters)
    text = "".join([word for word in text if word not in string.punctuation])
    # tokenizing
    tokens = re.split("\W+", text)
    # lemmatizing and removing stopwords; `text` is rebound to a list here
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    # converting token list into synset
    # NOTE(review): the argument `text` is the whole token list, not a single word;
    # the loop variable also reuses the name `text`.
    syns = [text.lemmas()[0].name() for text in wn.synsets(text)]
    return text


syns = [text.lemmas()[0].name() for text in wn.synsets(text)]
AttributeError: 'list' object has no attribute 'lower'



import string
import re
from wordcloud import WordCloud
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet
import PyPDF4
import matplotlib
import numpy as np
from PIL import Image

# English stopword list from the NLTK corpus; clean_text filters tokens against it.
stopwords = nltk.corpus.stopwords.words('english')
moreStopwords = ['clin97803078874365pallr1indd'] # additional stopwords to be removed manually.
wn = nltk.WordNetLemmatizer()

# Concatenate the extracted text of every page into one string.
# The extraction happens inside the `with` block so the file is still open
# while pages are read, and is closed deterministically afterwards.
# NOTE(review): `data` must not be used to read pages after this block,
# because its underlying file handle is closed.
with open('ReadyPlayerOne.pdf', 'rb') as pdf_file:
    data = PyPDF4.PdfFileReader(pdf_file)
    pageData = ''.join(page.extractText() for page in data.pages)
# print(pageData)
# print(pageData)

def clean_text(text):
    """Strip punctuation, tokenize, lemmatize, then try to collect WordNet lemma names.

    NOTE(review): this is the failing version from the question. The
    ``wordnet.synsets(text)`` call receives the token list rather than a
    single word, which produces
    ``AttributeError: 'list' object has no attribute 'lower'``.
    """
    # Remove punctuation (assumes `text` is a str, so this iterates characters).
    text = "".join([word for word in text if word not in string.punctuation])
    # Split on runs of non-word characters.
    tokens = re.split("\W+", text)
    # Lemmatize every token that is not a stopword; `text` is rebound to a list here.
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    # The loop variable reuses the name `text`, but the argument passed to
    # wordnet.synsets is still the list built on the previous line.
    syns = [text.lemmas()[0].name() for text in wordnet.synsets(text)]
    return syns


您是用一个单词列表来调用 wordnet.synsets(text) 的(请检查此时 text 的值),而它应该对单个单词调用。wordnet.synsets 在预处理时会对其参数调用 .lower(),而列表没有这个方法,因此出现错误 (AttributeError: 'list' object has no attribute 'lower')。

下面是修复了这个问题、可以正常运行的 clean_text 版本:

import string
import re
import nltk
from nltk.corpus import wordnet

# English stopword list from the NLTK corpus; clean_text filters tokens against it.
stopwords = nltk.corpus.stopwords.words('english')
# Lemmatizer instance used by clean_text (distinct from the `wordnet` corpus reader).
wn = nltk.WordNetLemmatizer()

def clean_text(text):
    """Return WordNet lemma names for the non-stopword tokens of *text*.

    Punctuation is removed, the remainder is split on runs of non-word
    characters, stopwords are dropped and each surviving token is
    lemmatized. Every lemmatized token is then looked up in WordNet, and
    the name of the first lemma of each returned synset is collected.

    Args:
        text: The raw input string.

    Returns:
        A flat list of lemma names, ordered by token and then by synset.
        A token for which the lookup returns no synsets contributes nothing.
    """
    # Drop punctuation one character at a time.
    stripped = "".join(char for char in text if char not in string.punctuation)
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    tokens = re.split(r"\W+", stripped)
    # re.split yields empty strings when the text starts or ends with a
    # separator; skip them instead of lemmatizing and looking them up.
    lemmatized = [
        wn.lemmatize(word) for word in tokens if word and word not in stopwords
    ]
    lemmas = []
    for token in lemmatized:
        # wordnet.synsets must be called with a single word, not the whole list.
        lemmas.extend(
            synset.lemmas()[0].name() for synset in wordnet.synsets(token)
        )
    return lemmas

# Sample input. NOTE(review): the list shown below appears to be the result of
# clean_text(text) — the call itself is not shown here.
text = "The grass was greener."



['grass', 'Grass', 'supergrass', 'eatage', 'pot', 'grass', 'grass', 'grass', 'grass', 'grass', 'denounce', 'green', 'green', 'green', 'green', 'fleeceable']