How to find the lemmas and the frequency count of each word in a list of sentences?
I want to find the lemmas using the WordNet Lemmatizer, and I also need to compute the frequency of every word.
I am getting the following error.
The traceback is as follows:
TypeError: unhashable type: 'list'
Note: the corpora are available in the nltk package itself.
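In case the required NLTK data is not installed yet, a short setup sketch like the following should fetch it (the resource names below assume a standard NLTK installation):

import nltk

# One-time downloads: sentence tokenizer models, the Gutenberg corpus,
# the stopword lists, and the WordNet data used by the lemmatizer
nltk.download('punkt')
nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')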
What I have tried so far is given below:
import nltk, re
import string
from collections import Counter
from string import punctuation
from nltk.tokenize import TweetTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import gutenberg, stopwords
from nltk.stem import WordNetLemmatizer
def remove_punctuation(from_text):
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in from_text]
    return stripped

def preprocessing():
    raw_data = (gutenberg.raw('shakespeare-hamlet.txt'))
    tokens_sentences = sent_tokenize(raw_data)
    tokens = [[word.lower() for word in line.split()] for line in tokens_sentences]
    print(len(tokens))
    global stripped_tokens
    stripped_tokens = [remove_punctuation(i) for i in tokens]
    sw = (stopwords.words('english'))
    filter_set = [[token for token in sentence if (token.lower() not in sw and token.isalnum())] for sentence in stripped_tokens]
    lemma = WordNetLemmatizer()
    global lem
    lem = []
    for w in filter_set:
        lem.append(lemma.lemmatize(w))

preprocessing()
Please help me to solve this problem.
The problem is that lemma.lemmatize expects a string, but you are passing it a list: the elements of filter_set are lists. You need to change the line:
lem.append(lemma.lemmatize(w))
like this:
lem.append([wi for wi in map(lemma.lemmatize, w)])
The code above applies lemma.lemmatize to every token (wi) in w.
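If you prefer, the map wrapped in a list comprehension can be collapsed into a single comprehension; this is equivalent and purely a stylistic alternative:

lem.append([lemma.lemmatize(wi) for wi in w])

The full code: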
import nltk, re
import string
from collections import Counter
from string import punctuation
from nltk.tokenize import TweetTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import gutenberg, stopwords
from nltk.stem import WordNetLemmatizer
def remove_punctuation(from_text):
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in from_text]
    return stripped

def preprocessing():
    raw_data = (gutenberg.raw('shakespeare-hamlet.txt'))
    tokens_sentences = sent_tokenize(raw_data)
    tokens = [[word.lower() for word in line.split()] for line in tokens_sentences]
    print(len(tokens))
    stripped_tokens = [remove_punctuation(i) for i in tokens]
    sw = (stopwords.words('english'))
    filter_set = [[token for token in sentence if (token.lower() not in sw and token.isalnum())]
                  for sentence in stripped_tokens]
    lemma = WordNetLemmatizer()
    lem = []
    for w in filter_set:
        lem.append([wi for wi in map(lemma.lemmatize, w)])
    return lem

result = preprocessing()
for e in result[:10]:  # take the first 10 results
    print(e)
Output
['tragedie', 'hamlet', 'william', 'shakespeare', '1599', 'actus', 'primus']
['scoena', 'prima']
['enter', 'barnardo', 'francisco', 'two', 'centinels']
['barnardo']
['who']
['fran']
['nay', 'answer', 'stand', 'vnfold', 'selfe', 'bar']
['long', 'liue', 'king', 'fran']
['barnardo']
['bar']
Update
To get the frequencies, you can use Counter:
result = preprocessing()
frequencies = Counter(word for sentence in result for word in sentence)
for word, frequency in frequencies.most_common(10):  # get the 10 most frequent words
    print(word, frequency)
Output
ham 337
lord 217
king 180
haue 175
come 127
let 107
shall 107
hamlet 107
thou 105
good 98
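Since the question is about each word in a list of sentences, you can also count per sentence rather than over the whole text; a minimal sketch that reuses the result returned above:

# Build one Counter per lemmatized sentence
per_sentence = [Counter(sentence) for sentence in result]
print(per_sentence[0].most_common(3))  # most frequent lemmas of the first sentence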