How to view the tf-idf score against each word
I want to know the tf-idf score of each word in a document. However, it only returns values in a sparse matrix, whereas I have seen a representation that shows the tf-idf score against each individual word. I have processed the data and the code works, but I want to change the way the result is presented:
Code:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# bag-of-words counts for the comments
bow_transformer = CountVectorizer(analyzer=text_process).fit(df["comments"].head())
print(len(bow_transformer.vocabulary_))
message_bow = bow_transformer.transform(df["comments"].head())

# tf-idf weights computed from the bag-of-words counts
tfidf_transformer = TfidfTransformer().fit(message_bow)
message_tfidf = tfidf_transformer.transform(message_bow)
I get a result like (39028,01),(1393,1672). However, I would like the result to look like:
features tfidf
fruit 0.00344
excellent 0.00289
One way is to use TfidfVectorizer directly and load the resulting matrix into a pandas DataFrame, with the vocabulary as the column names:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

vect = TfidfVectorizer()
tfidf_matrix = vect.fit_transform(documents["comments"])
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vect.get_feature_names())
print(df)
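If you also want the two-column features/tfidf layout from that DataFrame, here is a minimal sketch, assuming df is the document-term DataFrame built above (the names scores and doc are only illustrative):

# Reshape the wide document-term DataFrame into a long "features / tfidf" table.
# Each row of `df` is a document, each column a word from the vocabulary.
scores = df.stack().rename_axis(["doc", "features"]).reset_index(name="tfidf")
scores = scores[scores["tfidf"] > 0]   # keep only words that actually occur in a document
print(scores.sort_values("tfidf", ascending=False).head(10))

For a single document you can instead sort one row, e.g. df.iloc[0].sort_values(ascending=False).head(10).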
You can achieve the result shown above with the following code:
def extract_topn_from_vector(feature_names, sorted_items, topn=5):
    """
    Get the feature names and tf-idf scores of the top n items in the doc,
    in descending order of scores.
    """
    # use only the top n items from the vector
    sorted_items = sorted_items[:topn]
    results = {}
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        results[feature_names[idx]] = round(score, 3)
    # return a sorted list of (feature name, tf-idf score) tuples,
    # in descending order of tf-idf scores
    return sorted(results.items(), key=lambda kv: kv[1], reverse=True)
feature_names = count_vect.get_feature_names()
coo_matrix = message_tfidf.tocoo()
tuples = zip(coo_matrix.col, coo_matrix.data)
sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
# extract only the top n elements.
# Here, n is 10.
word_tfidf = extract_topn_from_vector(feature_names, sorted_items, 10)
print("{} {}".format("features", "tfidf"))
for k in word_tfidf:
    print("{} - {}".format(k[0], k[1]))
Take a look at the full code below for a better understanding of the snippet above. The code below is self-explanatory.
Full code:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import string
import re
import nltk
import pandas as pd
data = pd.read_csv('yourfile.csv')
stops = set(stopwords.words("english"))
wl = nltk.WordNetLemmatizer()
def clean_text(text):
    """
    - Remove punctuation
    - Tokenize
    - Remove stopwords
    - Lemmatize
    """
    # strip punctuation before tokenizing so it does not end up in the tokens
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split(r"\W+", text_nopunct)
    text = [word for word in tokens if word not in stops]
    text = [wl.lemmatize(word) for word in text]
    return text
def extract_topn_from_vector(feature_names, sorted_items, topn=5):
    """
    Get the feature names and tf-idf scores of the top n items in the doc,
    in descending order of scores.
    """
    # use only the top n items from the vector
    sorted_items = sorted_items[:topn]
    results = {}
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        results[feature_names[idx]] = round(score, 3)
    # return a sorted list of (feature name, tf-idf score) tuples,
    # in descending order of tf-idf scores
    return sorted(results.items(), key=lambda kv: kv[1], reverse=True)
count_vect = CountVectorizer(analyzer=clean_text, tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
freq_term_matrix = count_vect.fit_transform(data['text_body'])
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
feature_names = count_vect.get_feature_names()
# sample document
doc = 'watched horrid thing TV. Needless say one movies watch see much worse get.'
tf_idf_vector = tfidf.transform(count_vect.transform([doc]))
coo_matrix = tf_idf_vector.tocoo()
tuples = zip(coo_matrix.col, coo_matrix.data)
sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
# extract only the top n elements.
# Here, n is 10.
word_tfidf = extract_topn_from_vector(feature_names,sorted_items,10)
print("{} {}".format("features", "tfidf"))
for k in word_tfidf:
    print("{} - {}".format(k[0], k[1]))
Sample output:
features tfidf
Needless - 0.515
horrid - 0.501
worse - 0.312
watched - 0.275
TV - 0.272
say - 0.202
watch - 0.199
thing - 0.189
much - 0.177
see - 0.164
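One caveat: in newer scikit-learn releases (1.2 and later) get_feature_names() has been removed, so if the calls above raise an AttributeError, swap in get_feature_names_out(), which returns the same vocabulary (as an array):

# equivalent call for scikit-learn >= 1.0 (get_feature_names() was removed in 1.2)
feature_names = count_vect.get_feature_names_out()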