如何得到与 sklearn 的 TfidfVectorizer 相同的结果?
How to reproduce the same results as sklearn's TfidfVectorizer?
我正在尝试从头实现 TfidfVectorizer。我已经实现了一个与 sklearn 几乎相同的向量化器,但得到的 tf-idf 分数与 TfidfVectorizer 的结果不一致。
这是我的代码:
def vocab(corpus):
    """Count every whitespace-separated token across all documents.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A plain ``dict`` mapping each token to the total number of times it
        occurs in the whole corpus (repeats inside one document are all
        counted, so this is not a document frequency).
    """
    totals = Counter()
    for text in corpus:
        # str.split() with no argument splits on any run of whitespace.
        totals.update(text.split())
    return dict(totals)
def tfidf(corpus, vocab):
    """Print L2-normalised TF-IDF scores for each document in ``corpus``.

    Scores are computed as ``count * idf`` with the smoothed inverse document
    frequency ``idf = 1 + ln((1 + n_docs) / (1 + df))`` and each document's
    scores are then divided by their Euclidean norm. This is the weighting
    scikit-learn documents for ``TfidfVectorizer`` with default settings;
    tokens here come from plain whitespace splitting, so results only line up
    with scikit-learn when its tokenizer yields the same tokens.

    Args:
        corpus: Sequence of document strings.
        vocab: The vocabulary. Either a mapping whose keys are the known
            words, or a callable that takes ``corpus`` and returns such a
            mapping. Words missing from it are skipped.

    Returns:
        None. One line ``(row, col) score`` is printed per scored word, where
        ``row`` is the document index and ``col`` is the position of the word
        among the document's distinct words (first-occurrence order).
    """
    import math  # local import: the snippet never imported ``math``

    # The parameter shadows the module-level ``vocab`` function, so calling it
    # unconditionally failed with TypeError whenever a dict was passed in.
    vocabs = vocab(corpus) if callable(vocab) else vocab

    tokenized = [sentence.split() for sentence in corpus]
    n_docs = len(tokenized)

    # Document frequency = number of documents containing the word. Taking
    # set(words) makes a word repeated inside one document count only once.
    doc_freq = Counter(word for words in tokenized for word in set(words))

    for row, words in enumerate(tokenized):
        weights = []
        for col, (word, freq) in enumerate(Counter(words).items()):
            if word not in vocabs:
                continue
            idf = 1.0 + math.log((n_docs + 1) / (doc_freq[word] + 1))
            # Term frequency is the raw count, not count / character length.
            weights.append((col, freq * idf))

        # Every weight is >= 1 (freq >= 1 and idf >= 1), so the norm is only
        # zero when ``weights`` is empty, in which case nothing is printed.
        norm = math.sqrt(sum(weight * weight for _, weight in weights))
        for col, weight in weights:
            print((row, col), weight / norm)
# Build the token-count mapping for the corpus, then print the scores.
# NOTE(review): `corpus` is not defined in this snippet — presumably a list of
# document strings defined elsewhere; confirm before running.
vocabs = vocab(corpus)
tfidf(corpus, vocabs)
第一行的输出是
(0, 0) 0.038461538461538464
(0, 1) 0.038461538461538464
(0, 2) 0.038461538461538464
(0, 3) 0.05810867783715349
(0, 4) 0.038461538461538464
而 sklearn 的 TfidfVectorizer 的输出是
(0, 8) 0.38408524091481483
(0, 6) 0.38408524091481483
(0, 3) 0.38408524091481483
(0, 2) 0.5802858236844359
(0, 1) 0.46979138557992045
你能告诉我哪里错了吗?谢谢。
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import numpy as np
import pandas as pd
def tfidf_vectorizer(corpus):
    """Compute an un-normalised TF-IDF matrix for ``corpus``.

    Each cell is ``count * idf`` where ``count`` is how often the term occurs
    in the document and ``idf = 1 + ln((1 + n_docs) / (1 + df))``, with ``df``
    the number of documents that contain the term as a whole token. Documents
    are tokenised by whitespace splitting only (no lower-casing, no
    punctuation handling).

    Args:
        corpus: Sequence of document strings.

    Returns:
        A tuple ``(terms, mat)``: ``terms`` is the alphabetically sorted list
        of distinct tokens and ``mat`` is a ``numpy`` array of shape
        ``(len(corpus), len(terms))`` whose columns follow ``terms``.
    """
    tokenized = [document.split() for document in corpus]
    terms = sorted({token for tokens in tokenized for token in tokens})
    column_of = {term: j for j, term in enumerate(terms)}

    # Document frequency is counted on whole tokens. A substring test such as
    # ``term in document`` would wrongly count 'is' for a document that only
    # contains 'this'.
    df = np.zeros(len(terms))
    for tokens in tokenized:
        for token in set(tokens):
            df[column_of[token]] += 1

    # idf depends only on the term, so compute it once for all documents.
    idf = 1.0 + np.log((len(corpus) + 1) / (df + 1))

    mat = np.zeros((len(corpus), len(terms)))
    for i, tokens in enumerate(tokenized):
        for token, count in Counter(tokens).items():
            j = column_of[token]
            mat[i, j] = count * idf[j]
    return (terms, mat)
# Three-document toy corpus used to compare the two implementations.
corpus = ['this is the first document',
          'this document is the second document',
          'this one is the third']

# manual calculation
vectorizer_1 = tfidf_vectorizer(corpus)
terms_1 = vectorizer_1[0]
matrix_1 = vectorizer_1[1]

# scikit-learn calculation
# norm=None disables row normalisation so the raw tf * idf products can be
# compared with the manual matrix.
vectorizer_2 = TfidfVectorizer(norm=None).fit(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer_2, 'get_feature_names_out'):
    terms_2 = list(vectorizer_2.get_feature_names_out())
else:
    terms_2 = vectorizer_2.get_feature_names()
matrix_2 = vectorizer_2.transform(corpus).toarray()

print(pd.DataFrame(data=matrix_1, columns=terms_1))
document first is one second the third this
0 1.287682 1.693147 1.0 0.000000 0.000000 1.0 0.000000 1.0
1 2.575364 0.000000 1.0 0.000000 1.693147 1.0 0.000000 1.0
2 0.000000 0.000000 1.0 1.693147 0.000000 1.0 1.693147 1.0
# Show the scikit-learn matrix with the feature names as column labels.
print(pd.DataFrame(data=matrix_2, columns=terms_2))
document first is one second the third this
0 1.287682 1.693147 1.0 0.000000 0.000000 1.0 0.000000 1.0
1 2.575364 0.000000 1.0 0.000000 1.693147 1.0 0.000000 1.0
2 0.000000 0.000000 1.0 1.693147 0.000000 1.0 1.693147 1.0
我正在尝试从头实现 TfidfVectorizer。我已经实现了一个与 sklearn 几乎相同的向量化器,但得到的 tf-idf 分数与 TfidfVectorizer 的结果不一致。
这是我的代码:
def vocab(corpus):
    """Count every whitespace-separated token across all documents.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A plain ``dict`` mapping each token to the total number of times it
        occurs in the whole corpus (repeats inside one document are all
        counted, so this is not a document frequency).
    """
    totals = Counter()
    for text in corpus:
        # str.split() with no argument splits on any run of whitespace.
        totals.update(text.split())
    return dict(totals)
def tfidf(corpus, vocab):
    """Print L2-normalised TF-IDF scores for each document in ``corpus``.

    Scores are computed as ``count * idf`` with the smoothed inverse document
    frequency ``idf = 1 + ln((1 + n_docs) / (1 + df))`` and each document's
    scores are then divided by their Euclidean norm. This is the weighting
    scikit-learn documents for ``TfidfVectorizer`` with default settings;
    tokens here come from plain whitespace splitting, so results only line up
    with scikit-learn when its tokenizer yields the same tokens.

    Args:
        corpus: Sequence of document strings.
        vocab: The vocabulary. Either a mapping whose keys are the known
            words, or a callable that takes ``corpus`` and returns such a
            mapping. Words missing from it are skipped.

    Returns:
        None. One line ``(row, col) score`` is printed per scored word, where
        ``row`` is the document index and ``col`` is the position of the word
        among the document's distinct words (first-occurrence order).
    """
    import math  # local import: the snippet never imported ``math``

    # The parameter shadows the module-level ``vocab`` function, so calling it
    # unconditionally failed with TypeError whenever a dict was passed in.
    vocabs = vocab(corpus) if callable(vocab) else vocab

    tokenized = [sentence.split() for sentence in corpus]
    n_docs = len(tokenized)

    # Document frequency = number of documents containing the word. Taking
    # set(words) makes a word repeated inside one document count only once.
    doc_freq = Counter(word for words in tokenized for word in set(words))

    for row, words in enumerate(tokenized):
        weights = []
        for col, (word, freq) in enumerate(Counter(words).items()):
            if word not in vocabs:
                continue
            idf = 1.0 + math.log((n_docs + 1) / (doc_freq[word] + 1))
            # Term frequency is the raw count, not count / character length.
            weights.append((col, freq * idf))

        # Every weight is >= 1 (freq >= 1 and idf >= 1), so the norm is only
        # zero when ``weights`` is empty, in which case nothing is printed.
        norm = math.sqrt(sum(weight * weight for _, weight in weights))
        for col, weight in weights:
            print((row, col), weight / norm)
# Build the token-count mapping for the corpus, then print the scores.
# NOTE(review): `corpus` is not defined in this snippet — presumably a list of
# document strings defined elsewhere; confirm before running.
vocabs = vocab(corpus)
tfidf(corpus, vocabs)
第一行的输出是
(0, 0) 0.038461538461538464
(0, 1) 0.038461538461538464
(0, 2) 0.038461538461538464
(0, 3) 0.05810867783715349
(0, 4) 0.038461538461538464
而 sklearn 的 TfidfVectorizer 的输出是
(0, 8) 0.38408524091481483
(0, 6) 0.38408524091481483
(0, 3) 0.38408524091481483
(0, 2) 0.5802858236844359
(0, 1) 0.46979138557992045
你能告诉我哪里错了吗?谢谢。
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import numpy as np
import pandas as pd
def tfidf_vectorizer(corpus):
    """Compute an un-normalised TF-IDF matrix for ``corpus``.

    Each cell is ``count * idf`` where ``count`` is how often the term occurs
    in the document and ``idf = 1 + ln((1 + n_docs) / (1 + df))``, with ``df``
    the number of documents that contain the term as a whole token. Documents
    are tokenised by whitespace splitting only (no lower-casing, no
    punctuation handling).

    Args:
        corpus: Sequence of document strings.

    Returns:
        A tuple ``(terms, mat)``: ``terms`` is the alphabetically sorted list
        of distinct tokens and ``mat`` is a ``numpy`` array of shape
        ``(len(corpus), len(terms))`` whose columns follow ``terms``.
    """
    tokenized = [document.split() for document in corpus]
    terms = sorted({token for tokens in tokenized for token in tokens})
    column_of = {term: j for j, term in enumerate(terms)}

    # Document frequency is counted on whole tokens. A substring test such as
    # ``term in document`` would wrongly count 'is' for a document that only
    # contains 'this'.
    df = np.zeros(len(terms))
    for tokens in tokenized:
        for token in set(tokens):
            df[column_of[token]] += 1

    # idf depends only on the term, so compute it once for all documents.
    idf = 1.0 + np.log((len(corpus) + 1) / (df + 1))

    mat = np.zeros((len(corpus), len(terms)))
    for i, tokens in enumerate(tokenized):
        for token, count in Counter(tokens).items():
            j = column_of[token]
            mat[i, j] = count * idf[j]
    return (terms, mat)
# Three-document toy corpus used to compare the two implementations.
corpus = ['this is the first document',
          'this document is the second document',
          'this one is the third']

# manual calculation
vectorizer_1 = tfidf_vectorizer(corpus)
terms_1 = vectorizer_1[0]
matrix_1 = vectorizer_1[1]

# scikit-learn calculation
# norm=None disables row normalisation so the raw tf * idf products can be
# compared with the manual matrix.
vectorizer_2 = TfidfVectorizer(norm=None).fit(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer_2, 'get_feature_names_out'):
    terms_2 = list(vectorizer_2.get_feature_names_out())
else:
    terms_2 = vectorizer_2.get_feature_names()
matrix_2 = vectorizer_2.transform(corpus).toarray()

print(pd.DataFrame(data=matrix_1, columns=terms_1))
document first is one second the third this
0 1.287682 1.693147 1.0 0.000000 0.000000 1.0 0.000000 1.0
1 2.575364 0.000000 1.0 0.000000 1.693147 1.0 0.000000 1.0
2 0.000000 0.000000 1.0 1.693147 0.000000 1.0 1.693147 1.0
# Show the scikit-learn matrix with the feature names as column labels.
print(pd.DataFrame(data=matrix_2, columns=terms_2))
document first is one second the third this
0 1.287682 1.693147 1.0 0.000000 0.000000 1.0 0.000000 1.0
1 2.575364 0.000000 1.0 0.000000 1.693147 1.0 0.000000 1.0
2 0.000000 0.000000 1.0 1.693147 0.000000 1.0 1.693147 1.0