如何得到与 sklearn 的 TfidfVectorizer 相同的结果？

Question

我正在尝试从头开始构建 TfidfVectorizer，我已经构建了与 sklearn 几乎相同的矢量化器，但我无法获得与 TfidfVectorizer 相同的 tf-idf 分数。

这是我的代码：

def vocab(corpus):
    """Count how often each whitespace-separated token occurs in the corpus.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A plain ``dict`` mapping each token to its total number of
        occurrences across all documents, keyed in first-seen order.
    """
    token_counts = Counter()
    for document in corpus:
        # Splitting per document yields the same tokens as joining the
        # documents with spaces and splitting the result once.
        token_counts.update(document.split())
    return dict(token_counts)


def tfidf(corpus, vocab):
    """Print and return L2-normalised tf-idf scores for every document.

    The weighting follows the scheme scikit-learn's ``TfidfVectorizer``
    uses by default: raw term counts as tf, the smoothed idf
    ``1 + ln((1 + n_docs) / (1 + df))`` where ``df`` is the number of
    documents containing the term, and each document vector scaled to unit
    Euclidean length. Documents are tokenised with ``str.split()`` only;
    no lower-casing or punctuation stripping is performed here.

    Args:
        corpus: Sequence of document strings.
        vocab: The vocabulary, given either as a mapping/collection whose
            keys are the terms, or as a callable that builds such a
            mapping from ``corpus``. Only the terms are used; any values
            stored in the mapping are ignored. Column indices are the
            positions of the terms in sorted order.

    Returns:
        A dict mapping ``(row, col)`` to the tf-idf score of every term
        that occurs in the document. Each entry is also printed as
        ``(row, col) score``.
    """
    # The parameter shadows any module-level ``vocab`` function, so only
    # call it when the caller really handed in a callable.
    terms = vocab(corpus) if callable(vocab) else vocab
    column_of = {term: col for col, term in enumerate(sorted(terms))}

    tokenized = [sentence.split() for sentence in corpus]

    # Document frequency: count each term at most once per document.
    doc_freq = Counter()
    for tokens in tokenized:
        doc_freq.update(word for word in set(tokens) if word in column_of)

    n_docs = len(corpus)
    scores = {}
    for row, tokens in enumerate(tokenized):
        weights = {}
        for word, freq in Counter(tokens).items():
            col = column_of.get(word)
            if col is None:
                # Out-of-vocabulary words contribute nothing.
                continue
            idf = 1.0 + math.log((n_docs + 1) / (doc_freq[word] + 1))
            weights[col] = freq * idf

        # idf is always >= 1, so the norm is positive whenever the
        # document has at least one in-vocabulary term.
        norm = math.sqrt(sum(weight * weight for weight in weights.values()))
        for col in sorted(weights):
            score = weights[col] / norm
            scores[(row, col)] = score
            print((row, col), score)
    return scores


# Build the token-count dictionary for the corpus, then run tfidf() on it.
# NOTE(review): `corpus` is not defined in this snippet; presumably a list of
# document strings defined elsewhere -- confirm.
vocabs = vocab(corpus)
tfidf(corpus, vocabs)

第一行的输出是

(0, 0) 0.038461538461538464

(0, 1) 0.038461538461538464

(0, 2) 0.038461538461538464

(0, 3) 0.05810867783715349

(0, 4) 0.038461538461538464

而 sklearn 的 TfidfVectorizer 的输出是

(0, 8) 0.38408524091481483

(0, 6) 0.38408524091481483

(0, 3) 0.38408524091481483

(0, 2) 0.5802858236844359

(0, 1) 0.46979138557992045

你能告诉我哪里错了吗？谢谢。

Answer 1

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import numpy as np
import pandas as pd

def tfidf_vectorizer(corpus):
    """Compute an un-normalised tf-idf matrix for a whitespace-tokenised corpus.

    Each cell holds ``tf * idf`` where ``tf`` is the raw count of the term
    in the document and ``idf = 1 + ln((1 + n_docs) / (1 + df))`` with
    ``df`` the number of documents that contain the term as a token. No row
    normalisation is applied.

    Args:
        corpus: Sequence of document strings.

    Returns:
        A ``(terms, mat)`` tuple: ``terms`` is the sorted list of distinct
        tokens and ``mat`` is a ``len(corpus) x len(terms)`` NumPy array
        whose columns follow the order of ``terms``.
    """
    tokenized = [document.split() for document in corpus]
    terms = sorted({token for tokens in tokenized for token in tokens})
    column_of = {term: j for j, term in enumerate(terms)}

    # Document frequency must be counted on tokens: a substring test on the
    # raw string would, for example, count "cat" inside "category".
    doc_freq = Counter()
    for tokens in tokenized:
        doc_freq.update(set(tokens))

    # idf depends only on the term, so compute it once per term instead of
    # once per (document, term) pair.
    n_docs = len(corpus)
    idf = {term: 1.0 + np.log((n_docs + 1) / (doc_freq[term] + 1))
           for term in terms}

    mat = np.zeros((n_docs, len(terms)))
    for i, tokens in enumerate(tokenized):
        for term, count in Counter(tokens).items():
            mat[i, column_of[term]] = count * idf[term]

    return (terms, mat)

corpus = ['this is the first document',
          'this document is the second document',
          'this one is the third']

# manual calculation: tfidf_vectorizer returns a (terms, matrix) pair
vectorizer_1 = tfidf_vectorizer(corpus)

terms_1 = vectorizer_1[0]
matrix_1 = vectorizer_1[1]

# scikit-learn calculation; norm=None switches off row normalisation so the
# raw tf * idf weights can be compared with the manual matrix
vectorizer_2 = TfidfVectorizer(norm=None).fit(corpus)

# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
terms_2 = vectorizer_2.get_feature_names_out()
matrix_2 = vectorizer_2.transform(corpus).toarray()

print(pd.DataFrame(data=matrix_1, columns=terms_1))

   document     first   is       one    second  the     third  this
0  1.287682  1.693147  1.0  0.000000  0.000000  1.0  0.000000   1.0
1  2.575364  0.000000  1.0  0.000000  1.693147  1.0  0.000000   1.0
2  0.000000  0.000000  1.0  1.693147  0.000000  1.0  1.693147   1.0

# Display the scikit-learn matrix with its terms as column labels.
print(pd.DataFrame(data=matrix_2, columns=terms_2))

   document     first   is       one    second  the     third  this
0  1.287682  1.693147  1.0  0.000000  0.000000  1.0  0.000000   1.0
1  2.575364  0.000000  1.0  0.000000  1.693147  1.0  0.000000   1.0
2  0.000000  0.000000  1.0  1.693147  0.000000  1.0  1.693147   1.0

如何得到与 sklearn 的 TfidfVectorizer 相同的结果？

How to reflect the same results as from sklearn's TfidfVectorizer?

python

machine-learning

tf-idf

scikit-learn