使用 lucene 对文档进行评分

scoring documents with lucene

在lucene中自定义tf-idf算法

我要更改 lucene 中的 tf-idf 算法,所以我创建一个新的 IndexSearcher 实例并调用 setSimilarity 函数。

is.setSimilarity(new TFIDFSimilarity() {
            /**
             * Score factor for the fraction of all query terms that a document contains.
             * This implementation returns a constant, so the factor does not vary with the overlap.
             *
             * @param overlap the number of query terms matched in the document
             * @param maxOverlap the total number of terms in the query
             * @return always 1
             */
            @Override
            public float coord(int overlap, int maxOverlap) {
                return 1;
            }

            /**
             * Normalization value for a query. This implementation ignores its argument
             * and returns a fixed constant.
             *
             * @param valueForNormalization the sum of the squares of the query term weights
             * @return always 100
             */
            @Override
            public float queryNorm(float valueForNormalization) {
                return 100;
            }

            /**
             * Score factor for a term or phrase's frequency in a document.
             *
             * @param freq the frequency of the term or phrase in the document
             * @return the raw frequency, unchanged
             */
            @Override
            public float tf(float freq) {
                return freq;
            }

            /**
             * Score factor for a term's document frequency (the number of documents
             * which contain the term), computed as the plain ratio numDocs / docFreq.
             *
             * @param docFreq the number of documents which contain the term
             * @param numDocs the total number of documents
             * @return numDocs divided by docFreq, with docFreq treated as at least 1
             */
            @Override
            public float idf(long docFreq, long numDocs) {
                // Clamp the divisor so that docFreq == 0 yields a finite value
                // instead of an ArithmeticException (long division by zero).
                long divisor = Math.max(1L, docFreq);
                // Divide in floating point: long / long would truncate the ratio
                // (e.g. 3 / 2 -> 1) before it is widened to float.
                return (float) numDocs / divisor;
            }

            /**
             * Index-time normalization value for a field instance.
             *
             * @param state statistics of the current field (such as length, boost, etc)
             * @return always 1, i.e. the field statistics are not used
             */
            @Override
            public float lengthNorm(FieldInvertState state) {
                return 1;
            }

            /**
             * Decodes a normalization factor stored in an index.
             *
             * @param l the stored norm value
             * @return the stored value converted to float, with no further decoding
             */
            @Override
            public float decodeNormValue(long l) {
                return l;
            }

            /**
             * Encodes a normalization factor for storage in an index.
             *
             * @param v the normalization factor to store
             * @return the integer part of {@code v}
             */
            @Override
            public long encodeNormValue(float v) {
                // Must be the inverse of decodeNormValue, which is a plain long -> float
                // conversion. Returning a constant 0 here would make every decoded norm 0.
                // Only the integer part survives; that is enough for the constant 1
                // produced by lengthNorm, but fractional norms would need a richer encoding.
                return (long) v;
            }

            /**
             * Amount of a sloppy phrase match, based on an edit distance.
             *
             * @param distance the edit distance of this sloppy phrase match
             * @return always 1, i.e. the distance is not used
             */
            @Override
            public float sloppyFreq(int distance) {
                return 1;
            }

            /**
             * Scoring factor based on the data in the payload.
             *
             * @return always 1, i.e. payloads are not used
             */
            @Override
            public float scorePayload(int doc, int start, int end, BytesRef payload) {
                return 1;
            }
        });

如何使用这些函数实现真正的 tf-idf 算法,然后将它们定制成我自己的算法?

最后我使用了下面这个类,它实现了 TF-IDF 相似度:

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.similarities.DefaultSimilarity;

public class Similarity1 extends DefaultSimilarity {

    public float tf(int freq) {
        if (freq > 0) {
            return 1 + (float) Math.log(freq);
        } else {
            return 0;
        }
    }

    public float idf(int docFreq, int numDocs) {
        return super.idf(docFreq, numDocs);
    }

    // normalization factor so that queries can be compared
    public float queryNorm(float sumOfSquaredWeights) {
        return super.queryNorm(sumOfSquaredWeights);
    }

    // number of terms in the query that were found in the document
    public float coord(int overlap, int maxOverlap) {
        return super.coord(overlap, maxOverlap);
    }

    public float computeNorm(String fieldName, FieldInvertState state) {
        return super.computeNorm(state);
    }
}