Lucene CustomScoreQuery 不从 FunctionQuery 的 FieldSource 传递值
Lucene CustomScoreQuery does not pass value from FunctionQuery's FieldSource
如果我对 Lucene Java 文档的理解正确,将 CustomScoreQuery 实例设置为 strict(严格)模式后,FunctionQuery 的 FieldSource 值应当不经任何修改(例如归一化)地传递给 CustomScoreProvider 的方法 public float customScore(int doc, float subQueryScore, float valSrcScore) 中的 valSrcScore 参数。
因此,我原以为我会得到存储在文档 FloatField 字段中的精确浮点值。
但当索引数据量变大时,情况似乎并非如此。这里我有一个最小的例子来说明我的意思:
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queries.*;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queries.function.valuesource.FloatFieldSource;
import org.apache.lucene.search.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import java.io.IOException;
public class CustomScoreTest {
public static void main(String[] args) throws IOException {
RAMDirectory index = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, new StandardAnalyzer());
IndexWriter writer = new IndexWriter(index, config);
// prepare dummy text
String text = "";
for (int i = 0; i < 1000; i++) text += "abc ";
// add dummy docs
for (int i = 0; i <25000; i++) {
Document doc = new Document();
doc.add(new FloatField("number", i * 100f, Field.Store.YES));
doc.add(new TextField("text", text, Field.Store.YES));
writer.addDocument(doc);
}
writer.close();
IndexReader reader = IndexReader.open(index);
IndexSearcher searcher = new IndexSearcher(reader);
Query q1 = new TermQuery(new Term("text", "abc"));
CustomScoreQuery q2 = new CustomScoreQuery(q1, new FunctionQuery(new FloatFieldSource("number"))) {
protected CustomScoreProvider getCustomScoreProvider(AtomicReaderContext ctx) throws IOException {
return new CustomScoreProvider(ctx) {
public float customScore(int doc, float subQueryScore, float valSrcScore) throws IOException {
float diff = Math.abs(valSrcScore - searcher.doc(doc).getField("number").numericValue().floatValue());
if (diff > 0) throw new IllegalStateException("diff: " + diff);
return super.customScore(doc, subQueryScore, valSrcScore);
}
};
}
};
// In strict custom scoring, the part does not participate in weight normalization.
// This may be useful when one wants full control over how scores are modified, and
// does not care about normalising by the part
q2.setStrict(true);
// Exception in thread "main" java.lang.IllegalStateException: diff: 1490700.0
searcher.search(q2, 10);
}
}
如本示例中所述,抛出异常是因为 valSrcScore
与存储在文档 "number" 字段中的实际值有很大差异。
但是当我将索引虚拟文档的数量减少到 2500 个时,它按预期工作并且我得到的值与 "number" 字段中的值相差 0。
我在这里做错了什么?
你运行的是哪个版本的 Lucene?一种可能是,随着索引规模的增长,应该用 LeafReaderContext 替换 AtomicReaderContext。这只是一个猜测。
如果我对 Lucene Java 文档的理解正确,将 CustomScoreQuery 实例设置为 strict(严格)模式后,FunctionQuery 的 FieldSource 值应当不经任何修改(例如归一化)地传递给 CustomScoreProvider 的方法 public float customScore(int doc, float subQueryScore, float valSrcScore) 中的 valSrcScore 参数。
因此,我原以为我会得到存储在文档 FloatField 字段中的精确浮点值。
但当索引数据量变大时,情况似乎并非如此。这里我有一个最小的例子来说明我的意思:
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queries.*;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queries.function.valuesource.FloatFieldSource;
import org.apache.lucene.search.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import java.io.IOException;
public class CustomScoreTest {
public static void main(String[] args) throws IOException {
RAMDirectory index = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, new StandardAnalyzer());
IndexWriter writer = new IndexWriter(index, config);
// prepare dummy text
String text = "";
for (int i = 0; i < 1000; i++) text += "abc ";
// add dummy docs
for (int i = 0; i <25000; i++) {
Document doc = new Document();
doc.add(new FloatField("number", i * 100f, Field.Store.YES));
doc.add(new TextField("text", text, Field.Store.YES));
writer.addDocument(doc);
}
writer.close();
IndexReader reader = IndexReader.open(index);
IndexSearcher searcher = new IndexSearcher(reader);
Query q1 = new TermQuery(new Term("text", "abc"));
CustomScoreQuery q2 = new CustomScoreQuery(q1, new FunctionQuery(new FloatFieldSource("number"))) {
protected CustomScoreProvider getCustomScoreProvider(AtomicReaderContext ctx) throws IOException {
return new CustomScoreProvider(ctx) {
public float customScore(int doc, float subQueryScore, float valSrcScore) throws IOException {
float diff = Math.abs(valSrcScore - searcher.doc(doc).getField("number").numericValue().floatValue());
if (diff > 0) throw new IllegalStateException("diff: " + diff);
return super.customScore(doc, subQueryScore, valSrcScore);
}
};
}
};
// In strict custom scoring, the part does not participate in weight normalization.
// This may be useful when one wants full control over how scores are modified, and
// does not care about normalising by the part
q2.setStrict(true);
// Exception in thread "main" java.lang.IllegalStateException: diff: 1490700.0
searcher.search(q2, 10);
}
}
如本示例中所述,抛出异常是因为 valSrcScore
与存储在文档 "number" 字段中的实际值有很大差异。
但是当我将索引虚拟文档的数量减少到 2500 个时,它按预期工作并且我得到的值与 "number" 字段中的值相差 0。
我在这里做错了什么?
你运行的是哪个版本的 Lucene?一种可能是,随着索引规模的增长,应该用 LeafReaderContext 替换 AtomicReaderContext。这只是一个猜测。