Customization with PostingsSolrHighlighter
I am using the PostingsSolrHighlighter with its default parameters, and it seems to give good results. My use case is searching a passage of text against full documents and getting the relevant snippets back, and this highlighter does return the closest-matching passages from the matching documents.
The customization I need is to drop junk snippets that appear unrelated to the searched passage. At this point I would like a simple approach: discard any snippet whose relative score is 0.5 or lower.
However, I cannot find a way to get the scores Solr computes for the sentences/snippets, so that I can throw away the less desirable ones.
Can I tell Solr to keep a snippet only if its score is above some threshold, or can it somehow give me the snippet scores?
I would still like to keep the resulting snippets in order (by each snippet's start position in the original document).
Thanks.
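For context, the kind of highlighting request involved can be issued through SolrJ roughly as follows. This is a minimal sketch under assumptions (a 5.x-era SolrJ client, a placeholder core URL, and a placeholder field name content), not the actual configuration behind the question; for the postings highlighter the highlighted field also needs storeOffsetsWithPositions="true" in the schema.

import java.io.IOException;
import java.util.List;
import java.util.Map;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

public class HighlightQuery {

    public static void main(String[] args) throws SolrServerException, IOException {
        // Placeholder core URL and field name -- adjust to the actual core/schema.
        HttpSolrClient solr = new HttpSolrClient("http://localhost:8983/solr/mycore");

        SolrQuery query = new SolrQuery("text of the searched passage");
        query.setHighlight(true);
        query.addHighlightField("content");
        query.setHighlightSnippets(10);

        QueryResponse response = solr.query(query);

        // Highlighting results come back as: docId -> (field -> list of snippets)
        Map<String, Map<String, List<String>>> highlighting = response.getHighlighting();
        for (Map.Entry<String, Map<String, List<String>>> doc : highlighting.entrySet()) {
            System.out.println(doc.getKey() + " -> " + doc.getValue().get("content"));
        }

        solr.close();
    }
}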
OK, I could not find any way to ask Solr to return snippets with their scores or to drop snippets scoring below a given number, but I extended the highlighter and the formatter, and I now get the snippets along with their scores (both the absolute score Solr computes and a relative score). In case anyone needs it, here is the extension code:
package solrExtension;

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.search.postingshighlight.DefaultPassageFormatter;
import org.apache.lucene.search.postingshighlight.Passage;
import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.highlight.PostingsSolrHighlighter;
import org.apache.solr.request.SolrQueryRequest;

public class CustomPostingsSolrHighlighter extends PostingsSolrHighlighter {

    protected PostingsHighlighter getHighlighter(SolrQueryRequest req) {
        return new customSolrExtendedPostingsHighlighter(req);
    }

    public class customSolrExtendedPostingsHighlighter extends PostingsSolrHighlighter.SolrExtendedPostingsHighlighter {

        public customSolrExtendedPostingsHighlighter(SolrQueryRequest req) {
            super(req);
        }

        // Same parameter handling as the stock highlighter, but plug in the
        // score-emitting formatter defined below.
        @Override
        protected PassageFormatter getFormatter(String fieldName) {
            String preTag = params.getFieldParam(fieldName, HighlightParams.TAG_PRE, "<em>");
            String postTag = params.getFieldParam(fieldName, HighlightParams.TAG_POST, "</em>");
            String ellipsis = params.getFieldParam(fieldName, HighlightParams.TAG_ELLIPSIS, "... ");
            String encoder = params.getFieldParam(fieldName, HighlightParams.ENCODER, "simple");
            return new CustomPassageFormatter(preTag, postTag, ellipsis, "html".equals(encoder));
        }
    }

    public class CustomPassageFormatter extends DefaultPassageFormatter {

        public CustomPassageFormatter() {
            super();
        }

        public CustomPassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
            super(preTag, postTag, ellipsis, escape);
        }

        @Override
        public String format(Passage passages[], String content) {
            if (passages.length == 0) {
                return ""; // nothing to format; also avoids dividing by a zero sentence count below
            }
            StringBuilder sb = new StringBuilder();
            int pos = 0;
            double psgTtlScore = 0, maxScore = 0, score;
            int psgCounts = 0;
            List<CustomPsg> psgGroups = new ArrayList<CustomPsg>();
            for (Passage passage : passages) {
                // A gap between this passage and the previous one means the previous
                // group of connected sentences is complete: record its average score
                // and start a new group (the stock formatter would append an ellipsis here).
                if (passage.getStartOffset() > pos && pos > 0) {
                    score = psgTtlScore / psgCounts;
                    if (score > maxScore) {
                        maxScore = score;
                    }
                    sb.append("[[").append(score).append("]]");
                    psgGroups.add(new CustomPsg(sb.toString(), score));
                    psgTtlScore = 0;
                    psgCounts = 0;
                    sb = new StringBuilder();
                }
                psgTtlScore += passage.getScore();
                psgCounts++;
                pos = passage.getStartOffset();
                for (int i = 0; i < passage.getNumMatches(); i++) {
                    int start = passage.getMatchStarts()[i];
                    int end = passage.getMatchEnds()[i];
                    // it's possible to have overlapping terms
                    if (start > pos) {
                        append(sb, content, pos, start);
                    }
                    if (end > pos) {
                        sb.append(preTag);
                        append(sb, content, Math.max(pos, start), end);
                        sb.append(postTag);
                        pos = end;
                    }
                }
                // Emit the absolute score of this individual sentence/passage in [ ].
                sb.append("[").append(passage.getScore()).append("]");
                // it's possible a "term" from the analyzer could span a sentence boundary
                append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
                pos = passage.getEndOffset();
            }
            // Close the last group; include it when tracking the maximum average score,
            // otherwise relative scores could exceed 1.0 when the last group scores highest.
            score = psgTtlScore / psgCounts;
            if (score > maxScore) {
                maxScore = score;
            }
            sb.append("[[").append(score).append("]]");
            psgGroups.add(new CustomPsg(sb.toString(), score));
            // Rebuild the output: each group carries its average score in [[ ]] and its
            // score relative to the best group in {{ }}, followed by the ellipsis.
            sb = new StringBuilder();
            for (CustomPsg psg : psgGroups) {
                sb.append(psg.psg).append("{{").append(psg.score / maxScore).append("}}").append(ellipsis);
            }
            return sb.toString();
        }

        // A snippet group (one or more connected sentences) plus its average score.
        private class CustomPsg {
            public String psg;
            public double score;

            public CustomPsg(String psg, double score) {
                this.psg = psg;
                this.score = score;
            }
        }
    }
}
With this extension code, I get the snippets in the search results together with their scores, like this:
.....sions and other areas subject to its jurisdiction[22.847776]. [[22.847776]]{{0.954086}}
..... and the Trustee[19.287382]. [[19.287382]]{{0.805409}}
The number in [ ] is the score of an individual sentence, the number in [[ ]] is the average score of the whole snippet (the sum of all its sentences' scores divided by the number of sentences in the snippet; a snippet may contain more than one sentence), and the number in {{ }} is the snippet's relative score (its average score relative to the other snippets).
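Given that marker format, snippets with a relative score of 0.5 or lower can then be dropped on the client side while preserving document order, since the formatter already emits the groups in their original order. The following is a minimal sketch that parses the {{ }} markers; the SnippetFilter class, the filterSnippets helper, and the third, low-scoring sample snippet are illustrative assumptions, not part of the code above.

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SnippetFilter {

    // Each snippet group ends with its relative score in {{ }}, optionally followed by the ellipsis.
    private static final Pattern SNIPPET =
            Pattern.compile("(.*?)\\{\\{([0-9.Ee+-]+)\\}\\}(?:\\.\\.\\. ?)?", Pattern.DOTALL);

    /** Keeps only snippets whose relative score is above the threshold, in original document order. */
    public static List<String> filterSnippets(String formatted, double minRelativeScore) {
        List<String> kept = new ArrayList<String>();
        Matcher m = SNIPPET.matcher(formatted);
        while (m.find()) {
            double relative = Double.parseDouble(m.group(2));
            if (relative > minRelativeScore) {
                kept.add(m.group(1).trim());
            }
        }
        return kept;
    }

    public static void main(String[] args) {
        // The first two snippets reuse the sample output above; the third is a made-up
        // low-scoring snippet to show the filtering.
        String highlighted =
              "...sions and other areas subject to its jurisdiction[22.847776]. [[22.847776]]{{0.954086}}... "
            + "... and the Trustee[19.287382]. [[19.287382]]{{0.805409}}... "
            + "some unrelated sentence[1.102]. [[1.102]]{{0.046}}... ";
        // Drop everything at or below a relative score of 0.5.
        for (String snippet : filterSnippets(highlighted, 0.5)) {
            System.out.println(snippet);
        }
    }
}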