在 Apache Solr 聚类结果中检索文档 ID
Retrieving document IDs in Apache Solr clustering results
我正在尝试使用 Solr 6 附带的 Lingo 聚类算法对 Solr 搜索结果进行聚类。聚类本身可以正常工作,但我需要它在返回聚类结果的同时一并返回文档 ID(这里的 ID 字段名为 P_ID)。我尝试了很久都没有成功,非常感谢任何帮助。
这是 solrconfig.xml 文件
<lib dir="${solr.install.dir:../../..}/contrib/clustering/lib/" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../..}/dist/" regex="solr-clustering-\d.*\.jar" />
<requestHandler name="/clustering"
startup="lazy"
enable="${solr.clustering.enabled:true}"
class="solr.SearchHandler">
<lst name="defaults">
<bool name="clustering">true</bool>
<bool name="clustering.results">true</bool>
<bool name="carrot.produceSummary">true</bool>
<!-- Logical field to physical field mapping. -->
<str name="carrot.url">P_ID</str>
<str name="carrot.title">input</str>
<str name="carrot.snippet">input</str>
<!-- Configure any other request handler parameters. We will cluster the
top 100 search results so bump up the 'rows' parameter. -->
<str name="defType">edismax</str>
<str name="qf">
input^1.4
</str>
<str name="q.alt">*:*</str>
<str name="rows">100</str>
<str name="fl">*</str>
</lst>
<arr name="last-components">
<str>clustering</str>
</arr>
</requestHandler>
这是我得到的结果:
{
"responseHeader":{
"status":0,
"QTime":24},
"response":{"numFound":16,"start":0,"docs":[
{
"date":"2016-09-18 13:50:07.0",
"input":"Text",
"type":"q",
"U_ID":2,
"P_ID":1,
"_version_":1548945773383647232},
{
"date":"2016-09-18 13:53:09.0",
"input":"Text 2",
"type":"q",
"U_ID":10,
"P_ID":2,
"_version_":1548945773385744384},
{
"date":"2016-09-18 14:20:29.0",
"input":"Text 3",
"type":"q",
"U_ID":12,
"P_ID":3,
"_version_":1548945773385744385},
{
"date":"2016-09-18 13:50:07.0",
"input":"Text 4",
"type":"q",
"U_ID":3,
"P_ID":4,
"_version_":1548945773385744386},
]
},
"clusters":[{
"labels":["label 1"],
"score":6.723284893605449,
"docs":["text ",
"Text 2",
"Text 4"
]},
{
"labels":["lable 2"],
"score":10.22078770519469,
"docs":["text 3",
"Text 2"
]},
{
"labels":["label 3"],
"score":8.32470981979922,
"docs":["text 1",
"text 3"
]},
]}
如您所见,"clusters" 部分给出了各个聚类及其包含的文档,但没有给出文档 ID。我甚至尝试过将 fl 参数改为 P_ID(文档 ID),但也没有用:P_ID 的值只出现在 "response" 部分,而不会出现在 "clusters" 部分。
好的,我找到了一个满足需求的解决方案:只需在 schema 中把文档 ID 字段设置为 <uniqueKey></uniqueKey> 元素的值即可,例如 <uniqueKey>P_ID</uniqueKey>。
我正在尝试使用 Solr 6 附带的 Lingo 聚类算法对 Solr 搜索结果进行聚类。聚类本身可以正常工作,但我需要它在返回聚类结果的同时一并返回文档 ID(这里的 ID 字段名为 P_ID)。我尝试了很久都没有成功,非常感谢任何帮助。
这是 solrconfig.xml 文件
<lib dir="${solr.install.dir:../../..}/contrib/clustering/lib/" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../..}/dist/" regex="solr-clustering-\d.*\.jar" />
<requestHandler name="/clustering"
startup="lazy"
enable="${solr.clustering.enabled:true}"
class="solr.SearchHandler">
<lst name="defaults">
<bool name="clustering">true</bool>
<bool name="clustering.results">true</bool>
<bool name="carrot.produceSummary">true</bool>
<!-- Logical field to physical field mapping. -->
<str name="carrot.url">P_ID</str>
<str name="carrot.title">input</str>
<str name="carrot.snippet">input</str>
<!-- Configure any other request handler parameters. We will cluster the
top 100 search results so bump up the 'rows' parameter. -->
<str name="defType">edismax</str>
<str name="qf">
input^1.4
</str>
<str name="q.alt">*:*</str>
<str name="rows">100</str>
<str name="fl">*</str>
</lst>
<arr name="last-components">
<str>clustering</str>
</arr>
</requestHandler>
这是我得到的结果:
{
"responseHeader":{
"status":0,
"QTime":24},
"response":{"numFound":16,"start":0,"docs":[
{
"date":"2016-09-18 13:50:07.0",
"input":"Text",
"type":"q",
"U_ID":2,
"P_ID":1,
"_version_":1548945773383647232},
{
"date":"2016-09-18 13:53:09.0",
"input":"Text 2",
"type":"q",
"U_ID":10,
"P_ID":2,
"_version_":1548945773385744384},
{
"date":"2016-09-18 14:20:29.0",
"input":"Text 3",
"type":"q",
"U_ID":12,
"P_ID":3,
"_version_":1548945773385744385},
{
"date":"2016-09-18 13:50:07.0",
"input":"Text 4",
"type":"q",
"U_ID":3,
"P_ID":4,
"_version_":1548945773385744386},
]
},
"clusters":[{
"labels":["label 1"],
"score":6.723284893605449,
"docs":["text ",
"Text 2",
"Text 4"
]},
{
"labels":["lable 2"],
"score":10.22078770519469,
"docs":["text 3",
"Text 2"
]},
{
"labels":["label 3"],
"score":8.32470981979922,
"docs":["text 1",
"text 3"
]},
]}
如您所见,"clusters" 部分给出了各个聚类及其包含的文档,但没有给出文档 ID。我甚至尝试过将 fl 参数改为 P_ID(文档 ID),但也没有用:P_ID 的值只出现在 "response" 部分,而不会出现在 "clusters" 部分。
好的,我找到了一个满足需求的解决方案:只需在 schema 中把文档 ID 字段设置为 <uniqueKey></uniqueKey> 元素的值即可,例如 <uniqueKey>P_ID</uniqueKey>。