Apache Solr 在增量导入后仍然保留旧数据
Apache Solr still keeps old data after delta import
我正在使用 Solr 7.6。
我从 MySQL 做了一次全量导入,customer 表的结构如下:
customer_id pk int
customer_code varchar
name varchar
update_datetime timestamp
我修改了一条记录,从
customer_id customer_code name
46027 C1 zxc
到
customer_id customer_code name
46027 C1 789
然后我使用如下的 data-config 进行增量导入:
<dataConfig>
  <!-- JDBC connection to the MySQL database "test" on localhost:3306. -->
  <dataSource type="JdbcDataSource"
              driver="com.mysql.cj.jdbc.Driver"
              url="jdbc:mysql://localhost:3306/test"
              user="test"
              password="123456"/>
  <document>
    <!-- "customer" entity, keyed by customer_id.
         query:            selects the rows for a full import.
         deltaQuery:       selects the customer_id of every row whose
                           update_datetime is later than the last index time.
         deltaImportQuery: re-reads one row per customer_id reported by
                           deltaQuery. -->
    <entity name="customer"
            pk="customer_id"
            query="select customer_id, customer_code, name from customer"
            deltaImportQuery="select customer_id, customer_code, name from customer where customer_id='${dih.delta.customer_id}'"
            deltaQuery="select customer_id from customer where update_datetime > '${dih.last_index_time}'">
    </entity>
  </document>
</dataConfig>
增量导入成功,Solr 可以通过查询 name:789 返回新结果。
但是,当我用 name:zxc 查询旧数据时,它仍然会返回旧数据:
{
"responseHeader":{
"status":0,
"QTime":0,
"params":{
"q":"name",
"_":"1547619027918"}},
"response":{"numFound":1,"start":0,"docs":[
{
"customer_id":46027,
"name":"zxc",
"id":"dd513a36-dfff-4ed2-a4fe-f728f42adfce",
"_version_":1622797739301535744}]
}}
为什么?如果这条记录已经更新,我如何让 solr 删除旧数据?
customer_id 是 MySQL 中类型为 int 的主键。
我在 Solr 的 schema
中添加了 customer_id
和 name
,并将 customer_id
设置为 pint
。
下面的截图是 Solr 的 Schema 页面,其中显示 unique key field 是 id。
---------------- 更新 ------------------ -
managed-schema.xml
是:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Solr managed schema - automatically generated - DO NOT EDIT -->
<schema name="default-config" version="1.6">
  <!-- Documents are keyed by the "id" field declared further down. -->
  <uniqueKey>id</uniqueKey>

  <fieldType name="ancestor_path" class="solr.TextField">
    <analyzer type="index">
      <tokenizer class="solr.KeywordTokenizerFactory"/>
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/>
    </analyzer>
  </fieldType>
  <fieldType name="binary" class="solr.BinaryField"/>
  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
  <fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
  <!-- Omitted from this excerpt: delimited_payloads_float, delimited_payloads_int,
       delimited_payloads_string, descendent_path, location, location_rpt,
       lowercase -->
  <!-- Omitted from this excerpt: entries whose names start with p, e.g. pdate -->
  <fieldType name="random" class="solr.RandomSortField" indexed="true"/>
  <fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true"/>
  <fieldType name="strings" class="solr.StrField" sortMissingLast="true" docValues="true" multiValued="true"/>
  <!-- Omitted from this excerpt: entries whose names start with text_ -->

  <!-- Fields added for the imported customer rows. -->
  <field name="customer_id" type="pint" uninvertible="true" indexed="true" stored="true"/>
  <field name="name" type="text_en" uninvertible="true" indexed="true" stored="true"/>

  <field name="_root_" type="string" docValues="false" indexed="true" stored="false"/>
  <field name="_text_" type="text_general" multiValued="true" indexed="true" stored="false"/>
  <field name="_version_" type="plong" indexed="false" stored="false"/>
  <field name="id" type="string" multiValued="false" indexed="true" required="true" stored="true"/>
  <!-- Omitted from this excerpt: default dynamic fields -->
</schema>
由于您没有为 id 字段提供值,Solr 会为每个文档自动生成一个唯一值,因此更新后的记录会被当作新文档写入,而不是覆盖旧文档。您必须提供一个能真正唯一标识所提交文档的 id,或者修改 uniqueKey 定义——我建议采用前一种做法,因为以后可以根据需要轻松更改。
如果 customer_id 能唯一标识文档,请在 SQL SELECT 语句中加入 customer_id AS id
(query 和 deltaImportQuery 都需要这样修改,否则增量导入时仍会生成新的 id):
SELECT customer_id AS id, customer_id, customer_code, name FROM customer
我正在使用 Solr 7.6。
我从 MySQL 做了一次全量导入,customer 表的结构如下:
customer_id pk int
customer_code varchar
name varchar
update_datetime timestamp
我修改了一条记录,从
customer_id customer_code name
46027 C1 zxc
到
customer_id customer_code name
46027 C1 789
然后我使用如下的 data-config 进行增量导入:
<dataConfig>
  <!-- JDBC connection to the MySQL database "test" on localhost:3306. -->
  <dataSource type="JdbcDataSource"
              driver="com.mysql.cj.jdbc.Driver"
              url="jdbc:mysql://localhost:3306/test"
              user="test"
              password="123456"/>
  <document>
    <!-- "customer" entity, keyed by customer_id.
         query:            selects the rows for a full import.
         deltaQuery:       selects the customer_id of every row whose
                           update_datetime is later than the last index time.
         deltaImportQuery: re-reads one row per customer_id reported by
                           deltaQuery. -->
    <entity name="customer"
            pk="customer_id"
            query="select customer_id, customer_code, name from customer"
            deltaImportQuery="select customer_id, customer_code, name from customer where customer_id='${dih.delta.customer_id}'"
            deltaQuery="select customer_id from customer where update_datetime > '${dih.last_index_time}'">
    </entity>
  </document>
</dataConfig>
增量导入成功,Solr 可以通过查询 name:789 返回新结果。
但是,当我用 name:zxc 查询旧数据时,它仍然会返回旧数据:
{
"responseHeader":{
"status":0,
"QTime":0,
"params":{
"q":"name",
"_":"1547619027918"}},
"response":{"numFound":1,"start":0,"docs":[
{
"customer_id":46027,
"name":"zxc",
"id":"dd513a36-dfff-4ed2-a4fe-f728f42adfce",
"_version_":1622797739301535744}]
}}
为什么?如果这条记录已经更新,我如何让 solr 删除旧数据?
customer_id 是 MySQL 中类型为 int 的主键。
我在 Solr 的 schema
中添加了 customer_id
和 name
,并将 customer_id
设置为 pint
。
下面的截图是 Solr 的 Schema 页面,其中显示 unique key field 是 id。
---------------- 更新 ------------------ -
managed-schema.xml
是:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Solr managed schema - automatically generated - DO NOT EDIT -->
<schema name="default-config" version="1.6">
  <!-- Documents are keyed by the "id" field declared further down. -->
  <uniqueKey>id</uniqueKey>

  <fieldType name="ancestor_path" class="solr.TextField">
    <analyzer type="index">
      <tokenizer class="solr.KeywordTokenizerFactory"/>
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/>
    </analyzer>
  </fieldType>
  <fieldType name="binary" class="solr.BinaryField"/>
  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
  <fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
  <!-- Omitted from this excerpt: delimited_payloads_float, delimited_payloads_int,
       delimited_payloads_string, descendent_path, location, location_rpt,
       lowercase -->
  <!-- Omitted from this excerpt: entries whose names start with p, e.g. pdate -->
  <fieldType name="random" class="solr.RandomSortField" indexed="true"/>
  <fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true"/>
  <fieldType name="strings" class="solr.StrField" sortMissingLast="true" docValues="true" multiValued="true"/>
  <!-- Omitted from this excerpt: entries whose names start with text_ -->

  <!-- Fields added for the imported customer rows. -->
  <field name="customer_id" type="pint" uninvertible="true" indexed="true" stored="true"/>
  <field name="name" type="text_en" uninvertible="true" indexed="true" stored="true"/>

  <field name="_root_" type="string" docValues="false" indexed="true" stored="false"/>
  <field name="_text_" type="text_general" multiValued="true" indexed="true" stored="false"/>
  <field name="_version_" type="plong" indexed="false" stored="false"/>
  <field name="id" type="string" multiValued="false" indexed="true" required="true" stored="true"/>
  <!-- Omitted from this excerpt: default dynamic fields -->
</schema>
由于您没有为 id 字段提供值,Solr 会为每个文档自动生成一个唯一值,因此更新后的记录会被当作新文档写入,而不是覆盖旧文档。您必须提供一个能真正唯一标识所提交文档的 id,或者修改 uniqueKey 定义——我建议采用前一种做法,因为以后可以根据需要轻松更改。
如果 customer_id 能唯一标识文档,请在 SQL SELECT 语句中加入 customer_id AS id
(query 和 deltaImportQuery 都需要这样修改,否则增量导入时仍会生成新的 id):
SELECT customer_id AS id, customer_id, customer_code, name FROM customer