如何使用摄取附件插件和 JavaScript 客户端在 Elasticsearch 6.1 中索引 PDF?
How to index a PDF in Elasticsearch 6.1 with ingest-attachment plugin & JavaScript Client?
我尝试按照以下问题的答案中的说明进行操作:
我找不到 ElasticSearch 的 JavaScript 客户端的很多示例,所以这是我所拥有的:
创建索引
// Elasticsearch client pointed at the local node.
var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client({
  hosts: ['http://localhost:9200/']
});

// Seed the 'pdfs' index. Note that client.create() submits a single document
// (type 'pdf', id 'my-index-id') rather than calling an index-creation API;
// the outcome is only logged.
client
  .create({
    index: 'pdfs',
    type: 'pdf',
    id: 'my-index-id',
    body: {description: 'Test pdf indexing'}
  })
  .then(() => {
    console.log("Index created");
  })
  .catch((createError) => {
    console.log(createError);
  });
在节点中定义索引映射:
// Mapping for the 'pdf' type, assembled field by field. Keys are added in the
// order title, type, "attachment.pdf".
var body = {pdf: {properties: {}}};
body.pdf.properties.title = {"type": "keyword", "index": "false"};
body.pdf.properties.type = {"type": "keyword", "index": "false"};
body.pdf.properties["attachment.pdf"] = {"type": "keyword"};

// Apply the mapping to the 'pdfs' index. The ingest pipeline is registered
// only after this request succeeds; any failure is logged, not rethrown.
client.indices
  .putMapping({index: "pdfs", type: "pdf", body: body})
  .then(() => {
    addPipeline();
  })
  .catch((mappingError) => {
    console.log("putMapping error: " + mappingError);
  });
使用 Put Pipeline API 在集群中定义摄取管道:
/**
 * Registers the ingest pipeline 'my-pipeline-id' through the module-level
 * Elasticsearch `client`.
 *
 * The pipeline runs the `attachment` processor on the "pdf" field and then
 * removes that field from the document.
 *
 * @returns {Promise<void>} Settles once the request has finished. Failures are
 *   logged and swallowed, so the promise always fulfills; callers can await it
 *   to make sure the request has completed before indexing documents with
 *   `?pipeline=my-pipeline-id`.
 */
function addPipeline() {
  // Return the chain so callers can sequence work after the pipeline request.
  return client.ingest.putPipeline({
    id: 'my-pipeline-id',
    body: {
      "description": "parse pdfs and index into ES",
      "processors": [
        // Per the attachment processor docs, indexed_chars = -1 removes the
        // limit on how many characters are extracted.
        { "attachment": { "field": "pdf", "indexed_chars": -1 } },
        // Drop the raw "pdf" field once the attachment processor has run.
        { "remove": { "field": "pdf" } }
      ]
    }
  })
    .then(function () {
      console.log("putPipeline Resolved");
    })
    .catch(function (error) {
      console.log("putPipeline error: " + error);
    });
}
在尝试上传 PDF 之前,我检查了索引是否已创建:
curl -XGET 'localhost:9200/_cat/indices?v&pretty'
结果:
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
yellow open .kibana EaUbEQCETVKQbYThrhPGaA 1 1 1 0 3.6kb 3.6kb
yellow open pdfs Z2SR-ApFR9SYsvY08tgSZw 5 1 1 0 4.6kb 4.6kb
当我尝试使用以下命令索引 PDF 时,出现错误。
curl -H 'Content-Type: application/pdf' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d'
{
"pdf": @/Users/user/path/to/pdf/file.pdf
}'
错误:
{"error":"Content-Type header [application/pdf] is not supported","status":406}
这是因为我的 PDF 不是 Base64 编码还是我做错了什么?我正在尝试创建一个数字图书馆来搜索 PDF。
更新:
我用以下代码对我的 pdf 进行了编码:
openssl base64 -in /Users/user/path/to/pdf/file.pdf -out base64_encoded_file
重新创建了我的索引,运行 base64_encoded_file 上的以下命令:
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d @/base64_encoded_file
我收到以下错误:
Warning: Couldn't read data from file "/base64_encoded_file", this makes an empty POST.
{"error":{"root_cause":[{"type":"parse_exception","reason":"request body is required"}],"type":"parse_exception","reason":"request body is required"},"status":400}
我尝试将文件添加为正文:
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
{
"pdf" : @/base64_encoded_file
}'
错误:
{"error":{"root_cause":[{"type":"parse_exception","reason":"Failed to parse content to map"}],"type":"parse_exception","reason":"Failed to parse content to map","caused_by":{"type":"json_parse_exception","reason":"Unexpected character ('@' (code 64)): expected a valid value (number, String, array, object, 'true', 'false' or 'null')\n at [Source: org.elasticsearch.transport.netty4.ByteBufStreamInput@6db5a3dc; line: 3, column: 16]"}},"status":400}
求助!
我找到了问题的答案:
Elasticsearch 不会从源中获取数据,因此,
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
{
"pdf" : @/base64_encoded_file
}'
行不通。attachment 处理器选项中的 "field"(在我的示例中为 "pdf")所指向的内容必须是数据本身,而不是文件路径。这个讨论帖解释了将 [pdf] 内容发送到 elasticsearch 的三个选项:
- 您可以 [从 pdf 中] 提取内容,然后将您想要索引的内容发送到 elasticsearch。
- 您可以将二进制 BASE64 发送到 elasticsearch ingest 进行提取
- 您可以将二进制文件发送到 FSCrawler,它会在发送到 elasticsearch 之前进行提取。
简而言之,传递给 Elasticsearch 的数据必须符合官方文档(documentation)中定义的格式:
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
{
"pdf" : "base64_encoded_data"
}'
我尝试按照以下问题的答案中的说明进行操作:
我找不到 ElasticSearch 的 JavaScript 客户端的很多示例,所以这是我所拥有的:
创建索引
// Elasticsearch client pointed at the local node.
var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client({
  hosts: ['http://localhost:9200/']
});

// Seed the 'pdfs' index. Note that client.create() submits a single document
// (type 'pdf', id 'my-index-id') rather than calling an index-creation API;
// the outcome is only logged.
client
  .create({
    index: 'pdfs',
    type: 'pdf',
    id: 'my-index-id',
    body: {description: 'Test pdf indexing'}
  })
  .then(() => {
    console.log("Index created");
  })
  .catch((createError) => {
    console.log(createError);
  });
在节点中定义索引映射:
// Mapping for the 'pdf' type, assembled field by field. Keys are added in the
// order title, type, "attachment.pdf".
var body = {pdf: {properties: {}}};
body.pdf.properties.title = {"type": "keyword", "index": "false"};
body.pdf.properties.type = {"type": "keyword", "index": "false"};
body.pdf.properties["attachment.pdf"] = {"type": "keyword"};

// Apply the mapping to the 'pdfs' index. The ingest pipeline is registered
// only after this request succeeds; any failure is logged, not rethrown.
client.indices
  .putMapping({index: "pdfs", type: "pdf", body: body})
  .then(() => {
    addPipeline();
  })
  .catch((mappingError) => {
    console.log("putMapping error: " + mappingError);
  });
使用 Put Pipeline API 在集群中定义摄取管道:
/**
 * Registers the ingest pipeline 'my-pipeline-id' through the module-level
 * Elasticsearch `client`.
 *
 * The pipeline runs the `attachment` processor on the "pdf" field and then
 * removes that field from the document.
 *
 * @returns {Promise<void>} Settles once the request has finished. Failures are
 *   logged and swallowed, so the promise always fulfills; callers can await it
 *   to make sure the request has completed before indexing documents with
 *   `?pipeline=my-pipeline-id`.
 */
function addPipeline() {
  // Return the chain so callers can sequence work after the pipeline request.
  return client.ingest.putPipeline({
    id: 'my-pipeline-id',
    body: {
      "description": "parse pdfs and index into ES",
      "processors": [
        // Per the attachment processor docs, indexed_chars = -1 removes the
        // limit on how many characters are extracted.
        { "attachment": { "field": "pdf", "indexed_chars": -1 } },
        // Drop the raw "pdf" field once the attachment processor has run.
        { "remove": { "field": "pdf" } }
      ]
    }
  })
    .then(function () {
      console.log("putPipeline Resolved");
    })
    .catch(function (error) {
      console.log("putPipeline error: " + error);
    });
}
在尝试上传 PDF 之前,我检查了索引是否已创建:
curl -XGET 'localhost:9200/_cat/indices?v&pretty'
结果:
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
yellow open .kibana EaUbEQCETVKQbYThrhPGaA 1 1 1 0 3.6kb 3.6kb
yellow open pdfs Z2SR-ApFR9SYsvY08tgSZw 5 1 1 0 4.6kb 4.6kb
当我尝试使用以下命令索引 PDF 时,出现错误。
curl -H 'Content-Type: application/pdf' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d'
{
"pdf": @/Users/user/path/to/pdf/file.pdf
}'
错误:
{"error":"Content-Type header [application/pdf] is not supported","status":406}
这是因为我的 PDF 不是 Base64 编码还是我做错了什么?我正在尝试创建一个数字图书馆来搜索 PDF。
更新:
我用以下代码对我的 pdf 进行了编码:
openssl base64 -in /Users/user/path/to/pdf/file.pdf -out base64_encoded_file
重新创建了我的索引,运行 base64_encoded_file 上的以下命令:
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d @/base64_encoded_file
我收到以下错误:
Warning: Couldn't read data from file "/base64_encoded_file", this makes an empty POST.
{"error":{"root_cause":[{"type":"parse_exception","reason":"request body is required"}],"type":"parse_exception","reason":"request body is required"},"status":400}
我尝试将文件添加为正文:
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
{
"pdf" : @/base64_encoded_file
}'
错误:
{"error":{"root_cause":[{"type":"parse_exception","reason":"Failed to parse content to map"}],"type":"parse_exception","reason":"Failed to parse content to map","caused_by":{"type":"json_parse_exception","reason":"Unexpected character ('@' (code 64)): expected a valid value (number, String, array, object, 'true', 'false' or 'null')\n at [Source: org.elasticsearch.transport.netty4.ByteBufStreamInput@6db5a3dc; line: 3, column: 16]"}},"status":400}
求助!
我找到了问题的答案:
Elasticsearch 不会从源中获取数据,因此,
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
{
"pdf" : @/base64_encoded_file
}'
行不通。attachment 处理器选项中的 "field"(在我的示例中为 "pdf")所指向的内容必须是数据本身,而不是文件路径。这个讨论帖解释了将 [pdf] 内容发送到 elasticsearch 的三个选项:
- 您可以 [从 pdf 中] 提取内容,然后将您想要索引的内容发送到 elasticsearch。
- 您可以将二进制 BASE64 发送到 elasticsearch ingest 进行提取
- 您可以将二进制文件发送到 FSCrawler,它会在发送到 elasticsearch 之前进行提取。
简而言之,传递给 Elasticsearch 的数据必须符合官方文档(documentation)中定义的格式:
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
{
"pdf" : "base64_encoded_data"
}'