将表格从 PDF 提取到 Azure 搜索
Extract tables from PDF to Azure search
我正在尝试将 PDF 内容编入 Azure 搜索索引。为此,我使用了 Azure Search Power Skills GitHub 存储库中 AnalyzeForm 和 ExtractTables 这两个项目的函数应用。
AnalyzeForm 具有字段映射,我可以使用它来将输出字段映射到相应的 Azure 搜索索引字段。
但我还想提取表格内容,为此我必须使用 ExtractTables API。该 API 返回如下形式的表格记录:
{
"values": [
{
"recordId": "record1",
"data": {
"tables": [
{
"page_number": 1,
"row_count": 3,
"column_count": 4,
"cells": [
{
"text": "Item",
"rowIndex": 0,
"colIndex": 0,
"confidence": 1.0,
"is_header": false
},
{
"text": "Quantity",
"rowIndex": 0,
"colIndex": 1,
"confidence": 1.0,
"is_header": false
},
{
"text": "Rate",
"rowIndex": 0,
"colIndex": 2,
"confidence": 1.0,
"is_header": false
},
{
"text": "Amount",
"rowIndex": 0,
"colIndex": 3,
"confidence": 1.0,
"is_header": false
},
{
"text": "Iphone 12 (64 GB)",
"rowIndex": 1,
"colIndex": 0,
"confidence": 1.0,
"is_header": false
},
{
"text": "1",
"rowIndex": 1,
"colIndex": 1,
"confidence": 1.0,
"is_header": false
},
{
"text": "0.00",
"rowIndex": 1,
"colIndex": 2,
"confidence": 1.0,
"is_header": false
},
{
"text": "0.00",
"rowIndex": 1,
"colIndex": 3,
"confidence": 1.0,
"is_header": false
},
{
"text": "MI 10 (6 GB)",
"rowIndex": 2,
"colIndex": 0,
"confidence": 1.0,
"is_header": false
},
{
"text": "1",
"rowIndex": 2,
"colIndex": 1,
"confidence": 1.0,
"is_header": false
},
{
"text": "0.00",
"rowIndex": 2,
"colIndex": 2,
"confidence": 1.0,
"is_header": false
},
{
"text": "0.00",
"rowIndex": 2,
"colIndex": 3,
"confidence": 1.0,
"is_header": false
}
]
}
]
}
}
]
}
如何映射从表格中提取的数据,以便将其索引到 Azure 搜索中?有没有办法将其添加为自定义技能?如果可以,索引架构应该是什么样的?
您可以将 Azure 函数用作自定义技能。只需将 ExtractTables 中的字段映射到预期的自定义技能字段:
/// <summary>
/// One output record of the custom skill response: a record identifier, its data
/// payload, and any error or warning messages attached to it.
/// </summary>
private class OutputRecord
{
    /// <summary>Identifier of this record.</summary>
    public string RecordId { get; set; }

    /// <summary>Data payload carried by this record.</summary>
    public OutputRecordData Data { get; set; }

    /// <summary>Error messages for this record; no initializer, so null until assigned.</summary>
    public List<OutputRecordMessage> Errors { get; set; }

    /// <summary>Warning messages for this record; no initializer, so null until assigned.</summary>
    public List<OutputRecordMessage> Warnings { get; set; }

    /// <summary>
    /// Data fields of a record. Every field defaults to the empty string rather than null.
    /// </summary>
    public class OutputRecordData
    {
        public string Name { get; set; } = string.Empty;

        public string Description { get; set; } = string.Empty;

        public string Source { get; set; } = string.Empty;

        public string SourceUrl { get; set; } = string.Empty;

        public string LicenseAttribution { get; set; } = string.Empty;

        public string LicenseUrl { get; set; } = string.Empty;
    }

    /// <summary>A single message entry used by the error and warning lists.</summary>
    public class OutputRecordMessage
    {
        /// <summary>Text of the message.</summary>
        public string Message { get; set; }
    }
}
https://docs.microsoft.com/en-us/azure/search/cognitive-search-create-custom-skill-example
转换技能输出有 3 种方法:使用整形器(Shaper)技能、内联整形,或使用自定义技能。在这种情况下,我同意 @Thiago 的观点,即最好的选择是使用自定义技能,因为整形器技能不允许您仅从表格结果中选取特定列的值。
如果您正在使用 Power Skills,您可以修改该技能,使其返回与您的索引字段定义相对应的响应。假设您的索引字段如下所示:
{
"name": "Items",
"type": "Collection(Edm.ComplexType)",
"analyzer": null,
"synonymMaps": [],
"fields": [
{
"name": "Item_Id",
"type": "Edm.String",
"facetable": false,
"filterable": false,
"key": false,
"retrievable": false,
"searchable": false,
"sortable": false,
"analyzer": null,
"indexAnalyzer": null,
"searchAnalyzer": null,
"synonymMaps": [],
"fields": []
},
{
"name": "Quantity",
"type": "Edm.Int64",
"facetable": false,
"filterable": false,
"retrievable": false,
"sortable": false,
"analyzer": null,
"indexAnalyzer": null,
"searchAnalyzer": null,
"synonymMaps": [],
"fields": []
},
{
"name": "Rate",
"type": "Edm.Double",
"facetable": false,
"filterable": false,
"retrievable": false,
"sortable": false,
"analyzer": null,
"indexAnalyzer": null,
"searchAnalyzer": null,
"synonymMaps": [],
"fields": []
},
{
"name": "Amount",
"type": "Edm.Double",
"facetable": false,
"filterable": false,
"retrievable": false,
"sortable": false,
"analyzer": null,
"indexAnalyzer": null,
"searchAnalyzer": null,
"synonymMaps": [],
"fields": []
}
]
}
您可以修改您的技能,使其返回一个 JSON 对象,例如:
{
"items" : [
{
"item": "Iphone 12",
"quantity": 1,
"price": 1000,
"amount": 1000
}
]
}
我正在尝试将 PDF 内容编入 Azure 搜索索引。为此,我使用了 Azure Search Power Skills GitHub 存储库中 AnalyzeForm 和 ExtractTables 这两个项目的函数应用。
AnalyzeForm 具有字段映射,我可以使用它来将输出字段映射到相应的 Azure 搜索索引字段。但我还想提取表格内容,为此我必须使用 ExtractTables API。该 API 返回如下形式的表格记录:
{
"values": [
{
"recordId": "record1",
"data": {
"tables": [
{
"page_number": 1,
"row_count": 3,
"column_count": 4,
"cells": [
{
"text": "Item",
"rowIndex": 0,
"colIndex": 0,
"confidence": 1.0,
"is_header": false
},
{
"text": "Quantity",
"rowIndex": 0,
"colIndex": 1,
"confidence": 1.0,
"is_header": false
},
{
"text": "Rate",
"rowIndex": 0,
"colIndex": 2,
"confidence": 1.0,
"is_header": false
},
{
"text": "Amount",
"rowIndex": 0,
"colIndex": 3,
"confidence": 1.0,
"is_header": false
},
{
"text": "Iphone 12 (64 GB)",
"rowIndex": 1,
"colIndex": 0,
"confidence": 1.0,
"is_header": false
},
{
"text": "1",
"rowIndex": 1,
"colIndex": 1,
"confidence": 1.0,
"is_header": false
},
{
"text": "0.00",
"rowIndex": 1,
"colIndex": 2,
"confidence": 1.0,
"is_header": false
},
{
"text": "0.00",
"rowIndex": 1,
"colIndex": 3,
"confidence": 1.0,
"is_header": false
},
{
"text": "MI 10 (6 GB)",
"rowIndex": 2,
"colIndex": 0,
"confidence": 1.0,
"is_header": false
},
{
"text": "1",
"rowIndex": 2,
"colIndex": 1,
"confidence": 1.0,
"is_header": false
},
{
"text": "0.00",
"rowIndex": 2,
"colIndex": 2,
"confidence": 1.0,
"is_header": false
},
{
"text": "0.00",
"rowIndex": 2,
"colIndex": 3,
"confidence": 1.0,
"is_header": false
}
]
}
]
}
}
]
}
如何映射从表格中提取的数据,以便将其索引到 Azure 搜索中?有没有办法将其添加为自定义技能?如果可以,索引架构应该是什么样的?
您可以将 Azure 函数用作自定义技能。只需将 ExtractTables 中的字段映射到预期的自定义技能字段:
/// <summary>
/// One output record of the custom skill response: a record identifier, its data
/// payload, and any error or warning messages attached to it.
/// </summary>
private class OutputRecord
{
    /// <summary>Identifier of this record.</summary>
    public string RecordId { get; set; }

    /// <summary>Data payload carried by this record.</summary>
    public OutputRecordData Data { get; set; }

    /// <summary>Error messages for this record; no initializer, so null until assigned.</summary>
    public List<OutputRecordMessage> Errors { get; set; }

    /// <summary>Warning messages for this record; no initializer, so null until assigned.</summary>
    public List<OutputRecordMessage> Warnings { get; set; }

    /// <summary>
    /// Data fields of a record. Every field defaults to the empty string rather than null.
    /// </summary>
    public class OutputRecordData
    {
        public string Name { get; set; } = string.Empty;

        public string Description { get; set; } = string.Empty;

        public string Source { get; set; } = string.Empty;

        public string SourceUrl { get; set; } = string.Empty;

        public string LicenseAttribution { get; set; } = string.Empty;

        public string LicenseUrl { get; set; } = string.Empty;
    }

    /// <summary>A single message entry used by the error and warning lists.</summary>
    public class OutputRecordMessage
    {
        /// <summary>Text of the message.</summary>
        public string Message { get; set; }
    }
}
https://docs.microsoft.com/en-us/azure/search/cognitive-search-create-custom-skill-example
转换技能输出有 3 种方法:使用整形器(Shaper)技能、内联整形,或使用自定义技能。在这种情况下,我同意 @Thiago 的观点,即最好的选择是使用自定义技能,因为整形器技能不允许您仅从表格结果中选取特定列的值。
如果您正在使用 Power Skills,您可以修改该技能,使其返回与您的索引字段定义相对应的响应。假设您的索引字段如下所示:
{
"name": "Items",
"type": "Collection(Edm.ComplexType)",
"analyzer": null,
"synonymMaps": [],
"fields": [
{
"name": "Item_Id",
"type": "Edm.String",
"facetable": false,
"filterable": false,
"key": false,
"retrievable": false,
"searchable": false,
"sortable": false,
"analyzer": null,
"indexAnalyzer": null,
"searchAnalyzer": null,
"synonymMaps": [],
"fields": []
},
{
"name": "Quantity",
"type": "Edm.Int64",
"facetable": false,
"filterable": false,
"retrievable": false,
"sortable": false,
"analyzer": null,
"indexAnalyzer": null,
"searchAnalyzer": null,
"synonymMaps": [],
"fields": []
},
{
"name": "Rate",
"type": "Edm.Double",
"facetable": false,
"filterable": false,
"retrievable": false,
"sortable": false,
"analyzer": null,
"indexAnalyzer": null,
"searchAnalyzer": null,
"synonymMaps": [],
"fields": []
},
{
"name": "Amount",
"type": "Edm.Double",
"facetable": false,
"filterable": false,
"retrievable": false,
"sortable": false,
"analyzer": null,
"indexAnalyzer": null,
"searchAnalyzer": null,
"synonymMaps": [],
"fields": []
}
]
}
您可以修改您的技能,使其返回一个 JSON 对象,例如:
{
"items" : [
{
"item": "Iphone 12",
"quantity": 1,
"price": 1000,
"amount": 1000
}
]
}