Azure 认知搜索文本翻译技能 5 万个字符限制
Azure Cognitive Search text translation skill 50k character limitation
我们正在使用 Azure 认知搜索来索引各种文档,例如存储在 Azure Blob 存储中的 Word 或 PDF 文件。我们希望能够翻译非英文文档的提取内容,并将翻译结果存储到索引中的专用字段中。
目前内置的文本翻译认知技能最多支持输入50,000个字符。我们拥有的文档最多可包含 1 MB 的文本。根据文档,可以使用内置的 Split Skill 将文本拆分成块,但是没有可以将翻译后的块合并回一起的技能。我们的目标是将所有提取的文本翻译并存储在一个 Edm.String 类型的索引字段中,而不是数组。
除了通过 Web API 为此目的创建自定义认知技能之外,是否有任何方法可以在编制索引时翻译大文本块?
是的,Merge Skill 实际上会这样做。如下所示定义技能集中的技能。此技能的 "text" 和 "offsets" 输入是可选的,您可以使用 "itemsToInsert" 指定要合并的文本(为翻译输出指定适当的来源)。如果您想在每个合并部分之前或之后插入一个空格,请使用 insertPreTag 和 insertPostTag。
{
"@odata.type": "#Microsoft.Skills.Text.MergeSkill",
"description": "Merge text back together",
"context": "/document",
"insertPreTag": "",
"insertPostTag": "",
"inputs": [
{
"name": "itemsToInsert",
"source": "/document/translation_output/*/text"
}
],
"outputs": [
{
"name": "mergedText",
"targetName" : "merged_text_field_in_your_index"
}
]
}
下面是 C# 中的一个片段,使用 Microsoft.Azure.Search 类。它遵循 Jennifer 在上面的回复中给出的建议。
技能组定义已经过测试,可以正确支持大于 5 万个字符的文本块的翻译。
/// <summary>
/// Builds the translation portion of the skillset, working around the
/// 50,000-character input limit of the built-in text translation skill:
/// split the extracted text into pages, translate each page, then merge
/// the translated pages back into a single string field.
/// </summary>
/// <returns>The ordered list of skills for the skillset definition.</returns>
private static IList<Skill> GetSkills()
{
    return new List<Skill>
    {
        // ...some skills in the pipeline before translation

        // The split skill supports fewer languages than translation
        // (e.g. 'ru' is not supported), so fall back to 'en' whenever the
        // detected language is not in the SplitSkillLanguage enum.
        new ConditionalSkill(
            name: "05-1-set-language-code-for-split",
            description: "Set compatible language code for split skill (e.g. 'ru' is not supported)",
            context: "/document",
            inputs: new[]
            {
                new InputFieldMappingEntry(name: "condition", source: SplitLanguageExpression),
                new InputFieldMappingEntry(name: "whenTrue", source: "/document/language_code"),
                new InputFieldMappingEntry(name: "whenFalse", source: "= 'en'")
            },
            outputs: new[]
            {
                new OutputFieldMappingEntry(name: "output", targetName: "language_code_split")
            }),

        // Chunk the original content into pages that fit within the
        // translation skill's 50,000-character input limit.
        new SplitSkill(
            name: "05-2-split-original-content",
            description: "Split original merged content into chunks for translation",
            defaultLanguageCode: SplitSkillLanguage.En,
            textSplitMode: TextSplitMode.Pages,
            maximumPageLength: 50000,
            context: "/document/merged_content_original",
            inputs: new[]
            {
                new InputFieldMappingEntry(name: "text", source: "/document/merged_content_original"),
                new InputFieldMappingEntry(name: "languageCode", source: "/document/language_code_split")
            },
            outputs: new[]
            {
                new OutputFieldMappingEntry(name: "textItems", targetName: "pages")
            }),

        // Translate each page individually (context iterates over pages/*).
        new TextTranslationSkill(
            name: "05-3-translate-original-content-pages",
            description: "Translate original merged content chunks",
            defaultToLanguageCode: TextTranslationSkillLanguage.En,
            context: "/document/merged_content_original/pages/*",
            inputs: new[]
            {
                new InputFieldMappingEntry(name: "text", source: "/document/merged_content_original/pages/*"),
                new InputFieldMappingEntry(name: "fromLanguageCode", source: "/document/language_code")
            },
            outputs: new[]
            {
                new OutputFieldMappingEntry(name: "translatedText", targetName: "translated_text")
            }),

        // Recombine the translated pages into one string so the result can
        // land in a single Edm.String index field rather than an array.
        new MergeSkill(
            name: "05-4-merge-translated-content-pages",
            description: "Merge translated content into one text string",
            context: "/document",
            insertPreTag: " ",
            insertPostTag: " ",
            inputs: new[]
            {
                new InputFieldMappingEntry(name: "itemsToInsert", source: "/document/merged_content_original/pages/*/translated_text")
            },
            outputs: new[]
            {
                new OutputFieldMappingEntry(name: "mergedText", targetName: "merged_content_translated")
            }),

        // ... some skills in the pipeline after translation
    };
}
/// <summary>
/// Builds the ConditionalSkill expression that evaluates to true when the
/// document's detected language code is one of the languages supported by
/// the split skill, e.g. "= (($(/document/language_code) == 'da') || ...)".
/// </summary>
private static string SplitLanguageExpression
{
    get
    {
        var supported = Enum.GetValues(typeof(SplitSkillLanguage)).Cast<SplitSkillLanguage>();
        // Use the invariant culture when lower-casing the enum names:
        // culture-sensitive ToLower() produces wrong language codes under
        // some cultures (e.g. Turkish maps 'I' to 'ı', so "It" -> "ıt").
        var comparisons = supported.Select(v =>
            $"($(/document/language_code) == '{v.ToString().ToLowerInvariant()}')");
        return "= " + string.Join(" || ", comparisons);
    }
}
我们正在使用 Azure 认知搜索来索引各种文档,例如存储在 Azure Blob 存储中的 Word 或 PDF 文件。我们希望能够翻译非英文文档的提取内容,并将翻译结果存储到索引中的专用字段中。
目前内置的文本翻译认知技能最多支持输入50,000个字符。我们拥有的文档最多可包含 1 MB 的文本。根据文档,可以使用内置的 Split Skill 将文本拆分成块,但是没有可以将翻译后的块合并回一起的技能。我们的目标是将所有提取的文本翻译并存储在一个 Edm.String 类型的索引字段中,而不是数组。
除了通过 Web API 为此目的创建自定义认知技能之外,是否有任何方法可以在编制索引时翻译大文本块?
是的,Merge Skill 实际上会这样做。如下所示定义技能集中的技能。此技能的 "text" 和 "offsets" 输入是可选的,您可以使用 "itemsToInsert" 指定要合并的文本(为翻译输出指定适当的来源)。如果您想在每个合并部分之前或之后插入一个空格,请使用 insertPreTag 和 insertPostTag。
{
"@odata.type": "#Microsoft.Skills.Text.MergeSkill",
"description": "Merge text back together",
"context": "/document",
"insertPreTag": "",
"insertPostTag": "",
"inputs": [
{
"name": "itemsToInsert",
"source": "/document/translation_output/*/text"
}
],
"outputs": [
{
"name": "mergedText",
"targetName" : "merged_text_field_in_your_index"
}
]
}
下面是 C# 中的一个片段,使用 Microsoft.Azure.Search 类。它遵循 Jennifer 在上面的回复中给出的建议。
技能组定义已经过测试,可以正确支持大于 5 万个字符的文本块的翻译。
/// <summary>
/// Builds the translation portion of the skillset, working around the
/// 50,000-character input limit of the built-in text translation skill:
/// split the extracted text into pages, translate each page, then merge
/// the translated pages back into a single string field.
/// </summary>
/// <returns>The ordered list of skills for the skillset definition.</returns>
private static IList<Skill> GetSkills()
{
    return new List<Skill>
    {
        // ...some skills in the pipeline before translation

        // The split skill supports fewer languages than translation
        // (e.g. 'ru' is not supported), so fall back to 'en' whenever the
        // detected language is not in the SplitSkillLanguage enum.
        new ConditionalSkill(
            name: "05-1-set-language-code-for-split",
            description: "Set compatible language code for split skill (e.g. 'ru' is not supported)",
            context: "/document",
            inputs: new[]
            {
                new InputFieldMappingEntry(name: "condition", source: SplitLanguageExpression),
                new InputFieldMappingEntry(name: "whenTrue", source: "/document/language_code"),
                new InputFieldMappingEntry(name: "whenFalse", source: "= 'en'")
            },
            outputs: new[]
            {
                new OutputFieldMappingEntry(name: "output", targetName: "language_code_split")
            }),

        // Chunk the original content into pages that fit within the
        // translation skill's 50,000-character input limit.
        new SplitSkill(
            name: "05-2-split-original-content",
            description: "Split original merged content into chunks for translation",
            defaultLanguageCode: SplitSkillLanguage.En,
            textSplitMode: TextSplitMode.Pages,
            maximumPageLength: 50000,
            context: "/document/merged_content_original",
            inputs: new[]
            {
                new InputFieldMappingEntry(name: "text", source: "/document/merged_content_original"),
                new InputFieldMappingEntry(name: "languageCode", source: "/document/language_code_split")
            },
            outputs: new[]
            {
                new OutputFieldMappingEntry(name: "textItems", targetName: "pages")
            }),

        // Translate each page individually (context iterates over pages/*).
        new TextTranslationSkill(
            name: "05-3-translate-original-content-pages",
            description: "Translate original merged content chunks",
            defaultToLanguageCode: TextTranslationSkillLanguage.En,
            context: "/document/merged_content_original/pages/*",
            inputs: new[]
            {
                new InputFieldMappingEntry(name: "text", source: "/document/merged_content_original/pages/*"),
                new InputFieldMappingEntry(name: "fromLanguageCode", source: "/document/language_code")
            },
            outputs: new[]
            {
                new OutputFieldMappingEntry(name: "translatedText", targetName: "translated_text")
            }),

        // Recombine the translated pages into one string so the result can
        // land in a single Edm.String index field rather than an array.
        new MergeSkill(
            name: "05-4-merge-translated-content-pages",
            description: "Merge translated content into one text string",
            context: "/document",
            insertPreTag: " ",
            insertPostTag: " ",
            inputs: new[]
            {
                new InputFieldMappingEntry(name: "itemsToInsert", source: "/document/merged_content_original/pages/*/translated_text")
            },
            outputs: new[]
            {
                new OutputFieldMappingEntry(name: "mergedText", targetName: "merged_content_translated")
            }),

        // ... some skills in the pipeline after translation
    };
}
/// <summary>
/// Builds the ConditionalSkill expression that evaluates to true when the
/// document's detected language code is one of the languages supported by
/// the split skill, e.g. "= (($(/document/language_code) == 'da') || ...)".
/// </summary>
private static string SplitLanguageExpression
{
    get
    {
        var supported = Enum.GetValues(typeof(SplitSkillLanguage)).Cast<SplitSkillLanguage>();
        // Use the invariant culture when lower-casing the enum names:
        // culture-sensitive ToLower() produces wrong language codes under
        // some cultures (e.g. Turkish maps 'I' to 'ı', so "It" -> "ıt").
        var comparisons = supported.Select(v =>
            $"($(/document/language_code) == '{v.ToString().ToLowerInvariant()}')");
        return "= " + string.Join(" || ", comparisons);
    }
}