如何访问 Microsoft.ML 中 FeaturizeText 生成的 n-gram?
How do I access the n-grams produced by FeaturizeText in Microsoft.ML?
我已经让第一个文本分析器在 Microsoft.ML 中运行起来了。我想获取模型所确定的 n-gram 列表,但目前只能得到数值向量形式的“计数”,却不知道每个数值对应的是哪个 n-gram。
以下是我目前可运行代码的核心部分:
// Featurize the sample articles with FeaturizeText and print the resulting
// feature vector for each article.
var mlContext = new MLContext();
var articles = SampleData.Articles.Select(a => new TextData { Text = a }).ToArray();
var dataview = mlContext.Data.LoadFromEnumerable(articles);

// Featurizer settings: lower-casing, Dutch stop-word removal, word n-grams only.
var options = new TextFeaturizingEstimator.Options()
{
    OutputTokensColumnName = "OutputTokens",
    CaseMode = TextNormalizingEstimator.CaseMode.Lower,
    KeepDiacritics = false,
    KeepPunctuations = false,
    KeepNumbers = false,
    Norm = TextFeaturizingEstimator.NormFunction.L2,
    StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options()
    {
        Language = TextFeaturizingEstimator.Language.Dutch,
    },
    WordFeatureExtractor = new WordBagEstimator.Options()
    {
        NgramLength = 4,
        SkipLength = 1,
        UseAllLengths = true,
        MaximumNgramsCount = new int[] { 20, 10, 10, 10 },
        Weighting = NgramExtractingEstimator.WeightingCriteria.TfIdf,
    },
    // The character-level extractor is explicitly set to null.
    CharFeatureExtractor = null,
};

var textPipeline = mlContext.Transforms.Text
    .FeaturizeText("Features", options, "Text");
var textTransformer = textPipeline.Fit(dataview);
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

foreach (var article in articles)
{
    var prediction = predictionEngine.Predict(article);

    // Show at most the first 30 characters. An unguarded Substring(0, 30)
    // throws ArgumentOutOfRangeException for articles shorter than 30 characters.
    var preview = article.Text.Length > 30 ? article.Text.Substring(0, 30) : article.Text;

    Console.WriteLine($"Article: {preview}...");
    Console.WriteLine($"Number of Features: {prediction.Features.Length}");
    // Only the first 50 feature values are printed.
    Console.WriteLine($"Features: {string.Join(",", prediction.Features.Take(50).Select(f => f.ToString("0.00")))}\n");
}
好了,我已经弄明白了;为了帮助可能遇到同样问题的人,在这里分享一下解决办法。首先,像往常一样创建模型。请记下 ProduceNgrams 步骤输出列的名称(在我们的例子中为“NgramFeatures”)。
然后“Schema.GetSlotNames”和“slotNames.GetValues”的组合就可以获取所需的 ngram:
// Build the text pipeline step by step so that the n-gram output column
// ("NgramFeatures") is available for slot-name lookup after fitting.
var textPipeline =
    mlContext.Transforms.Text.NormalizeText("Tokens", "Text", TextNormalizingEstimator.CaseMode.Lower, false, false, false)
    .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens"))
    .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Tokens", language: StopWordsRemovingEstimator.Language.Dutch))
    .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
    .Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures", "Tokens"))
    .Append(mlContext.Transforms.Text.LatentDirichletAllocation("LDAFeatures", "NgramFeatures",
        numberOfTopics: 10
    ))
    .Append(mlContext.Transforms.NormalizeLpNorm("Features", "LDAFeatures"));
var textTransformer = textPipeline.Fit(dataview);
var transformedDataView = textTransformer.Transform(dataview);

// Read the slot names of the n-gram column and turn each one into a string.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames);
// Optionally append .Replace('|', ' ') to each projected string
// (left disabled by the original author).
var ngrams = slotNames.GetValues().ToArray().Select(x => x.Span.ToString());
Console.WriteLine($"Ngrams: {string.Join(", ", ngrams)}\n");

var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
var articlesWithFeatures = new List<(TextData, TransformedTextData)>();
foreach (var article in articles)
{
    var articleWithFeatures = predictionEngine.Predict(article);

    // Show at most the first 30 characters. An unguarded Substring(0, 30)
    // throws ArgumentOutOfRangeException for articles shorter than 30 characters.
    var preview = article.Text.Length > 30 ? article.Text.Substring(0, 30) : article.Text;

    Console.WriteLine($"Article: {preview}...");
    Console.WriteLine($"Number of Features: {articleWithFeatures.Features.Length}");
    // Only the first 50 feature values are printed.
    Console.WriteLine($"Features: {string.Join(",", articleWithFeatures.Features.Take(50).Select(f => f.ToString("0.00")))}\n");

    // Keep each input paired with its transformed output.
    articlesWithFeatures.Add((article, articleWithFeatures));
}
我已经让第一个文本分析器在 Microsoft.ML 中运行起来了。我想获取模型所确定的 n-gram 列表,但目前只能得到数值向量形式的“计数”,却不知道每个数值对应的是哪个 n-gram。
以下是我目前可运行代码的核心部分:
// Featurize the sample articles with FeaturizeText and print the resulting
// feature vector for each article.
var mlContext = new MLContext();
var articles = SampleData.Articles.Select(a => new TextData { Text = a }).ToArray();
var dataview = mlContext.Data.LoadFromEnumerable(articles);

// Featurizer settings: lower-casing, Dutch stop-word removal, word n-grams only.
var options = new TextFeaturizingEstimator.Options()
{
    OutputTokensColumnName = "OutputTokens",
    CaseMode = TextNormalizingEstimator.CaseMode.Lower,
    KeepDiacritics = false,
    KeepPunctuations = false,
    KeepNumbers = false,
    Norm = TextFeaturizingEstimator.NormFunction.L2,
    StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options()
    {
        Language = TextFeaturizingEstimator.Language.Dutch,
    },
    WordFeatureExtractor = new WordBagEstimator.Options()
    {
        NgramLength = 4,
        SkipLength = 1,
        UseAllLengths = true,
        MaximumNgramsCount = new int[] { 20, 10, 10, 10 },
        Weighting = NgramExtractingEstimator.WeightingCriteria.TfIdf,
    },
    // The character-level extractor is explicitly set to null.
    CharFeatureExtractor = null,
};

var textPipeline = mlContext.Transforms.Text
    .FeaturizeText("Features", options, "Text");
var textTransformer = textPipeline.Fit(dataview);
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

foreach (var article in articles)
{
    var prediction = predictionEngine.Predict(article);

    // Show at most the first 30 characters. An unguarded Substring(0, 30)
    // throws ArgumentOutOfRangeException for articles shorter than 30 characters.
    var preview = article.Text.Length > 30 ? article.Text.Substring(0, 30) : article.Text;

    Console.WriteLine($"Article: {preview}...");
    Console.WriteLine($"Number of Features: {prediction.Features.Length}");
    // Only the first 50 feature values are printed.
    Console.WriteLine($"Features: {string.Join(",", prediction.Features.Take(50).Select(f => f.ToString("0.00")))}\n");
}
好了,我已经弄明白了;为了帮助可能遇到同样问题的人,在这里分享一下解决办法。首先,像往常一样创建模型。请记下 ProduceNgrams 步骤输出列的名称(在我们的例子中为“NgramFeatures”)。
然后“Schema.GetSlotNames”和“slotNames.GetValues”的组合就可以获取所需的 ngram:
// Build the text pipeline step by step so that the n-gram output column
// ("NgramFeatures") is available for slot-name lookup after fitting.
var textPipeline =
    mlContext.Transforms.Text.NormalizeText("Tokens", "Text", TextNormalizingEstimator.CaseMode.Lower, false, false, false)
    .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens"))
    .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Tokens", language: StopWordsRemovingEstimator.Language.Dutch))
    .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
    .Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures", "Tokens"))
    .Append(mlContext.Transforms.Text.LatentDirichletAllocation("LDAFeatures", "NgramFeatures",
        numberOfTopics: 10
    ))
    .Append(mlContext.Transforms.NormalizeLpNorm("Features", "LDAFeatures"));
var textTransformer = textPipeline.Fit(dataview);
var transformedDataView = textTransformer.Transform(dataview);

// Read the slot names of the n-gram column and turn each one into a string.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames);
// Optionally append .Replace('|', ' ') to each projected string
// (left disabled by the original author).
var ngrams = slotNames.GetValues().ToArray().Select(x => x.Span.ToString());
Console.WriteLine($"Ngrams: {string.Join(", ", ngrams)}\n");

var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
var articlesWithFeatures = new List<(TextData, TransformedTextData)>();
foreach (var article in articles)
{
    var articleWithFeatures = predictionEngine.Predict(article);

    // Show at most the first 30 characters. An unguarded Substring(0, 30)
    // throws ArgumentOutOfRangeException for articles shorter than 30 characters.
    var preview = article.Text.Length > 30 ? article.Text.Substring(0, 30) : article.Text;

    Console.WriteLine($"Article: {preview}...");
    Console.WriteLine($"Number of Features: {articleWithFeatures.Features.Length}");
    // Only the first 50 feature values are printed.
    Console.WriteLine($"Features: {string.Join(",", articleWithFeatures.Features.Take(50).Select(f => f.ToString("0.00")))}\n");

    // Keep each input paired with its transformed output.
    articlesWithFeatures.Add((article, articleWithFeatures));
}