如何在 Azure 中返回简单的时间戳输出?
How to return a simple timestamp output in Azure?
我正在尝试精简从所用的 Azure 语音转文本模型获得的数据。输出格式由 `speechConfig.OutputFormat` 这一行(我原始文件中的第 21 行)指定,我已将其更改为 "Simple",但仍然得到详细的输出。我使用的代码是:
using System;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
namespace NEST
{
    /// <summary>
    /// Console sample that runs continuous speech recognition over a WAV file and
    /// prints the raw JSON payload of every recognized phrase.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Configures the recognizer, feeds the audio file through it, waits until the
        /// session is canceled or stopped, then waits for Enter before shutting down.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var key = "";
            var region = "";
            var audioFilePath = @"C:/Users/MichaelSchwartz/source/repos/AI-102-Process-Speech-master/transcribe_speech_to_text/media/narration.wav";
            var speechConfig = SpeechConfig.FromSubscription(key, region);
            // Generates timestamps
            // NOTE(review): the sample output still contains the detailed NBest/Words JSON
            // even though Simple is selected on the next line; presumably the word-level
            // timestamp request forces the detailed payload - confirm against the SDK docs.
            speechConfig.RequestWordLevelTimestamps();
            speechConfig.OutputFormat = OutputFormat.Simple;
            var stopRecognition = new TaskCompletionSource<int>();
            // Calls the audio file. Both objects are wrapped in using blocks so they are
            // disposed when Main finishes (or throws) instead of being left undisposed.
            using (var audioConfig = AudioConfig.FromWavFileInput(audioFilePath))
            using (var recognizer = new SpeechRecognizer(speechConfig, audioConfig))
            {
                //Display Recognized
                recognizer.Recognized += (s, e) =>
                {
                    if (e.Result.Reason == ResultReason.RecognizedSpeech)
                    {
                        Console.WriteLine($"RECOGNIZED :{e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult)}");
                    }
                    else if (e.Result.Reason == ResultReason.NoMatch)
                    {
                        Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                    }
                };
                recognizer.Canceled += (s, e) =>
                {
                    Console.WriteLine($"CANCELED: Reason={e.Reason}");
                    if (e.Reason == CancellationReason.Error)
                    {
                        Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                        Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                        Console.WriteLine($"CANCELED: Did you update the subscription info?");
                    }
                    // Unblocks the wait below; TrySetResult tolerates being called twice.
                    stopRecognition.TrySetResult(0);
                };
                recognizer.SessionStopped += (s, e) =>
                {
                    Console.WriteLine("\n Session stopped event.");
                    stopRecognition.TrySetResult(0);
                };
                recognizer.StartContinuousRecognitionAsync().GetAwaiter().GetResult();
                // Waits for completion. Use Task.WaitAny to keep the task rooted.
                Task.WaitAny(new[] { stopRecognition.Task });
                do
                {
                    Console.WriteLine("Press Enter to stop");
                } while (Console.ReadKey().Key != ConsoleKey.Enter);
                // Stops recognition. The original discarded the returned task, so Main could
                // return before the stop completed; block on it like the start call above.
                recognizer.StopContinuousRecognitionAsync().GetAwaiter().GetResult();
            }
        }
    }
}
返回的时间单位是什么?偏移量:173800000?该模型运行几秒钟,而不是几小时。 "offset"和"duration"是什么意思?
有没有办法在消息级别而不是单词级别添加时间戳?或者至少是一种专注于指示每个话语何时开始的单词级数据子集的方法?我正在转录更长的话语,这是几个小时的音频。
输出为:
RECOGNIZED :{"DisplayText":"The speech Translation API transcribes audio streams into text. Your application can display this text to the user or act upon it as command input. You can use this API either with an SDK client library, or a rest a rest API.","Duration":163400000,"Id":"02d2042cadec4ae9bf324c91949620e0","NBest":[{"Confidence":0.85213876,"Display":"The speech Translation API transcribes audio streams into text. Your application can display this text to the user or act upon it as command input. You can use this API either with an SDK client library, or a rest a rest API.","ITN":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this API either with an SDK client library or a rest a rest API","Lexical":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this API either with an SDK client library or a rest a rest API","MaskedITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":4000000,"Offset":51500000,"Word":"streams"},{"Duration":1900000,"Offset":55600000,"Word":"into"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"as"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"library"},{"Duration":6200000,"Offset":158100000,"Word
":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.85089046,"Display":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this API either with an SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":4000000,"Offset":51500000,"Word":"streams"},{"Duration":1900000,"Offset":55600000,"Word":"into"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"is"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"library"},{"Duration":6200000,"Offset":158100000,"Word
":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.8548482,"Display":"the speech translation API transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this API either with an SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":3900000,"Offset":51500000,"Word":"streams"},{"Duration":1200000,"Offset":55500000,"Word":"and"},{"Duration":700000,"Offset":56800000,"Word":"a"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"as"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"librar
y"},{"Duration":6200000,"Offset":158100000,"Word":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.8535998,"Display":"the speech translation API transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this API either with an SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":3900000,"Offset":51500000,"Word":"streams"},{"Duration":1200000,"Offset":55500000,"Word":"and"},{"Duration":700000,"Offset":56800000,"Word":"a"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"is"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"librar
y"},{"Duration":6200000,"Offset":158100000,"Word":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.8474758,"Display":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this API either within SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either within sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either within sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either within sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":4000000,"Offset":51500000,"Word":"streams"},{"Duration":1900000,"Offset":55600000,"Word":"into"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"as"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":2700000,"Offset":134400000,"Word":"within"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"library"},{"Duration":6200000,"Offset":158100000,"Word":"or"},{"Duration":2000000,"Offset":164900000,"Wo
rd":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]}],"Offset":21800000,"RecognitionStatus":"Success"}
CANCELED: Reason=EndOfStream
Press Enter to stop
Session stopped event.
还有,为什么输出结果重复了 5 次?我希望尽可能精简输出,以便进行数据分析。有没有办法将测量单位更改为对用户更友好的单位,例如秒?
问题 1:
为什么输出结果重复5次?
实际上,您可以通过以下问题从STT FAQ中找到答案:
I get several results for each phrase with the detailed output format.
Which one should I use?
根据设计,使用详细输出格式时,JSON 响应的 NBest 数组中会包含多个具有不同置信度分数的候选结果;默认情况下,系统会选择第一个作为显示结果。您可以根据需要选择结果,例如置信度最高的结果。
问题 2:
有没有办法将测量单位更改为对用户更友好的单位,例如秒?
事实上,Azure 没有提供进一步处理结果的现成方法。不过我根据你的代码写了一个简单的演示:
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using System.Linq;
namespace STTwithTime
{
/// <summary>
/// Console sample that transcribes a WAV file and, for every recognized phrase,
/// prints the highest-confidence alternative with its timings converted to seconds.
/// </summary>
class Program
{
    /// <summary>
    /// Converts a tick count from the recognition JSON into seconds, using the same
    /// 10,000,000-ticks-per-second scale the sample has always divided by.
    /// </summary>
    /// <param name="ticks">Offset or duration value taken from the JSON payload.</param>
    /// <returns>The same span expressed in seconds.</returns>
    private static double TicksToSeconds(long ticks) => (double)ticks / TimeSpan.TicksPerSecond;

    /// <summary>
    /// Configures the recognizer, runs continuous recognition over the audio file and
    /// blocks until the session is canceled or stopped.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        var key = "";
        var region = "";
        var audioFilePath = @"";
        var speechConfig = SpeechConfig.FromSubscription(key, region);
        // Generates timestamps
        speechConfig.RequestWordLevelTimestamps();
        speechConfig.OutputFormat = OutputFormat.Detailed;
        var stopRecognition = new TaskCompletionSource<int>();
        // using blocks make sure both objects are disposed when Main finishes or throws.
        using (var audioConfig = AudioConfig.FromWavFileInput(audioFilePath))
        using (var recognizer = new SpeechRecognizer(speechConfig, audioConfig))
        {
            //Display Recognized
            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    var json = e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult);
                    var result = JsonConvert.DeserializeObject<Result>(json);
                    // One stable sort instead of Max + Find: no floating-point equality
                    // test, ties still resolve to the first alternative, and an empty or
                    // missing list yields null instead of throwing.
                    var best = result?.NBest?.OrderByDescending(item => item.Confidence).FirstOrDefault();
                    if (best == null)
                    {
                        // No alternatives in the payload: fall back to the plain text.
                        Console.WriteLine("RECOGNIZED :" + e.Result.Text);
                        return;
                    }
                    Console.WriteLine("================================");
                    Console.WriteLine("Confidence:" + best.Confidence);
                    Console.WriteLine("RECOGNIZED :" + best.Display);
                    Console.WriteLine("Duration:" + TicksToSeconds(result.Duration));
                    Console.WriteLine("Words:");
                    // Words can be absent from the payload; treat that as an empty list.
                    foreach (var word in best.Words ?? Enumerable.Empty<Word>())
                    {
                        Console.WriteLine(word.word + "=> offset:" + TicksToSeconds(word.Offset) + " duration:" + TicksToSeconds(word.Duration));
                    }
                }
                else if (e.Result.Reason == ResultReason.NoMatch)
                {
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                }
            };
            recognizer.Canceled += (s, e) =>
            {
                Console.WriteLine($"CANCELED: Reason={e.Reason}");
                if (e.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                    Console.WriteLine($"CANCELED: Did you update the subscription info?");
                }
                // Unblocks the wait below; TrySetResult tolerates being called twice.
                stopRecognition.TrySetResult(0);
            };
            recognizer.SessionStopped += (s, e) =>
            {
                Console.WriteLine("\n Session stopped event.");
                stopRecognition.TrySetResult(0);
            };
            recognizer.StartContinuousRecognitionAsync().GetAwaiter().GetResult();
            // Waits for completion. Use Task.WaitAny to keep the task rooted.
            Task.WaitAny(new[] { stopRecognition.Task });
            // Explicitly stop before the recognizer is disposed.
            recognizer.StopContinuousRecognitionAsync().GetAwaiter().GetResult();
        }
    }
}
/// <summary>
/// One entry of the "Words" array in the recognition JSON: a recognized word with
/// its position and length in the audio.
/// </summary>
public class Word
{
    /// <summary>
    /// Length of the word in ticks (the sample divides by 10,000,000 to get seconds).
    /// Stored as long for consistency with <see cref="Offset"/>.
    /// </summary>
    public long Duration { get; set; }

    /// <summary>
    /// Start position of the word in ticks. Must be long: at 10,000,000 ticks per
    /// second an int overflows after roughly 214 seconds of audio, which breaks
    /// deserialization for long recordings.
    /// </summary>
    public long Offset { get; set; }

    /// <summary>
    /// The recognized word. Lowercase because a C# member cannot share the name of its
    /// enclosing type; binding to the JSON key "Word" relies on the deserializer's
    /// case-insensitive property matching.
    /// </summary>
    public string word { get; set; }
}
/// <summary>
/// One candidate transcription from the "NBest" array of the recognition JSON.
/// </summary>
public class NBest
{
    /// <summary>Confidence score the service assigned to this candidate.</summary>
    public double Confidence { get; set; }

    /// <summary>Value of the JSON "Display" field for this candidate.</summary>
    public string Display { get; set; }

    /// <summary>Value of the JSON "ITN" field for this candidate.</summary>
    public string ITN { get; set; }

    /// <summary>Value of the JSON "Lexical" field for this candidate.</summary>
    public string Lexical { get; set; }

    /// <summary>Value of the JSON "MaskedITN" field for this candidate.</summary>
    public string MaskedITN { get; set; }

    /// <summary>
    /// Per-word timing entries. Defaults to an empty list so that enumerating it is
    /// safe when the payload carries no "Words" array.
    /// </summary>
    public List<Word> Words { get; set; } = new List<Word>();
}
/// <summary>
/// Root object of the recognition JSON returned for one recognized phrase.
/// </summary>
public class Result
{
    /// <summary>Value of the JSON "DisplayText" field.</summary>
    public string DisplayText { get; set; }

    /// <summary>
    /// Length of the phrase in ticks (the sample divides by 10,000,000 to get seconds).
    /// Stored as long for consistency with <see cref="Offset"/>.
    /// </summary>
    public long Duration { get; set; }

    /// <summary>Value of the JSON "Id" field.</summary>
    public string Id { get; set; }

    /// <summary>Candidate transcriptions, each with its own confidence score.</summary>
    public List<NBest> NBest { get; set; }

    /// <summary>
    /// Start position of the phrase in ticks. Must be long: at 10,000,000 ticks per
    /// second an int overflows after roughly 214 seconds of audio, which breaks
    /// deserialization for long recordings.
    /// </summary>
    public long Offset { get; set; }

    /// <summary>Value of the JSON "RecognitionStatus" field, e.g. "Success".</summary>
    public string RecognitionStatus { get; set; }
}
}
结果:
如果您指定一个较长的 .wav 文件,结果将被分成多个部分。
我正在尝试精简从所用的 Azure 语音转文本模型获得的数据。输出格式由 `speechConfig.OutputFormat` 这一行(我原始文件中的第 21 行)指定,我已将其更改为 "Simple",但仍然得到详细的输出。我使用的代码是:
using System;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
namespace NEST
{
    /// <summary>
    /// Console sample that runs continuous speech recognition over a WAV file and
    /// prints the raw JSON payload of every recognized phrase.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Configures the recognizer, feeds the audio file through it, waits until the
        /// session is canceled or stopped, then waits for Enter before shutting down.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var key = "";
            var region = "";
            var audioFilePath = @"C:/Users/MichaelSchwartz/source/repos/AI-102-Process-Speech-master/transcribe_speech_to_text/media/narration.wav";
            var speechConfig = SpeechConfig.FromSubscription(key, region);
            // Generates timestamps
            // NOTE(review): the sample output still contains the detailed NBest/Words JSON
            // even though Simple is selected on the next line; presumably the word-level
            // timestamp request forces the detailed payload - confirm against the SDK docs.
            speechConfig.RequestWordLevelTimestamps();
            speechConfig.OutputFormat = OutputFormat.Simple;
            var stopRecognition = new TaskCompletionSource<int>();
            // Calls the audio file. Both objects are wrapped in using blocks so they are
            // disposed when Main finishes (or throws) instead of being left undisposed.
            using (var audioConfig = AudioConfig.FromWavFileInput(audioFilePath))
            using (var recognizer = new SpeechRecognizer(speechConfig, audioConfig))
            {
                //Display Recognized
                recognizer.Recognized += (s, e) =>
                {
                    if (e.Result.Reason == ResultReason.RecognizedSpeech)
                    {
                        Console.WriteLine($"RECOGNIZED :{e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult)}");
                    }
                    else if (e.Result.Reason == ResultReason.NoMatch)
                    {
                        Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                    }
                };
                recognizer.Canceled += (s, e) =>
                {
                    Console.WriteLine($"CANCELED: Reason={e.Reason}");
                    if (e.Reason == CancellationReason.Error)
                    {
                        Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                        Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                        Console.WriteLine($"CANCELED: Did you update the subscription info?");
                    }
                    // Unblocks the wait below; TrySetResult tolerates being called twice.
                    stopRecognition.TrySetResult(0);
                };
                recognizer.SessionStopped += (s, e) =>
                {
                    Console.WriteLine("\n Session stopped event.");
                    stopRecognition.TrySetResult(0);
                };
                recognizer.StartContinuousRecognitionAsync().GetAwaiter().GetResult();
                // Waits for completion. Use Task.WaitAny to keep the task rooted.
                Task.WaitAny(new[] { stopRecognition.Task });
                do
                {
                    Console.WriteLine("Press Enter to stop");
                } while (Console.ReadKey().Key != ConsoleKey.Enter);
                // Stops recognition. The original discarded the returned task, so Main could
                // return before the stop completed; block on it like the start call above.
                recognizer.StopContinuousRecognitionAsync().GetAwaiter().GetResult();
            }
        }
    }
}
返回的时间单位是什么?偏移量:173800000?该模型运行几秒钟,而不是几小时。 "offset"和"duration"是什么意思?
有没有办法在消息级别而不是单词级别添加时间戳?或者至少是一种专注于指示每个话语何时开始的单词级数据子集的方法?我正在转录更长的话语,这是几个小时的音频。
输出为:
RECOGNIZED :{"DisplayText":"The speech Translation API transcribes audio streams into text. Your application can display this text to the user or act upon it as command input. You can use this API either with an SDK client library, or a rest a rest API.","Duration":163400000,"Id":"02d2042cadec4ae9bf324c91949620e0","NBest":[{"Confidence":0.85213876,"Display":"The speech Translation API transcribes audio streams into text. Your application can display this text to the user or act upon it as command input. You can use this API either with an SDK client library, or a rest a rest API.","ITN":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this API either with an SDK client library or a rest a rest API","Lexical":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this API either with an SDK client library or a rest a rest API","MaskedITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":4000000,"Offset":51500000,"Word":"streams"},{"Duration":1900000,"Offset":55600000,"Word":"into"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"as"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"library"},{"Duration":6200000,"Offset":158100000,"Word
":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.85089046,"Display":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this API either with an SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":4000000,"Offset":51500000,"Word":"streams"},{"Duration":1900000,"Offset":55600000,"Word":"into"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"is"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"library"},{"Duration":6200000,"Offset":158100000,"Word
":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.8548482,"Display":"the speech translation API transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this API either with an SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it as command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":3900000,"Offset":51500000,"Word":"streams"},{"Duration":1200000,"Offset":55500000,"Word":"and"},{"Duration":700000,"Offset":56800000,"Word":"a"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"as"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"librar
y"},{"Duration":6200000,"Offset":158100000,"Word":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.8535998,"Display":"the speech translation API transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this API either with an SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams and a text your application can display this text to the user or act upon it is command input you can use this api either with an sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":3900000,"Offset":51500000,"Word":"streams"},{"Duration":1200000,"Offset":55500000,"Word":"and"},{"Duration":700000,"Offset":56800000,"Word":"a"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"is"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":1500000,"Offset":134400000,"Word":"with"},{"Duration":1100000,"Offset":136000000,"Word":"an"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"librar
y"},{"Duration":6200000,"Offset":158100000,"Word":"or"},{"Duration":2000000,"Offset":164900000,"Word":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]},{"Confidence":0.8474758,"Display":"the speech translation API transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this API either within SDK client library or a rest a rest API","ITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either within sdk client library or a rest a rest api","Lexical":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either within sdk client library or a rest a rest api","MaskedITN":"the speech translation api transcribes audio streams into text your application can display this text to the user or act upon it as command input you can use this api either within sdk client library or a rest a rest 
api","Words":[{"Duration":1700000,"Offset":21800000,"Word":"the"},{"Duration":4300000,"Offset":23600000,"Word":"speech"},{"Duration":7300000,"Offset":28000000,"Word":"translation"},{"Duration":5600000,"Offset":35400000,"Word":"API"},{"Duration":6600000,"Offset":41300000,"Word":"transcribes"},{"Duration":3400000,"Offset":48000000,"Word":"audio"},{"Duration":4000000,"Offset":51500000,"Word":"streams"},{"Duration":1900000,"Offset":55600000,"Word":"into"},{"Duration":5900000,"Offset":57600000,"Word":"text"},{"Duration":2300000,"Offset":66200000,"Word":"your"},{"Duration":7100000,"Offset":68600000,"Word":"application"},{"Duration":1500000,"Offset":75800000,"Word":"can"},{"Duration":3700000,"Offset":77400000,"Word":"display"},{"Duration":2100000,"Offset":81200000,"Word":"this"},{"Duration":3200000,"Offset":83400000,"Word":"text"},{"Duration":800000,"Offset":86700000,"Word":"to"},{"Duration":1100000,"Offset":87600000,"Word":"the"},{"Duration":4900000,"Offset":88800000,"Word":"user"},{"Duration":2700000,"Offset":94000000,"Word":"or"},{"Duration":1700000,"Offset":96800000,"Word":"act"},{"Duration":2300000,"Offset":98600000,"Word":"upon"},{"Duration":900000,"Offset":101000000,"Word":"it"},{"Duration":1300000,"Offset":102000000,"Word":"as"},{"Duration":3700000,"Offset":103400000,"Word":"command"},{"Duration":5800000,"Offset":107200000,"Word":"input"},{"Duration":2000000,"Offset":116900000,"Word":"you"},{"Duration":1700000,"Offset":119000000,"Word":"can"},{"Duration":2300000,"Offset":120800000,"Word":"use"},{"Duration":2100000,"Offset":123200000,"Word":"this"},{"Duration":6300000,"Offset":125400000,"Word":"API"},{"Duration":2500000,"Offset":131800000,"Word":"either"},{"Duration":2700000,"Offset":134400000,"Word":"within"},{"Duration":6300000,"Offset":137200000,"Word":"SDK"},{"Duration":4100000,"Offset":143600000,"Word":"client"},{"Duration":7900000,"Offset":147800000,"Word":"library"},{"Duration":6200000,"Offset":158100000,"Word":"or"},{"Duration":2000000,"Offset":164900000,"Wo
rd":"a"},{"Duration":4700000,"Offset":167000000,"Word":"rest"},{"Duration":1700000,"Offset":172000000,"Word":"a"},{"Duration":4300000,"Offset":173800000,"Word":"rest"},{"Duration":7000000,"Offset":178200000,"Word":"API"}]}],"Offset":21800000,"RecognitionStatus":"Success"}
CANCELED: Reason=EndOfStream
Press Enter to stop
Session stopped event.
还有,为什么输出结果重复了 5 次?我希望尽可能精简(trim)这些数据,以便进行数据分析。有没有办法把计量单位改成对用户更友好的单位,例如秒?
问题 1:
为什么输出结果重复5次?
实际上,您可以通过以下问题从STT FAQ中找到答案:
I get several results for each phrase with the detailed output format. Which one should I use?
根据设计,您会在 JSON 响应的 NBest 字段中获得多个具有不同置信度分数的结果,默认情况下,系统将选择第一个作为显示结果。您可以根据需要自行选择结果,例如置信度最高的结果。
问题 2:
有没有办法将测量单位更改为对用户更友好的单位,例如秒?
事实上,Azure 并没有提供进一步处理这些结果的现成方法。Offset 和 Duration 以 100 纳秒(tick)为单位,除以 10,000,000 即可换算为秒。我根据你的代码写了一个简单的演示:
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using System.Linq;
namespace STTwithTime
{
/// <summary>
/// Demo console app: runs continuous recognition over a WAV file and, for each
/// recognized utterance, prints the highest-confidence alternative together with
/// its duration and per-word offsets/durations converted to seconds.
/// </summary>
class Program
{
    /// <summary>
    /// Configures the recognizer, wires the event handlers, and blocks until the
    /// session stops or is canceled.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var key = "";
        var region = "";
        var audioFilePath = @"";
        var speechConfig = SpeechConfig.FromSubscription(key, region);
        // Generates timestamps
        speechConfig.RequestWordLevelTimestamps();
        speechConfig.OutputFormat = OutputFormat.Detailed;
        // Completed from the SDK's event callbacks; run continuations asynchronously
        // so they do not execute inline on the thread that raises the event.
        var stopRecognition = new TaskCompletionSource<int>(TaskCreationOptions.RunContinuationsAsynchronously);
        // Both objects are wrapped in using blocks so they are disposed when Main finishes.
        using (var audioConfig = AudioConfig.FromWavFileInput(audioFilePath))
        using (var recognizer = new SpeechRecognizer(speechConfig, audioConfig))
        {
            //Display Recognized
            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    PrintBestAlternative(e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult));
                }
                else if (e.Result.Reason == ResultReason.NoMatch)
                {
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                }
            };
            recognizer.Canceled += (s, e) =>
            {
                Console.WriteLine($"CANCELED: Reason={e.Reason}");
                if (e.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                    Console.WriteLine($"CANCELED: Did you update the subscription info?");
                }
                stopRecognition.TrySetResult(0);
            };
            recognizer.SessionStopped += (s, e) =>
            {
                Console.WriteLine("\n Session stopped event.");
                stopRecognition.TrySetResult(0);
            };
            recognizer.StartContinuousRecognitionAsync().GetAwaiter().GetResult();
            // Waits for completion. Use Task.WaitAny to keep the task rooted.
            Task.WaitAny(new[] { stopRecognition.Task });
            // Stop explicitly before the recognizer is disposed.
            recognizer.StopContinuousRecognitionAsync().GetAwaiter().GetResult();
        }
    }

    /// <summary>
    /// Parses one JSON recognition response and prints its highest-confidence
    /// alternative, the utterance duration, and each word's offset and duration.
    /// </summary>
    /// <param name="json">Raw JSON result string taken from the recognition result properties.</param>
    private static void PrintBestAlternative(string json)
    {
        var result = JsonConvert.DeserializeObject<Result>(json);
        // Without this guard a response lacking alternatives would throw inside the event handler.
        if (result == null || result.NBest == null || result.NBest.Count == 0)
        {
            Console.WriteLine("RECOGNIZED : (response contained no NBest alternatives)");
            return;
        }

        // Single pass; strict '>' keeps the first entry among ties, matching the
        // original Max + Find behavior without comparing doubles for equality.
        var best = result.NBest[0];
        foreach (var candidate in result.NBest)
        {
            if (candidate.Confidence > best.Confidence)
            {
                best = candidate;
            }
        }

        Console.WriteLine("================================");
        Console.WriteLine("Confidence:" + best.Confidence);
        Console.WriteLine("RECOGNIZED :" + best.Display);
        Console.WriteLine("Duration: :" + ToSeconds(result.Duration));
        Console.WriteLine("Words:");
        // Words may be absent from the JSON; treat that as "no words" instead of crashing.
        foreach (var word in best.Words ?? Enumerable.Empty<Word>())
        {
            Console.WriteLine(word.word + "=> offset:" + ToSeconds(word.Offset) + " duraction:" + ToSeconds(word.Duration));
        }
    }

    /// <summary>
    /// Converts a raw service time value to seconds by dividing by 10,000,000
    /// (<see cref="TimeSpan.TicksPerSecond"/>), the same factor the demo always used.
    /// </summary>
    /// <param name="ticks">Raw Offset or Duration value from the response.</param>
    /// <returns>The value expressed in seconds.</returns>
    private static double ToSeconds(long ticks)
    {
        return Convert.ToDouble(ticks) / TimeSpan.TicksPerSecond;
    }
}
/// <summary>
/// One word entry from the "Words" array of a detailed recognition response.
/// </summary>
public class Word
{
    /// <summary>
    /// Length of the word as a raw service value (the demo divides it by 10,000,000
    /// to print seconds). Stored as <c>long</c> so it matches <see cref="Offset"/>.
    /// </summary>
    public long Duration { get; set; }

    /// <summary>
    /// Start position of the word in the audio, in the same raw unit as
    /// <see cref="Duration"/>. Must be <c>long</c>: at 10,000,000 units per second
    /// an <c>int</c> overflows after roughly 214 seconds of audio.
    /// </summary>
    public long Offset { get; set; }

    /// <summary>
    /// The recognized word text. Lower-case because a member cannot have the same
    /// name as its enclosing type; Json.NET matches the JSON "Word" key
    /// case-insensitively.
    /// </summary>
    public string word { get; set; }
}
/// <summary>
/// One alternative from the "NBest" array of a detailed recognition response.
/// </summary>
public class NBest
{
    /// <summary>Confidence score the service assigned to this alternative.</summary>
    public double Confidence { get; set; }

    /// <summary>The "Display" text of this alternative.</summary>
    public string Display { get; set; }

    /// <summary>The "ITN" text of this alternative.</summary>
    public string ITN { get; set; }

    /// <summary>The "Lexical" text of this alternative.</summary>
    public string Lexical { get; set; }

    /// <summary>The "MaskedITN" text of this alternative.</summary>
    public string MaskedITN { get; set; }

    /// <summary>
    /// Per-word timing entries. Initialised to an empty list so that a response
    /// without a "Words" array can still be enumerated instead of yielding null.
    /// </summary>
    public List<Word> Words { get; set; } = new List<Word>();
}
/// <summary>
/// Top-level shape of the JSON recognition response that the demo deserializes.
/// </summary>
public class Result
{
    /// <summary>The "DisplayText" of the recognized utterance.</summary>
    public string DisplayText { get; set; }

    /// <summary>
    /// Length of the utterance as a raw service value (the demo divides it by
    /// 10,000,000 to print seconds). Stored as <c>long</c> so long utterances
    /// cannot overflow.
    /// </summary>
    public long Duration { get; set; }

    /// <summary>The "Id" of this response.</summary>
    public string Id { get; set; }

    /// <summary>Alternative transcriptions, each with its own confidence score.</summary>
    public List<NBest> NBest { get; set; }

    /// <summary>
    /// Start position of the utterance in the audio, in the same raw unit as
    /// <see cref="Duration"/>. Must be <c>long</c>: at 10,000,000 units per second
    /// an <c>int</c> overflows after roughly 214 seconds of audio.
    /// </summary>
    public long Offset { get; set; }

    /// <summary>The "RecognitionStatus" string, e.g. "Success".</summary>
    public string RecognitionStatus { get; set; }
}
}
结果:
如果您指定的是一个时长较长的 .wav 文件,识别结果将被分成多个部分返回。