Microsoft Cognitive Speech to Text - Not converting entire .wav file
When trying to convert a personal .wav file, only a small portion of the speech is converted to text, and the conversion stops at exactly the same spot every time. I couldn't find anything in the documentation about a file size limit (my file is 80 MB) or about it being caused by the pricing tier (free). Does anyone know why, in my case, the conversion stops after three sentences? Can anyone point me in the right direction?
Sample code from the Microsoft site:
using System;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

class Program
{
    static async Task Main()
    {
        await RecognizeSpeechAsync();
    }

    static async Task RecognizeSpeechAsync()
    {
        var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");

        using (var audioInput = AudioConfig.FromWavFileInput(@"FilePath\MyWav.wav"))
        using (var recognizer = new SpeechRecognizer(config, audioInput))
        {
            Console.WriteLine("Recognizing first result...");
            var result = await recognizer.RecognizeOnceAsync();

            switch (result.Reason)
            {
                case ResultReason.RecognizedSpeech:
                    Console.WriteLine($"We recognized: {result.Text}");
                    break;
                case ResultReason.NoMatch:
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                    break;
                case ResultReason.Canceled:
                    var cancellation = CancellationDetails.FromResult(result);
                    Console.WriteLine($"CANCELED: Reason={cancellation.Reason}");
                    if (cancellation.Reason == CancellationReason.Error)
                    {
                        Console.WriteLine($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                        Console.WriteLine($"CANCELED: ErrorDetails={cancellation.ErrorDetails}");
                        Console.WriteLine($"CANCELED: Did you update the subscription info?");
                    }
                    break;
            }
        }
    }
}
EDIT
I've pasted below the code sample I found via a search engine, in case the original is ever changed or removed. For an 80 MB .wav file, the code took roughly 20 minutes to run.
The "Main" code:
// Speech recognition with audio stream
public static async Task RecognitionWithPullAudioStreamAsync()
{
    // Creates an instance of a speech config with specified subscription key and service region.
    // Replace with your own subscription key and service region (e.g., "westus").
    var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
    StringBuilder sb = new StringBuilder(); // requires using System.Text;
    var stopRecognition = new TaskCompletionSource<int>();

    // Create an audio stream from a wav file.
    // Replace with your own audio file name.
    using (var audioInput = Helper.OpenWavFile(@"whatstheweatherlike.wav"))
    {
        // Creates a speech recognizer using audio stream input.
        using (var recognizer = new SpeechRecognizer(config, audioInput))
        {
            // Subscribes to events.
            recognizer.Recognizing += (s, e) =>
            {
                // You can uncomment the line below, but the console will be flooded
                // with partial (cumulative) hypotheses if you have a file like mine (80 MB).
                //Console.WriteLine($"RECOGNIZING: Text={e.Result.Text}");
                sb.Append(e.Result.Text);
            };
            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    Console.WriteLine($"RECOGNIZED: Text={e.Result.Text}");
                    File.AppendAllText("test.txt", e.Result.Text);
                }
                else if (e.Result.Reason == ResultReason.NoMatch)
                {
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                }
            };
            recognizer.Canceled += (s, e) =>
            {
                Console.WriteLine($"CANCELED: Reason={e.Reason}");
                if (e.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                    Console.WriteLine($"CANCELED: Did you update the subscription info?");
                }
                stopRecognition.TrySetResult(0);
            };
            recognizer.SessionStarted += (s, e) =>
            {
                Console.WriteLine("\nSession started event.");
            };
            recognizer.SessionStopped += (s, e) =>
            {
                // Note: SessionEventArgs has no Result property, so appending
                // e.Result.Text here does not compile; the final phrases were
                // already appended to test.txt in the Recognized handler.
                Console.WriteLine("\nSession stopped event.");
                Console.WriteLine("\nStop recognition.");
                stopRecognition.TrySetResult(0);
            };

            // Starts continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition.
            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

            // Waits for completion.
            // Use Task.WaitAny to keep the task rooted.
            Task.WaitAny(new[] { stopRecognition.Task });

            // Stops recognition.
            await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
        }
    }
}
Helper class:
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using System;
using System.Diagnostics;
using System.IO;

namespace MicrosoftSpeechSDKSamples
{
    public class Helper
    {
        public static AudioConfig OpenWavFile(string filename)
        {
            BinaryReader reader = new BinaryReader(File.OpenRead(filename));
            return OpenWavFile(reader);
        }

        public static AudioConfig OpenWavFile(BinaryReader reader)
        {
            AudioStreamFormat format = readWaveHeader(reader);
            return AudioConfig.FromStreamInput(new BinaryAudioStreamReader(reader), format);
        }

        public static BinaryAudioStreamReader CreateWavReader(string filename)
        {
            BinaryReader reader = new BinaryReader(File.OpenRead(filename));
            // Read the wave header so the header bytes won't show up in the following reads.
            AudioStreamFormat format = readWaveHeader(reader);
            return new BinaryAudioStreamReader(reader);
        }

        public static AudioStreamFormat readWaveHeader(BinaryReader reader)
        {
            // Tag "RIFF"
            char[] data = new char[4];
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'R') && (data[1] == 'I') && (data[2] == 'F') && (data[3] == 'F'), "Wrong wav header");

            // Chunk size
            long fileSize = reader.ReadInt32();

            // Subchunk, Wave Header
            // Subchunk, Format
            // Tag: "WAVE"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'W') && (data[1] == 'A') && (data[2] == 'V') && (data[3] == 'E'), "Wrong wav tag in wav header");

            // Tag: "fmt "
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'f') && (data[1] == 'm') && (data[2] == 't') && (data[3] == ' '), "Wrong format tag in wav header");

            // Chunk format size
            var formatSize = reader.ReadInt32();
            var formatTag = reader.ReadUInt16();
            var channels = reader.ReadUInt16();
            var samplesPerSecond = reader.ReadUInt32();
            var avgBytesPerSec = reader.ReadUInt32();
            var blockAlign = reader.ReadUInt16();
            var bitsPerSample = reader.ReadUInt16();

            // Until now we have read 16 bytes of the format; the rest is cbSize and is ignored for now.
            if (formatSize > 16)
                reader.ReadBytes((int)(formatSize - 16));

            // Second chunk, data
            // Tag: "data"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'd') && (data[1] == 'a') && (data[2] == 't') && (data[3] == 'a'), "Wrong data tag in wav");

            // Data chunk size
            int dataSize = reader.ReadInt32();

            // Now we have the format, and the reader is positioned at the start
            // of the body, i.e., the raw sample data.
            return AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, (byte)bitsPerSample, (byte)channels);
        }
    }
    /// <summary>
    /// Adapter class to the native stream API.
    /// </summary>
    public sealed class BinaryAudioStreamReader : PullAudioInputStreamCallback
    {
        private System.IO.BinaryReader _reader;

        /// <summary>
        /// Creates and initializes an instance of BinaryAudioStreamReader.
        /// </summary>
        /// <param name="reader">The underlying stream to read the audio data from. Note: The stream contains the bare sample data, not the container (like wave header data, etc).</param>
        public BinaryAudioStreamReader(System.IO.BinaryReader reader)
        {
            _reader = reader;
        }

        /// <summary>
        /// Creates and initializes an instance of BinaryAudioStreamReader.
        /// </summary>
        /// <param name="stream">The underlying stream to read the audio data from. Note: The stream contains the bare sample data, not the container (like wave header data, etc).</param>
        public BinaryAudioStreamReader(System.IO.Stream stream)
            : this(new System.IO.BinaryReader(stream))
        {
        }

        /// <summary>
        /// Reads binary data from the stream.
        /// </summary>
        /// <param name="dataBuffer">The buffer to fill</param>
        /// <param name="size">The size of data in the buffer.</param>
        /// <returns>The number of bytes filled, or 0 in case the stream hits its end and there is no more data available.
        /// If there is no data immediately available, Read() blocks until the next data becomes available.</returns>
        public override int Read(byte[] dataBuffer, uint size)
        {
            return _reader.Read(dataBuffer, 0, (int)size);
        }

        /// <summary>
        /// This method performs cleanup of resources.
        /// The Boolean parameter <paramref name="disposing"/> indicates whether the method is called from <see cref="IDisposable.Dispose"/> (if <paramref name="disposing"/> is true) or from the finalizer (if <paramref name="disposing"/> is false).
        /// Derived classes should override this method to dispose resource if needed.
        /// </summary>
        /// <param name="disposing">Flag to request disposal.</param>
        protected override void Dispose(bool disposing)
        {
            if (disposed)
            {
                return;
            }
            if (disposing)
            {
                _reader.Dispose();
            }
            disposed = true;
            base.Dispose(disposing);
        }

        private bool disposed = false;
    }
    /// <summary>
    /// Implements a custom class for PushAudioOutputStreamCallback.
    /// This is to receive the audio data when the synthesizer has produced audio data.
    /// </summary>
    public sealed class PushAudioOutputStreamSampleCallback : PushAudioOutputStreamCallback
    {
        private byte[] audioData;

        /// <summary>
        /// Constructor
        /// </summary>
        public PushAudioOutputStreamSampleCallback()
        {
            audioData = new byte[0];
        }

        /// <summary>
        /// A callback which is invoked when the synthesizer has an output audio chunk to write out
        /// </summary>
        /// <param name="dataBuffer">The output audio chunk sent by synthesizer</param>
        /// <returns>Tell synthesizer how many bytes are received</returns>
        public override uint Write(byte[] dataBuffer)
        {
            int oldSize = audioData.Length;
            Array.Resize(ref audioData, oldSize + dataBuffer.Length);
            for (int i = 0; i < dataBuffer.Length; ++i)
            {
                audioData[oldSize + i] = dataBuffer[i];
            }
            Console.WriteLine($"{dataBuffer.Length} bytes received.");
            return (uint)dataBuffer.Length;
        }

        /// <summary>
        /// A callback which is invoked when the synthesizer is about to close the stream
        /// </summary>
        public override void Close()
        {
            Console.WriteLine("Push audio output stream closed.");
        }

        /// <summary>
        /// Get the received audio data
        /// </summary>
        /// <returns>The received audio data in byte array</returns>
        public byte[] GetAudioData()
        {
            return audioData;
        }
    }
}
The sample code you provided uses RecognizeOnceAsync, which produces a single final recognition result at the first pause in speech. For long recordings, I recommend the StartContinuousRecognitionAsync and StopContinuousRecognitionAsync methods. See the sample code here: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/csharp/sharedcontent/console/speech_recognition_samples.cs
Let us know if this helps.
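As a minimal sketch of that approach (the key, region, and file path are placeholders to replace): note that AudioConfig.FromWavFileInput also works with continuous recognition, so the pull-stream Helper class above isn't strictly required.

using System;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

class ContinuousFromFile
{
    static async Task Main()
    {
        var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
        var stopRecognition = new TaskCompletionSource<int>();

        using (var audioInput = AudioConfig.FromWavFileInput(@"FilePath\MyWav.wav"))
        using (var recognizer = new SpeechRecognizer(config, audioInput))
        {
            // Each Recognized event carries one final phrase; collect them all.
            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                    Console.WriteLine(e.Result.Text);
            };

            // The session stops once the end of the file is reached.
            recognizer.SessionStopped += (s, e) => stopRecognition.TrySetResult(0);
            recognizer.Canceled += (s, e) => stopRecognition.TrySetResult(0);

            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
            await stopRecognition.Task; // wait until the whole file has been processed
            await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
        }
    }
}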
A bit late, but maybe it will help someone else. In my project we used the Batch transcription API.
https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/batch-transcription
The workflow is simple:
- You upload your file to your Blob storage.
- Get a reference to your file (blob) together with a SAS token; although it also works fine without one.
- Create a temporary SAS for the container you want the results in; the API will upload the results to this container (a sketch of this step follows below).
- And send a request to the API (see the sketch after the example request below).
It works well even with large files. I even uploaded part of an audiobook.
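For the container-SAS step, here is a minimal sketch, assuming the Azure.Storage.Blobs v12 package; the connection string and container name are placeholders.

using System;
using Azure.Storage.Blobs;
using Azure.Storage.Sas;

class ContainerSasExample
{
    static void Main()
    {
        // Connection string and container name are placeholders.
        var container = new BlobContainerClient(
            "YourStorageConnectionString", "transcription-results");
        container.CreateIfNotExists();

        // Temporary SAS so the Batch transcription API can write results here.
        // GenerateSasUri requires the client to be authorized with a shared key.
        Uri containerSas = container.GenerateSasUri(
            BlobContainerSasPermissions.Write | BlobContainerSasPermissions.List,
            DateTimeOffset.UtcNow.AddHours(12));

        Console.WriteLine(containerSas); // use this as destinationContainerUrl
    }
}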
Here's an example request:
{
  "contentUrls": [
    "{{path to audio blob}}"
  ],
  "properties": {
    "diarizationEnabled": false,
    "wordLevelTimestampsEnabled": false,
    "punctuationMode": "DictatedAndAutomatic",
    "profanityFilterMode": "Masked",
    "destinationContainerUrl": "{{path to your container with SAS token}}"
  },
  "locale": "en-US",
  "displayName": "Transcription using default model for en-US"
}
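And a minimal sketch of submitting that request with HttpClient, assuming the v3.0 transcriptions endpoint; the region, key, and JSON file name are placeholders.

using System;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

class BatchTranscriptionClient
{
    static async Task Main()
    {
        // Region and key are placeholders; the body is the example request above.
        string region = "westus";
        string key = "YourSubscriptionKey";
        string body = System.IO.File.ReadAllText("transcription-request.json");

        using (var http = new HttpClient())
        {
            http.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", key);

            var response = await http.PostAsync(
                $"https://{region}.api.cognitive.microsoft.com/speechtotext/v3.0/transcriptions",
                new StringContent(body, Encoding.UTF8, "application/json"));

            // On success the service answers 201 Created; the Location header
            // points to the transcription resource you can poll for status
            // until the result files appear in your destination container.
            Console.WriteLine(response.StatusCode);
            Console.WriteLine(response.Headers.Location);
        }
    }
}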