Python 中用于 Azure 认知语音翻译服务的自定义音频输入字节
Custom audio input bytes to azure cognitive speech translation service in Python
我需要能够翻译我可以从任何来源获得的自定义音频字节,并将语音翻译成我需要的语言(目前是印地语)。我一直在尝试使用 Python:
中的以下代码传递自定义音频字节
import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech.audio import AudioStreamFormat, PullAudioInputStream, PullAudioInputStreamCallback, AudioConfig, PushAudioInputStream
speech_key, service_region = "key", "region"
channels = 1
bitsPerSample = 16
samplesPerSecond = 16000
audioFormat = AudioStreamFormat(samplesPerSecond, bitsPerSample, channels)
class CustomPullAudioInputStreamCallback(PullAudioInputStreamCallback):
def __init__(self):
return super(CustomPullAudioInputStreamCallback, self).__init__()
def read(self, file_bytes):
print (len(file_bytes))
return len(file_bytes)
def close(self):
return super(CustomPullAudioInputStreamCallback, self).close()
class CustomPushAudioInputStream(PushAudioInputStream):
def write(self, file_bytes):
print (type(file_bytes))
return super(CustomPushAudioInputStream, self).write(file_bytes)
def close():
return super(CustomPushAudioInputStream, self).close()
translation_config = speechsdk.translation.SpeechTranslationConfig(subscription=speech_key, region=service_region)
fromLanguage = 'en-US'
toLanguage = 'hi'
translation_config.speech_recognition_language = fromLanguage
translation_config.add_target_language(toLanguage)
translation_config.voice_name = "hi-IN-Kalpana-Apollo"
pull_audio_input_stream_callback = CustomPullAudioInputStreamCallback()
# pull_audio_input_stream = PullAudioInputStream(pull_audio_input_stream_callback, audioFormat)
# custom_pull_audio_input_stream = CustomPushAudioInputStream(audioFormat)
audio_config = AudioConfig(use_default_microphone=False, stream=pull_audio_input_stream_callback)
recognizer = speechsdk.translation.TranslationRecognizer(translation_config=translation_config,
audio_config=audio_config)
def synthesis_callback(evt):
size = len(evt.result.audio)
print('AUDIO SYNTHESIZED: {} byte(s) {}'.format(size, '(COMPLETED)' if size == 0 else ''))
if size > 0:
t_sound_file = open("translated_output.wav", "wb+")
t_sound_file.write(evt.result.audio)
t_sound_file.close()
recognizer.stop_continuous_recognition_async()
def recognized_complete(evt):
if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
print("RECOGNIZED '{}': {}".format(fromLanguage, result.text))
print("TRANSLATED into {}: {}".format(toLanguage, result.translations['hi']))
elif evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
print("RECOGNIZED: {} (text could not be translated)".format(result.text))
elif evt.result.reason == speechsdk.ResultReason.NoMatch:
print("NOMATCH: Speech could not be recognized: {}".format(result.no_match_details))
elif evt.reason == speechsdk.ResultReason.Canceled:
print("CANCELED: Reason={}".format(result.cancellation_details.reason))
if result.cancellation_details.reason == speechsdk.CancellationReason.Error:
print("CANCELED: ErrorDetails={}".format(result.cancellation_details.error_details))
def receiving_bytes(audio_bytes):
# audio_bytes contain bytes of audio to be translated
recognizer.synthesizing.connect(synthesis_callback)
recognizer.recognized.connect(recognized_complete)
pull_audio_input_stream_callback.read(audio_bytes)
recognizer.start_continuous_recognition_async()
receiving_bytes(audio_bytes)
输出:
错误:AttributeError:'PullAudioInputStreamCallback' 对象没有属性 '_impl'
软件包及其版本:
Python 3.6.3
azure-cognitiveservices-speech 1.11.0
可以成功执行文件翻译,但我不想为收到的每个字节块保存文件。
我可以将自定义音频字节传递给 Azure 语音翻译服务并在 Python 中获得结果吗?如果是那么怎么办?
提供的示例代码使用回调作为 AudioConfig 的流参数,这似乎是不允许的。
此代码应该可以正常工作而不会引发错误:
pull_audio_input_stream_callback = CustomPullAudioInputStreamCallback()
pull_audio_input_stream = PullAudioInputStream(pull_stream_callback=pull_audio_input_stream_callback, stream_format=audioFormat)
audio_config = AudioConfig(use_default_microphone=False, stream=pull_audio_input_stream)
我自己找到了问题的解决方案。我认为它也适用于 PullAudioInputStream。但使用 PushAudioInputStream 对我有用。您不需要创建自定义 类 它会像下面这样工作:
import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech.audio import AudioStreamFormat, PullAudioInputStream, PullAudioInputStreamCallback, AudioConfig, PushAudioInputStream
from threading import Thread, Event
speech_key, service_region = "key", "region"
channels = 1
bitsPerSample = 16
samplesPerSecond = 16000
audioFormat = AudioStreamFormat(samplesPerSecond, bitsPerSample, channels)
translation_config = speechsdk.translation.SpeechTranslationConfig(subscription=speech_key, region=service_region)
fromLanguage = 'en-US'
toLanguage = 'hi'
translation_config.speech_recognition_language = fromLanguage
translation_config.add_target_language(toLanguage)
translation_config.voice_name = "hi-IN-Kalpana-Apollo"
# Remove Custom classes as they are not needed.
custom_push_stream = speechsdk.audio.PushAudioInputStream(stream_format=audioFormat)
audio_config = AudioConfig(stream=custom_push_stream)
recognizer = speechsdk.translation.TranslationRecognizer(translation_config=translation_config, audio_config=audio_config)
# Create an event
synthesis_done = Event()
def synthesis_callback(evt):
size = len(evt.result.audio)
print('AUDIO SYNTHESIZED: {} byte(s) {}'.format(size, '(COMPLETED)' if size == 0 else ''))
if size > 0:
t_sound_file = open("translated_output.wav", "wb+")
t_sound_file.write(evt.result.audio)
t_sound_file.close()
# Setting the event
synthesis_done.set()
def recognized_complete(evt):
if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
print("RECOGNIZED '{}': {}".format(fromLanguage, result.text))
print("TRANSLATED into {}: {}".format(toLanguage, result.translations['hi']))
elif evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
print("RECOGNIZED: {} (text could not be translated)".format(result.text))
elif evt.result.reason == speechsdk.ResultReason.NoMatch:
print("NOMATCH: Speech could not be recognized: {}".format(result.no_match_details))
elif evt.reason == speechsdk.ResultReason.Canceled:
print("CANCELED: Reason={}".format(result.cancellation_details.reason))
if result.cancellation_details.reason == speechsdk.CancellationReason.Error:
print("CANCELED: ErrorDetails={}".format(result.cancellation_details.error_details))
recognizer.synthesizing.connect(synthesis_callback)
recognizer.recognized.connect(recognized_complete)
# Read and get data from an audio file
open_audio_file = open("speech_wav_audio.wav", 'rb')
file_bytes = open_audio_file.read()
# Write the bytes to the stream
custom_push_stream.write(file_bytes)
custom_push_stream.close()
# Start the recognition
recognizer.start_continuous_recognition()
# Waiting for the event to complete
synthesis_done.wait()
# Once the event gets completed you can call Stop recognition
recognizer.stop_continuous_recognition()
自从 start_continuous_recognition
在不同的线程中启动以来,我一直使用线程中的事件,如果不使用线程,您将无法从回调事件中获取数据。 synthesis_done.wait
将通过等待事件完成来解决此问题,然后才会调用 stop_continuous_recognition
。获得音频字节后,您可以在 synthesis_callback
中做任何您想做的事。我简化了示例并从 wav 文件中获取字节。
我需要能够翻译我可以从任何来源获得的自定义音频字节,并将语音翻译成我需要的语言(目前是印地语)。我一直在尝试使用 Python:
中的以下代码传递自定义音频字节import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech.audio import AudioStreamFormat, PullAudioInputStream, PullAudioInputStreamCallback, AudioConfig, PushAudioInputStream
speech_key, service_region = "key", "region"
channels = 1
bitsPerSample = 16
samplesPerSecond = 16000
audioFormat = AudioStreamFormat(samplesPerSecond, bitsPerSample, channels)
class CustomPullAudioInputStreamCallback(PullAudioInputStreamCallback):
def __init__(self):
return super(CustomPullAudioInputStreamCallback, self).__init__()
def read(self, file_bytes):
print (len(file_bytes))
return len(file_bytes)
def close(self):
return super(CustomPullAudioInputStreamCallback, self).close()
class CustomPushAudioInputStream(PushAudioInputStream):
def write(self, file_bytes):
print (type(file_bytes))
return super(CustomPushAudioInputStream, self).write(file_bytes)
def close():
return super(CustomPushAudioInputStream, self).close()
translation_config = speechsdk.translation.SpeechTranslationConfig(subscription=speech_key, region=service_region)
fromLanguage = 'en-US'
toLanguage = 'hi'
translation_config.speech_recognition_language = fromLanguage
translation_config.add_target_language(toLanguage)
translation_config.voice_name = "hi-IN-Kalpana-Apollo"
pull_audio_input_stream_callback = CustomPullAudioInputStreamCallback()
# pull_audio_input_stream = PullAudioInputStream(pull_audio_input_stream_callback, audioFormat)
# custom_pull_audio_input_stream = CustomPushAudioInputStream(audioFormat)
audio_config = AudioConfig(use_default_microphone=False, stream=pull_audio_input_stream_callback)
recognizer = speechsdk.translation.TranslationRecognizer(translation_config=translation_config,
audio_config=audio_config)
def synthesis_callback(evt):
size = len(evt.result.audio)
print('AUDIO SYNTHESIZED: {} byte(s) {}'.format(size, '(COMPLETED)' if size == 0 else ''))
if size > 0:
t_sound_file = open("translated_output.wav", "wb+")
t_sound_file.write(evt.result.audio)
t_sound_file.close()
recognizer.stop_continuous_recognition_async()
def recognized_complete(evt):
if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
print("RECOGNIZED '{}': {}".format(fromLanguage, result.text))
print("TRANSLATED into {}: {}".format(toLanguage, result.translations['hi']))
elif evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
print("RECOGNIZED: {} (text could not be translated)".format(result.text))
elif evt.result.reason == speechsdk.ResultReason.NoMatch:
print("NOMATCH: Speech could not be recognized: {}".format(result.no_match_details))
elif evt.reason == speechsdk.ResultReason.Canceled:
print("CANCELED: Reason={}".format(result.cancellation_details.reason))
if result.cancellation_details.reason == speechsdk.CancellationReason.Error:
print("CANCELED: ErrorDetails={}".format(result.cancellation_details.error_details))
def receiving_bytes(audio_bytes):
# audio_bytes contain bytes of audio to be translated
recognizer.synthesizing.connect(synthesis_callback)
recognizer.recognized.connect(recognized_complete)
pull_audio_input_stream_callback.read(audio_bytes)
recognizer.start_continuous_recognition_async()
receiving_bytes(audio_bytes)
输出: 错误:AttributeError:'PullAudioInputStreamCallback' 对象没有属性 '_impl'
软件包及其版本:
Python 3.6.3 azure-cognitiveservices-speech 1.11.0
可以成功执行文件翻译,但我不想为收到的每个字节块保存文件。
我可以将自定义音频字节传递给 Azure 语音翻译服务并在 Python 中获得结果吗?如果是那么怎么办?
提供的示例代码使用回调作为 AudioConfig 的流参数,这似乎是不允许的。
此代码应该可以正常工作而不会引发错误:
pull_audio_input_stream_callback = CustomPullAudioInputStreamCallback()
pull_audio_input_stream = PullAudioInputStream(pull_stream_callback=pull_audio_input_stream_callback, stream_format=audioFormat)
audio_config = AudioConfig(use_default_microphone=False, stream=pull_audio_input_stream)
我自己找到了问题的解决方案。我认为它也适用于 PullAudioInputStream。但使用 PushAudioInputStream 对我有用。您不需要创建自定义 类 它会像下面这样工作:
import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech.audio import AudioStreamFormat, PullAudioInputStream, PullAudioInputStreamCallback, AudioConfig, PushAudioInputStream
from threading import Thread, Event
speech_key, service_region = "key", "region"
channels = 1
bitsPerSample = 16
samplesPerSecond = 16000
audioFormat = AudioStreamFormat(samplesPerSecond, bitsPerSample, channels)
translation_config = speechsdk.translation.SpeechTranslationConfig(subscription=speech_key, region=service_region)
fromLanguage = 'en-US'
toLanguage = 'hi'
translation_config.speech_recognition_language = fromLanguage
translation_config.add_target_language(toLanguage)
translation_config.voice_name = "hi-IN-Kalpana-Apollo"
# Remove Custom classes as they are not needed.
custom_push_stream = speechsdk.audio.PushAudioInputStream(stream_format=audioFormat)
audio_config = AudioConfig(stream=custom_push_stream)
recognizer = speechsdk.translation.TranslationRecognizer(translation_config=translation_config, audio_config=audio_config)
# Create an event
synthesis_done = Event()
def synthesis_callback(evt):
size = len(evt.result.audio)
print('AUDIO SYNTHESIZED: {} byte(s) {}'.format(size, '(COMPLETED)' if size == 0 else ''))
if size > 0:
t_sound_file = open("translated_output.wav", "wb+")
t_sound_file.write(evt.result.audio)
t_sound_file.close()
# Setting the event
synthesis_done.set()
def recognized_complete(evt):
if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
print("RECOGNIZED '{}': {}".format(fromLanguage, result.text))
print("TRANSLATED into {}: {}".format(toLanguage, result.translations['hi']))
elif evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
print("RECOGNIZED: {} (text could not be translated)".format(result.text))
elif evt.result.reason == speechsdk.ResultReason.NoMatch:
print("NOMATCH: Speech could not be recognized: {}".format(result.no_match_details))
elif evt.reason == speechsdk.ResultReason.Canceled:
print("CANCELED: Reason={}".format(result.cancellation_details.reason))
if result.cancellation_details.reason == speechsdk.CancellationReason.Error:
print("CANCELED: ErrorDetails={}".format(result.cancellation_details.error_details))
recognizer.synthesizing.connect(synthesis_callback)
recognizer.recognized.connect(recognized_complete)
# Read and get data from an audio file
open_audio_file = open("speech_wav_audio.wav", 'rb')
file_bytes = open_audio_file.read()
# Write the bytes to the stream
custom_push_stream.write(file_bytes)
custom_push_stream.close()
# Start the recognition
recognizer.start_continuous_recognition()
# Waiting for the event to complete
synthesis_done.wait()
# Once the event gets completed you can call Stop recognition
recognizer.stop_continuous_recognition()
自从 start_continuous_recognition
在不同的线程中启动以来,我一直使用线程中的事件,如果不使用线程,您将无法从回调事件中获取数据。 synthesis_done.wait
将通过等待事件完成来解决此问题,然后才会调用 stop_continuous_recognition
。获得音频字节后,您可以在 synthesis_callback
中做任何您想做的事。我简化了示例并从 wav 文件中获取字节。