Using data from a server instead of a file to transcribe with the Microsoft Azure Speech SDK
I am trying to send data to the Azure Speech SDK for transcription. I want it to receive the data from a Python script, put it into a buffer, and then transcribe it continuously.
I am using the following sample from the Azure Speech SDK:
def speech_recognition_with_pull_stream():
    """gives an example how to use a pull audio stream to recognize speech from a custom audio
    source"""
    class WavFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
        """Example class that implements the Pull Audio Stream interface to recognize speech from
        an audio file"""
        def __init__(self, filename: str):
            super().__init__()
            self._file_h = wave.open(filename, mode=None)
            self.sample_width = self._file_h.getsampwidth()

            assert self._file_h.getnchannels() == 1
            assert self._file_h.getsampwidth() == 2
            assert self._file_h.getframerate() == 16000
            assert self._file_h.getcomptype() == 'NONE'

        def read(self, buffer: memoryview) -> int:
            """read callback function"""
            size = buffer.nbytes
            frames = self._file_h.readframes(size // self.sample_width)
            buffer[:len(frames)] = frames
            return len(frames)

        def close(self):
            """close callback function"""
            self._file_h.close()

    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # specify the audio format
    wave_format = speechsdk.audio.AudioStreamFormat(samples_per_second=16000, bits_per_sample=16,
                                                    channels=1)

    # setup the audio stream
    callback = WavFileReaderCallback(weatherfilename)
    stream = speechsdk.audio.PullAudioInputStream(callback, wave_format)
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    # instantiate the speech recognizer with pull stream input
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))

    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

    speech_recognizer.stop_continuous_recognition()
Instead of getting the data from an audio file via 'callback = WavFileReaderCallback(weatherfilename)', I want to get it from another Python script that sends the data:
tcp_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
.
.
.
tcp_client.sendall(bytes(data))
How can I get this data into a buffer for the Speech SDK to transcribe? Any guidance is appreciated, thank you.
I'm Darren from the Speech SDK team. Please take a look at the speech_recognition_with_push_stream Python sample in the Speech SDK GitHub repository:
https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/054b4783de9d52f28109c435bf90e073513fec97/samples/python/console/speech_sample.py#L417
I think that is what you are looking for.
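For the TCP scenario in your question, a minimal sketch of the push-stream approach might look like the following (the server socket, port 5000, chunk size, and the speech_key/service_region placeholders are assumptions, not part of the official sample): the receiving side writes whatever bytes arrive on the socket into a PushAudioInputStream, and the recognizer consumes them as they come in.

import socket
import azure.cognitiveservices.speech as speechsdk

speech_key, service_region = "YourSubscriptionKey", "YourServiceRegion"  # placeholders

speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

# Raw PCM format being sent over the socket (16 kHz, 16-bit, mono assumed).
wave_format = speechsdk.audio.AudioStreamFormat(samples_per_second=16000,
                                                bits_per_sample=16, channels=1)

# Push stream: you write bytes into it whenever they arrive from the network.
push_stream = speechsdk.audio.PushAudioInputStream(stream_format=wave_format)
audio_config = speechsdk.audio.AudioConfig(stream=push_stream)
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                               audio_config=audio_config)

speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
speech_recognizer.start_continuous_recognition()

# Accept the connection from the script that calls tcp_client.sendall(...).
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(('0.0.0.0', 5000))   # assumed port
server.listen(1)
conn, _ = server.accept()

try:
    while True:
        chunk = conn.recv(4096)      # raw audio bytes from the sending script
        if not chunk:
            break                    # sender closed the connection
        push_stream.write(chunk)     # hand the bytes to the Speech SDK
finally:
    push_stream.close()              # signals end of the audio stream
    speech_recognizer.stop_continuous_recognition()
    conn.close()
    server.close()

Your existing tcp_client.sendall(bytes(data)) code can stay as it is; just make sure it sends headerless PCM matching the format declared above.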
Depending on your data-availability model, an alternative might be speech_recognition_with_pull_stream:
https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/054b4783de9d52f28109c435bf90e073513fec97/samples/python/console/speech_sample.py#L346
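If the pull model fits better, a comparable sketch (same placeholder key/region and assumed port) replaces the WavFileReaderCallback from your sample with a callback whose read() pulls bytes straight from the accepted socket connection:

import socket
import azure.cognitiveservices.speech as speechsdk

speech_key, service_region = "YourSubscriptionKey", "YourServiceRegion"  # placeholders

class SocketReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
    """Pull-stream callback that reads raw PCM from a connected socket."""
    def __init__(self, conn: socket.socket):
        super().__init__()
        self._conn = conn

    def read(self, buffer: memoryview) -> int:
        # The SDK asks for up to buffer.nbytes bytes; block until some arrive.
        data = self._conn.recv(buffer.nbytes)
        buffer[:len(data)] = data
        return len(data)             # returning 0 tells the SDK the stream ended

    def close(self):
        self._conn.close()

# Accept the connection from the sending script (assumed port).
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(('0.0.0.0', 5000))
server.listen(1)
conn, _ = server.accept()

speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
wave_format = speechsdk.audio.AudioStreamFormat(samples_per_second=16000,
                                                bits_per_sample=16, channels=1)
stream = speechsdk.audio.PullAudioInputStream(SocketReaderCallback(conn), wave_format)
audio_config = speechsdk.audio.AudioConfig(stream=stream)
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                               audio_config=audio_config)
# Wire up events and call start_continuous_recognition() exactly as in the
# speech_recognition_with_pull_stream sample quoted in the question.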
If you need more help, please feel free to open a GitHub issue:
https://github.com/Azure-Samples/cognitive-services-speech-sdk/issues
Thanks,
Darren