在 python 中使用 Azure 语音服务读取音频文件并转换为文本,但只有第一句被转换为语音

Reading audio file and converting into text using Azure Speech services in python, but only the first sentence is converted into speech

下面是代码,

import json
import os
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import azure.cognitiveservices.speech as speechsdk

def main(filename):
    container_name="test-container"
            print(filename)
    blob_service_client = BlobServiceClient.from_connection_string("DefaultEndpoint")
    container_client=blob_service_client.get_container_client(container_name)
    blob_client = container_client.get_blob_client(filename)
    with open(filename, "wb") as f:
        data = blob_client.download_blob()
        data.readinto(f)

    speech_key, service_region = "1234567", "eastus"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    audio_input = speechsdk.audio.AudioConfig(filename=filename)
    print("Audio Input:-",audio_input)
  
    speech_config.speech_recognition_language="en-US"
    speech_config.request_word_level_timestamps()
    speech_config.enable_dictation()
    speech_config.output_format = speechsdk.OutputFormat(1)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)
    print("speech_recognizer:-",speech_recognizer)
    #result = speech_recognizer.recognize_once()
    all_results = []

    def handle_final_result(evt):
        all_results.append(evt.result.text)  
    done = False 

    def stop_cb(evt):
        #print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        global done
        done= True

    #Appends the recognized text to the all_results variable. 
    speech_recognizer.recognized.connect(handle_final_result) 
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    
    
    #while not done:
        #time.sleep(.5)
    
    print("Printing all results from speech to text:")
    print(all_results)


    
main(filename="test.wav")

从主函数调用时出错,

test.wav
Audio Input:- <azure.cognitiveservices.speech.audio.AudioConfig object at 0x00000204D72F4E88>
speech_recognizer:- <azure.cognitiveservices.speech.SpeechRecognizer object at 0x00000204D7065148>
[]

预期输出(不使用主函数的输出)

test.wav
Audio Input:- <azure.cognitiveservices.speech.audio.AudioConfig object at 0x00000204D72F4E88>
speech_recognizer:- <azure.cognitiveservices.speech.SpeechRecognizer object at 0x00000204D7065148>
Printing all results from speech to text:
['hi', '', '', 'Uh.', 'A good laugh.', '1487', "OK, OK, I think that's enough.", '']

如果我们不使用 main 函数,现有代码可以完美运行,但是当我使用 main 函数调用它时,我没有得到所需的输出。遗漏部分请指导。

如文章 here、recognize_once_async() 中所述(您正在使用的方法)- 此方法只会从检测到的语音开头开始的输入中检测可识别的话语直到下一次暂停。

据我了解,如果您使用 start_continuous_recognition(),您的要求将得到满足。启动函数将启动并继续处理所有话语,直到您调用停止函数。

这个方法有很多事件与之相关,“已识别”事件在语音识别过程发生时触发。您需要有一个事件处理程序来处理识别和提取文本。您可以参考文章here了解更多信息。

分享一个使用 start_continuous_recognition() 将音频转换为文本的示例片段。

import azure.cognitiveservices.speech as speechsdk
import time
import datetime

# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
speech_key, service_region = "YOURSUBSCRIPTIONKEY", "YOURREGION"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

# Creates an audio configuration that points to an audio file.
# Replace with your own audio filename.
audio_filename = "sample.wav"
audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)

# Creates a recognizer with the given settings
speech_config.speech_recognition_language="en-US"
speech_config.request_word_level_timestamps()
speech_config.enable_dictation()
speech_config.output_format = speechsdk.OutputFormat(1)

speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

#result = speech_recognizer.recognize_once()
all_results = []



#https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
def handle_final_result(evt):
    all_results.append(evt.result.text) 
    
    
done = False

def stop_cb(evt):
    print('CLOSING on {}'.format(evt))
    speech_recognizer.stop_continuous_recognition()
    global done
    done= True

#Appends the recognized text to the all_results variable. 
speech_recognizer.recognized.connect(handle_final_result) 

#Connect callbacks to the events fired by the speech recognizer & displays the info/status
#Ref:https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.eventsignal?view=azure-python   
speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
# stop continuous recognition on either session stopped or canceled events
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)

speech_recognizer.start_continuous_recognition()

while not done:
    time.sleep(.5)
    
print("Printing all results:")
print(all_results)

示例输出:


通过函数调用同样的方法

封装在一个函数中并尝试调用它。

只是做了一些调整并封装在一个函数中。确保非本地访问变量“完成”。 请检查并告诉我

import azure.cognitiveservices.speech as speechsdk
import time
import datetime

def speech_to_text():
    
    # Creates an instance of a speech config with specified subscription key and service region.
    # Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
    speech_key, service_region = "<>", "<>"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Creates an audio configuration that points to an audio file.
    # Replace with your own audio filename.
    audio_filename = "whatstheweatherlike.wav"
    audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)

    # Creates a recognizer with the given settings
    speech_config.speech_recognition_language="en-US"
    speech_config.request_word_level_timestamps()
    speech_config.enable_dictation()
    speech_config.output_format = speechsdk.OutputFormat(1)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

    #result = speech_recognizer.recognize_once()
    all_results = []



    #https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
    def handle_final_result(evt):
        all_results.append(evt.result.text) 
    
    
    done = False

    def stop_cb(evt):
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done= True

    #Appends the recognized text to the all_results variable. 
    speech_recognizer.recognized.connect(handle_final_result) 

    #Connect callbacks to the events fired by the speech recognizer & displays the info/status
    #Ref:https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.eventsignal?view=azure-python   
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()

    while not done:
        time.sleep(.5)
            
    print("Printing all results:")
    print(all_results)

#calling the conversion through a function    
speech_to_text()