如何将实时音频 url 传递到 Google 语音到文本 API
How to pass live audio url to Google Speech to Text API
我有一个 url 的现场录音,我正在尝试使用 Google 语音转文本 API 进行转录。我正在使用 Cloud Speech to Text API 中的示例代码。但是,问题是当我通过实时 url 时,我没有收到任何输出。以下是我的代码的相关部分。任何帮助将不胜感激!
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
import io
import os
import time
import requests
import numpy as np
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from urllib.request import urlopen
from datetime import datetime
from datetime import timedelta
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= "app_creds.json"
def get_stream():
stream = urlopen('streamurl')
duration = 60
begin = datetime.now()
duration = timedelta(seconds=duration)
while datetime.now() - begin < duration:
data = stream.read(8000)
return data
def transcribe_streaming():
"""Streams transcription of the given audio file."""
client = speech.SpeechClient()
content = get_stream()
# In practice, stream should be a generator yielding chunks of audio data.
stream = [content]
requests = (types.StreamingRecognizeRequest(audio_content=chunk)
for chunk in stream)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code='en-US')
streaming_config = types.StreamingRecognitionConfig(config=config)
# streaming_recognize returns a generator.
responses = client.streaming_recognize(streaming_config, requests)
for response in responses:
# Once the transcription has settled, the first result will contain the
# is_final result. The other results will be for subsequent portions of
# the audio.
for result in response.results:
print('Finished: {}'.format(result.is_final))
print('Stability: {}'.format(result.stability))
alternatives = result.alternatives
# The alternatives are ordered from most likely to least.
for alternative in alternatives:
print('Confidence: {}'.format(alternative.confidence))
print(u'Transcript: {}'.format(alternative.transcript))
尝试使用:
import urllib
urllib.urlretrieve ("http://www.example.com/songs/mp3.mp3", "mp3.mp3")
(对于 Python 3+ 使用导入 urllib.request
和 urllib.request.urlretrieve
)
我以前用过的部分代码,不知道是否有帮助:
def live_recognize_loop(self):
client = self.client
def is_running():
return self.recording
while self.recording:
with MicrophoneStream(RATE, CHUNK) as stream:
audio_generator = stream.generator(is_running)
requests = (types.StreamingRecognizeRequest(audio_content=content) for content in audio_generator)
responses = client.streaming_recognize(client.custom_streaming_config, requests)
responses_iterator = iter(responses)
while self.recording:
try:
response = next(responses_iterator)
except StopIteration:
break
except OutOfRange:
# Exception 400 - Exceeded maximum allowed stream duration of 65 seconds.
self.user_display(self.intermediateFrame.GetMessageText())
break # Start over
except ServiceUnavailable as e:
# Exception 503 - Getting metadata from plugin failed
self.log("{0} - NOT RECOGNIZED - {1}\n".format(self.getDate(), e))
break
except ResourceExhausted as e:
break
except GoogleAPICallError as e:
break
if response.results:
result = response.results[0]
if result.alternatives:
transcript = result.alternatives[0].transcript
self.intermediateFrame.SetMessageText(transcript)
if not result.is_final:
self.intermediateFrame.Display()
# print(transcript)
else:
self.user_display(transcript)
self.intermediateFrame.Display(False)
self.intermediateFrame.SetMessageText("")
#print("\t\t FINAL: %s" % transcript)
break # Start over
麦克风流class
from __future__ import division
import pyaudio
from six.moves import queue
class MicrophoneStream(object):
"""Opens a recording stream as a generator yielding the audio chunks."""
def __init__(self, rate, chunk):
self._rate = rate
self._chunk = chunk
# Create a thread-safe buffer of audio data
self._buff = queue.Queue()
self.closed = True
def __enter__(self):
self._audio_interface = pyaudio.PyAudio()
self._audio_stream = self._audio_interface.open(
format=pyaudio.paInt16,
# The API currently only supports 1-channel (mono) audio
channels=1, rate=self._rate,
input=True, frames_per_buffer=self._chunk,
# Run the audio stream asynchronously to fill the buffer object.
# This is necessary so that the input device's buffer doesn't
# overflow while the calling thread makes network requests, etc.
stream_callback=self._fill_buffer,
)
self.closed = False
return self
def __exit__(self, type, value, traceback):
self._audio_stream.stop_stream()
self._audio_stream.close()
self.closed = True
# Signal the generator to terminate so that the client's
# streaming_recognize method will not block the process termination.
self._buff.put(None)
self._audio_interface.terminate()
def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
"""Continuously collect data from the audio stream, into the buffer."""
self._buff.put(in_data)
return None, pyaudio.paContinue
def generator(self, is_running=None):
while not self.closed:
# Use a blocking get() to ensure there's at least one chunk of
# data, and stop iteration if the chunk is None, indicating the
# end of the audio stream.
chunk = self._buff.get()
if callable(is_running) and not is_running():
return
if chunk is None:
return
data = [chunk]
# Now consume whatever other data's still buffered.
while True:
try:
chunk = self._buff.get(block=False)
if chunk is None:
return
data.append(chunk)
except queue.Empty:
break
yield b''.join(data)
向 Google 语音服务发送音频时,请确保服务对象设置与音频编码匹配。在您的特定情况下
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code='en-US')
对应单声道,16KHz,线性16位PCM编码。如果您需要转录不同格式的音频,请参阅 list of other supported encodings。
我有一个 url 的现场录音,我正在尝试使用 Google 语音转文本 API 进行转录。我正在使用 Cloud Speech to Text API 中的示例代码。但是,问题是当我通过实时 url 时,我没有收到任何输出。以下是我的代码的相关部分。任何帮助将不胜感激!
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
import io
import os
import time
import requests
import numpy as np
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from urllib.request import urlopen
from datetime import datetime
from datetime import timedelta
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= "app_creds.json"
def get_stream():
stream = urlopen('streamurl')
duration = 60
begin = datetime.now()
duration = timedelta(seconds=duration)
while datetime.now() - begin < duration:
data = stream.read(8000)
return data
def transcribe_streaming():
"""Streams transcription of the given audio file."""
client = speech.SpeechClient()
content = get_stream()
# In practice, stream should be a generator yielding chunks of audio data.
stream = [content]
requests = (types.StreamingRecognizeRequest(audio_content=chunk)
for chunk in stream)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code='en-US')
streaming_config = types.StreamingRecognitionConfig(config=config)
# streaming_recognize returns a generator.
responses = client.streaming_recognize(streaming_config, requests)
for response in responses:
# Once the transcription has settled, the first result will contain the
# is_final result. The other results will be for subsequent portions of
# the audio.
for result in response.results:
print('Finished: {}'.format(result.is_final))
print('Stability: {}'.format(result.stability))
alternatives = result.alternatives
# The alternatives are ordered from most likely to least.
for alternative in alternatives:
print('Confidence: {}'.format(alternative.confidence))
print(u'Transcript: {}'.format(alternative.transcript))
尝试使用:
import urllib
urllib.urlretrieve ("http://www.example.com/songs/mp3.mp3", "mp3.mp3")
(对于 Python 3+ 使用导入 urllib.request
和 urllib.request.urlretrieve
)
我以前用过的部分代码,不知道是否有帮助:
def live_recognize_loop(self):
client = self.client
def is_running():
return self.recording
while self.recording:
with MicrophoneStream(RATE, CHUNK) as stream:
audio_generator = stream.generator(is_running)
requests = (types.StreamingRecognizeRequest(audio_content=content) for content in audio_generator)
responses = client.streaming_recognize(client.custom_streaming_config, requests)
responses_iterator = iter(responses)
while self.recording:
try:
response = next(responses_iterator)
except StopIteration:
break
except OutOfRange:
# Exception 400 - Exceeded maximum allowed stream duration of 65 seconds.
self.user_display(self.intermediateFrame.GetMessageText())
break # Start over
except ServiceUnavailable as e:
# Exception 503 - Getting metadata from plugin failed
self.log("{0} - NOT RECOGNIZED - {1}\n".format(self.getDate(), e))
break
except ResourceExhausted as e:
break
except GoogleAPICallError as e:
break
if response.results:
result = response.results[0]
if result.alternatives:
transcript = result.alternatives[0].transcript
self.intermediateFrame.SetMessageText(transcript)
if not result.is_final:
self.intermediateFrame.Display()
# print(transcript)
else:
self.user_display(transcript)
self.intermediateFrame.Display(False)
self.intermediateFrame.SetMessageText("")
#print("\t\t FINAL: %s" % transcript)
break # Start over
麦克风流class
from __future__ import division
import pyaudio
from six.moves import queue
class MicrophoneStream(object):
"""Opens a recording stream as a generator yielding the audio chunks."""
def __init__(self, rate, chunk):
self._rate = rate
self._chunk = chunk
# Create a thread-safe buffer of audio data
self._buff = queue.Queue()
self.closed = True
def __enter__(self):
self._audio_interface = pyaudio.PyAudio()
self._audio_stream = self._audio_interface.open(
format=pyaudio.paInt16,
# The API currently only supports 1-channel (mono) audio
channels=1, rate=self._rate,
input=True, frames_per_buffer=self._chunk,
# Run the audio stream asynchronously to fill the buffer object.
# This is necessary so that the input device's buffer doesn't
# overflow while the calling thread makes network requests, etc.
stream_callback=self._fill_buffer,
)
self.closed = False
return self
def __exit__(self, type, value, traceback):
self._audio_stream.stop_stream()
self._audio_stream.close()
self.closed = True
# Signal the generator to terminate so that the client's
# streaming_recognize method will not block the process termination.
self._buff.put(None)
self._audio_interface.terminate()
def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
"""Continuously collect data from the audio stream, into the buffer."""
self._buff.put(in_data)
return None, pyaudio.paContinue
def generator(self, is_running=None):
while not self.closed:
# Use a blocking get() to ensure there's at least one chunk of
# data, and stop iteration if the chunk is None, indicating the
# end of the audio stream.
chunk = self._buff.get()
if callable(is_running) and not is_running():
return
if chunk is None:
return
data = [chunk]
# Now consume whatever other data's still buffered.
while True:
try:
chunk = self._buff.get(block=False)
if chunk is None:
return
data.append(chunk)
except queue.Empty:
break
yield b''.join(data)
向 Google 语音服务发送音频时,请确保服务对象设置与音频编码匹配。在您的特定情况下
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code='en-US')
对应单声道,16KHz,线性16位PCM编码。如果您需要转录不同格式的音频,请参阅 list of other supported encodings。