Python 中的 Google 云语音转文本:将翻译和时间保存到 JSON
Google cloud speech to text in Python: Save translation and time to JSON
我正在使用标准解决方案进行带有时间戳的语音到文本处理(请参见下面的代码)。我从一个相关问题中了解到,可以向 gcloud 命令行工具添加参数,例如 --format=json。
一般问题:我如何在 google.cloud.speech 中指定这些参数?我似乎无法在 Google 网站上找到任何关于如何使用 Python 执行此操作的文档。
具体问题:我现在的目标是写出一个字典样式的 JSON 文件,其中包含所有单词的条目,以及每个单词的开始和结束时间。我知道我可以自己编写一个 hacky 的解决方案,但如果已经存在现成的选项,那将是更可取的。
代码:
def transcribe_file_with_word_time_offsets(speech_file, language):
    """Transcribe an audio file synchronously and print per-word time offsets.

    Args:
        speech_file: Path of the local audio file. It is read in binary mode
            and sent inline as the recognition content.
        language: Language code passed as ``language_code`` in the
            recognition config (the script entry point uses ``'en-US'``).

    Returns:
        None. For every result, the transcript of its first alternative is
        printed, followed by each word with its start and end time in
        seconds.
    """
    # Local import so this snippet does not rely on a module-level
    # ``import io`` that is not shown alongside it.
    import io

    print("Start")
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    print("checking credentials")
    # NOTE(review): ``credentials`` is not defined in this snippet; it is
    # presumably a module-level object created elsewhere — confirm.
    client = speech.SpeechClient(credentials=credentials)
    print("Checked")

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()
    print("audio file read")
    audio = types.RecognitionAudio(content=content)

    print("config start")
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code=language,
        # Required for ``alternative.words`` to carry start/end times.
        enable_word_time_offsets=True)

    print("Recognizing:")
    response = client.recognize(config, audio)
    print("Recognized")

    for result in response.results:
        # Only the first alternative of each result is reported.
        alternative = result.alternatives[0]
        print('Transcript: {}'.format(alternative.transcript))

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            # Offsets come as separate seconds/nanos fields; combine them
            # into fractional seconds.
            print('Word: {}, start_time: {}, end_time: {}'.format(
                word,
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9))
if __name__ == '__main__':
    # Local import: no module-level ``import argparse`` is shown with this
    # snippet.
    import argparse

    # The module docstring doubles as the command-line help description.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(dest='path', help='Audio file to be recognized')
    args = parser.parse_args()
    transcribe_file_with_word_time_offsets(args.path, 'en-US')
这里是 hacky 解决方案:
...
# Collect every recognized word and its timing into three parallel lists
# (same index = same word) instead of printing them one by one.
transcript_dict = {'Word': [], 'start_time': [], 'end_time': []}
for result in response.results:
    # Only the first alternative of each result is used.
    alternative = result.alternatives[0]
    print('Transcript: {}'.format(alternative.transcript))
    for word_info in alternative.words:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        transcript_dict['Word'].append(word)
        # Combine the seconds/nanos fields into fractional seconds.
        transcript_dict['start_time'].append(
            start_time.seconds + start_time.nanos * 1e-9)
        transcript_dict['end_time'].append(
            end_time.seconds + end_time.nanos * 1e-9)
print(transcript_dict)
...
链接问题中使用 protobuf 的解决方案对我不起作用(2020 年 11 月),但它让我找到了这条评论,其中的方法对我使用的 Speech API 有效:
# Serialize the recognition response to JSON by calling ``to_json`` on the
# response's message class.
speech.types.RecognizeResponse.to_json(response)
# Alternatively, take the class from the instance so it need not be named.
type(response).to_json(response)
例子
from google.cloud import speech_v1 as speech
def transcribe_gcs(gcs_uri):
    """Run synchronous recognition on an audio file referenced by URI.

    Args:
        gcs_uri: URI of the audio file to recognize (the example passes a
            ``gs://`` Cloud Storage URI).

    Returns:
        The response object returned by ``SpeechClient.recognize``.
    """
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    # Only the language is configured; every other setting is left at the
    # library's default.
    config = speech.RecognitionConfig(
        language_code="en-US",
    )
    return client.recognize(config=config, audio=audio)
# Sample FLAC file referenced by a Cloud Storage (gs://) URI.
sample_audio_uri = "gs://cloud-samples-tests/speech/brooklyn.flac"
response = transcribe_gcs(sample_audio_uri)
# ``to_json`` is called on the response's own class, so that class does not
# have to be imported or named explicitly.
response_json = type(response).to_json(response)
print(response_json)
{
"results": [
{
"alternatives": [
{
"transcript": "how old is the Brooklyn Bridge",
"confidence": 0.98314303,
"words": []
}
],
"channelTag": 0
}
]
}
您可以尝试类似的方法:
from google.cloud import speech_v1p1beta1 as speech
import proto
client = speech.SpeechClient()
# The ``...`` arguments are placeholders; supply the real audio source and
# recognition settings here.
audio = speech.RecognitionAudio(...)
config = speech.RecognitionConfig(...)
# Start a long-running recognition operation and wait for its result.
operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result()
# Convert the response message into a plain Python dict.
response_dict = proto.Message.to_dict(response)
我正在使用标准解决方案进行带有时间戳的语音到文本处理(请参见下面的代码)。我从一个相关问题中了解到,可以向 gcloud 命令行工具添加参数,例如 --format=json。
一般问题:我如何在 google.cloud.speech 中指定这些参数?我似乎无法在 Google 网站上找到任何关于如何使用 Python 执行此操作的文档。
具体问题:我现在的目标是写出一个字典样式的 JSON 文件,其中包含所有单词的条目,以及每个单词的开始和结束时间。我知道我可以自己编写一个 hacky 的解决方案,但如果已经存在现成的选项,那将是更可取的。
代码:
def transcribe_file_with_word_time_offsets(speech_file, language):
    """Transcribe an audio file synchronously and print per-word time offsets.

    Args:
        speech_file: Path of the local audio file. It is read in binary mode
            and sent inline as the recognition content.
        language: Language code passed as ``language_code`` in the
            recognition config (the script entry point uses ``'en-US'``).

    Returns:
        None. For every result, the transcript of its first alternative is
        printed, followed by each word with its start and end time in
        seconds.
    """
    # Local import so this snippet does not rely on a module-level
    # ``import io`` that is not shown alongside it.
    import io

    print("Start")
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    print("checking credentials")
    # NOTE(review): ``credentials`` is not defined in this snippet; it is
    # presumably a module-level object created elsewhere — confirm.
    client = speech.SpeechClient(credentials=credentials)
    print("Checked")

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()
    print("audio file read")
    audio = types.RecognitionAudio(content=content)

    print("config start")
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code=language,
        # Required for ``alternative.words`` to carry start/end times.
        enable_word_time_offsets=True)

    print("Recognizing:")
    response = client.recognize(config, audio)
    print("Recognized")

    for result in response.results:
        # Only the first alternative of each result is reported.
        alternative = result.alternatives[0]
        print('Transcript: {}'.format(alternative.transcript))

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            # Offsets come as separate seconds/nanos fields; combine them
            # into fractional seconds.
            print('Word: {}, start_time: {}, end_time: {}'.format(
                word,
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9))
if __name__ == '__main__':
    # Local import: no module-level ``import argparse`` is shown with this
    # snippet.
    import argparse

    # The module docstring doubles as the command-line help description.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(dest='path', help='Audio file to be recognized')
    args = parser.parse_args()
    transcribe_file_with_word_time_offsets(args.path, 'en-US')
这里是 hacky 解决方案:
...
# Collect every recognized word and its timing into three parallel lists
# (same index = same word) instead of printing them one by one.
transcript_dict = {'Word': [], 'start_time': [], 'end_time': []}
for result in response.results:
    # Only the first alternative of each result is used.
    alternative = result.alternatives[0]
    print('Transcript: {}'.format(alternative.transcript))
    for word_info in alternative.words:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        transcript_dict['Word'].append(word)
        # Combine the seconds/nanos fields into fractional seconds.
        transcript_dict['start_time'].append(
            start_time.seconds + start_time.nanos * 1e-9)
        transcript_dict['end_time'].append(
            end_time.seconds + end_time.nanos * 1e-9)
print(transcript_dict)
...
链接问题中使用 protobuf 的解决方案对我不起作用(2020 年 11 月),但它让我找到了这条评论,其中的方法对我使用的 Speech API 有效:
# Serialize the recognition response to JSON by calling ``to_json`` on the
# response's message class.
speech.types.RecognizeResponse.to_json(response)
# Alternatively, take the class from the instance so it need not be named.
type(response).to_json(response)
例子
from google.cloud import speech_v1 as speech
def transcribe_gcs(gcs_uri):
    """Run synchronous recognition on an audio file referenced by URI.

    Args:
        gcs_uri: URI of the audio file to recognize (the example passes a
            ``gs://`` Cloud Storage URI).

    Returns:
        The response object returned by ``SpeechClient.recognize``.
    """
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    # Only the language is configured; every other setting is left at the
    # library's default.
    config = speech.RecognitionConfig(
        language_code="en-US",
    )
    return client.recognize(config=config, audio=audio)
# Sample FLAC file referenced by a Cloud Storage (gs://) URI.
sample_audio_uri = "gs://cloud-samples-tests/speech/brooklyn.flac"
response = transcribe_gcs(sample_audio_uri)
# ``to_json`` is called on the response's own class, so that class does not
# have to be imported or named explicitly.
response_json = type(response).to_json(response)
print(response_json)
{
"results": [
{
"alternatives": [
{
"transcript": "how old is the Brooklyn Bridge",
"confidence": 0.98314303,
"words": []
}
],
"channelTag": 0
}
]
}
您可以尝试类似的方法:
from google.cloud import speech_v1p1beta1 as speech
import proto
client = speech.SpeechClient()
# The ``...`` arguments are placeholders; supply the real audio source and
# recognition settings here.
audio = speech.RecognitionAudio(...)
config = speech.RecognitionConfig(...)
# Start a long-running recognition operation and wait for its result.
operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result()
# Convert the response message into a plain Python dict.
response_dict = proto.Message.to_dict(response)