(Unity 5.6) (IBM Watson SDK) 将用户语音解析为文本,然后再解析为语音
(Unity 5.6) (IBM Watson SDK) Parsing user speech into text and then back into speech
我正在尝试使用新的 [IBM Watson SDK for Unity] 获取用户的语音输入,将其解析为文本,然后把该文本传回 Watson 的文本转语音服务,让它把用户说过的内容复述出来。
我已经使用 SDK 的 Speech To Text widget(从麦克风获取音频片段并解析为文本),配合它的 Speech Display widget(接收语音转文本得到的文本,并显示在场景的 canvas 上),在屏幕上显示用户的语音。然后我创建了一个 UI 输入字段,将其与 SpeechDisplayWidget 的文本输出以及 TextToSpeechWidget 的输入关联起来,又制作了一个 UI 按钮,单击该按钮时会把文本发送到文本转语音服务。它应该播放对应的语音,但是当我单击该按钮时,什么也没有发生。'status' 显示的始终是 "ready",从不改变,整个过程也没有抛出任何错误。我本想粘贴代码,但我还没有编写任何代码,只是连接了所有必要的序列化字段,然后点击了运行。我该怎么办?
抱歉描述得比较含糊,但由于这是我的第一个问题,我无法发布更多图片或链接。 :/
这是我尝试使用的 "texttoSpeechWidget" 的代码。
using UnityEngine;
using UnityEngine.UI;
using IBM.Watson.DeveloperCloud.Services.TextToSpeech.v1;
using IBM.Watson.DeveloperCloud.Logging;
using IBM.Watson.DeveloperCloud.DataTypes;
using System.Collections.Generic;
using IBM.Watson.DeveloperCloud.Utilities;
#pragma warning disable 414
namespace IBM.Watson.DeveloperCloud.Widgets
{
/// <summary>
/// TextToSpeech widget class wraps the TextToSpeech service.
/// Text arrives either through the "Text" widget input or through the optional UI
/// input field and button. Every request is queued, and the resulting clips are
/// played one at a time on the AudioSource attached to the same GameObject.
/// </summary>
[RequireComponent(typeof(AudioSource))]
public class TextToSpeechWidget : Widget
{
    #region Inputs
    [SerializeField]
    private Input m_TextInput = new Input("Text", typeof(TextToSpeechData), "OnTextInput");
    [SerializeField]
    private Input m_VoiceInput = new Input("Voice", typeof(VoiceData), "OnVoiceSelect");
    #endregion

    #region Outputs
    [SerializeField]
    private Output m_Speaking = new Output(typeof(SpeakingStateData), true);
    [SerializeField]
    private Output m_DisableMic = new Output(typeof(DisableMicData));
    [SerializeField]
    private Output m_LevelOut = new Output(typeof(LevelData));
    #endregion

    #region Private Data
    // Names passed to Invoke/InvokeRepeating/CancelInvoke; they must match the method names below.
    private const string EndSpeechMethod = "OnEndSpeech";
    private const string LevelOutMethod = "OnLevelOut";

    // Field names are left untouched on purpose: [SerializeField] data is stored by field name.
    TextToSpeech m_TextToSpeech = new TextToSpeech();
    [SerializeField, Tooltip("How often to send level out data in seconds.")]
    private float m_LevelOutInterval = 0.05f;
    [SerializeField]
    private float m_LevelOutputModifier = 1.0f;
    [SerializeField]
    private Button m_TextToSpeechButton = null;
    [SerializeField]
    private InputField m_Input = null;
    [SerializeField]
    private Text m_StatusText = null;
    [SerializeField]
    private VoiceType m_Voice = VoiceType.en_US_Michael;
    [SerializeField]
    private bool m_UsePost = false;
    private AudioSource m_Source = null;
    private int m_LastPlayPos = 0;

    /// <summary>
    /// One pending text-to-speech request. The request is sent from the constructor;
    /// <see cref="Ready"/> becomes true once the service callback has delivered a clip
    /// (the clip may be null, which the player treats as "skip").
    /// </summary>
    private class Speech
    {
        ~Speech()
        {
            // The clip is handed to UnityObjectUtil rather than destroyed directly.
            // NOTE(review): presumably because finalizers do not run on Unity's main
            // thread and the destroy queue started in OnEnable does the actual work - confirm.
            if (Clip != null)
                UnityObjectUtil.DestroyUnityObject(Clip);
        }

        /// <summary>True once the callback for this request has run.</summary>
        public bool Ready { get; set; }

        /// <summary>The clip delivered by the service callback; null until then.</summary>
        public AudioClip Clip { get; set; }

        /// <summary>Sends <paramref name="text"/> to the service right away.</summary>
        /// <param name="textToSpeech">Service wrapper used to issue the request.</param>
        /// <param name="text">Text to synthesize.</param>
        /// <param name="usePost">Forwarded unchanged to TextToSpeech.ToSpeech.</param>
        public Speech(TextToSpeech textToSpeech, string text, bool usePost)
        {
            textToSpeech.ToSpeech(text, OnAudioClip, usePost);
        }

        private void OnAudioClip(AudioClip clip)
        {
            Clip = clip;
            Ready = true;
        }
    }

    private Queue<Speech> m_SpeechQueue = new Queue<Speech>();
    private Speech m_ActiveSpeech = null;
    #endregion

    #region Public Members
    /// <summary>
    /// Gets or sets the voice. Default voice is English, US - Michael
    /// </summary>
    /// <value>The voice.</value>
    public VoiceType Voice
    {
        get
        {
            return m_Voice;
        }
        set
        {
            m_Voice = value;
        }
    }
    #endregion

    #region Event Handlers
    /// <summary>
    /// Button event handler. Queues the current content of the input field for synthesis
    /// and switches the UI to the "THINKING" state. Does nothing (apart from logging a
    /// warning) when there is no input field or the field is empty, so the UI is never
    /// left waiting for a request that was not sent.
    /// </summary>
    public void OnTextToSpeech()
    {
        if (m_Input == null || string.IsNullOrEmpty(m_Input.text))
        {
            Log.Warning("TextToSpeechWidget", "OnTextToSpeech called without any input text; nothing queued.");
            return;
        }

        EnqueueSpeech(m_Input.text);

        if (m_StatusText != null)
            m_StatusText.text = "THINKING";
        if (m_TextToSpeechButton != null)
            m_TextToSpeechButton.interactable = false;
    }
    #endregion

    #region Private Functions
    /// <summary>
    /// Applies the currently selected voice to the service and queues a new request.
    /// Shared by the button handler and the widget text input.
    /// </summary>
    /// <param name="text">Non-empty text to synthesize.</param>
    private void EnqueueSpeech(string text)
    {
        if (m_TextToSpeech.Voice != m_Voice)
            m_TextToSpeech.Voice = m_Voice;
        m_SpeechQueue.Enqueue(new Speech(m_TextToSpeech, text, m_UsePost));
    }

    /// <summary>
    /// Handler for the "Text" widget input. Empty text is ignored.
    /// </summary>
    /// <param name="data">Expected to be a TextToSpeechData instance.</param>
    /// <exception cref="WatsonException">The data is not TextToSpeechData.</exception>
    private void OnTextInput(Data data)
    {
        TextToSpeechData text = data as TextToSpeechData;
        if (text == null)
            throw new WatsonException("Wrong data type received.");

        if (!string.IsNullOrEmpty(text.Text))
            EnqueueSpeech(text.Text);
    }

    /// <summary>
    /// Handler for the "Voice" widget input; stores the voice used for later requests.
    /// </summary>
    /// <param name="data">Expected to be a VoiceData instance.</param>
    /// <exception cref="WatsonException">The data is not VoiceData.</exception>
    private void OnVoiceSelect(Data data)
    {
        VoiceData voice = data as VoiceData;
        if (voice == null)
            throw new WatsonException("Unexpected data type");
        m_Voice = voice.Voice;
    }

    private void OnEnable()
    {
        UnityObjectUtil.StartDestroyQueue();
        if (m_StatusText != null)
            m_StatusText.text = "READY";
    }

    /// <exclude />
    protected override void Start()
    {
        base.Start();
        m_Source = GetComponent<AudioSource>();
    }

    /// <summary>
    /// Starts playback of the next queued clip once the previous one has finished, and
    /// returns the UI to "READY" when no request is waiting any more.
    /// </summary>
    private void Update()
    {
        if (m_Source != null && !m_Source.isPlaying
            && m_SpeechQueue.Count > 0
            && m_SpeechQueue.Peek().Ready)
        {
            m_ActiveSpeech = m_SpeechQueue.Dequeue();
            if (m_ActiveSpeech.Clip != null)
            {
                // Only drop the previous clip's pending end notification when a new clip
                // really starts; a skipped (null) clip must not swallow it, otherwise the
                // "speaking" / "mic disabled" state would never be released.
                CancelInvoke(EndSpeechMethod);

                if (m_Speaking.IsConnected)
                    m_Speaking.SendData(new SpeakingStateData(true));
                if (m_DisableMic.IsConnected)
                    m_DisableMic.SendData(new DisableMicData(true));

                m_Source.spatialBlend = 0.0f;           // 2D sound
                m_Source.loop = false;                  // do not loop
                m_Source.clip = m_ActiveSpeech.Clip;    // clip
                m_Source.Play();

                // Clip duration in seconds plus a small margin.
                Invoke(EndSpeechMethod, ((float)m_ActiveSpeech.Clip.samples / (float)m_ActiveSpeech.Clip.frequency) + 0.1f);

                if (m_LevelOut.IsConnected)
                {
                    m_LastPlayPos = 0;
                    // Avoid stacking a second repeating timer when clips play back-to-back.
                    CancelInvoke(LevelOutMethod);
                    InvokeRepeating(LevelOutMethod, m_LevelOutInterval, m_LevelOutInterval);
                }
            }
            else
            {
                Log.Warning("TextToSpeechWidget", "Skipping null AudioClip");
            }
        }

        // Restore the idle UI only when nothing is waiting for the service any more.
        // Doing this unconditionally every frame would overwrite the "THINKING" state
        // set by OnTextToSpeech on the very next frame.
        if (m_SpeechQueue.Count == 0)
        {
            if (m_TextToSpeechButton != null)
                m_TextToSpeechButton.interactable = true;
            if (m_StatusText != null)
                m_StatusText.text = "READY";
        }
    }

    /// <summary>
    /// Repeating callback: sends the peak of the samples played since the last call to
    /// the level output, and stops itself once playback has ended.
    /// </summary>
    private void OnLevelOut()
    {
        if (m_Source != null && m_Source.isPlaying)
        {
            int currentPos = m_Source.timeSamples;
            if (currentPos > m_LastPlayPos)
            {
                float[] samples = new float[currentPos - m_LastPlayPos];
                m_Source.clip.GetData(samples, m_LastPlayPos);
                // NOTE(review): Mathf.Max takes the largest signed sample, not the largest
                // magnitude - confirm that is what the LevelData consumers expect.
                m_LevelOut.SendData(new LevelData(Mathf.Max(samples) * m_LevelOutputModifier, m_LevelOutputModifier));
                m_LastPlayPos = currentPos;
            }
        }
        else
            CancelInvoke(LevelOutMethod);
    }

    /// <summary>
    /// Invoked shortly after the active clip should have finished: signals the end of
    /// speech on the outputs, stops the source if it is still playing and clears the
    /// active request.
    /// </summary>
    private void OnEndSpeech()
    {
        if (m_Speaking.IsConnected)
            m_Speaking.SendData(new SpeakingStateData(false));
        if (m_DisableMic.IsConnected)
            m_DisableMic.SendData(new DisableMicData(false));
        if (m_Source.isPlaying)
            m_Source.Stop();
        m_ActiveSpeech = null;
    }

    /// <exclude />
    protected override string GetName()
    {
        return "TextToSpeech";
    }
    #endregion
}
}
首先,您需要编写一些代码来实现它。
- 使用 Widget class 作为基础创建一个新的 widget。
将您的输入作为 SpeechToText 并将输出作为 TextToSpeech
// Widget input named "Text" that accepts SpeechToTextData; the third argument is the
// name of the handler method ("OnSpeech") that this widget must implement.
[SerializeField]
private Input m_SpeechInput = new Input("Text", typeof(SpeechToTextData), "OnSpeech");
// Widget output that carries TextToSpeechData.
// NOTE(review): the meaning of the second constructor argument (true) is not shown here - confirm against the Output class.
[SerializeField]
private Output m_SpeechOutput = new Output(typeof(TextToSpeechData), true);
</pre>
然后在您的小部件中添加 OnSpeech 函数,用来完成从 SpeechToTextData 到 TextToSpeechData 的转换:当输入的 SpeechToTextData 是最终结果时,用它的文本创建一个新的 TextToSpeechData 对象。
像这样在 OnSpeech 中使用您的输出;
// Send only when something is connected to this widget's output.
if (m_SpeechOutput.IsConnected)
m_SpeechOutput.SendData( /* object of type TextToSpeechData (the type m_SpeechOutput is declared with), built from the recognized text */ );
</pre>
确保场景中存在 SpeechToTextWidget 和 TextToSpeechWidget,并且它们都连接到这个新的小部件。
我正在尝试使用新的 [IBM Watson SDK for Unity] 获取用户的语音输入,将其解析为文本,然后把该文本传回 Watson 的文本转语音服务,让它把用户说过的内容复述出来。
我已经使用 SDK 的 Speech To Text widget(从麦克风获取音频片段并解析为文本),配合它的 Speech Display widget(接收语音转文本得到的文本,并显示在场景的 canvas 上),在屏幕上显示用户的语音。然后我创建了一个 UI 输入字段,将其与 SpeechDisplayWidget 的文本输出以及 TextToSpeechWidget 的输入关联起来,又制作了一个 UI 按钮,单击该按钮时会把文本发送到文本转语音服务。它应该播放对应的语音,但是当我单击该按钮时,什么也没有发生。'status' 显示的始终是 "ready",从不改变,整个过程也没有抛出任何错误。我本想粘贴代码,但我还没有编写任何代码,只是连接了所有必要的序列化字段,然后点击了运行。我该怎么办?
抱歉描述得比较含糊,但由于这是我的第一个问题,我无法发布更多图片或链接。 :/
这是我尝试使用的 "texttoSpeechWidget" 的代码。
using UnityEngine;
using UnityEngine.UI;
using IBM.Watson.DeveloperCloud.Services.TextToSpeech.v1;
using IBM.Watson.DeveloperCloud.Logging;
using IBM.Watson.DeveloperCloud.DataTypes;
using System.Collections.Generic;
using IBM.Watson.DeveloperCloud.Utilities;
#pragma warning disable 414
namespace IBM.Watson.DeveloperCloud.Widgets
{
/// <summary>
/// TextToSpeech widget class wraps the TextToSpeech service.
/// Text arrives either through the "Text" widget input or through the optional UI
/// input field and button. Every request is queued, and the resulting clips are
/// played one at a time on the AudioSource attached to the same GameObject.
/// </summary>
[RequireComponent(typeof(AudioSource))]
public class TextToSpeechWidget : Widget
{
    #region Inputs
    [SerializeField]
    private Input m_TextInput = new Input("Text", typeof(TextToSpeechData), "OnTextInput");
    [SerializeField]
    private Input m_VoiceInput = new Input("Voice", typeof(VoiceData), "OnVoiceSelect");
    #endregion

    #region Outputs
    [SerializeField]
    private Output m_Speaking = new Output(typeof(SpeakingStateData), true);
    [SerializeField]
    private Output m_DisableMic = new Output(typeof(DisableMicData));
    [SerializeField]
    private Output m_LevelOut = new Output(typeof(LevelData));
    #endregion

    #region Private Data
    // Names passed to Invoke/InvokeRepeating/CancelInvoke; they must match the method names below.
    private const string EndSpeechMethod = "OnEndSpeech";
    private const string LevelOutMethod = "OnLevelOut";

    // Field names are left untouched on purpose: [SerializeField] data is stored by field name.
    TextToSpeech m_TextToSpeech = new TextToSpeech();
    [SerializeField, Tooltip("How often to send level out data in seconds.")]
    private float m_LevelOutInterval = 0.05f;
    [SerializeField]
    private float m_LevelOutputModifier = 1.0f;
    [SerializeField]
    private Button m_TextToSpeechButton = null;
    [SerializeField]
    private InputField m_Input = null;
    [SerializeField]
    private Text m_StatusText = null;
    [SerializeField]
    private VoiceType m_Voice = VoiceType.en_US_Michael;
    [SerializeField]
    private bool m_UsePost = false;
    private AudioSource m_Source = null;
    private int m_LastPlayPos = 0;

    /// <summary>
    /// One pending text-to-speech request. The request is sent from the constructor;
    /// <see cref="Ready"/> becomes true once the service callback has delivered a clip
    /// (the clip may be null, which the player treats as "skip").
    /// </summary>
    private class Speech
    {
        ~Speech()
        {
            // The clip is handed to UnityObjectUtil rather than destroyed directly.
            // NOTE(review): presumably because finalizers do not run on Unity's main
            // thread and the destroy queue started in OnEnable does the actual work - confirm.
            if (Clip != null)
                UnityObjectUtil.DestroyUnityObject(Clip);
        }

        /// <summary>True once the callback for this request has run.</summary>
        public bool Ready { get; set; }

        /// <summary>The clip delivered by the service callback; null until then.</summary>
        public AudioClip Clip { get; set; }

        /// <summary>Sends <paramref name="text"/> to the service right away.</summary>
        /// <param name="textToSpeech">Service wrapper used to issue the request.</param>
        /// <param name="text">Text to synthesize.</param>
        /// <param name="usePost">Forwarded unchanged to TextToSpeech.ToSpeech.</param>
        public Speech(TextToSpeech textToSpeech, string text, bool usePost)
        {
            textToSpeech.ToSpeech(text, OnAudioClip, usePost);
        }

        private void OnAudioClip(AudioClip clip)
        {
            Clip = clip;
            Ready = true;
        }
    }

    private Queue<Speech> m_SpeechQueue = new Queue<Speech>();
    private Speech m_ActiveSpeech = null;
    #endregion

    #region Public Members
    /// <summary>
    /// Gets or sets the voice. Default voice is English, US - Michael
    /// </summary>
    /// <value>The voice.</value>
    public VoiceType Voice
    {
        get
        {
            return m_Voice;
        }
        set
        {
            m_Voice = value;
        }
    }
    #endregion

    #region Event Handlers
    /// <summary>
    /// Button event handler. Queues the current content of the input field for synthesis
    /// and switches the UI to the "THINKING" state. Does nothing (apart from logging a
    /// warning) when there is no input field or the field is empty, so the UI is never
    /// left waiting for a request that was not sent.
    /// </summary>
    public void OnTextToSpeech()
    {
        if (m_Input == null || string.IsNullOrEmpty(m_Input.text))
        {
            Log.Warning("TextToSpeechWidget", "OnTextToSpeech called without any input text; nothing queued.");
            return;
        }

        EnqueueSpeech(m_Input.text);

        if (m_StatusText != null)
            m_StatusText.text = "THINKING";
        if (m_TextToSpeechButton != null)
            m_TextToSpeechButton.interactable = false;
    }
    #endregion

    #region Private Functions
    /// <summary>
    /// Applies the currently selected voice to the service and queues a new request.
    /// Shared by the button handler and the widget text input.
    /// </summary>
    /// <param name="text">Non-empty text to synthesize.</param>
    private void EnqueueSpeech(string text)
    {
        if (m_TextToSpeech.Voice != m_Voice)
            m_TextToSpeech.Voice = m_Voice;
        m_SpeechQueue.Enqueue(new Speech(m_TextToSpeech, text, m_UsePost));
    }

    /// <summary>
    /// Handler for the "Text" widget input. Empty text is ignored.
    /// </summary>
    /// <param name="data">Expected to be a TextToSpeechData instance.</param>
    /// <exception cref="WatsonException">The data is not TextToSpeechData.</exception>
    private void OnTextInput(Data data)
    {
        TextToSpeechData text = data as TextToSpeechData;
        if (text == null)
            throw new WatsonException("Wrong data type received.");

        if (!string.IsNullOrEmpty(text.Text))
            EnqueueSpeech(text.Text);
    }

    /// <summary>
    /// Handler for the "Voice" widget input; stores the voice used for later requests.
    /// </summary>
    /// <param name="data">Expected to be a VoiceData instance.</param>
    /// <exception cref="WatsonException">The data is not VoiceData.</exception>
    private void OnVoiceSelect(Data data)
    {
        VoiceData voice = data as VoiceData;
        if (voice == null)
            throw new WatsonException("Unexpected data type");
        m_Voice = voice.Voice;
    }

    private void OnEnable()
    {
        UnityObjectUtil.StartDestroyQueue();
        if (m_StatusText != null)
            m_StatusText.text = "READY";
    }

    /// <exclude />
    protected override void Start()
    {
        base.Start();
        m_Source = GetComponent<AudioSource>();
    }

    /// <summary>
    /// Starts playback of the next queued clip once the previous one has finished, and
    /// returns the UI to "READY" when no request is waiting any more.
    /// </summary>
    private void Update()
    {
        if (m_Source != null && !m_Source.isPlaying
            && m_SpeechQueue.Count > 0
            && m_SpeechQueue.Peek().Ready)
        {
            m_ActiveSpeech = m_SpeechQueue.Dequeue();
            if (m_ActiveSpeech.Clip != null)
            {
                // Only drop the previous clip's pending end notification when a new clip
                // really starts; a skipped (null) clip must not swallow it, otherwise the
                // "speaking" / "mic disabled" state would never be released.
                CancelInvoke(EndSpeechMethod);

                if (m_Speaking.IsConnected)
                    m_Speaking.SendData(new SpeakingStateData(true));
                if (m_DisableMic.IsConnected)
                    m_DisableMic.SendData(new DisableMicData(true));

                m_Source.spatialBlend = 0.0f;           // 2D sound
                m_Source.loop = false;                  // do not loop
                m_Source.clip = m_ActiveSpeech.Clip;    // clip
                m_Source.Play();

                // Clip duration in seconds plus a small margin.
                Invoke(EndSpeechMethod, ((float)m_ActiveSpeech.Clip.samples / (float)m_ActiveSpeech.Clip.frequency) + 0.1f);

                if (m_LevelOut.IsConnected)
                {
                    m_LastPlayPos = 0;
                    // Avoid stacking a second repeating timer when clips play back-to-back.
                    CancelInvoke(LevelOutMethod);
                    InvokeRepeating(LevelOutMethod, m_LevelOutInterval, m_LevelOutInterval);
                }
            }
            else
            {
                Log.Warning("TextToSpeechWidget", "Skipping null AudioClip");
            }
        }

        // Restore the idle UI only when nothing is waiting for the service any more.
        // Doing this unconditionally every frame would overwrite the "THINKING" state
        // set by OnTextToSpeech on the very next frame.
        if (m_SpeechQueue.Count == 0)
        {
            if (m_TextToSpeechButton != null)
                m_TextToSpeechButton.interactable = true;
            if (m_StatusText != null)
                m_StatusText.text = "READY";
        }
    }

    /// <summary>
    /// Repeating callback: sends the peak of the samples played since the last call to
    /// the level output, and stops itself once playback has ended.
    /// </summary>
    private void OnLevelOut()
    {
        if (m_Source != null && m_Source.isPlaying)
        {
            int currentPos = m_Source.timeSamples;
            if (currentPos > m_LastPlayPos)
            {
                float[] samples = new float[currentPos - m_LastPlayPos];
                m_Source.clip.GetData(samples, m_LastPlayPos);
                // NOTE(review): Mathf.Max takes the largest signed sample, not the largest
                // magnitude - confirm that is what the LevelData consumers expect.
                m_LevelOut.SendData(new LevelData(Mathf.Max(samples) * m_LevelOutputModifier, m_LevelOutputModifier));
                m_LastPlayPos = currentPos;
            }
        }
        else
            CancelInvoke(LevelOutMethod);
    }

    /// <summary>
    /// Invoked shortly after the active clip should have finished: signals the end of
    /// speech on the outputs, stops the source if it is still playing and clears the
    /// active request.
    /// </summary>
    private void OnEndSpeech()
    {
        if (m_Speaking.IsConnected)
            m_Speaking.SendData(new SpeakingStateData(false));
        if (m_DisableMic.IsConnected)
            m_DisableMic.SendData(new DisableMicData(false));
        if (m_Source.isPlaying)
            m_Source.Stop();
        m_ActiveSpeech = null;
    }

    /// <exclude />
    protected override string GetName()
    {
        return "TextToSpeech";
    }
    #endregion
}
}
首先,您需要编写一些代码来实现它。
- 使用 Widget class 作为基础创建一个新的 widget。
将您的输入作为 SpeechToText 并将输出作为 TextToSpeech
// Two serialized fields for the new widget: an input named "Text" that accepts SpeechToTextData and names "OnSpeech" as its handler, and an output that carries TextToSpeechData.
[SerializeField] private Input m_SpeechInput = new Input("Text", typeof(SpeechToTextData), "OnSpeech"); [SerializeField] private Output m_SpeechOutput = new Output(typeof(TextToSpeechData), true); </pre>
然后在您的小部件中添加 OnSpeech 函数,用来完成从 SpeechToTextData 到 TextToSpeechData 的转换:当输入的 SpeechToTextData 是最终结果时,用它的文本创建一个新的 TextToSpeechData 对象。
像这样在 OnSpeech 中使用您的输出;
// Send only when something is connected to this widget's output.
if (m_SpeechOutput.IsConnected) m_SpeechOutput.SendData( /* object of type TextToSpeechData (the type m_SpeechOutput is declared with), built from the recognized text */ ); </pre>
确保场景中存在 SpeechToTextWidget 和 TextToSpeechWidget,并且它们都连接到这个新的小部件。