How to run wake word detection with pocket sphinx on iOS?
I'm trying to run wake word detection with PocketSphinx on iOS. As a basis I'm using TLSphinx, and speech-to-text works (the STT isn't great, but it recognizes words).
I extended decoder.swift with a new function:
public func detectWakeWord(complete: @escaping (Bool?) -> ()) throws {
    ps_set_keyphrase(psDecoder, "keyphrase_search", "ZWEI")
    ps_set_search(psDecoder, "keyphrase_search")

    do {
        if #available(iOS 10.0, *) {
            try AVAudioSession.sharedInstance().setCategory(.playAndRecord, mode: .voiceChat, options: [])
        } else {
            try AVAudioSession.sharedInstance().setCategory(.playAndRecord)
        }
    } catch let error as NSError {
        print("Error setting the shared AVAudioSession: \(error)")
        throw DecodeErrors.CantSetAudioSession(error)
    }

    engine = AVAudioEngine()
    let input = engine.inputNode
    let mixer = AVAudioMixerNode()
    let output = engine.outputNode
    engine.attach(mixer)
    engine.connect(input, to: mixer, format: input.outputFormat(forBus: 0))
    engine.connect(mixer, to: output, format: input.outputFormat(forBus: 0))

    // We force-unwrap these because the docs for AVAudioFormat specify that this initializer
    // only returns nil when the channel count is greater than 2.
    let formatIn = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: 44100, channels: 1, interleaved: false)!
    let formatOut = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 16000, channels: 1, interleaved: false)!

    guard let bufferMapper = AVAudioConverter(from: formatIn, to: formatOut) else {
        // Returns nil if the format conversion is not possible.
        throw DecodeErrors.CantConvertAudioFormat
    }

    mixer.installTap(onBus: 0, bufferSize: 2048, format: formatIn, block: {
        [unowned self] (buffer: AVAudioPCMBuffer!, time: AVAudioTime!) in

        guard let sphinxBuffer = AVAudioPCMBuffer(pcmFormat: formatOut, frameCapacity: buffer.frameCapacity) else {
            // Returns nil in the following cases:
            // - if the format has zero bytes per frame (format.streamDescription->mBytesPerFrame == 0)
            // - if the buffer byte capacity (frameCapacity * format.streamDescription->mBytesPerFrame)
            //   cannot be represented by a uint32_t
            print("Can't create PCM buffer")
            return
        }

        // This is needed because the 'frameLength' default value is 0 (since iOS 10) and causes the 'convert'
        // call to fail with an error (Error Domain=NSOSStatusErrorDomain Code=-50 "(null)")
        // More here:
        sphinxBuffer.frameLength = sphinxBuffer.frameCapacity

        var error: NSError?
        let inputBlock: AVAudioConverterInputBlock = { inNumPackets, outStatus in
            outStatus.pointee = AVAudioConverterInputStatus.haveData
            return buffer
        }
        bufferMapper.convert(to: sphinxBuffer, error: &error, withInputFrom: inputBlock)
        print("Error? ", error as Any)

        let audioData = sphinxBuffer.toData()
        self.process_raw(audioData)

        print("Process: \(buffer.frameLength) frames - \(audioData.count) bytes - sample time: \(time.sampleTime)")

        self.end_utt()
        let hypothesis = self.get_hyp()
        print("HYPOTHESIS: ", hypothesis)

        DispatchQueue.main.async {
            complete(hypothesis != nil)
        }

        self.start_utt()
    })

    start_utt()

    do {
        try engine.start()
    } catch let error as NSError {
        end_utt()
        print("Can't start AVAudioEngine: \(error)")
        throw DecodeErrors.CantStartAudioEngine(error)
    }
}
There is no error, but hypothesis is always nil.
My dictionary maps everything to "ZWEI", so if anything is detected at all, the wake word should be detected:
ZWEI AH P Z EH TS B AAH EX
ZWEI(2) HH IH T
ZWEI(3) F EH EX Q OE F EH N T L IH CC T
ZWEI(4) G AX V AH EX T AX T
...
ZWEI(12113) N AY NZWO B IIH T AX N
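To sanity-check that the dictionary is actually loaded and really contains "ZWEI", one could query the decoder directly. This is only a sketch: it assumes the psDecoder pointer is accessible at this point and that ps_lookup_word from the PocketSphinx C API is exposed through TLSphinx's bridging header.

// Hypothetical diagnostic: ask the decoder whether "ZWEI" is a known word.
if let pron = ps_lookup_word(psDecoder, "ZWEI") {
    print("ZWEI is in the dictionary: \(String(cString: pron))")
} else {
    print("ZWEI is NOT in the dictionary")
}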
Does anyone know why the hypothesis is always nil?
I had to call self.get_hyp() before self.end_utt(). I'm not sure why, but the call order is different from the speech-to-text case.
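In other words, the relevant part of the tap block ends up looking roughly like this (a sketch based on the code above; the audio conversion code stays unchanged):

self.process_raw(audioData)
let hypothesis = self.get_hyp()   // read the hypothesis while the utterance is still open
self.end_utt()
DispatchQueue.main.async {
    complete(hypothesis != nil)
}
self.start_utt()                  // reopen the utterance for the next buffer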
EDIT
Another hint: for better wake-word detection quality, increase the buffer size of the microphone input. For example:
mixer.installTap(onBus: 0, bufferSize: 8192, format: formatIn, block: [...]