如何在电脑播放音频前暂停语音识别(JS SpeechRecognition),播放完再恢复语音识别?

How to pause speech recognition (JS SpeechRecognition) before audio is played by the computer and then resume speech recognition after being played?

webkitSpeechRecognition有没有办法在电脑播放音频时暂停和恢复语音识别?现在的计算机似乎混淆了用户通过麦克风输入的内容和从 wav 文件输出的音频。

现在我创建了以下内容:

var speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; 
var recognition = new webkitSpeechRecognition();
    
window.addEventListener('DOMContentLoaded', function() {
    document.getElementById("speak_button").addEventListener('click', function() {
       recognition.start();
       setInterval(updateCountDown,1000); /* countdown timer starts 1 second after 
                                           being clicked */
       updateCountDown(); //this is a function that counts down from 2 minutes to 0
        
    });
});


var transcript; // transcript variable will store what the user says to the computer

recognition.addEventListener('result', e => {
    transcript = Array.from(e.results)
       .map(result => result[0])
       .map(result => result.transcript)
       .join('');
   console.log(transcript);
    communicateToUser();
  
});


function communicateToUser() {

    var audio_age = new Audio("age_20.wav");

        var age_regular_expression = /(?=.*\bhow\b)(?=.*\bold\b)(?=.*\byou\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bage\b)| (?=.*\btell\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\bhow\b)(?=.*\bold\b)|(?=.*\byou\b)(?=.*\bhow\b)(?=.*\bold\b)/gi;

    // if regular expression matches all words, then function will be performed

        if (age_regular_expression.test(transcript)) {
        recognition.stop(); /* wanting the speech recognition to stop here so that it 
                            doesn't capture the contents of audio_age */
        audio_age.play(); // audio will play "I am 20 years old" 
        recognition.start(); /* wanting the speech recognition to start again 
                            after audio_age is played */
        
    }

}

问题是 recognition.stop() 函数不起作用,这意味着麦克风将继续捕获 audio_age.wav 的内容并将其转换为文本。所以,当我想再次对着电脑说话并问它一个问题时,将要分析的抄本将包括我刚才讲话时的抄本。

如有任何建议,我们将不胜感激。

我正在考虑一个解决方案,但我不确定如何实施它:
解决方案: 停止识别功能并延迟与音频文件播放相同的秒数(例如5秒),然后在这5秒后再次启动识别功能?

谢谢!

为 CESARE 编辑:

// SPEECH RECOGNITION SET UP 

    var speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; 
    var recognition = new webkitSpeechRecognition();
        

    window.addEventListener('DOMContentLoaded', function() {
    document.getElementById("speak_button").addEventListener('click', function() {
            recognition.start();
            setInterval(updateCountDown,1000);
            updateCountDown();
        });
    });

// ALL OF THE AUDIO FILES --> WILL BE PLAYED IF REGEX MATCHES TRUE
    
    const audio_name = new Audio("name_harry.wav");
    
    const audio_age = new Audio("age_20.wav");
    
    const audio_date_of_birth = new Audio("15_nov_1999.wav");
    
    const audio_occupation = new Audio("grocery_store.wav");


// ON SPEECH START --> IF MICROPHONE INPUT IS DETECTED, THEN SPEECH RECOGNITION STARTS 
    
    recognition.onspeechstart = () => {
        console.log("SPEECH STARTED");
        if (!audio_age.paused) {audio_age.pause()}
        else if (!audio_name.paused) {audio_name.pause()}
        else if (!audio_date_of_birth.paused) {audio_date_of_birth.pause()}
        else if (!audio_occupation.paused) {audio_occupation.pause()}
  
    };
    
// ON SPEECH END --> WHEN MICROPHONE INPUT STOPS, SPEECH RECOGNITION SHOULD END 

    recognition.onspeechend = () => {
        console.log("SPEECH ENDED");
        recognition.stop();
    
    };
    
// I have included this because I want the computer to continue listening to the user, but only after the audio is finished playing 

    recognition.addEventListener('end', recognition.start);

// After audio is ended, speech recognition will start again
    
    audio_name.addEventListener('ended', recognition.start);
    audio_age.addEventListener('ended', recognition.start);
    audio_date_of_birth.addEventListener('ended', recognition.start);
    audio_occupation.addEventListener('ended', recognition.start);
    audio_height.addEventListener('ended', recognition.start);
    
    
// USED TO OBTAIN THE USER TRANSCRIPT/ACTUAL SPEECH CONTENT

    var transcript;
    
    recognition.addEventListener('result', e => {
        transcript = Array.from(e.results)
           .map((result) => result[0])
           .map((result) => result.transcript)
           .join('');
       console.log(transcript);
       communicateToUser();
      
    });
    
 




     // ALL OF THE REGULAR EXPRESSIONS

    const name_regex = /what is your name|(?=.*\byour\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bcan\b)(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\blet\b)(?=.*\bknow\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bshare\b)(?=.*\bme\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bfirst\b)(?=.*\band\b)(?=.*\blast\b)(?=.*\bname\b)/ig;

const age_regex = /(?=.*\bhow\b)(?=.*\bold\b)(?=.*\byou\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bage\b)| (?=.*\btell\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\bhow\b)(?=.*\bold\b)|(?=.*\byou\b)(?=.*\bhow\b)(?=.*\bold\b)/gi;

const date_of_birth_regex = /(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\byour\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bshare\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bwhen\b)(?=.*\byou\b)(?=.*\bborn\b)|(?=.*\bwhen\b)(?=.*\byou\b)(?=.*\bborn\b)|(?=.*\bwhat\b)(?=.*\bdate\b)(?=.*\byou\b)(?=.*\bborn\b)/gi

const patient_occupation = /do you have a job|(?=.*\bdo\b)(?=.*\byou\b)(?=.*\bwork\b)|(?=.*\byou\b)(?=.*\bhave\b)(?=.*\bjob\b)|(?=.*\byou\b)(?=.*\bwork\b)(?=.*\bwhere\b)|(?=.*\banything\b)(?=.*\bfor\b)(?=.*\bwork\b)|(?=.*\byou\b)(?=.*\bwork\b)(?=.*\banywhere\b)|(?=.*\bwhat\b)(?=.*\boccupation\b)|(?=.*\byour\b)(?=.*\boccupation\b)|(?=.*\byou\b)(?=.*\boccupation\b)|(?=.*\byour\b)(?=.*\bjob\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bjob\b)|(?=.*\byou\b)(?=.*\bjob\b)|(?=.*\bjob\b)/ig;

// COMMUNICATE BACK TO USER FUNCTION
 

       function communicateToUser() {
    
    if (name_regex.test(transcript)) {
            audio_name.play();
    }

    if (age_regex.test(transcript)) {
            audio_age.play();
    }
    if (date_of_birth_regex.test(transcript)) {
                audio_date_of_birth.play();
    }
    if (occuptation_regex.test(transcript)) {
                    audio_occupation.play();
        }
    
    }
         

更新倒计时函数

function updateCountDown() {
   
   const minutes = Math.floor(time / 60);
   let seconds = time % 60;

   seconds = seconds < 2 ? '0' + seconds : seconds;

   document.getElementById("countdown").innerHTML = `${minutes}:${seconds}`;

   time--;

   time = time < 0 ? 0 : time; 

    if (minutes == 0 && seconds == 0) {
        document.getElementById('tableStyle').style.display = "block";
        recognition.stop(); //ADDING IN RECOGNITION.STOP ONCE MINUTES AND SECONDS == 0!
        
    }

   };

编辑:

我做了一个工作示例,https://stackblitz.com/edit/web-platform-ppcuh9?file=index.html:

let isListening = false; // use this flag to toggle the recognition
let interval;
const button = document.getElementById('speak_button');

const speaker = new MakeSpeechSynth({
  pitch: 0.5,
  rate: 0.8,
  language: 'en-US',
});

const SpeechRecognition =
  window.SpeechRecognition || window.webkitSpeechRecognition;
const recognition = new SpeechRecognition();

button.addEventListener('click', function() {
  if (isListening) {
    console.log('ABORTING RECOGNITION');
    isListening = false;
    recognition.abort();
    clearInterval(interval);
    button.innerText = 'Click Me To Speak';
  } else {
    console.log('STARTING RECOGNITION');
    recognition.start();
    interval = setInterval(updateCountDown, 1000);
    updateCountDown();
    button.innerText = 'Stop Recognition';
    isListening = true;
  }
});

recognition.onaudiostart = () => {
  console.log('RECOGNITION STARTED');
};

recognition.onaudioend = () => {
  console.log('RECOGNITION FINISHED');
};

recognition.onend = () => {
  console.log('RECOGNITION DISCONNECTED');
  if (isListening) recognition.start();
};

recognition.onspeechstart = () => {
  console.log('SPEECH STARTED');
  // You can stop the bot speaking if you want when you speak over him:
  // Comment if you want him to keep speaking

  //Object.values(data).forEach((d) => d.audio.pause());
  if (speaker.isSpeaking) speaker.cancel();
};

recognition.onspeechend = () => {
  console.log('SPEECH ENDED');
};

recognition.addEventListener('result', (e) => {
  const transcript = Array.from(e.results)
    .map((result) => result[0])
    .map((result) => result.transcript)
    .join('');
  console.log(transcript);
  speakBackToMe(transcript);
});

function speakBackToMe(str) {
  Object.values(data).forEach((d) => {
    if (d.regex.test(str)) {
      // d.audio.play();
      speaker.speak(d.message);
      console.log(d.message);
    }
  });
}

// UPDATE COUNTDOWN
const startingMinutes = 2;
let time = startingMinutes * 60;

function updateCountDown() {
  const minutes = Math.floor(time / 60);
  let seconds = time % 60;
  seconds = seconds < 2 ? '0' + seconds : seconds;
  document.getElementById('countdown').innerHTML = `${minutes}:${seconds}`;
  time--;
  time = time < 0 ? 0 : time;

  if (minutes == 0 && seconds == 0) {
    document.getElementById('tableStyle').style.display = 'table-cell';
  }
}
<div id="app"></div>
<button id="speak_button">Click Me to Speak</button>
<p id="countdown"></p>