AudioContext, getUserMedia, and websockets audio streaming

I am trying to make the simplest possible Javascript frontend that will let me receive audio from a user's microphone on a mouse click in the web browser using getUserMedia, modify it to a custom sample rate and a single channel, and stream it over a websocket to my server, where it will be relayed on to the Watson Speech API.

I have already built the websocket server using autobahn. I have been trying to make an updated client library drawing on whisper and ws-audio-api, but both libraries seem outdated and include a lot of functionality I don't need, which I am trying to filter out. I am using XAudioJS to resample the audio.

My current progress is in a Codepen. I am stuck and cannot find clearer examples.

  1. Whisper and ws-audio-api initialize the AudioContext on page load, which now causes an error in at least Chrome and iOS, since audio contexts must be initialized in response to a user interaction. I tried moving the AudioContext into the onClick event, but then I have to click twice to begin streaming. I am currently calling audio_context.resume() inside the onClick event, but that feels like a roundabout solution, and it leaves the page indicating that it is always recording even when it isn't, which may unsettle my users. How do I properly start recording on a click and stop it on a click? (A minimal sketch of the click-gated setup follows this list.)
  2. I have updated from the deprecated Navigator.getUserMedia() to MediaDevices.getUserMedia(), but I am not sure whether I also need to change the legacy vendor prefixes on lines 83-86 to match the new function?
  3. Most importantly, once I get the stream from getUserMedia, how do I properly resample it and forward it to the open websocket? I am a bit confused by the structure of bouncing the audio from one node to another, and I need help with lines 93-108.
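For reference, here is a minimal sketch of the click-gated setup I eventually landed on: the AudioContext is created only inside the click handler and closed when recording stops, so the browser's recording indicator matches the actual state. The button ids ('startBtn', 'stopBtn') are illustrative placeholders, not names used in the final code below.

// Sketch: create the AudioContext inside the user gesture, tear it down on stop
let ctx = null, micStream = null;

document.getElementById('startBtn').onclick = async function () {
  // Creating the context inside a click satisfies the user-gesture requirement
  ctx = new (window.AudioContext || window.webkitAudioContext)({ latencyHint: 'interactive' });
  micStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
  const source = ctx.createMediaStreamSource(micStream);
  // ...connect source to a processing node and start sending audio here
};

document.getElementById('stopBtn').onclick = function () {
  if (micStream) micStream.getTracks().forEach(function (t) { t.stop(); }); // release the microphone
  if (ctx) ctx.close(); // tears down the graph and clears the recording indicator
  ctx = null;
  micStream = null;
};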

I found help and was able to build a more modern JavaScript frontend based on the code from vin-ni's Google-Cloud-Speech-Node-Socket-Playground, which I tweaked slightly. Many of the existing audio streaming demos in 2021 are either outdated or have a ton of "extra" features which raise the barrier to getting started with websockets and audio streaming. I created this "bare bones" script which reduces the audio streaming down to only four key functions:

  1. Open the web socket
  2. Start streaming
  3. Resample the audio
  4. Stop streaming

Hopefully this KISS (Keep It Simple, Stupid) demo will help others get started with streaming audio faster than I did.

Here is my JavaScript frontend:

//================= CONFIG =================
// Global Variables
let websocket_uri = 'ws://127.0.0.1:9001';
let bufferSize = 4096,
    AudioContext,
    context,
    processor,
    input,
    globalStream,
    streamStreaming, // true while audio is being captured and streamed
    websocket;

// Initialize WebSocket
initWebSocket();

//================= RECORDING =================
function startRecording() {
    streamStreaming = true;
    AudioContext = window.AudioContext || window.webkitAudioContext;
    context = new AudioContext({
      // if Non-interactive, use 'playback' or 'balanced' // https://developer.mozilla.org/en-US/docs/Web/API/AudioContextLatencyCategory
      latencyHint: 'interactive',
    });
    processor = context.createScriptProcessor(bufferSize, 1, 1); // mono in, mono out
    processor.connect(context.destination); // ScriptProcessor must be connected downstream or onaudioprocess may never fire
    context.resume();
  
    var handleSuccess = function (stream) {
      globalStream = stream;
      input = context.createMediaStreamSource(stream);
      input.connect(processor);
  
      processor.onaudioprocess = function (e) {
        var left = e.inputBuffer.getChannelData(0);
        var left16 = downsampleBuffer(left, context.sampleRate, 16000); // use the context's real rate (often 44100 or 48000), not a hard-coded 44100
        websocket.send(left16);
      };
    };
  
    navigator.mediaDevices.getUserMedia({audio: true, video: false}).then(handleSuccess);
} // closes function startRecording()

function stopRecording() {
    streamStreaming = false;
  
    let track = globalStream.getTracks()[0];
    track.stop();
  
    input.disconnect(processor);
    processor.disconnect(context.destination);
    context.close().then(function () {
      input = null;
      processor = null;
      context = null;
      AudioContext = null;
    });
} // closes function stopRecording()

function initWebSocket() {
    // Create WebSocket
    websocket = new WebSocket(websocket_uri);
    //console.log("Websocket created...");
  
    // WebSocket Definitions: executed when triggered webSocketStatus
    websocket.onopen = function() {
      console.log("connected to server");
      //websocket.send("CONNECTED TO YOU");
      document.getElementById("webSocketStatus").innerHTML = 'Connected';
    }
    
    websocket.onclose = function(e) {
      console.log("connection closed (" + e.code + ")");
      document.getElementById("webSocketStatus").innerHTML = 'Not Connected';
    }
    
    websocket.onmessage = function(e) {
      //console.log("message received: " + e.data);
      console.log(e.data);
  
      let result;
      try {
        result = JSON.parse(e.data);
      } catch (err) {
        $('.message').html('Error retrieving data: ' + err);
      }
  
      if (typeof(result) !== 'undefined' && typeof(result.error) !== 'undefined') {
        $('.message').html('Error: ' + result.error);
      }
      else {
        $('.message').html('Welcome!');
      }
    }
} // closes function initWebSocket()

function downsampleBuffer (buffer, sampleRate, outSampleRate) {
    if (outSampleRate == sampleRate) {
      return buffer;
    }
    if (outSampleRate > sampleRate) {
      throw 'downsampling rate should be smaller than original sample rate';
    }
    var sampleRateRatio = sampleRate / outSampleRate;
    var newLength = Math.round(buffer.length / sampleRateRatio);
    var result = new Int16Array(newLength);
    var offsetResult = 0;
    var offsetBuffer = 0;
    while (offsetResult < result.length) {
      var nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
      var accum = 0,
        count = 0;
      for (var i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
        accum += buffer[i];
        count++;
      }
  
      // Clamp to [-1, 1] before scaling to the 16-bit signed range
      result[offsetResult] = Math.max(-1, Math.min(1, accum / count)) * 0x7fff;
      offsetResult++;
      offsetBuffer = nextOffsetBuffer;
    }
    return result.buffer;
} // closes function downsampleBuffer()
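For a sense of what each websocket message carries, here is a rough size check (assuming a 44100 Hz context; the demo variable is only for illustration):

// One 4096-sample ScriptProcessor buffer at 44100 Hz downsamples to
// about 4096 / (44100 / 16000) ≈ 1486 samples, i.e. 2972 bytes of Int16 PCM
var demo = downsampleBuffer(new Float32Array(4096), 44100, 16000);
console.log(demo.byteLength); // 2972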

And my index.html file:

<!DOCTYPE html>
<html>
  <head>
    <script src='jquery-1.8.3.js'></script>
    <script src='client.js'></script>
  </head>

  <body>
    <div class='message'>Welcome!</div>
    <button onclick='startRecording()'>Start recording</button>
    <button onclick='stopRecording()'>Stop recording</button>
    <br/>
    <div>WebSocket: <span id="webSocketStatus">Not Connected</span></div>
  </body>
</html>

You can test it with most of the Autobahn python echo servers, which you can find on Crossbario's GitHub. The startRecording() and stopRecording() functions can also be called from variables in Storyline or H5P, in case anyone wants to use this for speech recognition in ed tech (like me).
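If you prefer to stay in JavaScript for testing, a minimal echo server sketch using the Node.js ws package (the package choice and the port are assumptions, not part of the setup above) looks like this:

// echo-server.js -- npm install ws, then: node echo-server.js
const WebSocket = require('ws');
const wss = new WebSocket.Server({ port: 9001 });

wss.on('connection', function (ws) {
  ws.on('message', function (data) {
    ws.send(data); // echo each binary audio chunk straight back to the client
  });
});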