How to use the AWS Transcribe JavaScript SDK

I'm trying to use @aws-sdk/client-transcribe-streaming in an Angular project, but with no luck.

The code below is the only example AWS provides:

// ES6+ example
import {
  TranscribeStreamingClient,
  StartStreamTranscriptionCommand,
} from "@aws-sdk/client-transcribe-streaming";

// a client can be shared by different commands.
const client = new TranscribeStreamingClient({ region: "REGION" });

const params = {
  /** input parameters */
};
const command = new StartStreamTranscriptionCommand(params);

As the SDK documentation states, the StartStreamTranscriptionCommand object expects the params parameter to be of type StartStreamTranscriptionCommandInput.

The StartStreamTranscriptionCommandInput object has an AudioStream field of type AsyncIterable<AudioStream>, which I assume is the audio stream that will be sent to AWS Transcribe.
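
From the typings, I gather the params would look something like this, where audioStream is the AsyncIterable<AudioStream> I don't know how to create:

const params = {
    LanguageCode: "en-US",
    MediaEncoding: "pcm",
    MediaSampleRateHertz: 16000,
    AudioStream: audioStream, // AsyncIterable<AudioStream> -- how do I build this?
};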

The problem is that I have no idea how to create this AudioStream object. The only hint the documentation gives us is that it is a "PCM-encoded stream of audio blobs. The audio stream is encoded as HTTP2 data frames."

Any help on how to create an AsyncIterable<AudioStream> would be greatly appreciated.

I can't answer your question directly, but I can perhaps offer some useful advice. I managed to get Transcribe websocket audio streaming working on a website I built recently. I used VueJS, but the process should be very similar. Instead of using the AWS Transcribe JavaScript SDK, I based my code on the information from an AWS blog post, using the Github link they provided.

Both of these resources were crucial to getting it working. If you clone the git repository and run the code, if I remember correctly, you should have a working example. To this day I don't fully understand how the code works, because I don't know much about audio, but it works.

I ended up modifying the Github code and implementing it in a few JS files, which I then added to my own code. Then I had to compute some AWS Signature V4 stuff so I could send it to the Transcribe API, which then returns a websocket link I can open with JS. The data sent to the Transcribe websocket comes from the connected microphone, which can be obtained with MediaDevices.getUserMedia(). The Github code mentioned above contains the files needed to convert the microphone audio into what Transcribe expects, since it only accepts sample rates of 8000 and 16000, depending on the language you choose.

It was hard to make sense of the Transcribe documentation and find all the pieces I had to put together, since streaming to Transcribe seems to be a bit of an edge case, but I hope the resources I mentioned will make it a little easier for you.

Edit: Adding source code

Getting the Transcribe websocket link

I have this set up in an AWS Lambda function running Node, but you can copy everything inside exports.handler into a plain JS file. You will need the crypto-js, aws-sdk and moment node modules!

//THIS SCRIPT IS BASED ON https://docs.aws.amazon.com/transcribe/latest/dg/websocket.html
const crypto = require('crypto-js');
const moment = require('moment');
const aws = require('aws-sdk');
const awsRegion = '!YOUR-REGION!'
const accessKey = '!YOUR-IAM-ACCESS-KEY!';
const secretAccessKey = '!YOUR-IAM-SECRET-KEY!';

exports.handler = async (event) => {
    console.log(event);
    
    // let body = JSON.parse(event.body); I made a body object below for you to test with
    let body = {
        languageCode: "en-US", //or en-GB etc. I found en-US works better, even for British people due to the higher sample rate, which makes the audio clearer.
        sampleRate: 16000
    }
    
    let method = "GET"
    let region = awsRegion;
    let endpoint = "wss://transcribestreaming." + region + ".amazonaws.com:8443"
    let host = "transcribestreaming." + region + ".amazonaws.com:8443"
    let amz_date = moment.utc().format('YYYYMMDD[T]HHmmss') + 'Z'; // SigV4 signing times must be in UTC
    let datestamp = moment.utc().format('YYYYMMDD');
    let service = 'transcribe';
    let linkExpirationSeconds = 60;
    let languageCode = body.languageCode;
    let sampleRate = body.sampleRate
    let canonical_uri = "/stream-transcription-websocket"
    let canonical_headers = "host:" + host + "\n"
    let signed_headers = "host" 
    let algorithm = "AWS4-HMAC-SHA256"
    let credential_scope = datestamp + "%2F" + region + "%2F" + service + "%2F" + "aws4_request"
    // Date and time of request - NOT url formatted
    let credential_scope2 = datestamp + "/" + region + "/" + service + "/" + "aws4_request"
  
    
    let canonical_querystring  = "X-Amz-Algorithm=" + algorithm
    canonical_querystring += "&X-Amz-Credential="+ accessKey + "%2F" + credential_scope
    canonical_querystring += "&X-Amz-Date=" + amz_date 
    canonical_querystring += "&X-Amz-Expires=" + linkExpirationSeconds
    canonical_querystring += "&X-Amz-SignedHeaders=" + signed_headers
    canonical_querystring += "&language-code=" + languageCode + "&media-encoding=pcm&sample-rate=" + sampleRate
    
    //Empty hash as payload is unknown
    let emptyHash = crypto.SHA256("");
    let payload_hash = crypto.enc.Hex.stringify(emptyHash);
    
    let canonical_request = method + '\n' 
    + canonical_uri + '\n' 
    + canonical_querystring + '\n' 
    + canonical_headers + '\n' 
    + signed_headers + '\n' 
    + payload_hash
    
    let hashedCanonicalRequest = crypto.SHA256(canonical_request);
    
    let string_to_sign = algorithm + "\n"
    + amz_date + "\n"
    + credential_scope2 + "\n"
    + crypto.enc.Hex.stringify(hashedCanonicalRequest);
    
    //Create the signing key
    let signing_key = getSignatureKey(secretAccessKey, datestamp, region, service);
    
    //Sign the string_to_sign using the signing key
    let inBytes = crypto.HmacSHA256(string_to_sign, signing_key);
    
    let signature = crypto.enc.Hex.stringify(inBytes);
    
    canonical_querystring += "&X-Amz-Signature=" + signature;
    
    let request_url = endpoint + canonical_uri + "?" + canonical_querystring;
    
    //The final product
    console.log(request_url);
    
    let response = {
        statusCode: 200,
        headers: {
          "Access-Control-Allow-Origin": "*"  
        },
        body: JSON.stringify(request_url)
    };
    return response;    
};

function getSignatureKey(key, dateStamp, regionName, serviceName) {
    var kDate = crypto.HmacSHA256(dateStamp, "AWS4" + key);
    var kRegion = crypto.HmacSHA256(regionName, kDate);
    var kService = crypto.HmacSHA256(serviceName, kRegion);
    var kSigning = crypto.HmacSHA256("aws4_request", kService);
    return kSigning;
};
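
If you want to try the signing code outside of Lambda, you should be able to require the file and invoke the handler directly, something like this (getTranscribeUrl.js is just an assumed filename for wherever you saved the code above):

// Quick local test in Node -- assumes the Lambda code above is saved as getTranscribeUrl.js
const { handler } = require('./getTranscribeUrl');

handler({}).then(response => {
    // body contains the presigned websocket URL as a JSON string
    console.log(JSON.parse(response.body));
});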

Code for opening the websocket, sending the audio, and receiving responses

Install these npm modules: microphone-stream (not sure if it's still available, but it's in the source code of the Github repo; I think I just pasted it into my node_modules folder), @aws-sdk/util-utf8-node, and @aws-sdk/eventstream-marshaller.

import audioUtils from "../js/audioUtils.js"; //For encoding audio data as PCM
import mic from "microphone-stream"; //Collect microphone input as a stream of raw bytes
import * as util_utf8_node from "@aws-sdk/util-utf8-node"; //Utilities for encoding and decoding UTF8
import * as marshaller from "@aws-sdk/eventstream-marshaller"; //For converting binary event stream messages to and from JSON

let micStream;
let mediaStream;
let inputSampleRate; // The sample rate your mic is producing
let transcribeSampleRate = 16000 //The sample rate you requested from Transcribe
let transcribeLanguageCode = "en-US"; //The language you want Transcribe to use
let websocket;

// first we get the microphone input from the browser (as a promise)...
// (note: the await/return below assume this code runs inside an async function)
try {
    mediaStream = await window.navigator.mediaDevices.getUserMedia({
            video: false,
            audio: true
        })
}
catch (error) {
    console.log(error);
    alert("Error. Please make sure you allow this website to access your microphone");
    return;
}



const eventStreamMarshaller = new marshaller.EventStreamMarshaller(util_utf8_node.toUtf8, util_utf8_node.fromUtf8);

// let's get the mic input from the browser, via the microphone-stream module
micStream = new mic();

micStream.on("format", data => {
    inputSampleRate = data.sampleRate;
});

micStream.setStream(mediaStream);


//THIS IS WHERE YOU NEED TO GET YOURSELF A LINK FROM TRANSCRIBE
//AS MENTIONED I USED AWS LAMBDA FOR THIS
//LOOK AT THE ABOVE CODE FOR GETTING A TRANSCRIBE LINK

getTranscribeLink(transcribeLanguageCode, transcribeSampleRate) // Not a real function, you need to make this! The options are what would be in the body object in AWS Lambda

let url = "!YOUR-GENERATED-URL!"


//Configure your websocket
websocket = new WebSocket(url);
websocket.binaryType = "arraybuffer";

websocket.onopen = () => {
    //Make the spinner disappear
    micStream.on('data', rawAudioChunk => {
        // the audio stream is raw audio bytes. Transcribe expects PCM with additional metadata, encoded as binary
        let binary = convertAudioToBinaryMessage(rawAudioChunk);

        if (websocket.readyState === websocket.OPEN)
            websocket.send(binary);
    });
};

// handle messages, errors, and close events
websocket.onmessage = async message => {

    //convert the binary event stream message to JSON
    var messageWrapper = eventStreamMarshaller.unmarshall(Buffer.from(message.data));

    var messageBody = JSON.parse(String.fromCharCode.apply(String, messageWrapper.body)); 
    
    //THIS IS WHERE YOU DO SOMETHING WITH WHAT YOU GET FROM TRANSCRIBE
    console.log("Got something from Transcribe!:");
    console.log(messageBody);
}
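
In case it helps, the parsed messageBody has a Transcript.Results structure, so getting at the actual text looks roughly like this (my sketch; check IsPartial if you only care about finalized segments):

// Rough sketch of pulling transcript text out of messageBody
let results = messageBody.Transcript ? messageBody.Transcript.Results : [];
results.forEach(result => {
    if (result.Alternatives && result.Alternatives.length > 0) {
        // IsPartial is true while Transcribe is still refining this segment
        let prefix = result.IsPartial ? "[partial] " : "[final] ";
        console.log(prefix + result.Alternatives[0].Transcript);
    }
});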






// FUNCTIONS

function convertAudioToBinaryMessage(audioChunk) {
    var raw = mic.toRaw(audioChunk);
    if (raw == null) return;

    // downsample and convert the raw audio bytes to PCM
    var downsampledBuffer = audioUtils.downsampleBuffer(raw, inputSampleRate, transcribeSampleRate);
    var pcmEncodedBuffer = audioUtils.pcmEncode(downsampledBuffer);

    // add the right JSON headers and structure to the message
    var audioEventMessage = getAudioEventMessage(Buffer.from(pcmEncodedBuffer));

    // convert the JSON object + headers into a binary event stream message
    var binary = eventStreamMarshaller.marshall(audioEventMessage);
    return binary;
}

function getAudioEventMessage(buffer) {
    // wrap the audio data in a JSON envelope
    return {
        headers: {
            ':message-type': {
                type: 'string',
                value: 'event'
            },
            ':event-type': {
                type: 'string',
                value: 'AudioEvent'
            }
        },
        body: buffer
    };
}

audioUtils.js

export default {
    pcmEncode: pcmEncode,
    downsampleBuffer: downsampleBuffer
}

export function pcmEncode(input) {
    var offset = 0;
    var buffer = new ArrayBuffer(input.length * 2);
    var view = new DataView(buffer);
    for (var i = 0; i < input.length; i++, offset += 2) {
        var s = Math.max(-1, Math.min(1, input[i]));
        view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
    }
    return buffer;
}

export function downsampleBuffer(buffer, inputSampleRate = 44100, outputSampleRate = 16000) {
        
    if (outputSampleRate === inputSampleRate) {
        return buffer;
    }

    var sampleRateRatio = inputSampleRate / outputSampleRate;
    var newLength = Math.round(buffer.length / sampleRateRatio);
    var result = new Float32Array(newLength);
    var offsetResult = 0;
    var offsetBuffer = 0;
    
    while (offsetResult < result.length) {

        var nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);

        var accum = 0,
        count = 0;
        
        for (var i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++ ) {
            accum += buffer[i];
            count++;
        }

        result[offsetResult] = accum / count;
        offsetResult++;
        offsetBuffer = nextOffsetBuffer;

    }

    return result;

}

I think that's about it. It should certainly be enough to get you up and running.

It turns out that, for some reason, they removed the only instructions on how to get this AsyncIterable<AudioStream> from the readme. By searching through their GitHub issues, someone pointed me to this old version of the readme from an old commit. That version contains some examples of how to create this object.
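
For future readers, the examples there boil down to passing an async generator that wraps each chunk of PCM audio in an AudioEvent. A minimal sketch based on that old readme pattern (getAudioChunks is a placeholder for wherever your PCM Uint8Array chunks come from):

const audioStream = async function* () {
    for await (const chunk of getAudioChunks()) { // hypothetical source of PCM chunks
        yield { AudioEvent: { AudioChunk: chunk } }; // chunk is a Uint8Array of PCM bytes
    }
};

const command = new StartStreamTranscriptionCommand({
    LanguageCode: "en-US",
    MediaEncoding: "pcm",
    MediaSampleRateHertz: 16000,
    AudioStream: audioStream(),
});

const response = await client.send(command);

// the response exposes the transcripts as an async iterable as well
for await (const event of response.TranscriptResultStream) {
    if (event.TranscriptEvent) {
        console.log(event.TranscriptEvent.Transcript.Results);
    }
}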