A problem with sound producing: 如何用傅立叶系数发声

A problem with sound producing: How to make sound with Fourier coefficients

我正在尝试使用傅立叶系数创建声音。

首先请让我展示一下我是如何得到傅里叶系数的。

(1) 我从麦克风声音中截取了波形。

数据如下所示(我把 getByteTimeDomainData() 填充得到的 Uint8Array 字符串化,并添加了 length 属性,以便稍后把这个对象转换成数组):

const raw = '{"length": 512,"0":126,"1":121,"2":121,"3":124,"4":129,"5":135,"6":140,"7":147,"8":153,"9":156,"10":152,"11":141,"12":125,"13":112,"14":106,"15":108,"16":113,"17":120,"18":127,"19":132,"20":138,"21":142,"22":141,"23":136,"24":126,"25":115,"26":106,"27":103,"28":105,"29":111,"30":117,"31":121,"32":123,"33":124,"34":124,"35":120,"36":112,"37":103,"38":97,"39":95,"40":96,"41":98,"42":101,"43":106,"44":112,"45":117,"46":117,"47":113,"48":105,"49":98,"50":93,"51":91,"52":91,"53":92,"54":93,"55":95,"56":97,"57":101,"58":105,"59":108,"60":106,"61":101,"62":96,"63":95,"64":97,"65":100,"66":100,"67":97,"68":94,"69":94,"70":99,"71":104,"72":106,"73":105,"74":104,"75":105,"76":108,"77":111,"78":112,"79":110,"80":108,"81":105,"82":105,"83":107,"84":110,"85":113,"86":114,"87":115,"88":116,"89":120,"90":123,"91":125,"92":124,"93":121,"94":120,"95":121,"96":123,"97":124,"98":124,"99":126,"100":128,"101":131,"102":133,"103":134,"104":134,"105":134,"106":134,"107":134,"108":134,"109":133,"110":132,"111":131,"112":131,"113":134,"114":137,"115":139,"116":141,"117":142,"118":143,"119":142,"120":142,"121":139,"122":136,"123":131,"124":128,"125":128,"126":131,"127":134,"128":137,"129":139,"130":140,"131":141,"132":142,"133":141,"134":137,"135":132,"136":126,"137":122,"138":123,"139":127,"140":132,"141":135,"142":135,"143":134,"144":134,"145":135,"146":134,"147":130,"148":125,"149":121,"150":120,"151":121,"152":124,"153":129,"154":132,"155":134,"156":134,"157":133,"158":131,"159":129,"160":128,"161":127,"162":125,"163":124,"164":123,"165":124,"166":125,"167":128,"168":130,"169":131,"170":132,"171":132,"172":131,"173":129,"174":129,"175":129,"176":130,"177":129,"178":129,"179":128,"180":129,"181":132,"182":134,"183":135,"184":134,"185":133,"186":131,"187":131,"188":131,"189":132,"190":134,"191":134,"192":134,"193":134,"194":137,"195":140,"196":142,"197":142,"198":141,"199":138,"200":136,"201":135,"202":137,"203":138,"204":137,"205":135,"206":134,"207":137,"208":142,"209":147,"
210":148,"211":147,"212":146,"213":144,"214":144,"215":144,"216":144,"217":142,"218":138,"219":136,"220":137,"221":141,"222":145,"223":149,"224":150,"225":150,"226":150,"227":150,"228":150,"229":148,"230":145,"231":142,"232":142,"233":144,"234":146,"235":146,"236":146,"237":147,"238":150,"239":153,"240":153,"241":149,"242":145,"243":143,"244":141,"245":141,"246":142,"247":143,"248":143,"249":142,"250":144,"251":148,"252":153,"253":152,"254":142,"255":130,"256":123,"257":123,"258":127,"259":130,"260":132,"261":134,"262":139,"263":147,"264":154,"265":155,"266":148,"267":134,"268":119,"269":108,"270":106,"271":110,"272":115,"273":119,"274":124,"275":129,"276":136,"277":141,"278":141,"279":135,"280":125,"281":115,"282":108,"283":105,"284":105,"285":108,"286":111,"287":115,"288":119,"289":122,"290":121,"291":116,"292":110,"293":106,"294":104,"295":101,"296":98,"297":96,"298":98,"299":103,"300":110,"301":115,"302":116,"303":112,"304":104,"305":98,"306":95,"307":95,"308":94,"309":91,"310":88,"311":88,"312":94,"313":101,"314":107,"315":110,"316":107,"317":103,"318":100,"319":99,"320":99,"321":98,"322":95,"323":89,"324":87,"325":89,"326":96,"327":103,"328":107,"329":109,"330":110,"331":111,"332":113,"333":113,"334":110,"335":105,"336":102,"337":102,"338":104,"339":105,"340":107,"341":110,"342":115,"343":120,"344":123,"345":123,"346":122,"347":120,"348":120,"349":121,"350":123,"351":124,"352":123,"353":122,"354":122,"355":126,"356":133,"357":137,"358":136,"359":132,"360":128,"361":129,"362":134,"363":139,"364":139,"365":135,"366":131,"367":131,"368":135,"369":141,"370":144,"371":143,"372":140,"373":138,"374":138,"375":140,"376":142,"377":140,"378":136,"379":131,"380":130,"381":133,"382":138,"383":141,"384":141,"385":140,"386":140,"387":140,"388":139,"389":136,"390":132,"391":129,"392":128,"393":128,"394":129,"395":131,"396":133,"397":135,"398":136,"399":136,"400":135,"401":132,"402":129,"403":125,"404":123,"405":123,"406":125,"407":126,"408":126,"409":126,"410":128,"411":131,
"412":133,"413":133,"414":130,"415":127,"416":125,"417":125,"418":125,"419":125,"420":125,"421":125,"422":125,"423":126,"424":129,"425":131,"426":132,"427":131,"428":128,"429":126,"430":126,"431":128,"432":129,"433":130,"434":130,"435":130,"436":132,"437":134,"438":136,"439":135,"440":133,"441":131,"442":129,"443":128,"444":129,"445":130,"446":132,"447":134,"448":136,"449":138,"450":140,"451":142,"452":143,"453":142,"454":140,"455":137,"456":135,"457":134,"458":134,"459":134,"460":134,"461":135,"462":137,"463":139,"464":143,"465":147,"466":148,"467":147,"468":146,"469":145,"470":144,"471":141,"472":139,"473":137,"474":136,"475":137,"476":139,"477":142,"478":145,"479":149,"480":150,"481":151,"482":152,"483":152,"484":151,"485":146,"486":141,"487":138,"488":140,"489":145,"490":147,"491":146,"492":145,"493":147,"494":152,"495":157,"496":156,"497":151,"498":145,"499":140,"500":137,"501":139,"502":143,"503":147,"504":147,"505":144,"506":143,"507":146,"508":152,"509":152,"510":143,"511":129}';

※如果我们把这个数据画到canvas,我们可以看到如下:(这是元音'i'的声音(听起来像'ee'))

图中似乎捕获了 2 个周期的波形。由于长度为 512,我们可以猜测一个周期的数据位于索引 0 ~ 255。

(2) 我处理了数据。

// Rebuild a real array from the stringified Uint8Array; the "length" key
// makes the parsed object array-like, so Array.from can walk its indices.
const parsed = JSON.parse(raw);
const arrayfied = Array.from(parsed);
// Keep only the first 256 samples.
const sliced = arrayfied.slice(0, 256);

// The Web Audio API specification defines the byte form of time-domain data
// as b[k] = ⌊128(1 + x[k])⌋, where x[k] is the time-domain sample.
// Undo that mapping (the floor cannot be undone) to get x[k] ≈ b[k] / 128 - 1.
const refined = sliced.map((byteValue) => (byteValue / 128) - 1);

(3)我计算了傅里叶系数。

// Sums fn over the integer sample positions initial..final (both inclusive),
// treating every sample as a rectangle of width 1.
//
// fn: integrand, called as fn(t, initial, final, nth)
// initial: first sample position (inclusive)
// final: last sample position (inclusive)
// division: accepted so existing calls keep working; it does not affect the
//           result (see the note below)
// nth: forwarded unchanged to fn (used for an, bn)
//
// Note: the loop advances one sample at a time, so each rectangle is exactly
// one unit wide. Weighting each sample by (final - initial) / division would
// scale the whole sum by that ratio, because the sample spacing does not
// depend on division.
function numerical_integration(fn, initial, final, division, nth = null) {
  let accumulation = 0;

  for (let t = initial; t <= final; t++) {
    // one unit-wide rectangle per sample
    accumulation += fn(t, initial, final, nth);
  }

  return accumulation;
}

// f(t): the normalized waveform sample stored at index t of `refined`.
function f0(t) {
  return refined[t];
}

// f(t) * cos(n * w * t): the integrand for the cosine coefficient an.
// ※ w = 2 * Math.PI / period
//
// t: sample position
// i: first sample position of the period (inclusive)
// f: last sample position of the period (inclusive)
// n: harmonic number
//
// i and f are both inclusive, so the period spans f - i + 1 samples.
// Dividing by f - i instead would make the fundamental one sample too short.
function fc(t, i, f, n) {
  const period = f - i + 1;

  return f0(t) * Math.cos(n * 2 * Math.PI * t / period);
}

// f(t) * sin(n * w * t): the integrand for the sine coefficient bn.
// ※ w = 2 * Math.PI / period
//
// t: sample position
// i: first sample position of the period (inclusive)
// f: last sample position of the period (inclusive)
// n: harmonic number
//
// i and f are both inclusive, so the period spans f - i + 1 samples.
// Dividing by f - i instead would make the fundamental one sample too short.
function fs(t, i, f, n) {
  const period = f - i + 1;

  return f0(t) * Math.sin(n * 2 * Math.PI * t / period);
}

// Returns a0: numerical_integration of f0 over sample positions
// 0 .. period - 1, divided by period.
// For a period of 256 the samples sit at indices 0 ~ 255, which is why the
// upper bound is period - 1.
function getA0(period) {
  const lastIndex = period - 1;
  const total = numerical_integration(f0, 0, lastIndex, 100);

  return total / period;
}

// Returns the 49 cosine coefficients a1 .. a49.
// Entry k of the returned array holds the coefficient for harmonic k + 1:
// numerical_integration of fc over 0 .. period - 1, times 2 / period.
function getAn(period) {
  return Array.from(
    { length: 49 },
    (_, k) => numerical_integration(fc, 0, period - 1, 100, k + 1) * 2 / period,
  );
}

// Returns the 49 sine coefficients b1 .. b49.
// Entry k of the returned array holds the coefficient for harmonic k + 1:
// numerical_integration of fs over 0 .. period - 1, times 2 / period.
function getBn(period) {
  return Array.from(
    { length: 49 },
    (_, k) => numerical_integration(fs, 0, period - 1, 100, k + 1) * 2 / period,
  );
}

到目前为止一切顺利!现在我们可以用这些系数构造波函数,并把它绘制到 canvas 上,以检查我们的傅里叶系数是否计算正确!
// Compute the three coefficient sets, each with the full length of `refined`
// as the period: a0 first, then the an list, then the bn list.
const [a0, an, bn] = [getA0, getAn, getBn].map((compute) => compute(refined.length));

// Evaluates the Fourier series at position t and returns the y coordinate:
// a0 plus 49 cosine terms (weights from an) plus 49 sine terms (weights
// from bn), with refined.length as the period.
function getY(t) {
  let cosineTotal = 0;
  let sineTotal = 0;

  for (let harmonic = 1; harmonic <= 49; harmonic++) {
    // an[0] / bn[0] are paired with harmonic 1, so the index is harmonic - 1.
    const index = harmonic - 1;

    cosineTotal += an[index] * Math.cos(harmonic * 2 * Math.PI * t / refined.length);
    sineTotal += bn[index] * Math.sin(harmonic * 2 * Math.PI * t / refined.length);
  }

  return a0 + cosineTotal + sineTotal;
}

// draw: pass the point (x, getY(t)) to canvasContext.lineTo.
// NOTE(review): x, t and canvasContext are not defined in this snippet;
// presumably they come from a surrounding drawing loop — confirm.
canvasContext.lineTo(x, getY(t));

哇!做得很好!和原波几乎一模一样!


然后你可能会问“那么,你的问题是什么?”因此,我要问我的问题:如何使用傅里叶系数再现声音?(我对网络音频API和数字声音了解不多)

我想到的是三件事:

我尝试了 AudioWorklet,但它听起来像是饱和的(?)A4(可能)带有噼啪声 'tick tick'。 AudioWorklet代码如下:

// AudioWorklet processor that writes getY(t) into every channel of its first
// output, where t is a running sample index.
//
// NOTE(review): getY is not defined in this snippet; it has to be available
// inside the worklet's own global scope for process() to run — confirm.
class IWaveProducer extends AudioWorkletProcessor {
  constructor() {
    super();

    // Index of the next sample to synthesize; carried across process() calls.
    this.t = 0;
  }

  // Fills one render block of the first output.
  // t must advance once per sample frame: advancing it only once per
  // process() call writes the same value into every sample of the block,
  // which produces a staircase signal instead of the waveform.
  process(inputs, outputs, parameters) {
    const output = outputs[0];
    const frameCount = output.length > 0 ? output[0].length : 0;

    for (let frame = 0; frame < frameCount; frame++) {
      // Evaluate once per frame so all channels carry the identical sample.
      const sample = getY(this.t + frame);

      for (const channel of output) {
        channel[frame] = sample;
      }
    }

    this.t += frameCount;

    // Returning true asks the audio engine to keep calling process().
    return true;
  }
}

registerProcessor('i-wave-producer', IWaveProducer);

这是图表:

所以这次我尝试了 PeriodicWave 和 OscillatorNode 但也失败了。代码如下:

// Build a PeriodicWave from the Fourier coefficients and play it through an
// oscillator. Index 0 of real/imag is the DC term; index k is harmonic k.
const real = new Float32Array(50);
const imag = new Float32Array(50);

real[0] = a0;
imag[0] = 0;

// an[0] / bn[0] hold the fundamental (harmonic 1), so harmonic k is stored at
// an[k - 1] / bn[k - 1]. Start at harmonic 1 so the fundamental is not skipped.
for (let harmonic = 1; harmonic < real.length; harmonic++) {
  real[harmonic] = an[harmonic - 1];
  imag[harmonic] = bn[harmonic - 1];
}

const wave = new PeriodicWave(audioCtx, { real, imag, disableNormalization: false });
// No frequency option is given here, so the oscillator plays at its default
// frequency rather than at the pitch of the recorded wave.
const osc = new OscillatorNode(audioCtx, { periodicWave: wave });

osc.connect(analyser)
   .connect(audioCtx.destination);
osc.start();

这是图表:

听起来像 A4 的锯齿波(可能)。另外,有趣的是,数据似乎都被正确地放进去了,因为波形的形状与上图非常相似(请参见 'waveform approximated' 图)。(其形状:一座高峰后面跟着一座小峰)

...但这完全不是我想要的!我想要的是重现一个元音 'i' 的声音!我怎样才能实现我的目标?如果您知道什么,请告诉我。这将不胜感激。我很好奇。请帮助我ㅠㅠ。非常感谢您阅读这么长的问题。

难道用 Web Audio API 制作 'voice' 是不可能的吗?但是我以前见过用 JavaScript 发声的库。例如:


大家好!我想刚刚找到了答案!答案是……「AudioBuffer」。我真的要哭了……很高兴……不管怎样,这是代码!!!

// The captured wave is 256 samples long. My guess (possibly wrong) is that
// it therefore lasts 256 / 44100 seconds (= 0.0058).
// Multiplying by 1000 makes the sound last about 5.8 seconds.
// (With a sampling rate of 44100 per second, the formula below gives a
// buffer length of 256000.)
const audioBuffer = new AudioBuffer({
  numberOfChannels: 1,
  length: 1000 * audioCtx.sampleRate * 256 / 44100,
  sampleRate: audioCtx.sampleRate,
});

const buffering = audioBuffer.getChannelData(0);
let count = 0;

// Tile the 256-sample period across the whole buffer. count is the read
// position inside `refined` and wraps back to 0 after index 255.
for (let i = 0; i < audioBuffer.length; i++) {
  buffering[i] = refined[count];
  count = (count === 255) ? 0 : count + 1;
}

const source = new AudioBufferSourceNode(audioCtx, { buffer: audioBuffer });

source.connect(analyser)
      .connect(audioCtx.destination);

source.start();

产生的声音有点滑稽!但我认为这听起来确实像 'i (ee)'。它也听起来像 'fa' 音符。这是为什么?让我们一起考虑一下。首先我认为我们需要计算波的频率。由于波的一个周期为0.0058秒,因此频率为1 / 0.0058,即172.4138赫兹。

接下来,A4 为 440 Hz,因此 A3 为 220 Hz。从 A3 往下数四个半音就是 F3(A3、G#3、G3、F#3、F3)。那么 F3 的频率就是 220 * 2^(-4/12) = 174.6141 Hz。

172和174差不多。!!!!!这绝对有道理!这个秘密现在已经解开了。这就是为什么听起来像 fa.

感谢您阅读我与 Web Audio API 之间艰难却又美好的战斗故事。再见!


大家好!我刚刚发现 PeriodicWave 和 OscillatorNode 也可以是答案!

// Create the oscillator with an explicit frequency of 174 Hz, which is close
// to the pitch of the recorded wave (one 256-sample period at 44100 samples
// per second is about 172.4 Hz).
const osc = new OscillatorNode(audioCtx, { periodicWave: wave, frequency: 174 });

设置频率参数是关键!再见!

那么就只剩下AudioWorklet了。它也可以是答案吗?这让我很好奇。

在 golang 中,我有一个数组 ARR1,它代表一个时间序列(可以是音频,在我的例子中是图像),这个时域数组的每个元素都是一个浮点值,代表原始音频曲线摆动的高度……然后我把这个浮点数组送入 FFT 调用,它按定义返回频域中的新数组 ARR2,其中每个元素都是一个复数,实部和虚部都是浮点数……当我把这个数组送入逆 FFT 调用(IFFT)时,它返回时域中的浮点数组 ARR3……在一阶近似下 ARR3 与 ARR1 一致……不用说,如果我再把 ARR3 送入 FFT 调用,其输出 ARR4 将与 ARR2 一致……本质上就是:time_domain_array --> FFT 调用 -> frequency_domain_array --> InverseFFT 调用 -> time_domain_array……如此循环往复 N 次

我知道 Web Audio API 有一个 FFT 调用...不知道它是否有一个 IFFT api 调用但是如果没有 IFFT(反向 FFT)你可以写你自己的这样的函数这是如何......遍历 ARR2 并为每个元素计算该频率的大小(ARR2 的每个元素代表一个频率,在文献中你会看到 ARR2 被称为频率仓,这仅仅意味着数组的每个元素都持有一个复数,当您遍历数组时,每个连续的元素代表一个不同的频率,从元素 0 开始存储频率 0,每个后续数组元素将代表通过将 incr_freq 添加到先前数组元素的频率定义的频率)

ARR2 的每个索引代表一个频率,其中元素 0 是直流偏置,即输入曲线 ARR1 相对零点的偏移;如果曲线以零交叉点为中心,这个值通常为零,元素 0 可以忽略……ARR2 相邻元素之间的频率差是一个恒定的频率增量,可以按如下方式计算:
incr_freq := sample_rate / number_of_samples  //  with a sample_rate of 44100 samples per second and one second's worth of samples ( 44100 ),
                                              //  this gives a frequency increment (resolution) of 1 Hertz ... i.e. each freq bin is 1 Hertz apart

对于 ARR2 的给定元素,您需要按如下方式计算其幅度和相位:
curr_real = real(curr_complex) // pluck out the real part of the complex number
curr_imag = imag(curr_complex) // ditto for the imaginary part

curr_mag = 2.0 * math.Sqrt(curr_real*curr_real+curr_imag*curr_imag) / number_of_samples  // magnitude of this freq

curr_theta = math.Atan2(curr_imag, curr_real)  //  phase of this frequency

此时您正在遍历 ARR2,并在循环中为 ARR2 的每个元素进行上述幅度和相位计算……既然已经知道了幅度和相位,现在只需为每个频率合成一条正弦曲线……请记住,遍历 ARR2 时,频率每次按上述 incr_freq 递增……然后把这些新合成的正弦曲线全部叠加成一条输出曲线,存放到新的 ARR3 中,它将与您的原始来源 ARR1 相匹配

玩得开心,祝你好运!!!

更新:window(窗口)的概念指的是一组音频样本……假设您有一个 WAV 格式的输入文件,打开它并遍历其原始音频曲线……如果这个 WAV 文件包含一首 10 分钟的歌曲,而您把整段音频送入一次 FFT 调用,那么结果代表的是整个文件;但如果您把整个文件的音频样本(它们只是原始音频曲线上的点)切分成若干个 window,每个 window 只有比如 1024 个音频样本,并把每个 window 的样本分别送入一次 FFT 调用,那么每次 FFT 调用的输出只对应歌曲的那一部分……当您用 Audacity 之类的工具播放歌曲时,可以看到它的实时 FFT 频谱,该频谱显示的是当前 window 的结果,因为 Audacity 每隔 1024 个左右的音频样本就把输入音频切成一个新的 window