Spectrogram generation in java using FFT on a .wav file not producing expected output
I'm working on an AI project that classifies speech as "up", "down", "left", "right", or background noise, and uses the result to move a character in a video game.
I derived an FFT algorithm from a mathematical explanation, and I believe it is correct because I have tested its output against the results from this site (https://engineering.icalculator.info/discrete-fourier-transform-calculator.html).
I then tried to generate a spectrogram, using code based on the main method of the App class from this site ()
I tested my code on a .wav file of me saying hello, and the resulting spectrogram is not what I expected. See below for the difference between the spectrogram produced by my Java code and the one produced by my Python code (ignore the colour difference):
Java Spectrogram
Python Spectrogram
New Java Spectrogram with SleuthEyes help
Here is the code I originally used/wrote:
package Whosebug;
import com.company.Complex;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Scanner;
public class Whosebug {
private static Color getColour(double power) {
var H = power * 0.4;
var S = 1.0;
var B = 1.0;
return Color.getHSBColor((float) H, (float) S, (float) B);
}
private static double[] getAudioData(String filePath) {
var path = Paths.get(filePath);
try {
var entireFileData = Files.readAllBytes(path);
var rawData = Arrays.copyOfRange(entireFileData, 44, entireFileData.length);
var length = rawData.length;
var newLength = length / 4;
var dataMono = new double[newLength];
double left, right;
for (int i = 0; 2 * i + 3< newLength; i++) {
left = (short) ((rawData[2 * i + 1] & 0xff) << 8) | (rawData[2 * i] & 0xff);
right = (short) ((rawData[2 * i + 3] & 0xff) << 8) | (rawData[2 * i + 2] & 0xff);
dataMono[i] = (left + right) / 2.0;
}
return dataMono;
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
private static Complex[] toComplex(double[] samples) {
var l = samples.length;
var cOut = new Complex[l];
for (int i = 0; i < l; i++) {
cOut[i] = new Complex(samples[i], 0);
}
return cOut;
}
private static double modulusSquared(Complex a) {
var real = a.getReal();
var imaginary = a.getImag();
return (real * real) + (imaginary * imaginary);
}
private static Complex[] fft(Complex[] samples) {
var N = samples.length; // number of samples
if (N == 1) return samples; // stops the recursive splits on the samples
// TODO: M only works for N a power of 2
var M = N / 2; // middle index of the samples
var Xeven = new Complex[M]; // array for even split
var Xodd = new Complex[M]; // array for odd split
// splits the samples
for (int i = 0; i < M; i++) {
Xeven[i] = samples[2 * i];
Xodd[i] = samples[2 * i + 1];
}
// recursive calls on even and odd samples
var Feven = new Complex[M];
Feven = fft(Xeven);
var Fodd = new Complex[M];
Fodd = fft(Xodd);
var frequencyBins = new Complex[N];
for (int i = 0; i < (N / 2); i++) {
var cExponential = Complex.multiply(
Complex.polar(1, -2 * Math.PI * i / N),
Fodd[i]
);
frequencyBins[i] = Complex.add(
Feven[i],
cExponential
);
frequencyBins[i + N / 2] = Complex.sub(
Feven[i],
cExponential
);
}
return frequencyBins;
}
public static void makeSpectrogram() {
var scan = new Scanner(System.in);
System.out.println("Enter file path: ");
var filePath = scan.nextLine();
var rawAudioData = getAudioData(filePath);
assert rawAudioData != null;
var length = rawAudioData.length;
var complexAudioData = toComplex(rawAudioData);
// parameters for FFT
var windowSize = 256;
var overlapFactor = 2;
var windowStep = windowSize / overlapFactor;
// plotData array
var nX = (length - windowSize) / windowStep;
var nY = (windowSize / 2);
var plotData = new double[nX][nY];
// amplitudes to normalise
var maxAmplitude = Double.MIN_VALUE;
var minAmplitude = Double.MAX_VALUE;
double amplitudeSquared;
// application of the FFT
for (int i = 0; i < nX; i++) {
var windowSizeArray = fft(Arrays.copyOfRange(complexAudioData, i * windowStep, i * windowStep + windowSize));
for (int j = 0; j < nY; j++) {
amplitudeSquared = modulusSquared(windowSizeArray[2 * j]);
if (amplitudeSquared == 0.0) {
plotData[i][nY - j - 1] = amplitudeSquared;
} else {
var threshold = 1.0; // prevents log(0)
plotData[i][nY - j - 1] = 10 * Math.log10(Math.max(amplitudeSquared, threshold));
}
// find min and max amplitudes
if (plotData[i][j] > maxAmplitude) {
maxAmplitude = plotData[i][j];
} else if (plotData[i][j] < minAmplitude) {
minAmplitude = plotData[i][j];
}
}
}
// normalisation
var difference = maxAmplitude - minAmplitude;
for (int i = 0; i < nX; i++) {
for (int j = 0; j < nY; j++) {
plotData[i][j] = (plotData[i][j] - minAmplitude) / difference;
}
}
// plot the spectrogram
var spectrogram = new BufferedImage(nX, nY, BufferedImage.TYPE_INT_RGB);
double ratio;
for (int i = 0; i < nX; i++) {
for (int j = 0; j < nY; j++) {
ratio = plotData[i][j];
var colour = getColour(1.0 - ratio);
spectrogram.setRGB(i, j, colour.getRGB());
}
}
// write the image to a file
try {
var outputFile = new File("saved.png");
ImageIO.write(spectrogram, "png", outputFile);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
makeSpectrogram();
}
}
Here is the Complex class used above:
package com.company;
import java.text.DecimalFormat;
public class Complex {
private final static DecimalFormat df2 = new DecimalFormat("#.##");
private double r;
private double i;
public Complex(double r, double i) {
this.r = r;
this.i = i;
}
@Override
public String toString() {
return "(" + df2.format(this.r) + ", " + df2.format(this.i) + "i) ";
}
public double abs() {
return Math.hypot(this.r, this.i);
}
public double getReal() {
return this.r;
}
public double getImag() {
return this.i;
}
public void setReal(double r) {
this.r = r;
}
public void setImag(double i) {
this.i = i;
}
public static Complex polar(double r, double theta) {
return new Complex(
r * Math.cos(theta),
r * Math.sin(theta)
);
}
public static Complex multiply(Complex a, Complex b) {
/*
(a + bi) * (c + di) =
ac + adi + cbi + -bd =
(ac - bd) + (ad + cb)i
*/
var real = (a.r * b.r) - (a.i * b.i);
var imag = (a.r * b.i) + (a.i * b.r);
return new Complex(real, imag);
}
public static Complex add(Complex a, Complex b) {
return new Complex(
a.r + b.r,
a.i + b.i
);
}
public static Complex sub(Complex a, Complex b) {
return new Complex(
a.r - b.r,
a.i - b.i
);
}
}
Any guidance would be greatly appreciated.
Reading the .wav file
The .wav decoding included in that other question you linked is hardly a complete decoder; it caters to that OP's specific stereo, 2-bytes-per-sample use case.
It looks like you ran into additional decoding problems while trying to adapt it to a different use case. As a general recommendation, I would suggest using a more complete .wav decoder that accounts for the number of channels, the number of bytes per sample, and so on.
On the other hand, if you want to write your own decoder (for example as a learning exercise), a slightly more robust implementation could look like the following:
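For example, one option (not used in the question, so treat this as an assumption on my part) is to let the standard javax.sound.sampled API parse the header for you. A minimal sketch for 16-bit little-endian PCM might look like this:
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import java.io.File;
public class WavReader {
// Reads a .wav file and averages all channels down to a mono double[] signal.
// Sketch only: assumes 16-bit signed little-endian PCM, the most common case.
public static double[] readMono(String filePath) throws Exception {
try (AudioInputStream in = AudioSystem.getAudioInputStream(new File(filePath))) {
AudioFormat fmt = in.getFormat();
int channels = fmt.getChannels();
int bytesPerSample = fmt.getSampleSizeInBits() / 8;
if (bytesPerSample != 2 || fmt.isBigEndian()) {
throw new UnsupportedOperationException("Only 16-bit little-endian PCM handled here");
}
byte[] bytes = in.readAllBytes(); // header already consumed by AudioSystem
int frames = bytes.length / (bytesPerSample * channels);
double[] mono = new double[frames];
for (int i = 0; i < frames; i++) {
double sum = 0.0;
for (int c = 0; c < channels; c++) {
int off = (i * channels + c) * 2;
short s = (short) (((bytes[off + 1] & 0xff) << 8) | (bytes[off] & 0xff));
sum += s;
}
mono[i] = sum / channels; // average channels down to mono
}
return mono;
}
}
}
An advantage of going through AudioSystem is that it parses the header (including any extension chunks) for you, so the decode no longer depends on the audio data starting exactly at byte 44.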
public static short getShort(byte[] buffer, int offset) {
// reads a little-endian 16-bit value
return (short) (((buffer[offset + 1] & 0xff) << 8) | (buffer[offset] & 0xff));
}
public static int getNumberOfChannels(byte[] entireFileData) {
return getShort(entireFileData, 22);
}
public static int getBytesPerSample(byte[] entireFileData) {
return getShort(entireFileData, 34) / 8;
}
private static double[] getAudioData(String filePath) {
...
var entireFileData = Files.readAllBytes(path);
var rawData = Arrays.copyOfRange(entireFileData, 44, entireFileData.length);
var length = rawData.length;
int numChannels = getNumberOfChannels(entireFileData);
int bytesPerSample = getBytesPerSample(entireFileData);
int newLength = length / (bytesPerSample * numChannels);
var dataMono = new double[newLength];
if (2 == bytesPerSample) {
for (int i = 0; 2 * numChannels * (i + 1) - 1 < length; i++) {
double sum = 0.0;
for (int j = 0; j < numChannels; j++) {
// little-endian 16-bit sample for channel j of frame i
short sample = (short) (((rawData[2 * numChannels * i + 2 * j + 1] & 0xff) << 8)
| (rawData[2 * numChannels * i + 2 * j] & 0xff));
sum += sample;
}
dataMono[i] = sum / numChannels; // average all channels down to mono
}
} else {
... // handle different numbers of bytes per sample
}
return dataMono;
}
Note that this still only covers 16-bit PCM samples, assumes a fixed header structure (see this tutorial, though the .wav file format is actually more flexible than that), and will trip up on files that contain extension chunks.
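To illustrate what a less header-dependent reader has to do, here is a minimal sketch (my own illustration, not part of the original answer; findDataChunk is a hypothetical helper) that walks the RIFF chunks to locate the "data" chunk instead of hard-coding offset 44:
// Walks the RIFF chunk list of a WAVE file: each chunk is a 4-byte ASCII id,
// a 4-byte little-endian size, then the chunk payload (padded to an even length).
// Returns the byte offset of the "data" chunk payload, or -1 if not found.
private static int findDataChunk(byte[] file) {
int pos = 12; // skip "RIFF", overall size, "WAVE"
while (pos + 8 <= file.length) {
String id = new String(file, pos, 4, java.nio.charset.StandardCharsets.US_ASCII);
int size = (file[pos + 4] & 0xff)
| (file[pos + 5] & 0xff) << 8
| (file[pos + 6] & 0xff) << 16
| (file[pos + 7] & 0xff) << 24;
if (id.equals("data")) {
return pos + 8; // payload starts right after the chunk header
}
pos += 8 + size + (size & 1); // chunks are word-aligned
}
return -1;
}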
Processing the spectrum
The FFT library used in that other question you linked returns a double array, which is interpreted as the interleaved real and imaginary parts of the actual complex values. Accordingly, the indexing used for the magnitude computation works on pairs of elements at indices 2*j and 2*j+1. Your implementation, on the other hand, works with complex values directly, so you should not skip values with the 2* factor, but rather use:
for (int j = 0; j < nY; j++) {
amplitudeSquared = modulusSquared(windowSizeArray[j]);
...
}
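As a side note (my addition, not part of the original answer), with this fix each window of windowSize = 256 outputs gives you bins 0 through windowSize/2 covering 0 Hz up to the Nyquist frequency; the remaining bins mirror these for a real-valued input. If you want to label the vertical axis of the spectrogram, the mapping is simply (binFrequency is a hypothetical helper for illustration):
// Frequency (in Hz) represented by FFT bin j for a window of windowSize samples,
// given the .wav sample rate. Bins 0..windowSize/2 span 0 Hz up to Nyquist.
private static double binFrequency(int j, int windowSize, double sampleRate) {
return j * sampleRate / windowSize;
}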