如何使用 DTW 算法实施修剪策略?
How do I implement a pruning strategy with a DTW algorithm?
我已经尝试为语音识别应用程序实现 DTW 算法,我已经成功地完成了,现在我正在尝试通过修剪来提高 DTW 算法的性能。我尝试寻找对此算法的改进,发现我应该以某种方式计算二维数组 DTW
中特定范围内的值,如 Image #1
所示,但我没有似乎确切地知道该怎么做。有人可以提供任何帮助吗?
包含算法代码 (C#)
/// <summary>
/// Calculates the distance between two audio frames in the form of two MFCCFrame objects as input parameters
/// returns the difference in a double
/// </summary>
double distance(MFCCFrame frame1, MFCCFrame frame2)
{
double tempSum = 0;
for (int i = 0; i < 13; i++)
tempSum += Math.Pow(Math.Abs(frame1.Features[i] - frame2.Features[i]), 2);
return Math.Sqrt(tempSum);
}
/// <summary>
/// DTW Algorithms
/// Takes input 2 sequences: seq1 and seq2 to calculate the shortest distance between them
/// Uses the function "distance" defined above to calculate distance between two frames
/// 2D array -> DTW[,] with dimentions: number of frames in seq 1, number of frames in seq2
/// returns the last element in the 2D array, which is the difference between the two sequences
/// </summary>
double DTWDistance(Sequence seq1, Sequence seq2)
{
int m = seq1.Frames.Length, n = seq2.Frames.Length;
double[,] DTW = new double[m, n];
DTW[0, 0] = 0;
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
double cost = distance(seq1.Frames[i], seq2.Frames[j]);
if (i == 0 && j == 0)
DTW[i, j] = cost;
else if (i == 0)
DTW[i, j] = cost + DTW[i, j - 1];
else if (j == 0)
DTW[i, j] = cost + DTW[i - 1, j];
else
DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], Math.Min(DTW[i, j - 1], DTW[i - 1, j - 1])));
}
}
}
由于没有人回答,我想我会回答这个问题,以防有人也需要帮助
这是常规的 DTW 算法:
/// <summary>
/// DTW Algorithms
/// Takes input 2 sequences: input and template to calculate the shortest distance between them
/// Uses the function "distance" defined above to calculate distance between two frames
/// 2D array -> DTW[,] with dimentions: number of frames in input, number of frames in template
/// returns the last element in the 2D array, which is the difference between the two sequences
/// </summary>
///
double DTWDistance(Sequence input, Sequence template)
{
int rows = input.Frames.Length, columns = template.Frames.Length;
if (rows < (double)(columns / 2) || columns < (double)(rows / 2))
{
return double.MaxValue;
}
double[,] DTW = new double[rows, columns];
DTW[0, 0] = 0;
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < columns; j++)
{
double cost = distance(input.Frames[i], template.Frames[j]);
if (i == 0 && j == 0)
DTW[i, j] = cost;
else if (i == 0)
DTW[i, j] = cost + DTW[i, j - 1];
else if (j == 0)
DTW[i, j] = cost + DTW[i - 1, j];
else
DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], DTW[i - 1, j - 1]));// insert ,match
}
}
return DTW[rows - 1, columns - 1];
}
而且我还在常规 DTW 算法上实施了修剪策略,如下所示:
/// <summary>
/// DTW Algotithm with Pruning
/// Only Sakoe-Chiba band is caluculated and the rest is pruned
/// </summary>
float Pruning_DTWDistance(Sequence input, Sequence output)
{
int rows = input.Frames.Length, columns = output.Frames.Length;
if (rows < (double)(columns / 2) || columns < (double)(rows / 2))
{
return float.MaxValue;
}
float cost;
float[,] DTW = new float[rows + 1, columns + 1];
int w = Math.Abs(columns - rows);// window length -> |rows - columns|<= w
for (int i = 1; i <= rows; i++)
{
for (int j = Math.Max(1, i - w); j <= Math.Min(columns, i + w); j++)
{
if (DTW[i - 1, j] == 0)
DTW[i - 1, j] = float.MaxValue;
if (DTW[i - 1, j - 1] == 0)
DTW[i - 1, j - 1] = float.MaxValue;
DTW[0, 0] = 0;
cost = distance(input.Frames[i - 1], output.Frames[j - 1]);// frames 0 based
DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], DTW[i - 1, j - 1]));// insert ,match
}
}
return DTW[rows, columns];
}
两个函数都使用了一个辅助函数distance
,定义如下:
/// <summary>
/// Calculates the distance between two audio frames in the form of MFCCFrame objects
/// returns the difference in a float
/// </summary>
///
float distance(MFCCFrame frame1, MFCCFrame frame2)
{
double tempSum = 0;
for (int i = 0; i < 13; i++)
tempSum += Math.Pow(Math.Abs(frame1.Features[i] - frame2.Features[i]), 2);
return (float)Math.Sqrt(tempSum);
}
[编辑]
为了改善DTW算法的内存消耗,我只使用了2个数组而不是2D矩阵,更改后的算法如下所示:
double DTW_improved(Sequence input, Sequence template)
{
// Don't compare two sequences if one of their lengths is half the other's
if (input.Frames.Length <= (0.5 * template.Frames.Length) || template.Frames.Length <= (0.5 * input.Frames.Length))
return double.PositiveInfinity;
int rows = template.Frames.Length, columns = input.Frames.Length;
double[] c1 = new double[rows];
double[] c2 = new double[rows];
double[] temp; // To hold address only (use it in swapping address)
c1[0] = distance(input.Frames[0], template.Frames[0]);
for (int i = 1; i < rows; i++)
c1[i] = c1[i - 1] + distance(input.Frames[0], template.Frames[i]);
for (int i = 1; i < columns; i++)
{
c2[0] = distance(input.Frames[i], template.Frames[0]) + c1[0];
c2[1] = distance(input.Frames[i], template.Frames[1]) + Math.Min(c1[0], c1[1]);
// Calculating first 2 elements of the array before the loop
//since they both need special conditions
for (int j = 2; j < rows; j++)
c2[j] = Math.Min(c1[j], Math.Min(c1[j - 1], c1[j - 2])) + distance(input.Frames[i], template.Frames[j]);
if (i != columns - 1) // Swapping addresses of c1 & c2
{
temp = c2;
c2 = c1;
c1 = temp;
}
}
return c2[rows - 1] / (0.5 * (input.Frames.Length + template.Frames.Length)); // Normalization: Dividing edit distance by average of input length & template length
}
我已经尝试为语音识别应用程序实现 DTW 算法,我已经成功地完成了,现在我正在尝试通过修剪来提高 DTW 算法的性能。我尝试寻找对此算法的改进,发现我应该以某种方式计算二维数组 DTW
中特定范围内的值,如 Image #1
所示,但我没有似乎确切地知道该怎么做。有人可以提供任何帮助吗?
包含算法代码 (C#)
/// <summary>
/// Calculates the distance between two audio frames in the form of two MFCCFrame objects as input parameters
/// returns the difference in a double
/// </summary>
double distance(MFCCFrame frame1, MFCCFrame frame2)
{
double tempSum = 0;
for (int i = 0; i < 13; i++)
tempSum += Math.Pow(Math.Abs(frame1.Features[i] - frame2.Features[i]), 2);
return Math.Sqrt(tempSum);
}
/// <summary>
/// DTW Algorithms
/// Takes input 2 sequences: seq1 and seq2 to calculate the shortest distance between them
/// Uses the function "distance" defined above to calculate distance between two frames
/// 2D array -> DTW[,] with dimentions: number of frames in seq 1, number of frames in seq2
/// returns the last element in the 2D array, which is the difference between the two sequences
/// </summary>
double DTWDistance(Sequence seq1, Sequence seq2)
{
int m = seq1.Frames.Length, n = seq2.Frames.Length;
double[,] DTW = new double[m, n];
DTW[0, 0] = 0;
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
double cost = distance(seq1.Frames[i], seq2.Frames[j]);
if (i == 0 && j == 0)
DTW[i, j] = cost;
else if (i == 0)
DTW[i, j] = cost + DTW[i, j - 1];
else if (j == 0)
DTW[i, j] = cost + DTW[i - 1, j];
else
DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], Math.Min(DTW[i, j - 1], DTW[i - 1, j - 1])));
}
}
}
由于没有人回答,我想我会回答这个问题,以防有人也需要帮助 这是常规的 DTW 算法:
/// <summary>
/// DTW Algorithms
/// Takes input 2 sequences: input and template to calculate the shortest distance between them
/// Uses the function "distance" defined above to calculate distance between two frames
/// 2D array -> DTW[,] with dimentions: number of frames in input, number of frames in template
/// returns the last element in the 2D array, which is the difference between the two sequences
/// </summary>
///
double DTWDistance(Sequence input, Sequence template)
{
int rows = input.Frames.Length, columns = template.Frames.Length;
if (rows < (double)(columns / 2) || columns < (double)(rows / 2))
{
return double.MaxValue;
}
double[,] DTW = new double[rows, columns];
DTW[0, 0] = 0;
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < columns; j++)
{
double cost = distance(input.Frames[i], template.Frames[j]);
if (i == 0 && j == 0)
DTW[i, j] = cost;
else if (i == 0)
DTW[i, j] = cost + DTW[i, j - 1];
else if (j == 0)
DTW[i, j] = cost + DTW[i - 1, j];
else
DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], DTW[i - 1, j - 1]));// insert ,match
}
}
return DTW[rows - 1, columns - 1];
}
而且我还在常规 DTW 算法上实施了修剪策略,如下所示:
/// <summary>
/// DTW Algotithm with Pruning
/// Only Sakoe-Chiba band is caluculated and the rest is pruned
/// </summary>
float Pruning_DTWDistance(Sequence input, Sequence output)
{
int rows = input.Frames.Length, columns = output.Frames.Length;
if (rows < (double)(columns / 2) || columns < (double)(rows / 2))
{
return float.MaxValue;
}
float cost;
float[,] DTW = new float[rows + 1, columns + 1];
int w = Math.Abs(columns - rows);// window length -> |rows - columns|<= w
for (int i = 1; i <= rows; i++)
{
for (int j = Math.Max(1, i - w); j <= Math.Min(columns, i + w); j++)
{
if (DTW[i - 1, j] == 0)
DTW[i - 1, j] = float.MaxValue;
if (DTW[i - 1, j - 1] == 0)
DTW[i - 1, j - 1] = float.MaxValue;
DTW[0, 0] = 0;
cost = distance(input.Frames[i - 1], output.Frames[j - 1]);// frames 0 based
DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], DTW[i - 1, j - 1]));// insert ,match
}
}
return DTW[rows, columns];
}
两个函数都使用了一个辅助函数distance
,定义如下:
/// <summary>
/// Calculates the distance between two audio frames in the form of MFCCFrame objects
/// returns the difference in a float
/// </summary>
///
float distance(MFCCFrame frame1, MFCCFrame frame2)
{
double tempSum = 0;
for (int i = 0; i < 13; i++)
tempSum += Math.Pow(Math.Abs(frame1.Features[i] - frame2.Features[i]), 2);
return (float)Math.Sqrt(tempSum);
}
[编辑] 为了改善DTW算法的内存消耗,我只使用了2个数组而不是2D矩阵,更改后的算法如下所示:
double DTW_improved(Sequence input, Sequence template)
{
// Don't compare two sequences if one of their lengths is half the other's
if (input.Frames.Length <= (0.5 * template.Frames.Length) || template.Frames.Length <= (0.5 * input.Frames.Length))
return double.PositiveInfinity;
int rows = template.Frames.Length, columns = input.Frames.Length;
double[] c1 = new double[rows];
double[] c2 = new double[rows];
double[] temp; // To hold address only (use it in swapping address)
c1[0] = distance(input.Frames[0], template.Frames[0]);
for (int i = 1; i < rows; i++)
c1[i] = c1[i - 1] + distance(input.Frames[0], template.Frames[i]);
for (int i = 1; i < columns; i++)
{
c2[0] = distance(input.Frames[i], template.Frames[0]) + c1[0];
c2[1] = distance(input.Frames[i], template.Frames[1]) + Math.Min(c1[0], c1[1]);
// Calculating first 2 elements of the array before the loop
//since they both need special conditions
for (int j = 2; j < rows; j++)
c2[j] = Math.Min(c1[j], Math.Min(c1[j - 1], c1[j - 2])) + distance(input.Frames[i], template.Frames[j]);
if (i != columns - 1) // Swapping addresses of c1 & c2
{
temp = c2;
c2 = c1;
c1 = temp;
}
}
return c2[rows - 1] / (0.5 * (input.Frames.Length + template.Frames.Length)); // Normalization: Dividing edit distance by average of input length & template length
}