为什么我的 levenshtein 距离计算器无法处理 PDF 文件?
Why my levenshtein distance calculator fails with PDF file?
我正在尝试创建一个程序来计算两个文件之间的编辑距离。我用 funcution fread 读取,并使用代码读取二进制文件(“rb”)。我输入了两个 PDF 文件,在调试过程中我发现当我尝试填充 Levenshtein 距离算法的矩阵时,我在第一个文件的 char n° 1354 处得到一个“SIGSEGV(分段错误)”,然后程序退出与:
Process finished with exit code -1073741819 (0xC0000005)
我控制了 char n° 1354 是 \n .
我用来读取文件的代码是:
long getFileSize(FILE *file) {
long int size;
fseek(file, 0, SEEK_END);
size = ftell(file);
fseek(file, 0, SEEK_SET);
return size;
}
char *readFromBinary(char *path) {
FILE *file;
file = fopen(path, "rb");
if (file == NULL)
printf("Error!\n");
long fileSize = getFileSize(file);
char *buffer = malloc((fileSize + 1) * sizeof(char));
fread(buffer, sizeof(char), fileSize, file);
return buffer;
}
这是我用来计算编辑距离的代码:
int calculateDistance(char *pathFile1, char *pathFile2, int choice, char *path) {
FILE *f1 = fopen(pathFile1, "rb");
FILE *f2 = fopen(pathFile2, "rb");
char *contentFile1 = readFromBinary(pathFile1);
char *contentFile2 = readFromBinary(pathFile2);
int distance = 0;
int dim1 = getFileSize(f1);
int dim2 = getFileSize(f2);
int **matrix = constructMatrix(dim1, dim2);
fillMatrix(matrix, dim1, dim2, contentFile1, contentFile2);
distance = matrix[dim1][dim2];
struct Instruction instruction[distance + 1];
int initActions = initInstructions(matrix, pathFile1, &dim1, pathFile2, &dim2, instruction);
endInstructions(pathFile1, &dim1, pathFile2, &dim2, instruction, initActions);
if (choice == 1)
printOnFile(instruction, distance, path);
for (int i = 0; i <= dim1; i++)
free(matrix[i]);
free(matrix);
if (numberOfDivisions > 0)
numberOfDivisions--;
return distance;
}
这是我用来创建和填充矩阵的代码:
int **constructMatrix(int dim1, int dim2) {
//matrice di puntatori
int **matrice = (int **) malloc((dim1 + 1) * sizeof(int *));
//matrice di puntatori
for (int i = 0; i <= dim1; i++)
matrice[i] = (int *) malloc((dim2 + 1) * sizeof(int));
return matrice;
}
void fillMatrix(int **matrix, int dim1, int dim2, char *file1, char *file2) {
for (int i = 0; i <= dim1; i++)
matrix[i][0] = i;
for (int j = 1; j <= dim2; j++)
matrix[0][j] = j;
for (int i = 1; i <= dim1; i++) {
for (int j = 1; j <= dim2; j++) {
if (file1[i - 1] != file2[j - 1]) {
int k = minimum(matrix[i][j - 1], matrix[i - 1][j], matrix[i - 1][j - 1]);
matrix[i][j] = k + 1;
} else
matrix[i][j] = matrix[i - 1][j - 1];
}
}
}
特别是当 i=1354 时调试器在 calculateDistance(fillMatrix(matrix, dim1, dim2, contentFile1, contentFile2);
) 的这一行和 fillMatrix(matrix[i][0] = i;
) 的这一行停止。
PDF相关信息:
PDF文件为188671字节
它有1355行
PS。我的程序使用 txt 文件。
您至少分配了 188671 * 1355 * 4 字节 = 1022596820 字节。您确实需要检查 malloc 中的 return 值以确保它能够成功分配。
当任何内存分配函数,包括malloc, calloc, and realloc() make a request to the OS to obtain memory, unless the OS can find a single block of contiguous所请求的内存大小时,该函数都会returnNULL
。由于您要求的块大小令人难以置信,因此很可能会失败。
始终建议在尝试使用 returned:return 之前测试任何这些函数的 return:
char *buffer = malloc((fileSize + 1) * sizeof(char));
if(!buffer)
{
//handle error
在这种情况下,re-evaluate 你的算法会很好。
我正在尝试创建一个程序来计算两个文件之间的编辑距离。我用 funcution fread 读取,并使用代码读取二进制文件(“rb”)。我输入了两个 PDF 文件,在调试过程中我发现当我尝试填充 Levenshtein 距离算法的矩阵时,我在第一个文件的 char n° 1354 处得到一个“SIGSEGV(分段错误)”,然后程序退出与:
Process finished with exit code -1073741819 (0xC0000005)
我控制了 char n° 1354 是 \n .
我用来读取文件的代码是:
long getFileSize(FILE *file) {
long int size;
fseek(file, 0, SEEK_END);
size = ftell(file);
fseek(file, 0, SEEK_SET);
return size;
}
char *readFromBinary(char *path) {
FILE *file;
file = fopen(path, "rb");
if (file == NULL)
printf("Error!\n");
long fileSize = getFileSize(file);
char *buffer = malloc((fileSize + 1) * sizeof(char));
fread(buffer, sizeof(char), fileSize, file);
return buffer;
}
这是我用来计算编辑距离的代码:
int calculateDistance(char *pathFile1, char *pathFile2, int choice, char *path) {
FILE *f1 = fopen(pathFile1, "rb");
FILE *f2 = fopen(pathFile2, "rb");
char *contentFile1 = readFromBinary(pathFile1);
char *contentFile2 = readFromBinary(pathFile2);
int distance = 0;
int dim1 = getFileSize(f1);
int dim2 = getFileSize(f2);
int **matrix = constructMatrix(dim1, dim2);
fillMatrix(matrix, dim1, dim2, contentFile1, contentFile2);
distance = matrix[dim1][dim2];
struct Instruction instruction[distance + 1];
int initActions = initInstructions(matrix, pathFile1, &dim1, pathFile2, &dim2, instruction);
endInstructions(pathFile1, &dim1, pathFile2, &dim2, instruction, initActions);
if (choice == 1)
printOnFile(instruction, distance, path);
for (int i = 0; i <= dim1; i++)
free(matrix[i]);
free(matrix);
if (numberOfDivisions > 0)
numberOfDivisions--;
return distance;
}
这是我用来创建和填充矩阵的代码:
int **constructMatrix(int dim1, int dim2) {
//matrice di puntatori
int **matrice = (int **) malloc((dim1 + 1) * sizeof(int *));
//matrice di puntatori
for (int i = 0; i <= dim1; i++)
matrice[i] = (int *) malloc((dim2 + 1) * sizeof(int));
return matrice;
}
void fillMatrix(int **matrix, int dim1, int dim2, char *file1, char *file2) {
for (int i = 0; i <= dim1; i++)
matrix[i][0] = i;
for (int j = 1; j <= dim2; j++)
matrix[0][j] = j;
for (int i = 1; i <= dim1; i++) {
for (int j = 1; j <= dim2; j++) {
if (file1[i - 1] != file2[j - 1]) {
int k = minimum(matrix[i][j - 1], matrix[i - 1][j], matrix[i - 1][j - 1]);
matrix[i][j] = k + 1;
} else
matrix[i][j] = matrix[i - 1][j - 1];
}
}
}
特别是当 i=1354 时调试器在 calculateDistance(fillMatrix(matrix, dim1, dim2, contentFile1, contentFile2);
) 的这一行和 fillMatrix(matrix[i][0] = i;
) 的这一行停止。
PDF相关信息:
PDF文件为188671字节
它有1355行
PS。我的程序使用 txt 文件。
您至少分配了 188671 * 1355 * 4 字节 = 1022596820 字节。您确实需要检查 malloc 中的 return 值以确保它能够成功分配。
当任何内存分配函数,包括malloc, calloc, and realloc() make a request to the OS to obtain memory, unless the OS can find a single block of contiguous所请求的内存大小时,该函数都会returnNULL
。由于您要求的块大小令人难以置信,因此很可能会失败。
始终建议在尝试使用 returned:return 之前测试任何这些函数的 return:
char *buffer = malloc((fileSize + 1) * sizeof(char));
if(!buffer)
{
//handle error
在这种情况下,re-evaluate 你的算法会很好。