C:如何在 2000 万条记录上使用合并排序优化内存使用
C: How to optimize memory usage with mergesort on 20million records
所以我有一个大文件,我必须使用合并排序进行排序。
文件大小:700MB,2000 万条记录。
现在我的程序占用了大约 7BG 的内存,我想这很糟糕。
我想我做了一些无用的分配,但我不确定,感谢任何帮助。
让我们开始吧:
typedef struct {
void **array; // array of struct type record
long used;
long size;
int (*precedes)(void *, void *); // function for comparison
} Array;
与struct Array
一起使用的一些函数:
void initArray(Array *a, int (*precedes)(void*, void*)) {
if (precedes == NULL) {
fprintf(stderr, "array_create: precedes parameter cannot be NULL");
exit(EXIT_FAILURE);
}
a->array = (void**)malloc(INITIAL_SIZE * sizeof(void*));
if (a->array == NULL) {
fprintf(stderr, "array_locate: unable to allocate memory for the a-> array");
exit(EXIT_FAILURE);
}
a->used = 0;
a->size = INITIAL_SIZE;
a->precedes = precedes;
}
void insertArray(Array *a, void *element) {
if (a == NULL) {
fprintf(stderr, "array_element: array parameter cannot be NULL");
exit(EXIT_FAILURE);
}
if (element == NULL) {
fprintf(stderr, "array_element: element parameter cannot be NULL");
exit(EXIT_FAILURE);
}
if (a->used == a->size) {
a->size *= 2;
a->array = (void**)realloc(a->array,(unsigned long)a->size * sizeof(void *));
if (a->array == NULL) {
fprintf(stderr,"array_resize: unable to reallocate memory to host the new element");
exit(EXIT_FAILURE);
}
}
a->array[a->used++] = element;
}
void *get_element(Array *a, long index) {
if (a == NULL) {
fprintf(stderr, "array parameter cannot be NULL");
exit(EXIT_FAILURE);
}
if (index >= a->size) {
fprintf(stderr, "Index %lu is out of the array bounds", index);
exit(EXIT_FAILURE);
}
return a->array[index];
}
unsigned long array_size(Array *a) {
if (a == NULL) {
fprintf(stderr, "array_size: ordered_array parameter cannot be NULL");
exit(EXIT_FAILURE);
}
return a->used;
}
int array_is_empty(Array *a) {
if (a == NULL) {
fprintf(stderr, "array_is_empty: ordered_array parameter cannot be NULL");
exit(EXIT_FAILURE);
}
return a->used == 0;
}
void freeArray(Array *a) {
free(a->array);
a->array = NULL;
a->used = a->size = 0;
free(a);
}
合并排序,我猜问题出在void ** temp = malloc(sizeof(void*) * a->used);
:
void mergeSort(Array *a, long lb, long ub) {
if (lb < ub) {
long mid = (lb + ub) / 2;
mergeSort(a, lb, mid);
mergeSort(a, mid + 1, ub);
merge(a, lb, mid, ub);
} else {
return;
}
}
void merge(Array *a, long lb, long mid, long ub) {
long i = lb;
long j = mid + 1;
long k = lb;
void **temp = malloc(sizeof(void*) * a->used); // this is bad I guess
while (i <= mid && j <= ub) {
if (a->precedes(a->array[i], a->array[j]) < 0 || a->precedes(a->array[i], a->array[j]) == 0) {
temp[k] = a->array[i];
i++;
} else {
temp[k] = a->array[j];
j++;
}
k++;
}
if (i > mid) {
while (j <= ub) {
temp[k] = a->array[j];
j++;
k++;
}
} else {
while (i <= mid) {
temp[k] = a->array[i];
i++;
k++;
}
}
for (i = lb ; i <= ub; i++) {
a->array[i] = temp[i];
}
}
主要:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lib.h"
#include <time.h>
#define BUFFSIZE 1024
typedef struct {
int id;
char *s_data;
int i_data;
double d_data;
} Record_t;
//function to get csv file fields
char *getfield(char *line, int num, char *parser) {
char *tok = strtok(line, parser);
for (int i = 0; i < num; i++) {
tok = strtok(NULL, parser);
}
if (tok != NULL) {
return tok;
}
return NULL;
}
//function for comparison two strings
static int precedes_record_string_field(void *r1_p, void *r2_p) {
if (r1_p == NULL) {
fprintf(stderr, "precedes_string: the first parameter is a null pointer");
exit(EXIT_FAILURE);
}
if (r2_p == NULL) {
fprintf(stderr, "precedes_string: the second parameter is a null pointer");
exit(EXIT_FAILURE);
}
Record_t *rec1_p = (Record_t *)r1_p;
Record_t *rec2_p = (Record_t *)r2_p;
return strcmp(rec1_p->s_data, rec2_p->s_data);
}
//loading the file
void load_file(Array *a, FILE *f, char *parser) {
printf("Loading file\n");
char line[BUFFSIZE];
while (fgets(line, BUFFSIZE, f)) {
char *tmp = strdup(line);
Record_t *record = malloc(sizeof(Record_t));
record->id = atoi(getfield(tmp, 0, parser));
free(tmp);
tmp = NULL;
tmp = strdup(line);
record->s_data = malloc(sizeof(char) * 150);
strcpy(record->s_data, getfield(tmp, 1, parser));
free(tmp);
tmp = NULL;
tmp = strdup(line);
record->i_data = atoi(getfield(tmp, 2, parser));
free(tmp);
tmp = NULL;
tmp = strdup(line);
record->d_data = atof(getfield(tmp, 3, parser));
insertArray(a, record);
free(tmp);
}
printf("File loaded\n");
}
// writing the file
void writeToFile(Array *a, char *name) {
FILE *fptr = fopen(name, "w");
if (fptr == NULL) {
printf("Could not open file");
return 0;
}
for (int i = 0; i < a->used; i++) {
Record_t *rec1_p = (Record_t *)a->array[i];
fprintf(fptr, "%d,%s,%d,%lf\n", rec1_p->id, rec1_p->s_data, rec1_p->i_data, rec1_p->d_data);
}
fclose(fptr);
}
//main
int main(int argc, char const *argv[]) {
Array *a = (Array *)malloc(sizeof(Array));
setbuf(stdout, NULL);
FILE *f = fopen(argv[1], "r");
if (!f) {
printf("Missing input file: %s\n", argv[1]);
exit(1);
}
int k = atoi(argv[2]);
initArray(a, precedes_record_string_field);
load_file(a, f, ",");
mergeSort(a, 0, a->used - 1);
writeToFile(a, "../recors_string.csv");
printf("done\n");
free(f);
freeArray(a);
return 0;
}
提前感谢您help.Appreciate的任何帮助。
更新:
我想在最后一个函数之前的合并中使用这个:
free(a->array);
a->array = NULL;
for (i = lb; i <= ub; i++) {
a->array[i] = temp[i];
}
但这样做会给我分段错误。据我所知,免费后你可以使用变量。
一次性分配第二个 700 mb 数组以用于合并排序,并使用索引对 select 来自原始数组和第二个数组的子数组进行索引。总内存将是 1.4 GB 加上局部变量,这对于 64 位 OS.
应该不是问题
您的内存分配策略存在多个问题:
merge
函数分配了一个临时数组 temp
,其中的条目太多,并且从不释放它。这是 运行 内存不足的有力候选者,并且由于您不检查 malloc 失败,因此当 malloc
returns NULL
.[=29= 时将发生未定义的行为]
getfield
不测试 strtok
失败,导致无效输入出现未定义的行为。
loadfile
分配和释放许多临时数组。在这个阶段使用本地数组可以避免碎片化。
其他一些问题:
最好使用类型 size_t
代替 long
。
你应该写fclose(f)
而不是free(f)
。
get_element
中的测试 if (index >= a->size)
应该是 if (index >= a->used)
。按照编码,您可能 return 一个未初始化的元素。
insertArray
是用词不当:为了保持一致性,您应该使用 appendArray
或 array_append
。
这是合并排序函数的修改版本:
void mergeSort(Array *a, size_t lb, size_t ub) {
if (lb < ub) {
size_t mid = lb + (ub - lb) / 2;
mergeSort(a, lb, mid);
mergeSort(a, mid + 1, ub);
merge(a, lb, mid, ub);
}
}
void merge(Array *a, size_t lb, size_t mid, size_t ub) {
size_t i = lb;
size_t j = mid + 1;
size_t k = 0;
void **temp = malloc(sizeof(void*) * (ub + 1 - lb);
if (temp == NULL) {
fprintf(stderr, "out of memory in merge\n");
exit(EXIT_FAILURE);
}
/* select elements from either half according to the comparison function */
while (i <= mid && j <= ub) {
if (a->precedes(a->array[i], a->array[j]) <= 0) {
temp[k++] = a->array[i++];
} else {
temp[k++] = a->array[j++];
}
}
/* copy the remaining elements from the left half */
while (i <= mid) {
temp[k++] = a->array[i++];
}
/* no need to copy the remaining elements from the right half:
they are already in place */
/* copy the sorted slice back to the array */
for (i = 0; i < k; i++) {
a->array[lb + i] = temp[i];
}
free(temp);
}
这是程序的修改版本,使用更少的内存:
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "lib.h"
#define BUFFSIZE 1024
typedef struct {
int id;
int i_data; // group int fields together for better structure packing
char *s_data;
double d_data;
} Record_t;
// function to get csv file fields
// use strspn and strcspn to avoid modifying the line
// return a pointer to the beginning of the field
char *getfield(const char *line, int num, const char *delimiters) {
const char *p = line; // points to the beginning of the field
while (num --> 0) {
p += strcspn(p, delimiters); // skip contents
p += strspn(p, delimiters); // skip delimiters
}
return (char *)p;
}
//loading the file
void load_file(Array *a, FILE *f, const char *delimiters) {
printf("Loading file\n");
char line[BUFFSIZE];
while (fgets(line, BUFFSIZE, f)) {
Record_t *record = malloc(sizeof(Record_t));
if (record == NULL) {
fprintf(stderr, "out of memory for record: %s\n", line);
exit(EXIT_FAILURE);
}
record->id = atoi(getfield(line, 0, delimiters));
char *p = getfield(tmp, 1, parser);
record->s_data = strndup(p, strcspn(p, delimiters));
record->i_data = atoi(getfield(tmp, 2, delimiters));
record->d_data = atof(getfield(tmp, 3, delimiters));
insertArray(a, record);
}
printf("File loaded\n");
}
// writing the file
int writeToFile(Array *a, const char *name) {
FILE *fptr = fopen(name, "w");
if (fptr == NULL) {
fprintf(stderr, "Could not open file %s: %s", name, strerror(errno));
return -1;
}
for (long i = 0; i < a->used; i++) {
Record_t *rec1_p = (Record_t *)a->array[i];
fprintf(fptr, "%d,%s,%d,%lf\n", rec1_p->id, rec1_p->s_data, rec1_p->i_data, rec1_p->d_data);
}
fclose(fptr);
return 0;
}
int main(int argc, char *argv[]) {
Array *a = (Array *)malloc(sizeof(Array));
if (a == NULL) {
fprintf(stderr, "out of memory for Array\n");
exit(EXIT_FAILURE);
}
setbuf(stdout, NULL);
FILE *f = fopen(argv[1], "r");
if (!f) {
printf("Missing input file: %s\n", argv[1]);
exit(EXIT_FAILURE);
}
initArray(a, precedes_record_string_field);
load_file(a, f, ",\n");
mergeSort(a, 0, a->used - 1);
writeToFile(a, "../recors_string.csv");
printf("done\n");
fclose(f);
freeArray(a);
return 0;
}
所以我有一个大文件,我必须使用合并排序进行排序。
文件大小:700MB,2000 万条记录。
现在我的程序占用了大约 7BG 的内存,我想这很糟糕。 我想我做了一些无用的分配,但我不确定,感谢任何帮助。
让我们开始吧:
typedef struct {
void **array; // array of struct type record
long used;
long size;
int (*precedes)(void *, void *); // function for comparison
} Array;
与struct Array
一起使用的一些函数:
void initArray(Array *a, int (*precedes)(void*, void*)) {
if (precedes == NULL) {
fprintf(stderr, "array_create: precedes parameter cannot be NULL");
exit(EXIT_FAILURE);
}
a->array = (void**)malloc(INITIAL_SIZE * sizeof(void*));
if (a->array == NULL) {
fprintf(stderr, "array_locate: unable to allocate memory for the a-> array");
exit(EXIT_FAILURE);
}
a->used = 0;
a->size = INITIAL_SIZE;
a->precedes = precedes;
}
void insertArray(Array *a, void *element) {
if (a == NULL) {
fprintf(stderr, "array_element: array parameter cannot be NULL");
exit(EXIT_FAILURE);
}
if (element == NULL) {
fprintf(stderr, "array_element: element parameter cannot be NULL");
exit(EXIT_FAILURE);
}
if (a->used == a->size) {
a->size *= 2;
a->array = (void**)realloc(a->array,(unsigned long)a->size * sizeof(void *));
if (a->array == NULL) {
fprintf(stderr,"array_resize: unable to reallocate memory to host the new element");
exit(EXIT_FAILURE);
}
}
a->array[a->used++] = element;
}
void *get_element(Array *a, long index) {
if (a == NULL) {
fprintf(stderr, "array parameter cannot be NULL");
exit(EXIT_FAILURE);
}
if (index >= a->size) {
fprintf(stderr, "Index %lu is out of the array bounds", index);
exit(EXIT_FAILURE);
}
return a->array[index];
}
unsigned long array_size(Array *a) {
if (a == NULL) {
fprintf(stderr, "array_size: ordered_array parameter cannot be NULL");
exit(EXIT_FAILURE);
}
return a->used;
}
int array_is_empty(Array *a) {
if (a == NULL) {
fprintf(stderr, "array_is_empty: ordered_array parameter cannot be NULL");
exit(EXIT_FAILURE);
}
return a->used == 0;
}
void freeArray(Array *a) {
free(a->array);
a->array = NULL;
a->used = a->size = 0;
free(a);
}
合并排序,我猜问题出在void ** temp = malloc(sizeof(void*) * a->used);
:
void mergeSort(Array *a, long lb, long ub) {
if (lb < ub) {
long mid = (lb + ub) / 2;
mergeSort(a, lb, mid);
mergeSort(a, mid + 1, ub);
merge(a, lb, mid, ub);
} else {
return;
}
}
void merge(Array *a, long lb, long mid, long ub) {
long i = lb;
long j = mid + 1;
long k = lb;
void **temp = malloc(sizeof(void*) * a->used); // this is bad I guess
while (i <= mid && j <= ub) {
if (a->precedes(a->array[i], a->array[j]) < 0 || a->precedes(a->array[i], a->array[j]) == 0) {
temp[k] = a->array[i];
i++;
} else {
temp[k] = a->array[j];
j++;
}
k++;
}
if (i > mid) {
while (j <= ub) {
temp[k] = a->array[j];
j++;
k++;
}
} else {
while (i <= mid) {
temp[k] = a->array[i];
i++;
k++;
}
}
for (i = lb ; i <= ub; i++) {
a->array[i] = temp[i];
}
}
主要:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lib.h"
#include <time.h>
#define BUFFSIZE 1024
typedef struct {
int id;
char *s_data;
int i_data;
double d_data;
} Record_t;
//function to get csv file fields
char *getfield(char *line, int num, char *parser) {
char *tok = strtok(line, parser);
for (int i = 0; i < num; i++) {
tok = strtok(NULL, parser);
}
if (tok != NULL) {
return tok;
}
return NULL;
}
//function for comparison two strings
static int precedes_record_string_field(void *r1_p, void *r2_p) {
if (r1_p == NULL) {
fprintf(stderr, "precedes_string: the first parameter is a null pointer");
exit(EXIT_FAILURE);
}
if (r2_p == NULL) {
fprintf(stderr, "precedes_string: the second parameter is a null pointer");
exit(EXIT_FAILURE);
}
Record_t *rec1_p = (Record_t *)r1_p;
Record_t *rec2_p = (Record_t *)r2_p;
return strcmp(rec1_p->s_data, rec2_p->s_data);
}
//loading the file
void load_file(Array *a, FILE *f, char *parser) {
printf("Loading file\n");
char line[BUFFSIZE];
while (fgets(line, BUFFSIZE, f)) {
char *tmp = strdup(line);
Record_t *record = malloc(sizeof(Record_t));
record->id = atoi(getfield(tmp, 0, parser));
free(tmp);
tmp = NULL;
tmp = strdup(line);
record->s_data = malloc(sizeof(char) * 150);
strcpy(record->s_data, getfield(tmp, 1, parser));
free(tmp);
tmp = NULL;
tmp = strdup(line);
record->i_data = atoi(getfield(tmp, 2, parser));
free(tmp);
tmp = NULL;
tmp = strdup(line);
record->d_data = atof(getfield(tmp, 3, parser));
insertArray(a, record);
free(tmp);
}
printf("File loaded\n");
}
// writing the file
void writeToFile(Array *a, char *name) {
FILE *fptr = fopen(name, "w");
if (fptr == NULL) {
printf("Could not open file");
return 0;
}
for (int i = 0; i < a->used; i++) {
Record_t *rec1_p = (Record_t *)a->array[i];
fprintf(fptr, "%d,%s,%d,%lf\n", rec1_p->id, rec1_p->s_data, rec1_p->i_data, rec1_p->d_data);
}
fclose(fptr);
}
//main
int main(int argc, char const *argv[]) {
Array *a = (Array *)malloc(sizeof(Array));
setbuf(stdout, NULL);
FILE *f = fopen(argv[1], "r");
if (!f) {
printf("Missing input file: %s\n", argv[1]);
exit(1);
}
int k = atoi(argv[2]);
initArray(a, precedes_record_string_field);
load_file(a, f, ",");
mergeSort(a, 0, a->used - 1);
writeToFile(a, "../recors_string.csv");
printf("done\n");
free(f);
freeArray(a);
return 0;
}
提前感谢您help.Appreciate的任何帮助。
更新:
我想在最后一个函数之前的合并中使用这个:
free(a->array);
a->array = NULL;
for (i = lb; i <= ub; i++) {
a->array[i] = temp[i];
}
但这样做会给我分段错误。据我所知,免费后你可以使用变量。
一次性分配第二个 700 mb 数组以用于合并排序,并使用索引对 select 来自原始数组和第二个数组的子数组进行索引。总内存将是 1.4 GB 加上局部变量,这对于 64 位 OS.
应该不是问题您的内存分配策略存在多个问题:
merge
函数分配了一个临时数组temp
,其中的条目太多,并且从不释放它。这是 运行 内存不足的有力候选者,并且由于您不检查 malloc 失败,因此当malloc
returnsNULL
.[=29= 时将发生未定义的行为]getfield
不测试strtok
失败,导致无效输入出现未定义的行为。loadfile
分配和释放许多临时数组。在这个阶段使用本地数组可以避免碎片化。
其他一些问题:
最好使用类型
size_t
代替long
。你应该写
fclose(f)
而不是free(f)
。get_element
中的测试if (index >= a->size)
应该是if (index >= a->used)
。按照编码,您可能 return 一个未初始化的元素。insertArray
是用词不当:为了保持一致性,您应该使用appendArray
或array_append
。
这是合并排序函数的修改版本:
void mergeSort(Array *a, size_t lb, size_t ub) {
if (lb < ub) {
size_t mid = lb + (ub - lb) / 2;
mergeSort(a, lb, mid);
mergeSort(a, mid + 1, ub);
merge(a, lb, mid, ub);
}
}
void merge(Array *a, size_t lb, size_t mid, size_t ub) {
size_t i = lb;
size_t j = mid + 1;
size_t k = 0;
void **temp = malloc(sizeof(void*) * (ub + 1 - lb);
if (temp == NULL) {
fprintf(stderr, "out of memory in merge\n");
exit(EXIT_FAILURE);
}
/* select elements from either half according to the comparison function */
while (i <= mid && j <= ub) {
if (a->precedes(a->array[i], a->array[j]) <= 0) {
temp[k++] = a->array[i++];
} else {
temp[k++] = a->array[j++];
}
}
/* copy the remaining elements from the left half */
while (i <= mid) {
temp[k++] = a->array[i++];
}
/* no need to copy the remaining elements from the right half:
they are already in place */
/* copy the sorted slice back to the array */
for (i = 0; i < k; i++) {
a->array[lb + i] = temp[i];
}
free(temp);
}
这是程序的修改版本,使用更少的内存:
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "lib.h"
#define BUFFSIZE 1024
typedef struct {
int id;
int i_data; // group int fields together for better structure packing
char *s_data;
double d_data;
} Record_t;
// function to get csv file fields
// use strspn and strcspn to avoid modifying the line
// return a pointer to the beginning of the field
char *getfield(const char *line, int num, const char *delimiters) {
const char *p = line; // points to the beginning of the field
while (num --> 0) {
p += strcspn(p, delimiters); // skip contents
p += strspn(p, delimiters); // skip delimiters
}
return (char *)p;
}
//loading the file
void load_file(Array *a, FILE *f, const char *delimiters) {
printf("Loading file\n");
char line[BUFFSIZE];
while (fgets(line, BUFFSIZE, f)) {
Record_t *record = malloc(sizeof(Record_t));
if (record == NULL) {
fprintf(stderr, "out of memory for record: %s\n", line);
exit(EXIT_FAILURE);
}
record->id = atoi(getfield(line, 0, delimiters));
char *p = getfield(tmp, 1, parser);
record->s_data = strndup(p, strcspn(p, delimiters));
record->i_data = atoi(getfield(tmp, 2, delimiters));
record->d_data = atof(getfield(tmp, 3, delimiters));
insertArray(a, record);
}
printf("File loaded\n");
}
// writing the file
int writeToFile(Array *a, const char *name) {
FILE *fptr = fopen(name, "w");
if (fptr == NULL) {
fprintf(stderr, "Could not open file %s: %s", name, strerror(errno));
return -1;
}
for (long i = 0; i < a->used; i++) {
Record_t *rec1_p = (Record_t *)a->array[i];
fprintf(fptr, "%d,%s,%d,%lf\n", rec1_p->id, rec1_p->s_data, rec1_p->i_data, rec1_p->d_data);
}
fclose(fptr);
return 0;
}
int main(int argc, char *argv[]) {
Array *a = (Array *)malloc(sizeof(Array));
if (a == NULL) {
fprintf(stderr, "out of memory for Array\n");
exit(EXIT_FAILURE);
}
setbuf(stdout, NULL);
FILE *f = fopen(argv[1], "r");
if (!f) {
printf("Missing input file: %s\n", argv[1]);
exit(EXIT_FAILURE);
}
initArray(a, precedes_record_string_field);
load_file(a, f, ",\n");
mergeSort(a, 0, a->used - 1);
writeToFile(a, "../recors_string.csv");
printf("done\n");
fclose(f);
freeArray(a);
return 0;
}