C:如何在 2000 万条记录上使用合并排序优化内存使用

C: How to optimize memory usage with mergesort on 20million records

所以我有一个大文件,我必须使用合并排序进行排序。

文件大小:700MB,2000 万条记录。

现在我的程序占用了大约 7BG 的内存,我想这很糟糕。 我想我做了一些无用的分配,但我不确定,感谢任何帮助。

让我们开始吧:

typedef struct {
    void **array; // array of struct type record
    long used;
    long size;
    int (*precedes)(void *, void *); // function for comparison 
} Array;

struct Array一起使用的一些函数:

void initArray(Array *a, int (*precedes)(void*, void*)) {
    if (precedes == NULL) {
        fprintf(stderr, "array_create: precedes parameter cannot be NULL");
        exit(EXIT_FAILURE);
    }
    a->array = (void**)malloc(INITIAL_SIZE * sizeof(void*));
    if (a->array == NULL) {
        fprintf(stderr, "array_locate: unable to allocate memory for the a-> array");
        exit(EXIT_FAILURE);
    }
    a->used = 0;
    a->size = INITIAL_SIZE;
    a->precedes = precedes;
}

void insertArray(Array *a, void *element) {
    if (a == NULL) {
        fprintf(stderr, "array_element: array parameter cannot be NULL");
        exit(EXIT_FAILURE);
    }
    if (element == NULL) {
        fprintf(stderr, "array_element: element parameter cannot be NULL");
        exit(EXIT_FAILURE);
    }
    if (a->used == a->size) {
        a->size *= 2;
        a->array = (void**)realloc(a->array,(unsigned long)a->size * sizeof(void *));
        if (a->array == NULL) {
            fprintf(stderr,"array_resize: unable to reallocate memory to host the new element");
            exit(EXIT_FAILURE);
        }
    }
    a->array[a->used++] = element;
}

void *get_element(Array *a, long index) {
    if (a == NULL) {
        fprintf(stderr, "array parameter cannot be NULL");
        exit(EXIT_FAILURE);
    }
    if (index >= a->size) {
        fprintf(stderr,  "Index %lu is out of the array bounds", index);
        exit(EXIT_FAILURE);
    }
    return a->array[index];
}

unsigned long array_size(Array *a) {
    if (a == NULL) {
        fprintf(stderr, "array_size: ordered_array parameter cannot be NULL");
        exit(EXIT_FAILURE);
    }
    return a->used;
}

int array_is_empty(Array *a) {
    if (a == NULL) {
        fprintf(stderr, "array_is_empty: ordered_array parameter cannot be NULL");
        exit(EXIT_FAILURE);
    }
    return a->used == 0;
}

void freeArray(Array *a) {
    free(a->array);
    a->array = NULL;
    a->used = a->size = 0;
    free(a);
}

合并排序,我猜问题出在void ** temp = malloc(sizeof(void*) * a->used);:

void mergeSort(Array *a, long lb, long ub) {
    if (lb < ub) {
        long mid = (lb + ub) / 2;
        mergeSort(a, lb, mid);
        mergeSort(a, mid + 1, ub);
        merge(a, lb, mid, ub);
    } else {
        return;
    }
}

void merge(Array *a, long lb, long mid, long ub) {
    long i = lb;
    long j = mid + 1;
    long k = lb;
    void **temp = malloc(sizeof(void*) * a->used); // this is bad I guess
    while (i <= mid && j <= ub) {
        if (a->precedes(a->array[i], a->array[j]) < 0 || a->precedes(a->array[i], a->array[j]) == 0) {
            temp[k] = a->array[i];
            i++;
        } else {
            temp[k] = a->array[j];
            j++;
        }
        k++;
    }

    if (i > mid) {
        while (j <= ub) {
            temp[k] = a->array[j];
            j++;
            k++;
        }
    } else {
        while (i <= mid) {
            temp[k] = a->array[i];
            i++;
            k++;
        }
    }
    for (i = lb ; i <= ub; i++) {
        a->array[i] = temp[i];
    }
}

主要:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lib.h"
#include <time.h>

#define BUFFSIZE 1024

typedef struct {
    int id;
    char *s_data;
    int i_data;
    double d_data;
} Record_t;

//function to get csv file fields
char *getfield(char *line, int num, char *parser) {
    char *tok = strtok(line, parser);
    for (int i = 0; i < num; i++) {
        tok = strtok(NULL, parser);
    }
    if (tok != NULL) {
        return tok;
    }
    return NULL;
}

//function for comparison two strings
static int precedes_record_string_field(void *r1_p, void *r2_p) {
    if (r1_p == NULL) {
        fprintf(stderr, "precedes_string: the first parameter is a null pointer");
        exit(EXIT_FAILURE);
    }
    if (r2_p == NULL) {
        fprintf(stderr, "precedes_string: the second parameter is a null pointer");
        exit(EXIT_FAILURE);
    }
    Record_t *rec1_p = (Record_t *)r1_p;
    Record_t *rec2_p = (Record_t *)r2_p;
    return strcmp(rec1_p->s_data, rec2_p->s_data);
}

//loading the file
void load_file(Array *a, FILE *f, char *parser) {
    printf("Loading file\n");
    char line[BUFFSIZE];
    while (fgets(line, BUFFSIZE, f)) {
        char *tmp = strdup(line);
        Record_t *record = malloc(sizeof(Record_t));
        record->id = atoi(getfield(tmp, 0, parser));
        free(tmp);
        tmp = NULL;
        tmp = strdup(line);
        record->s_data = malloc(sizeof(char) * 150);
        strcpy(record->s_data, getfield(tmp, 1, parser));
        free(tmp);
        tmp = NULL;
        tmp = strdup(line);
        record->i_data = atoi(getfield(tmp, 2, parser));
        free(tmp);
        tmp = NULL;
        tmp = strdup(line);
        record->d_data = atof(getfield(tmp, 3, parser));
        insertArray(a, record);
        free(tmp);
    }
    printf("File loaded\n");
}

// writing the file
void writeToFile(Array *a, char *name) {
    FILE *fptr = fopen(name, "w");
    if (fptr == NULL) {
        printf("Could not open file");
        return 0;
    }
    for (int i = 0; i < a->used; i++) {
        Record_t *rec1_p = (Record_t *)a->array[i];
        fprintf(fptr, "%d,%s,%d,%lf\n", rec1_p->id, rec1_p->s_data, rec1_p->i_data, rec1_p->d_data);
    }
    fclose(fptr);
}

//main
int main(int argc, char const *argv[]) {
    Array *a = (Array *)malloc(sizeof(Array));
    setbuf(stdout, NULL);
    FILE *f = fopen(argv[1], "r");
    if (!f) {
        printf("Missing input file: %s\n", argv[1]);
        exit(1);
    }
    int k = atoi(argv[2]);
    initArray(a, precedes_record_string_field);
    load_file(a, f, ",");
    mergeSort(a, 0, a->used - 1);
    writeToFile(a, "../recors_string.csv");
    printf("done\n");

    free(f);
    freeArray(a);
    return 0;
}

提前感谢您help.Appreciate的任何帮助。


更新:

我想在最后一个函数之前的合并中使用这个:

free(a->array);
a->array = NULL;
for (i = lb; i <= ub; i++) {
    a->array[i] = temp[i];
}

但这样做会给我分段错误。据我所知,免费后你可以使用变量。

一次性分配第二个 700 mb 数组以用于合并排序,并使用索引对 select 来自原始数组和第二个数组的子数组进行索引。总内存将是 1.4 GB 加上局部变量,这对于 64 位 OS.

应该不是问题

您的内存分配策略存在多个问题:

  • merge 函数分配了一个临时数组 temp,其中的条目太多,并且从不释放它。这是 运行 内存不足的有力候选者,并且由于您不检查 malloc 失败,因此当 malloc returns NULL.[=29= 时将发生未定义的行为]

  • getfield 不测试 strtok 失败,导致无效输入出现未定义的行为。

  • loadfile 分配和释放许多临时数组。在这个阶段使用本地数组可以避免碎片化。

其他一些问题:

  • 最好使用类型 size_t 代替 long

  • 你应该写fclose(f)而不是free(f)

  • get_element 中的测试 if (index >= a->size) 应该是 if (index >= a->used)。按照编码,您可能 return 一个未初始化的元素。

  • insertArray 是用词不当:为了保持一致性,您应该使用 appendArrayarray_append

这是合并排序函数的修改版本:

void mergeSort(Array *a, size_t lb, size_t ub) {
    if (lb < ub) {
        size_t mid = lb + (ub - lb) / 2;
        mergeSort(a, lb, mid);
        mergeSort(a, mid + 1, ub);
        merge(a, lb, mid, ub);
    }
}

void merge(Array *a, size_t lb, size_t mid, size_t ub) {
    size_t i = lb;
    size_t j = mid + 1;
    size_t k = 0;
    void **temp = malloc(sizeof(void*) * (ub + 1 - lb);
    if (temp == NULL) {
        fprintf(stderr, "out of memory in merge\n");
        exit(EXIT_FAILURE);
    }
    /* select elements from either half according to the comparison function */
    while (i <= mid && j <= ub) {
        if (a->precedes(a->array[i], a->array[j]) <= 0) {
            temp[k++] = a->array[i++];
        } else {
            temp[k++] = a->array[j++];
        }
    }
    /* copy the remaining elements from the left half */
    while (i <= mid) {
        temp[k++] = a->array[i++];
    }
    /* no need to copy the remaining elements from the right half:
       they are already in place */

    /* copy the sorted slice back to the array */
    for (i = 0; i < k; i++) {
        a->array[lb + i] = temp[i];
    }
    free(temp);
}

这是程序的修改版本,使用更少的内存:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "lib.h"

#define BUFFSIZE 1024

typedef struct {
    int id;
    int i_data;   // group int fields together for better structure packing
    char *s_data;
    double d_data;
} Record_t;

// function to get csv file fields
// use strspn and strcspn to avoid modifying the line
// return a pointer to the beginning of the field
char *getfield(const char *line, int num, const char *delimiters) {
    const char *p = line;   // points to the beginning of the field
    while (num --> 0) {
        p += strcspn(p, delimiters); // skip contents
        p += strspn(p, delimiters); // skip delimiters
    }
    return (char *)p;
}

//loading the file
void load_file(Array *a, FILE *f, const char *delimiters) {
    printf("Loading file\n");
    char line[BUFFSIZE];
    while (fgets(line, BUFFSIZE, f)) {
        Record_t *record = malloc(sizeof(Record_t));
        if (record == NULL) {
            fprintf(stderr, "out of memory for record: %s\n", line);
            exit(EXIT_FAILURE);
        }
        record->id = atoi(getfield(line, 0, delimiters));
        char *p = getfield(tmp, 1, parser);
        record->s_data = strndup(p, strcspn(p, delimiters));
        record->i_data = atoi(getfield(tmp, 2, delimiters));
        record->d_data = atof(getfield(tmp, 3, delimiters));
        insertArray(a, record);
    }
    printf("File loaded\n");
}

// writing the file
int writeToFile(Array *a, const char *name) {
    FILE *fptr = fopen(name, "w");
    if (fptr == NULL) {
        fprintf(stderr, "Could not open file %s: %s", name, strerror(errno));
        return -1;
    }
    for (long i = 0; i < a->used; i++) {
        Record_t *rec1_p = (Record_t *)a->array[i];
        fprintf(fptr, "%d,%s,%d,%lf\n", rec1_p->id, rec1_p->s_data, rec1_p->i_data, rec1_p->d_data);
    }
    fclose(fptr);
    return 0;
}

int main(int argc, char *argv[]) {
    Array *a = (Array *)malloc(sizeof(Array));
    if (a == NULL) {
        fprintf(stderr, "out of memory for Array\n");
        exit(EXIT_FAILURE);
    }
    setbuf(stdout, NULL);
    FILE *f = fopen(argv[1], "r");
    if (!f) {
        printf("Missing input file: %s\n", argv[1]);
        exit(EXIT_FAILURE);
    }
    initArray(a, precedes_record_string_field);
    load_file(a, f, ",\n");
    mergeSort(a, 0, a->used - 1);
    writeToFile(a, "../recors_string.csv");
    printf("done\n");

    fclose(f);
    freeArray(a);
    return 0;
}