在决策树中拆分数据集步骤处理二维数组时，是否有办法避免使用“memcpy”？

Question

我正在尝试对 Valgrind 检测到的 binary decision tree. The below code is a working example for data splitting as a part of the whole algorithm although it has some error 进行编码。

您可能已经知道，树是由节点组成的。在每个节点中，它根据分裂条件携带所有符合条件的观测值（行）。继续，拆分也依赖于这些行。

并且可以通过对所有变量和值的贪婪搜索找到最佳拆分条件，这意味着该数据拆分步骤可以执行数百次以找到最佳拆分（在分类问题中通过基尼指数评估）。我认为使用 memcpy 时可能会很昂贵。最糟糕的是我必须在每次拆分尝试后为复制的行释放内存。

所以，我在想有没有办法避免使用memcpy字面复制整个数据集，而是在每次拆分时只引用原始数据集的符合条件的行。这样就不需要过多的memcpy和free了。

问题：具体到下面的例子，如何使用一个指针来记录符合条件的行的地址，这样就不需要memcpy并且允许（简单或方便）访问符合条件的行执行贪婪搜索以获得最佳拆分。

如果您不清楚这个问题，请告诉我。非常感谢您的宝贵时间。

欢迎任何帮助和建议。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void printArray(double array[], unsigned int size) {
    for (unsigned int i = 0; i < size; ++i) {
        printf("%.3f  ", array[i]);
    }
    printf("\n");
}

double **alloc_2dArray(unsigned int row, unsigned int col)
{
    double **Array = (double **) malloc(row * sizeof(double *));

    for (unsigned int i = 0; i < row; i++) {
        Array[i] = (double *) malloc(col * sizeof(double));
    }

    return Array;
}

void dealloc_2dArray(double **Array, unsigned int row, unsigned int col)
{
    for (unsigned int i = 0; i < row; i++)
    {
        free(Array[i]);
    }
    free(Array);
}

int main(){

    double **data = alloc_2dArray(4,4);
    double delta[4] = {1,0,1,0};

    double **dataL = alloc_2dArray(4,4);
    double **dataR = alloc_2dArray(4,4);

    int i,j;

    int k=0;
    //data initialization
    for(i=0; i<4; i++){
        for(j=0; j<4; j++){
            data[i][j] = k++;
        }
    }

    int l=0, r=0;

    for(i=0; i<4; i++){
        if((int)delta[i] ==0){
            memcpy(dataL[l++], data[i], 4*sizeof(double));
        }else{
            memcpy(dataR[r++], data[i], 4*sizeof(double));
        }
    }

    for(i = l; i<4; i++){
        free(dataL[i]);
    }

    printf("l and r are %d %d\n", l, r);

    dataL = realloc(dataL, l*sizeof(double*));

    for(i = r; i<4; i++){
        free(dataR[i]);
    }

    dataR = realloc(dataR, r*sizeof(double*));

    for(i=0; i<4; i++){
        if(dataL[i] != NULL){
            printArray(dataL[i],4);
        }
        if(dataR[i] != NULL){
            printArray(dataR[i],4);
        }
    }

    dealloc_2dArray(data, 4,4);
    dealloc_2dArray(dataL, l, 4);
    dealloc_2dArray(dataR, r, 4);
    return 0;
}

Answer 1

我认为你问的是是否可以只保留分割矩阵中的原始数据行指针。答案是肯定的，你可以。然而，所有权开始发挥作用。

对于下面的示例（没有 memcpy 操作），dataL 和 dataR 指针床都是 direct-allocated，没有实际的数据行分配给它们.这些行是从原始数据矩阵“借来”的。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void printArray(double const array[], unsigned int size)
{
    for (unsigned int i = 0; i < size; ++i)
    {
        printf("%-7.3f", array[i]);
    }
    printf("\n");
}

double **alloc_2dArray(unsigned int row, unsigned int col)
{
    double **Array = malloc(row * sizeof(double *));

    for (unsigned int i = 0; i < row; i++)
    {
        Array[i] = (double *)malloc(col * sizeof(double));
    }

    return Array;
}

void dealloc_2dArray(double **Array, unsigned int row)
{
    for (unsigned int i = 0; i < row; i++)
    {
        free(Array[i]);
    }
    free(Array);
}

int main()
{

    double **data = alloc_2dArray(4,4);
    double delta[4] = {1, 0, 1, 0};

    double **dataL = calloc(4, sizeof *dataL);
    double **dataR = calloc(4, sizeof *dataR);

    unsigned int i, j;

    double k = 0;

    // data initialization
    for (i = 0; i < 4; i++)
    {
        for (j = 0; j < 4; j++)
        {
            data[i][j] = k;
            k += 1;
        }
    }
    printf("data[]\n");
    for (i=0; i<4; ++i)
        printArray(data[i], 4);
    fputc('\n', stdout);

    unsigned int l = 0, r = 0;
    for (i = 0; i < 4; i++)
    {
        if ((int)delta[i] == 0)
        {
            dataL[l++] = data[i];
        }
        else
        {
            dataR[r++] = data[i];
        }
    }

    printf("l=%u r=%u\n", l, r);
    dataL = realloc(dataL, l * sizeof(double *));
    dataR = realloc(dataR, r * sizeof(double *));

    printf("dataL[]\n");
    for (i=0; i<l; ++i)
        printArray(dataL[i], 4);
    fputc('\n', stdout);

    printf("dataR[]\n");
    for (i=0; i<r; ++i)
        printArray(dataR[i], 4);
    fputc('\n', stdout);

    free(dataL);
    free(dataR);
    dealloc_2dArray(data,4);
    return 0;
}

输出

data[]
0.000  1.000  2.000  3.000  
4.000  5.000  6.000  7.000  
8.000  9.000  10.000 11.000 
12.000 13.000 14.000 15.000 

l=2 r=2
dataL[]
4.000  5.000  6.000  7.000  
12.000 13.000 14.000 15.000 

dataR[]
0.000  1.000  2.000  3.000  
8.000  9.000  10.000 11.000

关于那些行指针的实际所有权。只有你知道什么是合适的。如果原始 data 矩阵要继续拥有它们，则必须注意确保 data 及其行在 dataL 和 dataR 仍在使用时不被破坏.同样，如果 dataR 和 dataL 获得所有权，那么在释放 data 时不应该释放这些行（但仍然释放 data 的基指针床），并且让 dataL 和 dataR 释放不仅负责释放它们的基指针床，还负责释放行。

总之，就这样吧。

在决策树中拆分数据集步骤处理二维数组时，是否有办法避免使用“memcpy”？

Is there anyway to avoid use `memcpy` when dealing with 2D Array at splitting dataset step in decision tree?

c

pointers

decision-tree