C中一个文件到另一个文件的转换

One file to another file conversion in C

我有一个名为 datafile.data 的输入文件,如下所示:

1,2,1,1,0
1,3,1,1,0
1,1,2,2,1
2,1,2,2,1
2,3,2,3,1
1,1,2,3,2
3,1,1,4,2
2,1,3,2,2
3,3,3,1,2
2,2,3,4,2

此处前 4 列代表 4 个属性值,例如 A1、A2、A3、A4。最后一列代表 class 值。这个示例文件有 4 个属性,而其他文件可以有 'n' 个属性,但每个文件的最后一列始终给出 class 值。

现在我想将此文件转换为另一个文件,命名为:outputfile.exp

输出文件的第一行如下所示:

<Number of rows in the .data file> <Number of attributes> <Max value of A1> <Max value of A2> <Max value of A3> <Max value of A4> <(Max value of last column)+1>

输出文件的其余行将与数据文件相同,只有一个变化,即最后一列的每个值将递增1。

例如,上述示例的输出文件如下所示:

10 4 3 3 3 4 3
1,2,1,1,1
1,3,1,1,1
1,1,2,2,2
2,1,2,2,2
2,3,2,3,2
1,1,2,3,3
3,1,1,4,3
2,1,3,2,3
3,3,3,1,3
2,2,3,4,3

其中第1行的10是行数,4是存在的属性数,(3,3,3,4)这4个是属性A1,A2,A3和A4的最大值,最后一个3代表最高class值+1。最后一列的每个值也都增加了 1。

下面附上我的尝试:

#include <stdio.h>
#include <string.h>
#define MAX_FILE_NAME 100
  
/*
 * Print the number of lines in datafile.data, then echo the file to stdout.
 *
 * Returns 0 on success and 1 when the file cannot be opened.
 */
int main()
{
    const char *path = "datafile.data";
    char dataToBeRead[50];
    int count = 0;    // Line counter (result)
    int c;            // int rather than char: getc() returns EOF outside the char range
    int prev = '\n';  // Last character read; starts as '\n' so an empty file has 0 lines

    // Open the file
    FILE *fp = fopen(path, "r");

    // Check if file exists
    if (fp == NULL)
    {
        // Report the name that was actually passed to fopen()
        printf("Could not open file %s\n", path);
        return 1;
    }

    // First pass: count newline characters
    while ((c = getc(fp)) != EOF)
    {
        if (c == '\n')
        {
            count = count + 1;
        }
        prev = c;
    }

    // A final line that lacks a trailing newline is still a line
    if (prev != '\n')
    {
        count = count + 1;
    }

    printf("%d\n", count);

    // Second pass: go back to the start and echo every line
    rewind(fp);
    while (fgets(dataToBeRead, sizeof dataToBeRead, fp) != NULL)
    {
        printf("%s", dataToBeRead);
    }
    fclose(fp);

    return 0;
}

我得到以下输出:

10
1,2,1,1,1
1,3,1,1,1
1,1,2,2,2
2,1,2,2,2
2,3,2,3,2
1,1,2,3,3
3,1,1,4,3
2,1,3,2,3
3,3,3,1,3
2,2,3,4,3

现在我无法继续下去,因为我是C的新手,请帮助我。

编辑 1:示例的输出格式为:

10 4 3 3 3 4 3
1 2 1 1 1
1 3 1 1 1
1 1 2 2 2
2 1 2 2 2
2 3 2 3 2
1 1 2 3 3
3 1 1 4 3
2 1 3 2 3
3 3 3 1 3
2 2 3 4 3

其实并不推荐这样做,因为回绕(rewind)输入流是一种反模式。不过可以这样实现:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

FILE * xfopen(const char *path, const char *mode);
void * xmalloc(size_t s);

/*
 * Parse one comma-separated line of integers and update per-column maxima.
 *
 * buf          - NUL-terminated line holding column_count integers separated
 *                by ','. The last field may be followed by '\n', "\r\n", or
 *                the end of the string (a final line without a newline).
 * max          - array of column_count running maxima, updated in place.
 * column_count - number of fields expected on the line.
 *
 * On malformed input (missing number, value outside the int range, or an
 * unexpected separator) a diagnostic is written to stderr and the process
 * exits with status 1.
 */
void
parse_line(const char *buf, int *max, int column_count)
{
    const char *line = buf;  /* start of the line, kept for diagnostics */

    for(int i = 0; i < column_count; i++ ){
        char *end;
        long t = strtol(buf, &end, 10);
        int last = (i == column_count - 1);
        /* end == buf means strtol consumed no digits (empty field) */
        int ok = end != buf && t >= INT_MIN && t <= INT_MAX;

        if( ok && !last ){
            ok = (*end == ',');
        } else if( ok ){
            ok = (*end == '\n' || *end == '\0'
                || (*end == '\r' && (end[1] == '\n' || end[1] == '\0')));
        }
        if( !ok ){
            fprintf(stderr, "invalid input '%c' in %s", *end, line);
            exit(1);
        }
        if( t > max[i] ){
            max[i] = (int)t;
        }
        if( !last ){
            buf = end + 1;  /* step over the comma */
        }
    }
}


/*
 * Convert a comma-separated data file to the .exp format on stdout.
 *
 * Usage: prog [path]   (reads stdin when no path is given)
 *
 * Pass 1 counts the rows and columns and records every column's maximum,
 * then prints the header line
 *   <rows> <attributes> <max of each attribute...> <max of last column + 1>
 * Pass 2 seeks back to the start of the input and re-emits every row with
 * its last column incremented by one. Because of the seek, the input must
 * be seekable; reading from a pipe fails with an error.
 *
 * Returns 0 on success, 1 on input errors.
 */
int
main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "stdin";
    FILE *in = argc > 1 ? xfopen(path, "r") : stdin;
    char buf[1024];
    int column_count = 1;
    int row_count = 1;
    int *max;

    /* Read first line to determine number of columns */
    if( fgets(buf, sizeof buf, in) == NULL ){
        fputs("Input error\n", stderr);
        return 1;
    }

    for( const char *p = buf; *p; p++ ){
        if( *p == ',' ){
            column_count += 1;
        }
    }
    max = xmalloc((size_t)column_count * sizeof *max);
    for( int i = 0; i < column_count; i++ ){
        max[i] = INT_MIN;
    }

    /* Pass 1: the first line is already in buf */
    parse_line(buf, max, column_count);
    while( fgets(buf, sizeof buf, in) != NULL ){
        row_count += 1;
        parse_line(buf, max, column_count);
    }
    if( fseek(in, 0L, SEEK_SET) ){
        perror(path);
        free(max);
        return 1;
    }

    /* Header: the last column reports its maximum plus one */
    printf("%d %d ", row_count, column_count - 1);
    for( int i = 0; i < column_count - 1; i += 1 ){
        printf("%d ", max[i]);
    }
    printf("%d\n", max[column_count - 1] + 1);
    free(max);

    /* Pass 2: copy each row, incrementing the value after the last comma */
    while( fgets(buf, sizeof buf, in) != NULL ){
        char *comma = strrchr(buf, ',');
        if( comma == NULL ){
            fprintf(stderr, "Invalid input\n");
            return 1;
        }
        *comma = '\0';  /* cut the line just before the last field */
        int k = strtol(comma + 1, NULL, 10);
        printf("%s,%d\n", buf, k + 1);
    }
    return 0;
}

/*
 * Open path with the given fopen() mode, exiting on failure.
 *
 * The path "-" selects a standard stream instead of a file: stdin when
 * mode starts with 'r', stdout otherwise.
 *
 * On failure the error is reported with perror() and the process exits
 * with EXIT_FAILURE, so the returned stream is never NULL.
 */
FILE *
xfopen(const char *path, const char *mode)
{
    FILE *fp;

    if( strcmp(path, "-") == 0 ){
        fp = *mode == 'r' ? stdin : stdout;
    } else {
        fp = fopen(path, mode);
    }
    if( fp == NULL ){
        perror(path);
        exit(EXIT_FAILURE);
    }
    return fp;
}

/*
 * malloc() wrapper that does not return on failure.
 *
 * Returns the pointer obtained from malloc(s). When malloc() yields NULL,
 * the error is reported with perror("malloc") and the process exits with
 * EXIT_FAILURE.
 */
void *
xmalloc(size_t s)
{
    void *block = malloc(s);

    if( block != NULL ){
        return block;
    }
    perror("malloc");
    exit(EXIT_FAILURE);
}

您可以通过 ./a.out < datafile.data > outputfile.exp 或 ./a.out datafile.data > outputfile.exp 来执行此操作,但如果尝试从管道读取则行不通(seek 会失败)。seek 失败以及无法将该程序作为过滤器运行,使这种方法并非最优,不过将整个文件存储在内存中同样有缺点。

由于 William Pursell 已经给出了出色的 C 语言答案,这里提供一个 awk 的替代方案,尽管问题并没有打上 awk 标签。

# Two-pass conversion: the input file is named twice so awk reads it twice
awk -F, -v OFS="," '                            # assign input/output field separator to a comma
    NR==FNR {                                   # this block is invoked for the 1st read of the input file
        for (i = 1; i <= NF; i++) {             # loop over the fields
            if (max[i] == "" || max[i] < $i) max[i] = $i
                                                # update the max values
        }
        nr = NR; nf = NF                        # store #records and #fields
        next                                    # skip following statements
    }
    FNR==1 {                                    # this block is invoked on the 1st line of the 2nd read of the input file
        printf("%d %d ", nr, nf - 1)            # print #records and #fields - 1
        max[nf]++                               # increment the max value of the last field
        for (i = 1; i <= nf; i++) {             # print max values
            printf("%d%s", max[i], i==nf ? "\n" : " ");
        }
    }
    {                                           # this block is invoked for the 2nd read
        $nf++                                   # increment the value of the last field
        print                                   # print fields as csv
    }
' datafile.data datafile.data                   # read the input file twice

下面是修改后的代码,我想先读取 .names 文件,然后检查 .names 的最后一行是否有零,然后我想生成输出。

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

FILE * xfopen(const char *path, const char *mode);
void * xmalloc(size_t s);

/*
 * Parse one comma-separated line of integers and update per-column maxima.
 *
 * buf          - NUL-terminated line holding column_count integers separated
 *                by ','. The last field may be followed by '\n', "\r\n", or
 *                the end of the string (a final line without a newline).
 * max          - array of column_count running maxima, updated in place.
 * column_count - number of fields expected on the line.
 *
 * On malformed input (missing number, value outside the int range, or an
 * unexpected separator) a diagnostic is written to stderr and the process
 * exits with status 1.
 */
void parse_line(const char *buf, int *max, int column_count)
{
    const char *line = buf;  /* start of the line, kept for diagnostics */

    for(int i = 0; i < column_count; i++ ){
        char *end;
        long t = strtol(buf, &end, 10);
        int last = (i == column_count - 1);
        /* end == buf means strtol consumed no digits (empty field) */
        int ok = end != buf && t >= INT_MIN && t <= INT_MAX;

        if( ok && !last ){
            ok = (*end == ',');
        } else if( ok ){
            ok = (*end == '\n' || *end == '\0'
                || (*end == '\r' && (end[1] == '\n' || end[1] == '\0')));
        }

        if( !ok ){
            fprintf(stderr, "invalid input '%c' in %s", *end, line);
            exit(1);
        }

        if( t > max[i] ){
            max[i] = (int)t;
        }

        if( !last ){
            buf = end + 1;  /* step over the comma */
        }
    }
}

/*
 * Convert "<argv[1]>.data" (or stdin when no argument is given) to the
 * space-separated .exp format on stdout.
 *
 * Pass 1 counts the rows and columns and records every column's maximum,
 * then prints the header line
 *   <rows> <attributes> <max of each attribute...> <max of last column + 1>
 * Pass 2 seeks back to the start of the input and re-emits every row with
 * commas replaced by spaces and the last column incremented by one.
 * Because of the seek, the input must be seekable; a pipe fails.
 *
 * Returns 0 on success, 1 on input errors.
 */
int main(int argc, char **argv)
{
    /* The file name is built in a local buffer: argv[1] and string
     * literals must not be written past their end. */
    char data_path[FILENAME_MAX];
    const char *path = "stdin";
    FILE *in = stdin;

    if( argc > 1 ){
        int n = snprintf(data_path, sizeof data_path, "%s.data", argv[1]);
        if( n < 0 || (size_t)n >= sizeof data_path ){
            fprintf(stderr, "%s: file name too long\n", argv[1]);
            return 1;
        }
        path = data_path;
        in = xfopen(path, "r");
    }

    char buf[1024];
    int column_count = 1;
    int row_count = 1;
    int *max;

    /* Read first line to determine number of columns */
    if( fgets(buf, sizeof buf, in) == NULL ){
        fputs("Input error\n", stderr);
        return 1;
    }

    for( const char *p = buf; *p; p++ ){
        if( *p == ',' ){
            column_count += 1;
        }
    }

    max = xmalloc((size_t)column_count * sizeof *max);

    for( int i = 0; i < column_count; i++ ){
        max[i] = INT_MIN;
    }

    /* Pass 1: the first line is already in buf */
    parse_line(buf, max, column_count);
    while( fgets(buf, sizeof buf, in) != NULL ){
        row_count += 1;
        parse_line(buf, max, column_count);
    }

    if( fseek(in, 0L, SEEK_SET) ){
        perror(path);
        free(max);
        return 1;
    }

    /* Header: the last column reports its maximum plus one */
    printf("%d %d ", row_count, column_count - 1);

    for( int i = 0; i < column_count - 1; i += 1 ){
        printf("%d ", max[i]);
    }

    printf("%d\n", max[column_count - 1] + 1);
    free(max);

    /* Pass 2: copy each row, incrementing the value after the last comma */
    while( fgets(buf, sizeof buf, in) != NULL ){
        char *comma = strrchr(buf, ',');
        if( comma == NULL ){
            fprintf(stderr, "Invalid input\n");
            return 1;
        }

        *comma = '\0';  /* cut the line just before the last field */
        int k = strtol(comma + 1, NULL, 10);
        for(char *p = buf; *p;  p++){
            if( *p == ',' ) *p = ' ';
        }
        printf("%s %d\n", buf, k + 1);
    }
    return 0;
}

/*
 * Open path with the given fopen() mode, exiting on failure.
 *
 * The path "-" selects a standard stream instead of a file: stdin when
 * mode starts with 'r', stdout otherwise.
 *
 * On failure the error is reported with perror() and the process exits
 * with EXIT_FAILURE, so the returned stream is never NULL.
 */
FILE *
xfopen(const char *path, const char *mode)
{
    int is_dash = (path[0] == '-' && path[1] == '\0');
    FILE *fp = !is_dash ? fopen(path, mode) :
        *mode == 'r' ? stdin : stdout;
    if( fp == NULL ){
        perror(path);
        exit(EXIT_FAILURE);
    }

    return fp;
}

/*
 * Allocate s bytes with malloc(), terminating the program on failure.
 *
 * Returns the pointer obtained from malloc(s). When malloc() yields NULL,
 * perror("malloc") is called and the process exits with EXIT_FAILURE.
 */
void *
xmalloc(size_t s)
{
    void *mem = malloc(s);

    if( mem != NULL ){
        return mem;
    }
    perror("malloc");
    exit(EXIT_FAILURE);
}