C中一个文件到另一个文件的转换
One file to another file conversion in C
我有一个名为 datafile.data
的输入文件,如下所示:
1,2,1,1,0
1,3,1,1,0
1,1,2,2,1
2,1,2,2,1
2,3,2,3,1
1,1,2,3,2
3,1,1,4,2
2,1,3,2,2
3,3,3,1,2
2,2,3,4,2
此处前 4 列代表 4 个属性的取值,例如 A1、A2、A3、A4;最后一列代表 class 值。这个示例文件有 4 个属性,其他文件可以有 n 个属性,但在每个文件中,最后一列始终给出 class 值。
现在我想将此文件转换为另一个文件,命名为:outputfile.exp
输出文件的第一行如下所示:
<Number of rows in the .data file> <Number of attributes> <Max value of A1> <Max value of A2> <Max value of A3> <Max value of A4> <(Max value of last column)+1>
输出文件的其余行将与数据文件相同,只有一个变化,即最后一列的每个值将递增1。
例如,上述示例的输出文件如下所示:
10 4 3 3 3 4 3
1,2,1,1,1
1,3,1,1,1
1,1,2,2,2
2,1,2,2,2
2,3,2,3,2
1,1,2,3,3
3,1,1,4,3
2,1,3,2,3
3,3,3,1,3
2,2,3,4,3
其中第1行的10是行数,4是存在的属性数,(3,3,3,4)这4个是属性A1,A2,A3和A4的最大值,最后一个3代表最高class值+1。最后一列的每个值也都增加了 1。
下面附上我的尝试:
#include <stdio.h>
#include <string.h>
#define MAX_FILE_NAME 100
/*
 * Count the newline characters in datafile.data, print that count on its
 * own line, then echo the file to stdout unchanged.
 *
 * Returns 0 in every case, including when the file cannot be opened.
 */
int main()
{
    /* Single definition of the input path so the error message can report
     * it; the previous code printed an uninitialized buffer instead. */
    const char *path = "datafile.data";
    char line[50];
    int count = 0; /* number of '\n' characters seen */
    int ch;        /* int, not char: getc() reports EOF as an out-of-band int */

    FILE *fp = fopen(path, "r");
    if (fp == NULL) {
        printf("Could not open file %s", path);
        return 0;
    }
    while ((ch = getc(fp)) != EOF) {
        if (ch == '\n') {
            count = count + 1;
        }
    }
    fclose(fp);
    printf("%d\n", count);

    /* Second pass: reopen the file and print every line as read. */
    fp = fopen(path, "r");
    if (fp == NULL) {
        printf("Failed to open.");
        return 0;
    }
    while (fgets(line, sizeof line, fp) != NULL) {
        printf("%s", line);
    }
    fclose(fp);
    return 0;
}
我得到以下输出:
10
1,2,1,1,1
1,3,1,1,1
1,1,2,2,2
2,1,2,2,2
2,3,2,3,2
1,1,2,3,3
3,1,1,4,3
2,1,3,2,3
3,3,3,1,3
2,2,3,4,3
现在我无法继续下去,因为我是C的新手,请帮助我。
编辑 1:示例的输出格式为:
10 4 3 3 3 4 3
1 2 1 1 1
1 3 1 1 1
1 1 2 2 2
2 1 2 2 2
2 3 2 3 2
1 1 2 3 3
3 1 1 4 3
2 1 3 2 3
3 3 3 1 3
2 2 3 4 3
您真的不想这样做,因为倒回输入流是一种反模式。但你可以这样做:
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
FILE * xfopen(const char *path, const char *mode);
void * xmalloc(size_t s);
/*
 * Parse one comma-separated line of integers and fold every field into the
 * running per-column maxima.
 *
 * buf           NUL-terminated line holding exactly column_count fields,
 *               separated by ',' and ended by '\n' or by the end of the
 *               string (a final line without a trailing newline).
 * max           array of column_count running maxima, updated in place.
 * column_count  number of fields expected on the line.
 *
 * On an empty field, a value outside the int range or an unexpected
 * separator, a diagnostic is written to stderr and the process exits with
 * status 1.
 */
void
parse_line(const char *buf, int *max, int column_count)
{
	const char *line = buf; /* start of the line, kept for the diagnostic */

	for (int i = 0; i < column_count; i++) {
		char *end;
		long t = strtol(buf, &end, 10);
		int is_last = (i == column_count - 1);
		int sep_ok = is_last ? (*end == '\n' || *end == '\0')
		                     : (*end == ',');

		/* end == buf means strtol consumed no digits at all. */
		if (end == buf || !sep_ok || t < INT_MIN || t > INT_MAX) {
			fprintf(stderr, "invalid input '%c' in %s", *end, line);
			exit(1);
		}
		if (t > max[i]) {
			max[i] = (int)t;
		}
		if (!is_last) {
			buf = end + 1; /* step over the comma */
		}
	}
}
/*
 * Convert a comma-separated table of integers into the ".exp" layout and
 * write it to stdout.
 *
 * Input comes from the file named by argv[1], or from stdin when no
 * argument is given.  The input is read twice (the stream is rewound with
 * fseek), so it must be seekable.
 *
 * Output: a header line "<rows> <columns-1> <max of each column but the
 * last> <max of last column + 1>", followed by every input row with its
 * last field incremented by one.
 *
 * Returns 0 on success and 1 on an input, seek or format error.
 */
int
main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "stdin";
	FILE *in = argc > 1 ? xfopen(path, "r") : stdin;
	char buf[1024];
	int column_count = 1;
	int row_count = 1;
	int *max;

	/* Read first line to determine number of columns */
	if (fgets(buf, sizeof buf, in) == NULL) {
		fputs("Input error\n", stderr);
		return 1;
	}
	for (const char *p = buf; *p; p++) {
		if (*p == ',') {
			column_count += 1;
		}
	}
	max = xmalloc(column_count * sizeof *max);
	for (int i = 0; i < column_count; i++) {
		max[i] = INT_MIN;
	}

	/* First pass: validate every row and collect the column maxima. */
	parse_line(buf, max, column_count);
	while (fgets(buf, sizeof buf, in) != NULL) {
		row_count += 1;
		parse_line(buf, max, column_count);
	}

	if (fseek(in, 0L, SEEK_SET)) {
		perror(path);
		free(max);
		return 1;
	}

	/* Header line. */
	printf("%d %d ", row_count, column_count - 1);
	for (int i = 0; i < column_count - 1; i += 1) {
		printf("%d ", max[i]);
	}
	printf("%d\n", max[column_count - 1] + 1);
	free(max);

	/* Second pass: re-emit each row with the last field incremented. */
	while (fgets(buf, sizeof buf, in) != NULL) {
		char *comma = strrchr(buf, ',');
		if (comma == NULL) {
			fprintf(stderr, "Invalid input\n");
			return 1;
		}
		*comma = '\0'; /* cut the row just before its last field */
		int k = (int)strtol(comma + 1, NULL, 10);
		printf("%s,%d\n", buf, k + 1);
	}
	return 0;
}
/*
 * Open path with fopen(), treating the one-character path "-" as the
 * standard stream that matches mode: stdin when mode starts with 'r',
 * stdout otherwise.
 *
 * Never returns NULL: when the stream cannot be opened, perror(path) is
 * called and the process exits with EXIT_FAILURE.
 */
FILE *
xfopen(const char *path, const char *mode)
{
	FILE *fp;

	if (path[0] == '-' && path[1] == '\0') {
		fp = (*mode == 'r') ? stdin : stdout;
	} else {
		fp = fopen(path, mode);
	}
	if (fp == NULL) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	return fp;
}
/*
 * malloc() wrapper that does not return on failure: when malloc() yields
 * NULL it reports the error with perror("malloc") and exits with
 * EXIT_FAILURE.  Otherwise the pointer from malloc() is returned as is.
 */
void *
xmalloc(size_t s)
{
	void *block = malloc(s);

	if (block != NULL) {
		return block;
	}
	perror("malloc");
	exit(EXIT_FAILURE);
}
您可以用 ./a.out < datafile.data > outputfile.exp
或 ./a.out datafile.data > outputfile.exp
来执行此操作,但如果您尝试从管道读取,它将无法工作(seek 会失败)。
seek 失败以及无法将此程序作为过滤器运行,使这种方法不是最优的,但将整个文件存储在内存中也有缺点。
由于 William Pursell 已经用 C 给出了极好的答案,这里提供一个 awk 替代方案,尽管该问题并没有 awk 标签。
awk -F, -v OFS="," ' # use a comma as both the input and the output field separator
NR==FNR { # first pass: true only while reading the first copy of the input file
for (i = 1; i <= NF; i++) { # loop over the fields
if (max[i] == "" || max[i] < $i) max[i] = $i
# keep the largest value seen so far in column i
}
nr = NR; nf = NF # remember the record count and the field count
next # skip the second-pass blocks below
}
FNR==1 { # second pass, first line: emit the header before any data row is printed
printf("%d %d ", nr, nf - 1) # print #records and #fields - 1
max[nf]++ # the header shows the last-column maximum plus one
for (i = 1; i <= nf; i++) { # print max values, space separated
printf("%d%s", max[i], i==nf ? "\n" : " ");
}
}
{ # second pass: runs for every line of the second copy
$nf++ # increment the value of the last field
print # assigning to a field rebuilds the record with OFS, so this prints CSV
}
' datafile.data datafile.data # name the input file twice so awk reads it twice
下面是修改后的代码,我想先读取 .names 文件,然后检查 .names 的最后一行是否有零,然后我想生成输出。
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
FILE * xfopen(const char *path, const char *mode);
void * xmalloc(size_t s);
/*
 * Parse one comma-separated line of integers and fold every field into the
 * running per-column maxima.
 *
 * buf           NUL-terminated line holding exactly column_count fields,
 *               separated by ',' and ended by '\n' or by the end of the
 *               string (a final line without a trailing newline).
 * max           array of column_count running maxima, updated in place.
 * column_count  number of fields expected on the line.
 *
 * On an empty field, a value outside the int range or an unexpected
 * separator, a diagnostic is written to stderr and the process exits with
 * status 1.
 */
void parse_line(const char *buf, int *max, int column_count)
{
	const char *line = buf; /* start of the line, kept for the diagnostic */

	for (int i = 0; i < column_count; i++) {
		char *end;
		long t = strtol(buf, &end, 10);
		int is_last = (i == column_count - 1);
		int sep_ok = is_last ? (*end == '\n' || *end == '\0')
		                     : (*end == ',');

		/* end == buf means strtol consumed no digits at all. */
		if (end == buf || !sep_ok || t < INT_MIN || t > INT_MAX) {
			fprintf(stderr, "invalid input '%c' in %s", *end, line);
			exit(1);
		}
		if (t > max[i]) {
			max[i] = (int)t;
		}
		if (!is_last) {
			buf = end + 1; /* step over the comma */
		}
	}
}
/*
 * Convert "<argv[1]>.data" (or stdin when no argument is given) from a
 * comma-separated table of integers into the space-separated ".exp" layout
 * on stdout.
 *
 * The input is read twice (the stream is rewound with fseek), so it must
 * be seekable.
 *
 * Output: a header line "<rows> <columns-1> <max of each column but the
 * last> <max of last column + 1>", followed by every input row with commas
 * replaced by spaces and the last field incremented by one.
 *
 * Returns 0 on success and 1 on an input, seek, format or path-length
 * error.
 */
int main(int argc, char **argv)
{
	/* The file name is built in a local buffer; writing "<name>.data" back
	 * into argv[1] would overrun the argument string. */
	char path[1024] = "stdin";
	FILE *in = stdin;
	char buf[1024];
	int column_count = 1;
	int row_count = 1;
	int *max;

	if (argc > 1) {
		int n = snprintf(path, sizeof path, "%s.data", argv[1]);
		if (n < 0 || (size_t)n >= sizeof path) {
			fprintf(stderr, "%s: file name too long\n", argv[1]);
			return 1;
		}
		in = xfopen(path, "r");
	}

	/* Read first line to determine number of columns */
	if (fgets(buf, sizeof buf, in) == NULL) {
		fputs("Input error\n", stderr);
		return 1;
	}
	for (const char *p = buf; *p; p++) {
		if (*p == ',') {
			column_count += 1;
		}
	}
	max = xmalloc(column_count * sizeof *max);
	for (int i = 0; i < column_count; i++) {
		max[i] = INT_MIN;
	}

	/* First pass: validate every row and collect the column maxima. */
	parse_line(buf, max, column_count);
	while (fgets(buf, sizeof buf, in) != NULL) {
		row_count += 1;
		parse_line(buf, max, column_count);
	}

	if (fseek(in, 0L, SEEK_SET)) {
		perror(path);
		free(max);
		return 1;
	}

	/* Header line. */
	printf("%d %d ", row_count, column_count - 1);
	for (int i = 0; i < column_count - 1; i += 1) {
		printf("%d ", max[i]);
	}
	printf("%d\n", max[column_count - 1] + 1);
	free(max);

	/* Second pass: re-emit each row, space separated, last field + 1. */
	while (fgets(buf, sizeof buf, in) != NULL) {
		char *comma = strrchr(buf, ',');
		if (comma == NULL) {
			fprintf(stderr, "Invalid input\n");
			return 1;
		}
		*comma = '\0'; /* cut the row just before its last field */
		int k = (int)strtol(comma + 1, NULL, 10);
		for (char *p = buf; *p; p++) {
			if (*p == ',') {
				*p = ' ';
			}
		}
		printf("%s %d\n", buf, k + 1);
	}
	return 0;
}
/*
 * Open path with fopen(), treating the one-character path "-" as the
 * standard stream that matches mode: stdin when mode starts with 'r',
 * stdout otherwise.
 *
 * Never returns NULL: when the stream cannot be opened, perror(path) is
 * called and the process exits with EXIT_FAILURE.
 */
FILE *
xfopen(const char *path, const char *mode)
{
	FILE *fp;

	if (path[0] == '-' && path[1] == '\0') {
		fp = (*mode == 'r') ? stdin : stdout;
	} else {
		fp = fopen(path, mode);
	}
	if (fp == NULL) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	return fp;
}
/*
 * malloc() wrapper that does not return on failure: when malloc() yields
 * NULL it reports the error with perror("malloc") and exits with
 * EXIT_FAILURE.  Otherwise the pointer from malloc() is returned as is.
 */
void *
xmalloc(size_t s)
{
	void *block = malloc(s);

	if (block != NULL) {
		return block;
	}
	perror("malloc");
	exit(EXIT_FAILURE);
}
我有一个名为 datafile.data
的输入文件,如下所示:
1,2,1,1,0
1,3,1,1,0
1,1,2,2,1
2,1,2,2,1
2,3,2,3,1
1,1,2,3,2
3,1,1,4,2
2,1,3,2,2
3,3,3,1,2
2,2,3,4,2
此处前 4 列代表 4 个属性的取值,例如 A1、A2、A3、A4;最后一列代表 class 值。这个示例文件有 4 个属性,其他文件可以有 n 个属性,但在每个文件中,最后一列始终给出 class 值。
现在我想将此文件转换为另一个文件,命名为:outputfile.exp
输出文件的第一行如下所示:
<Number of rows in the .data file> <Number of attributes> <Max value of A1> <Max value of A2> <Max value of A3> <Max value of A4> <(Max value of last column)+1>
输出文件的其余行将与数据文件相同,只有一个变化,即最后一列的每个值将递增1。
例如,上述示例的输出文件如下所示:
10 4 3 3 3 4 3
1,2,1,1,1
1,3,1,1,1
1,1,2,2,2
2,1,2,2,2
2,3,2,3,2
1,1,2,3,3
3,1,1,4,3
2,1,3,2,3
3,3,3,1,3
2,2,3,4,3
其中第1行的10是行数,4是存在的属性数,(3,3,3,4)这4个是属性A1,A2,A3和A4的最大值,最后一个3代表最高class值+1。最后一列的每个值也都增加了 1。
下面附上我的尝试:
#include <stdio.h>
#include <string.h>
#define MAX_FILE_NAME 100
/*
 * Count the newline characters in datafile.data, print that count on its
 * own line, then echo the file to stdout unchanged.
 *
 * Returns 0 in every case, including when the file cannot be opened.
 */
int main()
{
    /* Single definition of the input path so the error message can report
     * it; the previous code printed an uninitialized buffer instead. */
    const char *path = "datafile.data";
    char line[50];
    int count = 0; /* number of '\n' characters seen */
    int ch;        /* int, not char: getc() reports EOF as an out-of-band int */

    FILE *fp = fopen(path, "r");
    if (fp == NULL) {
        printf("Could not open file %s", path);
        return 0;
    }
    while ((ch = getc(fp)) != EOF) {
        if (ch == '\n') {
            count = count + 1;
        }
    }
    fclose(fp);
    printf("%d\n", count);

    /* Second pass: reopen the file and print every line as read. */
    fp = fopen(path, "r");
    if (fp == NULL) {
        printf("Failed to open.");
        return 0;
    }
    while (fgets(line, sizeof line, fp) != NULL) {
        printf("%s", line);
    }
    fclose(fp);
    return 0;
}
我得到以下输出:
10
1,2,1,1,1
1,3,1,1,1
1,1,2,2,2
2,1,2,2,2
2,3,2,3,2
1,1,2,3,3
3,1,1,4,3
2,1,3,2,3
3,3,3,1,3
2,2,3,4,3
现在我无法继续下去,因为我是C的新手,请帮助我。
编辑 1:示例的输出格式为:
10 4 3 3 3 4 3
1 2 1 1 1
1 3 1 1 1
1 1 2 2 2
2 1 2 2 2
2 3 2 3 2
1 1 2 3 3
3 1 1 4 3
2 1 3 2 3
3 3 3 1 3
2 2 3 4 3
您真的不想这样做,因为倒回输入流是一种反模式。但你可以这样做:
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
FILE * xfopen(const char *path, const char *mode);
void * xmalloc(size_t s);
/*
 * Parse one comma-separated line of integers and fold every field into the
 * running per-column maxima.
 *
 * buf           NUL-terminated line holding exactly column_count fields,
 *               separated by ',' and ended by '\n' or by the end of the
 *               string (a final line without a trailing newline).
 * max           array of column_count running maxima, updated in place.
 * column_count  number of fields expected on the line.
 *
 * On an empty field, a value outside the int range or an unexpected
 * separator, a diagnostic is written to stderr and the process exits with
 * status 1.
 */
void
parse_line(const char *buf, int *max, int column_count)
{
	const char *line = buf; /* start of the line, kept for the diagnostic */

	for (int i = 0; i < column_count; i++) {
		char *end;
		long t = strtol(buf, &end, 10);
		int is_last = (i == column_count - 1);
		int sep_ok = is_last ? (*end == '\n' || *end == '\0')
		                     : (*end == ',');

		/* end == buf means strtol consumed no digits at all. */
		if (end == buf || !sep_ok || t < INT_MIN || t > INT_MAX) {
			fprintf(stderr, "invalid input '%c' in %s", *end, line);
			exit(1);
		}
		if (t > max[i]) {
			max[i] = (int)t;
		}
		if (!is_last) {
			buf = end + 1; /* step over the comma */
		}
	}
}
/*
 * Convert a comma-separated table of integers into the ".exp" layout and
 * write it to stdout.
 *
 * Input comes from the file named by argv[1], or from stdin when no
 * argument is given.  The input is read twice (the stream is rewound with
 * fseek), so it must be seekable.
 *
 * Output: a header line "<rows> <columns-1> <max of each column but the
 * last> <max of last column + 1>", followed by every input row with its
 * last field incremented by one.
 *
 * Returns 0 on success and 1 on an input, seek or format error.
 */
int
main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "stdin";
	FILE *in = argc > 1 ? xfopen(path, "r") : stdin;
	char buf[1024];
	int column_count = 1;
	int row_count = 1;
	int *max;

	/* Read first line to determine number of columns */
	if (fgets(buf, sizeof buf, in) == NULL) {
		fputs("Input error\n", stderr);
		return 1;
	}
	for (const char *p = buf; *p; p++) {
		if (*p == ',') {
			column_count += 1;
		}
	}
	max = xmalloc(column_count * sizeof *max);
	for (int i = 0; i < column_count; i++) {
		max[i] = INT_MIN;
	}

	/* First pass: validate every row and collect the column maxima. */
	parse_line(buf, max, column_count);
	while (fgets(buf, sizeof buf, in) != NULL) {
		row_count += 1;
		parse_line(buf, max, column_count);
	}

	if (fseek(in, 0L, SEEK_SET)) {
		perror(path);
		free(max);
		return 1;
	}

	/* Header line. */
	printf("%d %d ", row_count, column_count - 1);
	for (int i = 0; i < column_count - 1; i += 1) {
		printf("%d ", max[i]);
	}
	printf("%d\n", max[column_count - 1] + 1);
	free(max);

	/* Second pass: re-emit each row with the last field incremented. */
	while (fgets(buf, sizeof buf, in) != NULL) {
		char *comma = strrchr(buf, ',');
		if (comma == NULL) {
			fprintf(stderr, "Invalid input\n");
			return 1;
		}
		*comma = '\0'; /* cut the row just before its last field */
		int k = (int)strtol(comma + 1, NULL, 10);
		printf("%s,%d\n", buf, k + 1);
	}
	return 0;
}
/*
 * Open path with fopen(), treating the one-character path "-" as the
 * standard stream that matches mode: stdin when mode starts with 'r',
 * stdout otherwise.
 *
 * Never returns NULL: when the stream cannot be opened, perror(path) is
 * called and the process exits with EXIT_FAILURE.
 */
FILE *
xfopen(const char *path, const char *mode)
{
	FILE *fp;

	if (path[0] == '-' && path[1] == '\0') {
		fp = (*mode == 'r') ? stdin : stdout;
	} else {
		fp = fopen(path, mode);
	}
	if (fp == NULL) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	return fp;
}
/*
 * malloc() wrapper that does not return on failure: when malloc() yields
 * NULL it reports the error with perror("malloc") and exits with
 * EXIT_FAILURE.  Otherwise the pointer from malloc() is returned as is.
 */
void *
xmalloc(size_t s)
{
	void *block = malloc(s);

	if (block != NULL) {
		return block;
	}
	perror("malloc");
	exit(EXIT_FAILURE);
}
您可以用 ./a.out < datafile.data > outputfile.exp
或 ./a.out datafile.data > outputfile.exp
来执行此操作,但如果您尝试从管道读取,它将无法工作(seek 会失败)。
seek 失败以及无法将此程序作为过滤器运行,使这种方法不是最优的,但将整个文件存储在内存中也有缺点。
由于 William Pursell 已经用 C 给出了极好的答案,这里提供一个 awk 替代方案,尽管该问题并没有 awk 标签。
awk -F, -v OFS="," ' # use a comma as both the input and the output field separator
NR==FNR { # first pass: true only while reading the first copy of the input file
for (i = 1; i <= NF; i++) { # loop over the fields
if (max[i] == "" || max[i] < $i) max[i] = $i
# keep the largest value seen so far in column i
}
nr = NR; nf = NF # remember the record count and the field count
next # skip the second-pass blocks below
}
FNR==1 { # second pass, first line: emit the header before any data row is printed
printf("%d %d ", nr, nf - 1) # print #records and #fields - 1
max[nf]++ # the header shows the last-column maximum plus one
for (i = 1; i <= nf; i++) { # print max values, space separated
printf("%d%s", max[i], i==nf ? "\n" : " ");
}
}
{ # second pass: runs for every line of the second copy
$nf++ # increment the value of the last field
print # assigning to a field rebuilds the record with OFS, so this prints CSV
}
' datafile.data datafile.data # name the input file twice so awk reads it twice
下面是修改后的代码,我想先读取 .names 文件,然后检查 .names 的最后一行是否有零,然后我想生成输出。
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
FILE * xfopen(const char *path, const char *mode);
void * xmalloc(size_t s);
/*
 * Parse one comma-separated line of integers and fold every field into the
 * running per-column maxima.
 *
 * buf           NUL-terminated line holding exactly column_count fields,
 *               separated by ',' and ended by '\n' or by the end of the
 *               string (a final line without a trailing newline).
 * max           array of column_count running maxima, updated in place.
 * column_count  number of fields expected on the line.
 *
 * On an empty field, a value outside the int range or an unexpected
 * separator, a diagnostic is written to stderr and the process exits with
 * status 1.
 */
void parse_line(const char *buf, int *max, int column_count)
{
	const char *line = buf; /* start of the line, kept for the diagnostic */

	for (int i = 0; i < column_count; i++) {
		char *end;
		long t = strtol(buf, &end, 10);
		int is_last = (i == column_count - 1);
		int sep_ok = is_last ? (*end == '\n' || *end == '\0')
		                     : (*end == ',');

		/* end == buf means strtol consumed no digits at all. */
		if (end == buf || !sep_ok || t < INT_MIN || t > INT_MAX) {
			fprintf(stderr, "invalid input '%c' in %s", *end, line);
			exit(1);
		}
		if (t > max[i]) {
			max[i] = (int)t;
		}
		if (!is_last) {
			buf = end + 1; /* step over the comma */
		}
	}
}
/*
 * Convert "<argv[1]>.data" (or stdin when no argument is given) from a
 * comma-separated table of integers into the space-separated ".exp" layout
 * on stdout.
 *
 * The input is read twice (the stream is rewound with fseek), so it must
 * be seekable.
 *
 * Output: a header line "<rows> <columns-1> <max of each column but the
 * last> <max of last column + 1>", followed by every input row with commas
 * replaced by spaces and the last field incremented by one.
 *
 * Returns 0 on success and 1 on an input, seek, format or path-length
 * error.
 */
int main(int argc, char **argv)
{
	/* The file name is built in a local buffer; writing "<name>.data" back
	 * into argv[1] would overrun the argument string. */
	char path[1024] = "stdin";
	FILE *in = stdin;
	char buf[1024];
	int column_count = 1;
	int row_count = 1;
	int *max;

	if (argc > 1) {
		int n = snprintf(path, sizeof path, "%s.data", argv[1]);
		if (n < 0 || (size_t)n >= sizeof path) {
			fprintf(stderr, "%s: file name too long\n", argv[1]);
			return 1;
		}
		in = xfopen(path, "r");
	}

	/* Read first line to determine number of columns */
	if (fgets(buf, sizeof buf, in) == NULL) {
		fputs("Input error\n", stderr);
		return 1;
	}
	for (const char *p = buf; *p; p++) {
		if (*p == ',') {
			column_count += 1;
		}
	}
	max = xmalloc(column_count * sizeof *max);
	for (int i = 0; i < column_count; i++) {
		max[i] = INT_MIN;
	}

	/* First pass: validate every row and collect the column maxima. */
	parse_line(buf, max, column_count);
	while (fgets(buf, sizeof buf, in) != NULL) {
		row_count += 1;
		parse_line(buf, max, column_count);
	}

	if (fseek(in, 0L, SEEK_SET)) {
		perror(path);
		free(max);
		return 1;
	}

	/* Header line. */
	printf("%d %d ", row_count, column_count - 1);
	for (int i = 0; i < column_count - 1; i += 1) {
		printf("%d ", max[i]);
	}
	printf("%d\n", max[column_count - 1] + 1);
	free(max);

	/* Second pass: re-emit each row, space separated, last field + 1. */
	while (fgets(buf, sizeof buf, in) != NULL) {
		char *comma = strrchr(buf, ',');
		if (comma == NULL) {
			fprintf(stderr, "Invalid input\n");
			return 1;
		}
		*comma = '\0'; /* cut the row just before its last field */
		int k = (int)strtol(comma + 1, NULL, 10);
		for (char *p = buf; *p; p++) {
			if (*p == ',') {
				*p = ' ';
			}
		}
		printf("%s %d\n", buf, k + 1);
	}
	return 0;
}
/*
 * Open path with fopen(), treating the one-character path "-" as the
 * standard stream that matches mode: stdin when mode starts with 'r',
 * stdout otherwise.
 *
 * Never returns NULL: when the stream cannot be opened, perror(path) is
 * called and the process exits with EXIT_FAILURE.
 */
FILE *
xfopen(const char *path, const char *mode)
{
	FILE *fp;

	if (path[0] == '-' && path[1] == '\0') {
		fp = (*mode == 'r') ? stdin : stdout;
	} else {
		fp = fopen(path, mode);
	}
	if (fp == NULL) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	return fp;
}
/*
 * malloc() wrapper that does not return on failure: when malloc() yields
 * NULL it reports the error with perror("malloc") and exits with
 * EXIT_FAILURE.  Otherwise the pointer from malloc() is returned as is.
 */
void *
xmalloc(size_t s)
{
	void *block = malloc(s);

	if (block != NULL) {
		return block;
	}
	perror("malloc");
	exit(EXIT_FAILURE);
}