从C中的txt文件中提取逗号分隔的字符串

extract comma separated strings from txt file in C

我需要从一个文件中读取以逗号分隔的不同字符串并将它们存储到一个数组中。

我有以下代码,是我在阅读不同的在线问题时开发的。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Reads "pathtofile.txt" chunk by chunk with fgets, splits each chunk on
 * commas with strtok, copies every token into Nseq and prints it together
 * with its running index.
 */
int main (){
/* Capacity of the token table: N rows of L bytes each (a variable-length array). */
int N = 300; 
int L = 1000;
char Nseq[N][L];

FILE *myfile;
char *token;
/* The only delimiter is ','; a '\n' read by fgets stays inside the token. */
const char s[2] = ",";
/*
 * fgets stores at most sizeof(line) - 1 = 299 characters per call, so a
 * longer input line is delivered in several chunks and a token that
 * straddles a chunk boundary is seen as two separate tokens.
 */
char line[300];
char* filename = "pathtofile.txt";
/* Index of the next free row in Nseq; it keeps counting across chunks. */
int n = 0;

myfile = fopen(filename, "r");
/* NOTE(review): exit(0) reports success to the caller even though the open failed. */
if (myfile == NULL) {printf("could not open file %s", filename); exit(0);}
while (fgets(line, sizeof(line), myfile) != NULL){
  /* strtok writes '\0' over each delimiter in line and returns the pieces in order. */
  token = strtok(line, s);
  while (token != NULL){
    /* NOTE(review): neither n (against N) nor the token length (against L) is checked. */
    strcpy(Nseq[n], token);
    /* NOTE(review): n is an int but is printed with %u. */
    printf("%s\t%u\n", token, n);
    token = strtok(NULL, s);
    n++;
  }
}
fclose(myfile);
}

我的 txt 文件如下:

1AAAAAAAAAAAAAAAAAAAAAAAAAAAA,2AAAAAAAAAAAAAAAAAAAAAAAAAAAA,3AAAAAAAAAAAAAAAAAAAAAAAAAAAA,4AAAAAAAAAAAAAAAAAAAAAAAAAAAA,5AAAAAAAAAAAAAAAAAAAAAAAAAAAA,6AAAAAAAAAAAAAAAAAAAAAAAAAAAA,7AAAAAAAAAAAAAAAAAAAAAAAAAAAA,8AAAAAAAAAAAAAAAAAAAAAAAAAAAA,9AAAAAAAAAAAAAAAAAAAAAAAAAAAA,10AAAAAAAAAAAAAAAAAAAAAAAAAAAA,11AAAAAAAAAAAAAAAAAAAAAAAAAAAA,12AAAAAAAAAAAAAAAAAAAAAAAAAAAA,13AAAAAAAAAAAAAAAAAAAAAAAAAAAA,14AAAAAAAAAAAAAAAAAAAAAAAAAAAA,15AAAAAAAAAAAAAAAAAAAAAAAAAAAA,16AAAAAAAAAAAAAAAAAAAAAAAAAAAA,17AAAAAAAAAAAAAAAAAAAAAAAAAAAA,18AAAAAAAAAAAAAAAAAAAAAAAAAAAA,19AAAAAAAAAAAAAAAAAAAAAAAAAAAA,20AAAAAAAAAAAAAAAAAAAAAAAAAAAA,21AAAAAAAAAAAAAAAAAAAAAAAAAAAA,22AAAAAAAAAAAAAAAAAAAAAAAAAAAA,23AAAAAAAAAAAAAAAAAAAAAAAAAAAA,24AAAAAAAAAAAAAAAAAAAAAAAAAAAA,25AAAAAAAAAAAAAAAAAAAAAAAAAAAA,26AAAAAAAAAAAAAAAAAAAAAAAAAAAA,27AAAAAAAAAAAAAAAAAAAAAAAAAAAA,28AAAAAAAAAAAAAAAAAAAAAAAAAAAA,29AAAAAAAAAAAAAAAAAAAAAAAAAAAA,30AAAAAAAAAAAAAAAAAAAAAAAAAAAA,

有 30 个字符串,没有换行符。

我的问题是,当我运行代码时,我得到以下输出:

1AAAAAAAAAAAAAAAAAAAAAAAAAAAA   0
2AAAAAAAAAAAAAAAAAAAAAAAAAAAA   1
3AAAAAAAAAAAAAAAAAAAAAAAAAAAA   2
4AAAAAAAAAAAAAAAAAAAAAAAAAAAA   3
5AAAAAAAAAAAAAAAAAAAAAAAAAAAA   4
6AAAAAAAAAAAAAAAAAAAAAAAAAAAA   5
7AAAAAAAAAAAAAAAAAAAAAAAAAAAA   6
8AAAAAAAAAAAAAAAAAAAAAAAAAAAA   7
9AAAAAAAAAAAAAAAAAAAAAAAAAAAA   8
10AAAAAAAAAAAAAAAAAAAAAAAAAAA   9
A       10
11AAAAAAAAAAAAAAAAAAAAAAAAAAAA  11
12AAAAAAAAAAAAAAAAAAAAAAAAAAAA  12
13AAAAAAAAAAAAAAAAAAAAAAAAAAAA  13
14AAAAAAAAAAAAAAAAAAAAAAAAAAAA  14
15AAAAAAAAAAAAAAAAAAAAAAAAAAAA  15
16AAAAAAAAAAAAAAAAAAAAAAAAAAAA  16
17AAAAAAAAAAAAAAAAAAAAAAAAAAAA  17
18AAAAAAAAAAAAAAAAAAAAAAAAAAAA  18
19AAAAAAAAAAAAAAAAAAAAAAAAAAAA  19
20AAAAAAAAAAAAAAAA      20
AAAAAAAAAAAA    21
21AAAAAAAAAAAAAAAAAAAAAAAAAAAA  22
22AAAAAAAAAAAAAAAAAAAAAAAAAAAA  23
23AAAAAAAAAAAAAAAAAAAAAAAAAAAA  24
24AAAAAAAAAAAAAAAAAAAAAAAAAAAA  25
25AAAAAAAAAAAAAAAAAAAAAAAAAAAA  26
26AAAAAAAAAAAAAAAAAAAAAAAAAAAA  27
27AAAAAAAAAAAAAAAAAAAAAAAAAAAA  28
28AAAAAAAAAAAAAAAAAAAAAAAAAAAA  29
29AAAAAAAAAAAAAAAAAAAAAAAAAAAA  30
30AAAAA 31
AAAAAAAAAAAAAAAAAAAAAAA 32

        33

我试过不同的长度,迟早会出现这些奇怪的分裂。

有人知道为什么会这样吗?谢谢!

您的文本文件大小为 921 个字符,并且只有一行。

您的 line 缓冲区 只有 300 个字符。

所以,你被截断了。

另外,请注意您的文件中没有换行符。而且,您的代码没有处理出现换行符的情况(特别是当一行以 ,<newline> 结尾时)。

简单的解决方案是增加 line 的大小,使其大于文件的大小(例如 char line[10000];)。

更长远的解决方案是使用(例如)fgetc 逐个字符地读取文件:把字符依次复制到 Nseq[n] 中,每遇到一个分隔符就结束当前标记,并将其存储/打印出来。

或者,您可以 stat 文件,并使用 malloc 分配文件大小的缓冲区。

不过,虽然稍微高级一些,最快的方法(特别是对于大文件)是先 stat 文件,再用 mmap 映射它,然后扫描映射得到的缓冲区。这在任何 64 位机器上都能很好地运行;在 32 位机器上,您可以分段映射文件。


这是一个使用fgetc的版本:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Read "pathtofile.txt" one character at a time, split it on ',' and '\n',
 * store every non-empty token in Nseq and print it followed by its index.
 *
 * Returns EXIT_SUCCESS on success, or EXIT_FAILURE if the file cannot be
 * opened, a read error occurs, or the input does not fit in Nseq.
 */
int
main(void)
{
    /* Capacity: at most N tokens of at most L - 1 characters each. */
    enum { N = 300, L = 1000 };
    static char Nseq[N][L];     /* static storage keeps ~300 KB off the stack */

    const char *filename = "pathtofile.txt";
    FILE *myfile;
    int status = EXIT_SUCCESS;
    int chr;
    int len = 0;                /* characters collected for the current token */
    int n = 0;                  /* number of completed tokens */

    myfile = fopen(filename, "r");
    if (myfile == NULL) {
        fprintf(stderr, "could not open file %s", filename);
        return EXIT_FAILURE;
    }

    for (;;) {
        chr = fgetc(myfile);

        /*
         * EOF is treated as a delimiter too, so a final token that is not
         * followed by ',' or '\n' is still stored and printed.
         */
        if (chr == ',' || chr == '\n' || chr == EOF) {
            if (len > 0) {      /* empty tokens are skipped */
                Nseq[n][len] = '\0';
                printf("%s\t%d\n", Nseq[n], n);
                ++n;
                len = 0;
            }
            if (chr == EOF)
                break;
            continue;
        }

        /* Refuse to write past the end of the table or of the current row. */
        if (n >= N || len >= L - 1) {
            fprintf(stderr, "input exceeds %d tokens of %d characters\n",
                    N, L - 1);
            status = EXIT_FAILURE;
            break;
        }

        Nseq[n][len++] = (char)chr;
    }

    /* fgetc returns EOF for read errors as well as for end of file. */
    if (ferror(myfile)) {
        fprintf(stderr, "error while reading file %s\n", filename);
        status = EXIT_FAILURE;
    }

    fclose(myfile);

    return status;
}

这是一个使用malloc的版本:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

/*
 * Size a heap buffer from the file's stat() size, read "pathtofile.txt"
 * into it in one go, split it on ',' and '\n', store every token in Nseq
 * and print it followed by its index.
 *
 * Returns EXIT_SUCCESS on success, or EXIT_FAILURE if the file cannot be
 * stat'ed, opened or read, if the buffer cannot be allocated, or if the
 * input does not fit in Nseq.
 */
int
main(void)
{
    /* Capacity: at most N tokens of at most L - 1 characters each. */
    enum { N = 300, L = 1000 };
    static char Nseq[N][L];     /* static storage keeps ~300 KB off the stack */

    /* Newline is a delimiter as well, so it never ends up inside a token. */
    static const char delims[] = ",\n";

    const char *filename = "pathtofile.txt";
    struct stat st;
    FILE *myfile;
    char *line;
    char *token;
    size_t cap;                 /* buffer size: file size plus the terminator */
    size_t got;                 /* bytes actually read */
    int status = EXIT_SUCCESS;
    int n = 0;

    if (stat(filename, &st) < 0) {
        fprintf(stderr, "could not stat file %s", filename);
        return EXIT_FAILURE;
    }
    cap = (size_t)st.st_size + 1;

    line = malloc(cap);
    if (line == NULL) {
        fprintf(stderr, "could not allocate %zu bytes for file %s\n",
                cap, filename);
        return EXIT_FAILURE;
    }

    myfile = fopen(filename, "r");
    if (myfile == NULL) {
        fprintf(stderr, "could not open file %s", filename);
        free(line);
        return EXIT_FAILURE;
    }

    /*
     * fread is bounded by the buffer size, so the read stays in bounds even
     * if the file grew after stat(); the data is then NUL-terminated so that
     * strtok can scan it as one string.
     */
    got = fread(line, 1, cap - 1, myfile);
    if (ferror(myfile)) {
        fprintf(stderr, "error while reading file %s\n", filename);
        status = EXIT_FAILURE;
    }
    line[got] = '\0';

    if (status == EXIT_SUCCESS) {
        /* strtok skips runs of delimiters, so empty tokens are never returned. */
        for (token = strtok(line, delims); token != NULL;
             token = strtok(NULL, delims)) {
            /* Refuse to write past the end of the table or of a row. */
            if (n >= N || strlen(token) >= (size_t)L) {
                fprintf(stderr, "input exceeds %d tokens of %d characters\n",
                        N, L - 1);
                status = EXIT_FAILURE;
                break;
            }
            strcpy(Nseq[n], token);
            printf("%s\t%d\n", token, n);
            n++;
        }
    }

    fclose(myfile);
    free(line);

    return status;
}

这是一个使用mmap的版本:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>

/*
 * Map "pathtofile.txt" read-only into memory, scan the mapping for tokens
 * separated by ',' or '\n', store every non-empty token in Nseq and print
 * it followed by its index.
 *
 * Returns EXIT_SUCCESS on success (including for an empty file), or
 * EXIT_FAILURE if the file cannot be opened, stat'ed or mapped, or if the
 * input does not fit in Nseq.
 */
int
main(void)
{
    /* Capacity: at most N tokens of at most L - 1 characters each. */
    enum { N = 300, L = 1000 };
    static char Nseq[N][L];     /* static storage keeps ~300 KB off the stack */

    const char *filename = "pathtofile.txt";
    const char *data;           /* start of the mapping */
    const char *cur;
    const char *end;            /* one past the last mapped byte */
    struct stat st;
    size_t len;
    int status = EXIT_SUCCESS;
    int fd;
    int tlen = 0;               /* characters collected for the current token */
    int n = 0;                  /* number of completed tokens */

    fd = open(filename, O_RDONLY);
    if (fd < 0) {
        fprintf(stderr, "could not open file %s", filename);
        return EXIT_FAILURE;
    }

    if (fstat(fd, &st) < 0) {
        fprintf(stderr, "could not stat file %s", filename);
        close(fd);
        return EXIT_FAILURE;
    }
    len = (size_t)st.st_size;

    /* mmap rejects a zero-length mapping; an empty file simply has no tokens. */
    if (len == 0) {
        close(fd);
        return EXIT_SUCCESS;
    }

    data = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
    if (data == MAP_FAILED) {
        fprintf(stderr, "could not mmap file %s", filename);
        close(fd);
        return EXIT_FAILURE;
    }

    end = data + len;

    for (cur = data; ; ++cur) {
        /*
         * The end of the mapping is treated as a delimiter too, so a final
         * token that is not followed by ',' or '\n' is still emitted.
         * at_end is tested first so that *cur is never read past the mapping.
         */
        int at_end = (cur == end);

        if (at_end || *cur == ',' || *cur == '\n') {
            if (tlen > 0) {     /* empty tokens are skipped */
                Nseq[n][tlen] = '\0';
                printf("%s\t%d\n", Nseq[n], n);
                ++n;
                tlen = 0;
            }
            if (at_end)
                break;
            continue;
        }

        /* Refuse to write past the end of the table or of the current row. */
        if (n >= N || tlen >= L - 1) {
            fprintf(stderr, "input exceeds %d tokens of %d characters\n",
                    N, L - 1);
            status = EXIT_FAILURE;
            break;
        }

        Nseq[n][tlen++] = *cur;
    }

    munmap((void *)data, len);
    close(fd);

    return status;
}