C 文件到多个 char *groups by word delimiter

Question

我有一个包含类似以下内容的文件：

Really my data is here, and I think its really 
cool. Somewhere, i want to break on some really
awesome data. Please let me really explain what is going
'\n'
on. You are amazing. Something is really awesome. 
Please give me the stuffs.

我想创建一个数组，其中包含指向分隔词之间字符串的字符串指针。

char **字符串：

my data is here, and I think its
cool. Somewhere, i want to break on some
awesome data. Please let me
explain what is going'\n'on. You are amazing. Something is
awesome.'\n'Please give me the stuffs.

尝试的代码：

/* Attempted code from the question. This is a fragment: `fileLength`, `end`,
 * `fp`, `text`, `packets` and `s` are not declared anywhere in the snippet. */
char *filedata = malloc(fileLength);
fread(filedata, end, 1, fp); //ABC
size_t stringCount = 8;
size_t idx = 0;
/* Pointer list that is meant to hold one duplicated string per token. */
char **data = malloc(stringCount * sizeof(*packets));
if(!data) {
    fprintf(stderr, "There was an error");
    return 1;
}
/* NOTE(review): this reads into the pointer list `data`, not into `filedata`
 * -- confirm which buffer was intended. */
fread(data, end, 1, text);
/* strtok() treats its second argument as a set of single delimiter characters,
 * so this splits on any of 'r', 'e', 'a', 'l', 'y', not on the word "really". */
char *stuff = strtok(data, "really");
while(stuff) {
    data[idx++] = strdup(stuff);
    /* NOTE(review): the next token is stored in `s`, so `stuff` is never
     * advanced by this call, and the delimiter set here is "stuff". */
    s = strtok(NULL, "stuff");
    if(idx >= stringCount) {
        /* List is full: double the capacity. */
        stringCount *= 2;
        /* NOTE(review): this reallocates the token pointer `stuff`, not the
         * list `data` -- presumably `data` was intended; confirm. */
        void *tmp = realloc(stuff, stringCount * sizeof(*stuff));
        if(!tmp) {
            perror("Unable to make a larger string list");
            /* Restore the previous capacity before giving up. */
            stringCount /= 2;
            break;
        }
        stuff = tmp;
    }
}

这段代码大致实现了我想要的效果，但它是按单个字母而不是按整个单词来分隔的。

Answer 1

您想以单词 "really" 为分隔符对文件进行标记化，这一目标存在一些微妙的困难。困难在哪里？文本文件通常是一次读取一行；如果要保存整个文件，则通常存为多个指针，每个指针指向一行的开头。也就是说，如果采用通用的面向行的方法来读取文件，您的标记（从文件开头开始，或以单词 "really" 开始）可能会跨越多行。因此，要进行标记化，您需要把多行组合起来。

或者，您可以将整个文件读入单个缓冲区，然后使用 strstr 查找分隔符 "really"；但是，您需要确保保存文件内容的缓冲区是以 nul 结尾的（nul-terminated），以避免调用 strstr 时出现未定义行为。（通常，将整个文件读入缓冲区并不会得到以 nul 结尾的缓冲区。）

也就是说，即使使用 strstr，您实际上也必须手动解析文件的内容。您需要维护三个指针：一个指向标记开头的起始指针；一个用于搜索分隔符的解析指针，用来处理找到的分隔符只是某个更长单词的子字符串的情况；最后是一个标记该标记结束位置的结束指针。

该方案相当简单：第一个标记从文件开头开始，之后的每个标记都以单词 "really" 开头。因此，您向前扫描找到 " really"（注意 "really" 之前有一个空格），将结束指针设置为 " really" 的开头（即当前标记的结尾），将标记复制到缓冲区，/* do stuff with token */，然后 free (token);，再将起始指针更新为 "really" 的开头，将通用解析指针移到 "really" 之后，如此重复，直到再也找不到 "really" 为止。退出解析循环后，您仍然需要对最后一个标记执行 /* do stuff */。

您还需要决定如何处理每个标记中包含的 '\n'。在下面的示例输出中，它们只是被 ' ' 覆盖。（您可以添加任何其他处理，例如去除因替换换行符而产生的尾随或中间多余空白，这部分留给您自己完成。）

总而言之，您可以执行类似下面的操作：由函数 read_file() 负责将文件内容读入以 nul 结尾的缓冲区，其余的标记化工作则在 main() 中完成，例如：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/**
 * read_file - read an entire file into a newly allocated, nul-terminated buffer.
 * @fname:  path of the file to read (opened in binary mode).
 * @nbytes: if non-NULL, receives the number of bytes read, not counting the
 *          terminating nul; it is left untouched on failure.
 *
 * Returns a pointer to the buffer on success; the caller owns it and must
 * free() it. Returns NULL if the file cannot be opened, its length cannot be
 * determined, memory cannot be allocated, or the read comes up short. The
 * stream is closed and any partial buffer is released on every path.
 */
char *read_file (char* fname, size_t *nbytes)
{
    long bytes = 0;
    char *file_content = NULL;
    FILE *file = fopen (fname, "rb");

    if (!file)          /* validate file open for reading */
        return NULL;

    /* seek to the end to learn the length, then rewind for the read */
    if (fseek (file, 0, SEEK_END) != 0 ||
        (bytes = ftell (file)) < 0 ||
        fseek (file, 0, SEEK_SET) != 0) {
        fprintf (stderr, "error: unable to determine file length.\n");
        goto fail;
    }

    /* one extra byte for the nul terminator appended below */
    if (!(file_content = malloc ((size_t)bytes + 1))) {
        perror ("malloc - virtual memory exhausted");
        goto fail;
    }

    /* read all data in a single call to fread */
    if (fread (file_content, 1, (size_t)bytes, file) != (size_t)bytes) {
        fprintf (stderr, "error: failed to read %ld-bytes from '%s'.\n",
                bytes, fname);
        goto fail;
    }
    fclose (file);              /* close file */

    file_content[bytes] = 0;    /* nul terminate - to allow strstr use */

    if (nbytes)
        *nbytes = (size_t)bytes;    /* make the size available to the caller */

    return file_content;        /* ownership passes to the caller */

fail:
    free (file_content);        /* free(NULL) is a no-op */
    fclose (file);
    return NULL;
}

/**
 * main - split the file named by argv[1] into tokens on the word "really".
 *
 * The first token starts at the beginning of the file; each later token
 * starts at the word "really". A delimiter only counts when " really" is
 * followed by whitespace, punctuation or the end of the buffer, so longer
 * words that merely begin with "really" do not split the text. Newlines
 * inside a token are replaced by spaces before it is printed.
 *
 * Returns 0 on success, 1 if no filename was given or the file could not be
 * read. Exits with EXIT_FAILURE if a token cannot be allocated.
 */
int main (int argc, char **argv) {

    size_t nbytes;
    char *content;

    if (argc < 2) {     /* validate required argument given */
        fprintf (stderr, "error: insufficient input. filename req'd.\n");
        return 1;
    }

    if (!(content = read_file (argv[1], &nbytes))) {    /* read/validate */
        fprintf (stderr, "error: unable to read file '%s'.\n", argv[1]);
        return 1;
    }

    const char *delim = " really";      /* delimiter */
    /* strlen, not sizeof: delim is a pointer, so sizeof gives the pointer size */
    const size_t dlen = strlen (delim);
    char *sp = content,     /* start pointer for token */
        *p = content,       /* pointer for parsing token */
        *ep;                /* end pointer one past end of token */

    while ((ep = strstr (p, delim))) {  /* while delimiter found */
        /* <ctype.h> functions need a value representable as unsigned char */
        unsigned char next = (unsigned char)ep[dlen];

        if (next == '\0' || isspace (next) || ispunct (next)) {
            /* whole-word delimiter found */
            size_t tlen = (size_t)(ep - sp);    /* get token length */
            char *token = malloc (tlen + 1);    /* allocate for token */

            if (!token) {                       /* validate allocation */
                perror ("malloc-token");
                exit (EXIT_FAILURE);
            }
            memcpy (token, sp, tlen);           /* copy to token */
            token[tlen] = 0;                    /* nul-terminate */
            for (char *tp = token; *tp; tp++) { /* replace '\n' with ' ' */
                if (*tp == '\n')
                    *tp = ' ';
            }
            printf ("\ntoken: %s\n", token);    /* output token */
            /* do stuff with token */
            free (token);                       /* free token memory */
            sp = ep + 1;    /* skip the leading space: next token starts at "really" */
        }
        /* resume right after the delimiter; ep + dlen is at most the
         * terminating nul, so the next strstr stays inside the buffer */
        p = ep + dlen;
    }

    for (p = sp; *p; p++) {     /* change '\n' to ' ' in last token */
        if (*p == '\n')
            *p = ' ';
    }
    printf ("\ntoken: %s\n", sp);
    /* do stuff with last token */

    free (content);     /* free buffer holding file */

    return 0;
}

示例输入文件

$ cat dat/breakreally.txt
my data is here, and I think its really
cool. Somewhere, i want to break on some really
awesome data. Please let me really explain what is going
on. You are amazing.

示例用法/输出

$ ./bin/freadbreakreally dat/breakreally.txt

token: my data is here, and I think its

token: really  cool. Somewhere, i want to break on some

token: really awesome data. Please let me

token: really explain what is going on. You are amazing.

检查一下，如果您有任何问题，请告诉我。

C 文件到多个 char *groups by word delimiter

C file to multiple char *groups by word delimiter

c

string

strstr

strsplit