使用另一个字符串作为分隔符将字符串分解为标记列表？

Question

假设我有这个字符串：

char *myTestString = "Hello AND test AND test2";

我想将其分解为集合 { Hello、test、test2 }，我最终可以对其进行迭代。

或者我有

char *myTestString2 = "Hi AND there AND test AND test2";

我想将其分解为集合 { Hi、there、test、test2 }，稍后我可以对其进行迭代。

如何使用 C 实现此目的？

编辑：另一个例子是拆分 "Hello there AND test" 应该给出集合 { Hello there, test }。为澄清起见，"AND" 是此处的分隔符。

Answer 1

注意：正如其他人提到的，strtok() 不适合字符串文字，在这种情况下，您应该使用 Chux 的答案 (strcspn)，但如果这不是问题，您可以使用存储在数组中的字符串，然后继续阅读。最后的手段是使用字符串文字的副本。

首先，您必须决定要为您的集合使用哪种数据结构（例如，一个简单的链表，您在插入之前检查重复项）。

然后，对您的字符串使用 strtok()：如果当前标记不同于 "AND"（或任何其他要忽略的字符串——您也可以维护一组要忽略的字符串），就将其插入到集合中，否则继续处理下一个标记。

这是一个基本的完整最小示例，可以帮助您入门：

#include <stdio.h>
#include <string.h>

#define N 3     // Max size of set
#define LEN 32  // Buffer size per word (max word length + 1 for the terminator)

/*
 * Demo: tokenizes a space-separated sentence with strtok() and stores every
 * token that differs from the ignored word "AND" in a fixed-size table.
 *
 * Each token is echoed as it is found; the collected set is printed at the
 * end. Tokens that would not fit (table already holds N entries, or the token
 * needs more than LEN bytes) are reported on stderr and skipped instead of
 * being written past the end of the table.
 *
 * Always returns 0.
 */
int main (void)
{
  char set[N][LEN] = {0};
  const char *ignore_str = "AND";
  // Must be a writable array: strtok() overwrites separators with '\0'.
  char str[] ="Hello AND test AND test2";
  char *pch;
  int i = 0;  // number of entries stored in set

  printf ("Splitting string \"%s\" into tokens:\n",str);
  pch = strtok (str," ");
  while (pch != NULL)
  {
    printf ("%s\n",pch);
    if (strcmp(pch, ignore_str) != 0)
    {
      if (i >= N)
        fprintf(stderr, "set is full, dropping \"%s\"\n", pch);
      else if (strlen(pch) >= LEN)
        fprintf(stderr, "token too long, dropping \"%s\"\n", pch);
      else
        strcpy(set[i++], pch);  // safe: length was checked against LEN above
    }
    pch = strtok (NULL, " ");
  }
  printf("My set is: {");
  for(int j = 0; j < i; ++j)
    printf("%s, ", set[j]);
  printf("}\n");
  return 0;
}

输出：

Splitting string "Hello AND test AND test2" into tokens:
Hello
AND
test
AND
test2
My set is: {Hello, test, test2, }

在这里，我使用数组来表示集合，假设集合的最大大小为3。当然，您可以使用更动态的方法（例如动态内存分配数组或列表）。

Answer 2

当代码不想改变源字符串时，使用 strcspn(s, delimit) 查找 s 开头不包含 delimit 中任何字符的那一段。它返回该段的长度（偏移量）。

使用 strspn(s, delimit) 查找 s 开头完全由 delimit 中的字符组成的那一段。它返回该段的长度（偏移量）。

Answer 3

给你。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Splits s1 into words separated by spaces/tabs and returns every word that
 * is not exactly equal to the delimiter word s2.
 *
 * Parameters:
 *   s1 - string to split (not modified).
 *   s2 - word to leave out of the result; only whole-word matches are
 *        dropped, so a word that merely starts with s2 is kept.
 *
 * Returns a malloc'ed array of malloc'ed strings terminated by a null
 * pointer (the array holds just the null pointer when no word qualifies),
 * or NULL if any allocation fails. The caller frees each string and then
 * the array.
 */
char ** split( const char *s1, const char *s2 )
{
    const char *delim = " \t";
    const size_t len = strlen( s2 );
    size_t n = 0;   /* number of tokens stored so far */

    char **tokens = malloc( sizeof *tokens );
    if ( tokens == NULL ) return NULL;
    tokens[0] = NULL;

    while ( *s1 )
    {
        s1 += strspn( s1, delim );
        if ( *s1 == '\0' ) break;

        const char *p = s1;
        s1 += strcspn( s1, delim );
        const size_t word_len = ( size_t )( s1 - p );

        /* Compare lengths too: a prefix match alone is not the delimiter. */
        if ( word_len == len && memcmp( p, s2, len ) == 0 ) continue;

        char *copy = malloc( word_len + 1 );
        char **tmp = copy != NULL ? realloc( tokens, ( n + 2 ) * sizeof *tokens ) : NULL;

        if ( tmp == NULL )
        {
            /* Release everything and report failure with NULL so the caller
               never sees a pointer to freed memory. */
            free( copy );
            for ( size_t i = 0; i < n; i++ ) free( tokens[i] );
            free( tokens );
            return NULL;
        }

        memcpy( copy, p, word_len );
        copy[word_len] = '\0';

        tokens = tmp;
        tokens[n++] = copy;
        tokens[n] = NULL;   /* keep the list terminated */
    }

    return tokens;
}

/*
 * Demo driver: splits a sample sentence on the word "AND", prints each
 * resulting token on its own line, then releases the token list.
 * Nothing is printed when split() reports failure by returning NULL.
 */
int main(void) 
{
    const char *s1 = "Hi AND there AND test AND test2";
    const char *s2 = "AND";

    char **tokens = split( s1, s2 );

    if ( tokens != NULL )
    {
        for ( char **p = tokens; *p != NULL; ++p )
        {
            puts( *p );
        }

        /* Test each entry BEFORE freeing it: a pointer's value is
           indeterminate once the object it points to has been freed. */
        for ( char **p = tokens; *p != NULL; ++p )
        {
            free( *p );
        }

        free( tokens );
    }

    return 0;
}

程序输出为

Hi
there
test
test2

如果内存分配不成功，函数返回 NULL。否则，它返回一个指向数组的指针，该数组的元素类型为 char *，最后一个元素为空指针。

源字符串中的单词由制表符和空格分隔。您可以根据需要更改分隔符。

在您对我之前的解决方案发表评论后，您似乎需要以下内容

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Splits s1 into the substrings found between occurrences of the delimiter
 * string s2.
 *
 * Leading spaces/tabs of every substring and empty substrings (consecutive
 * delimiters) are skipped; trailing blanks of a substring are kept. The
 * delimiter is located with strstr(), so it is also recognized inside a
 * longer word. An empty s2 means "no delimiter": the remainder of s1 after
 * its leading blanks is returned as a single token.
 *
 * Parameters:
 *   s1 - string to split (not modified).
 *   s2 - delimiter string.
 *
 * Returns a malloc'ed array of malloc'ed strings terminated by a null
 * pointer, or NULL if any allocation fails. The caller frees each string
 * and then the array.
 */
char ** split( const char *s1, const char *s2 )
{
    const char *delim = " \t";
    const size_t len2 = strlen( s2 );
    size_t n = 0;   /* number of tokens stored so far */

    char **tokens = malloc( sizeof *tokens );
    if ( tokens == NULL ) return NULL;
    tokens[0] = NULL;

    while ( *s1 )
    {
        /* Skip blanks and any run of delimiters in front of the next token.
           The len2 guard prevents spinning forever on an empty delimiter. */
        for ( ;; )
        {
            s1 += strspn( s1, delim );
            if ( len2 == 0 || strncmp( s1, s2, len2 ) != 0 ) break;
            s1 += len2;
        }

        if ( *s1 == '\0' ) break;

        const char *p = len2 != 0 ? strstr( s1, s2 ) : NULL;
        const size_t len1 = p == NULL ? strlen( s1 ) : ( size_t )( p - s1 );

        char *copy = malloc( len1 + 1 );
        char **tmp = copy != NULL ? realloc( tokens, ( n + 2 ) * sizeof *tokens ) : NULL;

        if ( tmp == NULL )
        {
            /* Release everything and report failure with NULL so the caller
               never sees a pointer to freed memory. */
            free( copy );
            for ( size_t i = 0; i < n; i++ ) free( tokens[i] );
            free( tokens );
            return NULL;
        }

        memcpy( copy, s1, len1 );
        copy[len1] = '\0';

        tokens = tmp;
        tokens[n++] = copy;
        tokens[n] = NULL;   /* keep the list terminated */

        /* Step over the token and, when one followed it, the delimiter. */
        s1 += p == NULL ? len1 : len1 + len2;
    }

    return tokens;
}

/*
 * Demo driver: splits a sample sentence on the string "AND", prints each
 * resulting substring on its own line, then releases the token list.
 * Nothing is printed when split() reports failure by returning NULL.
 */
int main(void) 
{
    const char *s1 = "Hi there AND test test2";
    const char *s2 = "AND";

    char **tokens = split( s1, s2 );

    if ( tokens != NULL )
    {
        for ( char **p = tokens; *p != NULL; ++p )
        {
            puts( *p );
        }

        /* Test each entry BEFORE freeing it: a pointer's value is
           indeterminate once the object it points to has been freed. */
        for ( char **p = tokens; *p != NULL; ++p )
        {
            free( *p );
        }

        free( tokens );
    }

    return 0;
}

程序输出为

Hi there 
test test2

也许您还需要删除提取的子字符串的尾随空格，我希望您可以自己做。:)。

Answer 4

strstr 可以用来定位子串。检查匹配位置前面的字符是否为空白字符，以及后面的字符是否为空白字符或字符串的终止零。
根据需要删除空白字符。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/*
 * Returns the first stand-alone occurrence of match at or after from, or
 * NULL when there is none. "Stand-alone" means the occurrence is preceded by
 * whitespace (or sits at base, the very start of the whole string) and is
 * followed by whitespace or the terminating zero, so "AND" inside "SANDY"
 * is not a delimiter. len is strlen ( match).
 */
static const char *find_delimiter ( const char *base, const char *from
, const char *match, size_t len) {
    const char *find = strstr ( from, match);

    while ( find) {
        int lead = find == base || isspace ( (unsigned char)*(find - 1));
        int trail = 0 == *(find + len) || isspace ( (unsigned char)*(find + len));
        if ( lead && trail) {
            break;
        }
        find = strstr ( find + 1, match);
    }
    return find;
}

/*
 * Demo: prints, one per line, the pieces of a sample string that lie between
 * stand-alone occurrences of the word "AND". Every piece is stripped of
 * leading and trailing whitespace and empty pieces are skipped.
 */
int main ( void) {
    const char *myTestString = "   AND SANDY AND Hello there AND AND test AND test2 AND test3    ";
    const char *match = "AND";
    const char *start = myTestString;
    size_t len = strlen ( match);

    while ( isspace ( (unsigned char)*start)) {//skip leading whitespace
        ++start;
    }
    while ( *start) {
        const char *find = find_delimiter ( myTestString, start, match, len);
        //without a further delimiter the piece runs to the end of the string
        const char *end = find ? find : start + strlen ( start);
        //never step past the terminating zero when no delimiter was found
        const char *next = find ? find + len : end;

        while ( end > start && isspace ( (unsigned char)*(end - 1))) {
            --end;//remove trailing whitespace
        }
        if ( end > start) {
            printf ( "%.*s\n", (int)(end - start), start);
        }
        start = next;
        while ( isspace ( (unsigned char)*start)) {//skip whitespace after delimiter
            ++start;
        }
    }

    return 0;
}

分配内存给char**，分配内存并复制每个token。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

char **freetokens ( char **tokens);
void showtokens ( char **tokens);
char **addtoken ( char **tokens, int *count, char *text, int size);

/*
 * Prints every token of a NULL-terminated token list on its own line.
 * A NULL list prints nothing.
 */
void showtokens ( char **tokens) {
    for ( int each = 0; tokens && tokens[each]; ++each) {
        printf ( "%s\n", tokens[each]);
    }
}

/*
 * Returns the first stand-alone occurrence of match at or after from, or
 * NULL when there is none. "Stand-alone" means the occurrence is preceded by
 * whitespace (or sits at base, the very start of the whole string) and is
 * followed by whitespace or the terminating zero, so "AND" inside "SANDY"
 * is not a delimiter. len is strlen ( match).
 */
static char *find_delimiter ( char *base, char *from, const char *match, size_t len) {
    char *find = strstr ( from, match);

    while ( find) {
        int lead = find == base || isspace ( (unsigned char)*(find - 1));
        int trail = 0 == *(find + len) || isspace ( (unsigned char)*(find + len));
        if ( lead && trail) {
            break;
        }
        find = strstr ( find + 1, match);
    }
    return find;
}

/*
 * Demo: collects the pieces of a sample string that lie between stand-alone
 * occurrences of the word "AND" into a token list, prints the list and
 * frees it. Every piece is stripped of leading and trailing whitespace and
 * empty pieces are skipped.
 */
int main ( void) {
    char *myTestString = "   AND SANDY AND Hello there test AND AND test2 AND test3   ";
    char *match = "AND";
    char *start = myTestString;
    char **tokens = NULL;
    int items = 0;
    size_t len = strlen ( match);

    while ( isspace ( (unsigned char)*start)) {//skip leading whitespace
        ++start;
    }
    while ( *start) {
        char *find = find_delimiter ( myTestString, start, match, len);
        //without a further delimiter the piece runs to the end of the string
        char *end = find ? find : start + strlen ( start);
        //never step past the terminating zero when no delimiter was found
        char *next = find ? find + len : end;

        while ( end > start && isspace ( (unsigned char)*(end - 1))) {
            --end;//remove trailing whitespace
        }
        if ( end > start) {
            tokens = addtoken ( tokens, &items, start, (int)(end - start));
        }
        start = next;
        while ( isspace ( (unsigned char)*start)) {//skip whitespace after delimiter
            ++start;
        }
    }

    showtokens ( tokens);

    tokens = freetokens ( tokens);

    return 0;
}

/*
 * Appends a copy of the first size bytes of text to the NULL-terminated
 * token list tokens, which currently holds *count entries.
 *
 * The list is grown with realloc (a NULL list is allowed), the sentinel is
 * written behind the new slot, and the text is copied into a fresh buffer
 * and zero-terminated. *count is incremented only when the copy succeeded.
 *
 * Returns the (possibly moved) list. If the list cannot be grown, a message
 * is written to stderr and the unchanged list is returned. If the buffer for
 * the copy cannot be allocated, a message is written to stderr and the grown
 * list is returned with a null pointer in the new slot.
 */
char **addtoken ( char **tokens, int *count, char *text, int size) {
    int slot = *count;
    char **grown = realloc ( tokens, ( slot + 2) * sizeof *grown);

    if ( grown == NULL) {
        fprintf ( stderr, "problem realloc tokens\n");
        return tokens;
    }
    grown[slot + 1] = NULL;//sentinel

    char *copy = malloc ( size + 1);
    grown[slot] = copy;
    if ( copy == NULL) {
        fprintf ( stderr, "problem realloc tokens[]\n");
        return grown;
    }
    memmove ( copy, text, size);
    copy[size] = 0;//terminate
    *count = slot + 1;

    return grown;
}

/*
 * Frees every string of a NULL-terminated token list and then the list
 * itself. A NULL list is accepted.
 *
 * Always returns NULL, so the result can be assigned back to the caller's
 * list pointer.
 */
char **freetokens ( char **tokens) {
    if ( tokens) {
        for ( char **cur = tokens; *cur; ++cur) {
            free ( *cur);
        }
    }
    free ( tokens);

    return NULL;
}

Answer 5

strstr() 是您正在寻找的工具。它可以在另一个字符串中定位一个字符串。

这是一个具有这些额外规格的简单解决方案：

返回值是一个含 (n+1) 个条目的数组，最后一个条目是空指针。
分隔符字符串可以出现在任何地方，包括单词内部。
子字符串会被修剪：开头和结尾的空白字符会被删除。
子字符串使用 strndup() 分配，该函数由 POSIX 定义。
分隔符字符串的长度必须至少为 1。

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Returns a newly allocated copy of the first n bytes of s with leading and
 * trailing whitespace removed. The copy is made with strndup(), so the
 * result is whatever strndup() returns and must be released with free().
 */
char *strdup_trim(const char *s, size_t n) {
    size_t begin = 0;
    size_t end = n;

    /* Move the left edge past leading whitespace. */
    while (begin < end && isspace((unsigned char)s[begin]))
        begin++;
    /* Pull the right edge back over trailing whitespace. */
    while (end > begin && isspace((unsigned char)s[end - 1]))
        end--;

    return strndup(s + begin, end - begin);
}

/*
 * Splits str at every occurrence of the separator string sep.
 *
 * Each piece is copied with strdup_trim(), i.e. without its leading and
 * trailing whitespace. The separator is located with strstr(), so it is
 * recognized anywhere, including inside a word. A string with k separators
 * yields k + 1 pieces (some of which may be empty strings).
 *
 * Parameters:
 *   str - string to split (not modified).
 *   sep - separator; must be at least one character long.
 *
 * Returns a malloc'ed array of the pieces followed by a null pointer, or
 * NULL when sep is empty or an allocation fails (nothing is leaked in that
 * case).
 */
char **split(const char *str, const char *sep) {
    size_t sep_len = strlen(sep);

    if (sep_len == 0)
        return NULL;

    /* First pass: count separators to size the array. */
    size_t n = 0;
    for (const char *p = str; (p = strstr(p, sep)) != NULL; p += sep_len)
        n++;

    /* n separators -> n + 1 pieces, plus the terminating null pointer. */
    char **a = calloc(n + 2, sizeof(*a));
    if (a == NULL)
        return NULL;

    /* Second pass: copy each piece. */
    size_t i = 0;
    const char *p0 = str;
    for (;;) {
        const char *p = strstr(p0, sep);
        size_t piece_len = p != NULL ? (size_t)(p - p0) : strlen(p0);

        a[i] = strdup_trim(p0, piece_len);
        if (a[i] == NULL) {
            /* A null entry in the middle would silently truncate the list
               and leak later pieces; fail cleanly instead. */
            while (i > 0)
                free(a[--i]);
            free(a);
            return NULL;
        }
        i++;

        if (p == NULL)
            break;
        p0 = p + sep_len;
    }
    a[i] = NULL;
    return a;
}

/*
 * Releases an array of strings terminated by a null pointer: every string
 * is freed first, then the array itself. Passing NULL does nothing.
 */
void free_split(char **a) {
    if (a == NULL)
        return;

    for (char **cur = a; *cur != NULL; cur++)
        free(*cur);
    free(a);
}

/*
 * Splits str at sep with split() and prints the call together with the
 * resulting pieces on one line, e.g.
 *   split('a AND b', 'AND') -> { 'a', 'b' }
 * If split() returns NULL, "NULL" is printed in place of the list.
 */
void test(const char *str, const char *sep) {
    char **a = split(str, sep);

    /* split() returns NULL for an empty separator or a failed allocation;
       indexing the result unchecked would dereference a null pointer. */
    if (a == NULL) {
        printf("split('%s', '%s') -> NULL\n", str, sep);
        return;
    }

    printf("split('%s', '%s') -> {", str, sep);
    for (size_t i = 0; a[i]; i++)
        printf("%s '%s'", &","[!i], a[i]);  /* "," before all but the first item */
    printf(" }\n");
    free_split(a);
}

/*
 * Runs test() on each sample sentence from the question, using "AND" as the
 * separator every time. Always returns 0.
 */
int main() {
    const char *samples[] = {
        "Hello AND test AND test2",
        "Hi AND there AND test AND test2",
        "Hello there AND test",
    };

    for (size_t k = 0; k < sizeof samples / sizeof samples[0]; k++)
        test(samples[k], "AND");

    return 0;
}

输出：

split('Hello AND test AND test2', 'AND') -> { 'Hello', 'test', 'test2' }
split('Hi AND there AND test AND test2', 'AND') -> { 'Hi', 'there', 'test', 'test2' }
split('Hello there AND test', 'AND') -> { 'Hello there', 'test' }

使用另一个字符串作为分隔符将字符串分解为标记列表？

Breaking up a string into a list of tokens using another string as a delimiter?

c

string

split

c-strings

string-matching