使用另一个字符串作为分隔符将字符串分解为标记列表?
Breaking up a string into a list of tokens using another string as a delimiter?
假设我有这个字符串:
char *myTestString = "Hello AND test AND test2";
我想将其分解为集合 { Hello
、test
、test2
},我最终可以对其进行迭代。
或者我有
char *myTestString2 = "Hi AND there AND test AND test2";
我想将其分解为集合 { Hi
、there
、test
、test2
},稍后我可以对其进行迭代。
如何使用 C 实现此目的?
编辑:
另一个例子是拆分 "Hello there AND test"
应该给出集合 { Hello there
, test
}。
为澄清起见,"AND"
是此处的分隔符。
注意:正如其他人提到的,strtok()
不适合字符串文字,在这种情况下,您应该使用 Chux 的答案 (strcspn
),但如果这不是问题,您可以使用存储在数组中的字符串,然后继续阅读。最后的手段是使用字符串文字的副本。
首先,您必须决定要为您的集合使用哪种数据结构(例如,一个简单的链表,您在插入之前检查重复项)。
然后,将 strtok()
用于您的字符串,如果当前标记不同于 "AND"
(或任何其他要忽略的字符串 - 您也会有一组忽略的字符串), 然后将其插入到集合中,否则继续下一个token。
这是一个基本的完整最小示例,可以帮助您入门:
#include <stdio.h>
#include <string.h>
#define N 3 // Max size of set
#define LEN 32 // Max length of word - 1
/*
 * Demo: tokenize a writable string on spaces with strtok() and store every
 * token that differs from the ignored word "AND" in a fixed-size table.
 *
 * Uses the file-level macros N (number of slots) and LEN (bytes per slot).
 * Tokens that do not fit are reported on stderr instead of overflowing set[].
 */
int main(void)
{
    char set[N][LEN] = {0};
    const char *ignore_str = "AND";
    char str[] = "Hello AND test AND test2"; /* an array, because strtok() writes into it */
    int i = 0;

    printf("Splitting string \"%s\" into tokens:\n", str);

    for (char *pch = strtok(str, " "); pch != NULL; pch = strtok(NULL, " ")) {
        printf("%s\n", pch);
        if (strcmp(pch, ignore_str) == 0) {
            continue; /* the delimiter word itself is never stored */
        }
        if (i >= N) {
            /* Without this guard the copy below would write past set[N - 1]. */
            fprintf(stderr, "set is full (%d entries), dropping \"%s\"\n", N, pch);
            continue;
        }
        /* Bounded copy: an over-long token is truncated to LEN - 1 characters. */
        snprintf(set[i], LEN, "%s", pch);
        ++i;
    }

    printf("My set is: {");
    for (int j = 0; j < i; ++j) {
        printf("%s, ", set[j]);
    }
    printf("}\n");
    return 0;
}
输出:
Splitting string "Hello AND test AND test2" into tokens:
Hello
AND
test
AND
test2
My set is: {Hello, test, test2, }
在这里,我使用数组来表示集合,假设集合的最大大小为3。当然,您可以使用更动态的方法(例如动态内存分配数组或列表)。
当代码不想修改源字符串时,使用 strcspn(s, delimit)
查找 s
中不含 delimit 内任何字符的初始部分
。它返回该部分的长度(即偏移量)。
使用 strspn(s, delimit)
查找 s
中完全由 delimit 内的字符组成的初始部分
。它同样返回该部分的长度(即偏移量)。
给你。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Split s1 into words separated by spaces/tabs and return every word that is
 * not exactly equal to s2.
 *
 * Returns a malloc'ed, NULL-terminated array of malloc'ed strings; the caller
 * frees each string and then the array. Returns NULL if any allocation fails
 * (everything allocated so far is released first).
 *
 * A word is skipped only when it matches s2 in full, so "ANDY" is kept when
 * s2 is "AND".
 */
char **split(const char *s1, const char *s2)
{
    const char *delim = " \t";
    const size_t skip_len = strlen(s2);
    size_t count = 0; /* number of words stored so far */
    char **tokens = malloc(sizeof *tokens);

    if (tokens == NULL) {
        return NULL;
    }
    tokens[0] = NULL;

    while (*s1 != '\0') {
        s1 += strspn(s1, delim);
        if (*s1 == '\0') {
            break; /* only trailing whitespace was left */
        }

        const char *word = s1;
        const size_t word_len = strcspn(s1, delim);
        s1 += word_len;

        /* Compare the whole word, not just a prefix of it. */
        if (word_len == skip_len && memcmp(word, s2, skip_len) == 0) {
            continue;
        }

        /* Room for the new word plus the NULL sentinel. */
        char **grown = realloc(tokens, (count + 2) * sizeof *tokens);
        if (grown == NULL) {
            goto fail;
        }
        tokens = grown;

        char *copy = malloc(word_len + 1);
        if (copy == NULL) {
            goto fail;
        }
        memcpy(copy, word, word_len);
        copy[word_len] = '\0';

        tokens[count++] = copy;
        tokens[count] = NULL;
    }
    return tokens;

fail:
    for (size_t i = 0; i < count; i++) {
        free(tokens[i]);
    }
    free(tokens);
    return NULL; /* never hand a freed array back to the caller */
}
/*
 * Demo driver: runs split() on a sample sentence, prints one token per line,
 * then releases every token and finally the token array itself.
 */
int main(void)
{
    const char *s1 = "Hi AND there AND test AND test2";
    const char *s2 = "AND";
    char **tokens = split( s1, s2 );

    if ( tokens == NULL )
    {
        return 0; /* nothing to print or free when split() hands back NULL */
    }

    size_t count = 0;
    while ( tokens[count] != NULL )
    {
        puts( tokens[count] );
        ++count;
    }

    for ( size_t i = 0; i < count; i++ )
    {
        free( tokens[i] );
    }
    free( tokens );

    return 0;
}
程序输出为
Hi
there
test
test2
如果内存分配失败,该函数返回 NULL。否则,它返回一个指向元素类型为 char * 的数组的指针,该数组的最后一个元素为空指针。
源字符串中的单词由制表符和空格分隔。您可以根据需要更改分隔符。
在您对我之前的解决方案发表评论后,您似乎需要以下内容
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Split s1 at every occurrence of the separator string s2.
 *
 * Leading spaces/tabs of each field are skipped and empty fields (adjacent
 * separators, or a separator at the very start) are dropped. Whitespace
 * between a field and the separator that follows it is kept, so
 * "Hi there AND x" yields "Hi there " and "x".
 *
 * Returns a malloc'ed, NULL-terminated array of malloc'ed strings; the caller
 * frees each string and then the array. Returns NULL if s2 is empty or if an
 * allocation fails (everything allocated so far is released first).
 */
char **split(const char *s1, const char *s2)
{
    const char *delim = " \t";
    const size_t sep_len = strlen(s2);
    size_t count = 0; /* number of fields stored so far */

    /* An empty separator matches at every position; the skip loop below
     * would then never terminate. */
    if (sep_len == 0) {
        return NULL;
    }

    char **tokens = malloc(sizeof *tokens);
    if (tokens == NULL) {
        return NULL;
    }
    tokens[0] = NULL;

    for (;;) {
        /* Skip whitespace and any separators that would start an empty field. */
        for (;;) {
            s1 += strspn(s1, delim);
            if (strncmp(s1, s2, sep_len) != 0) {
                break;
            }
            s1 += sep_len;
        }
        if (*s1 == '\0') {
            break;
        }

        const char *next = strstr(s1, s2);
        const size_t len = next != NULL ? (size_t)(next - s1) : strlen(s1);

        /* Room for the new field plus the NULL sentinel. */
        char **grown = realloc(tokens, (count + 2) * sizeof *tokens);
        if (grown == NULL) {
            goto fail;
        }
        tokens = grown;

        char *copy = malloc(len + 1);
        if (copy == NULL) {
            goto fail;
        }
        memcpy(copy, s1, len);
        copy[len] = '\0';

        tokens[count++] = copy;
        tokens[count] = NULL;

        s1 += len;
        if (next != NULL) {
            s1 += sep_len; /* step over the separator that ended this field */
        }
    }
    return tokens;

fail:
    for (size_t i = 0; i < count; i++) {
        free(tokens[i]);
    }
    free(tokens);
    return NULL; /* never hand a freed array back to the caller */
}
/*
 * Demo driver: runs split() on a sample sentence that contains one "AND"
 * separator, prints one field per line, then releases every field and the
 * array that holds them.
 */
int main(void)
{
    const char *s1 = "Hi there AND test test2";
    const char *s2 = "AND";
    char **tokens = split( s1, s2 );

    if ( tokens == NULL )
    {
        return 0; /* nothing to print or free when split() hands back NULL */
    }

    size_t count = 0;
    while ( tokens[count] != NULL )
    {
        puts( tokens[count] );
        ++count;
    }

    for ( size_t i = 0; i < count; i++ )
    {
        free( tokens[i] );
    }
    free( tokens );

    return 0;
}
程序输出为
Hi there
test test2
也许您还需要删除提取的子字符串的尾随空格,我希望您可以自己做。:)。
strstr
可以用来定位子串。检查匹配位置之前的字符是否为空白字符,以及匹配之后的字符是否为空白字符或字符串结束符 '\0'。
根据需要删除空白字符。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/* Return a pointer to the first non-whitespace character at or after s. */
static const char *skip_spaces ( const char *s) {
    while ( isspace ( (unsigned char)*s)) {
        ++s;
    }
    return s;
}

/*
 * Find the next occurrence of match at or after from that stands alone as a
 * word: it must begin at base or follow whitespace, and must be followed by
 * whitespace or the terminating zero. Returns NULL when there is none.
 * base is the start of the whole string, so hit[-1] is only read when it
 * lies inside the string.
 */
static const char *next_delim ( const char *base, const char *from, const char *match, size_t len) {
    for ( const char *hit = strstr ( from, match); hit; hit = strstr ( hit + 1, match)) {
        int lead = hit == base || isspace ( (unsigned char)hit[-1]);
        int trail = 0 == hit[len] || isspace ( (unsigned char)hit[len]);
        if ( lead && trail) {
            return hit;
        }
    }
    return NULL;
}

/* Print [begin, end) without trailing whitespace; prints nothing if that is empty. */
static void print_trimmed ( const char *begin, const char *end) {
    while ( end > begin && isspace ( (unsigned char)end[-1])) {
        --end;
    }
    if ( end > begin) {
        printf ( "%.*s\n", (int)( end - begin), begin);
    }
}

/*
 * Demo: print every field of the sample string that lies between standalone
 * "AND" words, one per line, with surrounding whitespace removed. "AND"
 * embedded in a longer word (as in "SANDY") is not a delimiter.
 *
 * The scan stops as soon as no further delimiter exists, so the cursor is
 * never advanced beyond the terminating zero.
 */
int main ( void) {
    const char *myTestString = " AND SANDY AND Hello there AND AND test AND test2 AND test3 ";
    const char *match = "AND";
    size_t len = strlen ( match);
    const char *start = skip_spaces ( myTestString);
    const char *find = NULL;

    while ( ( find = next_delim ( myTestString, start, match, len))) {
        print_trimmed ( start, find);
        start = skip_spaces ( find + len);//skip the delimiter and following whitespace
    }
    //whatever follows the last delimiter is the final field
    print_trimmed ( start, start + strlen ( start));
    return 0;
}
分配内存给char**
,分配内存并复制每个token。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
char **freetokens ( char **tokens);
void showtokens ( char **tokens);
char **addtoken ( char **tokens, int *count, char *text, int size);
/* Return a pointer to the first non-whitespace character at or after s. */
static char *skip_whitespace ( char *s) {
    while ( isspace ( (unsigned char)*s)) {
        ++s;
    }
    return s;
}

/*
 * Find the next occurrence of match at or after from that stands alone as a
 * word: it must begin at base or follow whitespace, and must be followed by
 * whitespace or the terminating zero. Returns NULL when there is none.
 * base is the start of the whole string, so hit[-1] is only read when it
 * lies inside the string.
 */
static char *find_separator ( char *base, char *from, const char *match, size_t len) {
    for ( char *hit = strstr ( from, match); hit; hit = strstr ( hit + 1, match)) {
        int lead = hit == base || isspace ( (unsigned char)hit[-1]);
        int trail = 0 == hit[len] || isspace ( (unsigned char)hit[len]);
        if ( lead && trail) {
            return hit;
        }
    }
    return NULL;
}

/* Length of [begin, end) once trailing whitespace is removed. */
static int trimmed_span ( const char *begin, const char *end) {
    while ( end > begin && isspace ( (unsigned char)end[-1])) {
        --end;
    }
    return (int)( end - begin);
}

/*
 * Demo: collect every field of the sample string that lies between
 * standalone "AND" words into a NULL-terminated token array via addtoken(),
 * show the array with showtokens(), then release it with freetokens().
 * "AND" embedded in a longer word (as in "SANDY") is not a delimiter, and
 * stored fields carry no leading or trailing whitespace.
 *
 * The scan stops as soon as no further delimiter exists, so the cursor is
 * never advanced beyond the terminating zero.
 *
 * NOTE(review): showtokens() is only declared in the visible part of this
 * file; its definition must be supplied elsewhere.
 */
int main ( void) {
    char *myTestString = " AND SANDY AND Hello there test AND AND test2 AND test3 ";
    char *match = "AND";
    char *start = skip_whitespace ( myTestString);
    char *find = NULL;
    char **tokens = NULL;
    int items = 0;
    size_t len = strlen ( match);

    while ( ( find = find_separator ( myTestString, start, match, len))) {
        int span = trimmed_span ( start, find);
        if ( span > 0) {//empty fields (adjacent delimiters) are dropped
            tokens = addtoken ( tokens, &items, start, span);
        }
        start = skip_whitespace ( find + len);
    }
    //whatever follows the last delimiter is the final field
    int tail = trimmed_span ( start, start + strlen ( start));
    if ( tail > 0) {
        tokens = addtoken ( tokens, &items, start, tail);
    }
    showtokens ( tokens);
    tokens = freetokens ( tokens);
    return 0;
}
/*
 * Append a copy of the first size bytes of text to the NULL-terminated
 * array tokens, which currently holds *count strings.
 *
 * Returns the (possibly moved) array. On success *count is incremented.
 * If growing the array fails, the old array is returned untouched; if
 * allocating the copy fails, the grown array is returned with *count
 * unchanged. Both failures are reported on stderr.
 */
char **addtoken ( char **tokens, int *count, char *text, int size) {
    const int slot = *count;
    char **grown = realloc ( tokens, sizeof *tokens * ( slot + 2));

    if ( grown == NULL) {
        fprintf ( stderr, "problem realloc tokens\n");
        return tokens;
    }
    grown[slot + 1] = NULL;//sentinel after the new entry

    char *copy = malloc ( size + 1);
    grown[slot] = copy;//NULL here keeps the array terminated at slot
    if ( copy == NULL) {
        fprintf ( stderr, "problem realloc tokens[]\n");
        return grown;
    }

    memmove ( copy, text, size);
    copy[size] = 0;//terminate
    *count = slot + 1;
    return grown;
}
/*
 * Free every string in the NULL-terminated array tokens, then the array
 * itself. A NULL array is accepted. Always returns NULL so the caller can
 * reset its pointer in the same statement.
 */
char **freetokens ( char **tokens) {
    if ( tokens != NULL) {
        for ( char **cursor = tokens; *cursor != NULL; ++cursor) {
            free ( *cursor);
        }
    }
    free ( tokens);
    return NULL;
}
strstr()
是您正在寻找的工具。它可以在另一个字符串中定位一个字符串。
这是一个具有这些额外规格的简单解决方案:
- 返回值是一个含 (n+1) 个条目的数组,最后一个条目是空指针。
- 分隔符字符串可以出现在任何位置,包括单词内部。
- 子字符串会被修剪:删除开头和结尾的空白字符。
- 子字符串使用
strndup()
分配,该函数已在 POSIX 中标准化。
- 分隔符字符串的长度必须至少为 1。
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Return a newly allocated copy of the first n bytes of s with leading and
 * trailing whitespace removed. The copy is always NUL-terminated and, like
 * strndup(), ends early at a NUL found inside the range.
 *
 * The caller owns the result and must free() it. Returns NULL if the
 * allocation fails.
 *
 * Implemented with malloc/memcpy rather than strndup(), which is POSIX and
 * not declared by <string.h> under strict ISO C11.
 */
char *strdup_trim(const char *s, size_t n) {
    while (n > 0 && isspace((unsigned char)*s)) {
        s++;
        n--;
    }
    while (n > 0 && isspace((unsigned char)s[n - 1])) {
        n--;
    }

    /* Honour an embedded terminator the same way strndup() would. */
    const char *nul = memchr(s, '\0', n);
    if (nul != NULL) {
        n = (size_t)(nul - s);
    }

    char *copy = malloc(n + 1);
    if (copy == NULL) {
        return NULL;
    }
    memcpy(copy, s, n);
    copy[n] = '\0';
    return copy;
}
/*
 * Split str at every occurrence of sep (which may appear anywhere, even
 * inside a word) and return the whitespace-trimmed fields.
 *
 * Returns a malloc'ed, NULL-terminated array of malloc'ed strings holding
 * one entry per field (number of separators + 1); release it with
 * free_split(). Returns NULL if sep is empty or if any allocation fails, in
 * which case everything allocated so far is released first.
 */
char **split(const char *str, const char *sep) {
    const size_t sep_len = strlen(sep);
    if (sep_len == 0)
        return NULL;

    /* First pass: count separators to size the array once. */
    size_t n = 0;
    for (const char *p = str; (p = strstr(p, sep)) != NULL; p += sep_len)
        n++;

    /* n + 1 fields plus the NULL sentinel. */
    char **a = malloc(sizeof(*a) * (n + 2));
    if (a == NULL)
        return NULL;

    /* Second pass: copy each field. */
    size_t i = 0;
    const char *field = str;
    for (;;) {
        const char *hit = strstr(field, sep);
        size_t len = hit != NULL ? (size_t)(hit - field) : strlen(field);

        a[i] = strdup_trim(field, len);
        if (a[i] == NULL) {
            /* A NULL in the middle would silently truncate the list and
             * leak later entries, so give up cleanly instead. */
            while (i > 0)
                free(a[--i]);
            free(a);
            return NULL;
        }
        i++;

        if (hit == NULL)
            break;
        field = hit + sep_len;
    }
    a[i] = NULL;
    return a;
}
void free_split(char **a) {
if (a) {
for (size_t i = 0; a[i]; i++)
free(a[i]);
free(a);
}
}
/*
 * Run split(str, sep) and print the call together with the resulting fields
 * as "split('<str>', '<sep>') -> { 'a', 'b' }", then release the result.
 *
 * split() may return NULL; that case is reported in the output instead of
 * being dereferenced.
 */
void test(const char *str, const char *sep) {
    char **a = split(str, sep);

    printf("split('%s', '%s') -> {", str, sep);
    if (a == NULL) {
        printf(" <split failed> }\n");
        return;
    }
    for (size_t i = 0; a[i] != NULL; i++) {
        /* Every entry except the first is preceded by a comma. */
        printf("%s '%s'", i == 0 ? "" : ",", a[i]);
    }
    printf(" }\n");
    free_split(a);
}
/* Demo driver: runs test() on three sample sentences, all split on "AND". */
int main() {
    static const char *const samples[] = {
        "Hello AND test AND test2",
        "Hi AND there AND test AND test2",
        "Hello there AND test",
    };

    for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i++) {
        test(samples[i], "AND");
    }
    return 0;
}
输出:
split('Hello AND test AND test2', 'AND') -> { 'Hello', 'test', 'test2' }
split('Hi AND there AND test AND test2', 'AND') -> { 'Hi', 'there', 'test', 'test2' }
split('Hello there AND test', 'AND') -> { 'Hello there', 'test' }
假设我有这个字符串:
char *myTestString = "Hello AND test AND test2";
我想将其分解为集合 { Hello
、test
、test2
},我最终可以对其进行迭代。
或者我有
char *myTestString2 = "Hi AND there AND test AND test2";
我想将其分解为集合 { Hi
、there
、test
、test2
},稍后我可以对其进行迭代。
如何使用 C 实现此目的?
编辑:
另一个例子是拆分 "Hello there AND test"
应该给出集合 { Hello there
, test
}。
为澄清起见,"AND"
是此处的分隔符。
注意:正如其他人提到的,strtok()
不适合字符串文字,在这种情况下,您应该使用 Chux 的答案 (strcspn
),但如果这不是问题,您可以使用存储在数组中的字符串,然后继续阅读。最后的手段是使用字符串文字的副本。
首先,您必须决定要为您的集合使用哪种数据结构(例如,一个简单的链表,您在插入之前检查重复项)。
然后,将 strtok()
用于您的字符串,如果当前标记不同于 "AND"
(或任何其他要忽略的字符串 - 您也会有一组忽略的字符串), 然后将其插入到集合中,否则继续下一个token。
这是一个基本的完整最小示例,可以帮助您入门:
#include <stdio.h>
#include <string.h>
#define N 3 // Max size of set
#define LEN 32 // Max length of word - 1
/*
 * Demo: tokenize a writable string on spaces with strtok() and store every
 * token that differs from the ignored word "AND" in a fixed-size table.
 *
 * Uses the file-level macros N (number of slots) and LEN (bytes per slot).
 * Tokens that do not fit are reported on stderr instead of overflowing set[].
 */
int main(void)
{
    char set[N][LEN] = {0};
    const char *ignore_str = "AND";
    char str[] = "Hello AND test AND test2"; /* an array, because strtok() writes into it */
    int i = 0;

    printf("Splitting string \"%s\" into tokens:\n", str);

    for (char *pch = strtok(str, " "); pch != NULL; pch = strtok(NULL, " ")) {
        printf("%s\n", pch);
        if (strcmp(pch, ignore_str) == 0) {
            continue; /* the delimiter word itself is never stored */
        }
        if (i >= N) {
            /* Without this guard the copy below would write past set[N - 1]. */
            fprintf(stderr, "set is full (%d entries), dropping \"%s\"\n", N, pch);
            continue;
        }
        /* Bounded copy: an over-long token is truncated to LEN - 1 characters. */
        snprintf(set[i], LEN, "%s", pch);
        ++i;
    }

    printf("My set is: {");
    for (int j = 0; j < i; ++j) {
        printf("%s, ", set[j]);
    }
    printf("}\n");
    return 0;
}
输出:
Splitting string "Hello AND test AND test2" into tokens:
Hello
AND
test
AND
test2
My set is: {Hello, test, test2, }
在这里,我使用数组来表示集合,假设集合的最大大小为3。当然,您可以使用更动态的方法(例如动态内存分配数组或列表)。
当代码不想修改源字符串时,使用 strcspn(s, delimit)
查找 s
中不含 delimit 内任何字符的初始部分
。它返回该部分的长度(即偏移量)。
使用 strspn(s, delimit)
查找 s
中完全由 delimit 内的字符组成的初始部分
。它同样返回该部分的长度(即偏移量)。
给你。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Split s1 into words separated by spaces/tabs and return every word that is
 * not exactly equal to s2.
 *
 * Returns a malloc'ed, NULL-terminated array of malloc'ed strings; the caller
 * frees each string and then the array. Returns NULL if any allocation fails
 * (everything allocated so far is released first).
 *
 * A word is skipped only when it matches s2 in full, so "ANDY" is kept when
 * s2 is "AND".
 */
char **split(const char *s1, const char *s2)
{
    const char *delim = " \t";
    const size_t skip_len = strlen(s2);
    size_t count = 0; /* number of words stored so far */
    char **tokens = malloc(sizeof *tokens);

    if (tokens == NULL) {
        return NULL;
    }
    tokens[0] = NULL;

    while (*s1 != '\0') {
        s1 += strspn(s1, delim);
        if (*s1 == '\0') {
            break; /* only trailing whitespace was left */
        }

        const char *word = s1;
        const size_t word_len = strcspn(s1, delim);
        s1 += word_len;

        /* Compare the whole word, not just a prefix of it. */
        if (word_len == skip_len && memcmp(word, s2, skip_len) == 0) {
            continue;
        }

        /* Room for the new word plus the NULL sentinel. */
        char **grown = realloc(tokens, (count + 2) * sizeof *tokens);
        if (grown == NULL) {
            goto fail;
        }
        tokens = grown;

        char *copy = malloc(word_len + 1);
        if (copy == NULL) {
            goto fail;
        }
        memcpy(copy, word, word_len);
        copy[word_len] = '\0';

        tokens[count++] = copy;
        tokens[count] = NULL;
    }
    return tokens;

fail:
    for (size_t i = 0; i < count; i++) {
        free(tokens[i]);
    }
    free(tokens);
    return NULL; /* never hand a freed array back to the caller */
}
/*
 * Demo driver: runs split() on a sample sentence, prints one token per line,
 * then releases every token and finally the token array itself.
 */
int main(void)
{
    const char *s1 = "Hi AND there AND test AND test2";
    const char *s2 = "AND";
    char **tokens = split( s1, s2 );

    if ( tokens == NULL )
    {
        return 0; /* nothing to print or free when split() hands back NULL */
    }

    size_t count = 0;
    while ( tokens[count] != NULL )
    {
        puts( tokens[count] );
        ++count;
    }

    for ( size_t i = 0; i < count; i++ )
    {
        free( tokens[i] );
    }
    free( tokens );

    return 0;
}
程序输出为
Hi
there
test
test2
如果内存分配失败,该函数返回 NULL。否则,它返回一个指向元素类型为 char * 的数组的指针,该数组的最后一个元素为空指针。
源字符串中的单词由制表符和空格分隔。您可以根据需要更改分隔符。
在您对我之前的解决方案发表评论后,您似乎需要以下内容
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Split s1 at every occurrence of the separator string s2.
 *
 * Leading spaces/tabs of each field are skipped and empty fields (adjacent
 * separators, or a separator at the very start) are dropped. Whitespace
 * between a field and the separator that follows it is kept, so
 * "Hi there AND x" yields "Hi there " and "x".
 *
 * Returns a malloc'ed, NULL-terminated array of malloc'ed strings; the caller
 * frees each string and then the array. Returns NULL if s2 is empty or if an
 * allocation fails (everything allocated so far is released first).
 */
char **split(const char *s1, const char *s2)
{
    const char *delim = " \t";
    const size_t sep_len = strlen(s2);
    size_t count = 0; /* number of fields stored so far */

    /* An empty separator matches at every position; the skip loop below
     * would then never terminate. */
    if (sep_len == 0) {
        return NULL;
    }

    char **tokens = malloc(sizeof *tokens);
    if (tokens == NULL) {
        return NULL;
    }
    tokens[0] = NULL;

    for (;;) {
        /* Skip whitespace and any separators that would start an empty field. */
        for (;;) {
            s1 += strspn(s1, delim);
            if (strncmp(s1, s2, sep_len) != 0) {
                break;
            }
            s1 += sep_len;
        }
        if (*s1 == '\0') {
            break;
        }

        const char *next = strstr(s1, s2);
        const size_t len = next != NULL ? (size_t)(next - s1) : strlen(s1);

        /* Room for the new field plus the NULL sentinel. */
        char **grown = realloc(tokens, (count + 2) * sizeof *tokens);
        if (grown == NULL) {
            goto fail;
        }
        tokens = grown;

        char *copy = malloc(len + 1);
        if (copy == NULL) {
            goto fail;
        }
        memcpy(copy, s1, len);
        copy[len] = '\0';

        tokens[count++] = copy;
        tokens[count] = NULL;

        s1 += len;
        if (next != NULL) {
            s1 += sep_len; /* step over the separator that ended this field */
        }
    }
    return tokens;

fail:
    for (size_t i = 0; i < count; i++) {
        free(tokens[i]);
    }
    free(tokens);
    return NULL; /* never hand a freed array back to the caller */
}
/*
 * Demo driver: runs split() on a sample sentence that contains one "AND"
 * separator, prints one field per line, then releases every field and the
 * array that holds them.
 */
int main(void)
{
    const char *s1 = "Hi there AND test test2";
    const char *s2 = "AND";
    char **tokens = split( s1, s2 );

    if ( tokens == NULL )
    {
        return 0; /* nothing to print or free when split() hands back NULL */
    }

    size_t count = 0;
    while ( tokens[count] != NULL )
    {
        puts( tokens[count] );
        ++count;
    }

    for ( size_t i = 0; i < count; i++ )
    {
        free( tokens[i] );
    }
    free( tokens );

    return 0;
}
程序输出为
Hi there
test test2
也许您还需要删除提取的子字符串的尾随空格,我希望您可以自己做。:)。
strstr
可以用来定位子串。检查匹配位置之前的字符是否为空白字符,以及匹配之后的字符是否为空白字符或字符串结束符 '\0'。
根据需要删除空白字符。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/* Return a pointer to the first non-whitespace character at or after s. */
static const char *skip_spaces ( const char *s) {
    while ( isspace ( (unsigned char)*s)) {
        ++s;
    }
    return s;
}

/*
 * Find the next occurrence of match at or after from that stands alone as a
 * word: it must begin at base or follow whitespace, and must be followed by
 * whitespace or the terminating zero. Returns NULL when there is none.
 * base is the start of the whole string, so hit[-1] is only read when it
 * lies inside the string.
 */
static const char *next_delim ( const char *base, const char *from, const char *match, size_t len) {
    for ( const char *hit = strstr ( from, match); hit; hit = strstr ( hit + 1, match)) {
        int lead = hit == base || isspace ( (unsigned char)hit[-1]);
        int trail = 0 == hit[len] || isspace ( (unsigned char)hit[len]);
        if ( lead && trail) {
            return hit;
        }
    }
    return NULL;
}

/* Print [begin, end) without trailing whitespace; prints nothing if that is empty. */
static void print_trimmed ( const char *begin, const char *end) {
    while ( end > begin && isspace ( (unsigned char)end[-1])) {
        --end;
    }
    if ( end > begin) {
        printf ( "%.*s\n", (int)( end - begin), begin);
    }
}

/*
 * Demo: print every field of the sample string that lies between standalone
 * "AND" words, one per line, with surrounding whitespace removed. "AND"
 * embedded in a longer word (as in "SANDY") is not a delimiter.
 *
 * The scan stops as soon as no further delimiter exists, so the cursor is
 * never advanced beyond the terminating zero.
 */
int main ( void) {
    const char *myTestString = " AND SANDY AND Hello there AND AND test AND test2 AND test3 ";
    const char *match = "AND";
    size_t len = strlen ( match);
    const char *start = skip_spaces ( myTestString);
    const char *find = NULL;

    while ( ( find = next_delim ( myTestString, start, match, len))) {
        print_trimmed ( start, find);
        start = skip_spaces ( find + len);//skip the delimiter and following whitespace
    }
    //whatever follows the last delimiter is the final field
    print_trimmed ( start, start + strlen ( start));
    return 0;
}
分配内存给char**
,分配内存并复制每个token。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
char **freetokens ( char **tokens);
void showtokens ( char **tokens);
char **addtoken ( char **tokens, int *count, char *text, int size);
/* Return a pointer to the first non-whitespace character at or after s. */
static char *skip_whitespace ( char *s) {
    while ( isspace ( (unsigned char)*s)) {
        ++s;
    }
    return s;
}

/*
 * Find the next occurrence of match at or after from that stands alone as a
 * word: it must begin at base or follow whitespace, and must be followed by
 * whitespace or the terminating zero. Returns NULL when there is none.
 * base is the start of the whole string, so hit[-1] is only read when it
 * lies inside the string.
 */
static char *find_separator ( char *base, char *from, const char *match, size_t len) {
    for ( char *hit = strstr ( from, match); hit; hit = strstr ( hit + 1, match)) {
        int lead = hit == base || isspace ( (unsigned char)hit[-1]);
        int trail = 0 == hit[len] || isspace ( (unsigned char)hit[len]);
        if ( lead && trail) {
            return hit;
        }
    }
    return NULL;
}

/* Length of [begin, end) once trailing whitespace is removed. */
static int trimmed_span ( const char *begin, const char *end) {
    while ( end > begin && isspace ( (unsigned char)end[-1])) {
        --end;
    }
    return (int)( end - begin);
}

/*
 * Demo: collect every field of the sample string that lies between
 * standalone "AND" words into a NULL-terminated token array via addtoken(),
 * show the array with showtokens(), then release it with freetokens().
 * "AND" embedded in a longer word (as in "SANDY") is not a delimiter, and
 * stored fields carry no leading or trailing whitespace.
 *
 * The scan stops as soon as no further delimiter exists, so the cursor is
 * never advanced beyond the terminating zero.
 *
 * NOTE(review): showtokens() is only declared in the visible part of this
 * file; its definition must be supplied elsewhere.
 */
int main ( void) {
    char *myTestString = " AND SANDY AND Hello there test AND AND test2 AND test3 ";
    char *match = "AND";
    char *start = skip_whitespace ( myTestString);
    char *find = NULL;
    char **tokens = NULL;
    int items = 0;
    size_t len = strlen ( match);

    while ( ( find = find_separator ( myTestString, start, match, len))) {
        int span = trimmed_span ( start, find);
        if ( span > 0) {//empty fields (adjacent delimiters) are dropped
            tokens = addtoken ( tokens, &items, start, span);
        }
        start = skip_whitespace ( find + len);
    }
    //whatever follows the last delimiter is the final field
    int tail = trimmed_span ( start, start + strlen ( start));
    if ( tail > 0) {
        tokens = addtoken ( tokens, &items, start, tail);
    }
    showtokens ( tokens);
    tokens = freetokens ( tokens);
    return 0;
}
/*
 * Append a copy of the first size bytes of text to the NULL-terminated
 * array tokens, which currently holds *count strings.
 *
 * Returns the (possibly moved) array. On success *count is incremented.
 * If growing the array fails, the old array is returned untouched; if
 * allocating the copy fails, the grown array is returned with *count
 * unchanged. Both failures are reported on stderr.
 */
char **addtoken ( char **tokens, int *count, char *text, int size) {
    const int slot = *count;
    char **grown = realloc ( tokens, sizeof *tokens * ( slot + 2));

    if ( grown == NULL) {
        fprintf ( stderr, "problem realloc tokens\n");
        return tokens;
    }
    grown[slot + 1] = NULL;//sentinel after the new entry

    char *copy = malloc ( size + 1);
    grown[slot] = copy;//NULL here keeps the array terminated at slot
    if ( copy == NULL) {
        fprintf ( stderr, "problem realloc tokens[]\n");
        return grown;
    }

    memmove ( copy, text, size);
    copy[size] = 0;//terminate
    *count = slot + 1;
    return grown;
}
/*
 * Free every string in the NULL-terminated array tokens, then the array
 * itself. A NULL array is accepted. Always returns NULL so the caller can
 * reset its pointer in the same statement.
 */
char **freetokens ( char **tokens) {
    if ( tokens != NULL) {
        for ( char **cursor = tokens; *cursor != NULL; ++cursor) {
            free ( *cursor);
        }
    }
    free ( tokens);
    return NULL;
}
strstr()
是您正在寻找的工具。它可以在另一个字符串中定位一个字符串。
这是一个具有这些额外规格的简单解决方案:
- 返回值是一个含 (n+1) 个条目的数组,最后一个条目是空指针。
- 分隔符字符串可以出现在任何位置,包括单词内部。
- 子字符串会被修剪:删除开头和结尾的空白字符。
- 子字符串使用
strndup()
分配,该函数已在 POSIX 中标准化。
- 分隔符字符串的长度必须至少为 1。
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Return a newly allocated copy of the first n bytes of s with leading and
 * trailing whitespace removed. The copy is always NUL-terminated and, like
 * strndup(), ends early at a NUL found inside the range.
 *
 * The caller owns the result and must free() it. Returns NULL if the
 * allocation fails.
 *
 * Implemented with malloc/memcpy rather than strndup(), which is POSIX and
 * not declared by <string.h> under strict ISO C11.
 */
char *strdup_trim(const char *s, size_t n) {
    while (n > 0 && isspace((unsigned char)*s)) {
        s++;
        n--;
    }
    while (n > 0 && isspace((unsigned char)s[n - 1])) {
        n--;
    }

    /* Honour an embedded terminator the same way strndup() would. */
    const char *nul = memchr(s, '\0', n);
    if (nul != NULL) {
        n = (size_t)(nul - s);
    }

    char *copy = malloc(n + 1);
    if (copy == NULL) {
        return NULL;
    }
    memcpy(copy, s, n);
    copy[n] = '\0';
    return copy;
}
/*
 * Split str at every occurrence of sep (which may appear anywhere, even
 * inside a word) and return the whitespace-trimmed fields.
 *
 * Returns a malloc'ed, NULL-terminated array of malloc'ed strings holding
 * one entry per field (number of separators + 1); release it with
 * free_split(). Returns NULL if sep is empty or if any allocation fails, in
 * which case everything allocated so far is released first.
 */
char **split(const char *str, const char *sep) {
    const size_t sep_len = strlen(sep);
    if (sep_len == 0)
        return NULL;

    /* First pass: count separators to size the array once. */
    size_t n = 0;
    for (const char *p = str; (p = strstr(p, sep)) != NULL; p += sep_len)
        n++;

    /* n + 1 fields plus the NULL sentinel. */
    char **a = malloc(sizeof(*a) * (n + 2));
    if (a == NULL)
        return NULL;

    /* Second pass: copy each field. */
    size_t i = 0;
    const char *field = str;
    for (;;) {
        const char *hit = strstr(field, sep);
        size_t len = hit != NULL ? (size_t)(hit - field) : strlen(field);

        a[i] = strdup_trim(field, len);
        if (a[i] == NULL) {
            /* A NULL in the middle would silently truncate the list and
             * leak later entries, so give up cleanly instead. */
            while (i > 0)
                free(a[--i]);
            free(a);
            return NULL;
        }
        i++;

        if (hit == NULL)
            break;
        field = hit + sep_len;
    }
    a[i] = NULL;
    return a;
}
void free_split(char **a) {
if (a) {
for (size_t i = 0; a[i]; i++)
free(a[i]);
free(a);
}
}
/*
 * Run split(str, sep) and print the call together with the resulting fields
 * as "split('<str>', '<sep>') -> { 'a', 'b' }", then release the result.
 *
 * split() may return NULL; that case is reported in the output instead of
 * being dereferenced.
 */
void test(const char *str, const char *sep) {
    char **a = split(str, sep);

    printf("split('%s', '%s') -> {", str, sep);
    if (a == NULL) {
        printf(" <split failed> }\n");
        return;
    }
    for (size_t i = 0; a[i] != NULL; i++) {
        /* Every entry except the first is preceded by a comma. */
        printf("%s '%s'", i == 0 ? "" : ",", a[i]);
    }
    printf(" }\n");
    free_split(a);
}
/* Demo driver: runs test() on three sample sentences, all split on "AND". */
int main() {
    static const char *const samples[] = {
        "Hello AND test AND test2",
        "Hi AND there AND test AND test2",
        "Hello there AND test",
    };

    for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i++) {
        test(samples[i], "AND");
    }
    return 0;
}
输出:
split('Hello AND test AND test2', 'AND') -> { 'Hello', 'test', 'test2' }
split('Hi AND there AND test AND test2', 'AND') -> { 'Hi', 'there', 'test', 'test2' }
split('Hello there AND test', 'AND') -> { 'Hello there', 'test' }