如何在使用 strtok() 之后正确地返回多维数组

How to return multi dimension array properly after using strtok()

我正在尝试制作一个接受字符串和定界符的程序,

并使用分隔符将字符串分成一系列标记。

最后将每个token存储到一个多维数组中。

代码:

/*
 * Splits a copy of str on the single character delim with strtok(), stores
 * the first four results in a local pointer array, prints them, and returns
 * that array.
 *
 * NOTE(review): as written this function has undefined behavior:
 *   - `ar` is a local (variable-length) array, so the pointer returned by
 *     `return ar;` dangles as soon as the function returns. The tokens it
 *     holds also point into the local buffer `strcopy`.
 *   - `&delim` is the address of a single char with no terminating '\0',
 *     but strtok() expects a NUL-terminated string of delimiters.
 *   - strcpy() into the 50-byte `strcopy` overflows for inputs of 50 or
 *     more characters.
 *   - The loops always handle exactly 4 entries; with fewer tokens the
 *     remaining entries are NULL and are still passed to printf("%s").
 */
char** get_tokens(const char* str, char delim) {
  int i=4;
  
  char *ar[i];  // local VLA of 4 token pointers; its storage ends with this call

  const char* delim2 = &delim;// intended as strtok()'s delimiter string, but it is not NUL-terminated
  char strcopy[50];
  strcpy(strcopy,str);  // strtok() modifies its input, so work on a writable copy
  char* token;
  token = strtok(strcopy,delim2);// first token; leading delimiters are skipped

  int k;
  for (k=0;k<i;k++){
    ar[k] = token;  // token points into strcopy (NULL once the tokens run out)

    token = strtok(NULL,delim2);  // continue scanning the same string

  }

  int n;
   for (n=0;n<i;n++)
     printf("ar[%d] is %s\n",n,ar[n]);
  
  return ar;  // returns the address of a local array (compiler warns: -Wreturn-stack-address)
     
}

/*
 * Calls get_tokens() on a sample string and prints the four entries of the
 * returned array.
 */
int main(){
  
    // NOTE(review): get_tokens() returns the address of its own local array,
    // so `tokens` is already dangling here and reading it is undefined.
    char** tokens = get_tokens("++All+Along+the+Watchtower++", '+');

    // The count 4 is hard-coded; the returned array carries no length or terminator.
    for (int k =0;k<4;k++){
      printf("tokens[%d] is this %s\n",k,tokens[k]);
    }

  return 0;
}

函数 strtok() 工作正常,输出为

ar[0] is All
ar[1] is Along
ar[2] is the
ar[3] is Watchtower

但在 main 函数中,我希望数组 tokens 得到完全相同的结果,但实际输出是

tokens[0] is this All
tokens[1] is this (null)
tokens[2] is this 
tokens[3] is this (null)

所以我猜它没有正确返回 ar,因为在索引 0 之后它返回 null。

此外,我收到一条警告:

warning: address of stack memory associated with local variable 'ar' returned [-Wreturn-stack-address]
  return ar;
         ^~
1 warning generated.

你知道为什么会这样吗?

整个输出是

ar[0] is All
ar[1] is Along
ar[2] is the
ar[3] is Watchtower
tokens[0] is this All
tokens[1] is this (null)
tokens[2] is this 
tokens[3] is this (null)

唉,您的代码存在不止一个根本性问题。

  • 您试图返回一个 VLA(变长数组)。这行不通;不要这样做。
  • 您没有给分隔符字符串加上空终止符('\0')。
  • 您的函数无法自行确定标记(token)的数量。

但是,我认为这是一个有趣的编程练习,于是写了一个通用的解决方案。下面是头文件,其中包含文档,以及完全可选的、用宏实现默认参数的小技巧(感谢 Braden Steffaniak 出色的宏技巧,链接见代码注释):

split.h

// Copyright 2021 Michael Thomas Greer.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
//  https://www.boost.org/LICENSE_1_0.txt )

/*

  char **
  split(
    const char * s,
    const char * sep         = NULL,  // --> whitespace: " \f\n\r\v\t"
    bool         is_dup_s    = true,  // --> non-destructive of source?
    int          granularity = 0      // --> default granularity
  );

  Function:
    Split a string into tokens, much like strtok(). Tokens are delimited
    by the argument separator characters. Empty tokens are not returned.

  Returns:
    • a NULL-terminated array of pointers to the tokens in s.
      You must free() the resulting array. Do NOT free individual tokens!
    • NULL on failure (due to a memory re/allocation failure).

  Arguments:
    s           • The source string to tokenize.
    sep         • Separator characters. Defaults to all whitespace.
    is_dup_s    • By default the source string is duplicated so that
                  the tokenization can be done non-destructively (for
                  example, on literals). If you don't care about the
                  source, or the source is sufficiently large that
                  duplication could be a problem, then turn this off.
    granularity • The algorithm works by building a table of token
                  indices. This is the growth size of that table.
                  It defaults to a reasonably small size. But if you
                  have a good idea of the number of tokens you will
                  typically generate, set it to that.

  Uses totally-optional macro magic for elided default arguments.
  No macros == no elided default argument magic. (You can still specify
  default values for arguments, though.)
*/

#ifndef DUTHOMHAS_SPLIT_H
#define DUTHOMHAS_SPLIT_H

#include <stdbool.h>  // bool, true -- used by the prototype and by the SPLITn defaults
#include <stddef.h>   // NULL       -- used by the SPLITn defaults (keeps this header self-contained)

// Tokenize s on the characters in sep. Returns a NULL-terminated array of
// token pointers that the caller must free(), or NULL on allocation failure.
char ** split( const char * s, const char * sep, bool is_dup_s, int granularity );

// Everything below is optional sugar: it lets callers write split( s ),
// split( s, sep ), split( s, sep, is_dup_s ) or the full four-argument form.
// Argument-counting technique from https://stackoverflow.com/a/24028231/2706707

// Places a macro name next to its parenthesized argument list so the pair is
// rescanned as an invocation (presumably a workaround for compilers that pass
// __VA_ARGS__ through as a single argument -- confirm against the link above).
#define SPLIT_GLUE(x, y) x y

// Counts the arguments (1..4): the caller's arguments are followed by
// 4, 3, 2, 1, 0 and the fifth item of the combined list is selected.
// Despite the "MAX5" name, at most four arguments are counted correctly.
#define SPLIT_RETURN_ARG_COUNT(_1_, _2_, _3_, _4_, count, ...) count
#define SPLIT_EXPAND_ARGS(args) SPLIT_RETURN_ARG_COUNT args
#define SPLIT_COUNT_ARGS_MAX5(...) SPLIT_EXPAND_ARGS((__VA_ARGS__, 4, 3, 2, 1, 0))

// Builds the name SPLIT<count>. The extra levels of indirection make sure
// `count` is fully macro-expanded before it is pasted with ##.
#define SPLIT_OVERLOAD_MACRO2(name, count) name##count
#define SPLIT_OVERLOAD_MACRO1(name, count) SPLIT_OVERLOAD_MACRO2(name, count)
#define SPLIT_OVERLOAD_MACRO(name, count) SPLIT_OVERLOAD_MACRO1(name, count)

#define SPLIT_CALL_OVERLOAD(name, ...) SPLIT_GLUE(SPLIT_OVERLOAD_MACRO(name, SPLIT_COUNT_ARGS_MAX5(__VA_ARGS__)), (__VA_ARGS__))

// Function-like macro that shadows the function name. Inside the SPLITn
// macros the name is written as (split): a function-like macro is only
// expanded when its name is directly followed by '(', so (split)( ... )
// calls the real function.
#define split(...) SPLIT_CALL_OVERLOAD( SPLIT, __VA_ARGS__ )
#define SPLIT1(s)           (split)( s, NULL, true, 0 )
#define SPLIT2(s,sep)       (split)( s, sep,  true, 0 )
#define SPLIT3(s,sep,ids)   (split)( s, sep,  ids,  0 )
#define SPLIT4(s,sep,ids,g) (split)( s, sep,  ids,  g )

#endif

这里是重要的一点:

split.c

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

/*
 * Split s into tokens delimited by any of the characters in sep.
 * Empty tokens are never produced (runs of separators count as one).
 *
 * Parameters:
 *   s           - string to tokenize. NULL yields NULL.
 *   sep         - separator characters; NULL means whitespace " \f\n\r\v\t".
 *   is_dup_s    - true:  s is copied into the returned block and left intact.
 *                 false: s itself is modified ('\0' written after each token),
 *                        so it must be writable even though it is taken as
 *                        const char *.
 *   granularity - number of entries the temporary token table grows by;
 *                 values <= 0 select the default of 32.
 *
 * Returns:
 *   A single malloc()ed block holding a NULL-terminated array of token
 *   pointers (followed by the copy of s when is_dup_s is true). Release it
 *   with one free(); do not free individual tokens.
 *   NULL if s is NULL or an allocation fails.
 */
char ** split( const char * s, const char * sep, bool is_dup_s, int granularity )
{
  typedef size_t slot[ 2 ];  // [0] = offset of token start, [1] = offset one past token end

  if (!s) return NULL;
  if (!sep) sep = " \f\n\r\v\t";

  // Normalize the growth step once, so the table really grows when it fills
  // up even if the caller passed 0 (the default) or a negative value.
  size_t   step       = (granularity > 0) ? (size_t)granularity : 32;
  size_t   max_slots  = step;
  size_t   num_slots  = 0;
  size_t   index      = 0;
  slot   * slots      = malloc( sizeof *slots * max_slots );

  if (!slots) return NULL;

  // Pass 1: record where every token starts and ends
  while (s[ index ])
  {
    index += strspn( s + index, sep );  // skip leading separators --> beginning of next token
    if (!s[ index ]) break;             // only separators were left: no more tokens

    if (num_slots == max_slots)         // table full: grow it by one step
    {
      slot * new_slots = realloc( slots, sizeof *slots * (max_slots + step) );
      if (!new_slots) { free( slots ); return NULL; }
      slots      = new_slots;
      max_slots += step;
    }

    slots[ num_slots ][ 0 ] = index;    // beginning of token
    index += strcspn( s + index, sep ); // skip non-separators --> end of token
    slots[ num_slots ][ 1 ] = index;
    ++num_slots;
  }
  // Here s[ index ] is the terminating '\0', i.e. index == strlen( s ).

  // Pass 2: one block = pointer table (+1 for the NULL sentinel), optionally
  // followed by a private copy of the string.
  size_t   table_len = num_slots + 1;
  char  ** result    = malloc( sizeof *result * table_len + (is_dup_s ? index + 1 : 0) );
  if (result)
  {
    // The const is cast away on purpose for the in-place (is_dup_s == false) mode.
    char * d = is_dup_s ? (char *)(result + table_len) : (char *)s;
    if (is_dup_s) memcpy( d, s, index + 1 );

    result[ num_slots ] = NULL;
    for (size_t n = 0;  n < num_slots;  ++n)
    {
      result[ n ] = d + slots[ n ][ 0 ];
      d[ slots[ n ][ 1 ] ] = '\0';      // terminate the token, as strtok() would
    }
  }

  free( slots );
  return result;
}

下面是一些使用它的示例代码:

a.c

#include <stdio.h>
#include <stdlib.h>
#include "split.h"

/*
 * Print the label s, then every token of the NULL-terminated array ss on its
 * own numbered line, followed by a blank line.
 *
 * Takes ownership of ss and free()s it. ss may be NULL (split() reports an
 * allocation failure that way); in that case a short notice is printed
 * instead of the token list.
 */
void test( const char * s, char ** ss )
{
  printf( "%s\n", s );
  if (ss)
  {
    for (int n = 0;  ss[n];  ++n)
      printf( "  %d: \"%s\"\n", n, ss[n] );
  }
  else
  {
    printf( "  (split returned NULL)\n" );
  }
  free( ss );  // free( NULL ) is a no-op
  printf( "\n" );
}

// Runs one case: #x turns the expression text into a string that test()
// prints as the label, then x is evaluated and its result is passed to test().
#define TEST(x) test( #x , x )

// Exercises split() through the optional-argument macro with 1 to 4 arguments.
int main()
{
  // One argument: separators default to whitespace
  TEST( split( "Hello world! \n" ) );
  // Two arguments: both ',' and ' ' act as separators
  TEST( split( " 2, 3, 5, 7, 11, ",  /*sep*/", " ) );
  // Input made only of separators
  TEST( split( "::::", ":" ) );
  // Empty input
  TEST( split( "", ":" ) );
  // All four arguments given explicitly
  TEST( split( "", NULL, true, 15 ) );
  // Explicit NULL separator argument
  TEST( split( "a b c d e", NULL ) );
  // granularity of 1
  TEST( split( " - a---b   c - d - ", " -", true, 1 ) );

  // is_dup_s == false: split() works on s itself, so s must be a writable
  // array (not a string literal).
  char s[] = "Never trust a computer you can't throw out a window. --Abraham Lincoln";
  printf( "s = \"%s\"\n", s );
  TEST( split( s, " -.", false ) );
  printf( "Modified s will print only the first token: \"%s\"\n", s );
}

已在 Windows 10 上使用以下编译器测试:
  • MSVC 2019 (19.21.27702.2) cl /EHsc /W4 /Ox a.c split.c
  • LLVM/Clang 9.0.0 clang -Wall -Wextra -pedantic-errors -O3 -o a.exe a.c split.c

以及在 Ubuntu 20.04 上使用以下编译器测试:

  • GCC 9.3.0 gcc -Wall -Wextra -pedantic-errors -O3 a.c split.c
  • Clang 10.0.0 clang -Wall -Wextra -pedantic-errors -O3 a.c split.c

解释这种疯狂!

我知道你是初学者,而这些内容比你预想的要多得多。别担心,处理字符串和动态分配的内存实际上相当困难,很多人都经常出错。

这里使用的技巧是:借助库函数 strspn() 和 strcspn()(正是 strtok() 内部使用的那两个函数),为每个标记在字符串中的起始和结束位置建立一个临时索引列表。该列表可以根据需要动态增长。

一旦该列表完成,我们就分配足够的内存来存放每个标记的指针,再加 1 个指针(用于数组末尾的 NULL 指针),其后可选地跟着源字符串的副本。

然后我们只需根据索引计算出各个标记在字符串中的指针值(地址),并像 strtok() 那样修改字符串,在每个标记末尾写入空终止符。

结果是单个内存块,因此当用户完成对数组的迭代时,它可以直接传递给 free()。示例测试函数使用整数索引遍历数组,但字符串迭代器(指向 char 指针的指针)也可以:

char ** tokens = split( my_string, my_delimiters );  // Get tokens
for (char ** ptoken = tokens;  *ptoken;  ++ptoken)   // For each token
  printf( "  %s\n", *ptoken );                       //   (do something with it)
free( tokens );                                      // Free tokens

就是这样!