strtok()函数的实现

Implementation of strtok() function

我需要编写函数 strtok。下面是我的代码。问题是 - 我无法显示结果字符串。在代码中我使用 strcpy() 然后显示新数组。 是否可以仅使用指针显示字符串 *str?

#include <stdio.h>
#include <string.h>   

char* my_strtok(char* s, char* delm){
    char W[100];
    int i = 0, k = 0, j = 0;
    char *ptr;
    static char *Iterator;
    ptr = s;

    if (s == NULL){
        s = Iterator;
    }
    while (s[i] != '[=10=]'){
        j = 0;
        while (delm[j] != '[=10=]'){
            if (s[i] != delm[j])
                W[k] = s[i];
            else goto It;
            j++;
        }
        ptr++;
        i++;
        k++;
    }
It:
    W[i] = 0;
    Iterator = ++ptr;
    return W;
}

int main(void){
    char s[100];
    char delm[] = " ";
    gets(s);
    char newstr[100];
    char *str = my_strtok(s, delm);
    strcpy(newstr, str);
    printf("%s", newstr);
    return 0;
}

这里已经讨论了 strtok 的内部实现:

How does strtok() split the string into tokens in C?

在您的实现类型中(称其为您的类型,因为它与实际类型有很大不同),您没有为局部变量动态分配任何内存'W'。所以当你 return 它时,不能保证你会正确地接收到字符串,因为在本地分配给 'W' 的内存在调用函数中不再为它保留。除此之外,代码中还使用了许多不必要的变量。而且,它需要适当的重组。为了您更好地理解,我修改了您的代码(尽可能保持您的风格)以完成所需的工作:

#include <stdio.h>
#include <string.h>  
#include <malloc.h>

char* my_strtok(char* s, char* delm)
{
    static int currIndex = 0;
    if(!s || !delm || s[currIndex] == '[=10=]')
    return NULL;
    char *W = (char *)malloc(sizeof(char)*100);
    int i = currIndex, k = 0, j = 0;

    while (s[i] != '[=10=]'){
        j = 0;
        while (delm[j] != '[=10=]'){
            if (s[i] != delm[j])
                W[k] = s[i];
            else goto It;
            j++;
        }

        i++;
        k++;
    }
It:
    W[i] = 0;
    currIndex = i+1;
    //Iterator = ++ptr;
    return W;
}

int main(void)
{
    char s[100] = "my name is khan";
    char delm[] = " ";
    //char newstr[100];
    char *str = my_strtok(s, delm);
    while(str){
        printf("%s", str);
        free(str);
        str = my_strtok(s, delm);
    }

    return 0;
}

赏金免责声明

赏金:"Looking for an answer drawing from credible and/or official sources."

我认为 GNU GLibC constitutes a credible and official source, right? You can download GLibC 2.22 here.tar.gz (25mb)。


提取源后:

string/strtok.c

#include <string.h>


static char *olds;

#undef strtok

#ifndef STRTOK
# define STRTOK strtok
#endif

/* Parse S into tokens separated by characters in DELIM.
   If S is NULL, the last string strtok() was called with is
   used.  For example:
    char s[] = "-abc-=-def";
    x = strtok(s, "-");     // x = "abc"
    x = strtok(NULL, "-=");     // x = "def"
    x = strtok(NULL, "=");      // x = NULL
        // s = "abc[=10=]=-def[=10=]"
*/
char *
STRTOK (char *s, const char *delim)
{
  char *token;

  if (s == NULL)
    s = olds;

  /* Scan leading delimiters.  */
  s += strspn (s, delim);
  if (*s == '[=10=]')
    {
      olds = s;
      return NULL;
    }

  /* Find the end of the token.  */
  token = s;
  s = strpbrk (token, delim);
  if (s == NULL)
    /* This token finishes the string.  */
    olds = __rawmemchr (token, '[=10=]');
  else
    {
      /* Terminate the token and make OLDS point past it.  */
      *s = '[=10=]';
      olds = s + 1;
    }
  return token;
}


string/strspn.c

#include <string.h>

#undef strspn
#ifndef STRSPN
#define STRSPN strspn
#endif

/* Return the length of the maximum initial segment
   of S which contains only characters in ACCEPT.  */
size_t
STRSPN (const char *s, const char *accept)
{
  const char *p;
  const char *a;
  size_t count = 0;

  for (p = s; *p != '[=11=]'; ++p)
    {
      for (a = accept; *a != '[=11=]'; ++a)
    if (*p == *a)
      break;
      if (*a == '[=11=]')
    return count;
      else
    ++count;
    }

  return count;
}
libc_hidden_builtin_def (strspn)


string/strpbrk.c

#include <string.h>

#undef strpbrk

#ifndef STRPBRK
#define STRPBRK strpbrk
#endif

/* Find the first occurrence in S of any character in ACCEPT.  */
char *
STRPBRK (const char *s, const char *accept)
{
  while (*s != '[=12=]')
    {
      const char *a = accept;
      while (*a != '[=12=]')
    if (*a++ == *s)
      return (char *) s;
      ++s;
    }

  return NULL;
}
libc_hidden_builtin_def (strpbrk)


sysdeps/x86_64/rawmemchr.S

#include <sysdep.h>

    .text
ENTRY (__rawmemchr)
    movd    %rsi, %xmm1
    mov %rdi, %rcx

    punpcklbw %xmm1, %xmm1
    punpcklbw %xmm1, %xmm1

    and , %rcx
    pshufd  [=13=], %xmm1, %xmm1

    cmp , %rcx
    ja  L(crosscache)

    movdqu  (%rdi), %xmm0
    pcmpeqb %xmm1, %xmm0
/* Check if there is a match.  */
    pmovmskb %xmm0, %eax
    test    %eax, %eax

    jnz L(matches)
    add , %rdi
    and $-16, %rdi
    jmp L(loop_prolog)

    .p2align 4
L(crosscache):
    and , %rcx
    and $-16, %rdi
    movdqa  (%rdi), %xmm0

    pcmpeqb %xmm1, %xmm0
/* Check if there is a match.  */
    pmovmskb %xmm0, %eax
/* Remove the leading bytes.  */
    sar %cl, %eax
    test    %eax, %eax
    je  L(unaligned_no_match)
/* Check which byte is a match.  */
    bsf %eax, %eax

    add %rdi, %rax
    add %rcx, %rax
    ret

    .p2align 4
L(unaligned_no_match):
    add , %rdi

    .p2align 4
L(loop_prolog):
    movdqa  (%rdi), %xmm0
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    test    %eax, %eax
    jnz L(matches)

    movdqa  16(%rdi), %xmm2
    pcmpeqb %xmm1, %xmm2
    pmovmskb %xmm2, %eax
    test    %eax, %eax
    jnz L(matches16)

    movdqa  32(%rdi), %xmm3
    pcmpeqb %xmm1, %xmm3
    pmovmskb %xmm3, %eax
    test    %eax, %eax
    jnz L(matches32)

    movdqa  48(%rdi), %xmm4
    pcmpeqb %xmm1, %xmm4
    add , %rdi
    pmovmskb %xmm4, %eax
    test    %eax, %eax
    jnz L(matches0)

    test    [=13=]x3f, %rdi
    jz  L(align64_loop)

    movdqa  (%rdi), %xmm0
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    test    %eax, %eax
    jnz L(matches)

    movdqa  16(%rdi), %xmm2
    pcmpeqb %xmm1, %xmm2
    pmovmskb %xmm2, %eax
    test    %eax, %eax
    jnz L(matches16)

    movdqa  32(%rdi), %xmm3
    pcmpeqb %xmm1, %xmm3
    pmovmskb %xmm3, %eax
    test    %eax, %eax
    jnz L(matches32)

    movdqa  48(%rdi), %xmm3
    pcmpeqb %xmm1, %xmm3
    pmovmskb %xmm3, %eax

    add , %rdi
    test    %eax, %eax
    jnz L(matches0)

    and $-64, %rdi

    .p2align 4
L(align64_loop):
    movdqa  (%rdi), %xmm0
    movdqa  16(%rdi), %xmm2
    movdqa  32(%rdi), %xmm3
    movdqa  48(%rdi), %xmm4

    pcmpeqb %xmm1, %xmm0
    pcmpeqb %xmm1, %xmm2
    pcmpeqb %xmm1, %xmm3
    pcmpeqb %xmm1, %xmm4

    pmaxub  %xmm0, %xmm3
    pmaxub  %xmm2, %xmm4
    pmaxub  %xmm3, %xmm4
    pmovmskb %xmm4, %eax

    add , %rdi

    test    %eax, %eax
    jz  L(align64_loop)

    sub , %rdi

    pmovmskb %xmm0, %eax
    test    %eax, %eax
    jnz L(matches)

    pmovmskb %xmm2, %eax
    test    %eax, %eax
    jnz L(matches16)

    movdqa  32(%rdi), %xmm3
    pcmpeqb %xmm1, %xmm3

    pcmpeqb 48(%rdi), %xmm1
    pmovmskb %xmm3, %eax
    test    %eax, %eax
    jnz L(matches32)

    pmovmskb %xmm1, %eax
    bsf %eax, %eax
    lea 48(%rdi, %rax), %rax
    ret

    .p2align 4
L(matches0):
    bsf %eax, %eax
    lea -16(%rax, %rdi), %rax
    ret

    .p2align 4
L(matches):
    bsf %eax, %eax
    add %rdi, %rax
    ret

    .p2align 4
L(matches16):
    bsf %eax, %eax
    lea 16(%rax, %rdi), %rax
    ret

    .p2align 4
L(matches32):
    bsf %eax, %eax
    lea 32(%rax, %rdi), %rax
    ret

    .p2align 4
L(return_null):
    xor %rax, %rax
    ret

END (__rawmemchr)

weak_alias (__rawmemchr, rawmemchr)
libc_hidden_builtin_def (__rawmemchr)

实际上这个是我对一般情况的解决方案...当 str 是 char* 时你不能写入它。

这里是:

And here is the link to github资源:

#include<stdio.h>
#include<stdlib.h>
#include<string.h>

#define DICT_LEN 256

int *create_delim_dict(char *delim)
{
    int *d = (int*)malloc(sizeof(int)*DICT_LEN);
    memset((void*)d, 0, sizeof(int)*DICT_LEN);

    int i;
    for(i=0; i< strlen(delim); i++) {
        d[delim[i]] = 1;
    }
    return d;
}



char *my_strtok(char *str, char *delim)
{

    static char *last, *to_free;
    int *deli_dict = create_delim_dict(delim);

    if(!deli_dict) {
        return NULL;
    }

    if(str) {
        last = (char*)malloc(strlen(str)+1);
        if(!last) {
            free(deli_dict);
        }
        to_free = last;
        strcpy(last, str);
    }

    while(deli_dict[*last] && *last != '[=10=]') {
        last++;
    }
    str = last;
    if(*last == '[=10=]') {
        free(deli_dict);
        free(to_free);
        return NULL;
    }
    while (*last != '[=10=]' && !deli_dict[*last]) {
        last++;
    }

    *last = '[=10=]';
    last++;

    free(deli_dict);
    return str;
}

int main()
{
    char * str = "- This, a sample string.";
    char *del = " ,.-";
    char *s = my_strtok(str, del);
    while(s) {
        printf("%s\n", s);
        s = my_strtok(NULL, del);
    }
    return 0;
}
char * strtok2(char *str, const char *delim)
{
static char *nxt; /* static variable used to advance the string to replace delimiters */
static int size;  /* static variable used to count until the end of the string        */

/* IMPORTANT: any advance to 'nxt' must be followed by a diminution of 'size', and vice verce */

int i; /* counter of delimiter(s) in string */

/* initialize the string when strtok2 is first calles and supplied with a valid string */
if(str != NULL)
{
    nxt = str;
    size = strlen(str);
}

/* if we havn't reached the end of the string, any other call to strtok2 with NULL will come here */
else if(size > 0)
{
    nxt++;      /* last run left nxt on a null terminator, so we have to advance it */
    size--;     /* any advancement must follow diminution of size                   */
    str = nxt;  /* string now points to the first char after the last delimiter     */ 
}

/* if we reached the end of string, return NULL pointer */
else
{
    str = NULL;
}

/* nxt is used to advance the string until a delimiter or a series of delimiters are encountered; 
 * it then stops on the last delimiter which has turned to NULL terminator
 */
while(*nxt)
{
    i = strspn(nxt, delim);
    while(i > 1)        /* turns delimiters to NULL terminator (except the last one) */
    {
        *nxt = '[=10=]';
        nxt++;
        size--;
        i--;
    }                   /* in the last delimiter we have to do something a      */
    if(1 == i)          /* bit different we have to actually take nxt backwards */
    {                   /* one step, to break the while(*nxt) loop              */
        *nxt = '[=10=]';
        if(size > 1)    /* if the delimiter is last char, don't do next lines   */
        {
            nxt--;
            size++;     /* nxt is diminished so size increases                    */
        }
    }
    nxt++;          /* if no delimiter is found, advance nxt                  */
    size--;         /* in case of the last delimiter in a series, we took nxt */
}                   /* a step back, so now moving it a step forward means     */
                    /* it rests on a NULL terminator                          */
return str;
}

这是一个实现,根据您的需要增加输入缓冲区,并增加对 strtok 的调用次数(str1 为 NULL)以取回更多令牌

#include<stdio.h>
#include<string.h>
#include<stdlib.h>
char *ret_nondelim(char *str1,char *str2);
char *my_strtok(char *str1,char *str2);
char *ret_noofbytes(char *str1,char *str2);
int main()
{
    char str1[20];
    char str2[20];
    char *res;
    printf("enter string one\n");
    scanf("%s",str1);
    printf("enter string two\n");
    scanf("%s",str2);
    res=my_strtok(str1,str2);
    if(res)
        printf("%s\n",res);
    else
        printf("returned NULL\n");
    free(res);
    res=my_strtok(NULL,str2);
    if(res)
        printf("%s\n",res);
    else
        printf("returned NULL\n");
    free(res);
    res=my_strtok(NULL,str2);
    if(res)
        printf("%s\n",res);
    else
        printf("returned NULL\n");
    free(res);
}
char *my_strtok(char *str1,char *str2)
{
    static char *str1_cpy;
    static int i;
    char *ptr;
    int flag=0;
    if(str1!=NULL)
    {
        str1_cpy=ret_nondelim(str1,str2);      //get location of non delimiting byte
        str1=ret_noofbytes(str1_cpy,str2);  //scan and get location of delimitng byte
        if((str1-str1_cpy))         //no of bytes = non delimiting location - delimting location
        {
            ptr=malloc(str1-str1_cpy+1);        //malloc location and and store the string`enter code here`
            int k;
            for(k=0;k<(str1-str1_cpy);k++)
                ptr[k]=str1_cpy[k];
            ptr[k]='[=10=]';
            str1_cpy=str1;              //save pointer for next iteration
            return ptr;             //return pointer
        }
        else
            return NULL;

    }
    else
    {
        str1_cpy=ret_nondelim(str1_cpy,str2);   //same as above but pass saved pointer 
        str1=ret_noofbytes(str1_cpy,str2);
        if((str1-str1_cpy))
        {
            ptr=malloc(str1-str1_cpy+1);
            int k;
            for(k=0;k<(str1-str1_cpy);k++)
                ptr[k]=str1_cpy[k];
            ptr[k]='[=10=]';
            str1_cpy=str1;          //save pointer for next iteration
            return ptr;
        }
        else
            return NULL;
    }
}
char *ret_nondelim(char *str1,char *str2)
{
    int flag=0;
    for(;*(str1);)
    {
        for(int j=0;j<strlen(str2);j++)
        {
            if(*(str1)==*(str2+j))          //check if slected byte is non delimiting byte
            {
                str1++;             //break if true go check next byte
                break;
            }
            else
                flag++;             //shows that selected byte is none of the delimitng byte
        }
        if(flag==strlen(str2))              //(above comment)-so non delimiting byte found return that pointer to caller
            break;
        else
            flag=0;
    }
    return str1;
}

char *ret_noofbytes(char *str1,char *str2)
{
    int flag=0;
    for(;*(str1);)
    {
        for(int k=0;k<strlen(str2);k++)
        {
            if(*(str2+k)==*(str1))          //check if selected bytes is delimiting byte
            {
                flag=1;
                break;              //break twice if true ie .flag==1
            }
        }
        if(flag==1)
            break;
        else
            str1++;                 //if not found go check next byte
    }
    return str1;                        //return the point where delimiting byte was found
}
Here is the code which explains how strtok works in C.

#include <stdio.h>
#include <string.h>

#define SIZE_OF_STRING 50 
#define SIZE_OF_DELIMITER 5

/* Make the temp_ptr as static, so it will hold the previous pointing address */
static char *temp_ptr = NULL;

char *string_token(char *str, char *delimiter)
{
    char *final_ptr = NULL;  `enter code here`
    /* Flag has been defined as static to avoid the parent function loop
     * runs infinitely after reaching end of string.
     */ 
    static int flag = 0;
    int i, j;

    if (delimiter == NULL) {
        return NULL;
    }

    /* If flag is 1, it will denote the end of the string */
    if (flag == 1) {
        return NULL;
    }

    /* The below condition is to avoid temp_ptr is getting assigned 
     * with NULL string from the parent function main. Without
     * the below condition, NULL will be assigned to temp_ptr 
     * and final_ptr, so accessing these pointers will lead to
     * segmentation fault.
     */
    if (str != NULL) { 
        temp_ptr = str; 
    }

    /* Before function call ends, temp_ptr should move to the next char,
     * so we can't return the temp_ptr. Hence, we introduced a new pointer
     * final_ptr which points to temp_ptr.
     */
    final_ptr = temp_ptr;

    printf("%s %d str: %s delimiter: %s length: %ld temp_ptr: %s strlen: %ld"
           " final_ptr: %s \n",__func__, __LINE__, str, delimiter, 
           strlen(delimiter), temp_ptr, strlen(temp_ptr), final_ptr);

    for (i = 0; i <= strlen(temp_ptr); i++)
    {
        for (j = 0; j < strlen(delimiter); j++) {

            if (temp_ptr[i] == '[=10=]') {
                /* If the flag is not set, both the temp_ptr and flag_ptr 
                 * will be holding string "Jasmine" which will make parent 
                 * to call this function string_token infinitely. 
                 */
                flag = 1;
                return final_ptr;
            }

            if ((temp_ptr[i] == delimiter[j])) {
                /* NULL character is assigned, so that final_ptr will return 
                 * til NULL character. Here, final_ptr points to temp_ptr.
                 */
                temp_ptr[i] = '[=10=]';
                temp_ptr += i+1;
                return final_ptr;
            }
        }
    }
    /* You will somehow end up here if for loop condition fails.
     * If this function doesn't return any char pointer, it will 
     * lead to segmentation fault in the parent function.
     */
    return NULL;
}


int main()
{
    char str[SIZE_OF_STRING] = "shirley|Rose|Jasmine";
    char *token = NULL;
    char del[SIZE_OF_DELIMITER] = "|";

    token = string_token(str, del);
    while (token != NULL) {       
        printf("token %s\n", token);
        token = string_token(NULL, del);
    }
    return 0;
}

https://shirleyanengineer.blogspot.com/2019/11/write-your-own-strtok-in-c.html

@Enzo Ferber 提供的 glibc 解决方案对我不起作用(MS Visual Studio 2019,Visual Micro Release 22.02.18.5 --> Arduino(1.8),avr-gcc 5.4.0)在:

/* 终止令牌并使 OLDS 指向它。 */ *s = '[=10=]';

因为s只是指针的浅拷贝。将 *s = '[=10=]'; 更改为:

**((char**)&s) = '[=12=]';

而且效果很好。不确定这是否特定于 gcc 实现,但似乎尝试修改函数中指针的浅表副本在任何 ANSI/ISO C 实现中都会失败。