在 C 中连接字符串的未定义行为

Undefined behaviour whilst concatenating strings in C

我正在编写一个 C 程序,它将连接所有行(包括“\n”),同时保存指向最终字符串的最后一个字符的指针。但是,我没有得到预期的结果。可能是什么问题?

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* List node that carries only a forward and a backward link (no payload). */
struct Node {
  struct Node *next; /* link to the following node */
  struct Node *prev; /* link to the preceding node */
};

/*
 * Allocates storage for one Node on the heap and hands it back.
 * The members are left uninitialised, and the malloc() result is returned
 * as-is, so the caller may receive NULL.
 */
struct Node *CreateNewNode() {
  return malloc(sizeof(struct Node));
}

/* Global text buffer state shared by strconcatenate() and main(). */
struct PieceTable {
  char *buffer;                /* heap-allocated text, set up in main() */
  char *ptr_to_last_character; /* meant to point at the end of the text in buffer */
} PT;

/*
 * Appends `source` to the global PT.buffer, growing it with realloc().
 *
 * NOTE: this is the code the question asks about, so the defect under
 * discussion is intentionally left in place: realloc() may move the block,
 * yet PT.ptr_to_last_character is not recomputed afterwards, so the copy
 * loop can write through a pointer into the old allocation (undefined
 * behaviour). The realloc() result is not checked either.
 */
void strconcatenate(char *source) {
  size_t source_len = strlen(source);
  size_t buffer_len = strlen(PT.buffer);
  PT.buffer = realloc(PT.buffer, buffer_len + source_len + 1);
  while (*source)
    *PT.ptr_to_last_character++ = *source++;
  *PT.ptr_to_last_character = '\0'; /* terminate the string */
}

/*
 * Builds a three-line string in PT.buffer through strconcatenate() and
 * prints it. This is the program from the question, kept as posted.
 */
int main(int argc, char *argv[]) {

  char input_line[1024]; /* not referenced anywhere in this function */

  /* Start from an empty string: the first byte is the terminator. */
  PT.buffer = malloc(sizeof(char) * 2);
  *PT.buffer = '\0';
  PT.ptr_to_last_character = PT.buffer;

  struct Node *new_node = CreateNewNode();
  new_node->next = NULL;
  new_node->prev = NULL;

  strconcatenate("Lorem ipsum\n");
  strconcatenate("dolor sit amet\n");
  strconcatenate("consectetur adipiscing elit\n");

  printf("%s", PT.buffer);

  return 0;
}

预期输出:

Lorem ipsum
dolor sit amet
consectetur adipiscing elit

输出:

etur adipiscing elit

您重新分配了 PT.buffer,但没有更新 PT.ptr_to_last_character。当 realloc 无法原地扩展当前的分配、而是返回位于不同地址的更大的新内存区域时,ptr_to_last_character 仍然指向旧内存,这会导致未定义的行为。

可能更好的方法是存储缓冲区的长度而不是指向最后一个字符的指针,这样您就不必担心它会失效。

警告: Paul 已经找到了问题,但既然你说你 需要 一个指针......

您可以通过保存/恢复偏移量来调整“最后”指针:

// take the offset before realloc, while both pointers refer to the same block
ptrdiff_t offset = PT.ptr_to_last_character - PT.buffer;
PT.buffer = realloc(PT.buffer, buffer_len + source_len + 1);
// rebase the pointer onto the (possibly moved) block
PT.ptr_to_last_character = PT.buffer + offset;

我在下面的完整代码中用注释标出了其他一些可以清理的问题。#if 0 包裹的是旧的(原始)代码,#if 1 或 #else 包裹的是新的(重构后的)代码:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>

// List node that carries only a forward and a backward link (no payload).
struct Node {
    struct Node *next;  // link to the following node
    struct Node *prev;  // link to the preceding node
};

// Allocate one Node with both links cleared.
// Returns the new node, or NULL if the allocation fails.
struct Node *
CreateNewNode()
{
    struct Node *newNode = malloc(sizeof *newNode);

    // NOTE/BUG: malloc can fail -- do not write through a NULL pointer
    if (newNode == NULL)
        return NULL;

    // NOTE/WARN: this was moved from main to be more general
#if 1
    newNode->next = NULL;
    newNode->prev = NULL;
#endif

    return newNode;
}

// Global buffer state: the accumulated text and a cursor to its terminator.
struct PieceTable {
    char *buffer;                   // heap-allocated, NUL-terminated text
    char *ptr_to_last_character;    // points at the '\0' that ends buffer
} PT;

// Append source to PT.buffer, growing the buffer as needed.
// On return PT.ptr_to_last_character points at the new terminator.
// If the reallocation fails, PT is left exactly as it was.
void
strconcatenate(char *source)
{
    size_t source_len = strlen(source);
    size_t buffer_len = strlen(PT.buffer);

    // NOTE/BUG: after the realloc, PT.buffer can change but
    // PT.ptr_to_last_character is pointing within the _old_ buffer
    // we need to recalc using an offset
#if 0
    PT.buffer = realloc(PT.buffer, buffer_len + source_len + 1);
#else
    ptrdiff_t offset = PT.ptr_to_last_character - PT.buffer;

    // NOTE/BUG: storing realloc's result straight into PT.buffer would lose
    // the old block (and leave PT.buffer NULL) when the call fails
    char *grown = realloc(PT.buffer, buffer_len + source_len + 1);
    if (grown == NULL)
        return;

    PT.buffer = grown;
    PT.ptr_to_last_character = PT.buffer + offset;
#endif

    while (*source)
        *PT.ptr_to_last_character++ = *source++;

    // terminate the string; the cursor stays on the terminator
    *PT.ptr_to_last_character = '\0';
}

// Build the three-line sample text in PT.buffer and print it.
// Returns 0 on success, EXIT_FAILURE if the initial buffer cannot be
// allocated.
int
main(int argc, char *argv[])
{
    // the command-line arguments are not used
    (void) argc;
    (void) argv;

    // NOTE/BUG: sizeof(char) is _always_ 1 by definition
#if 0
    PT.buffer = malloc(sizeof(char) * 2);
#else
    PT.buffer = malloc(2);
#endif
    // NOTE/BUG: malloc can fail -- do not store through a NULL pointer
    if (PT.buffer == NULL)
        return EXIT_FAILURE;
    *PT.buffer = '\0';
    PT.ptr_to_last_character = PT.buffer;

    // NOTE/WARN: this isn't used anywhere
    struct Node *new_node = CreateNewNode();

    // NOTE/WARN: this would be better inside CreateNewNode
#if 0
    new_node->next = NULL;
    new_node->prev = NULL;
#endif

    strconcatenate("Lorem ipsum\n");
    strconcatenate("dolor sit amet\n");
    strconcatenate("consectetur adipiscing elit\n");

    printf("%s", PT.buffer);

    // release what was allocated above (free(NULL) is a no-op)
    free(new_node);
    free(PT.buffer);

    return 0;
}

首先,不清楚为什么这条语句分配了两个字节而不是一个字节。

PT.buffer = malloc(sizeof(char) * 2);
*PT.buffer = '\0'; /* only this first byte is written */

在函数内这条语句

size_t buffer_len = strlen(PT.buffer);

考虑到您已经有一个指向终止零的指针,这没有意义。所以效率很低。

在如下的 realloc 调用之后
PT.buffer = realloc(PT.buffer, buffer_len + source_len + 1);

指针PT.ptr_to_last_character可能无效。

另外,您应该检查内存分配是否成功。否则一旦分配失败,指针 PT.buffer 就会被置为 NULL,原先缓冲区的地址也随之丢失。

该功能可以通过以下方式实现,如下面的演示程序所示。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Global buffer state used by strconcatenate(). */
struct PieceTable {
  char *buffer;                /* heap-allocated, NUL-terminated text */
  char *ptr_to_last_character; /* points at the terminator of buffer */
} PT;

/*
 * Appends source to PT.buffer.
 *
 * Returns 1 on success (including an empty source, in which case nothing
 * is touched) and 0 when realloc() fails; on failure PT is left unchanged.
 */
int strconcatenate( const char *source )
{
    const size_t used = PT.ptr_to_last_character - PT.buffer;
    const size_t extra = strlen( source );

    /* Nothing to append. */
    if ( extra == 0 ) return 1;

    char *grown = realloc( PT.buffer, used + extra + 1 );
    if ( grown == NULL ) return 0;

    /* realloc() may have moved the block: rebase the end pointer. */
    PT.buffer = grown;
    PT.ptr_to_last_character = grown + used;

    /* Copy up to and including the terminator; the pointer stops on it. */
    while ( ( *PT.ptr_to_last_character = *source++ ) != '\0' )
    {
        ++PT.ptr_to_last_character;
    }

    return 1;
}

/*
 * Demo driver: starts from an empty string, appends three lines and prints
 * the result. Returns EXIT_FAILURE if any allocation fails.
 */
int main(void)
{
    PT.buffer = malloc( sizeof(char) );

    /* malloc() can fail; do not write through a NULL pointer. */
    if ( PT.buffer == NULL ) return EXIT_FAILURE;

    *PT.buffer = '\0';
    PT.ptr_to_last_character = PT.buffer;

    /* Stop at the first append that reports failure. */
    int ok = strconcatenate( "Lorem ipsum\n" )
          && strconcatenate( "dolor sit amet\n" )
          && strconcatenate( "consectetur adipiscing elit\n" );

    if ( ok ) puts( PT.buffer );

    free( PT.buffer );

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

程序输出为

Lorem ipsum
dolor sit amet
consectetur adipiscing elit

还要考虑到,而不是这个手动编写的循环

while ( ( *PT.ptr_to_last_character = *source++ ) ) ++PT.ptr_to_last_character;

您可以使用标准的 C 字符串函数 strcpy,这样效率会更高。

在这种情况下,您的函数可以如下所示

/*
 * Appends source to PT.buffer, using strcpy() for the copy.
 *
 * Returns 1 on success (including an empty source, in which case nothing
 * is touched) and 0 when realloc() fails; on failure PT is left unchanged.
 */
int strconcatenate( const char *source )
{
    const size_t used = PT.ptr_to_last_character - PT.buffer;
    const size_t extra = strlen( source );

    /* Nothing to append. */
    if ( extra == 0 ) return 1;

    char *grown = realloc( PT.buffer, used + extra + 1 );
    if ( grown == NULL ) return 0;

    PT.buffer = grown;

    /* Copy source, terminator included, over the old terminator ... */
    strcpy( grown + used, source );

    /* ... and park the end pointer on the new terminator. */
    PT.ptr_to_last_character = grown + used + extra;

    return 1;
}