Win32/C:将行结尾转换为DOS/Windows格式

Win32/C: Convert line endings to DOS/Windows format

我在读取文件的 Windows API 项目中有以下 C 函数,并根据行结尾(UNIX、MAC、DOS)替换行以 Windows (\r\n) 的正确行结尾结尾:

// Standard C header needed for string functions
#include <string.h>

// Defines for line-ending conversion function
#define LESTATUS INT 
#define LE_NO_CHANGES_NEEDED (0)
#define LE_CHANGES_SUCCEEDED (1)
#define LE_CHANGES_FAILED   (-1)

/// <summary>
/// If the line endings in a block of data loaded from a file contain UNIX (\n) or MAC (\r) line endings, this function replaces it with DOS (\r\n) endings.
/// </summary>
/// <param name="inData">An array of bytes of input data.</param>
/// <param name="inLen">The size, in bytes, of inData.</param>
/// <param name="outData">An array of bytes to be populated with output data.  This array must already be allocated</param>
/// <param name="outLen">The maximum number of bytes that can be stored in outData.</param>
/// <param name="bytesWritten">A pointer to an integer that receives the number of bytes written into outData.</param>
/// <returns>
/// If no changes were necessary (the file already contains \r\n line endings), then the return value is LE_NO_CHANGES_NEEDED.<br/>
/// If changes were necessary, and it was possible to store the entire output buffer, the return value is LE_CHANGES_SUCCEEDED.<br/>
/// If changes were necessary but the output buffer was too small, the return value is LE_CHANGES_FAILED.<br/>
/// </returns>
LESTATUS ConvertLineEndings(BYTE* inData, INT inLen, BYTE* outData, INT outLen, INT* bytesWritten)
{
    char *posR = strstr(inData, "\r");
    char *posN = strstr(inData, "\n");
    // Case 1: the file already contains DOS/Windows line endings.
    // So, copy the input array into the output array as-is (if we can)
    // Report an error if the output array is too small to hold the input array; report success otherwise.
    if (posN != NULL && posR != NULL)
    {
        if (outLen >= inLen)
        {
            strcpy(outData, inData);
            return LE_NO_CHANGES_NEEDED;
        }
        return LE_CHANGES_FAILED;
    }
    // Case 2: the file contains UNIX line endings.
    else if (posN != NULL && posR == NULL)
    {
        int i = 0;
        int track = 0;
        for (i = 0; i < inLen; i++)
        {
            if (inData[i] != '\n')
            {
                outData[track] = inData[i];
                track++;
                if (track>outLen) return LE_CHANGES_FAILED;
            }
            else
            {
                outData[track] = '\r';
                track++;
                if (track > outLen) return LE_CHANGES_FAILED;
                outData[track] = '\n';
                track++;
                if (track > outLen) return LE_CHANGES_FAILED;
            }
            *bytesWritten = track;
        }
    }
    // Case 3: the file contains Mac-style line endings.
    else if (posN == NULL && posR != NULL)
    {
        int i = 0;
        int track = 0;
        for (i = 0; i < inLen; i++)
        {
            if (inData[i] != '\r')
            {
                outData[track] = inData[i];
                track++;
                if (track>outLen) return LE_CHANGES_FAILED;
            }
            else
            {
                outData[track] = '\r';
                track++;
                if (track > outLen) return LE_CHANGES_FAILED;
                outData[track] = '\n';
                track++;
                if (track > outLen) return LE_CHANGES_FAILED;
            }
            *bytesWritten = track;
        }
    }
    return LE_CHANGES_SUCCEEDED;
}

但是,我觉得这个函数 很长 长(将近 70 行)并且可以以某种方式减少。我在 Google 上搜索过,但找不到任何有用的东西; C 库或 Windows API 中是否有任何函数允许我执行字符串替换而不是在 O(n) 时间内逐字节手动搜索字符串?

每个角色都需要精确看一遍,不多不少。您的代码的第一行已经进行了重复比较,因为两个 strstr 调用都从同一位置开始。你可以使用像

这样的东西
char *posR = strstr(inData, "\r");
if (posR && posR[1] == '\n')
   // Case 1: the file already contains DOS/Windows line endings.

如果失败,如果找到 \r,则从结束处继续,如果 posR == NULL,则再次从顶部开始。但是后来你让 strstr 已经 "look at" 每个字符直到结束!

两个补充说明:

  1. 不需要 strstr 因为你正在寻找一个字符;下次使用strchr
  2. strXXX 函数都假定您的输入是格式正确的 C 字符串:它应该以终止符 0 结尾。但是,您已经在 inLen 中提供了长度,因此您 没有 检查零。如果您输入的 inLen 字节之前可能有也可能没有 0,您需要采取适当的措施。基于此函数的目的,我假设您根本不需要检查零。

我的建议:每个字符从头看一次,只有在或者和[=13=时才采取行动] 一个\n。如果您遇到的第一个是 \r ,下一个是 \n,您就完成了。 (假设行尾不是 "mixed"。)

如果您在第一个循环中没有 return,那么除了 \r\n 之外还有其他内容,您可以从那一点继续。但是您仍然只需要对 \r \n 采取行动!所以我建议这个更短的代码(和一个 enum 而不是你的定义):

enum LEStatus_e { LE_CHANGES_FAILED=-1, LE_NO_CHANGES_NEEDED, LE_CHANGES_SUCCEEDED };

enum LEStatus_e ConvertLineEndings(BYTE *inData, INT inLen, BYTE *outData, INT outLen, INT *bytesWritten)
{
    INT sourceIndex = 0, destIndex;

    if (outLen < inLen)
        return LE_CHANGES_FAILED;

    /*  Find first occurrence of either \r or \n
        This will return immediately for No Change Needed */
    while (sourceIndex < inLen)
    {
        if (inData[sourceIndex] == '\r')
        {
            if (sourceIndex < inLen-1 && inData[sourceIndex+1] == '\n')
            {
                memcpy (outData, inData, inLen);
                *bytesWritten = inLen;
                return LE_NO_CHANGES_NEEDED;
            }
            break;
        }
        if (inData[sourceIndex] == '\n')
            break;
        sourceIndex++;
    }
    /* We processed this far already: */
    memcpy (outData, inData, sourceIndex);
    if (sourceIndex == inLen)
        return LE_NO_CHANGES_NEEDED;
    destIndex = sourceIndex;

    while (sourceIndex < inLen)
    {
        switch (inData[sourceIndex])
        {
            case '\n':
            case '\r':
                sourceIndex++;
                if (destIndex+2 >= outLen)
                    return LE_CHANGES_FAILED;
                outData[destIndex++] = '\r';
                outData[destIndex++] = '\n';
                break;
            default:
                outData[destIndex++] = inData[sourceIndex++];
        }
    }
    *bytesWritten = destIndex;
    return LE_CHANGES_SUCCEEDED;
}

有一些古老而罕见的 'plain text' 格式使用了其他结构;从记忆中,类似于 \r\n\n。如果您希望能够清理 anything,您可以在单个 \n 之后为所有 \r 添加一个跳过,相反的情况也是如此。这也将清除任何 "mixed" 行结尾,因为它也会正确处理 \r\n

这是我认为更简单的代码,行数减半。当然,正如 Ben Voigt 指出的那样,您无法击败 O(n) 时间,因此我没有尝试这样做。我没有使用任何库函数,因为这样看起来更简单,而且我怀疑额外的函数调用能否使代码更快。

enum lestatus {
  le_no_changes_needed = 0,
  le_changes_succeeded = 1,
  le_changes_failed = -1
};

enum lestatus ConvertLineEndings(char *indata, int inlen,
                                 char *outdata, int outlen)
{
  int outpos = 0, inpos;
  enum lestatus it_changed = le_no_changes_needed;
  for (inpos = 0; inpos<inlen;inpos++) {
    if (outpos + 1 > outlen) return le_changes_failed;
    if (indata[inpos] != '\r' && indata[inpos] != '\n') {
      /* it is an ordinary character, just copy it */
      outdata[outpos++] = indata[inpos];
    } else if (outpos + 2 > outlen) {
      return le_changes_failed;
    } else if ((indata[inpos+1] == '\r' || indata[inpos+1] == '\n')
               && indata[inpos] != indata[inpos+1]) {
      /* it is \r\n or \n\r, output it in canonical order */
      outdata[outpos++] = '\r';
      outdata[outpos++] = '\n';
      inpos++; /* skip the second character */
    } else {
      /* it is a mac or unix line ending, convert to dos */
      outdata[outpos++] = '\r';
      outdata[outpos++] = '\n';
      it_changed = le_changes_succeeded;
    }
  }
  return it_changed;
}

我的代码最大的区别在于

  1. 我使用了增量运算符。
  2. 为了简单起见,我避免使用库函数。
  3. 我的函数正确处理了混合结尾的文件(根据我的解释)。
  4. 我更喜欢小写字符。这显然是一种风格偏好。
  5. 与#defines 相比,我更喜欢枚举。也是一种风格偏好。