Win32/C:将行结尾转换为DOS/Windows格式
Win32/C: Convert line endings to DOS/Windows format
我在读取文件的 Windows API 项目中有以下 C 函数,并根据行结尾(UNIX、MAC、DOS)替换行以 Windows (\r\n
) 的正确行结尾结尾:
// Standard C header needed for string functions
#include <string.h>
// Defines for line-ending conversion function
#define LESTATUS INT
#define LE_NO_CHANGES_NEEDED (0)
#define LE_CHANGES_SUCCEEDED (1)
#define LE_CHANGES_FAILED (-1)
/// <summary>
/// If the line endings in a block of data loaded from a file contain UNIX (\n) or MAC (\r) line endings, this function replaces it with DOS (\r\n) endings.
/// </summary>
/// <param name="inData">An array of bytes of input data.</param>
/// <param name="inLen">The size, in bytes, of inData.</param>
/// <param name="outData">An array of bytes to be populated with output data. This array must already be allocated</param>
/// <param name="outLen">The maximum number of bytes that can be stored in outData.</param>
/// <param name="bytesWritten">A pointer to an integer that receives the number of bytes written into outData.</param>
/// <returns>
/// If no changes were necessary (the file already contains \r\n line endings), then the return value is LE_NO_CHANGES_NEEDED.<br/>
/// If changes were necessary, and it was possible to store the entire output buffer, the return value is LE_CHANGES_SUCCEEDED.<br/>
/// If changes were necessary but the output buffer was too small, the return value is LE_CHANGES_FAILED.<br/>
/// </returns>
LESTATUS ConvertLineEndings(BYTE* inData, INT inLen, BYTE* outData, INT outLen, INT* bytesWritten)
{
char *posR = strstr(inData, "\r");
char *posN = strstr(inData, "\n");
// Case 1: the file already contains DOS/Windows line endings.
// So, copy the input array into the output array as-is (if we can)
// Report an error if the output array is too small to hold the input array; report success otherwise.
if (posN != NULL && posR != NULL)
{
if (outLen >= inLen)
{
strcpy(outData, inData);
return LE_NO_CHANGES_NEEDED;
}
return LE_CHANGES_FAILED;
}
// Case 2: the file contains UNIX line endings.
else if (posN != NULL && posR == NULL)
{
int i = 0;
int track = 0;
for (i = 0; i < inLen; i++)
{
if (inData[i] != '\n')
{
outData[track] = inData[i];
track++;
if (track>outLen) return LE_CHANGES_FAILED;
}
else
{
outData[track] = '\r';
track++;
if (track > outLen) return LE_CHANGES_FAILED;
outData[track] = '\n';
track++;
if (track > outLen) return LE_CHANGES_FAILED;
}
*bytesWritten = track;
}
}
// Case 3: the file contains Mac-style line endings.
else if (posN == NULL && posR != NULL)
{
int i = 0;
int track = 0;
for (i = 0; i < inLen; i++)
{
if (inData[i] != '\r')
{
outData[track] = inData[i];
track++;
if (track>outLen) return LE_CHANGES_FAILED;
}
else
{
outData[track] = '\r';
track++;
if (track > outLen) return LE_CHANGES_FAILED;
outData[track] = '\n';
track++;
if (track > outLen) return LE_CHANGES_FAILED;
}
*bytesWritten = track;
}
}
return LE_CHANGES_SUCCEEDED;
}
但是,我觉得这个函数 很长 长(将近 70 行)并且可以以某种方式减少。我在 Google 上搜索过,但找不到任何有用的东西; C 库或 Windows API 中是否有任何函数允许我执行字符串替换而不是在 O(n) 时间内逐字节手动搜索字符串?
每个角色都需要精确看一遍,不多不少。您的代码的第一行已经进行了重复比较,因为两个 strstr
调用都从同一位置开始。你可以使用像
这样的东西
char *posR = strstr(inData, "\r");
if (posR && posR[1] == '\n')
// Case 1: the file already contains DOS/Windows line endings.
如果失败,如果找到 \r
,则从结束处继续,如果 posR == NULL
,则再次从顶部开始。但是后来你让 strstr
已经 "look at" 每个字符直到结束!
两个补充说明:
- 不需要
strstr
因为你正在寻找一个字符;下次使用strchr
;
strXXX
函数都假定您的输入是格式正确的 C 字符串:它应该以终止符 0
结尾。但是,您已经在 inLen
中提供了长度,因此您 没有 检查零。如果您输入的 inLen
字节之前可能有也可能没有 0
,您需要采取适当的措施。基于此函数的目的,我假设您根本不需要检查零。
我的建议:每个字符从头看一次,只有在或者和[=13=时才采取行动] 或一个\n
。如果您遇到的第一个是 \r
和 ,下一个是 \n
,您就完成了。 (假设行尾不是 "mixed"。)
如果您在第一个循环中没有 return,那么除了 \r\n
之外还有其他内容,您可以从那一点继续。但是您仍然只需要对 或 \r
或 \n
采取行动!所以我建议这个更短的代码(和一个 enum
而不是你的定义):
enum LEStatus_e { LE_CHANGES_FAILED=-1, LE_NO_CHANGES_NEEDED, LE_CHANGES_SUCCEEDED };
enum LEStatus_e ConvertLineEndings(BYTE *inData, INT inLen, BYTE *outData, INT outLen, INT *bytesWritten)
{
INT sourceIndex = 0, destIndex;
if (outLen < inLen)
return LE_CHANGES_FAILED;
/* Find first occurrence of either \r or \n
This will return immediately for No Change Needed */
while (sourceIndex < inLen)
{
if (inData[sourceIndex] == '\r')
{
if (sourceIndex < inLen-1 && inData[sourceIndex+1] == '\n')
{
memcpy (outData, inData, inLen);
*bytesWritten = inLen;
return LE_NO_CHANGES_NEEDED;
}
break;
}
if (inData[sourceIndex] == '\n')
break;
sourceIndex++;
}
/* We processed this far already: */
memcpy (outData, inData, sourceIndex);
if (sourceIndex == inLen)
return LE_NO_CHANGES_NEEDED;
destIndex = sourceIndex;
while (sourceIndex < inLen)
{
switch (inData[sourceIndex])
{
case '\n':
case '\r':
sourceIndex++;
if (destIndex+2 >= outLen)
return LE_CHANGES_FAILED;
outData[destIndex++] = '\r';
outData[destIndex++] = '\n';
break;
default:
outData[destIndex++] = inData[sourceIndex++];
}
}
*bytesWritten = destIndex;
return LE_CHANGES_SUCCEEDED;
}
有一些古老而罕见的 'plain text' 格式使用了其他结构;从记忆中,类似于 \r\n\n
。如果您希望能够清理 anything,您可以在单个 \n
之后为所有 \r
添加一个跳过,相反的情况也是如此。这也将清除任何 "mixed" 行结尾,因为它也会正确处理 \r\n
。
这是我认为更简单的代码,行数减半。当然,正如 Ben Voigt 指出的那样,您无法击败 O(n) 时间,因此我没有尝试这样做。我没有使用任何库函数,因为这样看起来更简单,而且我怀疑额外的函数调用能否使代码更快。
enum lestatus {
le_no_changes_needed = 0,
le_changes_succeeded = 1,
le_changes_failed = -1
};
enum lestatus ConvertLineEndings(char *indata, int inlen,
char *outdata, int outlen)
{
int outpos = 0, inpos;
enum lestatus it_changed = le_no_changes_needed;
for (inpos = 0; inpos<inlen;inpos++) {
if (outpos + 1 > outlen) return le_changes_failed;
if (indata[inpos] != '\r' && indata[inpos] != '\n') {
/* it is an ordinary character, just copy it */
outdata[outpos++] = indata[inpos];
} else if (outpos + 2 > outlen) {
return le_changes_failed;
} else if ((indata[inpos+1] == '\r' || indata[inpos+1] == '\n')
&& indata[inpos] != indata[inpos+1]) {
/* it is \r\n or \n\r, output it in canonical order */
outdata[outpos++] = '\r';
outdata[outpos++] = '\n';
inpos++; /* skip the second character */
} else {
/* it is a mac or unix line ending, convert to dos */
outdata[outpos++] = '\r';
outdata[outpos++] = '\n';
it_changed = le_changes_succeeded;
}
}
return it_changed;
}
我的代码最大的区别在于
- 我使用了增量运算符。
- 为了简单起见,我避免使用库函数。
- 我的函数正确处理了混合结尾的文件(根据我的解释)。
- 我更喜欢小写字符。这显然是一种风格偏好。
- 与#defines 相比,我更喜欢枚举。也是一种风格偏好。
我在读取文件的 Windows API 项目中有以下 C 函数,并根据行结尾(UNIX、MAC、DOS)替换行以 Windows (\r\n
) 的正确行结尾结尾:
// Standard C header needed for string functions
#include <string.h>
// Defines for line-ending conversion function
#define LESTATUS INT
#define LE_NO_CHANGES_NEEDED (0)
#define LE_CHANGES_SUCCEEDED (1)
#define LE_CHANGES_FAILED (-1)
/// <summary>
/// If the line endings in a block of data loaded from a file contain UNIX (\n) or MAC (\r) line endings, this function replaces it with DOS (\r\n) endings.
/// </summary>
/// <param name="inData">An array of bytes of input data.</param>
/// <param name="inLen">The size, in bytes, of inData.</param>
/// <param name="outData">An array of bytes to be populated with output data. This array must already be allocated</param>
/// <param name="outLen">The maximum number of bytes that can be stored in outData.</param>
/// <param name="bytesWritten">A pointer to an integer that receives the number of bytes written into outData.</param>
/// <returns>
/// If no changes were necessary (the file already contains \r\n line endings), then the return value is LE_NO_CHANGES_NEEDED.<br/>
/// If changes were necessary, and it was possible to store the entire output buffer, the return value is LE_CHANGES_SUCCEEDED.<br/>
/// If changes were necessary but the output buffer was too small, the return value is LE_CHANGES_FAILED.<br/>
/// </returns>
LESTATUS ConvertLineEndings(BYTE* inData, INT inLen, BYTE* outData, INT outLen, INT* bytesWritten)
{
char *posR = strstr(inData, "\r");
char *posN = strstr(inData, "\n");
// Case 1: the file already contains DOS/Windows line endings.
// So, copy the input array into the output array as-is (if we can)
// Report an error if the output array is too small to hold the input array; report success otherwise.
if (posN != NULL && posR != NULL)
{
if (outLen >= inLen)
{
strcpy(outData, inData);
return LE_NO_CHANGES_NEEDED;
}
return LE_CHANGES_FAILED;
}
// Case 2: the file contains UNIX line endings.
else if (posN != NULL && posR == NULL)
{
int i = 0;
int track = 0;
for (i = 0; i < inLen; i++)
{
if (inData[i] != '\n')
{
outData[track] = inData[i];
track++;
if (track>outLen) return LE_CHANGES_FAILED;
}
else
{
outData[track] = '\r';
track++;
if (track > outLen) return LE_CHANGES_FAILED;
outData[track] = '\n';
track++;
if (track > outLen) return LE_CHANGES_FAILED;
}
*bytesWritten = track;
}
}
// Case 3: the file contains Mac-style line endings.
else if (posN == NULL && posR != NULL)
{
int i = 0;
int track = 0;
for (i = 0; i < inLen; i++)
{
if (inData[i] != '\r')
{
outData[track] = inData[i];
track++;
if (track>outLen) return LE_CHANGES_FAILED;
}
else
{
outData[track] = '\r';
track++;
if (track > outLen) return LE_CHANGES_FAILED;
outData[track] = '\n';
track++;
if (track > outLen) return LE_CHANGES_FAILED;
}
*bytesWritten = track;
}
}
return LE_CHANGES_SUCCEEDED;
}
但是,我觉得这个函数 很长 长(将近 70 行)并且可以以某种方式减少。我在 Google 上搜索过,但找不到任何有用的东西; C 库或 Windows API 中是否有任何函数允许我执行字符串替换而不是在 O(n) 时间内逐字节手动搜索字符串?
每个角色都需要精确看一遍,不多不少。您的代码的第一行已经进行了重复比较,因为两个 strstr
调用都从同一位置开始。你可以使用像
char *posR = strstr(inData, "\r");
if (posR && posR[1] == '\n')
// Case 1: the file already contains DOS/Windows line endings.
如果失败,如果找到 \r
,则从结束处继续,如果 posR == NULL
,则再次从顶部开始。但是后来你让 strstr
已经 "look at" 每个字符直到结束!
两个补充说明:
- 不需要
strstr
因为你正在寻找一个字符;下次使用strchr
; strXXX
函数都假定您的输入是格式正确的 C 字符串:它应该以终止符0
结尾。但是,您已经在inLen
中提供了长度,因此您 没有 检查零。如果您输入的inLen
字节之前可能有也可能没有0
,您需要采取适当的措施。基于此函数的目的,我假设您根本不需要检查零。
我的建议:每个字符从头看一次,只有在或者和[=13=时才采取行动] 或一个\n
。如果您遇到的第一个是 \r
和 ,下一个是 \n
,您就完成了。 (假设行尾不是 "mixed"。)
如果您在第一个循环中没有 return,那么除了 \r\n
之外还有其他内容,您可以从那一点继续。但是您仍然只需要对 或 \r
或 \n
采取行动!所以我建议这个更短的代码(和一个 enum
而不是你的定义):
enum LEStatus_e { LE_CHANGES_FAILED=-1, LE_NO_CHANGES_NEEDED, LE_CHANGES_SUCCEEDED };
enum LEStatus_e ConvertLineEndings(BYTE *inData, INT inLen, BYTE *outData, INT outLen, INT *bytesWritten)
{
INT sourceIndex = 0, destIndex;
if (outLen < inLen)
return LE_CHANGES_FAILED;
/* Find first occurrence of either \r or \n
This will return immediately for No Change Needed */
while (sourceIndex < inLen)
{
if (inData[sourceIndex] == '\r')
{
if (sourceIndex < inLen-1 && inData[sourceIndex+1] == '\n')
{
memcpy (outData, inData, inLen);
*bytesWritten = inLen;
return LE_NO_CHANGES_NEEDED;
}
break;
}
if (inData[sourceIndex] == '\n')
break;
sourceIndex++;
}
/* We processed this far already: */
memcpy (outData, inData, sourceIndex);
if (sourceIndex == inLen)
return LE_NO_CHANGES_NEEDED;
destIndex = sourceIndex;
while (sourceIndex < inLen)
{
switch (inData[sourceIndex])
{
case '\n':
case '\r':
sourceIndex++;
if (destIndex+2 >= outLen)
return LE_CHANGES_FAILED;
outData[destIndex++] = '\r';
outData[destIndex++] = '\n';
break;
default:
outData[destIndex++] = inData[sourceIndex++];
}
}
*bytesWritten = destIndex;
return LE_CHANGES_SUCCEEDED;
}
有一些古老而罕见的 'plain text' 格式使用了其他结构;从记忆中,类似于 \r\n\n
。如果您希望能够清理 anything,您可以在单个 \n
之后为所有 \r
添加一个跳过,相反的情况也是如此。这也将清除任何 "mixed" 行结尾,因为它也会正确处理 \r\n
。
这是我认为更简单的代码,行数减半。当然,正如 Ben Voigt 指出的那样,您无法击败 O(n) 时间,因此我没有尝试这样做。我没有使用任何库函数,因为这样看起来更简单,而且我怀疑额外的函数调用能否使代码更快。
enum lestatus {
le_no_changes_needed = 0,
le_changes_succeeded = 1,
le_changes_failed = -1
};
enum lestatus ConvertLineEndings(char *indata, int inlen,
char *outdata, int outlen)
{
int outpos = 0, inpos;
enum lestatus it_changed = le_no_changes_needed;
for (inpos = 0; inpos<inlen;inpos++) {
if (outpos + 1 > outlen) return le_changes_failed;
if (indata[inpos] != '\r' && indata[inpos] != '\n') {
/* it is an ordinary character, just copy it */
outdata[outpos++] = indata[inpos];
} else if (outpos + 2 > outlen) {
return le_changes_failed;
} else if ((indata[inpos+1] == '\r' || indata[inpos+1] == '\n')
&& indata[inpos] != indata[inpos+1]) {
/* it is \r\n or \n\r, output it in canonical order */
outdata[outpos++] = '\r';
outdata[outpos++] = '\n';
inpos++; /* skip the second character */
} else {
/* it is a mac or unix line ending, convert to dos */
outdata[outpos++] = '\r';
outdata[outpos++] = '\n';
it_changed = le_changes_succeeded;
}
}
return it_changed;
}
我的代码最大的区别在于
- 我使用了增量运算符。
- 为了简单起见,我避免使用库函数。
- 我的函数正确处理了混合结尾的文件(根据我的解释)。
- 我更喜欢小写字符。这显然是一种风格偏好。
- 与#defines 相比,我更喜欢枚举。也是一种风格偏好。