C# 尽可能高效地从字符串中删除回车符、换行符和空格(基准测试)
C# remove carriage returns, line breaks and whitespace from a string as efficiently as possible (benchmark)
在 C# 中,我有一个包含空格、回车符和/或换行符的字符串。是否有一种简单的方法,能够尽可能高效地规范化从文本文件导入的大字符串(100,000 到 1,000,000 个字符)?
澄清我的意思:假设我的字符串看起来像 string1,但我希望它像 string2
// Example input: contains spaces, a carriage return and line feeds.
string1 = " ab c\r\n de.\nf";
// Desired output: the same characters with that whitespace removed.
string2 = "abcde.f";
// One or more consecutive whitespace characters (the regex \s class).
const string whitespaceRun = @"\s+";
string input = " ab c\r\n de.\nf";
// Replace every whitespace run with nothing.
string result = Regex.Replace(input, whitespaceRun, string.Empty);
// result is now "abcde.f"
你可以在此处看到它的实际效果(原文此处为在线演示链接)。
你可以这样做:在配置文件中定义允许保留的特殊字符。在我的例子中,我把它们定义在 appsettings.json 文件中(键名为 AllowedChars):
/// <summary>
/// Returns <paramref name="firstName"/> with every character removed that is neither a
/// letter or digit nor listed in the "AllowedChars" configuration value.
/// </summary>
/// <param name="firstName">Text to filter. Null or empty input yields an empty string.</param>
/// <returns>The kept characters, in their original order.</returns>
private string RemoveUnnecessaryChars(string firstName)
{
    if (string.IsNullOrEmpty(firstName))
    {
        return string.Empty;
    }

    // NOTE(review): assumes the _configuration indexer returns null for a missing key
    // (as IConfiguration does) - confirm. A missing key is treated as "no extra
    // characters allowed" instead of failing later with a NullReferenceException.
    string allowedCharacters = _configuration["AllowedChars"] ?? string.Empty;

    // The result can never be longer than the input, so presize the builder.
    StringBuilder sb = new StringBuilder(firstName.Length);
    foreach (char ch in firstName)
    {
        // IndexOf(char) is an ordinal search available on every framework version.
        if (char.IsLetterOrDigit(ch) || allowedCharacters.IndexOf(ch) >= 0)
        {
            sb.Append(ch);
        }
    }

    return sb.ToString();
}
为了高效地做到这一点,需要避免使用正则表达式,并将内存分配保持在最低限度。这里我使用了原始字符缓冲区(而不是 StringBuilder),并使用 for(而不是 foreach)来优化对每个字符的访问:
/// <summary>
/// Returns a copy of <paramref name="text"/> without spaces, carriage returns or line feeds.
/// </summary>
/// <param name="text">The string to filter.</param>
/// <returns>The remaining characters, in their original order.</returns>
string Strip(string text)
{
    int length = text.Length;

    // Worst case nothing is removed, so a buffer the size of the input always suffices.
    char[] buffer = new char[length];
    int written = 0;

    for (int index = 0; index < length; index++)
    {
        char current = text[index];
        if (current == ' ' || current == '\r' || current == '\n')
        {
            // Ignore them
            continue;
        }

        buffer[written] = current;
        written++;
    }

    // Only the first `written` slots of the buffer hold kept characters.
    return new string(buffer, 0, written);
}
“高效”一词在很大程度上取决于您实际处理的字符串及其数量。我编写了下面的基准测试(BenchmarkDotNet):
/// <summary>
/// Benchmarks comparing several ways of removing spaces, carriage returns and
/// line feeds from the sample string <c>S</c>. Each [Benchmark] method returns the
/// stripped string.
/// </summary>
public class Replace
{
// Short sample input shared by all benchmarks.
private static readonly string S = " ab c\r\n de.\nf";
// Constructed once with RegexOptions.Compiled; matches runs of whitespace (\s+).
private static readonly Regex Reg = new Regex(@"\s+", RegexOptions.Compiled);
/// <summary>Chains three string.Replace calls, one per character to remove.</summary>
[Benchmark]
public string SimpleReplace() => S
.Replace(" ","")
.Replace("\r","")
.Replace("\n","");
/// <summary>Appends S to a new StringBuilder and applies the same three replacements to it.</summary>
[Benchmark]
public string StringBuilder() => new StringBuilder().Append(S)
.Replace(" ","")
.Replace("\r","")
.Replace("\n","")
.ToString();
/// <summary>
/// Removes whitespace runs with the shared regex. Note that \s matches more than
/// space, '\r' and '\n', unlike the other benchmarks.
/// </summary>
[Benchmark]
public string RegexReplace() => Reg.Replace(S, "");
/// <summary>
/// Copies the kept characters into a char array sized to S.Length using an
/// index-based for loop, then builds the string from the filled prefix.
/// </summary>
[Benchmark]
public string NewString()
{
var arr = new char[S.Length];
var cnt = 0;
for (int i = 0; i < S.Length; i++)
{
switch(S[i])
{
case ' ':
case '\r':
case '\n':
// Dropped characters: nothing is written to the buffer.
break;
default:
arr[cnt] = S[i];
cnt++;
break;
}
}
// Only the first cnt entries of arr were written.
return new string(arr, 0, cnt);
}
/// <summary>
/// Same char-array approach as NewString, but iterating the string with foreach.
/// </summary>
[Benchmark]
public string NewStringForeach()
{
var validCharacters = new char[S.Length];
var next = 0;
foreach(var c in S)
{
switch(c)
{
case ' ':
case '\r':
case '\n':
// Ignore them
break;
default:
validCharacters[next++] = c;
break;
}
}
// Only the first next entries of validCharacters were written.
return new string(validCharacters, 0, next);
}
}
在我的机器上得到的结果如下:
| Method | Mean | Error | StdDev |
|---------------- |------------:|----------:|----------:|
| SimpleReplace | 122.09 ns | 1.273 ns | 1.063 ns |
| StringBuilder | 311.28 ns | 6.313 ns | 8.850 ns |
| RegexReplace | 1,194.91 ns | 23.376 ns | 34.265 ns |
| NewString | 52.26 ns | 1.122 ns | 1.812 ns |
|NewStringForeach | 40.04 ns | 0.877 ns | 1.979 ns |
在 C# 中,我有一个包含空格、回车符和/或换行符的字符串。是否有一种简单的方法,能够尽可能高效地规范化从文本文件导入的大字符串(100,000 到 1,000,000 个字符)?
澄清我的意思:假设我的字符串看起来像 string1,但我希望它像 string2
// Example input: contains spaces, a carriage return and line feeds.
string1 = " ab c\r\n de.\nf";
// Desired output: the same characters with that whitespace removed.
string2 = "abcde.f";
// One or more consecutive whitespace characters (the regex \s class).
const string whitespaceRun = @"\s+";
string input = " ab c\r\n de.\nf";
// Replace every whitespace run with nothing.
string result = Regex.Replace(input, whitespaceRun, string.Empty);
// result is now "abcde.f"
你可以在此处看到它的实际效果(原文此处为在线演示链接)。
你可以这样做:在配置文件中定义允许保留的特殊字符。在我的例子中,我把它们定义在 appsettings.json 文件中(键名为 AllowedChars):
/// <summary>
/// Returns <paramref name="firstName"/> with every character removed that is neither a
/// letter or digit nor listed in the "AllowedChars" configuration value.
/// </summary>
/// <param name="firstName">Text to filter. Null or empty input yields an empty string.</param>
/// <returns>The kept characters, in their original order.</returns>
private string RemoveUnnecessaryChars(string firstName)
{
    if (string.IsNullOrEmpty(firstName))
    {
        return string.Empty;
    }

    // NOTE(review): assumes the _configuration indexer returns null for a missing key
    // (as IConfiguration does) - confirm. A missing key is treated as "no extra
    // characters allowed" instead of failing later with a NullReferenceException.
    string allowedCharacters = _configuration["AllowedChars"] ?? string.Empty;

    // The result can never be longer than the input, so presize the builder.
    StringBuilder sb = new StringBuilder(firstName.Length);
    foreach (char ch in firstName)
    {
        // IndexOf(char) is an ordinal search available on every framework version.
        if (char.IsLetterOrDigit(ch) || allowedCharacters.IndexOf(ch) >= 0)
        {
            sb.Append(ch);
        }
    }

    return sb.ToString();
}
为了高效地做到这一点,需要避免使用正则表达式,并将内存分配保持在最低限度。这里我使用了原始字符缓冲区(而不是 StringBuilder),并使用 for(而不是 foreach)来优化对每个字符的访问:
/// <summary>
/// Returns a copy of <paramref name="text"/> without spaces, carriage returns or line feeds.
/// </summary>
/// <param name="text">The string to filter.</param>
/// <returns>The remaining characters, in their original order.</returns>
string Strip(string text)
{
    int length = text.Length;

    // Worst case nothing is removed, so a buffer the size of the input always suffices.
    char[] buffer = new char[length];
    int written = 0;

    for (int index = 0; index < length; index++)
    {
        char current = text[index];
        if (current == ' ' || current == '\r' || current == '\n')
        {
            // Ignore them
            continue;
        }

        buffer[written] = current;
        written++;
    }

    // Only the first `written` slots of the buffer hold kept characters.
    return new string(buffer, 0, written);
}
“高效”一词在很大程度上取决于您实际处理的字符串及其数量。我编写了下面的基准测试(BenchmarkDotNet):
/// <summary>
/// Benchmarks comparing several ways of removing spaces, carriage returns and
/// line feeds from the sample string <c>S</c>. Each [Benchmark] method returns the
/// stripped string.
/// </summary>
public class Replace
{
// Short sample input shared by all benchmarks.
private static readonly string S = " ab c\r\n de.\nf";
// Constructed once with RegexOptions.Compiled; matches runs of whitespace (\s+).
private static readonly Regex Reg = new Regex(@"\s+", RegexOptions.Compiled);
/// <summary>Chains three string.Replace calls, one per character to remove.</summary>
[Benchmark]
public string SimpleReplace() => S
.Replace(" ","")
.Replace("\r","")
.Replace("\n","");
/// <summary>Appends S to a new StringBuilder and applies the same three replacements to it.</summary>
[Benchmark]
public string StringBuilder() => new StringBuilder().Append(S)
.Replace(" ","")
.Replace("\r","")
.Replace("\n","")
.ToString();
/// <summary>
/// Removes whitespace runs with the shared regex. Note that \s matches more than
/// space, '\r' and '\n', unlike the other benchmarks.
/// </summary>
[Benchmark]
public string RegexReplace() => Reg.Replace(S, "");
/// <summary>
/// Copies the kept characters into a char array sized to S.Length using an
/// index-based for loop, then builds the string from the filled prefix.
/// </summary>
[Benchmark]
public string NewString()
{
var arr = new char[S.Length];
var cnt = 0;
for (int i = 0; i < S.Length; i++)
{
switch(S[i])
{
case ' ':
case '\r':
case '\n':
// Dropped characters: nothing is written to the buffer.
break;
default:
arr[cnt] = S[i];
cnt++;
break;
}
}
// Only the first cnt entries of arr were written.
return new string(arr, 0, cnt);
}
/// <summary>
/// Same char-array approach as NewString, but iterating the string with foreach.
/// </summary>
[Benchmark]
public string NewStringForeach()
{
var validCharacters = new char[S.Length];
var next = 0;
foreach(var c in S)
{
switch(c)
{
case ' ':
case '\r':
case '\n':
// Ignore them
break;
default:
validCharacters[next++] = c;
break;
}
}
// Only the first next entries of validCharacters were written.
return new string(validCharacters, 0, next);
}
}
在我的机器上得到的结果如下:
| Method | Mean | Error | StdDev |
|---------------- |------------:|----------:|----------:|
| SimpleReplace | 122.09 ns | 1.273 ns | 1.063 ns |
| StringBuilder | 311.28 ns | 6.313 ns | 8.850 ns |
| RegexReplace | 1,194.91 ns | 23.376 ns | 34.265 ns |
| NewString | 52.26 ns | 1.122 ns | 1.812 ns |
|NewStringForeach | 40.04 ns | 0.877 ns | 1.979 ns |