64 位整数上的 C++ 与 C# 按位运算 - 性能
C++ vs C# bitwise operations on 64-bit ints - performance
我将 2D 位域存储在 5 个无符号长整数的数组中。
我要争取最好的表现。
我在 C# 中工作,但我试图通过在 C++ 中实现我的 class 来设置基准。
这里的问题是 C# 实现需要大约 10 秒才能完成,而 C++ 需要大约 1 秒才能完成,这使得它 快了 10 倍。 C++ 是在 VS2015 中构建的 x64。 C# 在 x64 VS2015 .NET 4.6 中。当然两者都在发布中。
编辑: 稍微优化 C# 代码后,它仍然需要 7 到 8 秒,而 C++ 需要 1.3 秒。
注意: x86 中的 C++ 大约需要 6 秒才能完成。我是运行64位机器上的代码
问题:是什么让 C++ 更快?有没有办法优化 C# 代码,使其至少同样快? (也许是一些不安全的魔法?)
令我困惑的是,我们只是在谈论遍历数组和按位运算。它不应该被 JIT 化为与 C++ 几乎相同的东西吗?
示例代码:
实现中有两个简单的函数。 Left() 和 Right() 分别将整个文件向左移动 1 位。在多头之间携带适当的位。
C++
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
class BitField
{
private:
unsigned long long LEFTMOST_BIT = 0x8000000000000000;
unsigned long long RIGHTMOST_BIT = 1;
public:
unsigned long long Cells_l[5];
BitField()
{
for (size_t i = 0; i < 5; i++)
{
Cells_l[i] = rand(); // Random initialization
}
}
void Left()
{
unsigned long long carry = 0;
unsigned long long nextCarry = 0;
for (int i = 0; i < 5; i++)
{
nextCarry = (Cells_l[i] & LEFTMOST_BIT) >> 63;
Cells_l[i] = Cells_l[i] << 1 | carry;
carry = nextCarry;
}
}
void Right()
{
unsigned long long carry = 0;
unsigned long long nextCarry = 0;
for (int i = 4; i >= 0; i--)
{
nextCarry = (Cells_l[i] & RIGHTMOST_BIT) << 63;
Cells_l[i] = Cells_l[i] >> 1 | carry;
carry = nextCarry;
}
}
};
int main()
{
BitField bf;
high_resolution_clock::time_point t1 = high_resolution_clock::now();
for (int i = 0; i < 100000000; i++)
{
bf.Left();
bf.Left();
bf.Left();
bf.Right();
bf.Right();
bf.Left();
bf.Right();
bf.Right();
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto duration = duration_cast<milliseconds>(t2 - t1).count();
cout << "Time: " << duration << endl << endl;
// Print to avoid compiler optimizations
for (size_t i = 0; i < 5; i++)
{
cout << bf.Cells_l[i] << endl;
}
return 0;
}
C#
using System;
using System.Diagnostics;
namespace TestCS
{
class BitField
{
const ulong LEFTMOST_BIT = 0x8000000000000000;
const ulong RIGHTMOST_BIT = 1;
static Random rnd = new Random();
ulong[] Cells;
public BitField()
{
Cells = new ulong[5];
for (int i = 0; i < 5; i++)
{
Cells[i] = (ulong)rnd.Next(); // Random initialization
}
}
public void Left()
{
ulong carry = 0;
ulong nextCarry = 0;
for (int i = 0; i < 5; i++)
{
nextCarry = (Cells[i] & LEFTMOST_BIT) >> 63;
Cells[i] = Cells[i] << 1 | carry;
carry = nextCarry;
}
}
public void Right()
{
ulong carry = 0;
ulong nextCarry = 0;
for (int i = 4; i >= 0; i--)
{
nextCarry = (Cells[i] & RIGHTMOST_BIT) << 63;
Cells[i] = Cells[i] >> 1 | carry;
carry = nextCarry;
}
}
}
class Program
{
static void Main(string[] args)
{
BitField bf = new BitField();
Stopwatch sw = new Stopwatch();
// Call to remove the compilation time from measurements
bf.Left();
bf.Right();
sw.Start();
for (int i = 0; i < 100000000; i++)
{
bf.Left();
bf.Left();
bf.Left();
bf.Right();
bf.Right();
bf.Left();
bf.Right();
bf.Right();
}
sw.Stop();
Console.WriteLine($"Done in: {sw.Elapsed.TotalMilliseconds.ToString()}ms");
}
}
}
编辑: 修复了示例代码中的 "nextCarry" 拼写错误。
部分差异可能是因为两个版本之间的代码差异 - 您不会在 C++ Left
或 C# Right
中分配给 nextCarry
, 但这些可能是示例中的拼写错误。
您想查看两者的反汇编以了解差异,但这主要是由于 C++ 编译器有更多时间用于优化代码。在这种情况下,它展开循环,内联所有函数调用(包括构造函数),并将 Cells_l
中的所有内容推送到寄存器中。所以有一个大循环使用寄存器并且没有访问内存。
我没有看过 C# 编译的输出,但我怀疑它做的与此相近。
此外,如评论中所述,将 C# 代码中的所有 Cells.Length
调用替换为 5(就像在 C++ 代码中一样)。
我从评论和@AntoninLejsek 删除的答案中获得了足够的信息,我可以自己回答这个问题。
TL;DR C++ 编译器在优化方面做得更好,而 C# 管理的数组访问在循环中完成时成本很高。然而,不安全的代码和固定的访问权限不足以与 C++ 相媲美。
看来我们需要手动优化 C# 代码才能获得与 C++ 相当的性能。
- 展开循环
- 对固定数组访问使用不安全代码
- 不要重复访问数组 - 而是将项目存储到局部变量中。
以下 C# 代码的运行速度与 C++ 代码一样快(实际上快了大约 100 毫秒)。在 .NET 4.6 VS 2015 版本 x64 上编译。
unsafe struct BitField
{
static Random rnd = new Random();
public fixed ulong Cells[5];
public BitField(int nothing)
{
fixed (ulong* p = Cells)
{
for (int i = 0; i < 5; i++)
{
p[i] = (ulong)rnd.Next(); // Just some random number
}
}
}
public void StuffUnrolledNonManaged()
{
ulong u0;
ulong u1;
ulong u2;
ulong u3;
ulong u4;
fixed (ulong *p = Cells)
{
u0 = p[0];
u1 = p[1];
u2 = p[2];
u3 = p[3];
u4 = p[4];
}
ulong carry = 0;
ulong nextCarry = 0;
for (int i = 0; i < 100000000; i++)
{
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
}
fixed (ulong* p = Cells)
{
p[0] = u0;
p[1] = u1;
p[2] = u2;
p[3] = u3;
p[4] = u4;
}
}
测试代码
static void Main(string[] args)
{
BitField bf = new BitField(0);
Stopwatch sw = new Stopwatch();
// Call to remove the compilation time from measurements
bf.StuffUnrolledNonManaged();
sw.Start();
bf.StuffUnrolledNonManaged();
sw.Stop();
Console.WriteLine($"Non managed access unrolled in: {sw.Elapsed.TotalMilliseconds.ToString()}ms");
}
此代码在大约 1.1 秒后完成。
注:仅固定数组访问不足以匹配C++性能。如果我们不使用局部变量 - u0 的每个实例都被 p[0] 等替换。时间约为 3.6 秒.
如果我们仅对问题代码使用固定访问(在循环中调用 Left() 和 Right() 函数)。时间约为5.8秒.
我将 2D 位域存储在 5 个无符号长整数的数组中。 我要争取最好的表现。 我在 C# 中工作,但我试图通过在 C++ 中实现我的 class 来设置基准。
这里的问题是 C# 实现需要大约 10 秒才能完成,而 C++ 需要大约 1 秒才能完成,这使得它 快了 10 倍。 C++ 是在 VS2015 中构建的 x64。 C# 在 x64 VS2015 .NET 4.6 中。当然两者都在发布中。
编辑: 稍微优化 C# 代码后,它仍然需要 7 到 8 秒,而 C++ 需要 1.3 秒。
注意: x86 中的 C++ 大约需要 6 秒才能完成。我是运行64位机器上的代码
问题:是什么让 C++ 更快?有没有办法优化 C# 代码,使其至少同样快? (也许是一些不安全的魔法?)
令我困惑的是,我们只是在谈论遍历数组和按位运算。它不应该被 JIT 化为与 C++ 几乎相同的东西吗?
示例代码: 实现中有两个简单的函数。 Left() 和 Right() 分别将整个文件向左移动 1 位。在多头之间携带适当的位。
C++
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
class BitField
{
private:
unsigned long long LEFTMOST_BIT = 0x8000000000000000;
unsigned long long RIGHTMOST_BIT = 1;
public:
unsigned long long Cells_l[5];
BitField()
{
for (size_t i = 0; i < 5; i++)
{
Cells_l[i] = rand(); // Random initialization
}
}
void Left()
{
unsigned long long carry = 0;
unsigned long long nextCarry = 0;
for (int i = 0; i < 5; i++)
{
nextCarry = (Cells_l[i] & LEFTMOST_BIT) >> 63;
Cells_l[i] = Cells_l[i] << 1 | carry;
carry = nextCarry;
}
}
void Right()
{
unsigned long long carry = 0;
unsigned long long nextCarry = 0;
for (int i = 4; i >= 0; i--)
{
nextCarry = (Cells_l[i] & RIGHTMOST_BIT) << 63;
Cells_l[i] = Cells_l[i] >> 1 | carry;
carry = nextCarry;
}
}
};
int main()
{
BitField bf;
high_resolution_clock::time_point t1 = high_resolution_clock::now();
for (int i = 0; i < 100000000; i++)
{
bf.Left();
bf.Left();
bf.Left();
bf.Right();
bf.Right();
bf.Left();
bf.Right();
bf.Right();
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto duration = duration_cast<milliseconds>(t2 - t1).count();
cout << "Time: " << duration << endl << endl;
// Print to avoid compiler optimizations
for (size_t i = 0; i < 5; i++)
{
cout << bf.Cells_l[i] << endl;
}
return 0;
}
C#
using System;
using System.Diagnostics;
namespace TestCS
{
class BitField
{
const ulong LEFTMOST_BIT = 0x8000000000000000;
const ulong RIGHTMOST_BIT = 1;
static Random rnd = new Random();
ulong[] Cells;
public BitField()
{
Cells = new ulong[5];
for (int i = 0; i < 5; i++)
{
Cells[i] = (ulong)rnd.Next(); // Random initialization
}
}
public void Left()
{
ulong carry = 0;
ulong nextCarry = 0;
for (int i = 0; i < 5; i++)
{
nextCarry = (Cells[i] & LEFTMOST_BIT) >> 63;
Cells[i] = Cells[i] << 1 | carry;
carry = nextCarry;
}
}
public void Right()
{
ulong carry = 0;
ulong nextCarry = 0;
for (int i = 4; i >= 0; i--)
{
nextCarry = (Cells[i] & RIGHTMOST_BIT) << 63;
Cells[i] = Cells[i] >> 1 | carry;
carry = nextCarry;
}
}
}
class Program
{
static void Main(string[] args)
{
BitField bf = new BitField();
Stopwatch sw = new Stopwatch();
// Call to remove the compilation time from measurements
bf.Left();
bf.Right();
sw.Start();
for (int i = 0; i < 100000000; i++)
{
bf.Left();
bf.Left();
bf.Left();
bf.Right();
bf.Right();
bf.Left();
bf.Right();
bf.Right();
}
sw.Stop();
Console.WriteLine($"Done in: {sw.Elapsed.TotalMilliseconds.ToString()}ms");
}
}
}
编辑: 修复了示例代码中的 "nextCarry" 拼写错误。
部分差异可能是因为两个版本之间的代码差异 - 您不会在 C++ Left
或 C# Right
中分配给 nextCarry
, 但这些可能是示例中的拼写错误。
您想查看两者的反汇编以了解差异,但这主要是由于 C++ 编译器有更多时间用于优化代码。在这种情况下,它展开循环,内联所有函数调用(包括构造函数),并将 Cells_l
中的所有内容推送到寄存器中。所以有一个大循环使用寄存器并且没有访问内存。
我没有看过 C# 编译的输出,但我怀疑它做的与此相近。
此外,如评论中所述,将 C# 代码中的所有 Cells.Length
调用替换为 5(就像在 C++ 代码中一样)。
我从评论和@AntoninLejsek 删除的答案中获得了足够的信息,我可以自己回答这个问题。
TL;DR C++ 编译器在优化方面做得更好,而 C# 管理的数组访问在循环中完成时成本很高。然而,不安全的代码和固定的访问权限不足以与 C++ 相媲美。
看来我们需要手动优化 C# 代码才能获得与 C++ 相当的性能。
- 展开循环
- 对固定数组访问使用不安全代码
- 不要重复访问数组 - 而是将项目存储到局部变量中。
以下 C# 代码的运行速度与 C++ 代码一样快(实际上快了大约 100 毫秒)。在 .NET 4.6 VS 2015 版本 x64 上编译。
unsafe struct BitField
{
static Random rnd = new Random();
public fixed ulong Cells[5];
public BitField(int nothing)
{
fixed (ulong* p = Cells)
{
for (int i = 0; i < 5; i++)
{
p[i] = (ulong)rnd.Next(); // Just some random number
}
}
}
public void StuffUnrolledNonManaged()
{
ulong u0;
ulong u1;
ulong u2;
ulong u3;
ulong u4;
fixed (ulong *p = Cells)
{
u0 = p[0];
u1 = p[1];
u2 = p[2];
u3 = p[3];
u4 = p[4];
}
ulong carry = 0;
ulong nextCarry = 0;
for (int i = 0; i < 100000000; i++)
{
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
}
fixed (ulong* p = Cells)
{
p[0] = u0;
p[1] = u1;
p[2] = u2;
p[3] = u3;
p[4] = u4;
}
}
测试代码
static void Main(string[] args)
{
BitField bf = new BitField(0);
Stopwatch sw = new Stopwatch();
// Call to remove the compilation time from measurements
bf.StuffUnrolledNonManaged();
sw.Start();
bf.StuffUnrolledNonManaged();
sw.Stop();
Console.WriteLine($"Non managed access unrolled in: {sw.Elapsed.TotalMilliseconds.ToString()}ms");
}
此代码在大约 1.1 秒后完成。
注:仅固定数组访问不足以匹配C++性能。如果我们不使用局部变量 - u0 的每个实例都被 p[0] 等替换。时间约为 3.6 秒.
如果我们仅对问题代码使用固定访问(在循环中调用 Left() 和 Right() 函数)。时间约为5.8秒.