为什么我的 D 代码查找素数比我的 C++ 代码快得多?
Why is my D code for finding prime numbers much faster than my C++ code?
我分别用 C++ 和 D 语言编写了两个小项目,用来计算指定数量的素数。两个项目中的代码非常相似。然而,我的 D 代码运行速度比我的 C++ 代码快得多,尽管人们通常认为 C++ 更快。我使用 dmd 和 dub 编译 D 代码,使用 clang (LLVM 11.0) 和 Visual C++ 编译 C++ 代码。我用 Visual Studio Code 进行开发,并在命令行中使用 -O3 编译 C++ 程序。如果某些变量名称不一致,请见谅——代码是我匆忙从德语翻译过来的。下面是我的代码:
C++ 实现:
main.cpp:
#include <iostream>
#include <chrono>
#include <vector>
#include "isqrt.hpp"
/// Trial-division primality check.
///
/// Tries every divisor i with 2 <= i <= isqrt(number) and returns false as
/// soon as one divides `number` without remainder; returns true when the loop
/// finishes without finding a divisor.
// NOTE(review): the loop condition calls isqrt(number) again on every
// iteration although `number` never changes. isqrt is defined in a separate
// source file, so the compiler may be unable to hoist the call; computing the
// bound once before the loop avoids the repeated work.
bool prime(int number)
{
for(unsigned i = 2; i < isqrt(number)+1; i++)
{
// A zero remainder means i divides number, so number is not prime.
if (!(number % i))
{
return false;
}
}
return true;
}
/// Interactive benchmark driver.
///
/// Repeatedly asks how many primes to compute, collects them by testing
/// consecutive candidates with prime(), and prints the elapsed time in
/// milliseconds. The loop ends when reading the amount fails (end of input
/// or non-numeric text).
int main(int argc, char *argv[])
{
    std::cout << "Prime numbers\n-------------------------\n" << std::endl;
    while (true)
    {
        std::cout << "Please enter the amount of prime numbers you want to get calculated: ";
        unsigned amount;
        // A failed extraction leaves std::cin in a fail state; without this
        // check every later read fails too and the loop would never terminate.
        if (!(std::cin >> amount))
        {
            break;
        }
        std::vector<unsigned> prime_numbers = {2,3,5};
        unsigned start = 6;
        // steady_clock is monotonic, so the measured interval cannot be
        // distorted by wall-clock adjustments while the search is running.
        const auto before = std::chrono::steady_clock::now();
        while (prime_numbers.size() < amount)
        {
            if (prime(start))
            {
                prime_numbers.push_back(start);
            }
            start++;
        }
        const auto after = std::chrono::steady_clock::now();
        //std::cout << prime_numbers << std::endl;
        const auto diff = after - before;
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << std::endl;
    }
    std::cout << "Why has thou forsaken me?" << std::endl;
    return 0;
}
isqrt.hpp:
#ifndef ISQRT_HPP
#define ISQRT_HPP
// Integer square-root helper; the definition is in isqrt.cpp.
unsigned isqrt(unsigned number);
#endif
isqrt.cpp:
#include "isqrt.hpp"
/// Integer square root.
///
/// Binary-searches the range [1, number / 2 + 1] for the largest value whose
/// square does not exceed `number`.
///
/// @param number value whose square root is wanted
/// @return the largest r with r * r <= number; 0 when number is 0
unsigned isqrt(unsigned number) {
    if (!number) return 0;
    unsigned left = 1;
    unsigned right = (number >> 1) + 1;
    // Initialised so the function can never return an indeterminate value.
    unsigned res = 0;
    while (left <= right) {
        const unsigned mid = left + ((right - left) >> 1);
        // Compare by division instead of mid * mid: the product wraps around
        // for large mid. "<=" (not "<") is needed so that perfect squares
        // yield their exact root, e.g. isqrt(9) == 3.
        if (mid <= number / mid) {
            left = mid + 1;
            res = mid;
        }
        else {
            right = mid - 1;
        }
    }
    return res;
}
D 实现:
main.d:
import std.stdio;
import std.datetime.stopwatch;
import isqrt;
/**
 * Trial-division primality test.
 *
 * Returns false as soon as a value in 2 .. iSqrt(number) (inclusive) divides
 * number without remainder, and true when no such divisor is found.
 */
bool prime(int number)
{
    foreach (divisor; 2 .. iSqrt(number) + 1)
    {
        if (number % divisor == 0)
            return false;
    }
    return true;
}
/**
 * Interactive benchmark loop: reads how many primes to compute, collects them
 * by testing consecutive candidates with prime(), and prints the elapsed time
 * in milliseconds.
 */
void main()
{
    writeln("Prime numbers\n-------------------------\n");
    auto sw = StopWatch(AutoStart.no);
    int amount;
    while (true)
    {
        sw.reset();
        write("Please enter the amount of prime numbers that are to be calculated: ");
        readf("%d\n",amount);
        int[] prime_numbers = [2,3];
        int start = 5;
        sw.start();
        while (prime_numbers.length < amount)
        {
            if (prime(start))
            {
                // Was `primzahlen`, an identifier left over from the German
                // original that is not declared anywhere in this function.
                prime_numbers ~= start;
            }
            start++;
        }
        sw.stop();
        //writefln("%(%s%|, %)\n",prime_numbers);
        writeln(sw.peek.total!"msecs");
    }
}
isqrt.d:
module isqrt;
/**
 * Integer square root.
 *
 * Binary-searches the range [1, number / 2 + 1] for the largest value whose
 * square does not exceed number.
 *
 * Params:
 *   number = value whose square root is wanted
 * Returns: the largest r with r * r <= number; 0 when number is 0
 */
public uint iSqrt(uint number) {
    if (!number) return 0;
    uint left = 1;
    // The parentheses are required: `+` binds tighter than `>>`, so the
    // unparenthesised `number >> 1 + 1` is evaluated as `number >> 2` and
    // makes the search range too small for small inputs (e.g. 4 .. 9).
    uint right = (number >> 1) + 1;
    uint res;
    uint mid;
    while (left <= right) {
        mid = left + ((right - left) >> 1);
        // Division-based comparison avoids overflowing mid * mid.
        if (mid <= number / mid) {
            left = mid + 1;
            res = mid;
        }
        else {
            right = mid - 1;
        }
    }
    return res;
}
如果你稍微改变你的 main.cpp
diff main.old.cpp main.cpp
9c9,10
< for(unsigned i = 2; i < isqrt(number)+1; i++)
---
> int max = isqrt(number) + 1;
> for(unsigned i = 2; i < max; i++)
并重新构建您的 C++ 代码
g++ -O3 -o cppisqrt isqrt.cpp main.cpp
你会看到 C++ 版本只比用下面的命令编译的 D 版本慢一点:
gdc -O3 -o disqrt isqrt.d main.d
为了公平起见,我使用了相同的编译器后端(G++ 和 GDC 都使用 GCC 后端)。在我的家庭工作站上,C++ 变体需要 ~1140ms(10 个样本的平均值)来处理 200000 个数字,而 D 变体需要 ~1090ms(也是 10 个样本的平均值)。
G++ 和 GDC 生成的代码相似:
https://godbolt.org 上的 GCC 10.2 x86-64 生成
# Compiler listing for the C++ isqrt(unsigned).
# Stack slots: [rbp-20] = number (copied from edi), [rbp-4] = left,
#              [rbp-8] = right, [rbp-16] = mid, [rbp-12] = res.
# NOTE(review): the frame-pointer prologue and the stack-resident locals suggest
# this listing comes from a build without optimisation, unlike the -O3 builds
# that were timed - confirm the compiler flags used on godbolt.
isqrt(unsigned int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
cmp DWORD PTR [rbp-20], 0
jne .L2
mov eax, 0
jmp .L3
# left = 1; right = (number >> 1) + 1
.L2:
mov DWORD PTR [rbp-4], 1
mov eax, DWORD PTR [rbp-20]
shr eax
add eax, 1
mov DWORD PTR [rbp-8], eax
# Loop head: leave the loop once left > right.
.L7:
mov eax, DWORD PTR [rbp-4]
cmp eax, DWORD PTR [rbp-8]
ja .L4
mov eax, DWORD PTR [rbp-8]
sub eax, DWORD PTR [rbp-4]
shr eax
mov edx, eax
mov eax, DWORD PTR [rbp-4]
add eax, edx
mov DWORD PTR [rbp-16], eax
# mid * mid is computed with imul and compared against number.
mov eax, DWORD PTR [rbp-16]
imul eax, eax
cmp DWORD PTR [rbp-20], eax
jbe .L5
mov eax, DWORD PTR [rbp-16]
add eax, 1
mov DWORD PTR [rbp-4], eax
mov eax, DWORD PTR [rbp-16]
mov DWORD PTR [rbp-12], eax
jmp .L7
# right = mid - 1
.L5:
mov eax, DWORD PTR [rbp-16]
sub eax, 1
mov DWORD PTR [rbp-8], eax
jmp .L7
# Return res.
.L4:
mov eax, DWORD PTR [rbp-12]
.L3:
pop rbp
ret
https://godbolt.org 上的 GDC 10.2 x86-64 产生
# Compiler listing for the D iSqrt(uint).
# Stack slots: [rbp-20] = number (copied from edi), [rbp-4] = left,
#              [rbp-8] = right, [rbp-16] = mid, [rbp-12] = res.
# NOTE(review): the frame-pointer prologue and the stack-resident locals suggest
# this listing comes from a build without optimisation, unlike the -O3 builds
# that were timed - confirm the compiler flags used on godbolt.
uint example.iSqrt(uint):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
cmp DWORD PTR [rbp-20], 0
jne .L2
mov eax, 0
jmp .L3
# left = 1; right = number >> 2. The single shift by 2 is how the source
# expression `number >> 1 + 1` was compiled. res and mid are zeroed.
.L2:
mov DWORD PTR [rbp-4], 1
mov eax, DWORD PTR [rbp-20]
shr eax, 2
mov DWORD PTR [rbp-8], eax
mov DWORD PTR [rbp-12], 0
mov DWORD PTR [rbp-16], 0
# Loop head: leave the loop once left > right.
.L7:
mov eax, DWORD PTR [rbp-4]
cmp eax, DWORD PTR [rbp-8]
ja .L4
mov eax, DWORD PTR [rbp-8]
sub eax, DWORD PTR [rbp-4]
shr eax
mov edx, eax
mov eax, DWORD PTR [rbp-4]
add eax, edx
mov DWORD PTR [rbp-16], eax
# number / mid is computed with div and compared against mid.
mov eax, DWORD PTR [rbp-20]
mov edx, 0
div DWORD PTR [rbp-16]
cmp DWORD PTR [rbp-16], eax
ja .L5
mov eax, DWORD PTR [rbp-16]
add eax, 1
mov DWORD PTR [rbp-4], eax
mov eax, DWORD PTR [rbp-16]
mov DWORD PTR [rbp-12], eax
jmp .L7
# right = mid - 1
.L5:
mov eax, DWORD PTR [rbp-16]
sub eax, 1
mov DWORD PTR [rbp-8], eax
jmp .L7
# Return res.
.L4:
mov eax, DWORD PTR [rbp-12]
.L3:
pop rbp
ret
PS:在这种情况下,D 版本不需要 public 说明符(D 中模块级符号默认就是 public)。
我分别用 C++ 和 D 语言编写了两个小项目,用来计算指定数量的素数。两个项目中的代码非常相似。然而,我的 D 代码运行速度比我的 C++ 代码快得多,尽管人们通常认为 C++ 更快。我使用 dmd 和 dub 编译 D 代码,使用 clang (LLVM 11.0) 和 Visual C++ 编译 C++ 代码。我用 Visual Studio Code 进行开发,并在命令行中使用 -O3 编译 C++ 程序。如果某些变量名称不一致,请见谅——代码是我匆忙从德语翻译过来的。下面是我的代码:
C++ 实现:
main.cpp:
#include <iostream>
#include <chrono>
#include <vector>
#include "isqrt.hpp"
/// Trial-division primality check.
///
/// Tries every divisor i with 2 <= i <= isqrt(number) and returns false as
/// soon as one divides `number` without remainder; returns true when the loop
/// finishes without finding a divisor.
// NOTE(review): the loop condition calls isqrt(number) again on every
// iteration although `number` never changes. isqrt is defined in a separate
// source file, so the compiler may be unable to hoist the call; computing the
// bound once before the loop avoids the repeated work.
bool prime(int number)
{
for(unsigned i = 2; i < isqrt(number)+1; i++)
{
// A zero remainder means i divides number, so number is not prime.
if (!(number % i))
{
return false;
}
}
return true;
}
/// Interactive benchmark driver.
///
/// Repeatedly asks how many primes to compute, collects them by testing
/// consecutive candidates with prime(), and prints the elapsed time in
/// milliseconds. The loop ends when reading the amount fails (end of input
/// or non-numeric text).
int main(int argc, char *argv[])
{
    std::cout << "Prime numbers\n-------------------------\n" << std::endl;
    while (true)
    {
        std::cout << "Please enter the amount of prime numbers you want to get calculated: ";
        unsigned amount;
        // A failed extraction leaves std::cin in a fail state; without this
        // check every later read fails too and the loop would never terminate.
        if (!(std::cin >> amount))
        {
            break;
        }
        std::vector<unsigned> prime_numbers = {2,3,5};
        unsigned start = 6;
        // steady_clock is monotonic, so the measured interval cannot be
        // distorted by wall-clock adjustments while the search is running.
        const auto before = std::chrono::steady_clock::now();
        while (prime_numbers.size() < amount)
        {
            if (prime(start))
            {
                prime_numbers.push_back(start);
            }
            start++;
        }
        const auto after = std::chrono::steady_clock::now();
        //std::cout << prime_numbers << std::endl;
        const auto diff = after - before;
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << std::endl;
    }
    std::cout << "Why has thou forsaken me?" << std::endl;
    return 0;
}
isqrt.hpp:
#ifndef ISQRT_HPP
#define ISQRT_HPP
// Integer square-root helper; the definition is in isqrt.cpp.
unsigned isqrt(unsigned number);
#endif
isqrt.cpp:
#include "isqrt.hpp"
/// Integer square root.
///
/// Binary-searches the range [1, number / 2 + 1] for the largest value whose
/// square does not exceed `number`.
///
/// @param number value whose square root is wanted
/// @return the largest r with r * r <= number; 0 when number is 0
unsigned isqrt(unsigned number) {
    if (!number) return 0;
    unsigned left = 1;
    unsigned right = (number >> 1) + 1;
    // Initialised so the function can never return an indeterminate value.
    unsigned res = 0;
    while (left <= right) {
        const unsigned mid = left + ((right - left) >> 1);
        // Compare by division instead of mid * mid: the product wraps around
        // for large mid. "<=" (not "<") is needed so that perfect squares
        // yield their exact root, e.g. isqrt(9) == 3.
        if (mid <= number / mid) {
            left = mid + 1;
            res = mid;
        }
        else {
            right = mid - 1;
        }
    }
    return res;
}
D 实现:
main.d:
import std.stdio;
import std.datetime.stopwatch;
import isqrt;
/**
 * Trial-division primality test.
 *
 * Returns false as soon as a value in 2 .. iSqrt(number) (inclusive) divides
 * number without remainder, and true when no such divisor is found.
 */
bool prime(int number)
{
    foreach (divisor; 2 .. iSqrt(number) + 1)
    {
        if (number % divisor == 0)
            return false;
    }
    return true;
}
/**
 * Interactive benchmark loop: reads how many primes to compute, collects them
 * by testing consecutive candidates with prime(), and prints the elapsed time
 * in milliseconds.
 */
void main()
{
    writeln("Prime numbers\n-------------------------\n");
    auto sw = StopWatch(AutoStart.no);
    int amount;
    while (true)
    {
        sw.reset();
        write("Please enter the amount of prime numbers that are to be calculated: ");
        readf("%d\n",amount);
        int[] prime_numbers = [2,3];
        int start = 5;
        sw.start();
        while (prime_numbers.length < amount)
        {
            if (prime(start))
            {
                // Was `primzahlen`, an identifier left over from the German
                // original that is not declared anywhere in this function.
                prime_numbers ~= start;
            }
            start++;
        }
        sw.stop();
        //writefln("%(%s%|, %)\n",prime_numbers);
        writeln(sw.peek.total!"msecs");
    }
}
isqrt.d:
module isqrt;
/**
 * Integer square root.
 *
 * Binary-searches the range [1, number / 2 + 1] for the largest value whose
 * square does not exceed number.
 *
 * Params:
 *   number = value whose square root is wanted
 * Returns: the largest r with r * r <= number; 0 when number is 0
 */
public uint iSqrt(uint number) {
    if (!number) return 0;
    uint left = 1;
    // The parentheses are required: `+` binds tighter than `>>`, so the
    // unparenthesised `number >> 1 + 1` is evaluated as `number >> 2` and
    // makes the search range too small for small inputs (e.g. 4 .. 9).
    uint right = (number >> 1) + 1;
    uint res;
    uint mid;
    while (left <= right) {
        mid = left + ((right - left) >> 1);
        // Division-based comparison avoids overflowing mid * mid.
        if (mid <= number / mid) {
            left = mid + 1;
            res = mid;
        }
        else {
            right = mid - 1;
        }
    }
    return res;
}
如果你稍微改变你的 main.cpp
diff main.old.cpp main.cpp
9c9,10
< for(unsigned i = 2; i < isqrt(number)+1; i++)
---
> int max = isqrt(number) + 1;
> for(unsigned i = 2; i < max; i++)
并重新构建您的 C++ 代码
g++ -O3 -o cppisqrt isqrt.cpp main.cpp
你会看到 C++ 版本只比用下面的命令编译的 D 版本慢一点:
gdc -O3 -o disqrt isqrt.d main.d
为了公平起见,我使用了相同的编译器后端(G++ 和 GDC 都使用 GCC 后端)。在我的家庭工作站上,C++ 变体需要 ~1140ms(10 个样本的平均值)来处理 200000 个数字,而 D 变体需要 ~1090ms(也是 10 个样本的平均值)。
G++ 和 GDC 生成的代码相似:
https://godbolt.org 上的 GCC 10.2 x86-64 生成
# Compiler listing for the C++ isqrt(unsigned).
# Stack slots: [rbp-20] = number (copied from edi), [rbp-4] = left,
#              [rbp-8] = right, [rbp-16] = mid, [rbp-12] = res.
# NOTE(review): the frame-pointer prologue and the stack-resident locals suggest
# this listing comes from a build without optimisation, unlike the -O3 builds
# that were timed - confirm the compiler flags used on godbolt.
isqrt(unsigned int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
cmp DWORD PTR [rbp-20], 0
jne .L2
mov eax, 0
jmp .L3
# left = 1; right = (number >> 1) + 1
.L2:
mov DWORD PTR [rbp-4], 1
mov eax, DWORD PTR [rbp-20]
shr eax
add eax, 1
mov DWORD PTR [rbp-8], eax
# Loop head: leave the loop once left > right.
.L7:
mov eax, DWORD PTR [rbp-4]
cmp eax, DWORD PTR [rbp-8]
ja .L4
mov eax, DWORD PTR [rbp-8]
sub eax, DWORD PTR [rbp-4]
shr eax
mov edx, eax
mov eax, DWORD PTR [rbp-4]
add eax, edx
mov DWORD PTR [rbp-16], eax
# mid * mid is computed with imul and compared against number.
mov eax, DWORD PTR [rbp-16]
imul eax, eax
cmp DWORD PTR [rbp-20], eax
jbe .L5
mov eax, DWORD PTR [rbp-16]
add eax, 1
mov DWORD PTR [rbp-4], eax
mov eax, DWORD PTR [rbp-16]
mov DWORD PTR [rbp-12], eax
jmp .L7
# right = mid - 1
.L5:
mov eax, DWORD PTR [rbp-16]
sub eax, 1
mov DWORD PTR [rbp-8], eax
jmp .L7
# Return res.
.L4:
mov eax, DWORD PTR [rbp-12]
.L3:
pop rbp
ret
https://godbolt.org 上的 GDC 10.2 x86-64 产生
# Compiler listing for the D iSqrt(uint).
# Stack slots: [rbp-20] = number (copied from edi), [rbp-4] = left,
#              [rbp-8] = right, [rbp-16] = mid, [rbp-12] = res.
# NOTE(review): the frame-pointer prologue and the stack-resident locals suggest
# this listing comes from a build without optimisation, unlike the -O3 builds
# that were timed - confirm the compiler flags used on godbolt.
uint example.iSqrt(uint):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
cmp DWORD PTR [rbp-20], 0
jne .L2
mov eax, 0
jmp .L3
# left = 1; right = number >> 2. The single shift by 2 is how the source
# expression `number >> 1 + 1` was compiled. res and mid are zeroed.
.L2:
mov DWORD PTR [rbp-4], 1
mov eax, DWORD PTR [rbp-20]
shr eax, 2
mov DWORD PTR [rbp-8], eax
mov DWORD PTR [rbp-12], 0
mov DWORD PTR [rbp-16], 0
# Loop head: leave the loop once left > right.
.L7:
mov eax, DWORD PTR [rbp-4]
cmp eax, DWORD PTR [rbp-8]
ja .L4
mov eax, DWORD PTR [rbp-8]
sub eax, DWORD PTR [rbp-4]
shr eax
mov edx, eax
mov eax, DWORD PTR [rbp-4]
add eax, edx
mov DWORD PTR [rbp-16], eax
# number / mid is computed with div and compared against mid.
mov eax, DWORD PTR [rbp-20]
mov edx, 0
div DWORD PTR [rbp-16]
cmp DWORD PTR [rbp-16], eax
ja .L5
mov eax, DWORD PTR [rbp-16]
add eax, 1
mov DWORD PTR [rbp-4], eax
mov eax, DWORD PTR [rbp-16]
mov DWORD PTR [rbp-12], eax
jmp .L7
# right = mid - 1
.L5:
mov eax, DWORD PTR [rbp-16]
sub eax, 1
mov DWORD PTR [rbp-8], eax
jmp .L7
# Return res.
.L4:
mov eax, DWORD PTR [rbp-12]
.L3:
pop rbp
ret
PS:在这种情况下,D 版本不需要 public 说明符(D 中模块级符号默认就是 public)。