为什么 gcc 和 clang 会为 std::find 生成这么多代码?
Why do gcc and clang generate so much code for std::find?
我将以下代码输入godbolt.org,并用gcc 10.1和clang 10编译:
#include <algorithm>
#include <vector>
typedef std::vector<int> V;
template<class InputIt, class T>
InputIt myfind(InputIt first, InputIt last, const T& value) {
for (; first != last; ++first) {
if (*first == value) {
return first;
}
}
return last;
}
V::iterator my_find_int(V& v, int i) {
return myfind(v.begin(), v.end(), i);
}
V::iterator std_find_int(V& v, int i) {
return std::find(v.begin(), v.end(), i);
}
使用 -O3
或 -Os
,两个编译器都会生成我所期望的 my_find_int
(gcc 10.1, -Os
):
my_find_int(std::vector<int, std::allocator<int> >&, int):
mov rdx, QWORD PTR [rdi+8]
mov rax, QWORD PTR [rdi]
.L3:
mov r8, rax
cmp rdx, rax
je .L2
add rax, 4
cmp DWORD PTR [rax-4], esi
jne .L3
.L2:
mov rax, r8
ret
但是,对于 std_find_int
,使用 -O3
或 -Os
,它们都会生成 十几个 指令(gcc 10.1,-Os
):
std_find_int(std::vector<int, std::allocator<int> >&, int):
mov rax, rdi
mov rdi, QWORD PTR [rdi+8]
mov rdx, QWORD PTR [rax]
mov rcx, rdi
sub rcx, rdx
sar rcx, 4
.L12:
mov rax, rdx
test rcx, rcx
jle .L7
cmp DWORD PTR [rdx], esi
je .L8
cmp DWORD PTR [rdx+4], esi
jne .L9
add rax, 4
ret
.L9:
cmp DWORD PTR [rdx+8], esi
jne .L10
add rax, 8
ret
.L10:
lea rdx, [rdx+16]
cmp DWORD PTR [rax+12], esi
jne .L11
add rax, 12
ret
.L11:
dec rcx
jmp .L12
.L7:
mov rdx, rdi
sub rdx, rax
cmp rdx, 8
je .L13
cmp rdx, 12
je .L14
cmp rdx, 4
jne .L23
jmp .L15
.L14:
cmp esi, DWORD PTR [rax]
je .L8
add rax, 4
.L13:
cmp esi, DWORD PTR [rax]
je .L8
add rax, 4
.L15:
cmp esi, DWORD PTR [rax]
je .L8
.L23:
mov rax, rdi
.L8:
ret
根据 cppreference.com,myfind
是 std::find
的有效实现(他们将其描述为 std::find
的“可能实现”)。
该行为似乎与版本无关;回到至少 4.9 的每个主要版本的 gcc 的输出看起来都相似。
看起来 my_find_int
和 std_find_int
在功能上应该是相同的,那么为什么在使用 std::find
时两个编译器生成这么多代码?
原因很简单:随机访问迭代器std::find
的实现不是简单的for
循环,而是something more complicated:
template<typename _RandomAccessIterator, typename _Predicate>
_GLIBCXX20_CONSTEXPR
_RandomAccessIterator
__find_if(_RandomAccessIterator __first, _RandomAccessIterator __last,
_Predicate __pred, random_access_iterator_tag)
{
typename iterator_traits<_RandomAccessIterator>::difference_type
__trip_count = (__last - __first) >> 2;
for (; __trip_count > 0; --__trip_count)
{
if (__pred(__first))
return __first;
++__first;
if (__pred(__first))
return __first;
++__first;
if (__pred(__first))
return __first;
++__first;
if (__pred(__first))
return __first;
++__first;
}
switch (__last - __first)
{
case 3:
if (__pred(__first))
return __first;
++__first;
// FALLTHRU
case 2:
if (__pred(__first))
return __first;
++__first;
// FALLTHRU
case 1:
if (__pred(__first))
return __first;
++__first;
// FALLTHRU
case 0:
default:
return __last;
}
}
循环是手动展开的,因此每次迭代不仅包含一个谓词调用,还包含四个调用。 std::find
是根据 __find_if
实现的,谓词是比较。
这个实现可以追溯到 SGI STL, at least. Alexander Stepanov explains:
Typically people unroll by 4
or 8
but not more. The main reason that people don’t go beyond 8
has to do with the law of diminishing returns. The point of loop unrolling is to gain a decent percent improvement in the ratio loop overhead to overall code. Starting with, say 30% loop overhead, unrolling by a factor of 4
leaves us with about 8% overhead. Unrolling by a factor of 8
brings it down to a 4% overhead. Overhead below 4% is commonly viewed as noise – results could vary from CPU to CPU, etc. In research we do unroll loops – 30% does not matter when we only want to demonstrate feasibility. But when it is time to transfer code to real applications then unrolling can be worth considering.
我将以下代码输入godbolt.org,并用gcc 10.1和clang 10编译:
#include <algorithm>
#include <vector>
typedef std::vector<int> V;
template<class InputIt, class T>
InputIt myfind(InputIt first, InputIt last, const T& value) {
for (; first != last; ++first) {
if (*first == value) {
return first;
}
}
return last;
}
V::iterator my_find_int(V& v, int i) {
return myfind(v.begin(), v.end(), i);
}
V::iterator std_find_int(V& v, int i) {
return std::find(v.begin(), v.end(), i);
}
使用 -O3
或 -Os
,两个编译器都会生成我所期望的 my_find_int
(gcc 10.1, -Os
):
my_find_int(std::vector<int, std::allocator<int> >&, int):
mov rdx, QWORD PTR [rdi+8]
mov rax, QWORD PTR [rdi]
.L3:
mov r8, rax
cmp rdx, rax
je .L2
add rax, 4
cmp DWORD PTR [rax-4], esi
jne .L3
.L2:
mov rax, r8
ret
但是,对于 std_find_int
,使用 -O3
或 -Os
,它们都会生成 十几个 指令(gcc 10.1,-Os
):
std_find_int(std::vector<int, std::allocator<int> >&, int):
mov rax, rdi
mov rdi, QWORD PTR [rdi+8]
mov rdx, QWORD PTR [rax]
mov rcx, rdi
sub rcx, rdx
sar rcx, 4
.L12:
mov rax, rdx
test rcx, rcx
jle .L7
cmp DWORD PTR [rdx], esi
je .L8
cmp DWORD PTR [rdx+4], esi
jne .L9
add rax, 4
ret
.L9:
cmp DWORD PTR [rdx+8], esi
jne .L10
add rax, 8
ret
.L10:
lea rdx, [rdx+16]
cmp DWORD PTR [rax+12], esi
jne .L11
add rax, 12
ret
.L11:
dec rcx
jmp .L12
.L7:
mov rdx, rdi
sub rdx, rax
cmp rdx, 8
je .L13
cmp rdx, 12
je .L14
cmp rdx, 4
jne .L23
jmp .L15
.L14:
cmp esi, DWORD PTR [rax]
je .L8
add rax, 4
.L13:
cmp esi, DWORD PTR [rax]
je .L8
add rax, 4
.L15:
cmp esi, DWORD PTR [rax]
je .L8
.L23:
mov rax, rdi
.L8:
ret
根据 cppreference.com,myfind
是 std::find
的有效实现(他们将其描述为 std::find
的“可能实现”)。
该行为似乎与版本无关;回到至少 4.9 的每个主要版本的 gcc 的输出看起来都相似。
看起来 my_find_int
和 std_find_int
在功能上应该是相同的,那么为什么在使用 std::find
时两个编译器生成这么多代码?
原因很简单:随机访问迭代器std::find
的实现不是简单的for
循环,而是something more complicated:
template<typename _RandomAccessIterator, typename _Predicate>
_GLIBCXX20_CONSTEXPR
_RandomAccessIterator
__find_if(_RandomAccessIterator __first, _RandomAccessIterator __last,
_Predicate __pred, random_access_iterator_tag)
{
typename iterator_traits<_RandomAccessIterator>::difference_type
__trip_count = (__last - __first) >> 2;
for (; __trip_count > 0; --__trip_count)
{
if (__pred(__first))
return __first;
++__first;
if (__pred(__first))
return __first;
++__first;
if (__pred(__first))
return __first;
++__first;
if (__pred(__first))
return __first;
++__first;
}
switch (__last - __first)
{
case 3:
if (__pred(__first))
return __first;
++__first;
// FALLTHRU
case 2:
if (__pred(__first))
return __first;
++__first;
// FALLTHRU
case 1:
if (__pred(__first))
return __first;
++__first;
// FALLTHRU
case 0:
default:
return __last;
}
}
循环是手动展开的,因此每次迭代不仅包含一个谓词调用,还包含四个调用。 std::find
是根据 __find_if
实现的,谓词是比较。
这个实现可以追溯到 SGI STL, at least. Alexander Stepanov explains:
Typically people unroll by
4
or8
but not more. The main reason that people don’t go beyond8
has to do with the law of diminishing returns. The point of loop unrolling is to gain a decent percent improvement in the ratio loop overhead to overall code. Starting with, say 30% loop overhead, unrolling by a factor of4
leaves us with about 8% overhead. Unrolling by a factor of8
brings it down to a 4% overhead. Overhead below 4% is commonly viewed as noise – results could vary from CPU to CPU, etc. In research we do unroll loops – 30% does not matter when we only want to demonstrate feasibility. But when it is time to transfer code to real applications then unrolling can be worth considering.