C++11+ 编译器是否足够智能以优化内部 "if" 语句？

Question

这最好用代码描述：

// Question sketch: for each element, computes a[i][j] + b[i][j] or
// a[i][j] - b[i][j] depending on `flag`. `flag` is a by-value parameter that
// is never assigned, so its value is fixed before either loop starts.
// The loop headers are elided ("..."), so this is illustrative, not compilable.
void foo(vector<vector<int>>& a, vector<vector<int>>& b, bool flag) { 

    vector<vector<int>> c; 
    for (int i ...) { 
        for (int j ...) { 
            int value; 
            // As written, the loop-invariant `flag` is tested on every inner iteration.
            if (flag) 
                value = a[i][j] + b[i][j]; 
            else 
                value = a[i][j] - b[i][j];
            // NOTE(review): `value` is never used after this point and `c` is never written.
        } 

    } 
}

从表面上看，flag 会在每次内层循环迭代中被求值并产生分支，尽管它的值在进入任何一个循环之前就已经确定。C++11+ 编译器会自动生成两条独立的代码路径、只在开始时判断一次分支吗？还是应该手动完成？

在向我说教“过早优化”之前，请理解我提出这个问题是为了成为一个更了解这些细节的程序员。

Answer 1

这可能取决于您的示例的复杂性，但编译器能够进行这种优化。让我们看一个简单而完整的例子：

// External functions: only declared in this snippet, with no definitions visible,
// so the compiler sees nothing but their signatures here.
extern bool get_bool() noexcept;
extern int get_int() noexcept;
extern void foo1() noexcept;
extern void foo0() noexcept;

// Minimal complete example: reads a flag and two loop bounds once, then calls
// foo1() (flag true) or foo0() (flag false) once per (i, j) iteration.
void foo() noexcept {
  // All three values are obtained before the loops and never modified afterwards.
  bool b = get_bool();
  int i_mx = get_int();
  int j_mx = get_int();

  for (int i = 0; i < i_mx; ++i) {
    for (int j = 0; j < j_mx; ++j) {
      // `b` is loop-invariant; as written it is tested on every inner iteration.
      if (b)
        foo1();
      else
        foo0();
    }
  }
}

如果我们用 clang 编译这段代码，生成的汇编代码如下：

foo():                                # @foo()
        push    rbp
        push    r15
        push    r14
        push    r12
        push    rbx
        call    get_bool()
        mov     r14d, eax
        call    get_int()
        mov     r15d, eax
        call    get_int()
        test    r15d, r15d
        jle     .LBB0_9
        mov     r12d, eax
        test    eax, eax
        jle     .LBB0_9
        xor     ebx, ebx
        test    r14b, r14b
        je      .LBB0_3
.LBB0_6:                                # =>This Loop Header: Depth=1
        mov     ebp, r12d
.LBB0_7:                                #   Parent Loop BB0_6 Depth=1
        call    foo1()
        dec     ebp
        jne     .LBB0_7
        inc     ebx
        cmp     ebx, r15d
        jne     .LBB0_6
        jmp     .LBB0_9
.LBB0_3:                                # =>This Loop Header: Depth=1
        mov     ebp, r12d
.LBB0_4:                                #   Parent Loop BB0_3 Depth=1
        call    foo0()
        dec     ebp
        jne     .LBB0_4
        inc     ebx
        cmp     ebx, r15d
        jne     .LBB0_3
.LBB0_9:
        pop     rbx
        pop     r12
        pop     r14
        pop     r15
        pop     rbp
        ret

很明显，test r14b, r14b 这一行被移到了循环之外。同样，实际效果可能会因代码的复杂程度而异，最好检查生成的汇编代码来确认。

Answer 2

即使是截至今天（2021 年）最新的 C++ 标准，也只强制规定了极少数优化，例如复制省略（copy elision）和返回值优化（RVO）。

这样，各个编译器就可以自由地应用任何特定于平台的优化。

问题中的函数没有产生任何效果，因此很可能被完全优化掉了。

不过，针对（我猜想的）背后真正想问的问题：典型的编译器能够推断出同一个条件在嵌套循环中保持不变，例如：

// Sums, over every element position (i, j) of `a`, either a[i][j] + b[i][j]
// (flag == true) or a[i][j] - b[i][j] (flag == false), and returns the total.
//
// Precondition: `b` must have at least as many rows as `a`, and each row of `b`
// at least as many elements as the matching row of `a`; no bounds are checked.
// The vectors are intentionally still taken by value so the function's
// signature stays exactly as in the original example.
int foo(std::vector<std::vector<int>> a, std::vector<std::vector<int>> b, bool flag) {
    int value = 0;
    // std::size_t indices match the unsigned type returned by size(), avoiding
    // the signed/unsigned comparison of the original `int` counters.
    for (std::size_t i = 0; i < a.size(); ++i) {
        for (std::size_t j = 0; j < a[i].size(); ++j) {
            // `flag` is loop-invariant; it is tested here on every iteration as written.
            if (flag)
                value += a[i][j] + b[i][j];
            else
                value += a[i][j] - b[i][j];
        }
    }
    return value;
}

下面的汇编代码表明“是的，它被优化了”（由面向 Intel x86 64 位的 Clang 编译器生成）：

注意：foo 的第三个参数（flag）位于寄存器 dl（64 位寄存器 RDX 的低 8 位）中，并且在两个循环开始之前就被测试；
循环根据 flag 条件被复制成了两份：LBB1_2 和 LBB1_6。

以下是运行 g++ -std=c++17 -O3 -c -S code.cpp 生成的（经过删减的）汇编代码：

__Z3fooNSt3__16vectorINS0_IiNS_9allocatorIiEEEENS1_IS3_EEEES5_b: ## @_Z3fooNSt3__16vectorINS0_IiNS_9allocatorIiEEEENS1_IS3_EEEES5_b
        .cfi_startproc
## %bb.0:
        pushq   %rbp
...
        xorl    %eax, %eax    # <======== int value = 0;
        testb   %dl, %dl      # <======== if (flag)
        jne     LBB1_6
        jmp     LBB1_2
        .p2align        4, 0x90
LBB1_7:                                 ##   in Loop: Header=BB1_6 Depth=1
        incq    %r10
        cmpq    %r10, %r8
        jbe     LBB1_26
LBB1_6:                                 ## =>This Loop Header: Depth=1
                                        ##     Child Loop BB1_13 Depth 2
                                        ##     Child Loop BB1_17 Depth 2
        leaq    (%r10,%r10,2), %rcx
        movq    (%r9,%rcx,8), %rdi
        movq    8(%r9,%rcx,8), %r11
        subq    %rdi, %r11
...
LBB1_2:                                 ## =>This Loop Header: Depth=1
                                        ##     Child Loop BB1_21 Depth 2
                                        ##     Child Loop BB1_5 Depth 2
        leaq    (%r10,%r10,2), %rcx
        movq    (%r9,%rcx,8), %rdi
        movq    8(%r9,%rcx,8), %r11
        subq    %rdi, %r11

Answer 3

对这种情况的优化大致如下：

#include <iostream>
#include <vector>
#include <execution>


// Demo: overwrites every element of `b` with the element of `aq` at the same
// (row, column) position, iterating over the rows of `b` via std::for_each with
// the std::execution::par policy, then prints b[0][1].
// NOTE(review): the std::for_each overload taking an execution policy is
// declared in <algorithm>, which this snippet does not include directly.
int main(int argc , char *argv[])
{
    std::vector<std::vector<int>> b{{2,3} ,{4,7}};
    std::vector<std::vector<int>> aq{{7,8} ,{2,17}};

    // The lambda is invoked once per row `x` of `b`; `aq` and `b` are captured by reference.
    std::for_each(std::execution::par, b.begin(), b.end() ,[&](auto& x){
                    for (auto& w : x)
                        // Indices are recovered by pointer subtraction:
                        // &x - b.data() is the row index, &w - x.data() the column index.
                        w = aq[&x - b.data()][&w - x.data()];
                });
    std::cout << "and then " << b[0][1] << std::endl;
}

-ltbb -std=c++20

C++11+ 编译器是否足够智能以优化内部 "if" 语句？

Are C++11+ compilers smart enough to optimize inner "if" statements?

c++

optimization

branch

compilation