多线程模拟比单线程慢几个数量级

Question

我正在用 Verilator 模拟一个电路，驱动它的是一个非常简单的程序：它只是反复地把时钟线先拉高、再拉低，直到满足某些输出条件为止：

#include "VSim.h"
#include <iostream>

vluint64_t main_time = 0;

// Reports the current simulation time as a double. The value is the
// file-level counter main_time, which vstep() bumps once per eval() call.
// NOTE(review): Verilator's runtime is documented to look this function up
// by name for $time; that contract is not visible in this file.
double sc_time_stamp ()
{
    const double now = static_cast<double>(main_time);
    return now;
}

// Advances the model by one full clock cycle.
//
// RESET is held low, then CLK_25MHZ is driven high and afterwards low; the
// model is evaluated and main_time is incremented once after each level,
// so one call adds 2 to main_time.
void vstep(VSim* top)
{
    top->RESET = 0;

    // half == 0 is the high phase of the clock, half == 1 the low phase.
    for (int half = 0; half < 2; ++half)
    {
        top->CLK_25MHZ = (half == 0) ? 1 : 0;
        top->eval();
        ++main_time;
    }
}

// Test-bench entry point.
//
// Clocks the model through ten rounds of "run until VGA_HSYNC and VGA_VSYNC
// are both low, then run until VGA_DE is high" and prints the total number
// of clock cycles that were stepped. The command-line arguments and the
// environment pointer are accepted but not used.
int main(int argc, char** argv, char** env)
{
    VSim* top = new VSim();

    int cycles = 0;

    for (int j = 0; j < 10; ++j)
    {
        // Always step at least once, then keep going until both sync
        // outputs are low at the same time.
        do
        {
            vstep(top);
            cycles++;
        } while (!(top->VGA_HSYNC == 0 && top->VGA_VSYNC == 0));

        // Always step at least once, then keep going until data-enable
        // is asserted.
        do
        {
            vstep(top);
            cycles++;
        } while (!top->VGA_DE);
    }

    // <iostream> is the only I/O header this file includes, so print through
    // std::cout instead of relying on printf being declared transitively.
    // The emitted text is unchanged.
    std::cout << "Verilator, from C: " << cycles << " cycles\n";

    // Let the model run its end-of-simulation processing before it is
    // destroyed, as Verilator's example test benches do.
    top->final();
    delete top;
    return 0;
}

问题在于：如果我以单线程模式使用 Verilator（即运行 verilator 时不加 --threads N 标志，编译时不定义 VL_THREADED，链接时也不把 -lpthread 和 verilated_threads.o 链接进去），那么这个程序大约需要 150 毫秒：

$ time ../_build/verilator/SimMain 
Verilator, from C: 4192001 cycles

real    0m0.137s
user    0m0.133s
sys     0m0.004s

但如果我使用 4 个线程，那么在同一台机器上（它有 4 个物理核心，开启超线程后共 8 个逻辑核心），我看到该进程占用了 400% 的 CPU，但消耗的 CPU 时间是原来的 250 多倍，挂钟时间也增加到约 70 倍：

$ time ../_build/verilator/SimMain 
Verilator, from C: 4192001 cycles

real    0m9.528s
user    0m37.965s
sys     0m0.016s

这是什么原因造成的，我该如何解决？

编辑补充：这个问题是关于 Verilator 的。c++ 标签似乎吸引了一些对 Verilator 一无所知的人，他们认为自己应该能够从基本原理推断出原因。但这是做不到的，因为实际的多线程发生在 Verilator 生成的代码中。

编辑补充：我正在模拟的 RTL 并不是世界上最好的 Verilog，因为它是由 Clash 生成的；但评论中有人提到，这种行为可能与它的某些特性有关。所以下面给出完整的 Verilog 代码：

/* AUTOMATICALLY GENERATED VERILOG-2001 SOURCE CODE.
** GENERATED BY CLASH 1.3.0. DO NOT MODIFY.
*/
`timescale 100fs/100fs
// topEntity: VGA timing generator plus a moving 15x15 square ("ball").
//
// Overview of what the logic below does:
//   * result_17 (and its duplicate result_14) is the horizontal state
//     machine: a 2-bit phase tag plus a counter. Phase 00 counts 0..639,
//     phase 01 counts 0..15, phase 10 counts 0..95, phase 11 counts 0..47.
//   * result_15 is the vertical state machine, advanced once per line when
//     result_11 is high. Phase 00 counts 0..479, phase 01 counts 0..10,
//     phase 10 counts 0..1, phase 11 counts 0..30.
//   * VGA_HSYNC / VGA_VSYNC are low only while the respective state machine
//     is in phase 10; VGA_DE is high only while both are in phase 00.
//   * ds / ds_0 hold {position, speed} of the ball vertically / horizontally
//     and are updated once per frame (when result_9 pulses).
//   * The colour outputs are 0 while VGA_DE is low, {240,224,64} inside the
//     ball and {48,48,48} elsewhere.
module topEntity
    ( // Inputs
      input  CLK_25MHZ // clock
    , input  RESET // reset

      // Outputs
    , output wire  VGA_HSYNC
    , output wire  VGA_VSYNC
    , output wire  VGA_DE
    , output wire [7:0] VGA_RED
    , output wire [7:0] VGA_GREEN
    , output wire [7:0] VGA_BLUE
    );
  wire [23:0] result;
  wire  b1;
  wire [23:0] result_0;
  wire  result_1;
  wire  result_2;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [9:0] \x' ;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [63:0] \c$x'_app_arg ;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire [8:0] x;
  // ../src/Bounce.hs:(52,1)-(58,54)
  reg [19:0] ds = {10'sd0,   10'sd2};
  // ../src/Bounce.hs:84:1-66
  wire signed [9:0] dx;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] x_0;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] dx_0;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] diff;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] ds2;
  reg [19:0] result_3;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] c$ds2_case_alt;
  // ../src/Bounce.hs:89:1-74
  wire [19:0] ds1;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] x_1;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] dx_1;
  // ../src/Bounce.hs:89:1-74
  wire signed [9:0] diff_0;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] ds2_0;
  reg [19:0] result_4;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] c$ds2_case_alt_0;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [9:0] ballY;
  wire  result_5;
  wire  result_6;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [10:0] \x'_0 ;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [63:0] \c$x'_app_arg_0 ;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire [9:0] x_2;
  // ../src/Bounce.hs:(52,1)-(58,54)
  reg [21:0] ds_0 = {11'sd0,   11'sd3};
  // ../src/Bounce.hs:84:1-66
  wire signed [10:0] dx_2;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] x_3;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] dx_3;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] diff_1;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] ds2_1;
  reg [21:0] result_7;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] c$ds2_case_alt_1;
  // ../src/Bounce.hs:89:1-74
  wire [21:0] ds1_0;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] x_4;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] dx_4;
  // ../src/Bounce.hs:89:1-74
  wire signed [10:0] diff_2;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] ds2_2;
  reg [21:0] result_8;
  // ../src/Bounce.hs:89:1-74
  wire [1:0] c$ds2_case_alt_2;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire signed [10:0] ballX;
  wire  result_9;
  // ../src/Bounce.hs:(52,1)-(58,54)
  reg  old = 1'b0;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire  c$frameEnd_case_alt;
  // ../src/Bounce.hs:(52,1)-(58,54)
  wire  s;
  wire [23:0] result_10;
  wire  c$app_arg;
  reg [9:0] vgaY;
  wire  c$app_arg_0;
  reg [10:0] vgaX;
  wire [0:0] c$app_arg_1;
  reg  eta;
  wire [0:0] c$app_arg_2;
  reg  eta_0;
  wire [8:0] coord;
  reg  result_11;
  wire [10:0] c$case_alt;
  wire [10:0] c$case_alt_0;
  wire [10:0] c$case_alt_1;
  wire [10:0] c$case_alt_2;
  reg [10:0] result_12;
  reg [11:0] result_13;
  wire [5:0] cnt;
  wire [11:0] c$case_alt_3;
  wire [6:0] cnt_0;
  wire [11:0] c$case_alt_4;
  wire [3:0] cnt_1;
  wire [11:0] c$case_alt_5;
  wire [9:0] cnt_2;
  wire [11:0] c$case_alt_6;
  reg [11:0] result_14 = {2'b00,10'd0};
  wire [5:0] cnt_3;
  wire [8:0] cnt_4;
  wire [3:0] cnt_5;
  wire [0:0] cnt_6;
  wire [4:0] cnt_7;
  reg [10:0] result_15 = {2'b00,9'd0};
  wire [9:0] coord_0;
  reg [11:0] result_16;
  wire [5:0] cnt_8;
  wire [11:0] c$case_alt_7;
  wire [6:0] cnt_9;
  wire [11:0] c$case_alt_8;
  wire [3:0] cnt_10;
  wire [11:0] c$case_alt_9;
  wire [9:0] cnt_11;
  wire [11:0] c$case_alt_10;
  reg [11:0] result_17 = {2'b00,10'd0};
  wire [9:0] result_selection_3;
  wire [10:0] result_selection_9;
  wire [9:0] s_selection_1;
  wire [26:0] VGA;
  wire [2:0] VGA_0;

  // Final 24-bit colour: forced to 0 whenever b1 is set (data-enable low),
  // otherwise the ball/background colour in result_0.
  assign result = b1 ? {8'd0,   8'd0,
                        8'd0} : result_0;

  // Output bundle: the three control bits of result_10 followed by the
  // red, green and blue bytes.
  assign VGA = {result_10[23:21],
                result[23:16],   result[15:8],   result[7:0]};

  // b1 is the inverse of result_10[21], the bit that is driven out as VGA_DE.
  assign b1 = ~ result_10[21:21];

  // Ball colour when the current pixel is inside the ball both horizontally
  // (result_5) and vertically (result_1); background colour otherwise.
  assign result_0 = (result_5 & result_1) ? {8'd240,
                                             8'd224,   8'd64} : {8'd48,   8'd48,   8'd48};

  assign result_selection_3 = result_10[9:0];

  // Vertical hit test, only meaningful when the vgaY valid bit (bit 9) is set.
  assign result_1 = result_selection_3[9:9] ? result_2 : 1'b0;

  // True when ballY <= y < ballY + 15.
  assign result_2 = (ballY <= \x' ) ? (\x'  < (ballY + 10'sd15)) : 1'b0;

  // Zero-extend the 9-bit line coordinate to 64 bits, then take the low 10
  // bits as a signed value.
  assign \x'  = $signed(\c$x'_app_arg [0+:10]);

  assign \c$x'_app_arg  = $unsigned({{(64-9) {1'b0}},x});

  assign x = result_10[8:0];

  // Vertical ball state {position[19:10], speed[9:0]}; loaded with the next
  // state result_3 only on cycles where result_9 is high.
  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : ds_register
    if ( RESET) begin
      ds <= {10'sd0,   10'sd2};
    end else if (result_9) begin
      ds <= result_3;
    end
  end
  // register end

  assign dx = $signed(ds[9:0]);

  assign x_0 = $signed(result_4[19:10]);

  assign dx_0 = $signed(result_4[9:0]);

  // Second stage: reflect at the lower bound 0. diff > 0 means the position
  // produced by the first stage is negative.
  assign diff = 10'sd0 - x_0;

  assign ds2 = (10'sd0 == diff) ? 2'd1 : c$ds2_case_alt;

  always @(*) begin
    case(ds2)
      // Position below 0: mirror it back and negate the speed.
      2'b00 : result_3 = {10'sd0 + diff,   -dx_0};
      default : result_3 = result_4;
    endcase
  end

  assign c$ds2_case_alt = (10'sd0 <= diff) ? 2'd0 : 2'd2;

  // Candidate next state: position advanced by the current speed.
  assign ds1 = {ballY + dx,   dx};

  assign x_1 = $signed(ds1[19:10]);

  assign dx_1 = $signed(ds1[9:0]);

  // First stage: reflect at the upper bound 464. diff_0 < 0 means the
  // candidate position is above 464.
  assign diff_0 = 10'sd464 - x_1;

  assign ds2_0 = (10'sd0 == diff_0) ? 2'd1 : c$ds2_case_alt_0;

  always @(*) begin
    case(ds2_0)
      // Position above 464: mirror it back and negate the speed.
      2'b10 : result_4 = {10'sd464 + diff_0,
                          -dx_1};
      default : result_4 = ds1;
    endcase
  end

  assign c$ds2_case_alt_0 = (10'sd0 <= diff_0) ? 2'd0 : 2'd2;

  assign ballY = $signed(ds[19:10]);

  assign result_selection_9 = result_10[20:10];

  // Horizontal hit test, only meaningful when the vgaX valid bit (bit 10)
  // is set.
  assign result_5 = result_selection_9[10:10] ? result_6 : 1'b0;

  // True when ballX <= x < ballX + 15.
  assign result_6 = (ballX <= \x'_0 ) ? (\x'_0  < (ballX + 11'sd15)) : 1'b0;

  assign \x'_0  = $signed(\c$x'_app_arg_0 [0+:11]);

  assign \c$x'_app_arg_0  = $unsigned({{(64-10) {1'b0}},x_2});

  assign x_2 = result_10[19:10];

  // Horizontal ball state {position[21:11], speed[10:0]}; same structure as
  // ds, with upper bound 624 and reset speed 3.
  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : ds_0_register
    if ( RESET) begin
      ds_0 <= {11'sd0,   11'sd3};
    end else if (result_9) begin
      ds_0 <= result_7;
    end
  end
  // register end

  assign dx_2 = $signed(ds_0[10:0]);

  assign x_3 = $signed(result_8[21:11]);

  assign dx_3 = $signed(result_8[10:0]);

  assign diff_1 = 11'sd0 - x_3;

  assign ds2_1 = (11'sd0 == diff_1) ? 2'd1 : c$ds2_case_alt_1;

  always @(*) begin
    case(ds2_1)
      2'b00 : result_7 = {11'sd0 + diff_1,   -dx_3};
      default : result_7 = result_8;
    endcase
  end

  assign c$ds2_case_alt_1 = (11'sd0 <= diff_1) ? 2'd0 : 2'd2;

  assign ds1_0 = {ballX + dx_2,   dx_2};

  assign x_4 = $signed(ds1_0[21:11]);

  assign dx_4 = $signed(ds1_0[10:0]);

  assign diff_2 = 11'sd624 - x_4;

  assign ds2_2 = (11'sd0 == diff_2) ? 2'd1 : c$ds2_case_alt_2;

  always @(*) begin
    case(ds2_2)
      2'b10 : result_8 = {11'sd624 + diff_2,
                          -dx_4};
      default : result_8 = ds1_0;
    endcase
  end

  assign c$ds2_case_alt_2 = (11'sd0 <= diff_2) ? 2'd0 : 2'd2;

  assign ballX = $signed(ds_0[21:11]);

  // Frame-end pulse: high when s was 1 on the previous clock (old) and is 0
  // now, i.e. on the falling edge of s.
  assign result_9 = old ? c$frameEnd_case_alt : 1'b0;

  // One-cycle delayed copy of s, used for the edge detection above.
  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : old_register
    if ( RESET) begin
      old <= 1'b0;
    end else if (1'b1) begin
      old <= s;
    end
  end
  // register end

  assign c$frameEnd_case_alt = s ? 1'b0 : 1'b1;

  assign s_selection_1 = result_10[9:0];

  // s mirrors the valid bit of vgaY (vertical state machine in phase 00).
  assign s = s_selection_1[9:9] ? 1'b1 : 1'b0;

  // Packed video state:
  //   [23] = ~eta_0 (driven out as VGA_HSYNC)
  //   [22] = ~eta   (driven out as VGA_VSYNC)
  //   [21] = vgaX valid & vgaY valid (driven out as VGA_DE)
  //   [20:10] = vgaX, [9:0] = vgaY
  assign result_10 = {{~ (c$app_arg_2),
                       ~ (c$app_arg_1),   c$app_arg_0 & c$app_arg},
                      vgaX,   vgaY};

  assign c$app_arg = vgaY[9:9] ? 1'b1 : 1'b0;

  // vgaY = {valid, line coordinate}; valid only in vertical phase 00, the
  // coordinate bits are don't-care otherwise.
  always @(*) begin
    case(result_15[10:9])
      2'b00 : vgaY = {1'b1,coord};
      default : vgaY = {1'b0,9'bxxxxxxxxx};
    endcase
  end

  assign c$app_arg_0 = vgaX[10:10] ? 1'b1 : 1'b0;

  // vgaX = {valid, column coordinate}; valid only in horizontal phase 00.
  always @(*) begin
    case(result_17[11:10])
      2'b00 : vgaX = {1'b1,coord_0};
      default : vgaX = {1'b0,10'bxxxxxxxxxx};
    endcase
  end

  assign c$app_arg_1 = eta ? 1'b1 : 1'b0;

  // eta is high while the vertical state machine is in phase 10.
  always @(*) begin
    case(result_15[10:9])
      2'b10 : eta = 1'b1;
      default : eta = 1'b0;
    endcase
  end

  assign c$app_arg_2 = eta_0 ? 1'b1 : 1'b0;

  // eta_0 is high while the horizontal state machine is in phase 10.
  always @(*) begin
    case(result_17[11:10])
      2'b10 : eta_0 = 1'b1;
      default : eta_0 = 1'b0;
    endcase
  end

  assign coord = result_15[8:0];

  // End-of-line strobe: high on the last count (47) of horizontal phase 11
  // of result_14; it enables the vertical register result_15.
  always @(*) begin
    case(result_14[11:10])
      2'b11 : result_11 = cnt_3 == 6'd47;
      default : result_11 = 1'b0;
    endcase
  end

  // Vertical next-state candidates, one per phase: each either increments
  // the phase's counter or moves to the next phase with the counter cleared.
  assign c$case_alt = (cnt_4 == 9'd479) ? {2'b01,4'd0,5'bxxxxx} : {2'b00,cnt_4 + 9'd1};

  assign c$case_alt_0 = (cnt_5 == 4'd10) ? {2'b10,1'd0,8'bxxxxxxxx} : {2'b01,cnt_5 + 4'd1,5'bxxxxx};

  assign c$case_alt_1 = (cnt_6 == 1'd1) ? {2'b11,5'd0,4'bxxxx} : {2'b10,cnt_6 + 1'd1,8'bxxxxxxxx};

  assign c$case_alt_2 = (cnt_7 == 5'd30) ? {2'b00,9'd0} : {2'b11,cnt_7 + 5'd1,4'bxxxx};

  // Select the vertical next state according to the current vertical phase.
  always @(*) begin
    case(result_15[10:9])
      2'b00 : result_12 = c$case_alt;
      2'b01 : result_12 = c$case_alt_0;
      2'b10 : result_12 = c$case_alt_1;
      default : result_12 = c$case_alt_2;
    endcase
  end

  // Select the next state of result_14 according to its current phase.
  always @(*) begin
    case(result_14[11:10])
      2'b00 : result_13 = c$case_alt_6;
      2'b01 : result_13 = c$case_alt_5;
      2'b10 : result_13 = c$case_alt_4;
      default : result_13 = c$case_alt_3;
    endcase
  end

  assign cnt = result_14[9:4];

  assign c$case_alt_3 = (cnt == 6'd47) ? {2'b00,10'd0} : {2'b11,cnt + 6'd1,4'bxxxx};

  assign cnt_0 = result_14[9:3];

  assign c$case_alt_4 = (cnt_0 == 7'd95) ? {2'b11,6'd0,4'bxxxx} : {2'b10,cnt_0 + 7'd1,3'bxxx};

  assign cnt_1 = result_14[9:6];

  assign c$case_alt_5 = (cnt_1 == 4'd15) ? {2'b10,7'd0,3'bxxx} : {2'b01,cnt_1 + 4'd1,6'bxxxxxx};

  assign cnt_2 = result_14[9:0];

  assign c$case_alt_6 = (cnt_2 == 10'd639) ? {2'b01,4'd0,6'bxxxxxx} : {2'b00,cnt_2 + 10'd1};

  // Horizontal state machine copy that feeds the end-of-line strobe
  // result_11; it has the same reset value and next-state logic as result_17.
  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : result_14_register
    if ( RESET) begin
      result_14 <= {2'b00,10'd0};
    end else if (1'b1) begin
      result_14 <= result_13;
    end
  end
  // register end

  assign cnt_3 = result_14[9:4];

  assign cnt_4 = result_15[8:0];

  assign cnt_5 = result_15[8:5];

  assign cnt_6 = result_15[8:8];

  assign cnt_7 = result_15[8:4];

  // Vertical state machine {phase[10:9], counter}; advances only when the
  // end-of-line strobe result_11 is high.
  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : result_15_register
    if ( RESET) begin
      result_15 <= {2'b00,9'd0};
    end else if (result_11) begin
      result_15 <= result_12;
    end
  end
  // register end

  assign coord_0 = result_17[9:0];

  // Select the next state of result_17 according to its current phase.
  always @(*) begin
    case(result_17[11:10])
      2'b00 : result_16 = c$case_alt_10;
      2'b01 : result_16 = c$case_alt_9;
      2'b10 : result_16 = c$case_alt_8;
      default : result_16 = c$case_alt_7;
    endcase
  end

  assign cnt_8 = result_17[9:4];

  assign c$case_alt_7 = (cnt_8 == 6'd47) ? {2'b00,10'd0} : {2'b11,cnt_8 + 6'd1,4'bxxxx};

  assign cnt_9 = result_17[9:3];

  assign c$case_alt_8 = (cnt_9 == 7'd95) ? {2'b11,6'd0,4'bxxxx} : {2'b10,cnt_9 + 7'd1,3'bxxx};

  assign cnt_10 = result_17[9:6];

  assign c$case_alt_9 = (cnt_10 == 4'd15) ? {2'b10,7'd0,3'bxxx} : {2'b01,cnt_10 + 4'd1,6'bxxxxxx};

  assign cnt_11 = result_17[9:0];

  assign c$case_alt_10 = (cnt_11 == 10'd639) ? {2'b01,4'd0,6'bxxxxxx} : {2'b00,cnt_11 + 10'd1};

  // Horizontal state machine {phase[11:10], counter}; it supplies vgaX and
  // the HSYNC phase and advances on every clock.
  // register begin
  always @(posedge CLK_25MHZ or  posedge  RESET) begin : result_17_register
    if ( RESET) begin
      result_17 <= {2'b00,10'd0};
    end else if (1'b1) begin
      result_17 <= result_16;
    end
  end
  // register end

  // Unpack the output bundle onto the module ports.
  assign VGA_0 = VGA[26:24];

  assign VGA_RED = VGA[23:16];

  assign VGA_GREEN = VGA[15:8];

  assign VGA_BLUE = VGA[7:0];

  assign VGA_HSYNC = VGA_0[2:2];

  assign VGA_VSYNC = VGA_0[1:1];

  assign VGA_DE = VGA_0[0:0];


endmodule

Answer 1

根据开发者给出的这个回答：

Multithreading will only show speedups on much larger designs. In small designs the communication between cores will be much larger than leaving it on one core.

所以看起来最初的猜测是正确的：问题中的这个设计规模太小，不足以体现出加速效果。多线程带来的开销很高，因此它无法从多线程中受益。

多线程模拟比单线程慢几个数量级

Multithreaded simulation orders of magnitude slower than single-threaded

c++

performance

multithreading

verilator