多线程模拟比单线程慢几个数量级
Multithreaded simulation orders of magnitude slower than single-threaded
我正在用 Verilator 模拟一个电路,驱动它的程序非常简单:
只是反复把时钟线先置高、再置低,直到
满足某个输出条件:
#include "VSim.h"
#include <iostream>
vluint64_t main_time = 0;
// Simulation-time callback: reports the global main_time counter as a double.
double sc_time_stamp()
{
    const double now = static_cast<double>(main_time);
    return now;
}
// Advance the model by one full clock period.
// RESET is held low; CLK_25MHZ is driven high and then low, and after each
// level change the model is evaluated and main_time is incremented by one.
void vstep(VSim* top)
{
    top->RESET = 0;
    for (int level = 1; level >= 0; --level)
    {
        top->CLK_25MHZ = level;  // 1 on the first pass, 0 on the second
        top->eval();
        ++main_time;
    }
}
// Testbench entry point.
// Runs 10 rounds; each round clocks the model until VGA_HSYNC and VGA_VSYNC
// both read 0, then until VGA_DE reads non-zero. The total number of clock
// periods stepped is printed to standard output.
// argc/argv/env are accepted but not used. Returns 0.
int main(int argc, char** argv, char** env)
{
    VSim* top = new VSim();
    int cycles = 0;
    for (int j = 0; j < 10; ++j)
    {
        // Phase 1: step (at least once) until both sync outputs are low.
        do
        {
            vstep(top);
            cycles++;
        } while (!(top->VGA_HSYNC == 0 && top->VGA_VSYNC == 0));

        // Phase 2: step (at least once) until data-enable is asserted.
        do
        {
            vstep(top);
            cycles++;
        } while (!top->VGA_DE);
    }
    // Only <iostream> is included by this file, so use std::cout rather than
    // printf, which would need <cstdio>. The emitted text is unchanged.
    std::cout << "Verilator, from C: " << cycles << " cycles\n";
    // Let the model run its end-of-simulation processing before destruction.
    top->final();
    delete top;
    return 0;
}
问题在于:如果我以单线程模式运行 Verilator
(即运行 verilator 时不带 --threads N 标志,
编译时不定义 VL_THREADED,
链接时也不加入 -lpthread 和 verilated_threads.o),
那么这个程序大约需要 150 毫秒:
$ time ../_build/verilator/SimMain
Verilator, from C: 4192001 cycles
real 0m0.137s
user 0m0.133s
sys 0m0.004s
但是如果我使用 4 个线程,在同一台机器上(它有 4 个物理核心,
开启超线程后共 8 个逻辑核心),我看到该进程的 CPU 占用率达到 400%,
消耗的 CPU 时间是原来的 250 多倍,并且挂钟时间增加到约
70 倍:
$ time ../_build/verilator/SimMain
Verilator, from C: 4192001 cycles
real 0m9.528s
user 0m37.965s
sys 0m0.016s
这是什么原因造成的,我该如何解决?
编辑补充:这个问题是关于 Verilator 的。c++
标签似乎吸引了一些对 Verilator 一无所知的人,他们以为可以从基本原理直接推断出原因。这是做不到的,因为实际的多线程发生在 Verilator 生成的代码中。
编辑补充:我正在模拟的 RTL 算不上高质量的 Verilog,因为它是由 Clash 生成的;不过评论中有人提到,这种现象可能与它的某些特性有关。所以下面给出完整的 Verilog 代码:
/* AUTOMATICALLY GENERATED VERILOG-2001 SOURCE CODE.
** GENERATED BY CLASH 1.3.0. DO NOT MODIFY.
*/
`timescale 100fs/100fs
// topEntity: video signal generator that paints a 15x15-pixel square on a
// uniform background. The square's position is advanced by its speed once per
// pass of the vertical counter and reflected at fixed limits (0 and 464
// vertically, 0 and 624 horizontally).
// Every register is clocked on the rising edge of CLK_25MHZ and is reset
// asynchronously while RESET is high.
module topEntity
( // Inputs
input CLK_25MHZ // clock
, input RESET // reset
// Outputs
, output wire VGA_HSYNC
, output wire VGA_VSYNC
, output wire VGA_DE
, output wire [7:0] VGA_RED
, output wire [7:0] VGA_GREEN
, output wire [7:0] VGA_BLUE
);
wire [23:0] result;
wire b1;
wire [23:0] result_0;
wire result_1;
wire result_2;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [9:0] \x' ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [63:0] \c$x'_app_arg ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire [8:0] x;
// ../src/Bounce.hs:(52,1)-(58,54)
reg [19:0] ds = {10'sd0, 10'sd2};
// ../src/Bounce.hs:84:1-66
wire signed [9:0] dx;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] x_0;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] dx_0;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] diff;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2;
reg [19:0] result_3;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt;
// ../src/Bounce.hs:89:1-74
wire [19:0] ds1;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] x_1;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] dx_1;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] diff_0;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2_0;
reg [19:0] result_4;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt_0;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [9:0] ballY;
wire result_5;
wire result_6;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [10:0] \x'_0 ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [63:0] \c$x'_app_arg_0 ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire [9:0] x_2;
// ../src/Bounce.hs:(52,1)-(58,54)
reg [21:0] ds_0 = {11'sd0, 11'sd3};
// ../src/Bounce.hs:84:1-66
wire signed [10:0] dx_2;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] x_3;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] dx_3;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] diff_1;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2_1;
reg [21:0] result_7;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt_1;
// ../src/Bounce.hs:89:1-74
wire [21:0] ds1_0;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] x_4;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] dx_4;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] diff_2;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2_2;
reg [21:0] result_8;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt_2;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [10:0] ballX;
wire result_9;
// ../src/Bounce.hs:(52,1)-(58,54)
reg old = 1'b0;
// ../src/Bounce.hs:(52,1)-(58,54)
wire c$frameEnd_case_alt;
// ../src/Bounce.hs:(52,1)-(58,54)
wire s;
wire [23:0] result_10;
wire c$app_arg;
reg [9:0] vgaY;
wire c$app_arg_0;
reg [10:0] vgaX;
wire [0:0] c$app_arg_1;
reg eta;
wire [0:0] c$app_arg_2;
reg eta_0;
wire [8:0] coord;
reg result_11;
wire [10:0] c$case_alt;
wire [10:0] c$case_alt_0;
wire [10:0] c$case_alt_1;
wire [10:0] c$case_alt_2;
reg [10:0] result_12;
reg [11:0] result_13;
wire [5:0] cnt;
wire [11:0] c$case_alt_3;
wire [6:0] cnt_0;
wire [11:0] c$case_alt_4;
wire [3:0] cnt_1;
wire [11:0] c$case_alt_5;
wire [9:0] cnt_2;
wire [11:0] c$case_alt_6;
reg [11:0] result_14 = {2'b00,10'd0};
wire [5:0] cnt_3;
wire [8:0] cnt_4;
wire [3:0] cnt_5;
wire [0:0] cnt_6;
wire [4:0] cnt_7;
reg [10:0] result_15 = {2'b00,9'd0};
wire [9:0] coord_0;
reg [11:0] result_16;
wire [5:0] cnt_8;
wire [11:0] c$case_alt_7;
wire [6:0] cnt_9;
wire [11:0] c$case_alt_8;
wire [3:0] cnt_10;
wire [11:0] c$case_alt_9;
wire [9:0] cnt_11;
wire [11:0] c$case_alt_10;
reg [11:0] result_17 = {2'b00,10'd0};
wire [9:0] result_selection_3;
wire [10:0] result_selection_9;
wire [9:0] s_selection_1;
wire [26:0] VGA;
wire [2:0] VGA_0;
// Output colour: all zeros while b1 is set (data-enable bit low), otherwise
// the ball/background colour result_0.
assign result = b1 ? {8'd0, 8'd0,
8'd0} : result_0;
// VGA packs {hsync, vsync, data-enable, red, green, blue}.
assign VGA = {result_10[23:21],
result[23:16], result[15:8], result[7:0]};
assign b1 = ~ result_10[21:21];
// {240,224,64} when the current pixel lies inside the square in both X
// (result_5) and Y (result_1); {48,48,48} otherwise.
assign result_0 = (result_5 & result_1) ? {8'd240,
8'd224, 8'd64} : {8'd48, 8'd48, 8'd48};
// Vertical hit test, only taken when the valid flag of vgaY (bit 9) is set:
// ballY <= y < ballY + 15.
assign result_selection_3 = result_10[9:0];
assign result_1 = result_selection_3[9:9] ? result_2 : 1'b0;
assign result_2 = (ballY <= \x' ) ? (\x' < (ballY + 10'sd15)) : 1'b0;
// The 9-bit row coordinate is zero-extended to 64 bits and the low 10 bits
// are then read back as a signed value.
assign \x' = $signed(\c$x'_app_arg [0+:10]);
assign \c$x'_app_arg = $unsigned({{(64-9) {1'b0}},x});
assign x = result_10[8:0];
// ds holds {ballY, vertical speed}; initial and reset value is {0, 2}.
// It takes the bounced value result_3 only on clocks where result_9 is high.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : ds_register
if ( RESET) begin
ds <= {10'sd0, 10'sd2};
end else if (result_9) begin
ds <= result_3;
end
end
// register end
assign dx = $signed(ds[9:0]);
// Second reflection stage (lower limit 0), applied to result_4.
// diff = 0 - position; ds2 is 1 when diff == 0, 0 when diff > 0, 2 when diff < 0.
assign x_0 = $signed(result_4[19:10]);
assign dx_0 = $signed(result_4[9:0]);
assign diff = 10'sd0 - x_0;
assign ds2 = (10'sd0 == diff) ? 2'd1 : c$ds2_case_alt;
always @(*) begin
case(ds2)
// Position below 0: mirror it about 0 and negate the speed.
2'b00 : result_3 = {10'sd0 + diff, -dx_0};
default : result_3 = result_4;
endcase
end
assign c$ds2_case_alt = (10'sd0 <= diff) ? 2'd0 : 2'd2;
// ds1 is the position advanced by the current speed.
assign ds1 = {ballY + dx, dx};
// First reflection stage (upper limit 464), applied to ds1.
// diff_0 = 464 - position; ds2_0 is 2 when the position exceeds 464.
assign x_1 = $signed(ds1[19:10]);
assign dx_1 = $signed(ds1[9:0]);
assign diff_0 = 10'sd464 - x_1;
assign ds2_0 = (10'sd0 == diff_0) ? 2'd1 : c$ds2_case_alt_0;
always @(*) begin
case(ds2_0)
// Position above 464: mirror it about 464 and negate the speed.
2'b10 : result_4 = {10'sd464 + diff_0,
-dx_1};
default : result_4 = ds1;
endcase
end
assign c$ds2_case_alt_0 = (10'sd0 <= diff_0) ? 2'd0 : 2'd2;
assign ballY = $signed(ds[19:10]);
// Horizontal hit test, only taken when the valid flag of vgaX (bit 10 of the
// selection) is set: ballX <= x < ballX + 15.
assign result_selection_9 = result_10[20:10];
assign result_5 = result_selection_9[10:10] ? result_6 : 1'b0;
assign result_6 = (ballX <= \x'_0 ) ? (\x'_0 < (ballX + 11'sd15)) : 1'b0;
assign \x'_0 = $signed(\c$x'_app_arg_0 [0+:11]);
assign \c$x'_app_arg_0 = $unsigned({{(64-10) {1'b0}},x_2});
assign x_2 = result_10[19:10];
// ds_0 holds {ballX, horizontal speed}; initial and reset value is {0, 3}.
// It takes the bounced value result_7 only on clocks where result_9 is high.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : ds_0_register
if ( RESET) begin
ds_0 <= {11'sd0, 11'sd3};
end else if (result_9) begin
ds_0 <= result_7;
end
end
// register end
assign dx_2 = $signed(ds_0[10:0]);
// Second reflection stage (lower limit 0), applied to result_8.
assign x_3 = $signed(result_8[21:11]);
assign dx_3 = $signed(result_8[10:0]);
assign diff_1 = 11'sd0 - x_3;
assign ds2_1 = (11'sd0 == diff_1) ? 2'd1 : c$ds2_case_alt_1;
always @(*) begin
case(ds2_1)
// Position below 0: mirror it about 0 and negate the speed.
2'b00 : result_7 = {11'sd0 + diff_1, -dx_3};
default : result_7 = result_8;
endcase
end
assign c$ds2_case_alt_1 = (11'sd0 <= diff_1) ? 2'd0 : 2'd2;
// ds1_0 is the position advanced by the current speed.
assign ds1_0 = {ballX + dx_2, dx_2};
// First reflection stage (upper limit 624), applied to ds1_0.
assign x_4 = $signed(ds1_0[21:11]);
assign dx_4 = $signed(ds1_0[10:0]);
assign diff_2 = 11'sd624 - x_4;
assign ds2_2 = (11'sd0 == diff_2) ? 2'd1 : c$ds2_case_alt_2;
always @(*) begin
case(ds2_2)
// Position above 624: mirror it about 624 and negate the speed.
2'b10 : result_8 = {11'sd624 + diff_2,
-dx_4};
default : result_8 = ds1_0;
endcase
end
assign c$ds2_case_alt_2 = (11'sd0 <= diff_2) ? 2'd0 : 2'd2;
assign ballX = $signed(ds_0[21:11]);
// result_9 is high when the previous sample of s (old) was 1 and s is now 0,
// i.e. on the falling edge of the vgaY valid flag.
assign result_9 = old ? c$frameEnd_case_alt : 1'b0;
// old is s delayed by one clock.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : old_register
if ( RESET) begin
old <= 1'b0;
end else if (1'b1) begin
old <= s;
end
end
// register end
assign c$frameEnd_case_alt = s ? 1'b0 : 1'b1;
assign s_selection_1 = result_10[9:0];
assign s = s_selection_1[9:9] ? 1'b1 : 1'b0;
// result_10 = {~eta_0, ~eta, vgaX valid & vgaY valid, vgaX, vgaY}.
// Its top three bits drive VGA_HSYNC, VGA_VSYNC and VGA_DE respectively, so
// each sync output is low exactly while its state machine is in state 2'b10.
assign result_10 = {{~ (c$app_arg_2),
~ (c$app_arg_1), c$app_arg_0 & c$app_arg},
vgaX, vgaY};
assign c$app_arg = vgaY[9:9] ? 1'b1 : 1'b0;
// vgaY = {valid, row}: valid only while the vertical state (result_15[10:9])
// is 2'b00; the row bits are don't-care otherwise.
always @(*) begin
case(result_15[10:9])
2'b00 : vgaY = {1'b1,coord};
default : vgaY = {1'b0,9'bxxxxxxxxx};
endcase
end
assign c$app_arg_0 = vgaX[10:10] ? 1'b1 : 1'b0;
// vgaX = {valid, column}: valid only while the horizontal state
// (result_17[11:10]) is 2'b00.
always @(*) begin
case(result_17[11:10])
2'b00 : vgaX = {1'b1,coord_0};
default : vgaX = {1'b0,10'bxxxxxxxxxx};
endcase
end
assign c$app_arg_1 = eta ? 1'b1 : 1'b0;
// eta is high while the vertical state is 2'b10.
always @(*) begin
case(result_15[10:9])
2'b10 : eta = 1'b1;
default : eta = 1'b0;
endcase
end
assign c$app_arg_2 = eta_0 ? 1'b1 : 1'b0;
// eta_0 is high while the horizontal state is 2'b10.
always @(*) begin
case(result_17[11:10])
2'b10 : eta_0 = 1'b1;
default : eta_0 = 1'b0;
endcase
end
assign coord = result_15[8:0];
// result_11 enables the vertical counter: high when result_14 is in state
// 2'b11 with its count at 47, the last step before it wraps to state 2'b00.
always @(*) begin
case(result_14[11:10])
2'b11 : result_11 = cnt_3 == 6'd47;
default : result_11 = 1'b0;
endcase
end
// Next-state candidates for the vertical state machine result_15
// ({2-bit state, counter}): state 00 counts 0..479, state 01 counts 0..10,
// state 10 counts 0..1, state 11 counts 0..30, then back to state 00.
assign c$case_alt = (cnt_4 == 9'd479) ? {2'b01,4'd0,5'bxxxxx} : {2'b00,cnt_4 + 9'd1};
assign c$case_alt_0 = (cnt_5 == 4'd10) ? {2'b10,1'd0,8'bxxxxxxxx} : {2'b01,cnt_5 + 4'd1,5'bxxxxx};
assign c$case_alt_1 = (cnt_6 == 1'd1) ? {2'b11,5'd0,4'bxxxx} : {2'b10,cnt_6 + 1'd1,8'bxxxxxxxx};
assign c$case_alt_2 = (cnt_7 == 5'd30) ? {2'b00,9'd0} : {2'b11,cnt_7 + 5'd1,4'bxxxx};
always @(*) begin
case(result_15[10:9])
2'b00 : result_12 = c$case_alt;
2'b01 : result_12 = c$case_alt_0;
2'b10 : result_12 = c$case_alt_1;
default : result_12 = c$case_alt_2;
endcase
end
// Next state of result_14, selected by its current 2-bit state.
always @(*) begin
case(result_14[11:10])
2'b00 : result_13 = c$case_alt_6;
2'b01 : result_13 = c$case_alt_5;
2'b10 : result_13 = c$case_alt_4;
default : result_13 = c$case_alt_3;
endcase
end
// Next-state candidates for result_14 ({2-bit state, counter}):
// state 00 counts 0..639, state 01 counts 0..15, state 10 counts 0..95,
// state 11 counts 0..47, then back to state 00.
assign cnt = result_14[9:4];
assign c$case_alt_3 = (cnt == 6'd47) ? {2'b00,10'd0} : {2'b11,cnt + 6'd1,4'bxxxx};
assign cnt_0 = result_14[9:3];
assign c$case_alt_4 = (cnt_0 == 7'd95) ? {2'b11,6'd0,4'bxxxx} : {2'b10,cnt_0 + 7'd1,3'bxxx};
assign cnt_1 = result_14[9:6];
assign c$case_alt_5 = (cnt_1 == 4'd15) ? {2'b10,7'd0,3'bxxx} : {2'b01,cnt_1 + 4'd1,6'bxxxxxx};
assign cnt_2 = result_14[9:0];
assign c$case_alt_6 = (cnt_2 == 10'd639) ? {2'b01,4'd0,6'bxxxxxx} : {2'b00,cnt_2 + 10'd1};
// result_14 advances on every clock; it has the same reset value and
// next-state rules as result_17 and is used here to derive result_11.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : result_14_register
if ( RESET) begin
result_14 <= {2'b00,10'd0};
end else if (1'b1) begin
result_14 <= result_13;
end
end
// register end
assign cnt_3 = result_14[9:4];
assign cnt_4 = result_15[8:0];
assign cnt_5 = result_15[8:5];
assign cnt_6 = result_15[8:8];
assign cnt_7 = result_15[8:4];
// result_15 (vertical state) advances only on clocks where result_11 is high.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : result_15_register
if ( RESET) begin
result_15 <= {2'b00,9'd0};
end else if (result_11) begin
result_15 <= result_12;
end
end
// register end
assign coord_0 = result_17[9:0];
// Next state of the horizontal state machine result_17, selected by its
// current 2-bit state.
always @(*) begin
case(result_17[11:10])
2'b00 : result_16 = c$case_alt_10;
2'b01 : result_16 = c$case_alt_9;
2'b10 : result_16 = c$case_alt_8;
default : result_16 = c$case_alt_7;
endcase
end
// Next-state candidates for result_17 ({2-bit state, counter}):
// state 00 counts 0..639, state 01 counts 0..15, state 10 counts 0..95,
// state 11 counts 0..47, then back to state 00.
assign cnt_8 = result_17[9:4];
assign c$case_alt_7 = (cnt_8 == 6'd47) ? {2'b00,10'd0} : {2'b11,cnt_8 + 6'd1,4'bxxxx};
assign cnt_9 = result_17[9:3];
assign c$case_alt_8 = (cnt_9 == 7'd95) ? {2'b11,6'd0,4'bxxxx} : {2'b10,cnt_9 + 7'd1,3'bxxx};
assign cnt_10 = result_17[9:6];
assign c$case_alt_9 = (cnt_10 == 4'd15) ? {2'b10,7'd0,3'bxxx} : {2'b01,cnt_10 + 4'd1,6'bxxxxxx};
assign cnt_11 = result_17[9:0];
assign c$case_alt_10 = (cnt_11 == 10'd639) ? {2'b01,4'd0,6'bxxxxxx} : {2'b00,cnt_11 + 10'd1};
// result_17 (horizontal state) advances on every clock.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : result_17_register
if ( RESET) begin
result_17 <= {2'b00,10'd0};
end else if (1'b1) begin
result_17 <= result_16;
end
end
// register end
// Unpack the VGA bundle onto the module outputs.
assign VGA_0 = VGA[26:24];
assign VGA_RED = VGA[23:16];
assign VGA_GREEN = VGA[15:8];
assign VGA_BLUE = VGA[7:0];
assign VGA_HSYNC = VGA_0[2:2];
assign VGA_VSYNC = VGA_0[1:1];
assign VGA_DE = VGA_0[0:0];
endmodule
根据开发者给出的这个回答:
Multithreading will only show speedups on much larger designs. In small designs the communication between cores will be much larger than leaving it on one core.
所以看来最初的猜测是正确的:问题中的设计规模太小,不足以体现出加速效果。多线程带来的开销很高,因此它无法从多线程中受益。
我正在用 Verilator 模拟一个电路,驱动它的程序非常简单:只是反复把时钟线先置高、再置低,直到满足某个输出条件:
#include "VSim.h"
#include <iostream>
vluint64_t main_time = 0;
// Simulation-time callback: reports the global main_time counter as a double.
double sc_time_stamp()
{
    const double now = static_cast<double>(main_time);
    return now;
}
// Advance the model by one full clock period.
// RESET is held low; CLK_25MHZ is driven high and then low, and after each
// level change the model is evaluated and main_time is incremented by one.
void vstep(VSim* top)
{
    top->RESET = 0;
    for (int level = 1; level >= 0; --level)
    {
        top->CLK_25MHZ = level;  // 1 on the first pass, 0 on the second
        top->eval();
        ++main_time;
    }
}
// Testbench entry point.
// Runs 10 rounds; each round clocks the model until VGA_HSYNC and VGA_VSYNC
// both read 0, then until VGA_DE reads non-zero. The total number of clock
// periods stepped is printed to standard output.
// argc/argv/env are accepted but not used. Returns 0.
int main(int argc, char** argv, char** env)
{
    VSim* top = new VSim();
    int cycles = 0;
    for (int j = 0; j < 10; ++j)
    {
        // Phase 1: step (at least once) until both sync outputs are low.
        do
        {
            vstep(top);
            cycles++;
        } while (!(top->VGA_HSYNC == 0 && top->VGA_VSYNC == 0));

        // Phase 2: step (at least once) until data-enable is asserted.
        do
        {
            vstep(top);
            cycles++;
        } while (!top->VGA_DE);
    }
    // Only <iostream> is included by this file, so use std::cout rather than
    // printf, which would need <cstdio>. The emitted text is unchanged.
    std::cout << "Verilator, from C: " << cycles << " cycles\n";
    // Let the model run its end-of-simulation processing before destruction.
    top->final();
    delete top;
    return 0;
}
问题在于:如果我以单线程模式运行 Verilator
(即运行 verilator 时不带 --threads N 标志,
编译时不定义 VL_THREADED,
链接时也不加入 -lpthread 和 verilated_threads.o),
那么这个程序大约需要 150 毫秒:
$ time ../_build/verilator/SimMain
Verilator, from C: 4192001 cycles
real 0m0.137s
user 0m0.133s
sys 0m0.004s
但是如果我使用 4 个线程,在同一台机器上(它有 4 个物理核心,开启超线程后共 8 个逻辑核心),我看到该进程的 CPU 占用率达到 400%,消耗的 CPU 时间是原来的 250 多倍,并且挂钟时间增加到约 70 倍:
$ time ../_build/verilator/SimMain
Verilator, from C: 4192001 cycles
real 0m9.528s
user 0m37.965s
sys 0m0.016s
这是什么原因造成的,我该如何解决?
编辑补充:这个问题是关于 Verilator 的。c++
标签似乎吸引了一些对 Verilator 一无所知的人,他们以为可以从基本原理直接推断出原因。这是做不到的,因为实际的多线程发生在 Verilator 生成的代码中。
编辑补充:我正在模拟的 RTL 算不上高质量的 Verilog,因为它是由 Clash 生成的;不过评论中有人提到,这种现象可能与它的某些特性有关。所以下面给出完整的 Verilog 代码:
/* AUTOMATICALLY GENERATED VERILOG-2001 SOURCE CODE.
** GENERATED BY CLASH 1.3.0. DO NOT MODIFY.
*/
`timescale 100fs/100fs
// topEntity: video signal generator that paints a 15x15-pixel square on a
// uniform background. The square's position is advanced by its speed once per
// pass of the vertical counter and reflected at fixed limits (0 and 464
// vertically, 0 and 624 horizontally).
// Every register is clocked on the rising edge of CLK_25MHZ and is reset
// asynchronously while RESET is high.
module topEntity
( // Inputs
input CLK_25MHZ // clock
, input RESET // reset
// Outputs
, output wire VGA_HSYNC
, output wire VGA_VSYNC
, output wire VGA_DE
, output wire [7:0] VGA_RED
, output wire [7:0] VGA_GREEN
, output wire [7:0] VGA_BLUE
);
wire [23:0] result;
wire b1;
wire [23:0] result_0;
wire result_1;
wire result_2;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [9:0] \x' ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [63:0] \c$x'_app_arg ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire [8:0] x;
// ../src/Bounce.hs:(52,1)-(58,54)
reg [19:0] ds = {10'sd0, 10'sd2};
// ../src/Bounce.hs:84:1-66
wire signed [9:0] dx;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] x_0;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] dx_0;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] diff;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2;
reg [19:0] result_3;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt;
// ../src/Bounce.hs:89:1-74
wire [19:0] ds1;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] x_1;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] dx_1;
// ../src/Bounce.hs:89:1-74
wire signed [9:0] diff_0;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2_0;
reg [19:0] result_4;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt_0;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [9:0] ballY;
wire result_5;
wire result_6;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [10:0] \x'_0 ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [63:0] \c$x'_app_arg_0 ;
// ../src/Bounce.hs:(52,1)-(58,54)
wire [9:0] x_2;
// ../src/Bounce.hs:(52,1)-(58,54)
reg [21:0] ds_0 = {11'sd0, 11'sd3};
// ../src/Bounce.hs:84:1-66
wire signed [10:0] dx_2;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] x_3;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] dx_3;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] diff_1;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2_1;
reg [21:0] result_7;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt_1;
// ../src/Bounce.hs:89:1-74
wire [21:0] ds1_0;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] x_4;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] dx_4;
// ../src/Bounce.hs:89:1-74
wire signed [10:0] diff_2;
// ../src/Bounce.hs:89:1-74
wire [1:0] ds2_2;
reg [21:0] result_8;
// ../src/Bounce.hs:89:1-74
wire [1:0] c$ds2_case_alt_2;
// ../src/Bounce.hs:(52,1)-(58,54)
wire signed [10:0] ballX;
wire result_9;
// ../src/Bounce.hs:(52,1)-(58,54)
reg old = 1'b0;
// ../src/Bounce.hs:(52,1)-(58,54)
wire c$frameEnd_case_alt;
// ../src/Bounce.hs:(52,1)-(58,54)
wire s;
wire [23:0] result_10;
wire c$app_arg;
reg [9:0] vgaY;
wire c$app_arg_0;
reg [10:0] vgaX;
wire [0:0] c$app_arg_1;
reg eta;
wire [0:0] c$app_arg_2;
reg eta_0;
wire [8:0] coord;
reg result_11;
wire [10:0] c$case_alt;
wire [10:0] c$case_alt_0;
wire [10:0] c$case_alt_1;
wire [10:0] c$case_alt_2;
reg [10:0] result_12;
reg [11:0] result_13;
wire [5:0] cnt;
wire [11:0] c$case_alt_3;
wire [6:0] cnt_0;
wire [11:0] c$case_alt_4;
wire [3:0] cnt_1;
wire [11:0] c$case_alt_5;
wire [9:0] cnt_2;
wire [11:0] c$case_alt_6;
reg [11:0] result_14 = {2'b00,10'd0};
wire [5:0] cnt_3;
wire [8:0] cnt_4;
wire [3:0] cnt_5;
wire [0:0] cnt_6;
wire [4:0] cnt_7;
reg [10:0] result_15 = {2'b00,9'd0};
wire [9:0] coord_0;
reg [11:0] result_16;
wire [5:0] cnt_8;
wire [11:0] c$case_alt_7;
wire [6:0] cnt_9;
wire [11:0] c$case_alt_8;
wire [3:0] cnt_10;
wire [11:0] c$case_alt_9;
wire [9:0] cnt_11;
wire [11:0] c$case_alt_10;
reg [11:0] result_17 = {2'b00,10'd0};
wire [9:0] result_selection_3;
wire [10:0] result_selection_9;
wire [9:0] s_selection_1;
wire [26:0] VGA;
wire [2:0] VGA_0;
// Output colour: all zeros while b1 is set (data-enable bit low), otherwise
// the ball/background colour result_0.
assign result = b1 ? {8'd0, 8'd0,
8'd0} : result_0;
// VGA packs {hsync, vsync, data-enable, red, green, blue}.
assign VGA = {result_10[23:21],
result[23:16], result[15:8], result[7:0]};
assign b1 = ~ result_10[21:21];
// {240,224,64} when the current pixel lies inside the square in both X
// (result_5) and Y (result_1); {48,48,48} otherwise.
assign result_0 = (result_5 & result_1) ? {8'd240,
8'd224, 8'd64} : {8'd48, 8'd48, 8'd48};
// Vertical hit test, only taken when the valid flag of vgaY (bit 9) is set:
// ballY <= y < ballY + 15.
assign result_selection_3 = result_10[9:0];
assign result_1 = result_selection_3[9:9] ? result_2 : 1'b0;
assign result_2 = (ballY <= \x' ) ? (\x' < (ballY + 10'sd15)) : 1'b0;
// The 9-bit row coordinate is zero-extended to 64 bits and the low 10 bits
// are then read back as a signed value.
assign \x' = $signed(\c$x'_app_arg [0+:10]);
assign \c$x'_app_arg = $unsigned({{(64-9) {1'b0}},x});
assign x = result_10[8:0];
// ds holds {ballY, vertical speed}; initial and reset value is {0, 2}.
// It takes the bounced value result_3 only on clocks where result_9 is high.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : ds_register
if ( RESET) begin
ds <= {10'sd0, 10'sd2};
end else if (result_9) begin
ds <= result_3;
end
end
// register end
assign dx = $signed(ds[9:0]);
// Second reflection stage (lower limit 0), applied to result_4.
// diff = 0 - position; ds2 is 1 when diff == 0, 0 when diff > 0, 2 when diff < 0.
assign x_0 = $signed(result_4[19:10]);
assign dx_0 = $signed(result_4[9:0]);
assign diff = 10'sd0 - x_0;
assign ds2 = (10'sd0 == diff) ? 2'd1 : c$ds2_case_alt;
always @(*) begin
case(ds2)
// Position below 0: mirror it about 0 and negate the speed.
2'b00 : result_3 = {10'sd0 + diff, -dx_0};
default : result_3 = result_4;
endcase
end
assign c$ds2_case_alt = (10'sd0 <= diff) ? 2'd0 : 2'd2;
// ds1 is the position advanced by the current speed.
assign ds1 = {ballY + dx, dx};
// First reflection stage (upper limit 464), applied to ds1.
// diff_0 = 464 - position; ds2_0 is 2 when the position exceeds 464.
assign x_1 = $signed(ds1[19:10]);
assign dx_1 = $signed(ds1[9:0]);
assign diff_0 = 10'sd464 - x_1;
assign ds2_0 = (10'sd0 == diff_0) ? 2'd1 : c$ds2_case_alt_0;
always @(*) begin
case(ds2_0)
// Position above 464: mirror it about 464 and negate the speed.
2'b10 : result_4 = {10'sd464 + diff_0,
-dx_1};
default : result_4 = ds1;
endcase
end
assign c$ds2_case_alt_0 = (10'sd0 <= diff_0) ? 2'd0 : 2'd2;
assign ballY = $signed(ds[19:10]);
// Horizontal hit test, only taken when the valid flag of vgaX (bit 10 of the
// selection) is set: ballX <= x < ballX + 15.
assign result_selection_9 = result_10[20:10];
assign result_5 = result_selection_9[10:10] ? result_6 : 1'b0;
assign result_6 = (ballX <= \x'_0 ) ? (\x'_0 < (ballX + 11'sd15)) : 1'b0;
assign \x'_0 = $signed(\c$x'_app_arg_0 [0+:11]);
assign \c$x'_app_arg_0 = $unsigned({{(64-10) {1'b0}},x_2});
assign x_2 = result_10[19:10];
// ds_0 holds {ballX, horizontal speed}; initial and reset value is {0, 3}.
// It takes the bounced value result_7 only on clocks where result_9 is high.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : ds_0_register
if ( RESET) begin
ds_0 <= {11'sd0, 11'sd3};
end else if (result_9) begin
ds_0 <= result_7;
end
end
// register end
assign dx_2 = $signed(ds_0[10:0]);
// Second reflection stage (lower limit 0), applied to result_8.
assign x_3 = $signed(result_8[21:11]);
assign dx_3 = $signed(result_8[10:0]);
assign diff_1 = 11'sd0 - x_3;
assign ds2_1 = (11'sd0 == diff_1) ? 2'd1 : c$ds2_case_alt_1;
always @(*) begin
case(ds2_1)
// Position below 0: mirror it about 0 and negate the speed.
2'b00 : result_7 = {11'sd0 + diff_1, -dx_3};
default : result_7 = result_8;
endcase
end
assign c$ds2_case_alt_1 = (11'sd0 <= diff_1) ? 2'd0 : 2'd2;
// ds1_0 is the position advanced by the current speed.
assign ds1_0 = {ballX + dx_2, dx_2};
// First reflection stage (upper limit 624), applied to ds1_0.
assign x_4 = $signed(ds1_0[21:11]);
assign dx_4 = $signed(ds1_0[10:0]);
assign diff_2 = 11'sd624 - x_4;
assign ds2_2 = (11'sd0 == diff_2) ? 2'd1 : c$ds2_case_alt_2;
always @(*) begin
case(ds2_2)
// Position above 624: mirror it about 624 and negate the speed.
2'b10 : result_8 = {11'sd624 + diff_2,
-dx_4};
default : result_8 = ds1_0;
endcase
end
assign c$ds2_case_alt_2 = (11'sd0 <= diff_2) ? 2'd0 : 2'd2;
assign ballX = $signed(ds_0[21:11]);
// result_9 is high when the previous sample of s (old) was 1 and s is now 0,
// i.e. on the falling edge of the vgaY valid flag.
assign result_9 = old ? c$frameEnd_case_alt : 1'b0;
// old is s delayed by one clock.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : old_register
if ( RESET) begin
old <= 1'b0;
end else if (1'b1) begin
old <= s;
end
end
// register end
assign c$frameEnd_case_alt = s ? 1'b0 : 1'b1;
assign s_selection_1 = result_10[9:0];
assign s = s_selection_1[9:9] ? 1'b1 : 1'b0;
// result_10 = {~eta_0, ~eta, vgaX valid & vgaY valid, vgaX, vgaY}.
// Its top three bits drive VGA_HSYNC, VGA_VSYNC and VGA_DE respectively, so
// each sync output is low exactly while its state machine is in state 2'b10.
assign result_10 = {{~ (c$app_arg_2),
~ (c$app_arg_1), c$app_arg_0 & c$app_arg},
vgaX, vgaY};
assign c$app_arg = vgaY[9:9] ? 1'b1 : 1'b0;
// vgaY = {valid, row}: valid only while the vertical state (result_15[10:9])
// is 2'b00; the row bits are don't-care otherwise.
always @(*) begin
case(result_15[10:9])
2'b00 : vgaY = {1'b1,coord};
default : vgaY = {1'b0,9'bxxxxxxxxx};
endcase
end
assign c$app_arg_0 = vgaX[10:10] ? 1'b1 : 1'b0;
// vgaX = {valid, column}: valid only while the horizontal state
// (result_17[11:10]) is 2'b00.
always @(*) begin
case(result_17[11:10])
2'b00 : vgaX = {1'b1,coord_0};
default : vgaX = {1'b0,10'bxxxxxxxxxx};
endcase
end
assign c$app_arg_1 = eta ? 1'b1 : 1'b0;
// eta is high while the vertical state is 2'b10.
always @(*) begin
case(result_15[10:9])
2'b10 : eta = 1'b1;
default : eta = 1'b0;
endcase
end
assign c$app_arg_2 = eta_0 ? 1'b1 : 1'b0;
// eta_0 is high while the horizontal state is 2'b10.
always @(*) begin
case(result_17[11:10])
2'b10 : eta_0 = 1'b1;
default : eta_0 = 1'b0;
endcase
end
assign coord = result_15[8:0];
// result_11 enables the vertical counter: high when result_14 is in state
// 2'b11 with its count at 47, the last step before it wraps to state 2'b00.
always @(*) begin
case(result_14[11:10])
2'b11 : result_11 = cnt_3 == 6'd47;
default : result_11 = 1'b0;
endcase
end
// Next-state candidates for the vertical state machine result_15
// ({2-bit state, counter}): state 00 counts 0..479, state 01 counts 0..10,
// state 10 counts 0..1, state 11 counts 0..30, then back to state 00.
assign c$case_alt = (cnt_4 == 9'd479) ? {2'b01,4'd0,5'bxxxxx} : {2'b00,cnt_4 + 9'd1};
assign c$case_alt_0 = (cnt_5 == 4'd10) ? {2'b10,1'd0,8'bxxxxxxxx} : {2'b01,cnt_5 + 4'd1,5'bxxxxx};
assign c$case_alt_1 = (cnt_6 == 1'd1) ? {2'b11,5'd0,4'bxxxx} : {2'b10,cnt_6 + 1'd1,8'bxxxxxxxx};
assign c$case_alt_2 = (cnt_7 == 5'd30) ? {2'b00,9'd0} : {2'b11,cnt_7 + 5'd1,4'bxxxx};
always @(*) begin
case(result_15[10:9])
2'b00 : result_12 = c$case_alt;
2'b01 : result_12 = c$case_alt_0;
2'b10 : result_12 = c$case_alt_1;
default : result_12 = c$case_alt_2;
endcase
end
// Next state of result_14, selected by its current 2-bit state.
always @(*) begin
case(result_14[11:10])
2'b00 : result_13 = c$case_alt_6;
2'b01 : result_13 = c$case_alt_5;
2'b10 : result_13 = c$case_alt_4;
default : result_13 = c$case_alt_3;
endcase
end
// Next-state candidates for result_14 ({2-bit state, counter}):
// state 00 counts 0..639, state 01 counts 0..15, state 10 counts 0..95,
// state 11 counts 0..47, then back to state 00.
assign cnt = result_14[9:4];
assign c$case_alt_3 = (cnt == 6'd47) ? {2'b00,10'd0} : {2'b11,cnt + 6'd1,4'bxxxx};
assign cnt_0 = result_14[9:3];
assign c$case_alt_4 = (cnt_0 == 7'd95) ? {2'b11,6'd0,4'bxxxx} : {2'b10,cnt_0 + 7'd1,3'bxxx};
assign cnt_1 = result_14[9:6];
assign c$case_alt_5 = (cnt_1 == 4'd15) ? {2'b10,7'd0,3'bxxx} : {2'b01,cnt_1 + 4'd1,6'bxxxxxx};
assign cnt_2 = result_14[9:0];
assign c$case_alt_6 = (cnt_2 == 10'd639) ? {2'b01,4'd0,6'bxxxxxx} : {2'b00,cnt_2 + 10'd1};
// result_14 advances on every clock; it has the same reset value and
// next-state rules as result_17 and is used here to derive result_11.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : result_14_register
if ( RESET) begin
result_14 <= {2'b00,10'd0};
end else if (1'b1) begin
result_14 <= result_13;
end
end
// register end
assign cnt_3 = result_14[9:4];
assign cnt_4 = result_15[8:0];
assign cnt_5 = result_15[8:5];
assign cnt_6 = result_15[8:8];
assign cnt_7 = result_15[8:4];
// result_15 (vertical state) advances only on clocks where result_11 is high.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : result_15_register
if ( RESET) begin
result_15 <= {2'b00,9'd0};
end else if (result_11) begin
result_15 <= result_12;
end
end
// register end
assign coord_0 = result_17[9:0];
// Next state of the horizontal state machine result_17, selected by its
// current 2-bit state.
always @(*) begin
case(result_17[11:10])
2'b00 : result_16 = c$case_alt_10;
2'b01 : result_16 = c$case_alt_9;
2'b10 : result_16 = c$case_alt_8;
default : result_16 = c$case_alt_7;
endcase
end
// Next-state candidates for result_17 ({2-bit state, counter}):
// state 00 counts 0..639, state 01 counts 0..15, state 10 counts 0..95,
// state 11 counts 0..47, then back to state 00.
assign cnt_8 = result_17[9:4];
assign c$case_alt_7 = (cnt_8 == 6'd47) ? {2'b00,10'd0} : {2'b11,cnt_8 + 6'd1,4'bxxxx};
assign cnt_9 = result_17[9:3];
assign c$case_alt_8 = (cnt_9 == 7'd95) ? {2'b11,6'd0,4'bxxxx} : {2'b10,cnt_9 + 7'd1,3'bxxx};
assign cnt_10 = result_17[9:6];
assign c$case_alt_9 = (cnt_10 == 4'd15) ? {2'b10,7'd0,3'bxxx} : {2'b01,cnt_10 + 4'd1,6'bxxxxxx};
assign cnt_11 = result_17[9:0];
assign c$case_alt_10 = (cnt_11 == 10'd639) ? {2'b01,4'd0,6'bxxxxxx} : {2'b00,cnt_11 + 10'd1};
// result_17 (horizontal state) advances on every clock.
// register begin
always @(posedge CLK_25MHZ or posedge RESET) begin : result_17_register
if ( RESET) begin
result_17 <= {2'b00,10'd0};
end else if (1'b1) begin
result_17 <= result_16;
end
end
// register end
// Unpack the VGA bundle onto the module outputs.
assign VGA_0 = VGA[26:24];
assign VGA_RED = VGA[23:16];
assign VGA_GREEN = VGA[15:8];
assign VGA_BLUE = VGA[7:0];
assign VGA_HSYNC = VGA_0[2:2];
assign VGA_VSYNC = VGA_0[1:1];
assign VGA_DE = VGA_0[0:0];
endmodule
根据开发者给出的这个回答:
Multithreading will only show speedups on much larger designs. In small designs the communication between cores will be much larger than leaving it on one core.
所以看来最初的猜测是正确的:问题中的设计规模太小,不足以体现出加速效果。多线程带来的开销很高,因此它无法从多线程中受益。