增加设计的 PPA 限制

increasing the PPA limitation of a design

我完成了 sha256 算法的 VHDL 设计。现在我想了解如何修改代码来提高设计水平,从而获得更好的功耗、性能和面积(PPA)结果。最终目标是为我的设计得到尽可能好的网表,以便将其放入芯片中。

所以对于我的设计:我在 cyclone 4 FPGA 中获得了 85 mhz 的最大频率,使用了 8,500 个逻辑元件,占 FPGA 的 55%。

我认为使我的设计如此庞大的主要问题是我以层次结构的方式编写代码,其中有很多 "elsif" 和变量。我认为,另一件可能更好的事情是,如果 quartus 将我的存储器设计实现为存储器而不是逻辑元件,即使它只有 16 个字的 32 位数组。 你们认为我可以改进什么?

library ieee;
use ieee.std_logic_1164.all;
use ieee.std_logic_unsigned.all;
USE ieee.numeric_std.ALL;

-- Top-level wrapper around three sha256 core instances (see the architecture
-- below). Port comments describe how the architecture in this file uses them.
entity padding is
    port (
        clk   : in  std_logic;                        -- clock; all processes act on its rising edge
        rst   : in  std_logic;                        -- asynchronous reset, active low (rst = '0')
        ward  : in  std_logic_vector(31 downto 0);    -- 32-bit input word stored into the message memories
        ready : out std_logic;                        -- wired to the `ready` output of instance sha3
        hash  : out std_logic_vector(255 downto 0)    -- wired to the `digest` output of instance sha3
    );
end padding;

architecture padding of padding is

-- Declaration of the sha256 core instantiated three times in this architecture.
-- The core itself is defined elsewhere; this declaration must match its entity.
-- NOTE: the port ORDER is significant when instances bind ports by position,
-- so keep it unchanged when editing.
component sha256
    port (
        -- control
        clk, rst, enable : in  std_logic;
        -- per-cycle data word and constant word
        ward, k          : in  std_logic_vector(31 downto 0);
        -- eight 32-bit state words supplied by the parent
        h0, h1, h2, h3,
        h4, h5, h6, h7   : in  std_logic_vector(31 downto 0);
        -- results
        ready            : out std_logic;
        digest           : out std_logic_vector(255 downto 0)
    );
end component;

-- 64-entry table type for the round constants and 16-entry type for the
-- message memories; every element is one 32-bit word.
type kconst is array (0 to 63) of std_logic_vector(31 downto 0);
type mem    is array (0 to 15) of std_logic_vector(31 downto 0);

-- SHA-256 round constants K[0..63] (FIPS 180-4).
-- Declared as a CONSTANT rather than a signal with an initial value: the table
-- is only ever read, a constant lets the tool treat it as a ROM / pure logic,
-- and it does not depend on signal power-up initialisation, which ASIC
-- synthesis flows generally ignore.
constant k : kconst := (
    x"428a2f98", x"71374491", x"b5c0fbcf", x"e9b5dba5", x"3956c25b", x"59f111f1", x"923f82a4", x"ab1c5ed5",
    x"d807aa98", x"12835b01", x"243185be", x"550c7dc3", x"72be5d74", x"80deb1fe", x"9bdc06a7", x"c19bf174",
    x"e49b69c1", x"efbe4786", x"0fc19dc6", x"240ca1cc", x"2de92c6f", x"4a7484aa", x"5cb0a9dc", x"76f988da",
    x"983e5152", x"a831c66d", x"b00327c8", x"bf597fc7", x"c6e00bf3", x"d5a79147", x"06ca6351", x"14292967",
    x"27b70a85", x"2e1b2138", x"4d2c6dfc", x"53380d13", x"650a7354", x"766a0abb", x"81c2c92e", x"92722c85",
    x"a2bfe8a1", x"a81a664b", x"c24b8b70", x"c76c51a3", x"d192e819", x"d6990624", x"f40e3585", x"106aa070",
    x"19a4c116", x"1e376c08", x"2748774c", x"34b0bcb5", x"391c0cb3", x"4ed8aa4a", x"5b9cca4f", x"682e6ff3",
    x"748f82ee", x"78a5636f", x"84c87814", x"8cc70208", x"90befffa", x"a4506ceb", x"bef9a3f7", x"c67178f2"
);

-- 16-word memory written by the sequencing process with input words, expanded
-- words, and later with words taken from temp1. Powers up as all zeros.
signal first_mem  : mem := (others => (others => '0'));

-- 16-word memory for the second core. Words 0..3 are loaded from `ward` by the
-- sequencing process; word 4 (x"80000000") and word 15 (x"00000280") are only
-- provided by this initial value.
-- NOTE(review): the sequencing process later overwrites every entry of
-- second_mem and never restores words 4..15, and signal initial values are not
-- honoured by every synthesis flow -- confirm whether these words need to be
-- reloaded explicitly (e.g. for a second message or for an ASIC target).
signal second_mem : mem := (
    4      => x"80000000",
    15     => x"00000280",
    others => x"00000000"
);

-- Handshake chain between the three cores.
signal enable   : std_logic;    -- enable input of sha1 (driven by the sequencing process)
signal enable1  : std_logic;    -- ready output of sha1, enable input of sha2
signal enable2  : std_logic;    -- ready output of sha2
signal r_d      : std_logic;    -- enable2 delayed by one clock, enable input of sha3

-- Per-cycle words presented to the cores.
signal k_in     : std_logic_vector(31 downto 0);    -- round constant shared by all three cores
signal ward_in  : std_logic_vector(31 downto 0);    -- data word for sha1
signal ward_in1 : std_logic_vector(31 downto 0);    -- data word for sha2
signal ward_in2 : std_logic_vector(31 downto 0);    -- data word for sha3

-- State words fed to sha1 and sha3 (assigned fixed values in the architecture body).
signal h0, h1, h2, h3 : std_logic_vector(31 downto 0);
signal h4, h5, h6, h7 : std_logic_vector(31 downto 0);

-- 256-bit digests produced by sha1 (temp) and sha2 (temp1).
signal temp     : std_logic_vector(255 downto 0);
signal temp1    : std_logic_vector(255 downto 0);

-- Registered rotate/shift/xor terms used when expanding the memories:
-- gama0/gama1/gama4/gama5 are derived from first_mem, gama2/gama3 from second_mem.
signal gama0    : std_logic_vector(31 downto 0);
signal gama1    : std_logic_vector(31 downto 0);
signal gama2    : std_logic_vector(31 downto 0);
signal gama3    : std_logic_vector(31 downto 0);
signal gama4    : std_logic_vector(31 downto 0);
signal gama5    : std_logic_vector(31 downto 0);

begin

-- Stage 1: starts from the fixed state words h0..h7, consumes ward_in.
-- Its ready output (enable1) enables stage 2; its digest is `temp`.
sha1 : sha256
    port map (
        clk    => clk,
        rst    => rst,
        enable => enable,
        ward   => ward_in,
        k      => k_in,
        h0     => h0,
        h1     => h1,
        h2     => h2,
        h3     => h3,
        h4     => h4,
        h5     => h5,
        h6     => h6,
        h7     => h7,
        ready  => enable1,
        digest => temp
    );

-- Stage 2: uses the digest of stage 1 (temp, most-significant word first) as
-- its eight state words and consumes ward_in1. Produces enable2 / temp1.
sha2 : sha256
    port map (
        clk    => clk,
        rst    => rst,
        enable => enable1,
        ward   => ward_in1,
        k      => k_in,
        h0     => temp(255 downto 224),
        h1     => temp(223 downto 192),
        h2     => temp(191 downto 160),
        h3     => temp(159 downto 128),
        h4     => temp(127 downto 96),
        h5     => temp(95 downto 64),
        h6     => temp(63 downto 32),
        h7     => temp(31 downto 0),
        ready  => enable2,
        digest => temp1
    );

-- Stage 3: starts again from the fixed state words h0..h7, is enabled by r_d
-- (enable2 delayed by one clock) and consumes ward_in2. Drives the entity
-- outputs `ready` and `hash`.
sha3 : sha256
    port map (
        clk    => clk,
        rst    => rst,
        enable => r_d,
        ward   => ward_in2,
        k      => k_in,
        h0     => h0,
        h1     => h1,
        h2     => h2,
        h3     => h3,
        h4     => h4,
        h5     => h5,
        h6     => h6,
        h7     => h7,
        ready  => ready,
        digest => hash
    );

-- Fixed state words fed to the sha1 and sha3 instances. The values are the
-- SHA-256 initial hash value H(0) from FIPS 180-4.
h0 <= x"6a09e667";
h1 <= x"bb67ae85";
h2 <= x"3c6ef372";
h3 <= x"a54ff53a";
h4 <= x"510e527f";
h5 <= x"9b05688c";
h6 <= x"1f83d9ab";
h7 <= x"5be0cd19";

-- Sequencing process: a step counter `i` selects what is written to the two
-- 16-word memories and which data word / round constant is presented to each
-- sha256 core on every rising clock edge. Reset is asynchronous, active low.
--
-- Steps on the first pass after reset (j = i mod 16, m = (i + 9) mod 16,
-- n = (i + 15) mod 16, l = (i + 8) mod 16):
--     0 ..  15 : store `ward` in first_mem(i) and feed it to sha1
--    16 ..  19 : store `ward` in second_mem(j); expand first_mem for sha1
--    20 ..  63 : expand first_mem for sha1
--    64 ..  79 : drop `enable`; feed second_mem(j) to sha2
--    80 .. 127 : expand second_mem for sha2
--   128        : idle
--   129 .. 136 : copy temp1, most-significant word first, into first_mem(n)
--                and feed it to sha3 (step 129 also preloads words 8..15)
--   137 .. 144 : feed first_mem(n) to sha3
--   145 .. 192 : expand first_mem for sha3
--   193        : restart the step counter
--
-- Changes versus a plain integer / elsif formulation, none of which alter the
-- values used on any clock edge:
--   * the counters are bounded subtypes, so synthesis builds 4-bit and 8-bit
--     counters instead of 32-bit ones;
--   * the circular indices wrap with "mod 16" when incremented, so they never
--     hold the out-of-range value 16;
--   * the step decode is a case statement with disjoint ranges instead of a
--     priority-encoded elsif chain.
--
-- NOTE(review): at step 193 `i` is cleared and then incremented in the same
-- clock, so the next pass starts at i = 1 while j/m/n/l keep wrapping every
-- 16 clocks; the i-to-j alignment of the first pass is therefore not repeated.
-- Confirm whether the design is meant to be reset between messages.
process (clk, rst)
    variable i : integer range 0 to 193;   -- step counter
    variable j : integer range 0 to 15;    -- circular write/read index
    variable m : integer range 0 to 15;    -- circular index, 9 ahead of j
    variable n : integer range 0 to 15;    -- circular index, 15 ahead of j
    variable l : integer range 0 to 15;    -- circular index, 8 ahead of j
begin
    if rst = '0' then
        enable <= '0';
        i := 0;
        j := 0;
        m := 9;
        n := 15;
        l := 8;
    elsif clk'event and clk = '1' then
        case i is
            when 193 =>
                -- End of the sequence; the increment below makes the next step 1.
                i := 0;

            when 145 to 192 =>
                first_mem(n) <= gama4 + first_mem(l) + gama5 + first_mem(n);
                ward_in2     <= gama4 + first_mem(l) + gama5 + first_mem(n);
                k_in         <= k(i - 129);

            when 137 to 144 =>
                ward_in2 <= first_mem(n);
                k_in     <= k(i - 129);

            when 136 =>
                first_mem(n) <= temp1(31 downto 0);
                ward_in2     <= temp1(31 downto 0);
                k_in         <= k(i - 129);

            when 135 =>
                first_mem(n) <= temp1(63 downto 32);
                ward_in2     <= temp1(63 downto 32);
                k_in         <= k(i - 129);

            when 134 =>
                first_mem(n) <= temp1(95 downto 64);
                ward_in2     <= temp1(95 downto 64);
                k_in         <= k(i - 129);

            when 133 =>
                first_mem(n) <= temp1(127 downto 96);
                ward_in2     <= temp1(127 downto 96);
                k_in         <= k(i - 129);

            when 132 =>
                first_mem(n) <= temp1(159 downto 128);
                ward_in2     <= temp1(159 downto 128);
                k_in         <= k(i - 129);

            when 131 =>
                first_mem(n) <= temp1(191 downto 160);
                ward_in2     <= temp1(191 downto 160);
                k_in         <= k(i - 129);

            when 130 =>
                first_mem(n) <= temp1(223 downto 192);
                ward_in2     <= temp1(223 downto 192);
                k_in         <= k(i - 129);

            when 129 =>
                -- Preload words 8..15, then store the first temp1 word. The
                -- first_mem(n) assignment stays last so it wins if n is 8..15.
                first_mem(15) <= x"00000100";
                first_mem(14) <= x"00000000";
                first_mem(13) <= x"00000000";
                first_mem(12) <= x"00000000";
                first_mem(11) <= x"00000000";
                first_mem(10) <= x"00000000";
                first_mem(9)  <= x"00000000";
                first_mem(8)  <= x"80000000";
                first_mem(n)  <= temp1(255 downto 224);
                ward_in2      <= temp1(255 downto 224);
                k_in          <= k(i - 129);

            when 128 =>
                -- Idle step: nothing is written.
                null;

            when 80 to 127 =>
                second_mem(j) <= gama2 + second_mem(m) + gama3 + second_mem(j);
                ward_in1      <= gama2 + second_mem(m) + gama3 + second_mem(j);
                k_in          <= k(i - 64);

            when 64 to 79 =>
                enable   <= '0';
                ward_in1 <= second_mem(j);
                k_in     <= k(i - 64);

            when 20 to 63 =>
                first_mem(j) <= gama0 + first_mem(m) + gama1 + first_mem(j);
                ward_in      <= gama0 + first_mem(m) + gama1 + first_mem(j);
                k_in         <= k(i);
                enable       <= '1';

            when 16 to 19 =>
                second_mem(j) <= ward;
                first_mem(j)  <= gama0 + first_mem(m) + gama1 + first_mem(j);
                ward_in       <= gama0 + first_mem(m) + gama1 + first_mem(j);
                k_in          <= k(i);
                enable        <= '1';

            when 0 to 15 =>
                first_mem(i) <= ward;
                ward_in      <= ward;
                k_in         <= k(i);
                enable       <= '1';
        end case;

        -- Advance the step counter and the four circular indices.
        i := i + 1;
        j := (j + 1) mod 16;
        m := (m + 1) mod 16;
        n := (n + 1) mod 16;
        l := (l + 1) mod 16;
    end if;
end process;

-- One-clock delay register: r_d follows enable2 (ready output of sha2) one
-- rising edge later and is used as the enable input of sha3.
-- Cleared asynchronously while rst is low.
r_d_reg : process (rst, clk)
begin
    if rst = '0' then
        r_d <= '0';
    elsif clk'event and clk = '1' then
        r_d <= enable2;
    end if;
end process r_d_reg;

-- Registers the rotate/shift/xor terms (gama0..gama5) that the sequencing
-- process adds together when expanding first_mem and second_mem.
-- Four circular indices walk the 16-word memories, one position per clock;
-- they are only reset by rst (asynchronous, active low).
--
-- The indices are bounded to 0..15 and wrap with "mod 16" on increment, so
-- synthesis builds 4-bit counters instead of 32-bit integers; the index values
-- used on every clock edge are the same as with a wrap-at-16 check before use.
process (clk, rst)
    variable f : integer range 0 to 15;
    variable j : integer range 0 to 15;
    variable l : integer range 0 to 15;
    variable m : integer range 0 to 15;

    -- rotr(x, 7) xor rotr(x, 18) xor shr(x, 3)
    function small_sigma0 (x : std_logic_vector(31 downto 0)) return std_logic_vector is
    begin
        return (x(6 downto 0) & x(31 downto 7))
           xor (x(17 downto 0) & x(31 downto 18))
           xor ("000" & x(31 downto 3));
    end small_sigma0;

    -- rotr(x, 17) xor rotr(x, 19) xor shr(x, 10)
    function small_sigma1 (x : std_logic_vector(31 downto 0)) return std_logic_vector is
    begin
        return (x(16 downto 0) & x(31 downto 17))
           xor (x(18 downto 0) & x(31 downto 19))
           xor ("0000000000" & x(31 downto 10));
    end small_sigma1;
begin
    if rst = '0' then
        f := 2;
        j := 15;
        l := 1;
        m := 14;
    elsif clk'event and clk = '1' then
        -- Terms derived from first_mem.
        gama0 <= small_sigma0(first_mem(f));
        gama1 <= small_sigma1(first_mem(j));
        gama4 <= small_sigma0(first_mem(l));
        gama5 <= small_sigma1(first_mem(m));

        -- Terms derived from second_mem.
        gama2 <= small_sigma0(second_mem(f));
        gama3 <= small_sigma1(second_mem(j));

        -- Advance the circular indices.
        f := (f + 1) mod 16;
        j := (j + 1) mod 16;
        l := (l + 1) mod 16;
        m := (m + 1) mod 16;
    end if;
end process;

end;

elsif(即“优先级编码/解码”)会影响你设计的频率。既然你还剩下不少可用的逻辑资源,可以考虑改用 case 语句……除非你确实需要优先级编码/解码。即便如此,如果你能够接受延迟上的权衡,也可以把解码分摊到几个时钟周期内完成(流水线解码),这样设计的频率可能会提高……最终,你需要运行一份时序报告,查看其中的慢速路径,才能了解瓶颈在哪里。

如果你真的想使用 RAM 而不是触发器(FF),可以让工具推断出一个 RAM(创建一个数组);如果这样行不通,也可以手动例化一个器件专用的 RAM……当然,之后还要为它添加控制逻辑。如果它是一个原语,就把它当作黑盒处理,以便以后替换成“等效的”ASIC 库原语。

就“变量”而言,这类讨论与“VHDL 对 Verilog”或“同步复位对异步复位”一样,大部分只是个人观点。我的观点是:“我不喜欢在可综合的 RTL 中使用变量”……它们对综合来说是合法的,但会在综合过程中“消失”,所以如果你想查看网表并与 RTL 进行比较,就只能手动追踪连接。通常没有充分的理由使用变量,因为它们在硬件层面不代表任何东西,还会让设计与网表之间的对应关系变得模糊。我喜欢看到 wire/net/reg 这类逻辑类型,这样就能清楚地知道你在硬件中创建了什么。不过,悉听尊便;只是我看到变量时往往会皱眉。

同样,就数组而言,我不是 "bundling signals into arrays" 的忠实粉丝...人们会争辩说它是 "faster" 和 "easier" 来处理,但对我来说,它进一步混淆了设计。同样,这不是非法的,但是当涉及到 OPC(其他人的代码)时,尝试跟踪信号可能会非常烦人,不仅在模块内,而且跨端口的阵列......然后,如果他们对这些阵列进行切片,或者以其他方式消灭它们,它会变得更加烦人。有点像这样的咆哮:)

最终,你可以按自己的想法去做,尤其是在 FPGA 中——与 ASIC 相比,有些人往往不太关注最终会生成什么样的电路细节。如果你在设计 ASIC,我认为你应该更严谨一些,并且应该能够看着自己的 RTL 就(在一定程度上)知道会生成什么,从而在需要时能够估算门数。为此,我强烈建议花时间在绘图程序(例如 Visio)中把设计画出来,包括门、触发器、解码器、多路复用器、状态机(FSM)、适当的伪代码、时钟树和复位树的细节,以及所有跨时钟域(CDC)逻辑等,并标上信号名称。一旦有了这张图,剩下的就只是把它转换成 RTL 的问题了……另外,作为给那些同意我对变量看法的人的一点奖励:你会发现图中没有变量,因此你的 RTL 中也不会有。 :)