Altera opt 2 (#2602)

The second optimization for Altera FPGAs is to move the BHT to LUTRAM. As before, the optimization previously done for Xilinx does not carry over because it relies on asynchronous RAM primitives, which Altera devices do not support. This optimization therefore implements the BHT with synchronous RAM.
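The difference between the two RAM styles can be summarised in a minimal SystemVerilog sketch (illustrative only, not the primitives used in this patch): the read style is what decides whether the tools can map the array to distributed RAM / MLABs.

module async_ram_sketch #(
    parameter int ADDR_WIDTH = 4,
    parameter int DATA_WIDTH = 8
) (
    input  logic                  clk_i,
    input  logic                  we_i,
    input  logic [ADDR_WIDTH-1:0] waddr_i,
    input  logic [DATA_WIDTH-1:0] wdata_i,
    input  logic [ADDR_WIDTH-1:0] raddr_i,
    output logic [DATA_WIDTH-1:0] rdata_o
);
  logic [DATA_WIDTH-1:0] mem[2**ADDR_WIDTH];
  always_ff @(posedge clk_i) if (we_i) mem[waddr_i] <= wdata_i;
  // combinational (asynchronous) read: data in the same cycle as the address.
  // Xilinx LUTRAM supports this style, Altera memories do not.
  assign rdata_o = mem[raddr_i];
endmodule

module sync_ram_sketch #(
    parameter int ADDR_WIDTH = 4,
    parameter int DATA_WIDTH = 8
) (
    input  logic                  clk_i,
    input  logic                  we_i,
    input  logic [ADDR_WIDTH-1:0] waddr_i,
    input  logic [DATA_WIDTH-1:0] wdata_i,
    input  logic [ADDR_WIDTH-1:0] raddr_i,
    output logic [DATA_WIDTH-1:0] rdata_o
);
  logic [DATA_WIDTH-1:0] mem[2**ADDR_WIDTH];
  always_ff @(posedge clk_i) begin
    if (we_i) mem[waddr_i] <= wdata_i;
    // registered (synchronous) read: data one cycle after the address,
    // which is the style Altera memories support and SyncThreePortRam follows.
    rdata_o <= mem[raddr_i];
  end
endmodule

Everything else in this change follows from hiding that extra read cycle from the rest of the frontend.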

The main changes to the existing code are:

New RAM module (SyncThreePortRam.sv) that infers synchronous RAM on Altera, with two independent read ports and one write port.

Changes in frontend.sv: the input to the vpc_i port of the BHT is modified so that the read address is presented one cycle earlier, compensating for the read latency of the synchronous RAM. A sketch of the idea is shown right after this paragraph.
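The module wrapper and port names below are illustrative only; the condition itself mirrors the vpc_bht assignment visible in the frontend.sv diff further down.

module bht_addr_advance_sketch #(
    parameter int unsigned VLEN = 64,
    parameter bit FpgaEn       = 1'b1,
    parameter bit FpgaAlteraEn = 1'b1
) (
    input  logic            icache_dreq_valid_i,  // a new fetch request is issued this cycle
    input  logic [VLEN-1:0] icache_dreq_vaddr_i,  // un-registered request address
    input  logic [VLEN-1:0] icache_vaddr_q_i,     // same address registered one cycle later
    output logic [VLEN-1:0] vpc_bht_o             // address driven into bht.vpc_i
);
  // With a synchronous BHT RAM the read address has to be applied one cycle
  // early, so the prediction appears exactly when the registered address holds
  // the same value; otherwise (ASIC/Xilinx, or no pending request) the
  // registered address is used as before.
  assign vpc_bht_o = (FpgaEn && FpgaAlteraEn && icache_dreq_valid_i)
      ? icache_dreq_vaddr_i
      : icache_vaddr_q_i;
endmodule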

Changes in bht.sv: this case is more complex because of the logic that has to be performed inside the BHT. The entry addressed by bht_update_i.pc is first read from the memory (read port 1), modified according to the saturation counter and valid bit, and finally written back. The prediction output is produced from the entry addressed by vpc_i (read port 0). With asynchronous memory, data written via bht_update_i is available one clock cycle after the write, so if vpc_i then reads the address that was just updated, everything is fine. With synchronous memory, however, there are three clock cycles of latency: one to read the old entry (read port 1), one to write the updated entry, and one to read it back on the prediction port (read port 0). The design therefore has to be adapted to these latency constraints:

First, the write address of the synchronous RAM has to be delayed, so that the write waits for the preceding read of the entry and stores the correctly modified data.

Once this is solved, similarly to the FIFO case, an auxiliary buffer is needed to hold the data written to the RAM, so that it is available two clock cycles after bht_update_i was valid. This is because, once the correct data is computed, the RAM needs two more clock cycles before it can appear at the output (one cycle to write it and one to read it).

Finally, a multiplexer at the output delivers the correct prediction, selecting the data from the update logic (one cycle of delay), the auxiliary register (two cycles of delay), or the RAM (three or more cycles of delay), depending on how many cycles have elapsed since bht_update_i was valid (i.e. written to the memory). A simplified sketch of this bypass scheme is shown right after this paragraph.
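Below is a simplified, single-lane sketch of the resulting write delay and output bypass. It folds the RAM model into the module and assumes the updated entry arrives already aligned to the write cycle; the real bht.sv in the diff below vectorises this over CVA6Cfg.INSTR_PER_FETCH, keeps a separate read port for the update path, and splits each entry into a valid bit and a saturation counter. All names here are illustrative only.

module bht_bypass_sketch #(
    parameter int unsigned ADDR_WIDTH = 8,
    parameter int unsigned DATA_WIDTH = 3   // e.g. {valid, saturation_counter}
) (
    input  logic                  clk_i,
    input  logic                  rst_ni,
    // write side: updated entry, aligned to the cycle in which the RAM is written
    input  logic                  wr_en_i,
    input  logic [ADDR_WIDTH-1:0] wr_addr_i,
    input  logic [DATA_WIDTH-1:0] wr_data_i,
    // read side: prediction address, applied one cycle before the result is used
    input  logic [ADDR_WIDTH-1:0] vpc_addr_i,
    output logic [DATA_WIDTH-1:0] prediction_o
);
  logic [DATA_WIDTH-1:0] mem[2**ADDR_WIDTH];
  logic [DATA_WIDTH-1:0] ram_rdata_q;   // synchronous read data of the prediction port
  logic [ADDR_WIDTH-1:0] vpc_addr_q;    // address ram_rdata_q corresponds to
  logic [ADDR_WIDTH-1:0] wr_addr_q;     // auxiliary buffer: last cycle's write
  logic [DATA_WIDTH-1:0] wr_data_q;
  logic                  wr_en_q;

  // synchronous RAM model: registered read; a read of the address written in
  // the same cycle still returns the old data
  always_ff @(posedge clk_i) begin
    if (wr_en_i) mem[wr_addr_i] <= wr_data_i;
    ram_rdata_q <= mem[vpc_addr_i];
  end

  // bookkeeping for the bypass paths
  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      vpc_addr_q <= '0;
      wr_addr_q  <= '0;
      wr_data_q  <= '0;
      wr_en_q    <= 1'b0;
    end else begin
      vpc_addr_q <= vpc_addr_i;
      wr_addr_q  <= wr_addr_i;
      wr_data_q  <= wr_data_i;
      wr_en_q    <= wr_en_i;
    end
  end

  // output multiplexer: the most recently written value wins
  always_comb begin
    if (wr_en_i && (vpc_addr_q == wr_addr_i)) begin
      prediction_o = wr_data_i;     // entry is being written this very cycle
    end else if (wr_en_q && (vpc_addr_q == wr_addr_q)) begin
      prediction_o = wr_data_q;     // written last cycle, RAM output still stale
    end else begin
      prediction_o = ram_rdata_q;   // RAM output already reflects the update
    end
  end
endmodule

The two bypass levels correspond to the one- and two-cycle cases described above; from the third cycle on, the RAM output itself is up to date.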
AngelaGonzalezMarino 2024-11-21 23:36:18 +01:00 committed by GitHub
parent 8a84f788d6
commit c389382c89
3 changed files with 220 additions and 59 deletions

bht.sv

@@ -1,5 +1,6 @@
// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
// Copyright 2023 - Thales for additionnal conribution.
// Copyright 2023 - Thales for additionnal contribution.
// Copyright 2024 - PlanV Technologies for additionnal contribution.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 2.0 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
@@ -15,6 +16,8 @@
// Date: 09.06.2018
// FPGA optimization: Sebastien Jacq, Thales
// Date: 2023-01-30
// FPGA optimization for Altera: Angela Gonzalez, PlanV Technolgies
// Date: 2024-10-16
// branch history table - 2 bit saturation counter
@@ -47,8 +50,6 @@ module bht #(
localparam ROW_INDEX_BITS = CVA6Cfg.RVC == 1'b1 ? $clog2(CVA6Cfg.INSTR_PER_FETCH) : 1;
// number of bits we should use for prediction
localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS;
// we are not interested in all bits of the address
unread i_unread (.d_i(|vpc_i));
struct packed {
logic valid;
@@ -58,7 +59,7 @@ module bht #(
bht_q[NR_ROWS-1:0][CVA6Cfg.INSTR_PER_FETCH-1:0];
logic [$clog2(NR_ROWS)-1:0] index, update_pc;
logic [ROW_INDEX_BITS-1:0] update_row_index;
logic [ROW_INDEX_BITS-1:0] update_row_index, update_row_index_q, check_update_row_index;
assign index = vpc_i[PREDICTION_BITS-1:ROW_ADDR_BITS+OFFSET];
assign update_pc = bht_update_i.pc[PREDICTION_BITS-1:ROW_ADDR_BITS+OFFSET];
@@ -127,17 +128,23 @@ module bht #(
// number of bits par word in the bram
localparam BRAM_WORD_BITS = $bits(ariane_pkg::bht_t);
logic [ ROW_INDEX_BITS-1:0] row_index;
logic [ CVA6Cfg.INSTR_PER_FETCH-1:0] bht_ram_we;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_0;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_1;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_write_address;
logic [ CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_wdata;
logic [ CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_0;
logic [ CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_1;
logic [ROW_INDEX_BITS-1:0] row_index, row_index_q, check_row_index;
logic [CVA6Cfg.INSTR_PER_FETCH-1:0] bht_ram_we, bht_ram_we_q;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_0;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_1;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0]
bht_ram_write_address, bht_ram_write_address_q;
logic [CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_wdata, bht_ram_wdata_q;
logic [CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_0;
logic [CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_1;
ariane_pkg::bht_t [ CVA6Cfg.INSTR_PER_FETCH-1:0] bht;
ariane_pkg::bht_t [ CVA6Cfg.INSTR_PER_FETCH-1:0] bht_updated;
ariane_pkg::bht_t [CVA6Cfg.INSTR_PER_FETCH-1:0] bht;
ariane_pkg::bht_t [CVA6Cfg.INSTR_PER_FETCH-1:0] bht_updated;
logic [CVA6Cfg.INSTR_PER_FETCH-1:0][1:0] bht_updated_valid;
logic [CVA6Cfg.INSTR_PER_FETCH-1:0][1:0][CVA6Cfg.VLEN-1:0] bht_updated_pc;
logic bht_update_taken, check_bht_update_taken;
logic [CVA6Cfg.VLEN-1:0] vpc_q;
if (CVA6Cfg.RVC) begin : gen_row_index
assign row_index = vpc_i[ROW_ADDR_BITS+OFFSET-1:OFFSET];
@@ -157,64 +164,150 @@ module bht #(
bht_updated = '0;
bht = '0;
for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin
bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = index;
bht_prediction_o[i].valid = bht_ram_rdata_0[i*BRAM_WORD_BITS+2];
bht_prediction_o[i].taken = bht_ram_rdata_0[i*BRAM_WORD_BITS+1];
end
//Write to RAM
if (bht_update_i.valid && !debug_mode_i) begin
for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin
if (update_row_index == i) begin
bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc;
bht[i].saturation_counter = bht_ram_rdata_1[i*BRAM_WORD_BITS+:2];
if (bht[i].saturation_counter == 2'b11) begin
// we can safely decrease it
if (!bht_update_i.taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
else bht_updated[i].saturation_counter = 2'b11;
// then check if it saturated in the negative regime e.g.: branch not taken
end else if (bht[i].saturation_counter == 2'b00) begin
// we can safely increase it
if (bht_update_i.taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
else bht_updated[i].saturation_counter = 2'b00;
end else begin // otherwise we are not in any boundaries and can decrease or increase it
if (bht_update_i.taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
else bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
end
bht_updated[i].valid = 1'b1;
bht_ram_we[i] = 1'b1;
bht_ram_write_address[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc;
//bht_ram_wdata[(i+1)*BRAM_WORD_BITS-1] = 1'b1; //valid
bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS] = {
bht_updated[i].valid, bht_updated[i].saturation_counter
};
end
end
end
for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin
//When synchronous RAM is used, addresses are needed as soon as available
if (CVA6Cfg.FpgaAlteraEn)
bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = index;
if (CVA6Cfg.FpgaAlteraEn)
bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc;
if (check_update_row_index == i) begin
//When asynchronous RAM is used, the address can be updated on the cycle when data is read
if (!CVA6Cfg.FpgaAlteraEn)
bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc;
bht[i].saturation_counter = bht_ram_rdata_1[i*BRAM_WORD_BITS+:2];
if (bht[i].saturation_counter == 2'b11) begin
// we can safely decrease it
if (!check_bht_update_taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
else bht_updated[i].saturation_counter = 2'b11;
// then check if it saturated in the negative regime e.g.: branch not taken
end else if (bht[i].saturation_counter == 2'b00) begin
// we can safely increase it
if (check_bht_update_taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
else bht_updated[i].saturation_counter = 2'b00;
end else begin // otherwise we are not in any boundaries and can decrease or increase it
if (check_bht_update_taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
else bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
end
//The data written in the RAM will have the valid bit from current input (async RAM) or the one from one clock cycle before (sync RAM)
bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS] = CVA6Cfg.FpgaAlteraEn ? {bht_updated_valid[i][0], bht_updated[i].saturation_counter} :
{bht_updated[i].valid, bht_updated[i].saturation_counter};
end
if (!rst_ni) begin
//initialize output
bht_prediction_o[i] = '0;
end else begin
//When asynchronous RAM is used, addresses can be calculated on the same cycle as data is read
if (!CVA6Cfg.FpgaAlteraEn)
bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = index;
//When synchronous RAM is used and data is read right after writing, we need some buffering
// This is one cycle of buffering
if (CVA6Cfg.FpgaAlteraEn && bht_updated_valid[i][0] && vpc_q == bht_updated_pc[i][0]) begin
bht_prediction_o[i].valid = bht_ram_wdata[i*BRAM_WORD_BITS+2];
bht_prediction_o[i].taken = bht_ram_wdata[i*BRAM_WORD_BITS+1];
//This is two cycles of buffering
end else if (CVA6Cfg.FpgaAlteraEn && bht_updated_valid[i][1] && vpc_q == bht_updated_pc[i][1]) begin
bht_prediction_o[i].valid = bht_ram_wdata_q[i*BRAM_WORD_BITS+2];
bht_prediction_o[i].taken = bht_ram_wdata_q[i*BRAM_WORD_BITS+1];
//In any other case we can safely read from the RAM as data is available
end else begin
bht_prediction_o[i].valid = bht_ram_rdata_0[i*BRAM_WORD_BITS+2];
bht_prediction_o[i].taken = bht_ram_rdata_0[i*BRAM_WORD_BITS+1];
end
end
end
end
for (genvar i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin : gen_bht_ram
AsyncThreePortRam #(
.ADDR_WIDTH($clog2(NR_ROWS)),
.DATA_DEPTH(NR_ROWS),
.DATA_WIDTH(BRAM_WORD_BITS)
) i_bht_ram (
.Clk_CI (clk_i),
.WrEn_SI (bht_ram_we[i]),
.WrAddr_DI (bht_ram_write_address[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.WrData_DI (bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdAddr_DI_0(bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdAddr_DI_1(bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdData_DO_0(bht_ram_rdata_0[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdData_DO_1(bht_ram_rdata_1[i*BRAM_WORD_BITS+:BRAM_WORD_BITS])
);
if (CVA6Cfg.FpgaAlteraEn) begin
SyncThreePortRam #(
.ADDR_WIDTH($clog2(NR_ROWS)),
.DATA_DEPTH(NR_ROWS),
.DATA_WIDTH(BRAM_WORD_BITS)
) i_bht_ram (
.Clk_CI (clk_i),
.WrEn_SI (bht_ram_we_q[i]),
.WrAddr_DI (bht_ram_write_address_q[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.WrData_DI (bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdAddr_DI_0(bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdAddr_DI_1(bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdData_DO_0(bht_ram_rdata_0[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdData_DO_1(bht_ram_rdata_1[i*BRAM_WORD_BITS+:BRAM_WORD_BITS])
);
end else begin
AsyncThreePortRam #(
.ADDR_WIDTH($clog2(NR_ROWS)),
.DATA_DEPTH(NR_ROWS),
.DATA_WIDTH(BRAM_WORD_BITS)
) i_bht_ram (
.Clk_CI (clk_i),
.WrEn_SI (bht_ram_we[i]),
.WrAddr_DI (bht_ram_write_address[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.WrData_DI (bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdAddr_DI_0(bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdAddr_DI_1(bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdData_DO_0(bht_ram_rdata_0[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdData_DO_1(bht_ram_rdata_1[i*BRAM_WORD_BITS+:BRAM_WORD_BITS])
);
end
end
// Extra buffering signals needed when synchronous RAM is used
always_ff @(posedge clk_i or negedge rst_ni) begin
if (CVA6Cfg.FpgaAlteraEn) begin
if (!rst_ni) begin
bht_updated_valid <= '0;
bht_update_taken <= '0;
bht_ram_wdata_q <= '0;
row_index_q <= '0;
bht_ram_we_q <= '0;
bht_ram_write_address_q <= '0;
update_row_index_q <= '0;
end else begin
for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin
bht_updated_valid[i][1] <= bht_updated_valid[i][0];
bht_updated_valid[i][0] <= bht_updated[i].valid;
bht_updated_pc[i][1] <= bht_updated_pc[i][0];
bht_updated_pc[i][0] <= bht_update_i.pc;
end
vpc_q <= vpc_i;
bht_update_taken <= bht_update_i.taken;
bht_ram_wdata_q <= bht_ram_wdata;
bht_ram_we_q <= bht_ram_we;
bht_ram_write_address_q <= bht_ram_write_address;
update_row_index_q <= update_row_index;
row_index_q <= row_index;
end
end
end
// Assignment of indexes checked to generate data written in the RAM. When synchronous RAM is used these signals need to be delayed
assign check_update_row_index = CVA6Cfg.FpgaAlteraEn ? update_row_index_q : update_row_index;
assign check_bht_update_taken = CVA6Cfg.FpgaAlteraEn ? bht_update_taken : bht_update_i.taken;
assign check_row_index = CVA6Cfg.FpgaAlteraEn ? row_index_q : row_index;
end
endmodule

frontend.sv

@@ -140,6 +140,7 @@ module frontend
btb_prediction_t [CVA6Cfg.INSTR_PER_FETCH-1:0] btb_prediction_shifted;
ras_t ras_predict;
logic [ CVA6Cfg.VLEN-1:0] vpc_btb;
logic [ CVA6Cfg.VLEN-1:0] vpc_bht;
// branch-predict update
logic is_mispredict;
@@ -484,7 +485,9 @@ module frontend
//For FPGA, BTB is implemented in read synchronous BRAM
//while for ASIC, BTB is implemented in D flip-flop
//and can be read at the same cycle.
//Same for BHT
assign vpc_btb = (CVA6Cfg.FpgaEn) ? icache_dreq_i.vaddr : icache_vaddr_q;
assign vpc_bht = (CVA6Cfg.FpgaEn && CVA6Cfg.FpgaAlteraEn && icache_dreq_i.valid) ? icache_dreq_i.vaddr : icache_vaddr_q;
if (CVA6Cfg.BTBEntries == 0) begin
assign btb_prediction = '0;
@@ -517,7 +520,7 @@ module frontend
.rst_ni,
.flush_bp_i (flush_bp_i),
.debug_mode_i,
.vpc_i (icache_vaddr_q),
.vpc_i (vpc_bht),
.bht_update_i (bht_update),
.bht_prediction_o(bht_prediction)
);

SyncThreePortRam.sv

@@ -0,0 +1,65 @@
// Copyright 2024 PlanV Technologies
//
// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0
// You may obtain a copy of the License at https://solderpad.org/licenses
//
// Inferable, Synchronous Three-Port RAM, with one write port and two read ports
//
//
// This module is designed to work with both Xilinx, Microchip and Altera FPGA tools by following the respective
// guidelines:
// - Xilinx UG901 Vivado Design Suite User Guide: Synthesis
// - Inferring Microchip PolarFire RAM Blocks
// - Altera Quartus II Handbook Volume 1: Design and Synthesis (p. 768)
//
// Current Maintainers:: Angela Gonzalez - PlanV Technologies
module SyncThreePortRam
#(
parameter ADDR_WIDTH = 10,
parameter DATA_DEPTH = 1024, // usually 2**ADDR_WIDTH, but can be lower
parameter DATA_WIDTH = 32
)(
input logic Clk_CI,
// Write port
input logic WrEn_SI,
input logic [ADDR_WIDTH-1:0] WrAddr_DI,
input logic [DATA_WIDTH-1:0] WrData_DI,
// Read ports
input logic [ADDR_WIDTH-1:0] RdAddr_DI_0,
input logic [ADDR_WIDTH-1:0] RdAddr_DI_1,
output logic [DATA_WIDTH-1:0] RdData_DO_0,
output logic [DATA_WIDTH-1:0] RdData_DO_1
);
logic [DATA_WIDTH-1:0] mem [DATA_DEPTH-1:0]= '{default:0};
// WRITE
always_ff @(posedge Clk_CI)
begin
if (WrEn_SI) begin
mem[WrAddr_DI] <= WrData_DI;
end
RdData_DO_0 = mem[RdAddr_DI_0];
RdData_DO_1 = mem[RdAddr_DI_1];
end
////////////////////////////
// assertions
////////////////////////////
// pragma translate_off
assert property
(@(posedge Clk_CI) (longint'(2)**longint'(ADDR_WIDTH) >= longint'(DATA_DEPTH)))
else $error("depth out of bounds");
// pragma translate_on
endmodule