mirror of
https://github.com/openhwgroup/cva6.git
synced 2025-04-20 12:17:19 -04:00
BHT optimized for fpga (#1039)
It counts and saves the saturation bits in D flip-flops for the ASIC version and in a three-port asynchronous read memory for the FPGA version. FPGA flushing is not supported because the frontend module flushing signal is not connected.
This commit is contained in:
parent
1498700dd7
commit
3b55657552
4 changed files with 214 additions and 38 deletions
|
@ -30,6 +30,7 @@
|
|||
//FPGA memories
|
||||
${CVA6_REPO_DIR}/vendor/pulp-platform/fpga-support/rtl/SyncDpRam.sv
|
||||
${CVA6_REPO_DIR}/vendor/pulp-platform/fpga-support/rtl/AsyncDpRam.sv
|
||||
${CVA6_REPO_DIR}/vendor/pulp-platform/fpga-support/rtl/AsyncThreePortRam.sv
|
||||
|
||||
+incdir+${CVA6_REPO_DIR}/vendor/pulp-platform/common_cells/include/
|
||||
+incdir+${CVA6_REPO_DIR}/vendor/pulp-platform/common_cells/src/
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
|
||||
// Copyright 2023 - Thales for additionnal conribution.
|
||||
// Copyright and related rights are licensed under the Solderpad Hardware
|
||||
// License, Version 2.0 (the "License"); you may not use this file except in
|
||||
// compliance with the License. You may obtain a copy of the License at
|
||||
|
@ -12,8 +13,11 @@
|
|||
// Date: 08.02.2018
|
||||
// Migrated: Luis Vitorio Cargnini, IEEE
|
||||
// Date: 09.06.2018
|
||||
// FPGA optimization: Sebastien Jacq, Thales
|
||||
// Date: 2023-01-30
|
||||
|
||||
// branch history table - 2 bit saturation counter
|
||||
|
||||
module bht #(
|
||||
parameter int unsigned NR_ENTRIES = 1024
|
||||
)(
|
||||
|
@ -32,7 +36,7 @@ module bht #(
|
|||
localparam NR_ROWS = NR_ENTRIES / ariane_pkg::INSTR_PER_FETCH;
|
||||
// number of bits needed to index the row
|
||||
localparam ROW_ADDR_BITS = $clog2(ariane_pkg::INSTR_PER_FETCH);
|
||||
localparam ROW_INDEX_BITS = ariane_pkg::RVC == 1'b1 ? $clog2(ariane_pkg::INSTR_PER_FETCH) : 1; // 1 1
|
||||
localparam ROW_INDEX_BITS = ariane_pkg::RVC == 1'b1 ? $clog2(ariane_pkg::INSTR_PER_FETCH) : 1;
|
||||
// number of bits we should use for prediction
|
||||
localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS;
|
||||
// we are not interested in all bits of the address
|
||||
|
@ -44,8 +48,7 @@ module bht #(
|
|||
} bht_d[NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0], bht_q[NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0];
|
||||
|
||||
logic [$clog2(NR_ROWS)-1:0] index, update_pc;
|
||||
logic [ROW_INDEX_BITS-1:0] update_row_index;
|
||||
logic [1:0] saturation_counter;
|
||||
logic [ROW_INDEX_BITS-1:0] update_row_index;
|
||||
|
||||
assign index = vpc_i[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
|
||||
assign update_pc = bht_update_i.pc[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
|
||||
|
@ -55,56 +58,157 @@ module bht #(
|
|||
assign update_row_index = '0;
|
||||
end
|
||||
|
||||
// prediction assignment
|
||||
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_bht_output
|
||||
if (!ariane_pkg::FPGA_EN) begin : gen_asic_bht // ASIC TARGET
|
||||
|
||||
logic [1:0] saturation_counter;
|
||||
// prediction assignment
|
||||
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_bht_output
|
||||
assign bht_prediction_o[i].valid = bht_q[index][i].valid;
|
||||
assign bht_prediction_o[i].taken = bht_q[index][i].saturation_counter[1] == 1'b1;
|
||||
end
|
||||
end
|
||||
|
||||
always_comb begin : update_bht
|
||||
always_comb begin : update_bht
|
||||
bht_d = bht_q;
|
||||
saturation_counter = bht_q[update_pc][update_row_index].saturation_counter;
|
||||
|
||||
if (bht_update_i.valid && !debug_mode_i) begin
|
||||
bht_d[update_pc][update_row_index].valid = 1'b1;
|
||||
bht_d[update_pc][update_row_index].valid = 1'b1;
|
||||
|
||||
if (saturation_counter == 2'b11) begin
|
||||
if (saturation_counter == 2'b11) begin
|
||||
// we can safely decrease it
|
||||
if (!bht_update_i.taken)
|
||||
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
|
||||
// then check if it saturated in the negative regime e.g.: branch not taken
|
||||
end else if (saturation_counter == 2'b00) begin
|
||||
// we can safely increase it
|
||||
if (bht_update_i.taken)
|
||||
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
|
||||
end else begin // otherwise we are not in any boundaries and can decrease or increase it
|
||||
if (bht_update_i.taken)
|
||||
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
|
||||
else
|
||||
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
always_ff @(posedge clk_i or negedge rst_ni) begin
|
||||
if (!rst_ni) begin
|
||||
for (int unsigned i = 0; i < NR_ROWS; i++) begin
|
||||
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
|
||||
bht_q[i][j] <= '0;
|
||||
end
|
||||
end
|
||||
end else begin
|
||||
// evict all entries
|
||||
if (flush_i) begin
|
||||
for (int i = 0; i < NR_ROWS; i++) begin
|
||||
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
|
||||
bht_q[i][j].valid <= 1'b0;
|
||||
bht_q[i][j].saturation_counter <= 2'b10;
|
||||
end
|
||||
end
|
||||
end else begin
|
||||
bht_q <= bht_d;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
end else begin : gen_fpga_bht //FPGA TARGETS
|
||||
|
||||
// number of bits par word in the bram
|
||||
localparam BRAM_WORD_BITS = $bits(ariane_pkg::bht_t);
|
||||
logic [ROW_INDEX_BITS-1:0] row_index;
|
||||
logic [ariane_pkg::INSTR_PER_FETCH-1:0] bht_ram_we;
|
||||
logic [ariane_pkg::INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_0;
|
||||
logic [ariane_pkg::INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_1;
|
||||
logic [ariane_pkg::INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_write_address;
|
||||
logic [ariane_pkg::INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_wdata;
|
||||
logic [ariane_pkg::INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_0;
|
||||
logic [ariane_pkg::INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_1;
|
||||
|
||||
ariane_pkg::bht_t [ariane_pkg::INSTR_PER_FETCH-1:0] bht;
|
||||
ariane_pkg::bht_t [ariane_pkg::INSTR_PER_FETCH-1:0] bht_updated;
|
||||
|
||||
if (ariane_pkg::RVC) begin : gen_row_index
|
||||
assign row_index = vpc_i[ROW_ADDR_BITS + OFFSET - 1:OFFSET];
|
||||
end else begin
|
||||
assign row_index = '0;
|
||||
end
|
||||
|
||||
// -------------------------
|
||||
// prediction assignment & update Branch History Table
|
||||
// -------------------------
|
||||
always_comb begin : prediction_update_bht
|
||||
bht_ram_we = '0;
|
||||
bht_ram_read_address_0 = '0;
|
||||
bht_ram_read_address_1 = '0;
|
||||
bht_ram_write_address = '0;
|
||||
bht_ram_wdata ='0;
|
||||
bht_updated = '0;
|
||||
bht = '0;
|
||||
|
||||
for (int i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin
|
||||
if (row_index == i) begin
|
||||
bht_ram_read_address_0[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] = index;
|
||||
bht_prediction_o[i].valid = bht_ram_rdata_0[i*BRAM_WORD_BITS+2] ;
|
||||
bht_prediction_o[i].taken = bht_ram_rdata_0[i*BRAM_WORD_BITS+1] ;
|
||||
end
|
||||
end
|
||||
|
||||
if (bht_update_i.valid && !debug_mode_i) begin
|
||||
for (int i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin
|
||||
if (update_row_index == i) begin
|
||||
bht_ram_read_address_1[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] = update_pc;
|
||||
bht[i].saturation_counter = bht_ram_rdata_1[i*BRAM_WORD_BITS +: 2];
|
||||
|
||||
if (bht[i].saturation_counter == 2'b11) begin
|
||||
// we can safely decrease it
|
||||
if (!bht_update_i.taken)
|
||||
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
|
||||
// then check if it saturated in the negative regime e.g.: branch not taken
|
||||
end else if (saturation_counter == 2'b00) begin
|
||||
bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
|
||||
else
|
||||
bht_updated[i].saturation_counter = 2'b11;
|
||||
// then check if it saturated in the negative regime e.g.: branch not taken
|
||||
end else if (bht[i].saturation_counter == 2'b00) begin
|
||||
// we can safely increase it
|
||||
if (bht_update_i.taken)
|
||||
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
|
||||
end else begin // otherwise we are not in any boundaries and can decrease or increase it
|
||||
if (bht_update_i.taken)
|
||||
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
|
||||
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
|
||||
else
|
||||
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
|
||||
bht_updated[i].saturation_counter = 2'b00;
|
||||
end else begin // otherwise we are not in any boundaries and can decrease or increase it
|
||||
if (bht_update_i.taken)
|
||||
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
|
||||
else
|
||||
bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
|
||||
end
|
||||
|
||||
bht_updated[i].valid = 1'b1;
|
||||
bht_ram_we[i] = 1'b1;
|
||||
bht_ram_write_address[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] = update_pc;
|
||||
//bht_ram_wdata[(i+1)*BRAM_WORD_BITS-1] = 1'b1; //valid
|
||||
bht_ram_wdata[i*BRAM_WORD_BITS +: BRAM_WORD_BITS] = {bht_updated[i].valid , bht_updated[i].saturation_counter};
|
||||
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_bht_ram
|
||||
AsyncThreePortRam #(
|
||||
.ADDR_WIDTH($clog2(NR_ROWS)),
|
||||
.DATA_DEPTH (NR_ROWS),
|
||||
.DATA_WIDTH(BRAM_WORD_BITS)
|
||||
) i_bht_ram (
|
||||
.Clk_CI ( clk_i ),
|
||||
.WrEn_SI ( bht_ram_we[i] ),
|
||||
.WrAddr_DI ( bht_ram_write_address[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] ),
|
||||
.WrData_DI ( bht_ram_wdata[i*BRAM_WORD_BITS +: BRAM_WORD_BITS] ),
|
||||
.RdAddr_DI_0 ( bht_ram_read_address_0[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] ),
|
||||
.RdAddr_DI_1 ( bht_ram_read_address_1[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] ),
|
||||
.RdData_DO_0 ( bht_ram_rdata_0[i*BRAM_WORD_BITS +: BRAM_WORD_BITS] ),
|
||||
.RdData_DO_1 ( bht_ram_rdata_1[i*BRAM_WORD_BITS +: BRAM_WORD_BITS] )
|
||||
);
|
||||
end
|
||||
|
||||
always_ff @(posedge clk_i or negedge rst_ni) begin
|
||||
if (!rst_ni) begin
|
||||
for (int unsigned i = 0; i < NR_ROWS; i++) begin
|
||||
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
|
||||
bht_q[i][j] <= '0;
|
||||
end
|
||||
end
|
||||
end else begin
|
||||
// evict all entries
|
||||
if (flush_i) begin
|
||||
for (int i = 0; i < NR_ROWS; i++) begin
|
||||
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
|
||||
bht_q[i][j].valid <= 1'b0;
|
||||
bht_q[i][j].saturation_counter <= 2'b10;
|
||||
end
|
||||
end
|
||||
end else begin
|
||||
bht_q <= bht_d;
|
||||
end
|
||||
end
|
||||
end
|
||||
endmodule
|
||||
|
|
|
@ -379,6 +379,11 @@ package ariane_pkg;
|
|||
logic taken;
|
||||
} bht_prediction_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic [1:0] saturation_counter;
|
||||
} bht_t;
|
||||
|
||||
typedef enum logic[3:0] {
|
||||
NONE, // 0
|
||||
LOAD, // 1
|
||||
|
|
66
vendor/pulp-platform/fpga-support/rtl/AsyncThreePortRam.sv
vendored
Normal file
66
vendor/pulp-platform/fpga-support/rtl/AsyncThreePortRam.sv
vendored
Normal file
|
@ -0,0 +1,66 @@
|
|||
// Copyright 2023 Thales Research and Technology
|
||||
//
|
||||
// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0
|
||||
// You may obtain a copy of the License at https://solderpad.org/licenses
|
||||
//
|
||||
// Inferable, Asynchronous Three-Ports RAM, there are a write port and two read ports
|
||||
//
|
||||
//
|
||||
// This module is designed to work with both Xilinx and Microchip FPGA tools by following the respective
|
||||
// guidelines:
|
||||
// - Xilinx UG901 Vivado Design Suite User Guide: Synthesis
|
||||
// - Inferring Microchip PolarFire RAM Blocks
|
||||
//
|
||||
// Intel FPGA (Altera) doesn't seem to support asynchronous RAM
|
||||
//
|
||||
// Current Maintainers:: Sébastien Jacq - sjthales on github.com
|
||||
|
||||
|
||||
module AsyncThreePortRam
|
||||
#(
|
||||
parameter ADDR_WIDTH = 10,
|
||||
parameter DATA_DEPTH = 1024, // usually 2**ADDR_WIDTH, but can be lower
|
||||
parameter DATA_WIDTH = 32
|
||||
)(
|
||||
input logic Clk_CI,
|
||||
|
||||
// Write port
|
||||
input logic WrEn_SI,
|
||||
input logic [ADDR_WIDTH-1:0] WrAddr_DI,
|
||||
input logic [DATA_WIDTH-1:0] WrData_DI,
|
||||
|
||||
// Read ports
|
||||
input logic [ADDR_WIDTH-1:0] RdAddr_DI_0,
|
||||
input logic [ADDR_WIDTH-1:0] RdAddr_DI_1,
|
||||
|
||||
output logic [DATA_WIDTH-1:0] RdData_DO_0,
|
||||
output logic [DATA_WIDTH-1:0] RdData_DO_1
|
||||
);
|
||||
|
||||
logic [DATA_WIDTH-1:0] mem [DATA_DEPTH-1:0]= '{default:0};
|
||||
|
||||
// WRITE
|
||||
always_ff @(posedge Clk_CI)
|
||||
begin
|
||||
if (WrEn_SI) begin
|
||||
mem[WrAddr_DI] <= WrData_DI;
|
||||
end
|
||||
end
|
||||
|
||||
// READ
|
||||
assign RdData_DO_0 = mem[RdAddr_DI_0];
|
||||
assign RdData_DO_1 = mem[RdAddr_DI_1];
|
||||
|
||||
////////////////////////////
|
||||
// assertions
|
||||
////////////////////////////
|
||||
|
||||
// pragma translate_off
|
||||
assert property
|
||||
(@(posedge Clk_CI) (longint'(2)**longint'(ADDR_WIDTH) >= longint'(DATA_DEPTH)))
|
||||
else $error("depth out of bounds");
|
||||
// pragma translate_on
|
||||
|
||||
endmodule
|
Loading…
Add table
Add a link
Reference in a new issue