BHT optimized for fpga (#1039)

It counts and saves the saturation bits in D flip-flops for 
the ASIC version and in a three-port asynchronous read memory
for the FPGA version.
FPGA flushing is not supported because the frontend module
flushing signal is not connected.
This commit is contained in:
sébastien jacq 2023-02-01 16:37:44 +01:00 committed by GitHub
parent 1498700dd7
commit 3b55657552
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 214 additions and 38 deletions

View file

@ -30,6 +30,7 @@
//FPGA memories
${CVA6_REPO_DIR}/vendor/pulp-platform/fpga-support/rtl/SyncDpRam.sv
${CVA6_REPO_DIR}/vendor/pulp-platform/fpga-support/rtl/AsyncDpRam.sv
${CVA6_REPO_DIR}/vendor/pulp-platform/fpga-support/rtl/AsyncThreePortRam.sv
+incdir+${CVA6_REPO_DIR}/vendor/pulp-platform/common_cells/include/
+incdir+${CVA6_REPO_DIR}/vendor/pulp-platform/common_cells/src/

View file

@ -1,4 +1,5 @@
// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
// Copyright 2023 - Thales for additional contribution.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 2.0 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
@ -12,8 +13,11 @@
// Date: 08.02.2018
// Migrated: Luis Vitorio Cargnini, IEEE
// Date: 09.06.2018
// FPGA optimization: Sebastien Jacq, Thales
// Date: 2023-01-30
// branch history table - 2 bit saturation counter
module bht #(
parameter int unsigned NR_ENTRIES = 1024
)(
@ -32,7 +36,7 @@ module bht #(
localparam NR_ROWS = NR_ENTRIES / ariane_pkg::INSTR_PER_FETCH;
// number of bits needed to index the row
localparam ROW_ADDR_BITS = $clog2(ariane_pkg::INSTR_PER_FETCH);
localparam ROW_INDEX_BITS = ariane_pkg::RVC == 1'b1 ? $clog2(ariane_pkg::INSTR_PER_FETCH) : 1; // 1 1
localparam ROW_INDEX_BITS = ariane_pkg::RVC == 1'b1 ? $clog2(ariane_pkg::INSTR_PER_FETCH) : 1;
// number of bits we should use for prediction
localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS;
// we are not interested in all bits of the address
@ -44,8 +48,7 @@ module bht #(
} bht_d[NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0], bht_q[NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0];
logic [$clog2(NR_ROWS)-1:0] index, update_pc;
logic [ROW_INDEX_BITS-1:0] update_row_index;
logic [1:0] saturation_counter;
logic [ROW_INDEX_BITS-1:0] update_row_index;
assign index = vpc_i[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
assign update_pc = bht_update_i.pc[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
@ -55,56 +58,157 @@ module bht #(
assign update_row_index = '0;
end
// prediction assignment
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_bht_output
if (!ariane_pkg::FPGA_EN) begin : gen_asic_bht // ASIC TARGET
logic [1:0] saturation_counter;
// prediction assignment
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_bht_output
assign bht_prediction_o[i].valid = bht_q[index][i].valid;
assign bht_prediction_o[i].taken = bht_q[index][i].saturation_counter[1] == 1'b1;
end
end
always_comb begin : update_bht
always_comb begin : update_bht
bht_d = bht_q;
saturation_counter = bht_q[update_pc][update_row_index].saturation_counter;
if (bht_update_i.valid && !debug_mode_i) begin
bht_d[update_pc][update_row_index].valid = 1'b1;
bht_d[update_pc][update_row_index].valid = 1'b1;
if (saturation_counter == 2'b11) begin
if (saturation_counter == 2'b11) begin
// we can safely decrease it
if (!bht_update_i.taken)
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
// then check if it saturated in the negative regime e.g.: branch not taken
end else if (saturation_counter == 2'b00) begin
// we can safely increase it
if (bht_update_i.taken)
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
end else begin // otherwise we are not in any boundaries and can decrease or increase it
if (bht_update_i.taken)
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
else
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
end
end
end
always_ff @(posedge clk_i or negedge rst_ni) begin
if (!rst_ni) begin
for (int unsigned i = 0; i < NR_ROWS; i++) begin
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
bht_q[i][j] <= '0;
end
end
end else begin
// evict all entries
if (flush_i) begin
for (int i = 0; i < NR_ROWS; i++) begin
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
bht_q[i][j].valid <= 1'b0;
bht_q[i][j].saturation_counter <= 2'b10;
end
end
end else begin
bht_q <= bht_d;
end
end
end
end else begin : gen_fpga_bht //FPGA TARGETS
// number of bits per word in the bram
localparam BRAM_WORD_BITS = $bits(ariane_pkg::bht_t);
logic [ROW_INDEX_BITS-1:0] row_index;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] bht_ram_we;
logic [ariane_pkg::INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_0;
logic [ariane_pkg::INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_1;
logic [ariane_pkg::INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_write_address;
logic [ariane_pkg::INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_wdata;
logic [ariane_pkg::INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_0;
logic [ariane_pkg::INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_1;
ariane_pkg::bht_t [ariane_pkg::INSTR_PER_FETCH-1:0] bht;
ariane_pkg::bht_t [ariane_pkg::INSTR_PER_FETCH-1:0] bht_updated;
if (ariane_pkg::RVC) begin : gen_row_index
assign row_index = vpc_i[ROW_ADDR_BITS + OFFSET - 1:OFFSET];
end else begin
assign row_index = '0;
end
// -------------------------
// prediction assignment & update Branch History Table
// -------------------------
always_comb begin : prediction_update_bht
bht_ram_we = '0;
bht_ram_read_address_0 = '0;
bht_ram_read_address_1 = '0;
bht_ram_write_address = '0;
bht_ram_wdata ='0;
bht_updated = '0;
bht = '0;
for (int i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin
if (row_index == i) begin
bht_ram_read_address_0[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] = index;
bht_prediction_o[i].valid = bht_ram_rdata_0[i*BRAM_WORD_BITS+2] ;
bht_prediction_o[i].taken = bht_ram_rdata_0[i*BRAM_WORD_BITS+1] ;
end
end
if (bht_update_i.valid && !debug_mode_i) begin
for (int i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin
if (update_row_index == i) begin
bht_ram_read_address_1[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] = update_pc;
bht[i].saturation_counter = bht_ram_rdata_1[i*BRAM_WORD_BITS +: 2];
if (bht[i].saturation_counter == 2'b11) begin
// we can safely decrease it
if (!bht_update_i.taken)
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
// then check if it saturated in the negative regime e.g.: branch not taken
end else if (saturation_counter == 2'b00) begin
bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
else
bht_updated[i].saturation_counter = 2'b11;
// then check if it saturated in the negative regime e.g.: branch not taken
end else if (bht[i].saturation_counter == 2'b00) begin
// we can safely increase it
if (bht_update_i.taken)
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
end else begin // otherwise we are not in any boundaries and can decrease or increase it
if (bht_update_i.taken)
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
else
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
bht_updated[i].saturation_counter = 2'b00;
end else begin // otherwise we are not in any boundaries and can decrease or increase it
if (bht_update_i.taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
else
bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
end
bht_updated[i].valid = 1'b1;
bht_ram_we[i] = 1'b1;
bht_ram_write_address[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] = update_pc;
//bht_ram_wdata[(i+1)*BRAM_WORD_BITS-1] = 1'b1; //valid
bht_ram_wdata[i*BRAM_WORD_BITS +: BRAM_WORD_BITS] = {bht_updated[i].valid , bht_updated[i].saturation_counter};
end
end
end
end
end
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_bht_ram
AsyncThreePortRam #(
.ADDR_WIDTH($clog2(NR_ROWS)),
.DATA_DEPTH (NR_ROWS),
.DATA_WIDTH(BRAM_WORD_BITS)
) i_bht_ram (
.Clk_CI ( clk_i ),
.WrEn_SI ( bht_ram_we[i] ),
.WrAddr_DI ( bht_ram_write_address[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] ),
.WrData_DI ( bht_ram_wdata[i*BRAM_WORD_BITS +: BRAM_WORD_BITS] ),
.RdAddr_DI_0 ( bht_ram_read_address_0[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] ),
.RdAddr_DI_1 ( bht_ram_read_address_1[i*$clog2(NR_ROWS) +: $clog2(NR_ROWS)] ),
.RdData_DO_0 ( bht_ram_rdata_0[i*BRAM_WORD_BITS +: BRAM_WORD_BITS] ),
.RdData_DO_1 ( bht_ram_rdata_1[i*BRAM_WORD_BITS +: BRAM_WORD_BITS] )
);
end
always_ff @(posedge clk_i or negedge rst_ni) begin
if (!rst_ni) begin
for (int unsigned i = 0; i < NR_ROWS; i++) begin
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
bht_q[i][j] <= '0;
end
end
end else begin
// evict all entries
if (flush_i) begin
for (int i = 0; i < NR_ROWS; i++) begin
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
bht_q[i][j].valid <= 1'b0;
bht_q[i][j].saturation_counter <= 2'b10;
end
end
end else begin
bht_q <= bht_d;
end
end
end
endmodule

View file

@ -379,6 +379,11 @@ package ariane_pkg;
logic taken;
} bht_prediction_t;
typedef struct packed {
logic valid;
logic [1:0] saturation_counter;
} bht_t;
typedef enum logic[3:0] {
NONE, // 0
LOAD, // 1

View file

@ -0,0 +1,66 @@
// Copyright 2023 Thales Research and Technology
//
// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0
// You may obtain a copy of the License at https://solderpad.org/licenses
//
// Inferable, asynchronous three-port RAM with one write port and two read ports
//
//
// This module is designed to work with both Xilinx and Microchip FPGA tools by following the respective
// guidelines:
// - Xilinx UG901 Vivado Design Suite User Guide: Synthesis
// - Inferring Microchip PolarFire RAM Blocks
//
// Intel FPGA (Altera) doesn't seem to support asynchronous RAM
//
// Current Maintainers:: Sébastien Jacq - sjthales on github.com
// Three-port RAM: one clocked write port and two independent read ports.
//
// Parameters:
//   ADDR_WIDTH - width in bits of the write address and of both read addresses
//   DATA_DEPTH - number of words in the storage array
//   DATA_WIDTH - width in bits of one stored word
//
// Ports:
//   Clk_CI                     - clock; a write is committed on its rising edge
//   WrEn_SI                    - write enable, sampled on the rising clock edge
//   WrAddr_DI / WrData_DI      - address and data of the word to write
//   RdAddr_DI_0 / RdAddr_DI_1  - addresses of the two read ports
//   RdData_DO_0 / RdData_DO_1  - words currently stored at the read addresses
//
// The module has no reset input; the array contents come only from the
// declaration initializer and from subsequent writes.
module AsyncThreePortRam
#(
  parameter ADDR_WIDTH = 10,
  parameter DATA_DEPTH = 1024, // usually 2**ADDR_WIDTH, but can be lower
  parameter DATA_WIDTH = 32
)(
  input  logic                  Clk_CI,

  // Write port
  input  logic                  WrEn_SI,
  input  logic [ADDR_WIDTH-1:0] WrAddr_DI,
  input  logic [DATA_WIDTH-1:0] WrData_DI,

  // Read ports
  input  logic [ADDR_WIDTH-1:0] RdAddr_DI_0,
  input  logic [ADDR_WIDTH-1:0] RdAddr_DI_1,
  output logic [DATA_WIDTH-1:0] RdData_DO_0,
  output logic [DATA_WIDTH-1:0] RdData_DO_1
);

  // Storage array: DATA_DEPTH words of DATA_WIDTH bits, every word initialized to zero
  logic [DATA_WIDTH-1:0] mem [DATA_DEPTH-1:0] = '{default:0};

  // WRITE
  // The non-blocking assignment updates the addressed word on the rising clock
  // edge, and only while the write enable is asserted.
  always_ff @(posedge Clk_CI)
  begin
    if (WrEn_SI) begin
      mem[WrAddr_DI] <= WrData_DI;
    end
  end

  // READ
  // Continuous assignments: each output follows its address input without
  // waiting for a clock edge.
  assign RdData_DO_0 = mem[RdAddr_DI_0];
  assign RdData_DO_1 = mem[RdAddr_DI_1];

  ////////////////////////////
  // assertions
  ////////////////////////////

  // pragma translate_off
  // The condition depends only on parameters, so it is checked once at the
  // start of simulation rather than on every clock edge. This way the check
  // also runs when the clock never toggles and cannot repeat the same error
  // message each cycle.
  initial begin
    assert (longint'(2)**longint'(ADDR_WIDTH) >= longint'(DATA_DEPTH))
      else $error("depth out of bounds");
  end
  // pragma translate_on

endmodule