Altera opt 2 (#2602)

The second optimization for Altera FPGAs is to move the BHT to LUTRAM. As before, the optimization previously done for Xilinx does not carry over because it relies on asynchronous RAM primitives, which Altera devices do not support. This optimization therefore implements the BHT with synchronous RAM.
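The difference between the two RAM styles can be summarised in a minimal SystemVerilog sketch (illustrative only, not the primitives used in this patch): the read style is what decides whether the tools can map the array to distributed RAM / MLABs.

module async_ram_sketch #(
    parameter int ADDR_WIDTH = 4,
    parameter int DATA_WIDTH = 8
) (
    input  logic                  clk_i,
    input  logic                  we_i,
    input  logic [ADDR_WIDTH-1:0] waddr_i,
    input  logic [DATA_WIDTH-1:0] wdata_i,
    input  logic [ADDR_WIDTH-1:0] raddr_i,
    output logic [DATA_WIDTH-1:0] rdata_o
);
  logic [DATA_WIDTH-1:0] mem[2**ADDR_WIDTH];
  always_ff @(posedge clk_i) if (we_i) mem[waddr_i] <= wdata_i;
  // combinational (asynchronous) read: data in the same cycle as the address.
  // Xilinx LUTRAM supports this style, Altera memories do not.
  assign rdata_o = mem[raddr_i];
endmodule

module sync_ram_sketch #(
    parameter int ADDR_WIDTH = 4,
    parameter int DATA_WIDTH = 8
) (
    input  logic                  clk_i,
    input  logic                  we_i,
    input  logic [ADDR_WIDTH-1:0] waddr_i,
    input  logic [DATA_WIDTH-1:0] wdata_i,
    input  logic [ADDR_WIDTH-1:0] raddr_i,
    output logic [DATA_WIDTH-1:0] rdata_o
);
  logic [DATA_WIDTH-1:0] mem[2**ADDR_WIDTH];
  always_ff @(posedge clk_i) begin
    if (we_i) mem[waddr_i] <= wdata_i;
    // registered (synchronous) read: data one cycle after the address,
    // which is the style Altera memories support and SyncThreePortRam follows.
    rdata_o <= mem[raddr_i];
  end
endmodule

Everything else in this change follows from hiding that extra read cycle from the rest of the frontend.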

The main changes to the existing code are:

New RAM module (SyncThreePortRam.sv) that infers synchronous RAM on Altera, with two independent read ports and one write port.

Changes in frontend.sv: the input to the vpc_i port of the BHT is modified so that the read address is presented one cycle earlier, compensating for the read latency of the synchronous RAM. A sketch of the idea is shown right after this paragraph.
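The module wrapper and port names below are illustrative only; the condition itself mirrors the vpc_bht assignment visible in the frontend.sv diff further down.

module bht_addr_advance_sketch #(
    parameter int unsigned VLEN = 64,
    parameter bit FpgaEn       = 1'b1,
    parameter bit FpgaAlteraEn = 1'b1
) (
    input  logic            icache_dreq_valid_i,  // a new fetch request is issued this cycle
    input  logic [VLEN-1:0] icache_dreq_vaddr_i,  // un-registered request address
    input  logic [VLEN-1:0] icache_vaddr_q_i,     // same address registered one cycle later
    output logic [VLEN-1:0] vpc_bht_o             // address driven into bht.vpc_i
);
  // With a synchronous BHT RAM the read address has to be applied one cycle
  // early, so the prediction appears exactly when the registered address holds
  // the same value; otherwise (ASIC/Xilinx, or no pending request) the
  // registered address is used as before.
  assign vpc_bht_o = (FpgaEn && FpgaAlteraEn && icache_dreq_valid_i)
      ? icache_dreq_vaddr_i
      : icache_vaddr_q_i;
endmodule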

Changes in bht.sv: this case is more complex because of the logic that has to be performed inside the BHT. The entry addressed by bht_update_i.pc is first read from the memory (read port 1), modified according to the saturation counter and valid bit, and finally written back. The prediction output is produced from the entry addressed by vpc_i (read port 0). With asynchronous memory, data written via bht_update_i is available one clock cycle after the write, so if vpc_i then reads the address that was just updated, everything is fine. With synchronous memory, however, there are three clock cycles of latency: one to read the old entry (read port 1), one to write the updated entry, and one to read it back on the prediction port (read port 0). The design therefore has to be adapted to these latency constraints:

First, the write address of the synchronous RAM has to be delayed, so that the write waits for the preceding read of the entry and stores the correctly modified data.

Once this is solved, similarly to the FIFO case, an auxiliary buffer is needed to hold the data written to the RAM, so that it is available two clock cycles after bht_update_i was valid. This is because, once the correct data is computed, the RAM needs two more clock cycles before it can appear at the output (one cycle to write it and one to read it).

Finally, a multiplexer at the output delivers the correct prediction, selecting the data from the update logic (one cycle of delay), the auxiliary register (two cycles of delay), or the RAM (three or more cycles of delay), depending on how many cycles have elapsed since bht_update_i was valid (i.e. written to the memory). A simplified sketch of this bypass scheme is shown right after this paragraph.
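Below is a simplified, single-lane sketch of the resulting write delay and output bypass. It folds the RAM model into the module and assumes the updated entry arrives already aligned to the write cycle; the real bht.sv in the diff below vectorises this over CVA6Cfg.INSTR_PER_FETCH, keeps a separate read port for the update path, and splits each entry into a valid bit and a saturation counter. All names here are illustrative only.

module bht_bypass_sketch #(
    parameter int unsigned ADDR_WIDTH = 8,
    parameter int unsigned DATA_WIDTH = 3   // e.g. {valid, saturation_counter}
) (
    input  logic                  clk_i,
    input  logic                  rst_ni,
    // write side: updated entry, aligned to the cycle in which the RAM is written
    input  logic                  wr_en_i,
    input  logic [ADDR_WIDTH-1:0] wr_addr_i,
    input  logic [DATA_WIDTH-1:0] wr_data_i,
    // read side: prediction address, applied one cycle before the result is used
    input  logic [ADDR_WIDTH-1:0] vpc_addr_i,
    output logic [DATA_WIDTH-1:0] prediction_o
);
  logic [DATA_WIDTH-1:0] mem[2**ADDR_WIDTH];
  logic [DATA_WIDTH-1:0] ram_rdata_q;   // synchronous read data of the prediction port
  logic [ADDR_WIDTH-1:0] vpc_addr_q;    // address ram_rdata_q corresponds to
  logic [ADDR_WIDTH-1:0] wr_addr_q;     // auxiliary buffer: last cycle's write
  logic [DATA_WIDTH-1:0] wr_data_q;
  logic                  wr_en_q;

  // synchronous RAM model: registered read; a read of the address written in
  // the same cycle still returns the old data
  always_ff @(posedge clk_i) begin
    if (wr_en_i) mem[wr_addr_i] <= wr_data_i;
    ram_rdata_q <= mem[vpc_addr_i];
  end

  // bookkeeping for the bypass paths
  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      vpc_addr_q <= '0;
      wr_addr_q  <= '0;
      wr_data_q  <= '0;
      wr_en_q    <= 1'b0;
    end else begin
      vpc_addr_q <= vpc_addr_i;
      wr_addr_q  <= wr_addr_i;
      wr_data_q  <= wr_data_i;
      wr_en_q    <= wr_en_i;
    end
  end

  // output multiplexer: the most recently written value wins
  always_comb begin
    if (wr_en_i && (vpc_addr_q == wr_addr_i)) begin
      prediction_o = wr_data_i;     // entry is being written this very cycle
    end else if (wr_en_q && (vpc_addr_q == wr_addr_q)) begin
      prediction_o = wr_data_q;     // written last cycle, RAM output still stale
    end else begin
      prediction_o = ram_rdata_q;   // RAM output already reflects the update
    end
  end
endmodule

The two bypass levels correspond to the one- and two-cycle cases described above; from the third cycle on, the RAM output itself is up to date.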
AngelaGonzalezMarino 2024-11-21 23:36:18 +01:00 committed by GitHub
parent 8a84f788d6
commit c389382c89
3 changed files with 220 additions and 59 deletions

bht.sv

@@ -1,5 +1,6 @@
// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
// Copyright 2023 - Thales for additionnal conribution.
// Copyright 2023 - Thales for additionnal contribution.
// Copyright 2024 - PlanV Technologies for additionnal contribution.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 2.0 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
@@ -15,6 +16,8 @@
// Date: 09.06.2018
// FPGA optimization: Sebastien Jacq, Thales
// Date: 2023-01-30
// FPGA optimization for Altera: Angela Gonzalez, PlanV Technolgies
// Date: 2024-10-16
// branch history table - 2 bit saturation counter
@@ -47,8 +50,6 @@ module bht #(
localparam ROW_INDEX_BITS = CVA6Cfg.RVC == 1'b1 ? $clog2(CVA6Cfg.INSTR_PER_FETCH) : 1;
// number of bits we should use for prediction
localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS;
// we are not interested in all bits of the address
unread i_unread (.d_i(|vpc_i));
struct packed {
logic valid;
@@ -58,7 +59,7 @@ module bht #(
bht_q[NR_ROWS-1:0][CVA6Cfg.INSTR_PER_FETCH-1:0];
logic [$clog2(NR_ROWS)-1:0] index, update_pc;
logic [ROW_INDEX_BITS-1:0] update_row_index;
logic [ROW_INDEX_BITS-1:0] update_row_index, update_row_index_q, check_update_row_index;
assign index = vpc_i[PREDICTION_BITS-1:ROW_ADDR_BITS+OFFSET];
assign update_pc = bht_update_i.pc[PREDICTION_BITS-1:ROW_ADDR_BITS+OFFSET];
@@ -127,17 +128,23 @@ module bht #(
// number of bits par word in the bram
localparam BRAM_WORD_BITS = $bits(ariane_pkg::bht_t);
logic [ ROW_INDEX_BITS-1:0] row_index;
logic [ CVA6Cfg.INSTR_PER_FETCH-1:0] bht_ram_we;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_0;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_1;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_write_address;
logic [ CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_wdata;
logic [ CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_0;
logic [ CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_1;
logic [ROW_INDEX_BITS-1:0] row_index, row_index_q, check_row_index;
logic [CVA6Cfg.INSTR_PER_FETCH-1:0] bht_ram_we, bht_ram_we_q;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_0;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_1;
logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0]
bht_ram_write_address, bht_ram_write_address_q;
logic [CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_wdata, bht_ram_wdata_q;
logic [CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_0;
logic [CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_1;
ariane_pkg::bht_t [ CVA6Cfg.INSTR_PER_FETCH-1:0] bht;
ariane_pkg::bht_t [ CVA6Cfg.INSTR_PER_FETCH-1:0] bht_updated;
ariane_pkg::bht_t [CVA6Cfg.INSTR_PER_FETCH-1:0] bht;
ariane_pkg::bht_t [CVA6Cfg.INSTR_PER_FETCH-1:0] bht_updated;
logic [CVA6Cfg.INSTR_PER_FETCH-1:0][1:0] bht_updated_valid;
logic [CVA6Cfg.INSTR_PER_FETCH-1:0][1:0][CVA6Cfg.VLEN-1:0] bht_updated_pc;
logic bht_update_taken, check_bht_update_taken;
logic [CVA6Cfg.VLEN-1:0] vpc_q;
if (CVA6Cfg.RVC) begin : gen_row_index
assign row_index = vpc_i[ROW_ADDR_BITS+OFFSET-1:OFFSET];
@@ -157,64 +164,150 @@ module bht #(
bht_updated = '0;
bht = '0;
for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin
bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = index;
bht_prediction_o[i].valid = bht_ram_rdata_0[i*BRAM_WORD_BITS+2];
bht_prediction_o[i].taken = bht_ram_rdata_0[i*BRAM_WORD_BITS+1];
end
//Write to RAM
if (bht_update_i.valid && !debug_mode_i) begin
for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin
if (update_row_index == i) begin
bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc;
bht[i].saturation_counter = bht_ram_rdata_1[i*BRAM_WORD_BITS+:2];
if (bht[i].saturation_counter == 2'b11) begin
// we can safely decrease it
if (!bht_update_i.taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
else bht_updated[i].saturation_counter = 2'b11;
// then check if it saturated in the negative regime e.g.: branch not taken
end else if (bht[i].saturation_counter == 2'b00) begin
// we can safely increase it
if (bht_update_i.taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
else bht_updated[i].saturation_counter = 2'b00;
end else begin // otherwise we are not in any boundaries and can decrease or increase it
if (bht_update_i.taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
else bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
end
bht_updated[i].valid = 1'b1;
bht_ram_we[i] = 1'b1;
bht_ram_write_address[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc;
//bht_ram_wdata[(i+1)*BRAM_WORD_BITS-1] = 1'b1; //valid
bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS] = {
bht_updated[i].valid, bht_updated[i].saturation_counter
};
end
end
end
for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin
//When synchronous RAM is used, addresses are needed as soon as available
if (CVA6Cfg.FpgaAlteraEn)
bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = index;
if (CVA6Cfg.FpgaAlteraEn)
bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc;
if (check_update_row_index == i) begin
//When asynchronous RAM is used, the address can be updated on the cycle when data is read
if (!CVA6Cfg.FpgaAlteraEn)
bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc;
bht[i].saturation_counter = bht_ram_rdata_1[i*BRAM_WORD_BITS+:2];
if (bht[i].saturation_counter == 2'b11) begin
// we can safely decrease it
if (!check_bht_update_taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
else bht_updated[i].saturation_counter = 2'b11;
// then check if it saturated in the negative regime e.g.: branch not taken
end else if (bht[i].saturation_counter == 2'b00) begin
// we can safely increase it
if (check_bht_update_taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
else bht_updated[i].saturation_counter = 2'b00;
end else begin // otherwise we are not in any boundaries and can decrease or increase it
if (check_bht_update_taken)
bht_updated[i].saturation_counter = bht[i].saturation_counter + 1;
else bht_updated[i].saturation_counter = bht[i].saturation_counter - 1;
end
//The data written in the RAM will have the valid bit from current input (async RAM) or the one from one clock cycle before (sync RAM)
bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS] = CVA6Cfg.FpgaAlteraEn ? {bht_updated_valid[i][0], bht_updated[i].saturation_counter} :
{bht_updated[i].valid, bht_updated[i].saturation_counter};
end
if (!rst_ni) begin
//initialize output
bht_prediction_o[i] = '0;
end else begin
//When asynchronous RAM is used, addresses can be calculated on the same cycle as data is read
if (!CVA6Cfg.FpgaAlteraEn)
bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = index;
//When synchronous RAM is used and data is read right after writing, we need some buffering
// This is one cycle of buffering
if (CVA6Cfg.FpgaAlteraEn && bht_updated_valid[i][0] && vpc_q == bht_updated_pc[i][0]) begin
bht_prediction_o[i].valid = bht_ram_wdata[i*BRAM_WORD_BITS+2];
bht_prediction_o[i].taken = bht_ram_wdata[i*BRAM_WORD_BITS+1];
//This is two cycles of buffering
end else if (CVA6Cfg.FpgaAlteraEn && bht_updated_valid[i][1] && vpc_q == bht_updated_pc[i][1]) begin
bht_prediction_o[i].valid = bht_ram_wdata_q[i*BRAM_WORD_BITS+2];
bht_prediction_o[i].taken = bht_ram_wdata_q[i*BRAM_WORD_BITS+1];
//In any other case we can safely read from the RAM as data is available
end else begin
bht_prediction_o[i].valid = bht_ram_rdata_0[i*BRAM_WORD_BITS+2];
bht_prediction_o[i].taken = bht_ram_rdata_0[i*BRAM_WORD_BITS+1];
end
end
end
end
for (genvar i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin : gen_bht_ram
AsyncThreePortRam #(
.ADDR_WIDTH($clog2(NR_ROWS)),
.DATA_DEPTH(NR_ROWS),
.DATA_WIDTH(BRAM_WORD_BITS)
) i_bht_ram (
.Clk_CI (clk_i),
.WrEn_SI (bht_ram_we[i]),
.WrAddr_DI (bht_ram_write_address[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.WrData_DI (bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdAddr_DI_0(bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdAddr_DI_1(bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdData_DO_0(bht_ram_rdata_0[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdData_DO_1(bht_ram_rdata_1[i*BRAM_WORD_BITS+:BRAM_WORD_BITS])
);
if (CVA6Cfg.FpgaAlteraEn) begin
SyncThreePortRam #(
.ADDR_WIDTH($clog2(NR_ROWS)),
.DATA_DEPTH(NR_ROWS),
.DATA_WIDTH(BRAM_WORD_BITS)
) i_bht_ram (
.Clk_CI (clk_i),
.WrEn_SI (bht_ram_we_q[i]),
.WrAddr_DI (bht_ram_write_address_q[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.WrData_DI (bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdAddr_DI_0(bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdAddr_DI_1(bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdData_DO_0(bht_ram_rdata_0[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdData_DO_1(bht_ram_rdata_1[i*BRAM_WORD_BITS+:BRAM_WORD_BITS])
);
end else begin
AsyncThreePortRam #(
.ADDR_WIDTH($clog2(NR_ROWS)),
.DATA_DEPTH(NR_ROWS),
.DATA_WIDTH(BRAM_WORD_BITS)
) i_bht_ram (
.Clk_CI (clk_i),
.WrEn_SI (bht_ram_we[i]),
.WrAddr_DI (bht_ram_write_address[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.WrData_DI (bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdAddr_DI_0(bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdAddr_DI_1(bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]),
.RdData_DO_0(bht_ram_rdata_0[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]),
.RdData_DO_1(bht_ram_rdata_1[i*BRAM_WORD_BITS+:BRAM_WORD_BITS])
);
end
end
// Extra buffering signals needed when synchronous RAM is used
always_ff @(posedge clk_i or negedge rst_ni) begin
if (CVA6Cfg.FpgaAlteraEn) begin
if (!rst_ni) begin
bht_updated_valid <= '0;
bht_update_taken <= '0;
bht_ram_wdata_q <= '0;
row_index_q <= '0;
bht_ram_we_q <= '0;
bht_ram_write_address_q <= '0;
update_row_index_q <= '0;
end else begin
for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin
bht_updated_valid[i][1] <= bht_updated_valid[i][0];
bht_updated_valid[i][0] <= bht_updated[i].valid;
bht_updated_pc[i][1] <= bht_updated_pc[i][0];
bht_updated_pc[i][0] <= bht_update_i.pc;
end
vpc_q <= vpc_i;
bht_update_taken <= bht_update_i.taken;
bht_ram_wdata_q <= bht_ram_wdata;
bht_ram_we_q <= bht_ram_we;
bht_ram_write_address_q <= bht_ram_write_address;
update_row_index_q <= update_row_index;
row_index_q <= row_index;
end
end
end
// Assignment of indexes checked to generate data written in the RAM. When synchronous RAM is used these signals need to be delayed
assign check_update_row_index = CVA6Cfg.FpgaAlteraEn ? update_row_index_q : update_row_index;
assign check_bht_update_taken = CVA6Cfg.FpgaAlteraEn ? bht_update_taken : bht_update_i.taken;
assign check_row_index = CVA6Cfg.FpgaAlteraEn ? row_index_q : row_index;
end
endmodule

frontend.sv

@@ -140,6 +140,7 @@ module frontend
btb_prediction_t [CVA6Cfg.INSTR_PER_FETCH-1:0] btb_prediction_shifted;
ras_t ras_predict;
logic [ CVA6Cfg.VLEN-1:0] vpc_btb;
logic [ CVA6Cfg.VLEN-1:0] vpc_bht;
// branch-predict update
logic is_mispredict;
@@ -484,7 +485,9 @@ module frontend
//For FPGA, BTB is implemented in read synchronous BRAM
//while for ASIC, BTB is implemented in D flip-flop
//and can be read at the same cycle.
//Same for BHT
assign vpc_btb = (CVA6Cfg.FpgaEn) ? icache_dreq_i.vaddr : icache_vaddr_q;
assign vpc_bht = (CVA6Cfg.FpgaEn && CVA6Cfg.FpgaAlteraEn && icache_dreq_i.valid) ? icache_dreq_i.vaddr : icache_vaddr_q;
if (CVA6Cfg.BTBEntries == 0) begin
assign btb_prediction = '0;
@@ -517,7 +520,7 @@ module frontend
.rst_ni,
.flush_bp_i (flush_bp_i),
.debug_mode_i,
.vpc_i (icache_vaddr_q),
.vpc_i (vpc_bht),
.bht_update_i (bht_update),
.bht_prediction_o(bht_prediction)
);

SyncThreePortRam.sv

@@ -0,0 +1,65 @@
// Copyright 2024 PlanV Technologies
//
// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0
// You may obtain a copy of the License at https://solderpad.org/licenses
//
// Inferable, Synchronous Three-Port RAM, with one write port and two read ports
//
//
// This module is designed to work with both Xilinx, Microchip and Altera FPGA tools by following the respective
// guidelines:
// - Xilinx UG901 Vivado Design Suite User Guide: Synthesis
// - Inferring Microchip PolarFire RAM Blocks
// - Altera Quartus II Handbook Volume 1: Design and Synthesis (p. 768)
//
// Current Maintainers:: Angela Gonzalez - PlanV Technologies
module SyncThreePortRam
#(
parameter ADDR_WIDTH = 10,
parameter DATA_DEPTH = 1024, // usually 2**ADDR_WIDTH, but can be lower
parameter DATA_WIDTH = 32
)(
input logic Clk_CI,
// Write port
input logic WrEn_SI,
input logic [ADDR_WIDTH-1:0] WrAddr_DI,
input logic [DATA_WIDTH-1:0] WrData_DI,
// Read ports
input logic [ADDR_WIDTH-1:0] RdAddr_DI_0,
input logic [ADDR_WIDTH-1:0] RdAddr_DI_1,
output logic [DATA_WIDTH-1:0] RdData_DO_0,
output logic [DATA_WIDTH-1:0] RdData_DO_1
);
logic [DATA_WIDTH-1:0] mem [DATA_DEPTH-1:0]= '{default:0};
// WRITE
always_ff @(posedge Clk_CI)
begin
if (WrEn_SI) begin
mem[WrAddr_DI] <= WrData_DI;
end
RdData_DO_0 = mem[RdAddr_DI_0];
RdData_DO_1 = mem[RdAddr_DI_1];
end
////////////////////////////
// assertions
////////////////////////////
// pragma translate_off
assert property
(@(posedge Clk_CI) (longint'(2)**longint'(ADDR_WIDTH) >= longint'(DATA_DEPTH)))
else $error("depth out of bounds");
// pragma translate_on
endmodule